Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll2
-rw-r--r--test/CodeGen/X86/2003-08-23-DeadBlockTest.ll2
-rw-r--r--test/CodeGen/X86/2003-11-03-GlobalBool.ll2
-rw-r--r--test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll2
-rw-r--r--test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll2
-rw-r--r--test/CodeGen/X86/2004-02-22-Casts.ll2
-rw-r--r--test/CodeGen/X86/2004-03-30-Select-Max.ll2
-rw-r--r--test/CodeGen/X86/2004-04-13-FPCMOV-Crash.ll2
-rw-r--r--test/CodeGen/X86/2004-06-10-StackifierCrash.ll2
-rw-r--r--test/CodeGen/X86/2004-10-08-SelectSetCCFold.ll2
-rw-r--r--test/CodeGen/X86/2005-01-17-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2005-02-14-IllegalAssembler.ll2
-rw-r--r--test/CodeGen/X86/2005-05-08-FPStackifierPHI.ll2
-rw-r--r--test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll2
-rw-r--r--test/CodeGen/X86/2006-03-01-InstrSchedBug.ll2
-rw-r--r--test/CodeGen/X86/2006-03-02-InstrSchedBug.ll2
-rw-r--r--test/CodeGen/X86/2006-04-04-CrossBlockCrash.ll2
-rw-r--r--test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll2
-rw-r--r--test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll2
-rw-r--r--test/CodeGen/X86/2006-05-02-InstrSched2.ll2
-rw-r--r--test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll2
-rw-r--r--test/CodeGen/X86/2006-05-08-InstrSched.ll2
-rw-r--r--test/CodeGen/X86/2006-05-11-InstrSched.ll4
-rw-r--r--test/CodeGen/X86/2006-05-17-VectorArg.ll2
-rw-r--r--test/CodeGen/X86/2006-05-22-FPSetEQ.ll4
-rw-r--r--test/CodeGen/X86/2006-05-25-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-07-10-InlineAsmAConstraint.ll2
-rw-r--r--test/CodeGen/X86/2006-07-12-InlineAsmQConstraint.ll2
-rw-r--r--test/CodeGen/X86/2006-07-20-InlineAsm.ll2
-rw-r--r--test/CodeGen/X86/2006-07-28-AsmPrint-Long-As-Pointer.ll2
-rw-r--r--test/CodeGen/X86/2006-07-31-SingleRegClass.ll2
-rw-r--r--test/CodeGen/X86/2006-08-07-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-08-16-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-08-21-ExtraMovInst.ll2
-rw-r--r--test/CodeGen/X86/2006-09-01-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-10-09-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-10-10-FindModifiedNodeSlotBug.ll2
-rw-r--r--test/CodeGen/X86/2006-10-12-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-10-13-CycleInDAG.ll2
-rw-r--r--test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll2
-rw-r--r--test/CodeGen/X86/2006-11-12-CSRetCC.ll2
-rw-r--r--test/CodeGen/X86/2006-11-17-IllegalMove.ll2
-rw-r--r--test/CodeGen/X86/2006-11-27-SelectLegalize.ll2
-rw-r--r--test/CodeGen/X86/2006-12-16-InlineAsmCrash.ll2
-rw-r--r--test/CodeGen/X86/2006-12-19-IntelSyntax.ll2
-rw-r--r--test/CodeGen/X86/2007-01-13-StackPtrIndex.ll2
-rw-r--r--test/CodeGen/X86/2007-01-29-InlineAsm-ir.ll2
-rw-r--r--test/CodeGen/X86/2007-02-04-OrAddrMode.ll2
-rw-r--r--test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll2
-rw-r--r--test/CodeGen/X86/2007-02-23-DAGCombine-Miscompile.ll2
-rw-r--r--test/CodeGen/X86/2007-02-25-FastCCStack.ll2
-rw-r--r--test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll2
-rw-r--r--test/CodeGen/X86/2007-03-16-InlineAsm.ll2
-rw-r--r--test/CodeGen/X86/2007-03-18-LiveIntervalAssert.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmMultiRegConstraint.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll2
-rw-r--r--test/CodeGen/X86/2007-03-24-InlineAsmXConstraint.ll2
-rw-r--r--test/CodeGen/X86/2007-03-26-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2007-04-11-InlineAsmVectorResult.ll2
-rw-r--r--test/CodeGen/X86/2007-04-24-Huge-Stack.ll2
-rw-r--r--test/CodeGen/X86/2007-05-05-VecCastExpand.ll2
-rw-r--r--test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll2
-rw-r--r--test/CodeGen/X86/2007-05-17-ShuffleISelBug.ll2
-rw-r--r--test/CodeGen/X86/2007-06-28-X86-64-isel.ll2
-rw-r--r--test/CodeGen/X86/2007-06-29-DAGCombinerBug.ll2
-rw-r--r--test/CodeGen/X86/2007-06-29-VecFPConstantCSEBug.ll2
-rw-r--r--test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll2
-rw-r--r--test/CodeGen/X86/2007-08-10-SignExtSubreg.ll2
-rw-r--r--test/CodeGen/X86/2007-10-04-AvoidEFLAGSCopy.ll2
-rw-r--r--test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll2
-rw-r--r--test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll2
-rw-r--r--test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll2
-rw-r--r--test/CodeGen/X86/2007-10-19-SpillerUnfold.ll2
-rw-r--r--test/CodeGen/X86/2007-10-29-ExtendSetCC.ll2
-rw-r--r--test/CodeGen/X86/2007-10-30-LSRCrash.ll2
-rw-r--r--test/CodeGen/X86/2007-10-31-extractelement-i64.ll2
-rw-r--r--test/CodeGen/X86/2007-11-01-ISelCrash.ll2
-rw-r--r--test/CodeGen/X86/2007-11-06-InstrSched.ll2
-rw-r--r--test/CodeGen/X86/2007-11-07-MulBy4.ll2
-rw-r--r--test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll2
-rw-r--r--test/CodeGen/X86/2007-12-18-LoadCSEBug.ll2
-rw-r--r--test/CodeGen/X86/2008-01-08-SchedulerCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll2
-rw-r--r--test/CodeGen/X86/2008-01-16-InvalidDAGCombineXform.ll2
-rw-r--r--test/CodeGen/X86/2008-02-05-ISelCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll2
-rw-r--r--test/CodeGen/X86/2008-02-14-BitMiscompile.ll2
-rw-r--r--test/CodeGen/X86/2008-02-18-TailMergingBug.ll2
-rw-r--r--test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll2
-rw-r--r--test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll2
-rw-r--r--test/CodeGen/X86/2008-02-27-DeadSlotElimBug.ll2
-rw-r--r--test/CodeGen/X86/2008-02-27-PEICrash.ll2
-rw-r--r--test/CodeGen/X86/2008-03-06-frem-fpstack.ll2
-rw-r--r--test/CodeGen/X86/2008-03-07-APIntBug.ll2
-rw-r--r--test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-03-19-DAGCombinerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-03-25-TwoAddrPassBug.ll2
-rw-r--r--test/CodeGen/X86/2008-04-09-BranchFolding.ll2
-rw-r--r--test/CodeGen/X86/2008-04-16-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-04-24-MemCpyBug.ll2
-rw-r--r--test/CodeGen/X86/2008-04-28-CyclicSchedUnit.ll2
-rw-r--r--test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll2
-rw-r--r--test/CodeGen/X86/2008-05-09-PHIElimBug.ll2
-rw-r--r--test/CodeGen/X86/2008-05-09-ShuffleLoweringBug.ll2
-rw-r--r--test/CodeGen/X86/2008-05-21-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll2
-rw-r--r--test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll4
-rw-r--r--test/CodeGen/X86/2008-06-25-VecISelBug.ll2
-rw-r--r--test/CodeGen/X86/2008-07-11-SHLBy1.ll2
-rw-r--r--test/CodeGen/X86/2008-07-22-CombinerCrash.ll2
-rw-r--r--test/CodeGen/X86/2008-07-23-VSetCC.ll2
-rw-r--r--test/CodeGen/X86/2008-08-06-CmpStride.ll2
-rw-r--r--test/CodeGen/X86/2008-08-06-RewriterBug.ll2
-rw-r--r--test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll2
-rw-r--r--test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll2
-rw-r--r--test/CodeGen/X86/2008-09-11-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-09-11-CoalescerBug2.ll2
-rw-r--r--test/CodeGen/X86/2008-09-17-inline-asm-1.ll4
-rw-r--r--test/CodeGen/X86/2008-09-18-inline-asm-2.ll6
-rw-r--r--test/CodeGen/X86/2008-09-25-sseregparm-1.ll4
-rw-r--r--test/CodeGen/X86/2008-09-29-VolatileBug.ll2
-rw-r--r--test/CodeGen/X86/2008-10-06-x87ld-nan-1.ll2
-rw-r--r--test/CodeGen/X86/2008-10-06-x87ld-nan-2.ll2
-rw-r--r--test/CodeGen/X86/2008-10-07-SSEISelBug.ll2
-rw-r--r--test/CodeGen/X86/2008-10-13-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2008-10-16-VecUnaryOp.ll2
-rw-r--r--test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll4
-rw-r--r--test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll4
-rw-r--r--test/CodeGen/X86/2008-10-24-FlippedCompare.ll2
-rw-r--r--test/CodeGen/X86/2008-10-29-ExpandVAARG.ll2
-rw-r--r--test/CodeGen/X86/2008-11-03-F80VAARG.ll2
-rw-r--r--test/CodeGen/X86/2008-12-02-dagcombine-1.ll2
-rw-r--r--test/CodeGen/X86/2008-12-02-dagcombine-2.ll2
-rw-r--r--test/CodeGen/X86/2008-12-02-dagcombine-3.ll4
-rw-r--r--test/CodeGen/X86/2008-12-16-dagcombine-4.ll2
-rw-r--r--test/CodeGen/X86/2008-12-22-dagcombine-5.ll2
-rw-r--r--test/CodeGen/X86/2008-12-23-crazy-address.ll2
-rw-r--r--test/CodeGen/X86/2008-12-23-dagcombine-6.ll2
-rw-r--r--test/CodeGen/X86/2009-01-13-DoubleUpdate.ll2
-rw-r--r--test/CodeGen/X86/2009-01-16-UIntToFP.ll2
-rw-r--r--test/CodeGen/X86/2009-01-25-NoSSE.ll2
-rw-r--r--test/CodeGen/X86/2009-01-26-WrongCheck.ll2
-rw-r--r--test/CodeGen/X86/2009-01-31-BigShift.ll2
-rw-r--r--test/CodeGen/X86/2009-01-31-BigShift2.ll2
-rw-r--r--test/CodeGen/X86/2009-01-31-BigShift3.ll2
-rw-r--r--test/CodeGen/X86/2009-02-01-LargeMask.ll2
-rw-r--r--test/CodeGen/X86/2009-02-03-AnalyzedTwice.ll2
-rw-r--r--test/CodeGen/X86/2009-02-08-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll5
-rw-r--r--test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll2
-rw-r--r--test/CodeGen/X86/2009-02-12-SpillerBug.ll2
-rw-r--r--test/CodeGen/X86/2009-02-25-CommuteBug.ll2
-rw-r--r--test/CodeGen/X86/2009-02-26-MachineLICMBug.ll4
-rw-r--r--test/CodeGen/X86/2009-03-03-BTHang.ll2
-rw-r--r--test/CodeGen/X86/2009-03-03-BitcastLongDouble.ll2
-rw-r--r--test/CodeGen/X86/2009-03-07-FPConstSelect.ll2
-rw-r--r--test/CodeGen/X86/2009-03-09-APIntCrash.ll2
-rw-r--r--test/CodeGen/X86/2009-03-25-TestBug.ll2
-rw-r--r--test/CodeGen/X86/2009-03-26-NoImplicitFPBug.ll2
-rw-r--r--test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll2
-rw-r--r--test/CodeGen/X86/2009-04-12-picrel.ll2
-rw-r--r--test/CodeGen/X86/2009-04-24.ll2
-rw-r--r--test/CodeGen/X86/2009-04-25-CoalescerBug.ll2
-rw-r--r--test/CodeGen/X86/2009-04-scale.ll2
-rw-r--r--test/CodeGen/X86/2009-05-11-tailmerge-crash.ll2
-rw-r--r--test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll2
-rw-r--r--test/CodeGen/X86/2009-05-28-DAGCombineCrash.ll2
-rw-r--r--test/CodeGen/X86/2009-05-30-ISelBug.ll2
-rw-r--r--test/CodeGen/X86/2009-06-04-VirtualLiveIn.ll2
-rw-r--r--test/CodeGen/X86/2009-06-05-VZextByteShort.ll2
-rw-r--r--test/CodeGen/X86/2009-06-05-sitofpCrash.ll2
-rw-r--r--test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll4
-rw-r--r--test/CodeGen/X86/2009-06-15-not-a-tail-call.ll2
-rw-r--r--test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll2
-rw-r--r--test/CodeGen/X86/2009-07-06-TwoAddrAssert.ll2
-rw-r--r--test/CodeGen/X86/2009-07-07-SplitICmp.ll2
-rw-r--r--test/CodeGen/X86/2009-07-09-ExtractBoolFromVector.ll2
-rw-r--r--test/CodeGen/X86/2009-07-19-AsmExtraOperands.ll2
-rw-r--r--test/CodeGen/X86/2009-07-20-DAGCombineBug.ll2
-rw-r--r--test/CodeGen/X86/2009-08-19-LoadNarrowingMiscompile.ll2
-rw-r--r--test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll2
-rw-r--r--test/CodeGen/X86/20090313-signext.ll2
-rw-r--r--test/CodeGen/X86/2010-01-05-ZExt-Shl.ll4
-rw-r--r--test/CodeGen/X86/2010-01-15-SelectionDAGCycle.ll2
-rw-r--r--test/CodeGen/X86/2010-01-18-DbgValue.ll5
-rw-r--r--test/CodeGen/X86/2010-02-03-DualUndef.ll2
-rw-r--r--test/CodeGen/X86/2010-02-11-NonTemporal.ll2
-rw-r--r--test/CodeGen/X86/2010-02-12-CoalescerBug-Impdef.ll4
-rw-r--r--test/CodeGen/X86/2010-02-23-DAGCombineBug.ll2
-rw-r--r--test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll2
-rw-r--r--test/CodeGen/X86/2010-03-05-EFLAGS-Redef.ll2
-rw-r--r--test/CodeGen/X86/2010-04-08-CoalescerBug.ll4
-rw-r--r--test/CodeGen/X86/2010-05-12-FastAllocKills.ll42
-rw-r--r--test/CodeGen/X86/2010-05-26-DotDebugLoc.ll2
-rw-r--r--test/CodeGen/X86/2010-05-28-Crash.ll2
-rw-r--r--test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll2
-rw-r--r--test/CodeGen/X86/2010-06-14-fast-isel-fs-load.ll2
-rw-r--r--test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll2
-rw-r--r--test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll20
-rw-r--r--test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll2
-rw-r--r--test/CodeGen/X86/2010-10-08-cmpxchg8b.ll2
-rw-r--r--test/CodeGen/X86/2010-11-09-MOVLPS.ll2
-rw-r--r--test/CodeGen/X86/2010-11-18-SelectOfExtload.ll2
-rw-r--r--test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll7
-rw-r--r--test/CodeGen/X86/2011-02-21-VirtRegRewriter-KillSubReg.ll2
-rw-r--r--test/CodeGen/X86/2011-03-02-DAGCombiner.ll2
-rw-r--r--test/CodeGen/X86/2011-03-30-CreateFixedObjCrash.ll2
-rw-r--r--test/CodeGen/X86/2011-05-09-loaduse.ll2
-rw-r--r--test/CodeGen/X86/2011-06-01-fildll.ll2
-rw-r--r--test/CodeGen/X86/2011-06-03-x87chain.ll2
-rw-r--r--test/CodeGen/X86/2011-06-06-fgetsign80bit.ll2
-rw-r--r--test/CodeGen/X86/2011-06-14-PreschedRegalias.ll2
-rw-r--r--test/CodeGen/X86/2011-07-13-BadFrameIndexDisplacement.ll2
-rw-r--r--test/CodeGen/X86/2011-08-23-PerformSubCombine128.ll2
-rw-r--r--test/CodeGen/X86/2011-08-23-Trampoline.ll4
-rw-r--r--test/CodeGen/X86/2011-08-29-BlockConstant.ll2
-rw-r--r--test/CodeGen/X86/2011-09-14-valcoalesce.ll14
-rw-r--r--test/CodeGen/X86/2011-09-18-sse2cmp.ll2
-rw-r--r--test/CodeGen/X86/2011-09-21-setcc-bug.ll2
-rw-r--r--test/CodeGen/X86/2011-10-18-FastISel-VectorParams.ll2
-rw-r--r--test/CodeGen/X86/2011-10-19-LegelizeLoad.ll2
-rw-r--r--test/CodeGen/X86/2011-10-19-widen_vselect.ll23
-rw-r--r--test/CodeGen/X86/2011-10-21-widen-cmp.ll6
-rw-r--r--test/CodeGen/X86/2011-10-27-tstore.ll2
-rw-r--r--test/CodeGen/X86/2011-10-30-padd.ll2
-rw-r--r--test/CodeGen/X86/2011-11-07-LegalizeBuildVector.ll2
-rw-r--r--test/CodeGen/X86/2011-11-30-or.ll2
-rw-r--r--test/CodeGen/X86/2011-12-06-BitcastVectorGlobal.ll2
-rw-r--r--test/CodeGen/X86/2011-12-08-AVXISelBugs.ll2
-rw-r--r--test/CodeGen/X86/2011-12-15-vec_shift.ll4
-rw-r--r--test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll10
-rw-r--r--test/CodeGen/X86/2011-12-8-bitcastintprom.ll4
-rw-r--r--test/CodeGen/X86/2011-20-21-zext-ui2fp.ll2
-rw-r--r--test/CodeGen/X86/2012-01-11-split-cv.ll2
-rw-r--r--test/CodeGen/X86/2012-01-12-extract-sv.ll8
-rw-r--r--test/CodeGen/X86/2012-01-18-vbitcast.ll2
-rw-r--r--test/CodeGen/X86/2012-02-23-mmx-inlineasm.ll2
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll5
-rw-r--r--test/CodeGen/X86/2012-07-10-extload64.ll6
-rw-r--r--test/CodeGen/X86/2012-07-10-shufnorm.ll2
-rw-r--r--test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll2
-rw-r--r--test/CodeGen/X86/2012-07-15-broadcastfold.ll2
-rw-r--r--test/CodeGen/X86/2012-07-15-tconst_shl.ll2
-rw-r--r--test/CodeGen/X86/2012-07-15-vshl.ll2
-rw-r--r--test/CodeGen/X86/2012-07-16-LeaUndef.ll2
-rw-r--r--test/CodeGen/X86/2012-07-16-fp2ui-i1.ll2
-rw-r--r--test/CodeGen/X86/2012-07-17-vtrunc.ll2
-rw-r--r--test/CodeGen/X86/2012-07-23-select_cc.ll2
-rw-r--r--test/CodeGen/X86/2012-08-16-setcc.ll8
-rw-r--r--test/CodeGen/X86/2012-09-13-dagco-fneg.ll2
-rw-r--r--test/CodeGen/X86/2012-1-10-buildvector.ll7
-rw-r--r--test/CodeGen/X86/2012-10-18-crash-dagco.ll4
-rw-r--r--test/CodeGen/X86/2012-11-28-merge-store-alias.ll2
-rw-r--r--test/CodeGen/X86/2012-12-1-merge-multiple.ll16
-rw-r--r--test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll2
-rw-r--r--test/CodeGen/X86/2012-12-14-v8fp80-crash.ll2
-rw-r--r--test/CodeGen/X86/2013-05-06-ConactVectorCrash.ll2
-rw-r--r--test/CodeGen/X86/2014-05-29-factorial.ll2
-rw-r--r--test/CodeGen/X86/3dnow-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/3dnow-schedule.ll394
-rw-r--r--test/CodeGen/X86/4char-promote.ll2
-rw-r--r--test/CodeGen/X86/AppendingLinkage.ll2
-rw-r--r--test/CodeGen/X86/Atomics-64.ll5
-rw-r--r--test/CodeGen/X86/DbgValueOtherTargets.test4
-rw-r--r--test/CodeGen/X86/GlobalISel/GV.ll16
-rw-r--r--test/CodeGen/X86/GlobalISel/add-scalar.ll66
-rw-r--r--test/CodeGen/X86/GlobalISel/add-vec.ll56
-rw-r--r--test/CodeGen/X86/GlobalISel/and-scalar.ll23
-rw-r--r--test/CodeGen/X86/GlobalISel/binop.ll44
-rw-r--r--test/CodeGen/X86/GlobalISel/br.ll2
-rw-r--r--test/CodeGen/X86/GlobalISel/brcond.ll90
-rw-r--r--test/CodeGen/X86/GlobalISel/callingconv.ll353
-rw-r--r--test/CodeGen/X86/GlobalISel/cmp.ll26
-rw-r--r--test/CodeGen/X86/GlobalISel/constant.ll14
-rw-r--r--test/CodeGen/X86/GlobalISel/ext-x86-64.ll8
-rw-r--r--test/CodeGen/X86/GlobalISel/ext.ll46
-rw-r--r--test/CodeGen/X86/GlobalISel/fadd-scalar.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/fconstant.ll40
-rw-r--r--test/CodeGen/X86/GlobalISel/fdiv-scalar.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/fmul-scalar.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/fpext-scalar.ll12
-rw-r--r--test/CodeGen/X86/GlobalISel/frameIndex.ll7
-rw-r--r--test/CodeGen/X86/GlobalISel/fsub-scalar.ll4
-rw-r--r--test/CodeGen/X86/GlobalISel/gep.ll36
-rw-r--r--test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll816
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-GV.mir2
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add-v128.mir49
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add-v256.mir133
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add-v512.mir235
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-add.mir104
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-and-scalar.mir73
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-brcond.mir58
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-cmp.mir79
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-constant.mir88
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir210
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-ext.mir398
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-fadd-scalar.mir25
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-fdiv-scalar.mir25
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-fmul-scalar.mir25
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-fpext-scalar.mir34
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-fsub-scalar.mir25
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-gep.mir61
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-insert-vec256.mir6
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-insert-vec512.mir29
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir100
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir91
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-mul-v128.mir18
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-mul-v256.mir18
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-mul-v512.mir18
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-or-scalar.mir76
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-phi.mir599
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir45
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir45
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir45
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-sub.mir49
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-trunc.mir34
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-undef.mir51
-rw-r--r--test/CodeGen/X86/GlobalISel/legalize-xor-scalar.mir71
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll20
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-scalar.ll40
-rw-r--r--test/CodeGen/X86/GlobalISel/memop-vec.ll24
-rw-r--r--test/CodeGen/X86/GlobalISel/mul-scalar.ll6
-rw-r--r--test/CodeGen/X86/GlobalISel/mul-vec.ll18
-rw-r--r--test/CodeGen/X86/GlobalISel/or-scalar.ll23
-rw-r--r--test/CodeGen/X86/GlobalISel/phi.ll168
-rw-r--r--test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir354
-rw-r--r--test/CodeGen/X86/GlobalISel/select-GV.mir69
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-v128.mir32
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-v256.mir24
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-v512.mir49
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add-x32.mir38
-rw-r--r--test/CodeGen/X86/GlobalISel/select-add.mir96
-rw-r--r--test/CodeGen/X86/GlobalISel/select-and-scalar.mir73
-rw-r--r--test/CodeGen/X86/GlobalISel/select-blsi.mir63
-rw-r--r--test/CodeGen/X86/GlobalISel/select-blsr.mir60
-rw-r--r--test/CodeGen/X86/GlobalISel/select-br.mir12
-rw-r--r--test/CodeGen/X86/GlobalISel/select-brcond.mir66
-rw-r--r--test/CodeGen/X86/GlobalISel/select-cmp.mir321
-rw-r--r--test/CodeGen/X86/GlobalISel/select-constant.mir76
-rw-r--r--test/CodeGen/X86/GlobalISel/select-copy.mir185
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir156
-rw-r--r--test/CodeGen/X86/GlobalISel/select-ext.mir251
-rw-r--r--test/CodeGen/X86/GlobalISel/select-extract-vec256.mir16
-rw-r--r--test/CodeGen/X86/GlobalISel/select-extract-vec512.mir17
-rw-r--r--test/CodeGen/X86/GlobalISel/select-fadd-scalar.mir36
-rw-r--r--test/CodeGen/X86/GlobalISel/select-fconstant.mir85
-rw-r--r--test/CodeGen/X86/GlobalISel/select-fdiv-scalar.mir36
-rw-r--r--test/CodeGen/X86/GlobalISel/select-fmul-scalar.mir36
-rw-r--r--test/CodeGen/X86/GlobalISel/select-fpext-scalar.mir38
-rw-r--r--test/CodeGen/X86/GlobalISel/select-frameIndex.mir6
-rw-r--r--test/CodeGen/X86/GlobalISel/select-fsub-scalar.mir36
-rw-r--r--test/CodeGen/X86/GlobalISel/select-gep.mir15
-rw-r--r--test/CodeGen/X86/GlobalISel/select-inc.mir9
-rw-r--r--test/CodeGen/X86/GlobalISel/select-insert-vec256.mir86
-rw-r--r--test/CodeGen/X86/GlobalISel/select-insert-vec512.mir142
-rw-r--r--test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir27
-rw-r--r--test/CodeGen/X86/GlobalISel/select-leaf-constant.mir37
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir123
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-scalar.mir146
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-v128.mir58
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-v256.mir67
-rw-r--r--test/CodeGen/X86/GlobalISel/select-memop-v512.mir53
-rw-r--r--test/CodeGen/X86/GlobalISel/select-merge-vec256.mir39
-rw-r--r--test/CodeGen/X86/GlobalISel/select-merge-vec512.mir45
-rw-r--r--test/CodeGen/X86/GlobalISel/select-mul-scalar.mir58
-rw-r--r--test/CodeGen/X86/GlobalISel/select-mul-vec.mir181
-rw-r--r--test/CodeGen/X86/GlobalISel/select-or-scalar.mir73
-rw-r--r--test/CodeGen/X86/GlobalISel/select-phi.mir423
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub-v128.mir88
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub-v256.mir80
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub-v512.mir49
-rw-r--r--test/CodeGen/X86/GlobalISel/select-sub.mir65
-rw-r--r--test/CodeGen/X86/GlobalISel/select-trunc.mir79
-rw-r--r--test/CodeGen/X86/GlobalISel/select-undef.mir66
-rw-r--r--test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir37
-rw-r--r--test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir42
-rw-r--r--test/CodeGen/X86/GlobalISel/select-xor-scalar.mir73
-rw-r--r--test/CodeGen/X86/GlobalISel/sub-scalar.ll56
-rw-r--r--test/CodeGen/X86/GlobalISel/sub-vec.ll24
-rw-r--r--test/CodeGen/X86/GlobalISel/trunc.ll12
-rw-r--r--test/CodeGen/X86/GlobalISel/undef.ll37
-rw-r--r--test/CodeGen/X86/GlobalISel/x86_64-fallback.ll13
-rw-r--r--test/CodeGen/X86/GlobalISel/xor-scalar.ll23
-rw-r--r--test/CodeGen/X86/MachineBranchProb.ll8
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll442
-rw-r--r--test/CodeGen/X86/O0-pipeline.ll4
-rw-r--r--test/CodeGen/X86/PR34565.ll60
-rw-r--r--test/CodeGen/X86/SwitchLowering.ll2
-rw-r--r--test/CodeGen/X86/SwizzleShuff.ll12
-rw-r--r--test/CodeGen/X86/TruncAssertSext.ll20
-rw-r--r--test/CodeGen/X86/TruncAssertZext.ll43
-rw-r--r--test/CodeGen/X86/WidenArith.ll4
-rw-r--r--test/CodeGen/X86/abi-isel.ll20
-rw-r--r--test/CodeGen/X86/absolute-bit-mask.ll2
-rw-r--r--test/CodeGen/X86/add-ext.ll22
-rw-r--r--test/CodeGen/X86/add-of-carry.ll4
-rw-r--r--test/CodeGen/X86/add-sub-nsw-nuw.ll4
-rw-r--r--test/CodeGen/X86/add.ll386
-rw-r--r--test/CodeGen/X86/add_shl_constant.ll2
-rw-r--r--test/CodeGen/X86/addcarry.ll23
-rw-r--r--test/CodeGen/X86/addr-of-ret-addr.ll4
-rw-r--r--test/CodeGen/X86/adx-intrinsics.ll151
-rw-r--r--test/CodeGen/X86/adx-schedule.ll114
-rw-r--r--test/CodeGen/X86/aes-schedule.ll359
-rw-r--r--test/CodeGen/X86/aes_intrinsics.ll102
-rw-r--r--test/CodeGen/X86/aligned-comm.ll2
-rw-r--r--test/CodeGen/X86/all-ones-vector.ll304
-rw-r--r--test/CodeGen/X86/alloca-align-rounding-32.ll2
-rw-r--r--test/CodeGen/X86/alloca-align-rounding.ll4
-rw-r--r--test/CodeGen/X86/and-sink.ll30
-rw-r--r--test/CodeGen/X86/and-su.ll2
-rw-r--r--test/CodeGen/X86/andimm8.ll2
-rw-r--r--test/CodeGen/X86/anyext.ll16
-rw-r--r--test/CodeGen/X86/asm-global-imm.ll2
-rw-r--r--test/CodeGen/X86/asm-modifier-P.ll9
-rw-r--r--test/CodeGen/X86/atom-cmpb.ll2
-rw-r--r--test/CodeGen/X86/atom-fixup-lea2.ll2
-rw-r--r--test/CodeGen/X86/atom-fixup-lea3.ll11
-rw-r--r--test/CodeGen/X86/atom-sched.ll8
-rw-r--r--test/CodeGen/X86/atomic-dagsched.ll2
-rw-r--r--test/CodeGen/X86/atomic-eflags-reuse.ll238
-rw-r--r--test/CodeGen/X86/atomic-load-store-wide.ll2
-rw-r--r--test/CodeGen/X86/atomic-minmax-i6432.ll325
-rw-r--r--test/CodeGen/X86/atomic-or.ll2
-rw-r--r--test/CodeGen/X86/atomic128.ll154
-rw-r--r--test/CodeGen/X86/atomic32.ll6
-rw-r--r--test/CodeGen/X86/atomic64.ll2
-rw-r--r--test/CodeGen/X86/atomic6432.ll2
-rw-r--r--test/CodeGen/X86/atomic8.ll4
-rw-r--r--test/CodeGen/X86/atomic_add.ll4
-rw-r--r--test/CodeGen/X86/atomic_idempotent.ll4
-rw-r--r--test/CodeGen/X86/atomic_op.ll2
-rw-r--r--test/CodeGen/X86/avg-mask.ll445
-rw-r--r--test/CodeGen/X86/avg.ll854
-rw-r--r--test/CodeGen/X86/avoid-loop-align-2.ll2
-rw-r--r--test/CodeGen/X86/avx-arith.ll62
-rw-r--r--test/CodeGen/X86/avx-basic.ll30
-rw-r--r--test/CodeGen/X86/avx-bitcast.ll2
-rw-r--r--test/CodeGen/X86/avx-cast.ll38
-rw-r--r--test/CodeGen/X86/avx-cmp.ll34
-rw-r--r--test/CodeGen/X86/avx-cvt-2.ll32
-rw-r--r--test/CodeGen/X86/avx-cvt-3.ll44
-rw-r--r--test/CodeGen/X86/avx-cvt.ll32
-rw-r--r--test/CodeGen/X86/avx-gfni-intrinsics.ll63
-rw-r--r--test/CodeGen/X86/avx-insertelt.ll16
-rw-r--r--test/CodeGen/X86/avx-intrinsics-fast-isel.ll998
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll349
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll962
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86_64.ll28
-rw-r--r--test/CodeGen/X86/avx-load-store.ll84
-rw-r--r--test/CodeGen/X86/avx-logic.ll64
-rw-r--r--test/CodeGen/X86/avx-minmax.ll2
-rw-r--r--test/CodeGen/X86/avx-schedule.ll4457
-rw-r--r--test/CodeGen/X86/avx-select.ll65
-rw-r--r--test/CodeGen/X86/avx-shift.ll32
-rwxr-xr-xtest/CodeGen/X86/avx-shuffle-x86_32.ll6
-rw-r--r--test/CodeGen/X86/avx-splat.ll28
-rwxr-xr-xtest/CodeGen/X86/avx-trunc.ll6
-rw-r--r--test/CodeGen/X86/avx-unpack.ll48
-rw-r--r--test/CodeGen/X86/avx-vbroadcast.ll212
-rw-r--r--test/CodeGen/X86/avx-vbroadcastf128.ll60
-rw-r--r--test/CodeGen/X86/avx-vextractf128.ll22
-rw-r--r--test/CodeGen/X86/avx-vinsertf128.ll26
-rw-r--r--test/CodeGen/X86/avx-vpclmulqdq.ll13
-rw-r--r--test/CodeGen/X86/avx-vperm2x128.ll257
-rw-r--r--test/CodeGen/X86/avx-vzeroupper.ll48
-rw-r--r--test/CodeGen/X86/avx1-logical-load-folding.ll16
-rw-r--r--test/CodeGen/X86/avx2-arith.ll104
-rw-r--r--test/CodeGen/X86/avx2-cmp.ll32
-rwxr-xr-xtest/CodeGen/X86/avx2-conversions.ll76
-rw-r--r--test/CodeGen/X86/avx2-fma-fneg-combine.ll24
-rw-r--r--test/CodeGen/X86/avx2-gather.ll32
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-fast-isel.ll2733
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll289
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86.ll1341
-rw-r--r--test/CodeGen/X86/avx2-logic.ll28
-rw-r--r--test/CodeGen/X86/avx2-masked-gather.ll807
-rw-r--r--test/CodeGen/X86/avx2-nontemporal.ll60
-rw-r--r--test/CodeGen/X86/avx2-phaddsub.ll32
-rw-r--r--test/CodeGen/X86/avx2-pmovxrm.ll48
-rw-r--r--test/CodeGen/X86/avx2-schedule.ll6949
-rw-r--r--test/CodeGen/X86/avx2-shift.ll170
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll393
-rw-r--r--test/CodeGen/X86/avx2-vbroadcasti128.ll84
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll170
-rwxr-xr-xtest/CodeGen/X86/avx2-vperm.ll28
-rw-r--r--test/CodeGen/X86/avx512-adc-sbb.ll2
-rw-r--r--test/CodeGen/X86/avx512-any_extend_load.ll14
-rw-r--r--test/CodeGen/X86/avx512-arith.ll316
-rw-r--r--test/CodeGen/X86/avx512-bugfix-23634.ll4
-rw-r--r--test/CodeGen/X86/avx512-bugfix-25270.ll2
-rw-r--r--test/CodeGen/X86/avx512-bugfix-26264.ll4
-rw-r--r--test/CodeGen/X86/avx512-build-vector.ll10
-rw-r--r--test/CodeGen/X86/avx512-calling-conv.ll123
-rw-r--r--test/CodeGen/X86/avx512-cmp-kor-sequence.ll4
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll32
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll1810
-rw-r--r--test/CodeGen/X86/avx512-ext.ll382
-rw-r--r--test/CodeGen/X86/avx512-extract-subvector-load-store.ll1458
-rw-r--r--test/CodeGen/X86/avx512-extract-subvector.ll158
-rw-r--r--test/CodeGen/X86/avx512-fma-commute.ll95
-rw-r--r--test/CodeGen/X86/avx512-fma-intrinsics.ll128
-rw-r--r--test/CodeGen/X86/avx512-fma.ll34
-rw-r--r--test/CodeGen/X86/avx512-fsel.ll14
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll118
-rw-r--r--test/CodeGen/X86/avx512-gfni-intrinsics.ll183
-rw-r--r--test/CodeGen/X86/avx512-hadd-hsub.ll303
-rwxr-xr-xtest/CodeGen/X86/avx512-i1test.ll8
-rw-r--r--test/CodeGen/X86/avx512-inc-dec.ll2
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll2749
-rw-r--r--test/CodeGen/X86/avx512-insert-extract_i1.ll11
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-fast-isel.ll962
-rw-r--r--test/CodeGen/X86/avx512-intrinsics-upgrade.ll930
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll1242
-rw-r--r--test/CodeGen/X86/avx512-load-store.ll44
-rw-r--r--test/CodeGen/X86/avx512-load-trunc-store-i1.ll151
-rw-r--r--test/CodeGen/X86/avx512-logic.ll170
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll2123
-rw-r--r--test/CodeGen/X86/avx512-mask-spills.ll15
-rwxr-xr-xtest/CodeGen/X86/avx512-mask-zext-bugfix.ll49
-rw-r--r--test/CodeGen/X86/avx512-masked-memop-64-32.ll52
-rw-r--r--test/CodeGen/X86/avx512-masked_memop-16-8.ll24
-rw-r--r--test/CodeGen/X86/avx512-memfold.ll32
-rw-r--r--test/CodeGen/X86/avx512-mov.ll152
-rw-r--r--test/CodeGen/X86/avx512-nontemporal.ll19
-rw-r--r--test/CodeGen/X86/avx512-pmovxrm.ll48
-rw-r--r--test/CodeGen/X86/avx512-regcall-Mask.ll1054
-rw-r--r--test/CodeGen/X86/avx512-regcall-NoMask.ll1392
-rw-r--r--test/CodeGen/X86/avx512-rotate.ll71
-rw-r--r--test/CodeGen/X86/avx512-round.ll106
-rw-r--r--test/CodeGen/X86/avx512-scalarIntrinsics.ll133
-rw-r--r--test/CodeGen/X86/avx512-scalar_mask.ll20
-rwxr-xr-xtest/CodeGen/X86/avx512-schedule.ll8864
-rw-r--r--test/CodeGen/X86/avx512-select.ll357
-rw-r--r--test/CodeGen/X86/avx512-shift.ll46
-rwxr-xr-xtest/CodeGen/X86/avx512-shuffle-schedule.ll17005
-rw-r--r--test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll1238
-rw-r--r--test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll2807
-rw-r--r--test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll1163
-rw-r--r--test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll1430
-rw-r--r--test/CodeGen/X86/avx512-shuffles/duplicate-high.ll849
-rw-r--r--test/CodeGen/X86/avx512-shuffles/duplicate-low.ll1536
-rw-r--r--test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll1866
-rw-r--r--test/CodeGen/X86/avx512-shuffles/partial_permute.ll4808
-rw-r--r--test/CodeGen/X86/avx512-shuffles/permute.ll3129
-rw-r--r--test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll1400
-rw-r--r--test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll2037
-rw-r--r--test/CodeGen/X86/avx512-shuffles/shuffle.ll2984
-rw-r--r--test/CodeGen/X86/avx512-shuffles/unpack.ll2797
-rw-r--r--test/CodeGen/X86/avx512-skx-insert-subvec.ll63
-rw-r--r--test/CodeGen/X86/avx512-trunc.ll225
-rw-r--r--test/CodeGen/X86/avx512-unsafe-fp-math.ll26
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll94
-rw-r--r--test/CodeGen/X86/avx512-vbroadcasti128.ll136
-rw-r--r--test/CodeGen/X86/avx512-vbroadcasti256.ll60
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll634
-rw-r--r--test/CodeGen/X86/avx512-vec3-crash.ll8
-rw-r--r--test/CodeGen/X86/avx512-vpclmulqdq.ll11
-rw-r--r--test/CodeGen/X86/avx512-vpermv3-commute.ll58
-rw-r--r--test/CodeGen/X86/avx512-vpternlog-commute.ll184
-rw-r--r--test/CodeGen/X86/avx512-vselect-crash.ll8
-rw-r--r--test/CodeGen/X86/avx512-vselect.ll41
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll2655
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll3246
-rw-r--r--test/CodeGen/X86/avx512bw-intrinsics.ll742
-rw-r--r--test/CodeGen/X86/avx512bw-mask-op.ll28
-rw-r--r--test/CodeGen/X86/avx512bw-mov.ll84
-rw-r--r--test/CodeGen/X86/avx512bw-vec-cmp.ll24
-rw-r--r--test/CodeGen/X86/avx512bw-vec-test-testn.ll145
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll643
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll911
-rw-r--r--test/CodeGen/X86/avx512bwvl-intrinsics.ll759
-rw-r--r--test/CodeGen/X86/avx512bwvl-mov.ll58
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-cmp.ll48
-rw-r--r--test/CodeGen/X86/avx512bwvl-vec-test-testn.ll288
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll37
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll31
-rw-r--r--test/CodeGen/X86/avx512cd-intrinsics.ll36
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll52
-rw-r--r--test/CodeGen/X86/avx512cdvl-intrinsics.ll59
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll219
-rw-r--r--test/CodeGen/X86/avx512dq-intrinsics.ll303
-rw-r--r--test/CodeGen/X86/avx512dq-mask-op.ll12
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll421
-rw-r--r--test/CodeGen/X86/avx512dqvl-intrinsics.ll245
-rw-r--r--test/CodeGen/X86/avx512er-intrinsics.ll78
-rw-r--r--test/CodeGen/X86/avx512f-vec-test-testn.ll147
-rw-r--r--test/CodeGen/X86/avx512ifma-intrinsics.ll168
-rw-r--r--test/CodeGen/X86/avx512ifmavl-intrinsics.ll25
-rw-r--r--test/CodeGen/X86/avx512vbmi-intrinsics.ll14
-rw-r--r--test/CodeGen/X86/avx512vbmi2-intrinsics.ll327
-rw-r--r--test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll657
-rw-r--r--test/CodeGen/X86/avx512vbmivl-intrinsics.ll26
-rw-r--r--test/CodeGen/X86/avx512vl-arith.ll192
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll1122
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll1401
-rw-r--r--test/CodeGen/X86/avx512vl-intrinsics.ll1175
-rw-r--r--test/CodeGen/X86/avx512vl-logic.ll160
-rw-r--r--test/CodeGen/X86/avx512vl-mov.ll174
-rw-r--r--test/CodeGen/X86/avx512vl-nontemporal.ll24
-rw-r--r--test/CodeGen/X86/avx512vl-vbroadcast.ll60
-rw-r--r--test/CodeGen/X86/avx512vl-vec-cmp.ll325
-rw-r--r--test/CodeGen/X86/avx512vl-vec-masked-cmp.ll40936
-rw-r--r--test/CodeGen/X86/avx512vl-vec-test-testn.ll440
-rw-r--r--test/CodeGen/X86/avx512vl-vpclmulqdq.ll22
-rw-r--r--test/CodeGen/X86/avx512vl_vnni-intrinsics.ll195
-rw-r--r--test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll75
-rw-r--r--test/CodeGen/X86/avx512vnni-intrinsics.ll98
-rw-r--r--test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll16
-rw-r--r--test/CodeGen/X86/avx512vpopcntdq-schedule.ll79
-rw-r--r--test/CodeGen/X86/barrier.ll2
-rw-r--r--test/CodeGen/X86/base-pointer-and-cmpxchg.ll12
-rw-r--r--test/CodeGen/X86/basic-promote-integers.ll4
-rw-r--r--test/CodeGen/X86/bc-extract.ll12
-rw-r--r--test/CodeGen/X86/bigstructret.ll47
-rw-r--r--test/CodeGen/X86/bigstructret2.ll2
-rw-r--r--test/CodeGen/X86/bit-test-shift.ll2
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-128.ll645
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-256.ll693
-rw-r--r--test/CodeGen/X86/bitcast-and-setcc-512.ll943
-rw-r--r--test/CodeGen/X86/bitcast-i256.ll11
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll3716
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll3843
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector-bool.ll742
-rw-r--r--test/CodeGen/X86/bitcast-int-to-vector.ll33
-rw-r--r--test/CodeGen/X86/bitcast-mmx.ll16
-rw-r--r--test/CodeGen/X86/bitcast-setcc-128.ll504
-rw-r--r--test/CodeGen/X86/bitcast-setcc-256.ll542
-rw-r--r--test/CodeGen/X86/bitcast-setcc-512.ll543
-rw-r--r--test/CodeGen/X86/bitcast.ll4
-rw-r--r--test/CodeGen/X86/bitcast2.ll4
-rw-r--r--test/CodeGen/X86/bitreverse.ll80
-rw-r--r--test/CodeGen/X86/block-placement.ll8
-rw-r--r--test/CodeGen/X86/block-placement.mir14
-rw-r--r--test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll24
-rw-r--r--test/CodeGen/X86/bmi-intrinsics-fast-isel.ll64
-rw-r--r--test/CodeGen/X86/bmi-schedule.ll621
-rw-r--r--test/CodeGen/X86/bmi.ll465
-rw-r--r--test/CodeGen/X86/bmi2-schedule.ll797
-rw-r--r--test/CodeGen/X86/bmi2.ll99
-rw-r--r--test/CodeGen/X86/bool-ext-inc.ll14
-rw-r--r--test/CodeGen/X86/bool-simplify.ll24
-rw-r--r--test/CodeGen/X86/bool-vector.ll200
-rw-r--r--test/CodeGen/X86/bool-zext.ll74
-rw-r--r--test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir288
-rw-r--r--test/CodeGen/X86/branchfolding-undef.mir2
-rw-r--r--test/CodeGen/X86/break-anti-dependencies.ll4
-rw-r--r--test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll388
-rw-r--r--test/CodeGen/X86/broadcastm-lowering.ll212
-rw-r--r--test/CodeGen/X86/bss_pagealigned.ll2
-rw-r--r--test/CodeGen/X86/bswap-rotate.ll4
-rw-r--r--test/CodeGen/X86/bswap-vector.ll106
-rw-r--r--test/CodeGen/X86/bswap-wide-int.ll24
-rw-r--r--test/CodeGen/X86/bswap.ll4
-rw-r--r--test/CodeGen/X86/bswap_tree.ll8
-rw-r--r--test/CodeGen/X86/bswap_tree2.ll12
-rw-r--r--test/CodeGen/X86/bt.ll985
-rw-r--r--test/CodeGen/X86/btq.ll25
-rw-r--r--test/CodeGen/X86/bug26810.ll312
-rw-r--r--test/CodeGen/X86/build-vector-128.ll66
-rw-r--r--test/CodeGen/X86/build-vector-256.ll40
-rw-r--r--test/CodeGen/X86/build-vector-512.ll40
-rw-r--r--test/CodeGen/X86/buildvec-insertvec.ll87
-rw-r--r--test/CodeGen/X86/bypass-slow-division-32.ll46
-rw-r--r--test/CodeGen/X86/bypass-slow-division-64.ll20
-rw-r--r--test/CodeGen/X86/bypass-slow-division-tune.ll28
-rw-r--r--test/CodeGen/X86/byval.ll2
-rw-r--r--test/CodeGen/X86/byval2.ll2
-rw-r--r--test/CodeGen/X86/byval3.ll2
-rw-r--r--test/CodeGen/X86/byval4.ll2
-rw-r--r--test/CodeGen/X86/byval5.ll2
-rw-r--r--test/CodeGen/X86/byval6.ll2
-rw-r--r--test/CodeGen/X86/byval7.ll2
-rw-r--r--test/CodeGen/X86/call-imm.ll2
-rw-r--r--test/CodeGen/X86/cast-vsel.ll255
-rw-r--r--test/CodeGen/X86/catchpad-weight.ll4
-rw-r--r--test/CodeGen/X86/cfi-xmm.ll21
-rw-r--r--test/CodeGen/X86/chain_order.ll2
-rw-r--r--test/CodeGen/X86/change-compare-stride-1.ll2
-rw-r--r--test/CodeGen/X86/change-compare-stride-trickiness-1.ll2
-rw-r--r--test/CodeGen/X86/clear_upper_vector_element_bits.ll1148
-rw-r--r--test/CodeGen/X86/clflushopt-schedule.ll36
-rw-r--r--test/CodeGen/X86/clflushopt.ll18
-rw-r--r--test/CodeGen/X86/clwb-schedule.ll18
-rw-r--r--test/CodeGen/X86/clwb.ll13
-rw-r--r--test/CodeGen/X86/clz.ll292
-rw-r--r--test/CodeGen/X86/clzero-schedule.ll20
-rw-r--r--test/CodeGen/X86/clzero.ll4
-rw-r--r--test/CodeGen/X86/cmov-fp.ll8
-rw-r--r--test/CodeGen/X86/cmov-into-branch.ll46
-rw-r--r--test/CodeGen/X86/cmov-promotion.ll317
-rw-r--r--test/CodeGen/X86/cmov-schedule.ll2004
-rw-r--r--test/CodeGen/X86/cmov.ll39
-rw-r--r--test/CodeGen/X86/cmovcmov.ll28
-rw-r--r--test/CodeGen/X86/cmp.ll430
-rw-r--r--test/CodeGen/X86/cmpxchg-clobber-flags.ll42
-rw-r--r--test/CodeGen/X86/cmpxchg16b.ll2
-rw-r--r--test/CodeGen/X86/cmpxchg8b_alloca_regalloc_handling.ll2
-rw-r--r--test/CodeGen/X86/coalesce_commute_movsd.ll16
-rw-r--r--test/CodeGen/X86/coalescer-dce.ll28
-rw-r--r--test/CodeGen/X86/code_placement.ll2
-rw-r--r--test/CodeGen/X86/codegen-prepare-cast.ll2
-rw-r--r--test/CodeGen/X86/combine-64bit-vec-binop.ll42
-rw-r--r--test/CodeGen/X86/combine-abs.ll57
-rw-r--r--test/CodeGen/X86/combine-add.ll68
-rw-r--r--test/CodeGen/X86/combine-and.ll69
-rw-r--r--test/CodeGen/X86/combine-avx-intrinsics.ll12
-rw-r--r--test/CodeGen/X86/combine-avx2-intrinsics.ll18
-rw-r--r--test/CodeGen/X86/combine-fcopysign.ll65
-rw-r--r--test/CodeGen/X86/combine-lds.ll2
-rw-r--r--test/CodeGen/X86/combine-mul.ll135
-rw-r--r--test/CodeGen/X86/combine-multiplies.ll8
-rw-r--r--test/CodeGen/X86/combine-or.ll80
-rw-r--r--test/CodeGen/X86/combine-pmuldq.ll18
-rw-r--r--test/CodeGen/X86/combine-rotates.ll12
-rw-r--r--test/CodeGen/X86/combine-sdiv.ll34
-rw-r--r--test/CodeGen/X86/combine-sext-in-reg.ll8
-rw-r--r--test/CodeGen/X86/combine-shl.ll156
-rw-r--r--test/CodeGen/X86/combine-sra.ll122
-rw-r--r--test/CodeGen/X86/combine-srem.ll18
-rw-r--r--test/CodeGen/X86/combine-srl.ll151
-rw-r--r--test/CodeGen/X86/combine-sse41-intrinsics.ll18
-rw-r--r--test/CodeGen/X86/combine-sub.ll60
-rw-r--r--test/CodeGen/X86/combine-testm-and.ll8
-rw-r--r--test/CodeGen/X86/combine-udiv.ll36
-rw-r--r--test/CodeGen/X86/combine-urem.ll42
-rw-r--r--test/CodeGen/X86/commute-3dnow.ll36
-rw-r--r--test/CodeGen/X86/commute-blend-avx2.ll20
-rw-r--r--test/CodeGen/X86/commute-blend-sse41.ll6
-rw-r--r--test/CodeGen/X86/commute-clmul.ll21
-rw-r--r--test/CodeGen/X86/commute-fcmp.ll405
-rw-r--r--test/CodeGen/X86/commute-vpclmulqdq-avx.ll42
-rw-r--r--test/CodeGen/X86/commute-vpclmulqdq-avx512.ll116
-rw-r--r--test/CodeGen/X86/commute-xop.ll80
-rw-r--r--test/CodeGen/X86/compare-add.ll2
-rw-r--r--test/CodeGen/X86/compare-inf.ll2
-rw-r--r--test/CodeGen/X86/compare_folding.ll4
-rw-r--r--test/CodeGen/X86/complex-fastmath.ll24
-rw-r--r--test/CodeGen/X86/complex-fca.ll2
-rw-r--r--test/CodeGen/X86/compress_expand.ll123
-rw-r--r--test/CodeGen/X86/computeKnownBits_urem.ll4
-rw-r--r--test/CodeGen/X86/conditional-indecrement.ll18
-rw-r--r--test/CodeGen/X86/conditional-tailcall-samedest.mir22
-rw-r--r--test/CodeGen/X86/constant-combines.ll5
-rw-r--r--test/CodeGen/X86/constant-hoisting-and.ll2
-rw-r--r--test/CodeGen/X86/constant-hoisting-cmp.ll2
-rw-r--r--test/CodeGen/X86/constant-hoisting-shift-immediate.ll2
-rw-r--r--test/CodeGen/X86/constant-pool-remat-0.ll2
-rw-r--r--test/CodeGen/X86/constpool.ll9
-rw-r--r--test/CodeGen/X86/constructor.ll27
-rw-r--r--test/CodeGen/X86/copysign-constant-magnitude.ll16
-rw-r--r--test/CodeGen/X86/cpus.ll59
-rw-r--r--test/CodeGen/X86/crash.ll10
-rw-r--r--test/CodeGen/X86/critical-edge-split-2.ll33
-rw-r--r--test/CodeGen/X86/ctpop-combine.ll12
-rw-r--r--test/CodeGen/X86/cvt16.ll8
-rw-r--r--test/CodeGen/X86/cvtv2f32.ll8
-rw-r--r--test/CodeGen/X86/dag-fmf-cse.ll2
-rw-r--r--test/CodeGen/X86/dag-merge-fast-accesses.ll12
-rw-r--r--test/CodeGen/X86/dag-rauw-cse.ll2
-rw-r--r--test/CodeGen/X86/dagcombine-buildvector.ll8
-rw-r--r--test/CodeGen/X86/dagcombine-cse.ll8
-rw-r--r--test/CodeGen/X86/darwin-preemption.ll251
-rw-r--r--test/CodeGen/X86/dbg-baseptr.ll20
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll4
-rw-r--r--test/CodeGen/X86/dbg-changes-codegen.ll2
-rw-r--r--test/CodeGen/X86/dbg-line-0-no-discriminator.ll39
-rw-r--r--test/CodeGen/X86/debug-nodebug-crash.ll61
-rw-r--r--test/CodeGen/X86/debugloc-no-line-0.ll49
-rw-r--r--test/CodeGen/X86/deopt-intrinsic-cconv.ll1
-rw-r--r--test/CodeGen/X86/deopt-intrinsic.ll2
-rw-r--r--test/CodeGen/X86/disable-tail-calls.ll6
-rw-r--r--test/CodeGen/X86/discontiguous-loops.ll2
-rw-r--r--test/CodeGen/X86/div-rem-simplify.ll40
-rw-r--r--test/CodeGen/X86/divide-by-constant.ll102
-rw-r--r--test/CodeGen/X86/divrem.ll36
-rw-r--r--test/CodeGen/X86/divrem8_ext.ll76
-rw-r--r--test/CodeGen/X86/dllimport-x86_64.ll5
-rw-r--r--test/CodeGen/X86/dllimport.ll5
-rw-r--r--test/CodeGen/X86/dollar-name.ll2
-rw-r--r--test/CodeGen/X86/domain-reassignment.mir754
-rw-r--r--test/CodeGen/X86/dont-trunc-store-double-to-float.ll2
-rw-r--r--test/CodeGen/X86/dwarf-comp-dir.ll2
-rw-r--r--test/CodeGen/X86/dwarf-headers.ll10
-rw-r--r--test/CodeGen/X86/dynamic-alloca-lifetime.ll2
-rw-r--r--test/CodeGen/X86/dynamic-allocas-VLAs.ll4
-rw-r--r--test/CodeGen/X86/eflags-copy-expansion.mir2
-rw-r--r--test/CodeGen/X86/empty-functions.ll4
-rw-r--r--test/CodeGen/X86/empty-struct-return-type.ll2
-rw-r--r--test/CodeGen/X86/emutls-pic.ll8
-rw-r--r--test/CodeGen/X86/emutls-pie.ll8
-rw-r--r--test/CodeGen/X86/emutls.ll8
-rw-r--r--test/CodeGen/X86/emutls_generic.ll4
-rw-r--r--test/CodeGen/X86/epilogue.ll2
-rwxr-xr-xtest/CodeGen/X86/evex-to-vex-compress.mir4814
-rw-r--r--test/CodeGen/X86/exedeps-movq.ll16
-rw-r--r--test/CodeGen/X86/exedepsfix-broadcast.ll16
-rw-r--r--test/CodeGen/X86/expand-vr64-gr64-copy.mir8
-rw-r--r--test/CodeGen/X86/extend.ll4
-rw-r--r--test/CodeGen/X86/extended-fma-contraction.ll4
-rw-r--r--test/CodeGen/X86/extmul128.ll2
-rw-r--r--test/CodeGen/X86/extmul64.ll2
-rw-r--r--test/CodeGen/X86/extract-combine.ll2
-rw-r--r--test/CodeGen/X86/extract-extract.ll2
-rw-r--r--test/CodeGen/X86/extract-store.ll284
-rw-r--r--test/CodeGen/X86/extractelement-from-arg.ll2
-rw-r--r--test/CodeGen/X86/extractelement-index.ll223
-rw-r--r--test/CodeGen/X86/extractelement-legalization-store-ordering.ll24
-rw-r--r--test/CodeGen/X86/extractelement-load.ll24
-rw-r--r--test/CodeGen/X86/extractps.ll2
-rw-r--r--test/CodeGen/X86/f16c-intrinsics-fast-isel.ll28
-rw-r--r--test/CodeGen/X86/f16c-intrinsics.ll315
-rw-r--r--test/CodeGen/X86/f16c-schedule.ll227
-rw-r--r--test/CodeGen/X86/fadd-combines.ll36
-rw-r--r--test/CodeGen/X86/fast-cc-callee-pops.ll2
-rw-r--r--test/CodeGen/X86/fast-cc-merge-stack-adj.ll2
-rw-r--r--test/CodeGen/X86/fast-cc-pass-in-regs.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-agg-constant.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-atomic.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-bail.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-bc.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-call-cleanup.ll19
-rw-r--r--test/CodeGen/X86/fast-isel-call.ll4
-rw-r--r--test/CodeGen/X86/fast-isel-cmp.ll1040
-rw-r--r--test/CodeGen/X86/fast-isel-constant.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-constpool.ll59
-rw-r--r--test/CodeGen/X86/fast-isel-emutls.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-expect.ll4
-rw-r--r--test/CodeGen/X86/fast-isel-fneg.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-fptrunc-fpext.ll94
-rw-r--r--test/CodeGen/X86/fast-isel-gep.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll100
-rw-r--r--test/CodeGen/X86/fast-isel-int-float-conversion.ll259
-rw-r--r--test/CodeGen/X86/fast-isel-load-i1.ll4
-rw-r--r--test/CodeGen/X86/fast-isel-nontemporal.ll393
-rw-r--r--test/CodeGen/X86/fast-isel-noplt-pic.ll16
-rw-r--r--test/CodeGen/X86/fast-isel-select-cmov.ll12
-rw-r--r--test/CodeGen/X86/fast-isel-select-cmov2.ll384
-rw-r--r--test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll283
-rw-r--r--test/CodeGen/X86/fast-isel-select-sse.ll144
-rw-r--r--test/CodeGen/X86/fast-isel-sext-zext.ll378
-rw-r--r--test/CodeGen/X86/fast-isel-shift.ll383
-rw-r--r--test/CodeGen/X86/fast-isel-store.ll198
-rw-r--r--test/CodeGen/X86/fast-isel-tailcall.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-tls.ll2
-rw-r--r--test/CodeGen/X86/fast-isel-vecload.ll326
-rw-r--r--test/CodeGen/X86/fast-isel-x86.ll19
-rw-r--r--test/CodeGen/X86/fast-isel.ll2
-rw-r--r--test/CodeGen/X86/fastcc-sret.ll2
-rw-r--r--test/CodeGen/X86/fastcc3struct.ll2
-rw-r--r--test/CodeGen/X86/fastisel-softfloat.ll2
-rw-r--r--test/CodeGen/X86/fcmove.ll2
-rw-r--r--test/CodeGen/X86/fdiv-combine.ll12
-rw-r--r--test/CodeGen/X86/fdiv.ll12
-rw-r--r--test/CodeGen/X86/fentry-insertion.ll16
-rw-r--r--test/CodeGen/X86/field-extract-use-trunc.ll4
-rw-r--r--test/CodeGen/X86/fildll.ll2
-rw-r--r--test/CodeGen/X86/file-directive.ll13
-rw-r--r--test/CodeGen/X86/finite-libcalls.ll52
-rw-r--r--test/CodeGen/X86/fixup-bw-copy.ll20
-rw-r--r--test/CodeGen/X86/fixup-bw-inst.ll4
-rw-r--r--test/CodeGen/X86/fixup-bw-inst.mir151
-rw-r--r--test/CodeGen/X86/fixup-lea.ll2
-rw-r--r--test/CodeGen/X86/float-conv-elim.ll2
-rw-r--r--test/CodeGen/X86/floor-soft-float.ll4
-rw-r--r--test/CodeGen/X86/fma-commute-x86.ll744
-rw-r--r--test/CodeGen/X86/fma-fneg-combine.ll103
-rw-r--r--test/CodeGen/X86/fma-intrinsics-x86.ll1153
-rw-r--r--test/CodeGen/X86/fma-phi-213-to-231.ll2
-rw-r--r--test/CodeGen/X86/fma-scalar-memfold.ll145
-rw-r--r--test/CodeGen/X86/fma-schedule.ll2920
-rw-r--r--test/CodeGen/X86/fma.ll96
-rw-r--r--test/CodeGen/X86/fma4-commute-x86.ll563
-rw-r--r--test/CodeGen/X86/fma4-fneg-combine.ll111
-rw-r--r--test/CodeGen/X86/fma4-intrinsics-x86.ll289
-rw-r--r--test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll77
-rw-r--r--test/CodeGen/X86/fma4-scalar-memfold.ll104
-rw-r--r--test/CodeGen/X86/fma4-schedule.ll758
-rw-r--r--test/CodeGen/X86/fma_patterns.ll452
-rw-r--r--test/CodeGen/X86/fma_patterns_wide.ll306
-rw-r--r--test/CodeGen/X86/fmaddsub-combine.ll1038
-rw-r--r--test/CodeGen/X86/fmf-flags.ll104
-rw-r--r--test/CodeGen/X86/fmsubadd-combine.ll30
-rw-r--r--test/CodeGen/X86/fmul-combines.ll2
-rw-r--r--test/CodeGen/X86/fmul-zero.ll4
-rw-r--r--test/CodeGen/X86/fold-add.ll2
-rw-r--r--test/CodeGen/X86/fold-and-shift.ll2
-rw-r--r--test/CodeGen/X86/fold-call.ll4
-rw-r--r--test/CodeGen/X86/fold-imm.ll2
-rw-r--r--test/CodeGen/X86/fold-load-binops.ll32
-rw-r--r--test/CodeGen/X86/fold-load-unops.ll109
-rw-r--r--test/CodeGen/X86/fold-load-vec.ll2
-rw-r--r--test/CodeGen/X86/fold-load.ll2
-rw-r--r--test/CodeGen/X86/fold-mul-lohi.ll2
-rw-r--r--test/CodeGen/X86/fold-pcmpeqd-1.ll2
-rw-r--r--test/CodeGen/X86/fold-push.ll2
-rw-r--r--test/CodeGen/X86/fold-rmw-ops.ll2439
-rw-r--r--test/CodeGen/X86/fold-sext-trunc.ll2
-rw-r--r--test/CodeGen/X86/fold-vector-sext-crash.ll9
-rw-r--r--test/CodeGen/X86/fold-vector-sext-crash2.ll195
-rw-r--r--test/CodeGen/X86/fold-vector-sext-zext.ll94
-rw-r--r--test/CodeGen/X86/fold-vector-shl-crash.ll4
-rw-r--r--test/CodeGen/X86/fp-elim.ll4
-rw-r--r--test/CodeGen/X86/fp-fast.ll22
-rw-r--r--test/CodeGen/X86/fp-immediate-shorten.ll2
-rw-r--r--test/CodeGen/X86/fp-intrinsics.ll81
-rw-r--r--test/CodeGen/X86/fp-load-trunc.ll16
-rw-r--r--test/CodeGen/X86/fp-logic-replace.ll24
-rw-r--r--test/CodeGen/X86/fp-logic.ll42
-rw-r--r--test/CodeGen/X86/fp-select-cmp-and.ll36
-rw-r--r--test/CodeGen/X86/fp-stack-2results.ll4
-rw-r--r--test/CodeGen/X86/fp-stack-compare-cmov.ll2
-rw-r--r--test/CodeGen/X86/fp-stack-compare.ll2
-rw-r--r--test/CodeGen/X86/fp-stack-direct-ret.ll4
-rw-r--r--test/CodeGen/X86/fp-stack-ret.ll2
-rw-r--r--test/CodeGen/X86/fp-stack-retcopy.ll2
-rw-r--r--test/CodeGen/X86/fp-stack-set-st1.ll2
-rw-r--r--test/CodeGen/X86/fp-trunc.ll16
-rw-r--r--test/CodeGen/X86/fp-une-cmp.ll8
-rw-r--r--test/CodeGen/X86/fp128-cast.ll12
-rw-r--r--test/CodeGen/X86/fp128-g.ll2
-rw-r--r--test/CodeGen/X86/fp128-i128.ll32
-rw-r--r--test/CodeGen/X86/fp128-select.ll10
-rw-r--r--test/CodeGen/X86/fp2sint.ll2
-rw-r--r--test/CodeGen/X86/fp_constant_op.ll2
-rw-r--r--test/CodeGen/X86/fp_load_cast_fold.ll4
-rw-r--r--test/CodeGen/X86/fp_load_fold.ll2
-rw-r--r--test/CodeGen/X86/fpcmp-soft-fp.ll2
-rw-r--r--test/CodeGen/X86/fpstack-debuginstr-kill.ll4
-rw-r--r--test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll3
-rw-r--r--test/CodeGen/X86/frameaddr.ll4
-rw-r--r--test/CodeGen/X86/fsgsbase-schedule.ll411
-rw-r--r--test/CodeGen/X86/fsgsbase.ll43
-rw-r--r--test/CodeGen/X86/fsxor-alignment.ll2
-rw-r--r--test/CodeGen/X86/full-lsr.ll12
-rw-r--r--test/CodeGen/X86/function-subtarget-features-2.ll2
-rw-r--r--test/CodeGen/X86/function-subtarget-features.ll2
-rw-r--r--test/CodeGen/X86/gather-addresses.ll16
-rw-r--r--test/CodeGen/X86/getelementptr.ll8
-rw-r--r--test/CodeGen/X86/gfni-intrinsics.ll33
-rw-r--r--test/CodeGen/X86/ghc-cc.ll8
-rw-r--r--test/CodeGen/X86/ghc-cc64.ll20
-rw-r--r--test/CodeGen/X86/global-access-pie-copyrelocs.ll31
-rw-r--r--test/CodeGen/X86/global-access-pie.ll4
-rw-r--r--test/CodeGen/X86/gpr-to-mask.ll558
-rw-r--r--test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll116
-rw-r--r--test/CodeGen/X86/h-register-addressing-32.ll2
-rw-r--r--test/CodeGen/X86/h-register-addressing-64.ll2
-rw-r--r--test/CodeGen/X86/h-register-store.ll2
-rw-r--r--test/CodeGen/X86/h-registers-0.ll2
-rw-r--r--test/CodeGen/X86/h-registers-1.ll90
-rw-r--r--test/CodeGen/X86/h-registers-2.ll2
-rw-r--r--test/CodeGen/X86/haddsub-2.ll150
-rw-r--r--test/CodeGen/X86/haddsub-shuf.ll143
-rw-r--r--test/CodeGen/X86/haddsub-undef.ll80
-rw-r--r--test/CodeGen/X86/haddsub.ll97
-rw-r--r--test/CodeGen/X86/half.ll120
-rw-r--r--test/CodeGen/X86/handle-move.ll26
-rw-r--r--test/CodeGen/X86/hoist-invariant-load.ll30
-rw-r--r--test/CodeGen/X86/hoist-spill.ll2
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smax.ll1940
-rw-r--r--test/CodeGen/X86/horizontal-reduce-smin.ll1942
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umax.ll2161
-rw-r--r--test/CodeGen/X86/horizontal-reduce-umin.ll2111
-rw-r--r--test/CodeGen/X86/horizontal-shuffle.ll272
-rw-r--r--test/CodeGen/X86/i128-and-beyond.ll2
-rw-r--r--test/CodeGen/X86/i128-immediate.ll2
-rw-r--r--test/CodeGen/X86/i128-mul.ll2
-rw-r--r--test/CodeGen/X86/i128-sdiv.ll2
-rw-r--r--test/CodeGen/X86/i16lshr8pat.ll2
-rw-r--r--test/CodeGen/X86/i256-add.ll8
-rw-r--r--test/CodeGen/X86/i2k.ll2
-rw-r--r--test/CodeGen/X86/i486-fence-loop.ll4
-rw-r--r--test/CodeGen/X86/i64-mem-copy.ll10
-rw-r--r--test/CodeGen/X86/i64-to-float.ll48
-rw-r--r--test/CodeGen/X86/iabs.ll123
-rw-r--r--test/CodeGen/X86/illegal-bitfield-loadstore.ll97
-rw-r--r--test/CodeGen/X86/illegal-insert.ll2
-rw-r--r--test/CodeGen/X86/illegal-vector-args-return.ll8
-rw-r--r--test/CodeGen/X86/immediate_merging.ll98
-rw-r--r--test/CodeGen/X86/immediate_merging64.ll4
-rw-r--r--test/CodeGen/X86/implicit-null-check-negative.ll16
-rw-r--r--test/CodeGen/X86/implicit-null-check.ll41
-rw-r--r--test/CodeGen/X86/implicit-null-checks.mir256
-rw-r--r--test/CodeGen/X86/implicit-use-spill.mir4
-rw-r--r--test/CodeGen/X86/imul-lea-2.ll19
-rw-r--r--test/CodeGen/X86/imul-lea.ll10
-rw-r--r--test/CodeGen/X86/imul.ll46
-rw-r--r--test/CodeGen/X86/inline-0bh.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-A-constraint.ll3
-rw-r--r--test/CodeGen/X86/inline-asm-R-constraint.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-avx-v-constraint-32bit.ll38
-rw-r--r--test/CodeGen/X86/inline-asm-avx-v-constraint.ll42
-rw-r--r--test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll20
-rw-r--r--test/CodeGen/X86/inline-asm-avx512vl-v-constraint-32bit.ll34
-rw-r--r--test/CodeGen/X86/inline-asm-avx512vl-v-constraint.ll36
-rw-r--r--test/CodeGen/X86/inline-asm-bad-constraint-n.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-duplicated-constraint.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-error.ll6
-rw-r--r--test/CodeGen/X86/inline-asm-flag-clobber.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-fpstack.ll501
-rw-r--r--test/CodeGen/X86/inline-asm-modifier-n.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-modifier-q.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-mrv.ll8
-rw-r--r--test/CodeGen/X86/inline-asm-q-regs.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-stack-realign.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-stack-realign3.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-tied.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-x-scalar.ll2
-rw-r--r--test/CodeGen/X86/inline-asm.ll2
-rw-r--r--test/CodeGen/X86/inline-sse.ll4
-rw-r--r--test/CodeGen/X86/inlineasm-sched-bug.ll2
-rw-r--r--test/CodeGen/X86/ins_split_regalloc.ll2
-rw-r--r--test/CodeGen/X86/ins_subreg_coalesce-1.ll2
-rw-r--r--test/CodeGen/X86/ins_subreg_coalesce-2.ll2
-rw-r--r--test/CodeGen/X86/ins_subreg_coalesce-3.ll2
-rw-r--r--test/CodeGen/X86/insert-into-constant-vector.ll465
-rw-r--r--test/CodeGen/X86/insert-positions.ll2
-rw-r--r--test/CodeGen/X86/insertelement-copytoregs.ll2
-rw-r--r--test/CodeGen/X86/insertelement-duplicates.ll12
-rw-r--r--test/CodeGen/X86/insertelement-legalize.ll2
-rw-r--r--test/CodeGen/X86/insertelement-ones.ll504
-rw-r--r--test/CodeGen/X86/insertelement-shuffle.ll145
-rw-r--r--test/CodeGen/X86/insertelement-zero.ll162
-rw-r--r--test/CodeGen/X86/insertps-combine.ll60
-rw-r--r--test/CodeGen/X86/insertps-from-constantpool.ll4
-rw-r--r--test/CodeGen/X86/insertps-unfold-load-bug.ll4
-rw-r--r--test/CodeGen/X86/int-intrinsic.ll4
-rw-r--r--test/CodeGen/X86/invalid-liveness.mir8
-rw-r--r--test/CodeGen/X86/invalid-shift-immediate.ll2
-rw-r--r--test/CodeGen/X86/ipra-inline-asm.ll2
-rw-r--r--test/CodeGen/X86/ipra-reg-alias.ll2
-rw-r--r--test/CodeGen/X86/ipra-reg-usage.ll2
-rw-r--r--test/CodeGen/X86/isel-optnone.ll2
-rw-r--r--test/CodeGen/X86/isel-sink.ll2
-rw-r--r--test/CodeGen/X86/isel-sink2.ll2
-rw-r--r--test/CodeGen/X86/isnan.ll2
-rw-r--r--test/CodeGen/X86/isnan2.ll2
-rw-r--r--test/CodeGen/X86/ispositive.ll2
-rw-r--r--test/CodeGen/X86/jump_sign.ll58
-rw-r--r--test/CodeGen/X86/known-bits-vector.ll240
-rw-r--r--test/CodeGen/X86/known-bits.ll24
-rw-r--r--test/CodeGen/X86/known-signbits-vector.ll295
-rw-r--r--test/CodeGen/X86/label-annotation.ll73
-rw-r--r--test/CodeGen/X86/lakemont.ll2
-rw-r--r--test/CodeGen/X86/large-code-model-isel.ll2
-rw-r--r--test/CodeGen/X86/large-gep-chain.ll2
-rw-r--r--test/CodeGen/X86/large-gep-scale.ll2
-rw-r--r--test/CodeGen/X86/lea-3.ll89
-rw-r--r--test/CodeGen/X86/lea-opt-cse1.ll46
-rw-r--r--test/CodeGen/X86/lea-opt-cse2.ll72
-rw-r--r--test/CodeGen/X86/lea-opt-cse3.ll162
-rw-r--r--test/CodeGen/X86/lea-opt-cse4.ll142
-rw-r--r--test/CodeGen/X86/lea-opt-memop-check-1.ll2
-rw-r--r--test/CodeGen/X86/lea-opt-with-debug.mir85
-rw-r--r--test/CodeGen/X86/lea-recursion.ll2
-rw-r--r--test/CodeGen/X86/lea32-schedule.ll825
-rw-r--r--test/CodeGen/X86/lea64-schedule.ll672
-rw-r--r--test/CodeGen/X86/leaFixup32.mir40
-rw-r--r--test/CodeGen/X86/leaFixup64.mir90
-rw-r--r--test/CodeGen/X86/legalize-fmp-oeq-vector-select.ll2
-rw-r--r--test/CodeGen/X86/legalize-libcalls.ll4
-rw-r--r--test/CodeGen/X86/legalize-shift-64.ll31
-rw-r--r--test/CodeGen/X86/legalize-shl-vec.ll38
-rw-r--r--test/CodeGen/X86/legalizedag_vec.ll2
-rw-r--r--test/CodeGen/X86/libcall-sret.ll6
-rw-r--r--test/CodeGen/X86/licm-nested.ll2
-rw-r--r--test/CodeGen/X86/limited-prec.ll6
-rw-r--r--test/CodeGen/X86/linux-preemption.ll225
-rw-r--r--test/CodeGen/X86/live-out-reg-info.ll16
-rw-r--r--test/CodeGen/X86/live-range-nosubreg.ll2
-rw-r--r--test/CodeGen/X86/liveness-local-regalloc.ll4
-rw-r--r--test/CodeGen/X86/llc-override-mcpu-mattr.ll4
-rw-r--r--test/CodeGen/X86/load-combine-dbg.ll37
-rw-r--r--test/CodeGen/X86/load-combine.ll152
-rw-r--r--test/CodeGen/X86/logical-load-fold.ll8
-rw-r--r--test/CodeGen/X86/long-setcc.ll2
-rw-r--r--test/CodeGen/X86/longlong-deadload.ll2
-rw-r--r--test/CodeGen/X86/loop-blocks.ll2
-rw-r--r--test/CodeGen/X86/loop-search.ll16
-rw-r--r--test/CodeGen/X86/loop-strength-reduce-2.ll4
-rw-r--r--test/CodeGen/X86/loop-strength-reduce.ll2
-rw-r--r--test/CodeGen/X86/loop-strength-reduce4.ll15
-rw-r--r--test/CodeGen/X86/loop-strength-reduce5.ll2
-rw-r--r--test/CodeGen/X86/loop-strength-reduce6.ll2
-rw-r--r--test/CodeGen/X86/loop-strength-reduce7.ll2
-rw-r--r--test/CodeGen/X86/lower-bitcast.ll36
-rw-r--r--test/CodeGen/X86/lower-vec-shift-2.ll32
-rw-r--r--test/CodeGen/X86/lower-vec-shift.ll104
-rw-r--r--test/CodeGen/X86/lower-vec-shuffle-bug.ll8
-rw-r--r--test/CodeGen/X86/lsr-delayed-fold.ll2
-rw-r--r--test/CodeGen/X86/lsr-i386.ll2
-rw-r--r--test/CodeGen/X86/lsr-interesting-step.ll2
-rw-r--r--test/CodeGen/X86/lsr-negative-stride.ll2
-rw-r--r--test/CodeGen/X86/lsr-nonaffine.ll2
-rw-r--r--test/CodeGen/X86/lsr-normalization.ll4
-rw-r--r--test/CodeGen/X86/lsr-quadratic-expand.ll2
-rw-r--r--test/CodeGen/X86/lsr-redundant-addressing.ll2
-rw-r--r--test/CodeGen/X86/lsr-reuse.ll2
-rw-r--r--test/CodeGen/X86/lsr-sort.ll2
-rw-r--r--test/CodeGen/X86/lsr-static-addr.ll4
-rw-r--r--test/CodeGen/X86/lsr-wrap.ll2
-rw-r--r--test/CodeGen/X86/lwp-intrinsics-x86_64.ll8
-rw-r--r--test/CodeGen/X86/lwp-intrinsics.ll24
-rw-r--r--test/CodeGen/X86/lwp-schedule.ll179
-rw-r--r--test/CodeGen/X86/lzcnt-schedule.ll161
-rw-r--r--test/CodeGen/X86/lzcnt-zext-cmp.ll46
-rw-r--r--test/CodeGen/X86/lzcnt.ll2
-rw-r--r--test/CodeGen/X86/machine-combiner-int-vec.ll18
-rw-r--r--test/CodeGen/X86/machine-combiner-int.ll24
-rw-r--r--test/CodeGen/X86/machine-combiner.ll105
-rw-r--r--test/CodeGen/X86/machine-copy-prop.mir2
-rw-r--r--test/CodeGen/X86/machine-cp.ll167
-rw-r--r--test/CodeGen/X86/machine-cse.ll30
-rw-r--r--test/CodeGen/X86/machine-outliner-debuginfo.ll2
-rw-r--r--test/CodeGen/X86/machine-outliner-tailcalls.ll2
-rw-r--r--test/CodeGen/X86/machine-outliner.ll4
-rw-r--r--test/CodeGen/X86/machine-region-info.mir16
-rw-r--r--test/CodeGen/X86/machinesink-merge-debuginfo.ll56
-rw-r--r--test/CodeGen/X86/machinesink-null-debuginfo.ll49
-rw-r--r--test/CodeGen/X86/madd.ll150
-rw-r--r--test/CodeGen/X86/mask-negated-bool.ll16
-rw-r--r--test/CodeGen/X86/masked-iv-safe.ll18
-rw-r--r--test/CodeGen/X86/masked-iv-unsafe.ll2
-rw-r--r--test/CodeGen/X86/masked_gather_scatter.ll1380
-rw-r--r--test/CodeGen/X86/masked_memop.ll435
-rw-r--r--test/CodeGen/X86/maskmovdqu.ll8
-rw-r--r--test/CodeGen/X86/mature-mc-support.ll8
-rw-r--r--test/CodeGen/X86/mbp-false-cfg-break.ll2
-rw-r--r--test/CodeGen/X86/mem-promote-integers.ll4
-rw-r--r--test/CodeGen/X86/membarrier.ll2
-rw-r--r--test/CodeGen/X86/memcmp-minsize.ll411
-rw-r--r--test/CodeGen/X86/memcmp-optsize.ll739
-rw-r--r--test/CodeGen/X86/memcmp.ll946
-rw-r--r--test/CodeGen/X86/memcpy-2.ll2
-rw-r--r--test/CodeGen/X86/memset-2.ll8
-rw-r--r--test/CodeGen/X86/memset-nonzero.ll71
-rw-r--r--test/CodeGen/X86/memset.ll115
-rw-r--r--test/CodeGen/X86/memset64-on-x86-32.ll6
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-128.ll622
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-256.ll198
-rw-r--r--test/CodeGen/X86/merge-consecutive-loads-512.ll182
-rw-r--r--test/CodeGen/X86/merge-consecutive-stores-i1.ll2
-rw-r--r--test/CodeGen/X86/merge-consecutive-stores.ll10
-rw-r--r--test/CodeGen/X86/merge-store-constants.ll146
-rw-r--r--test/CodeGen/X86/merge-store-partially-alias-loads.ll10
-rw-r--r--test/CodeGen/X86/merge_store.ll28
-rw-r--r--test/CodeGen/X86/merge_store_duplicated_loads.ll6
-rw-r--r--test/CodeGen/X86/mfence.ll8
-rw-r--r--test/CodeGen/X86/misched-code-difference-with-debug.ll4
-rw-r--r--test/CodeGen/X86/misched-copy.ll12
-rw-r--r--test/CodeGen/X86/misched-fusion.ll2
-rw-r--r--test/CodeGen/X86/misched-matmul.ll2
-rw-r--r--test/CodeGen/X86/misched-matrix.ll6
-rw-r--r--test/CodeGen/X86/misched-new.ll4
-rw-r--r--test/CodeGen/X86/mmx-arg-passing-x86-64.ll6
-rw-r--r--test/CodeGen/X86/mmx-arg-passing.ll8
-rw-r--r--test/CodeGen/X86/mmx-arith.ll4
-rw-r--r--test/CodeGen/X86/mmx-bitcast.ll14
-rw-r--r--test/CodeGen/X86/mmx-coalescing.ll2
-rw-r--r--test/CodeGen/X86/mmx-copy-gprs.ll4
-rw-r--r--test/CodeGen/X86/mmx-cvt.ll40
-rw-r--r--test/CodeGen/X86/mmx-fold-load.ll72
-rw-r--r--test/CodeGen/X86/mmx-intrinsics.ll8
-rw-r--r--test/CodeGen/X86/mmx-only.ll4
-rw-r--r--test/CodeGen/X86/mmx-schedule.ll6967
-rw-r--r--test/CodeGen/X86/movbe-schedule.ll190
-rw-r--r--test/CodeGen/X86/movfs.ll2
-rw-r--r--test/CodeGen/X86/movgs.ll18
-rw-r--r--test/CodeGen/X86/movmsk.ll16
-rw-r--r--test/CodeGen/X86/movpc32-check.ll2
-rw-r--r--test/CodeGen/X86/movtopush.ll172
-rw-r--r--test/CodeGen/X86/movtopush.mir125
-rw-r--r--test/CodeGen/X86/movtopush64.ll33
-rw-r--r--test/CodeGen/X86/ms-inline-asm.ll2
-rw-r--r--test/CodeGen/X86/mul-constant-i16.ll312
-rw-r--r--test/CodeGen/X86/mul-constant-i32.ll918
-rw-r--r--test/CodeGen/X86/mul-constant-i64.ll678
-rw-r--r--test/CodeGen/X86/mul-constant-result.ll208
-rw-r--r--test/CodeGen/X86/mul-i1024.ll10575
-rw-r--r--test/CodeGen/X86/mul-i256.ll486
-rw-r--r--test/CodeGen/X86/mul-i512.ll2175
-rw-r--r--test/CodeGen/X86/mul-legalize.ll2
-rw-r--r--test/CodeGen/X86/mul-remat.ll2
-rw-r--r--test/CodeGen/X86/mul-shift-reassoc.ll4
-rw-r--r--test/CodeGen/X86/mul128.ll81
-rw-r--r--test/CodeGen/X86/mul128_sext_loop.ll2
-rw-r--r--test/CodeGen/X86/mul64.ll4
-rw-r--r--test/CodeGen/X86/mult-alt-generic-i686.ll2
-rw-r--r--test/CodeGen/X86/mult-alt-generic-x86_64.ll2
-rw-r--r--test/CodeGen/X86/mult-alt-x86.ll2
-rw-r--r--test/CodeGen/X86/multiple-loop-post-inc.ll2
-rw-r--r--test/CodeGen/X86/multiple-return-values-cross-block.ll2
-rw-r--r--test/CodeGen/X86/mulvi32.ll472
-rw-r--r--test/CodeGen/X86/mulx32.ll4
-rw-r--r--test/CodeGen/X86/mulx64.ll4
-rw-r--r--test/CodeGen/X86/musttail-thiscall.ll4
-rw-r--r--test/CodeGen/X86/musttail.ll6
-rw-r--r--test/CodeGen/X86/mwaitx-schedule.ll65
-rw-r--r--test/CodeGen/X86/narrow-shl-cst.ll2
-rw-r--r--test/CodeGen/X86/narrow-shl-load.ll2
-rw-r--r--test/CodeGen/X86/narrow_op-1.ll2
-rw-r--r--test/CodeGen/X86/neg-shl-add.ll2
-rw-r--r--test/CodeGen/X86/neg_cmp.ll8
-rw-r--r--test/CodeGen/X86/neg_fp.ll2
-rw-r--r--test/CodeGen/X86/negate-add-zero.ll2
-rw-r--r--test/CodeGen/X86/negate-i1.ll49
-rw-r--r--test/CodeGen/X86/negate-shift.ll8
-rw-r--r--test/CodeGen/X86/negate.ll12
-rw-r--r--test/CodeGen/X86/negative-sin.ll12
-rw-r--r--test/CodeGen/X86/negative-stride-fptosi-user.ll2
-rw-r--r--test/CodeGen/X86/negative-subscript.ll2
-rw-r--r--test/CodeGen/X86/negative_zero.ll2
-rw-r--r--test/CodeGen/X86/no-cmov.ll2
-rw-r--r--test/CodeGen/X86/no-plt.ll30
-rw-r--r--test/CodeGen/X86/no-sse2-avg.ll20
-rw-r--r--test/CodeGen/X86/nobt.ll2
-rw-r--r--test/CodeGen/X86/nocx16.ll2
-rw-r--r--test/CodeGen/X86/non-value-mem-operand.mir48
-rw-r--r--test/CodeGen/X86/nonconst-static-ev.ll2
-rw-r--r--test/CodeGen/X86/nonconst-static-iv.ll2
-rw-r--r--test/CodeGen/X86/nontemporal-2.ll356
-rw-r--r--test/CodeGen/X86/nontemporal-loads.ll622
-rw-r--r--test/CodeGen/X86/nontemporal.ll131
-rw-r--r--test/CodeGen/X86/norex-subreg.ll12
-rw-r--r--test/CodeGen/X86/nosse-error1.ll4
-rw-r--r--test/CodeGen/X86/nosse-error2.ll4
-rw-r--r--test/CodeGen/X86/nosse-varargs.ll4
-rw-r--r--test/CodeGen/X86/nosse-vector.ll28
-rw-r--r--test/CodeGen/X86/not-and-simplify.ll12
-rw-r--r--test/CodeGen/X86/null-streamer.ll4
-rw-r--r--test/CodeGen/X86/object-size.ll2
-rw-r--r--test/CodeGen/X86/oddshuffles.ll582
-rw-r--r--test/CodeGen/X86/opt-ext-uses.ll2
-rw-r--r--test/CodeGen/X86/optimize-max-0.ll2
-rw-r--r--test/CodeGen/X86/optimize-max-1.ll16
-rw-r--r--test/CodeGen/X86/optimize-max-2.ll4
-rw-r--r--test/CodeGen/X86/or-branch.ll18
-rw-r--r--test/CodeGen/X86/or-lea.ll42
-rw-r--r--test/CodeGen/X86/overflow-intrinsic-setcc-fold.ll174
-rw-r--r--test/CodeGen/X86/overflow.ll77
-rw-r--r--test/CodeGen/X86/overlap-shift.ll2
-rw-r--r--test/CodeGen/X86/packed_struct.ll2
-rw-r--r--test/CodeGen/X86/packss.ll149
-rw-r--r--test/CodeGen/X86/palignr.ll52
-rw-r--r--test/CodeGen/X86/patchpoint-webkit_jscc.ll6
-rw-r--r--test/CodeGen/X86/pause.ll2
-rw-r--r--test/CodeGen/X86/peep-setb.ll18
-rw-r--r--test/CodeGen/X86/peep-test-0.ll2
-rw-r--r--test/CodeGen/X86/peep-test-1.ll2
-rw-r--r--test/CodeGen/X86/peep-test-2.ll2
-rw-r--r--test/CodeGen/X86/peep-test-3.ll2
-rw-r--r--test/CodeGen/X86/peep-test-4.ll54
-rw-r--r--test/CodeGen/X86/peephole-cvt-sse.ll8
-rw-r--r--test/CodeGen/X86/peephole-multiple-folds.ll2
-rw-r--r--test/CodeGen/X86/peephole-na-phys-copy-folding.ll372
-rw-r--r--test/CodeGen/X86/peephole-recurrence.mir158
-rw-r--r--test/CodeGen/X86/peephole.mir10
-rw-r--r--test/CodeGen/X86/phaddsub.ll64
-rw-r--r--test/CodeGen/X86/phi-bit-propagation.ll2
-rw-r--r--test/CodeGen/X86/phi-immediate-factoring.ll6
-rw-r--r--test/CodeGen/X86/phys-reg-local-regalloc.ll6
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-2.ll4
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-3.ll8
-rw-r--r--test/CodeGen/X86/pku.ll4
-rw-r--r--test/CodeGen/X86/pmovext.ll2
-rw-r--r--test/CodeGen/X86/pmovsx-inreg.ll132
-rw-r--r--test/CodeGen/X86/pmul.ll170
-rw-r--r--test/CodeGen/X86/pointer-vector.ll24
-rw-r--r--test/CodeGen/X86/pop-stack-cleanup-msvc.ll26
-rw-r--r--test/CodeGen/X86/popcnt-schedule.ll212
-rw-r--r--test/CodeGen/X86/popcnt.ll44
-rw-r--r--test/CodeGen/X86/post-ra-sched-with-debug.mir49
-rw-r--r--test/CodeGen/X86/post-ra-sched.ll2
-rw-r--r--test/CodeGen/X86/postalloc-coalescing.ll2
-rw-r--r--test/CodeGen/X86/powi.ll6
-rw-r--r--test/CodeGen/X86/pr10068.ll2
-rw-r--r--test/CodeGen/X86/pr10523.ll2
-rw-r--r--test/CodeGen/X86/pr10524.ll2
-rw-r--r--test/CodeGen/X86/pr10525.ll2
-rw-r--r--test/CodeGen/X86/pr10526.ll2
-rw-r--r--test/CodeGen/X86/pr11334.ll20
-rw-r--r--test/CodeGen/X86/pr11468.ll2
-rw-r--r--test/CodeGen/X86/pr11985.ll4
-rw-r--r--test/CodeGen/X86/pr11998.ll2
-rw-r--r--test/CodeGen/X86/pr12312.ll48
-rw-r--r--test/CodeGen/X86/pr12889.ll4
-rw-r--r--test/CodeGen/X86/pr13220.ll2
-rw-r--r--test/CodeGen/X86/pr13577.ll4
-rw-r--r--test/CodeGen/X86/pr14161.ll4
-rw-r--r--test/CodeGen/X86/pr14204.ll2
-rw-r--r--test/CodeGen/X86/pr14314.ll4
-rw-r--r--test/CodeGen/X86/pr14562.ll2
-rw-r--r--test/CodeGen/X86/pr15267.ll8
-rw-r--r--test/CodeGen/X86/pr15309.ll2
-rw-r--r--test/CodeGen/X86/pr15705.ll10
-rw-r--r--test/CodeGen/X86/pr15981.ll17
-rw-r--r--test/CodeGen/X86/pr16031.ll2
-rw-r--r--test/CodeGen/X86/pr16360.ll2
-rw-r--r--test/CodeGen/X86/pr17764.ll2
-rw-r--r--test/CodeGen/X86/pr18014.ll2
-rw-r--r--test/CodeGen/X86/pr18344.ll4
-rw-r--r--test/CodeGen/X86/pr20011.ll33
-rw-r--r--test/CodeGen/X86/pr20012.ll17
-rw-r--r--test/CodeGen/X86/pr20088.ll2
-rw-r--r--test/CodeGen/X86/pr21099.ll2
-rw-r--r--test/CodeGen/X86/pr21792.ll3
-rw-r--r--test/CodeGen/X86/pr22338.ll26
-rw-r--r--test/CodeGen/X86/pr22774.ll2
-rw-r--r--test/CodeGen/X86/pr22970.ll10
-rw-r--r--test/CodeGen/X86/pr2326.ll2
-rw-r--r--test/CodeGen/X86/pr23273.ll2
-rw-r--r--test/CodeGen/X86/pr23603.ll4
-rw-r--r--test/CodeGen/X86/pr24602.ll2
-rw-r--r--test/CodeGen/X86/pr2585.ll4
-rw-r--r--test/CodeGen/X86/pr26350.ll2
-rw-r--r--test/CodeGen/X86/pr2656.ll7
-rw-r--r--test/CodeGen/X86/pr2659.ll2
-rw-r--r--test/CodeGen/X86/pr26652.ll2
-rw-r--r--test/CodeGen/X86/pr26870.ll6
-rw-r--r--test/CodeGen/X86/pr27591.ll8
-rw-r--r--test/CodeGen/X86/pr27681.mir6
-rw-r--r--test/CodeGen/X86/pr28129.ll32
-rw-r--r--test/CodeGen/X86/pr28173.ll14
-rw-r--r--test/CodeGen/X86/pr28472.ll2
-rw-r--r--test/CodeGen/X86/pr28560.ll2
-rw-r--r--test/CodeGen/X86/pr29061.ll40
-rw-r--r--test/CodeGen/X86/pr29112.ll7
-rw-r--r--test/CodeGen/X86/pr29170.ll6
-rw-r--r--test/CodeGen/X86/pr2982.ll2
-rw-r--r--test/CodeGen/X86/pr30284.ll2
-rw-r--r--test/CodeGen/X86/pr30430.ll11
-rw-r--r--test/CodeGen/X86/pr30511.ll2
-rw-r--r--test/CodeGen/X86/pr31045.ll89
-rw-r--r--test/CodeGen/X86/pr31088.ll12
-rw-r--r--test/CodeGen/X86/pr31323.ll4
-rw-r--r--test/CodeGen/X86/pr31773.ll41
-rw-r--r--test/CodeGen/X86/pr31956.ll2
-rw-r--r--test/CodeGen/X86/pr32108.ll2
-rw-r--r--test/CodeGen/X86/pr3216.ll2
-rw-r--r--test/CodeGen/X86/pr32241.ll18
-rw-r--r--test/CodeGen/X86/pr32256.ll3
-rw-r--r--test/CodeGen/X86/pr32282.ll15
-rw-r--r--test/CodeGen/X86/pr32284.ll491
-rw-r--r--test/CodeGen/X86/pr32329.ll40
-rw-r--r--test/CodeGen/X86/pr32340.ll62
-rw-r--r--test/CodeGen/X86/pr32345.ll50
-rw-r--r--test/CodeGen/X86/pr32368.ll24
-rw-r--r--test/CodeGen/X86/pr3241.ll2
-rw-r--r--test/CodeGen/X86/pr32420.ll4
-rw-r--r--test/CodeGen/X86/pr3243.ll2
-rw-r--r--test/CodeGen/X86/pr3244.ll2
-rw-r--r--test/CodeGen/X86/pr32451.ll5
-rw-r--r--test/CodeGen/X86/pr32484.ll6
-rw-r--r--test/CodeGen/X86/pr3250.ll2
-rw-r--r--test/CodeGen/X86/pr32659.ll81
-rw-r--r--test/CodeGen/X86/pr32907.ll8
-rw-r--r--test/CodeGen/X86/pr3317.ll2
-rw-r--r--test/CodeGen/X86/pr33290.ll51
-rw-r--r--test/CodeGen/X86/pr33349.ll30
-rw-r--r--test/CodeGen/X86/pr3366.ll2
-rw-r--r--test/CodeGen/X86/pr33828.ll8
-rw-r--r--test/CodeGen/X86/pr33844.ll2
-rw-r--r--test/CodeGen/X86/pr33954.ll91
-rw-r--r--test/CodeGen/X86/pr33960.ll4
-rw-r--r--test/CodeGen/X86/pr34080.ll167
-rw-r--r--test/CodeGen/X86/pr34088.ll7
-rw-r--r--test/CodeGen/X86/pr34137.ll4
-rw-r--r--test/CodeGen/X86/pr34139.ll10
-rw-r--r--test/CodeGen/X86/pr34149.ll40
-rw-r--r--test/CodeGen/X86/pr34177.ll2
-rw-r--r--test/CodeGen/X86/pr34271-1.ll2
-rw-r--r--test/CodeGen/X86/pr34271.ll2
-rw-r--r--test/CodeGen/X86/pr34381.ll43
-rw-r--r--test/CodeGen/X86/pr34397.ll24
-rw-r--r--test/CodeGen/X86/pr34421.ll40
-rw-r--r--test/CodeGen/X86/pr34605.ll25
-rw-r--r--test/CodeGen/X86/pr34629.ll52
-rw-r--r--test/CodeGen/X86/pr34634.ll67
-rw-r--r--test/CodeGen/X86/pr34653.ll209
-rw-r--r--test/CodeGen/X86/pr34657.ll20
-rw-r--r--test/CodeGen/X86/pr34855.ll27
-rw-r--r--test/CodeGen/X86/pr3522.ll2
-rw-r--r--test/CodeGen/X86/pr35272.ll14
-rw-r--r--test/CodeGen/X86/pr35399.ll22
-rw-r--r--test/CodeGen/X86/pr35443.ll30
-rw-r--r--test/CodeGen/X86/pr35636.ll35
-rw-r--r--test/CodeGen/X86/pr5145.ll2
-rw-r--r--test/CodeGen/X86/pr7882.ll2
-rw-r--r--test/CodeGen/X86/pr9743.ll3
-rw-r--r--test/CodeGen/X86/pre-coalesce.mir20
-rw-r--r--test/CodeGen/X86/prefetch.ll12
-rw-r--r--test/CodeGen/X86/prolog-push-seq.ll2
-rw-r--r--test/CodeGen/X86/prologue-epilogue-remarks.mir58
-rw-r--r--test/CodeGen/X86/promote-trunc.ll2
-rw-r--r--test/CodeGen/X86/promote-vec3.ll64
-rw-r--r--test/CodeGen/X86/promote.ll2
-rw-r--r--test/CodeGen/X86/pseudo_cmov_lower2.ll4
-rw-r--r--test/CodeGen/X86/pshufb-mask-comments.ll12
-rw-r--r--test/CodeGen/X86/pshufd-combine-crash.ll2
-rw-r--r--test/CodeGen/X86/psubus.ll1812
-rw-r--r--test/CodeGen/X86/push-cfi.ll12
-rw-r--r--test/CodeGen/X86/rd-mod-wr-eflags.ll2
-rw-r--r--test/CodeGen/X86/rdpmc.ll4
-rw-r--r--test/CodeGen/X86/rdrand-schedule.ll148
-rw-r--r--test/CodeGen/X86/rdrand-x86_64.ll2
-rw-r--r--test/CodeGen/X86/rdrand.ll22
-rw-r--r--test/CodeGen/X86/rdseed-schedule.ll116
-rw-r--r--test/CodeGen/X86/rdseed-x86_64.ll2
-rw-r--r--test/CodeGen/X86/rdseed.ll8
-rw-r--r--test/CodeGen/X86/rdtsc.ll66
-rw-r--r--test/CodeGen/X86/recip-fastmath.ll458
-rw-r--r--test/CodeGen/X86/recip-fastmath2.ll767
-rw-r--r--test/CodeGen/X86/recip-pic.ll4
-rw-r--r--test/CodeGen/X86/reduce-trunc-shl.ll34
-rw-r--r--test/CodeGen/X86/regpressure.ll2
-rw-r--r--test/CodeGen/X86/rem.ll10
-rw-r--r--test/CodeGen/X86/rem_crash.ll4
-rw-r--r--test/CodeGen/X86/remat-phys-dead.ll6
-rw-r--r--test/CodeGen/X86/replace-load-and-with-bzhi.ll89
-rw-r--r--test/CodeGen/X86/ret-addr.ll4
-rw-r--r--test/CodeGen/X86/ret-i64-0.ll2
-rw-r--r--test/CodeGen/X86/ret-mmx.ll8
-rw-r--r--test/CodeGen/X86/rip-rel-address.ll2
-rw-r--r--test/CodeGen/X86/rot16.ll216
-rw-r--r--test/CodeGen/X86/rot32.ll143
-rw-r--r--test/CodeGen/X86/rot64.ll133
-rw-r--r--test/CodeGen/X86/rotate.ll172
-rw-r--r--test/CodeGen/X86/rotate2.ll2
-rw-r--r--test/CodeGen/X86/rotate4.ll159
-rw-r--r--test/CodeGen/X86/rotate_vec.ll8
-rw-r--r--test/CodeGen/X86/rounding-ops.ll202
-rw-r--r--test/CodeGen/X86/rrlist-livereg-corrutpion.ll2
-rw-r--r--test/CodeGen/X86/rtm-schedule.ll61
-rw-r--r--test/CodeGen/X86/rtm.ll21
-rw-r--r--test/CodeGen/X86/sad.ll209
-rw-r--r--test/CodeGen/X86/sad_variations.ll42
-rw-r--r--test/CodeGen/X86/sandybridge-loads.ll4
-rw-r--r--test/CodeGen/X86/sar_fold.ll8
-rw-r--r--test/CodeGen/X86/sar_fold64.ll20
-rw-r--r--test/CodeGen/X86/sbb.ll38
-rw-r--r--test/CodeGen/X86/scalar-extract.ll2
-rw-r--r--test/CodeGen/X86/scalar-int-to-fp.ll132
-rw-r--r--test/CodeGen/X86/scalar_sse_minmax.ll2
-rw-r--r--test/CodeGen/X86/scalar_widen_div.ll418
-rw-r--r--test/CodeGen/X86/scalarize-bitcast.ll2
-rw-r--r--test/CodeGen/X86/scatter-schedule.ll22
-rw-r--r--test/CodeGen/X86/schedule-x86_32.ll2332
-rw-r--r--test/CodeGen/X86/schedule-x86_64.ll17197
-rw-r--r--test/CodeGen/X86/scheduler-backtracking.ll10
-rw-r--r--test/CodeGen/X86/sdiv-exact.ll2
-rw-r--r--test/CodeGen/X86/sdiv-pow2.ll2
-rw-r--r--test/CodeGen/X86/segmented-stacks.ll19
-rw-r--r--test/CodeGen/X86/select-mmx.ll18
-rw-r--r--test/CodeGen/X86/select-with-and-or.ll28
-rw-r--r--test/CodeGen/X86/select.ll540
-rw-r--r--test/CodeGen/X86/select_const.ll275
-rw-r--r--test/CodeGen/X86/setcc-combine.ll24
-rw-r--r--test/CodeGen/X86/setcc-logic.ll84
-rw-r--r--test/CodeGen/X86/setcc-lowering.ll29
-rw-r--r--test/CodeGen/X86/setcc-narrowing.ll13
-rw-r--r--test/CodeGen/X86/setcc-wide-types.ll16
-rw-r--r--test/CodeGen/X86/setcc.ll12
-rw-r--r--test/CodeGen/X86/setoeq.ll2
-rw-r--r--test/CodeGen/X86/setuge.ll2
-rw-r--r--test/CodeGen/X86/sext-i1.ll59
-rw-r--r--test/CodeGen/X86/sext-load.ll2
-rw-r--r--test/CodeGen/X86/sext-setcc-self.ll12
-rw-r--r--test/CodeGen/X86/sext-subreg.ll2
-rw-r--r--test/CodeGen/X86/sha-schedule.ll242
-rw-r--r--test/CodeGen/X86/sha.ll8
-rw-r--r--test/CodeGen/X86/shift-and.ll192
-rw-r--r--test/CodeGen/X86/shift-bmi2.ll288
-rw-r--r--test/CodeGen/X86/shift-coalesce.ll4
-rw-r--r--test/CodeGen/X86/shift-codegen.ll4
-rw-r--r--test/CodeGen/X86/shift-combine.ll36
-rw-r--r--test/CodeGen/X86/shift-double-x86_64.ll14
-rw-r--r--test/CodeGen/X86/shift-double.ll447
-rw-r--r--test/CodeGen/X86/shift-folding.ll10
-rw-r--r--test/CodeGen/X86/shift-i128.ll4
-rw-r--r--test/CodeGen/X86/shift-i256.ll6
-rw-r--r--test/CodeGen/X86/shift-one.ll2
-rw-r--r--test/CodeGen/X86/shift-pair.ll2
-rw-r--r--test/CodeGen/X86/shift-parts.ll2
-rw-r--r--test/CodeGen/X86/shift-pcmp.ll8
-rw-r--r--test/CodeGen/X86/shl-anyext.ll2
-rw-r--r--test/CodeGen/X86/shl-crash-on-legalize.ll2
-rw-r--r--test/CodeGen/X86/shl-i64.ll2
-rw-r--r--test/CodeGen/X86/shl_elim.ll2
-rw-r--r--test/CodeGen/X86/shrink-compare.ll60
-rw-r--r--test/CodeGen/X86/shrink-fp-const1.ll2
-rw-r--r--test/CodeGen/X86/shrink-fp-const2.ll2
-rw-r--r--test/CodeGen/X86/shrink_vmul.ll1310
-rw-r--r--test/CodeGen/X86/shrink_vmul_sse.ll2
-rw-r--r--test/CodeGen/X86/shuffle-combine-crash-2.ll4
-rw-r--r--test/CodeGen/X86/shuffle-of-insert.ll197
-rw-r--r--test/CodeGen/X86/shuffle-of-splat-multiuses.ll34
-rw-r--r--test/CodeGen/X86/shuffle-strided-with-offset-128.ll907
-rw-r--r--test/CodeGen/X86/shuffle-strided-with-offset-256.ll1156
-rw-r--r--test/CodeGen/X86/shuffle-strided-with-offset-512.ll1178
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-128.ll188
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-256.ll426
-rw-r--r--test/CodeGen/X86/shuffle-vs-trunc-512.ll558
-rw-r--r--test/CodeGen/X86/sincos-opt.ll8
-rw-r--r--test/CodeGen/X86/sincos.ll92
-rw-r--r--test/CodeGen/X86/sink-blockfreq.ll2
-rw-r--r--test/CodeGen/X86/sink-hoist.ll2
-rw-r--r--test/CodeGen/X86/sink-out-of-loop.ll2
-rw-r--r--test/CodeGen/X86/sjlj-eh.ll85
-rw-r--r--test/CodeGen/X86/slow-incdec.ll123
-rw-r--r--test/CodeGen/X86/slow-pmulld.ll16
-rw-r--r--test/CodeGen/X86/slow-unaligned-mem.ll4
-rw-r--r--test/CodeGen/X86/smul-with-overflow.ll2
-rw-r--r--test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll2
-rw-r--r--test/CodeGen/X86/soft-fp.ll8
-rw-r--r--test/CodeGen/X86/splat-for-size.ll40
-rw-r--r--test/CodeGen/X86/split-extend-vector-inreg.ll16
-rw-r--r--test/CodeGen/X86/split-store.ll203
-rw-r--r--test/CodeGen/X86/split-vector-bitcast.ll2
-rw-r--r--test/CodeGen/X86/split-vector-rem.ll4
-rw-r--r--test/CodeGen/X86/sqrt-fastmath-mir.ll54
-rw-r--r--test/CodeGen/X86/sqrt-fastmath-tune.ll12
-rw-r--r--test/CodeGen/X86/sqrt-fastmath.ll44
-rw-r--r--test/CodeGen/X86/sqrt-partial.ll43
-rw-r--r--test/CodeGen/X86/sse-align-1.ll2
-rw-r--r--test/CodeGen/X86/sse-align-10.ll2
-rw-r--r--test/CodeGen/X86/sse-align-11.ll4
-rw-r--r--test/CodeGen/X86/sse-align-12.ll12
-rw-r--r--test/CodeGen/X86/sse-align-2.ll2
-rw-r--r--test/CodeGen/X86/sse-align-4.ll2
-rw-r--r--test/CodeGen/X86/sse-align-5.ll2
-rw-r--r--test/CodeGen/X86/sse-align-6.ll2
-rw-r--r--test/CodeGen/X86/sse-align-8.ll2
-rw-r--r--test/CodeGen/X86/sse-align-9.ll2
-rw-r--r--test/CodeGen/X86/sse-fcopysign.ll24
-rw-r--r--test/CodeGen/X86/sse-fsignum.ll58
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll6
-rw-r--r--test/CodeGen/X86/sse-intrinsics-fast-isel.ll672
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll38
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86.ll207
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86_64.ll24
-rw-r--r--test/CodeGen/X86/sse-load-ret.ll4
-rw-r--r--test/CodeGen/X86/sse-minmax.ll308
-rw-r--r--test/CodeGen/X86/sse-only.ll4
-rw-r--r--test/CodeGen/X86/sse-regcall.ll2
-rw-r--r--test/CodeGen/X86/sse-scalar-fp-arith-unary.ll16
-rw-r--r--test/CodeGen/X86/sse-scalar-fp-arith.ll284
-rw-r--r--test/CodeGen/X86/sse-schedule.ll3169
-rw-r--r--test/CodeGen/X86/sse-unaligned-mem-feature.ll4
-rw-r--r--test/CodeGen/X86/sse-varargs.ll2
-rw-r--r--test/CodeGen/X86/sse1.ll229
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll14
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-fast-isel.ll990
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll83
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll666
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86_64.ll24
-rw-r--r--test/CodeGen/X86/sse2-schedule.ll7980
-rw-r--r--test/CodeGen/X86/sse2-vector-shifts.ll75
-rw-r--r--test/CodeGen/X86/sse2.ll108
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub-2.ll76
-rw-r--r--test/CodeGen/X86/sse3-avx-addsub.ll60
-rw-r--r--test/CodeGen/X86/sse3-intrinsics-fast-isel.ll44
-rw-r--r--test/CodeGen/X86/sse3-intrinsics-x86.ll32
-rw-r--r--test/CodeGen/X86/sse3-schedule.ll756
-rw-r--r--test/CodeGen/X86/sse3.ll88
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-fast-isel.ll265
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll56
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86.ll210
-rw-r--r--test/CodeGen/X86/sse41-pmovxrm.ll48
-rw-r--r--test/CodeGen/X86/sse41-schedule.ll2524
-rw-r--r--test/CodeGen/X86/sse41.ll255
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll4
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-fast-isel.ll72
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-x86.ll88
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-x86_64.ll4
-rw-r--r--test/CodeGen/X86/sse42-schedule.ll712
-rw-r--r--test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll24
-rw-r--r--test/CodeGen/X86/sse4a-schedule.ll86
-rw-r--r--test/CodeGen/X86/sse4a-upgrade.ll8
-rw-r--r--test/CodeGen/X86/sse4a.ll48
-rw-r--r--test/CodeGen/X86/sse_partial_update.ll14
-rw-r--r--test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll86
-rw-r--r--test/CodeGen/X86/ssse3-intrinsics-x86.ll80
-rw-r--r--test/CodeGen/X86/ssse3-schedule.ll1007
-rw-r--r--test/CodeGen/X86/stack-folding-bmi.ll4
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx512.ll30
-rw-r--r--test/CodeGen/X86/stack-folding-fp-avx512vl.ll50
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx1.ll24
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx2.ll31
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx512.ll77
-rw-r--r--test/CodeGen/X86/stack-folding-int-avx512vl.ll50
-rw-r--r--test/CodeGen/X86/stack-folding-int-sse42.ll20
-rw-r--r--test/CodeGen/X86/stack-folding-lwp.ll8
-rw-r--r--test/CodeGen/X86/stack-folding-tbm.ll4
-rw-r--r--test/CodeGen/X86/stack-protector-msvc.ll154
-rw-r--r--test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll2
-rw-r--r--test/CodeGen/X86/stack-protector-weight.ll23
-rw-r--r--test/CodeGen/X86/stack-size-section.ll30
-rw-r--r--test/CodeGen/X86/stackmap-fast-isel.ll2
-rw-r--r--test/CodeGen/X86/stackmap-liveness.ll20
-rw-r--r--test/CodeGen/X86/statepoint-allocas.ll4
-rw-r--r--test/CodeGen/X86/statepoint-live-in.ll24
-rw-r--r--test/CodeGen/X86/stdarg.ll3
-rw-r--r--test/CodeGen/X86/store-empty-member.ll2
-rw-r--r--test/CodeGen/X86/store-fp-constant.ll2
-rw-r--r--test/CodeGen/X86/store-global-address.ll2
-rw-r--r--test/CodeGen/X86/store-narrow.ll7
-rw-r--r--test/CodeGen/X86/stores-merging.ll203
-rw-r--r--test/CodeGen/X86/storetrunc-fp.ll2
-rw-r--r--test/CodeGen/X86/stride-nine-with-base-reg.ll2
-rw-r--r--test/CodeGen/X86/stride-reuse.ll2
-rw-r--r--test/CodeGen/X86/sub.ll2
-rw-r--r--test/CodeGen/X86/subcarry.ll4
-rw-r--r--test/CodeGen/X86/subreg-to-reg-0.ll2
-rw-r--r--test/CodeGen/X86/subreg-to-reg-1.ll2
-rw-r--r--test/CodeGen/X86/subreg-to-reg-3.ll2
-rw-r--r--test/CodeGen/X86/subreg-to-reg-4.ll2
-rw-r--r--test/CodeGen/X86/subreg-to-reg-6.ll2
-rw-r--r--test/CodeGen/X86/subvector-broadcast.ll1405
-rw-r--r--test/CodeGen/X86/swift-error.ll18
-rw-r--r--test/CodeGen/X86/swiftcc.ll11
-rw-r--r--test/CodeGen/X86/switch-bt.ll2
-rw-r--r--test/CodeGen/X86/switch-crit-edge-constant.ll2
-rw-r--r--test/CodeGen/X86/switch-default-only.ll2
-rw-r--r--test/CodeGen/X86/switch-edge-weight.ll104
-rw-r--r--test/CodeGen/X86/switch-jump-table.ll6
-rw-r--r--test/CodeGen/X86/switch-lower-peel-top-case.ll135
-rw-r--r--test/CodeGen/X86/switch-or.ll2
-rw-r--r--test/CodeGen/X86/switch-zextload.ll2
-rw-r--r--test/CodeGen/X86/switch.ll6
-rw-r--r--test/CodeGen/X86/swizzle-2.ll84
-rw-r--r--test/CodeGen/X86/swizzle-avx2.ll38
-rw-r--r--test/CodeGen/X86/system-intrinsics-xgetbv.ll2
-rw-r--r--test/CodeGen/X86/tail-call-conditional.mir2
-rw-r--r--test/CodeGen/X86/tail-call-legality.ll2
-rw-r--r--test/CodeGen/X86/tail-dup-debugloc.ll6
-rw-r--r--test/CodeGen/X86/tail-dup-repeat.ll2
-rw-r--r--test/CodeGen/X86/tail-merge-after-mbp.mir46
-rw-r--r--test/CodeGen/X86/tail-merge-debugloc.ll2
-rw-r--r--test/CodeGen/X86/tail-opts.ll2
-rw-r--r--test/CodeGen/X86/tailcall-64.ll4
-rw-r--r--test/CodeGen/X86/tailcall-calleesave.ll2
-rw-r--r--test/CodeGen/X86/tailcall-mem-intrinsics.ll24
-rw-r--r--test/CodeGen/X86/tailcall-returndup-void.ll2
-rw-r--r--test/CodeGen/X86/tailcall.ll2
-rw-r--r--test/CodeGen/X86/tailcallfp.ll2
-rw-r--r--test/CodeGen/X86/tailcallfp2.ll2
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll20
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-fast-isel.ll48
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-x86_64.ll57
-rw-r--r--test/CodeGen/X86/tbm-schedule.ll489
-rw-r--r--test/CodeGen/X86/tbm_patterns.ll705
-rw-r--r--test/CodeGen/X86/test-nofold.ll2
-rw-r--r--test/CodeGen/X86/test-shrink-bug.ll2
-rw-r--r--test/CodeGen/X86/test-shrink.ll441
-rw-r--r--test/CodeGen/X86/testb-je-fusion.ll2
-rw-r--r--test/CodeGen/X86/testl-commute.ll6
-rw-r--r--test/CodeGen/X86/tls-android-negative.ll4
-rw-r--r--test/CodeGen/X86/tls-android.ll4
-rw-r--r--test/CodeGen/X86/tls-local-dynamic.ll2
-rw-r--r--test/CodeGen/X86/tls-models.ll10
-rw-r--r--test/CodeGen/X86/tls-pic.ll4
-rw-r--r--test/CodeGen/X86/tls-pie.ll34
-rw-r--r--test/CodeGen/X86/tls-shrink-wrapping.ll6
-rw-r--r--test/CodeGen/X86/tls.ll12
-rw-r--r--test/CodeGen/X86/token_landingpad.ll3
-rw-r--r--test/CodeGen/X86/trunc-ext-ld-st.ll26
-rw-r--r--test/CodeGen/X86/trunc-store.ll4
-rw-r--r--test/CodeGen/X86/trunc-to-bool.ll18
-rw-r--r--test/CodeGen/X86/twoaddr-coalesce-2.ll2
-rw-r--r--test/CodeGen/X86/twoaddr-coalesce-3.ll2
-rw-r--r--test/CodeGen/X86/twoaddr-coalesce.ll2
-rw-r--r--test/CodeGen/X86/twoaddr-pass-sink.ll2
-rw-r--r--test/CodeGen/X86/uint64-to-float.ll6
-rw-r--r--test/CodeGen/X86/uint_to_fp-2.ll6
-rw-r--r--test/CodeGen/X86/uint_to_fp-3.ll16
-rw-r--r--test/CodeGen/X86/uint_to_fp.ll4
-rw-r--r--test/CodeGen/X86/umul-with-carry.ll2
-rw-r--r--test/CodeGen/X86/umul-with-overflow.ll62
-rw-r--r--test/CodeGen/X86/unaligned-32-byte-memops.ll60
-rw-r--r--test/CodeGen/X86/update-terminator-debugloc.ll14
-rw-r--r--test/CodeGen/X86/update-terminator.mir2
-rw-r--r--test/CodeGen/X86/urem-i8-constant.ll8
-rw-r--r--test/CodeGen/X86/urem-power-of-two.ll132
-rw-r--r--test/CodeGen/X86/use-add-flags.ll74
-rw-r--r--test/CodeGen/X86/utf8.ll2
-rw-r--r--test/CodeGen/X86/v2f32.ll22
-rw-r--r--test/CodeGen/X86/v4f32-immediate.ll4
-rw-r--r--test/CodeGen/X86/v4i32load-crash.ll4
-rw-r--r--test/CodeGen/X86/v8i1-masks.ll8
-rw-r--r--test/CodeGen/X86/vaargs.ll2
-rw-r--r--test/CodeGen/X86/vaes-intrinsics-avx-x86.ll13
-rw-r--r--test/CodeGen/X86/vaes-intrinsics-avx512-x86.ll42
-rw-r--r--test/CodeGen/X86/vaes-intrinsics-avx512vl-x86.ll82
-rw-r--r--test/CodeGen/X86/var-permute-128.ll356
-rw-r--r--test/CodeGen/X86/var-permute-256.ll1285
-rw-r--r--test/CodeGen/X86/var-permute-512.ll1064
-rw-r--r--test/CodeGen/X86/variable-sized-darwin-bzero.ll2
-rw-r--r--test/CodeGen/X86/vec-copysign-avx512.ll24
-rw-r--r--test/CodeGen/X86/vec-copysign.ll16
-rw-r--r--test/CodeGen/X86/vec-trunc-store.ll4
-rw-r--r--test/CodeGen/X86/vec3.ll4
-rw-r--r--test/CodeGen/X86/vec_add.ll2
-rw-r--r--test/CodeGen/X86/vec_anyext.ll2
-rw-r--r--test/CodeGen/X86/vec_call.ll4
-rw-r--r--test/CodeGen/X86/vec_cast2.ll29
-rw-r--r--test/CodeGen/X86/vec_cmp_sint-128.ll168
-rw-r--r--test/CodeGen/X86/vec_cmp_uint-128.ll262
-rw-r--r--test/CodeGen/X86/vec_compare-sse4.ll12
-rw-r--r--test/CodeGen/X86/vec_compare.ll2
-rw-r--r--test/CodeGen/X86/vec_ctbits.ll12
-rw-r--r--test/CodeGen/X86/vec_ext_inreg.ll26
-rw-r--r--test/CodeGen/X86/vec_extract-avx.ll46
-rw-r--r--test/CodeGen/X86/vec_extract-mmx.ll20
-rw-r--r--test/CodeGen/X86/vec_extract-sse4.ll16
-rw-r--r--test/CodeGen/X86/vec_extract.ll16
-rw-r--r--test/CodeGen/X86/vec_fabs.ll80
-rw-r--r--test/CodeGen/X86/vec_floor.ll475
-rw-r--r--test/CodeGen/X86/vec_fneg.ll16
-rw-r--r--test/CodeGen/X86/vec_fp_to_int.ll524
-rw-r--r--test/CodeGen/X86/vec_fpext.ll48
-rw-r--r--test/CodeGen/X86/vec_fptrunc.ll48
-rw-r--r--test/CodeGen/X86/vec_i64.ll8
-rw-r--r--test/CodeGen/X86/vec_ins_extract-1.ll24
-rw-r--r--test/CodeGen/X86/vec_ins_extract.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-2.ll20
-rw-r--r--test/CodeGen/X86/vec_insert-3.ll4
-rw-r--r--test/CodeGen/X86/vec_insert-4.ll6
-rw-r--r--test/CodeGen/X86/vec_insert-5.ll38
-rw-r--r--test/CodeGen/X86/vec_insert-7.ll6
-rw-r--r--test/CodeGen/X86/vec_insert-8.ll12
-rw-r--r--test/CodeGen/X86/vec_insert-9.ll4
-rw-r--r--test/CodeGen/X86/vec_insert-mmx.ll20
-rw-r--r--test/CodeGen/X86/vec_int_to_fp.ll1428
-rw-r--r--test/CodeGen/X86/vec_loadsingles.ll18
-rw-r--r--test/CodeGen/X86/vec_logical.ll20
-rw-r--r--test/CodeGen/X86/vec_minmax_match.ll62
-rw-r--r--test/CodeGen/X86/vec_minmax_sint.ll532
-rw-r--r--test/CodeGen/X86/vec_minmax_uint.ll580
-rw-r--r--test/CodeGen/X86/vec_partial.ll12
-rw-r--r--test/CodeGen/X86/vec_reassociate.ll40
-rw-r--r--test/CodeGen/X86/vec_return.ll4
-rw-r--r--test/CodeGen/X86/vec_sdiv_to_shift.ll32
-rw-r--r--test/CodeGen/X86/vec_set-2.ll8
-rw-r--r--test/CodeGen/X86/vec_set-3.ll12
-rw-r--r--test/CodeGen/X86/vec_set-4.ll8
-rw-r--r--test/CodeGen/X86/vec_set-6.ll4
-rw-r--r--test/CodeGen/X86/vec_set-7.ll4
-rw-r--r--test/CodeGen/X86/vec_set-8.ll4
-rw-r--r--test/CodeGen/X86/vec_set-A.ll4
-rw-r--r--test/CodeGen/X86/vec_set-B.ll8
-rw-r--r--test/CodeGen/X86/vec_set-C.ll4
-rw-r--r--test/CodeGen/X86/vec_set-D.ll2
-rw-r--r--test/CodeGen/X86/vec_set-F.ll4
-rw-r--r--test/CodeGen/X86/vec_set-H.ll2
-rw-r--r--test/CodeGen/X86/vec_set.ll4
-rw-r--r--test/CodeGen/X86/vec_setcc.ll44
-rw-r--r--test/CodeGen/X86/vec_shift.ll12
-rw-r--r--test/CodeGen/X86/vec_shift2.ll8
-rw-r--r--test/CodeGen/X86/vec_shift3.ll12
-rw-r--r--test/CodeGen/X86/vec_shift4.ll8
-rw-r--r--test/CodeGen/X86/vec_shift5.ll64
-rw-r--r--test/CodeGen/X86/vec_shift6.ll54
-rw-r--r--test/CodeGen/X86/vec_shift7.ll4
-rw-r--r--test/CodeGen/X86/vec_split.ll6
-rw-r--r--test/CodeGen/X86/vec_ss_load_fold.ll186
-rw-r--r--test/CodeGen/X86/vec_trunc_sext.ll4
-rw-r--r--test/CodeGen/X86/vec_uint_to_fp-fastmath.ll26
-rw-r--r--test/CodeGen/X86/vec_unsafe-fp-math.ll4
-rw-r--r--test/CodeGen/X86/vec_zero-2.ll2
-rw-r--r--test/CodeGen/X86/vec_zero.ll2
-rw-r--r--test/CodeGen/X86/vec_zero_cse.ll16
-rw-r--r--test/CodeGen/X86/vector-bitreverse.ll241
-rw-r--r--test/CodeGen/X86/vector-blend.ll265
-rw-r--r--test/CodeGen/X86/vector-compare-all_of.ll180
-rw-r--r--test/CodeGen/X86/vector-compare-any_of.ll180
-rw-r--r--test/CodeGen/X86/vector-compare-combines.ll8
-rw-r--r--test/CodeGen/X86/vector-compare-results.ll5845
-rw-r--r--test/CodeGen/X86/vector-extend-inreg.ll18
-rw-r--r--test/CodeGen/X86/vector-half-conversions.ll3744
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-128.ll179
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-256.ll203
-rw-r--r--test/CodeGen/X86/vector-idiv-sdiv-512.ll2271
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-128.ll161
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-256.ll185
-rw-r--r--test/CodeGen/X86/vector-idiv-udiv-512.ll1917
-rw-r--r--test/CodeGen/X86/vector-idiv.ll12
-rw-r--r--test/CodeGen/X86/vector-interleave.ll22
-rw-r--r--test/CodeGen/X86/vector-intrinsics.ll2
-rw-r--r--test/CodeGen/X86/vector-lzcnt-128.ll258
-rw-r--r--test/CodeGen/X86/vector-lzcnt-256.ll280
-rw-r--r--test/CodeGen/X86/vector-lzcnt-512.ll230
-rw-r--r--test/CodeGen/X86/vector-merge-store-fp-constants.ll4
-rw-r--r--test/CodeGen/X86/vector-mul.ll1090
-rw-r--r--test/CodeGen/X86/vector-narrow-binop.ll12
-rw-r--r--test/CodeGen/X86/vector-pcmp.ll104
-rw-r--r--test/CodeGen/X86/vector-popcnt-128.ll246
-rw-r--r--test/CodeGen/X86/vector-popcnt-256.ll164
-rw-r--r--test/CodeGen/X86/vector-popcnt-512.ll91
-rw-r--r--test/CodeGen/X86/vector-rem.ll8
-rw-r--r--test/CodeGen/X86/vector-rotate-128.ll417
-rw-r--r--test/CodeGen/X86/vector-rotate-256.ll268
-rw-r--r--test/CodeGen/X86/vector-rotate-512.ll98
-rw-r--r--test/CodeGen/X86/vector-sext.ll690
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-128.ll454
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-256.ll324
-rw-r--r--test/CodeGen/X86/vector-shift-ashr-512.ll62
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-128.ll376
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-256.ll306
-rw-r--r--test/CodeGen/X86/vector-shift-lshr-512.ll54
-rw-r--r--test/CodeGen/X86/vector-shift-shl-128.ll302
-rw-r--r--test/CodeGen/X86/vector-shift-shl-256.ll302
-rw-r--r--test/CodeGen/X86/vector-shift-shl-512.ll54
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v16.ll484
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v2.ll640
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v4.ll804
-rw-r--r--test/CodeGen/X86/vector-shuffle-128-v8.ll770
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v16.ll2025
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v32.ll707
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v4.ll749
-rw-r--r--test/CodeGen/X86/vector-shuffle-256-v8.ll1091
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v16.ll232
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v32.ll150
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v64.ll262
-rw-r--r--test/CodeGen/X86/vector-shuffle-512-v8.ll1182
-rw-r--r--test/CodeGen/X86/vector-shuffle-avx512.ll350
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx.ll156
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx2.ll440
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll328
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll40
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll64
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-sse41.ll4
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-sse4a.ll18
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-ssse3.ll320
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining-xop.ll116
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining.ll961
-rw-r--r--test/CodeGen/X86/vector-shuffle-masked.ll1302
-rw-r--r--test/CodeGen/X86/vector-shuffle-mmx.ll15
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse1.ll70
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse41.ll12
-rw-r--r--test/CodeGen/X86/vector-shuffle-sse4a.ll116
-rw-r--r--test/CodeGen/X86/vector-shuffle-v1.ll363
-rw-r--r--test/CodeGen/X86/vector-shuffle-v48.ll23
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-128.ll448
-rw-r--r--test/CodeGen/X86/vector-shuffle-variable-256.ll585
-rw-r--r--test/CodeGen/X86/vector-sqrt.ll4
-rw-r--r--test/CodeGen/X86/vector-trunc-math.ll1465
-rw-r--r--test/CodeGen/X86/vector-trunc.ll1173
-rw-r--r--test/CodeGen/X86/vector-truncate-combine.ll10
-rw-r--r--test/CodeGen/X86/vector-tzcnt-128.ll685
-rw-r--r--test/CodeGen/X86/vector-tzcnt-256.ll641
-rw-r--r--test/CodeGen/X86/vector-tzcnt-512.ll297
-rw-r--r--test/CodeGen/X86/vector-unsigned-cmp.ll72
-rw-r--r--test/CodeGen/X86/vector-variable-idx.ll2
-rw-r--r--test/CodeGen/X86/vector-variable-idx2.ll2
-rw-r--r--test/CodeGen/X86/vector-zext.ll635
-rw-r--r--test/CodeGen/X86/vector-zmov.ll8
-rw-r--r--test/CodeGen/X86/vector.ll4
-rw-r--r--test/CodeGen/X86/verifier-phi-fail0.mir30
-rw-r--r--test/CodeGen/X86/verifier-phi.mir34
-rw-r--r--test/CodeGen/X86/vfcmp.ll2
-rw-r--r--test/CodeGen/X86/viabs.ll320
-rw-r--r--test/CodeGen/X86/vmovq.ll4
-rw-r--r--test/CodeGen/X86/volatile.ll4
-rw-r--r--test/CodeGen/X86/vortex-bug.ll2
-rw-r--r--test/CodeGen/X86/vpshufbitqbm-intrinsics.ll41
-rw-r--r--test/CodeGen/X86/vselect-2.ll32
-rw-r--r--test/CodeGen/X86/vselect-avx.ll35
-rw-r--r--test/CodeGen/X86/vselect-constants.ll258
-rw-r--r--test/CodeGen/X86/vselect-minmax.ll1632
-rw-r--r--test/CodeGen/X86/vselect-packss.ll418
-rw-r--r--test/CodeGen/X86/vselect-pcmp.ll88
-rw-r--r--test/CodeGen/X86/vselect-zero.ll59
-rw-r--r--test/CodeGen/X86/vselect.ll168
-rw-r--r--test/CodeGen/X86/vshift-1.ll24
-rw-r--r--test/CodeGen/X86/vshift-2.ll24
-rw-r--r--test/CodeGen/X86/vshift-3.ll20
-rw-r--r--test/CodeGen/X86/vshift-4.ll48
-rw-r--r--test/CodeGen/X86/vshift-5.ll16
-rw-r--r--test/CodeGen/X86/vshift-6.ll4
-rw-r--r--test/CodeGen/X86/vshift_split.ll2
-rw-r--r--test/CodeGen/X86/vshift_split2.ll2
-rw-r--r--test/CodeGen/X86/vsplit-and.ll26
-rw-r--r--test/CodeGen/X86/vzero-excess.ll8
-rw-r--r--test/CodeGen/X86/weak-undef.ll58
-rw-r--r--test/CodeGen/X86/weak.ll2
-rw-r--r--test/CodeGen/X86/wide-fma-contraction.ll14
-rw-r--r--test/CodeGen/X86/wide-integer-cmp.ll24
-rw-r--r--test/CodeGen/X86/wide-integer-fold.ll2
-rw-r--r--test/CodeGen/X86/widen_arith-1.ll4
-rw-r--r--test/CodeGen/X86/widen_arith-2.ll15
-rw-r--r--test/CodeGen/X86/widen_arith-3.ll6
-rw-r--r--test/CodeGen/X86/widen_arith-4.ll4
-rw-r--r--test/CodeGen/X86/widen_arith-5.ll4
-rw-r--r--test/CodeGen/X86/widen_arith-6.ll4
-rw-r--r--test/CodeGen/X86/widen_bitops-0.ll84
-rw-r--r--test/CodeGen/X86/widen_bitops-1.ll48
-rw-r--r--test/CodeGen/X86/widen_cast-1.ll4
-rw-r--r--test/CodeGen/X86/widen_cast-2.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-3.ll4
-rw-r--r--test/CodeGen/X86/widen_cast-4.ll10
-rw-r--r--test/CodeGen/X86/widen_cast-5.ll4
-rw-r--r--test/CodeGen/X86/widen_cast-6.ll4
-rw-r--r--test/CodeGen/X86/widen_compare-1.ll8
-rw-r--r--test/CodeGen/X86/widen_conv-1.ll12
-rw-r--r--test/CodeGen/X86/widen_conv-2.ll4
-rw-r--r--test/CodeGen/X86/widen_conv-3.ll14
-rw-r--r--test/CodeGen/X86/widen_conv-4.ll16
-rw-r--r--test/CodeGen/X86/widen_conversions.ll4
-rw-r--r--test/CodeGen/X86/widen_extract-1.ll10
-rw-r--r--test/CodeGen/X86/widen_load-0.ll4
-rw-r--r--test/CodeGen/X86/widen_load-1.ll4
-rw-r--r--test/CodeGen/X86/widen_load-2.ll44
-rw-r--r--test/CodeGen/X86/widen_load-3.ll128
-rw-r--r--test/CodeGen/X86/widen_shuffle-1.ll20
-rw-r--r--test/CodeGen/X86/widened-broadcast.ll235
-rw-r--r--test/CodeGen/X86/win32-pic-jumptable.ll2
-rw-r--r--test/CodeGen/X86/win32-preemption.ll236
-rw-r--r--test/CodeGen/X86/win64_sibcall.ll4
-rw-r--r--test/CodeGen/X86/win_chkstk.ll8
-rw-r--r--test/CodeGen/X86/win_coreclr_chkstk.ll4
-rw-r--r--test/CodeGen/X86/x32-cet-intrinsics.ll106
-rw-r--r--test/CodeGen/X86/x32-lea-1.ll2
-rw-r--r--test/CodeGen/X86/x64-cet-intrinsics.ll150
-rw-r--r--test/CodeGen/X86/x86-64-disp.ll2
-rw-r--r--test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll2
-rw-r--r--test/CodeGen/X86/x86-64-double-shifts-var.ll40
-rw-r--r--test/CodeGen/X86/x86-64-intrcc-nosse.ll2
-rw-r--r--test/CodeGen/X86/x86-64-ret0.ll2
-rw-r--r--test/CodeGen/X86/x86-cmov-converter.ll183
-rw-r--r--test/CodeGen/X86/x86-fold-pshufb.ll8
-rw-r--r--test/CodeGen/X86/x86-interleaved-access.ll1836
-rw-r--r--test/CodeGen/X86/x86-interleaved-check.ll15
-rw-r--r--test/CodeGen/X86/x86-interrupt_cc.ll2
-rw-r--r--test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll13
-rw-r--r--test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll10
-rw-r--r--test/CodeGen/X86/x86-shifts.ll68
-rw-r--r--test/CodeGen/X86/x86-shrink-wrapping.ll6
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll6
-rw-r--r--test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll4
-rw-r--r--test/CodeGen/X86/x87-schedule.ll5848
-rw-r--r--test/CodeGen/X86/x87.ll12
-rw-r--r--test/CodeGen/X86/xaluo.ll294
-rw-r--r--test/CodeGen/X86/xchg-nofold.ll4
-rw-r--r--test/CodeGen/X86/xmm-r64.ll2
-rw-r--r--test/CodeGen/X86/xmulo.ll144
-rw-r--r--test/CodeGen/X86/xop-ifma.ll22
-rw-r--r--test/CodeGen/X86/xop-intrinsics-fast-isel.ll260
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll154
-rw-r--r--test/CodeGen/X86/xop-intrinsics-x86_64.ll166
-rw-r--r--test/CodeGen/X86/xop-mask-comments.ll56
-rw-r--r--test/CodeGen/X86/xop-pcmov.ll24
-rw-r--r--test/CodeGen/X86/xop-schedule.ll998
-rw-r--r--test/CodeGen/X86/xor-combine-debugloc.ll10
-rw-r--r--test/CodeGen/X86/xor-icmp.ll24
-rw-r--r--test/CodeGen/X86/xor-select-i1-combine.ll4
-rw-r--r--test/CodeGen/X86/xray-attribute-instrumentation.ll28
-rw-r--r--test/CodeGen/X86/xray-custom-log.ll25
-rw-r--r--test/CodeGen/X86/xray-log-args.ll44
-rw-r--r--test/CodeGen/X86/xray-loop-detection.ll1
-rw-r--r--test/CodeGen/X86/xray-section-group.ll4
-rw-r--r--test/CodeGen/X86/xray-tail-call-sled.ll22
-rw-r--r--test/CodeGen/X86/xtest.ll2
-rw-r--r--test/CodeGen/X86/zero-remat.ll6
-rw-r--r--test/CodeGen/X86/zext-inreg-0.ll4
-rw-r--r--test/CodeGen/X86/zext-inreg-1.ll2
-rw-r--r--test/CodeGen/X86/zext-shl.ll4
-rw-r--r--test/CodeGen/X86/zext-trunc.ll2
-rw-r--r--test/CodeGen/X86/zlib-longest-match.ll2
1971 files changed, 274041 insertions, 122225 deletions
diff --git a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
index cb90bf644d5f..8a0710865058 100644
--- a/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
+++ b/test/CodeGen/X86/2003-08-03-CallArgLiveRanges.ll
@@ -4,7 +4,7 @@
; it makes a ton of annoying overlapping live ranges. This code should not
; cause spills!
;
-; RUN: llc < %s -march=x86 -stats 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -stats 2>&1 | FileCheck %s
; CHECK-NOT: spilled
diff --git a/test/CodeGen/X86/2003-08-23-DeadBlockTest.ll b/test/CodeGen/X86/2003-08-23-DeadBlockTest.ll
index 5c40eeaa1ead..db2ad18d0efb 100644
--- a/test/CodeGen/X86/2003-08-23-DeadBlockTest.ll
+++ b/test/CodeGen/X86/2003-08-23-DeadBlockTest.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i32 @test() {
entry:
diff --git a/test/CodeGen/X86/2003-11-03-GlobalBool.ll b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
index e0d4988abbf7..16bbbce5eb7d 100644
--- a/test/CodeGen/X86/2003-11-03-GlobalBool.ll
+++ b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
@X = global i1 true
; CHECK-NOT: .byte true
diff --git a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
index bd3317a68b8c..7fdb070081a1 100644
--- a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
+++ b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
declare i8* @llvm.returnaddress(i32)
diff --git a/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll b/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll
index d7f7e262d893..aa68ade1dfe2 100644
--- a/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll
+++ b/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target triple = "i686-unknown-unknown"
diff --git a/test/CodeGen/X86/2004-02-22-Casts.ll b/test/CodeGen/X86/2004-02-22-Casts.ll
index dabf7d3c15b6..dafc1d597625 100644
--- a/test/CodeGen/X86/2004-02-22-Casts.ll
+++ b/test/CodeGen/X86/2004-02-22-Casts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i1 @test1(double %X) {
%V = fcmp one double %X, 0.000000e+00 ; <i1> [#uses=1]
ret i1 %V
diff --git a/test/CodeGen/X86/2004-03-30-Select-Max.ll b/test/CodeGen/X86/2004-03-30-Select-Max.ll
index e22aa6a09398..aa5bf631b522 100644
--- a/test/CodeGen/X86/2004-03-30-Select-Max.ll
+++ b/test/CodeGen/X86/2004-03-30-Select-Max.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | FileCheck %s
; CHECK-NOT: {{j[lgbe]}}
define i32 @max(i32 %A, i32 %B) nounwind {
diff --git a/test/CodeGen/X86/2004-04-13-FPCMOV-Crash.ll b/test/CodeGen/X86/2004-04-13-FPCMOV-Crash.ll
index f8ed016f99b6..915dc4e41609 100644
--- a/test/CodeGen/X86/2004-04-13-FPCMOV-Crash.ll
+++ b/test/CodeGen/X86/2004-04-13-FPCMOV-Crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define double @test(double %d) {
%X = select i1 false, double %d, double %d ; <double> [#uses=1]
diff --git a/test/CodeGen/X86/2004-06-10-StackifierCrash.ll b/test/CodeGen/X86/2004-06-10-StackifierCrash.ll
index 036aa6a77f40..88acc6ad9af8 100644
--- a/test/CodeGen/X86/2004-06-10-StackifierCrash.ll
+++ b/test/CodeGen/X86/2004-06-10-StackifierCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i1 @T(double %X) {
%V = fcmp oeq double %X, %X ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/2004-10-08-SelectSetCCFold.ll b/test/CodeGen/X86/2004-10-08-SelectSetCCFold.ll
index db3af0139cee..2bd755d979d1 100644
--- a/test/CodeGen/X86/2004-10-08-SelectSetCCFold.ll
+++ b/test/CodeGen/X86/2004-10-08-SelectSetCCFold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i1 @test(i1 %C, i1 %D, i32 %X, i32 %Y) {
%E = icmp slt i32 %X, %Y ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/2005-01-17-CycleInDAG.ll b/test/CodeGen/X86/2005-01-17-CycleInDAG.ll
index 7bb634d97130..1f9f2c6240da 100644
--- a/test/CodeGen/X86/2005-01-17-CycleInDAG.ll
+++ b/test/CodeGen/X86/2005-01-17-CycleInDAG.ll
@@ -3,7 +3,7 @@
; is invalid code (there is no correct way to order the instruction). Check
; that we do not fold the load into the sub.
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
@GLOBAL = external global i32
diff --git a/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll b/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll
index 1e3a0937d5b1..16fea863b494 100644
--- a/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll
+++ b/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
@A = external global i32 ; <i32*> [#uses=1]
@Y = global i32* getelementptr (i32, i32* @A, i32 -1) ; <i32**> [#uses=0]
diff --git a/test/CodeGen/X86/2005-05-08-FPStackifierPHI.ll b/test/CodeGen/X86/2005-05-08-FPStackifierPHI.ll
index 5266009c55a5..47cd0da7a4d3 100644
--- a/test/CodeGen/X86/2005-05-08-FPStackifierPHI.ll
+++ b/test/CodeGen/X86/2005-05-08-FPStackifierPHI.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic
+; RUN: llc < %s -mtriple=i686-- -mcpu=generic
; Make sure LLC doesn't crash in the stackifier due to FP PHI nodes.
define void @radfg_() {
diff --git a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
index 48f5bc3e2986..288e2921daae 100644
--- a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
@@ -10,7 +10,7 @@ target triple = "i686-unknown-unknown"
define i32 @test5(i32 %B, i8 %C) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl A, %eax
diff --git a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
index ca3eb9cda372..4bc6b1a53d9d 100644
--- a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
@@ -3,7 +3,7 @@
define i32 @f(i32 %a, i32 %b) {
; CHECK-LABEL: f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl %ecx, %edx
diff --git a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
index 6963b1d92f6c..ece16192566d 100644
--- a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -stats 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -stats 2>&1 | FileCheck %s
; CHECK: 7 asm-printer
define i32 @g(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/X86/2006-04-04-CrossBlockCrash.ll b/test/CodeGen/X86/2006-04-04-CrossBlockCrash.ll
index 3f67097ddc0d..ddcc7e2a6d15 100644
--- a/test/CodeGen/X86/2006-04-04-CrossBlockCrash.ll
+++ b/test/CodeGen/X86/2006-04-04-CrossBlockCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah
+; RUN: llc < %s -mcpu=yonah
; END.
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
index 9f44bc348e37..c70cbd9c3e14 100644
--- a/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-04-27-ISelFoldingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin8 -relocation-model=static > %t
+; RUN: llc < %s -mtriple=i686-apple-darwin8 -relocation-model=static > %t
; RUN: grep "movl _last" %t | count 1
; RUN: grep "cmpl.*_last" %t | count 1
diff --git a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
index 583877e66582..6f2fadbe8ca6 100644
--- a/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
+++ b/test/CodeGen/X86/2006-05-01-SchedCausingSpills.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | \
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah -stats 2>&1 | \
; RUN: not grep "Number of register spills"
; END.
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched2.ll b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
index 3281c68e9334..f5be3a3e2bff 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched2.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched2.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -stats 2>&1 | \
+; RUN: llc < %s -mtriple=i686-- -stats 2>&1 | \
; RUN: grep asm-printer | grep 13
define void @_ZN9__gnu_cxx9hashtableISt4pairIKPKciES3_NS_4hashIS3_EESt10_Select1stIS5_E5eqstrSaIiEE14find_or_insertERKS5__cond_true456.i(i8* %tmp435.i, i32* %tmp449.i.out) nounwind {
diff --git a/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll b/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll
index b70d375bf51a..6c100aa92666 100644
--- a/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll
+++ b/test/CodeGen/X86/2006-05-08-CoalesceSubRegClass.ll
@@ -1,7 +1,7 @@
; Coalescing from R32 to a subset R32_. Once another register coalescer bug is
; fixed, the movb should go away as well.
-; RUN: llc < %s -march=x86 -relocation-model=static | \
+; RUN: llc < %s -mtriple=i686-- -relocation-model=static | \
; RUN: grep movl
@B = external global i32 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/2006-05-08-InstrSched.ll b/test/CodeGen/X86/2006-05-08-InstrSched.ll
index cd46ecfef525..ffe1a9ce3e9a 100644
--- a/test/CodeGen/X86/2006-05-08-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-08-InstrSched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | not grep "subl.*%esp"
+; RUN: llc < %s -mtriple=i686-- -relocation-model=static | not grep "subl.*%esp"
@A = external global i16* ; <i16**> [#uses=1]
@B = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index b1deb2c5f567..d4732a62a4f7 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
-; RUN: grep "asm-printer" | grep 35
+; RUN: llc < %s -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
+; RUN: grep "asm-printer" | grep 33
target datalayout = "e-p:32:32"
define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
diff --git a/test/CodeGen/X86/2006-05-17-VectorArg.ll b/test/CodeGen/X86/2006-05-17-VectorArg.ll
index b36d61e0f31b..7f022b6a607b 100644
--- a/test/CodeGen/X86/2006-05-17-VectorArg.ll
+++ b/test/CodeGen/X86/2006-05-17-VectorArg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define <4 x float> @opRSQ(<4 x float> %a) nounwind {
entry:
diff --git a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
index 3be77f5c3099..bea11e91669a 100644
--- a/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
+++ b/test/CodeGen/X86/2006-05-22-FPSetEQ.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=-sse | FileCheck %s -check-prefix=WITHNANS
-; RUN: llc < %s -march=x86 -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse | FileCheck %s -check-prefix=WITHNANS
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -check-prefix=NONANS
; WITHNANS-LABEL: test:
; WITHNANS: setnp
diff --git a/test/CodeGen/X86/2006-05-25-CycleInDAG.ll b/test/CodeGen/X86/2006-05-25-CycleInDAG.ll
index 6ff879760ea0..987a229d4271 100644
--- a/test/CodeGen/X86/2006-05-25-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-05-25-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i32 @test() {
br i1 false, label %cond_next33, label %cond_true12
diff --git a/test/CodeGen/X86/2006-07-10-InlineAsmAConstraint.ll b/test/CodeGen/X86/2006-07-10-InlineAsmAConstraint.ll
index 4ea364d57e51..1c5b9d07eeb8 100644
--- a/test/CodeGen/X86/2006-07-10-InlineAsmAConstraint.ll
+++ b/test/CodeGen/X86/2006-07-10-InlineAsmAConstraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR825
define i64 @test() {
diff --git a/test/CodeGen/X86/2006-07-12-InlineAsmQConstraint.ll b/test/CodeGen/X86/2006-07-12-InlineAsmQConstraint.ll
index 568fbbcc4f4f..fe3c7cfa3d7d 100644
--- a/test/CodeGen/X86/2006-07-12-InlineAsmQConstraint.ll
+++ b/test/CodeGen/X86/2006-07-12-InlineAsmQConstraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR828
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/2006-07-20-InlineAsm.ll b/test/CodeGen/X86/2006-07-20-InlineAsm.ll
index 795e898df347..944fae68abc4 100644
--- a/test/CodeGen/X86/2006-07-20-InlineAsm.ll
+++ b/test/CodeGen/X86/2006-07-20-InlineAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as
; PR833
@G = weak global i32 0 ; <i32*> [#uses=3]
diff --git a/test/CodeGen/X86/2006-07-28-AsmPrint-Long-As-Pointer.ll b/test/CodeGen/X86/2006-07-28-AsmPrint-Long-As-Pointer.ll
index deae086cf76c..1c4c6478bc8b 100644
--- a/test/CodeGen/X86/2006-07-28-AsmPrint-Long-As-Pointer.ll
+++ b/test/CodeGen/X86/2006-07-28-AsmPrint-Long-As-Pointer.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep -- 4294967240
+; RUN: llc < %s -mtriple=i686-- | grep -- 4294967240
; PR853
@X = global i32* inttoptr (i64 -56 to i32*) ; <i32**> [#uses=0]
diff --git a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
index 2a9c8324d36a..baf4dfa2e43d 100644
--- a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
+++ b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
@@ -1,5 +1,5 @@
; PR850
-; RUN: llc < %s -march=x86 -x86-asm-syntax=att -no-integrated-as | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=att -no-integrated-as | FileCheck %s
; CHECK: {{movl 4[(]%eax[)],%ebp}}
; CHECK: {{movl 0[(]%eax[)], %ebx}}
diff --git a/test/CodeGen/X86/2006-08-07-CycleInDAG.ll b/test/CodeGen/X86/2006-08-07-CycleInDAG.ll
index 397bc26dbecd..66c79303fa46 100644
--- a/test/CodeGen/X86/2006-08-07-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-08-07-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
%struct.foo = type opaque
define fastcc i32 @test(%struct.foo* %v, %struct.foo* %vi) {
diff --git a/test/CodeGen/X86/2006-08-16-CycleInDAG.ll b/test/CodeGen/X86/2006-08-16-CycleInDAG.ll
index 2c44adf6829c..20a0163227b8 100644
--- a/test/CodeGen/X86/2006-08-16-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-08-16-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
%struct.expr = type { %struct.rtx_def*, i32, %struct.expr*, %struct.occr*, %struct.occr*, %struct.rtx_def* }
%struct.hash_table = type { %struct.expr**, i32, i32, i32 }
%struct.occr = type { %struct.occr*, %struct.rtx_def*, i8, i8 }
diff --git a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
index 56d5f2f3040a..1601d684e1f6 100644
--- a/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
+++ b/test/CodeGen/X86/2006-08-21-ExtraMovInst.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 | \
+; RUN: llc < %s -mtriple=i686-- -mcpu=i386 | \
; RUN: not grep "movl %eax, %edx"
define i32 @foo(i32 %t, i32 %C) {
diff --git a/test/CodeGen/X86/2006-09-01-CycleInDAG.ll b/test/CodeGen/X86/2006-09-01-CycleInDAG.ll
index a7a10afaae1d..3c32e1d7c6a3 100644
--- a/test/CodeGen/X86/2006-09-01-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-09-01-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin8"
%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2006-10-09-CycleInDAG.ll b/test/CodeGen/X86/2006-10-09-CycleInDAG.ll
index e2c84ea569e6..00726f9b820a 100644
--- a/test/CodeGen/X86/2006-10-09-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-10-09-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define void @_ZN13QFSFileEngine4readEPcx() {
%tmp201 = load i32, i32* null ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/2006-10-10-FindModifiedNodeSlotBug.ll b/test/CodeGen/X86/2006-10-10-FindModifiedNodeSlotBug.ll
index 435582587856..0b0fc05269d1 100644
--- a/test/CodeGen/X86/2006-10-10-FindModifiedNodeSlotBug.ll
+++ b/test/CodeGen/X86/2006-10-10-FindModifiedNodeSlotBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep shrl
+; RUN: llc < %s -mtriple=i686-- | grep shrl
; Bug in FindModifiedNodeSlot cause tmp14 load to become a zextload and shr 31
; is then optimized away.
@tree_code_type = external global [0 x i32] ; <[0 x i32]*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-10-12-CycleInDAG.ll b/test/CodeGen/X86/2006-10-12-CycleInDAG.ll
index 7a32ef7801d9..8b9b91b31d2d 100644
--- a/test/CodeGen/X86/2006-10-12-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-10-12-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
%struct.function = type opaque
%struct.lang_decl = type opaque
%struct.location_t = type { i8*, i32 }
diff --git a/test/CodeGen/X86/2006-10-13-CycleInDAG.ll b/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
index c45469d4e3ee..9d2ec80b7958 100644
--- a/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
+++ b/test/CodeGen/X86/2006-10-13-CycleInDAG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
@str = external global [18 x i8] ; <[18 x i8]*> [#uses=1]
define void @test() {
diff --git a/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll b/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
index 332816e22cda..0e589f2f2e39 100644
--- a/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
+++ b/test/CodeGen/X86/2006-10-19-SwitchUnnecessaryBranching.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -asm-verbose | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -asm-verbose | FileCheck %s
@str = internal constant [14 x i8] c"Hello world!\0A\00" ; <[14 x i8]*> [#uses=1]
@str.upgrd.1 = internal constant [13 x i8] c"Blah world!\0A\00" ; <[13 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-11-12-CSRetCC.ll b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
index 9e1bf9edbbc4..98d627b9ffe7 100644
--- a/test/CodeGen/X86/2006-11-12-CSRetCC.ll
+++ b/test/CodeGen/X86/2006-11-12-CSRetCC.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target triple = "i686-pc-linux-gnu"
@str = internal constant [9 x i8] c"%f+%f*i\0A\00" ; <[9 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2006-11-17-IllegalMove.ll b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
index c0bd6f728422..f110bd89a301 100644
--- a/test/CodeGen/X86/2006-11-17-IllegalMove.ll
+++ b/test/CodeGen/X86/2006-11-17-IllegalMove.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 > %t
+; RUN: llc < %s -mtriple=x86_64-- > %t
; RUN: grep movb %t | count 1
; RUN: grep "movzb[wl]" %t
diff --git a/test/CodeGen/X86/2006-11-27-SelectLegalize.ll b/test/CodeGen/X86/2006-11-27-SelectLegalize.ll
index ba83a8db8399..87aba33fb7c4 100644
--- a/test/CodeGen/X86/2006-11-27-SelectLegalize.ll
+++ b/test/CodeGen/X86/2006-11-27-SelectLegalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; PR1016
; CHECK: {{test.*1}}
diff --git a/test/CodeGen/X86/2006-12-16-InlineAsmCrash.ll b/test/CodeGen/X86/2006-12-16-InlineAsmCrash.ll
index 080de1fb553e..60bffdd908c2 100644
--- a/test/CodeGen/X86/2006-12-16-InlineAsmCrash.ll
+++ b/test/CodeGen/X86/2006-12-16-InlineAsmCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR1049
target datalayout = "e-p:32:32"
target triple = "i686-pc-linux-gnu"
diff --git a/test/CodeGen/X86/2006-12-19-IntelSyntax.ll b/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
index 2c3c5c99c1b2..de56a5280d19 100644
--- a/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
+++ b/test/CodeGen/X86/2006-12-19-IntelSyntax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel
+; RUN: llc < %s -x86-asm-syntax=intel
; PR1061
target datalayout = "e-p:32:32"
target triple = "i686-pc-linux-gnu"
diff --git a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
index f83eea179d6a..f8975e6c3b5d 100644
--- a/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
+++ b/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 > %t
+; RUN: llc < %s -mtriple=x86_64-- > %t
; RUN: not grep ",%rsp)" %t
; PR1103
diff --git a/test/CodeGen/X86/2007-01-29-InlineAsm-ir.ll b/test/CodeGen/X86/2007-01-29-InlineAsm-ir.ll
index e83e2e54e455..f16a8783d7de 100644
--- a/test/CodeGen/X86/2007-01-29-InlineAsm-ir.ll
+++ b/test/CodeGen/X86/2007-01-29-InlineAsm-ir.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; Test 'ri' constraint.
define void @run_init_process() {
diff --git a/test/CodeGen/X86/2007-02-04-OrAddrMode.ll b/test/CodeGen/X86/2007-02-04-OrAddrMode.ll
index f05175259c80..660331cabd18 100644
--- a/test/CodeGen/X86/2007-02-04-OrAddrMode.ll
+++ b/test/CodeGen/X86/2007-02-04-OrAddrMode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
;; This example can't fold the or into an LEA.
define i32 @test(float ** %tmp2, i32 %tmp12) nounwind {
diff --git a/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
index a9b85b94cd41..201de30abc0b 100644
--- a/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-02-19-LiveIntervalAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-pc-linux-gnu -relocation-model=pic
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -relocation-model=pic
; PR1027
%struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }
diff --git a/test/CodeGen/X86/2007-02-23-DAGCombine-Miscompile.ll b/test/CodeGen/X86/2007-02-23-DAGCombine-Miscompile.ll
index cbc1bc47fb15..be4e16917446 100644
--- a/test/CodeGen/X86/2007-02-23-DAGCombine-Miscompile.ll
+++ b/test/CodeGen/X86/2007-02-23-DAGCombine-Miscompile.ll
@@ -1,5 +1,5 @@
; PR1219
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i32 @test(i1 %X) {
; CHECK-LABEL: test:
diff --git a/test/CodeGen/X86/2007-02-25-FastCCStack.ll b/test/CodeGen/X86/2007-02-25-FastCCStack.ll
index 2e2b56d04a25..e22f0b4540db 100644
--- a/test/CodeGen/X86/2007-02-25-FastCCStack.ll
+++ b/test/CodeGen/X86/2007-02-25-FastCCStack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium3
+; RUN: llc < %s -mtriple=i686-- -mcpu=pentium3
define internal fastcc double @ggc_rlimit_bound(double %limit) {
ret double %limit
diff --git a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
index 645221fe299e..a9140f505c62 100644
--- a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i686-darwin | FileCheck %s
define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
entry:
diff --git a/test/CodeGen/X86/2007-03-16-InlineAsm.ll b/test/CodeGen/X86/2007-03-16-InlineAsm.ll
index 61746814f9a0..d04b0966b3a8 100644
--- a/test/CodeGen/X86/2007-03-16-InlineAsm.ll
+++ b/test/CodeGen/X86/2007-03-16-InlineAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; ModuleID = 'a.bc'
diff --git a/test/CodeGen/X86/2007-03-18-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-03-18-LiveIntervalAssert.ll
index 70936fbc9281..4f5e0914612b 100644
--- a/test/CodeGen/X86/2007-03-18-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-03-18-LiveIntervalAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR1259
define void @test() {
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmMultiRegConstraint.ll b/test/CodeGen/X86/2007-03-24-InlineAsmMultiRegConstraint.ll
index 44d68dd0493e..c7914037a00b 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmMultiRegConstraint.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmMultiRegConstraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i32 @test(i16 %tmp40414244) {
%tmp48 = call i32 asm sideeffect "inl ${1:w}, $0", "={ax},N{dx},~{dirflag},~{fpsr},~{flags}"( i16 %tmp40414244 )
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
index 93fb344cbb1d..da9a81c07395 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as | grep "mov %gs:72, %eax"
+; RUN: llc < %s -no-integrated-as | grep "mov %gs:72, %eax"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
index 6cf8bf90611c..5713804aa032 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=yonah -march=x86 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -mcpu=yonah -no-integrated-as | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmXConstraint.ll b/test/CodeGen/X86/2007-03-24-InlineAsmXConstraint.ll
index 3e1786bef793..9b9d819c1efb 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmXConstraint.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmXConstraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-26-CoalescerBug.ll b/test/CodeGen/X86/2007-03-26-CoalescerBug.ll
index 9a3d4cb3bee7..52001f5c21dc 100644
--- a/test/CodeGen/X86/2007-03-26-CoalescerBug.ll
+++ b/test/CodeGen/X86/2007-03-26-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
@data = external global [339 x i64]
diff --git a/test/CodeGen/X86/2007-04-11-InlineAsmVectorResult.ll b/test/CodeGen/X86/2007-04-11-InlineAsmVectorResult.ll
index f48c13259c42..5bf669bb1967 100644
--- a/test/CodeGen/X86/2007-04-11-InlineAsmVectorResult.ll
+++ b/test/CodeGen/X86/2007-04-11-InlineAsmVectorResult.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah
+; RUN: llc < %s -mcpu=yonah
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/2007-04-24-Huge-Stack.ll b/test/CodeGen/X86/2007-04-24-Huge-Stack.ll
index 648718cc7223..b6074053d454 100644
--- a/test/CodeGen/X86/2007-04-24-Huge-Stack.ll
+++ b/test/CodeGen/X86/2007-04-24-Huge-Stack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; PR1348
; CHECK-NOT: 4294967112
diff --git a/test/CodeGen/X86/2007-05-05-VecCastExpand.ll b/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
index 0edf1398295d..19b9224cdc07 100644
--- a/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
+++ b/test/CodeGen/X86/2007-05-05-VecCastExpand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 -mattr=+sse
+; RUN: llc < %s -mtriple=i686-- -mcpu=i386 -mattr=+sse
; PR1371
@str = external global [18 x i8] ; <[18 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
index 9ce5f5ac63a1..24044e7ebb94 100644
--- a/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
+++ b/test/CodeGen/X86/2007-05-14-LiveIntervalAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
%struct.XDesc = type <{ i32, %struct.OpaqueXDataStorageType** }>
%struct.OpaqueXDataStorageType = type opaque
diff --git a/test/CodeGen/X86/2007-05-17-ShuffleISelBug.ll b/test/CodeGen/X86/2007-05-17-ShuffleISelBug.ll
index 38fc5e18fe28..d5b5fa200dad 100644
--- a/test/CodeGen/X86/2007-05-17-ShuffleISelBug.ll
+++ b/test/CodeGen/X86/2007-05-17-ShuffleISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
; CHECK-NOT: punpckhwd
diff --git a/test/CodeGen/X86/2007-06-28-X86-64-isel.ll b/test/CodeGen/X86/2007-06-28-X86-64-isel.ll
index 9d42c49317fd..6b300d12b322 100644
--- a/test/CodeGen/X86/2007-06-28-X86-64-isel.ll
+++ b/test/CodeGen/X86/2007-06-28-X86-64-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2
define void @test() {
%tmp1 = call <8 x i16> @llvm.x86.sse2.pmins.w( <8 x i16> zeroinitializer, <8 x i16> bitcast (<4 x i32> < i32 7, i32 7, i32 7, i32 7 > to <8 x i16>) )
diff --git a/test/CodeGen/X86/2007-06-29-DAGCombinerBug.ll b/test/CodeGen/X86/2007-06-29-DAGCombinerBug.ll
index d2d6388c0782..dc74352988f4 100644
--- a/test/CodeGen/X86/2007-06-29-DAGCombinerBug.ll
+++ b/test/CodeGen/X86/2007-06-29-DAGCombinerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define void @test() {
entry:
diff --git a/test/CodeGen/X86/2007-06-29-VecFPConstantCSEBug.ll b/test/CodeGen/X86/2007-06-29-VecFPConstantCSEBug.ll
index 87edab77ac19..bd8917cd7637 100644
--- a/test/CodeGen/X86/2007-06-29-VecFPConstantCSEBug.ll
+++ b/test/CodeGen/X86/2007-06-29-VecFPConstantCSEBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define void @test(<4 x float>* %arg) {
%tmp89 = getelementptr <4 x float>, <4 x float>* %arg, i64 3
diff --git a/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll b/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll
index 4f7ae327d1fd..ac3dd78160ca 100644
--- a/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll
+++ b/test/CodeGen/X86/2007-08-01-LiveVariablesBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; CHECK-NOT: movl
diff --git a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
index 85a144083ece..a798839fdb90 100644
--- a/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
+++ b/test/CodeGen/X86/2007-08-10-SignExtSubreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "movsbl"
+; RUN: llc < %s -mtriple=i686-- | grep "movsbl"
@X = global i32 0 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2007-10-04-AvoidEFLAGSCopy.ll b/test/CodeGen/X86/2007-10-04-AvoidEFLAGSCopy.ll
index 7eb018ce525f..bcffac2cdd4c 100644
--- a/test/CodeGen/X86/2007-10-04-AvoidEFLAGSCopy.ll
+++ b/test/CodeGen/X86/2007-10-04-AvoidEFLAGSCopy.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep pushf
+; RUN: llc < %s -mtriple=i686-- | not grep pushf
%struct.gl_texture_image = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, i8* }
%struct.gl_texture_object = type { i32, i32, i32, float, [4 x i32], i32, i32, i32, i32, i32, float, [11 x %struct.gl_texture_image*], [1024 x i8], i32, i32, i32, i8, i8*, i8, void (%struct.gl_texture_object*, i32, float*, float*, float*, float*, i8*, i8*, i8*, i8*)*, %struct.gl_texture_object* }
diff --git a/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll b/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
index c535392ffdfc..6658536bd34f 100644
--- a/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
+++ b/test/CodeGen/X86/2007-10-12-CoalesceExtSubReg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep movb
+; RUN: llc < %s -mtriple=i686-- | not grep movb
define signext i16 @f(i32* %bp, i32* %ss) {
entry:
diff --git a/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll b/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
index c4d5cb970c3f..14cff04e1e1a 100644
--- a/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
+++ b/test/CodeGen/X86/2007-10-12-SpillerUnfold1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep addss | not grep esp
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | grep addss | not grep esp
define fastcc void @fht(float* %fz, i16 signext %n) {
entry:
diff --git a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
index 4b1c1d77f7a0..2601b4d4c53b 100644
--- a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
+++ b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | grep sarl | not grep esp
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 | grep sarl | not grep esp
define signext i16 @t(i16* %qmatrix, i16* %dct, i16* %acBaseTable, i16* %acExtTable, i16 signext %acBaseRes, i16 signext %acMaskRes, i16 signext %acExtRes, i32* %bitptr, i32* %source, i32 %markerPrefix, i8** %byteptr, i32 %scale, i32 %round, i32 %bits) {
entry:
diff --git a/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll b/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll
index 30e1f575caf0..a1bc764e22ec 100644
--- a/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll
+++ b/test/CodeGen/X86/2007-10-19-SpillerUnfold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | FileCheck %s
; CHECK: inc
; CHECK-NOT: PTR
diff --git a/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll b/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll
index 9f57df87fe48..f338ff065ab1 100644
--- a/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll
+++ b/test/CodeGen/X86/2007-10-29-ExtendSetCC.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 1
+; RUN: llc < %s -mtriple=i686-- | grep mov | count 1
define signext i16 @t() {
entry:
diff --git a/test/CodeGen/X86/2007-10-30-LSRCrash.ll b/test/CodeGen/X86/2007-10-30-LSRCrash.ll
index d945d57fad7c..10cb826944b1 100644
--- a/test/CodeGen/X86/2007-10-30-LSRCrash.ll
+++ b/test/CodeGen/X86/2007-10-30-LSRCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i32 @unique(i8* %full, i32 %p, i32 %len, i32 %mode, i32 %verbos, i32 %flags) {
entry:
diff --git a/test/CodeGen/X86/2007-10-31-extractelement-i64.ll b/test/CodeGen/X86/2007-10-31-extractelement-i64.ll
index 3d52b6cf7b3e..2a562458e855 100644
--- a/test/CodeGen/X86/2007-10-31-extractelement-i64.ll
+++ b/test/CodeGen/X86/2007-10-31-extractelement-i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse2
+; RUN: llc < %s -mattr=sse2
; ModuleID = 'yyy.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/2007-11-01-ISelCrash.ll b/test/CodeGen/X86/2007-11-01-ISelCrash.ll
index 019c6a8cc0d9..d0c7a6259026 100644
--- a/test/CodeGen/X86/2007-11-01-ISelCrash.ll
+++ b/test/CodeGen/X86/2007-11-01-ISelCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
%"struct.K::JL" = type <{ i8 }>
%struct.jv = type { i64 }
diff --git a/test/CodeGen/X86/2007-11-06-InstrSched.ll b/test/CodeGen/X86/2007-11-06-InstrSched.ll
index d88b45f68390..3894029e5925 100644
--- a/test/CodeGen/X86/2007-11-06-InstrSched.ll
+++ b/test/CodeGen/X86/2007-11-06-InstrSched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mattr=+sse2 | not grep lea
+; RUN: llc < %s -mtriple=i686-- -mcpu=generic -mattr=+sse2 | not grep lea
define float @foo(i32* %x, float* %y, i32 %c) nounwind {
entry:
diff --git a/test/CodeGen/X86/2007-11-07-MulBy4.ll b/test/CodeGen/X86/2007-11-07-MulBy4.ll
index 06e0a779be1b..9ecf0680f0d3 100644
--- a/test/CodeGen/X86/2007-11-07-MulBy4.ll
+++ b/test/CodeGen/X86/2007-11-07-MulBy4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep imul
+; RUN: llc < %s -mtriple=i686-- | not grep imul
%struct.eebb = type { %struct.eebb*, i16* }
%struct.hf = type { %struct.hf*, i16*, i8*, i32, i32, %struct.eebb*, i32, i32, i8*, i8*, i8*, i8*, i16*, i8*, i16*, %struct.ri, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [30 x i32], %struct.eebb, i32, i8* }
diff --git a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 8e315f4d80ff..1f8d829383cf 100644
--- a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
; Increment in loop bb.i28.i adjusted to 2, to prevent loop reversal from
; kicking in.
diff --git a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
index 26d18273dd47..35aa016ec58b 100644
--- a/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
+++ b/test/CodeGen/X86/2007-12-18-LoadCSEBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | grep "(%esp)" | count 2
+; RUN: llc < %s -mtriple=i686-- -mcpu=generic | grep "(%esp)" | count 2
; PR1872
%struct.c34007g__designated___XUB = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
index fa71bffaf8c6..2246c9894096 100644
--- a/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
+++ b/test/CodeGen/X86/2008-01-08-SchedulerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+cmov -x86-cmov-converter=false | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+cmov -x86-cmov-converter=false | FileCheck %s
;
; Test scheduling a multi-use compare. We should neither spill flags
; nor clone the compare.
diff --git a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
index 6e98f9cb219a..eb8439fc1795 100644
--- a/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
+++ b/test/CodeGen/X86/2008-01-16-FPStackifierAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -regalloc=fast -optimize-regalloc=0
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -regalloc=fast -optimize-regalloc=0
define void @SolveCubic(double %a, double %b, double %c, double %d, i32* %solutions, double* %x) {
entry:
diff --git a/test/CodeGen/X86/2008-01-16-InvalidDAGCombineXform.ll b/test/CodeGen/X86/2008-01-16-InvalidDAGCombineXform.ll
index ffc5a1fb6d41..a08fc1cbd052 100644
--- a/test/CodeGen/X86/2008-01-16-InvalidDAGCombineXform.ll
+++ b/test/CodeGen/X86/2008-01-16-InvalidDAGCombineXform.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep IMPLICIT_DEF
+; RUN: llc < %s -mtriple=i686-- | not grep IMPLICIT_DEF
%struct.node_t = type { double*, %struct.node_t*, %struct.node_t**, double**, double*, i32, i32 }
diff --git a/test/CodeGen/X86/2008-02-05-ISelCrash.ll b/test/CodeGen/X86/2008-02-05-ISelCrash.ll
index ce233a9a554a..a74e7fc6aa41 100644
--- a/test/CodeGen/X86/2008-02-05-ISelCrash.ll
+++ b/test/CodeGen/X86/2008-02-05-ISelCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR1975
@nodes = external global i64 ; <i64*> [#uses=2]
diff --git a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
index 2b4d68ce5df7..ba80086a1510 100644
--- a/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
+++ b/test/CodeGen/X86/2008-02-06-LoadFoldingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
; CHECK: xorps {{.*}}{{LCPI0_0|__xmm@}}
define void @casin({ double, double }* sret %agg.result, double %z.0, double %z.1) nounwind {
diff --git a/test/CodeGen/X86/2008-02-14-BitMiscompile.ll b/test/CodeGen/X86/2008-02-14-BitMiscompile.ll
index fdc1c3bb67ba..d3fa16a07476 100644
--- a/test/CodeGen/X86/2008-02-14-BitMiscompile.ll
+++ b/test/CodeGen/X86/2008-02-14-BitMiscompile.ll
@@ -3,7 +3,7 @@
define i32 @test(i1 %A) {
; CHECK-LABEL: test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: negl %eax
diff --git a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
index ef69bd01cb96..9f11ecd8bb02 100644
--- a/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
+++ b/test/CodeGen/X86/2008-02-18-TailMergingBug.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mcpu=yonah -stats 2>&1 | grep "Number of block tails merged" | grep 16
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah -stats 2>&1 | grep "Number of block tails merged" | grep 16
; PR1909
@.str = internal constant [48 x i8] c"transformed bounds: (%.2f, %.2f), (%.2f, %.2f)\0A\00" ; <[48 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
index 75f88b0e3811..77ed1857e6c7 100644
--- a/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
+++ b/test/CodeGen/X86/2008-02-22-LocalRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -march=x86 -mattr=+mmx | grep esi
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -mtriple=i686-- -mattr=+mmx | grep esi
; PR2082
; Local register allocator was refusing to use ESI, EDI, and EBP so it ran out of
; registers.
diff --git a/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll b/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll
index 382fbed9b88e..7614b40cf9a4 100644
--- a/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-02-25-X86-64-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
%struct.XX = type <{ i8 }>
%struct.YY = type { i64 }
diff --git a/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll b/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
index 11b55a6e5ac7..2589dbcbbf50 100644
--- a/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
+++ b/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -no-integrated-as
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/2008-02-27-DeadSlotElimBug.ll b/test/CodeGen/X86/2008-02-27-DeadSlotElimBug.ll
index 857e6237d14f..de13ebf8b666 100644
--- a/test/CodeGen/X86/2008-02-27-DeadSlotElimBug.ll
+++ b/test/CodeGen/X86/2008-02-27-DeadSlotElimBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
%struct.CompAtom = type <{ %struct.Position, float, i32 }>
%struct.Lattice = type { %struct.Position, %struct.Position, %struct.Position, %struct.Position, %struct.Position, %struct.Position, %struct.Position, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2008-02-27-PEICrash.ll b/test/CodeGen/X86/2008-02-27-PEICrash.ll
index d842967561ab..a257bf072277 100644
--- a/test/CodeGen/X86/2008-02-27-PEICrash.ll
+++ b/test/CodeGen/X86/2008-02-27-PEICrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define i64 @__divsc3(float %a, float %b, float %c, float %d) nounwind readnone {
entry:
diff --git a/test/CodeGen/X86/2008-03-06-frem-fpstack.ll b/test/CodeGen/X86/2008-03-06-frem-fpstack.ll
index 70a83b5c9f57..498d6baa8581 100644
--- a/test/CodeGen/X86/2008-03-06-frem-fpstack.ll
+++ b/test/CodeGen/X86/2008-03-06-frem-fpstack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=i386
+; RUN: llc < %s -mtriple=i686-- -mcpu=i386
; PR2122
define float @func(float %a, float %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-03-07-APIntBug.ll b/test/CodeGen/X86/2008-03-07-APIntBug.ll
index 409bcd51a13b..e17e963ecc90 100644
--- a/test/CodeGen/X86/2008-03-07-APIntBug.ll
+++ b/test/CodeGen/X86/2008-03-07-APIntBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 | not grep 255
+; RUN: llc < %s -mtriple=i686-- -mcpu=i386 | not grep 255
%struct.CONSTRAINT = type { i32, i32, i32, i32 }
%struct.FIRST_UNION = type { %struct.anon }
diff --git a/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll b/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll
index 19d49b21f5bb..4c14024223f8 100644
--- a/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll
+++ b/test/CodeGen/X86/2008-03-13-TwoAddrPassCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define signext i16 @t(i32 %depth) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-03-19-DAGCombinerBug.ll b/test/CodeGen/X86/2008-03-19-DAGCombinerBug.ll
index eaa883c963f2..163b250c6929 100644
--- a/test/CodeGen/X86/2008-03-19-DAGCombinerBug.ll
+++ b/test/CodeGen/X86/2008-03-19-DAGCombinerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i32 @t() nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-03-25-TwoAddrPassBug.ll b/test/CodeGen/X86/2008-03-25-TwoAddrPassBug.ll
index 2d868e0f612a..c7ba60a71742 100644
--- a/test/CodeGen/X86/2008-03-25-TwoAddrPassBug.ll
+++ b/test/CodeGen/X86/2008-03-25-TwoAddrPassBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define void @t() {
entry:
diff --git a/test/CodeGen/X86/2008-04-09-BranchFolding.ll b/test/CodeGen/X86/2008-04-09-BranchFolding.ll
index f21a6f37f4b5..55050706115d 100644
--- a/test/CodeGen/X86/2008-04-09-BranchFolding.ll
+++ b/test/CodeGen/X86/2008-04-09-BranchFolding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep jmp
+; RUN: llc < %s -mtriple=i686-- | not grep jmp
%struct..0anon = type { i32 }
%struct.binding_level = type { %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.binding_level*, i8, i8, i8, i8, i8, i32, %struct.tree_node* }
diff --git a/test/CodeGen/X86/2008-04-16-CoalescerBug.ll b/test/CodeGen/X86/2008-04-16-CoalescerBug.ll
index 1488034f2eb9..b2106873af4a 100644
--- a/test/CodeGen/X86/2008-04-16-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-04-16-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define void @Hubba(i8* %saveunder, i32 %firstBlob, i32 %select) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-04-24-MemCpyBug.ll b/test/CodeGen/X86/2008-04-24-MemCpyBug.ll
index 6389267aa4e8..cd0f3a385685 100644
--- a/test/CodeGen/X86/2008-04-24-MemCpyBug.ll
+++ b/test/CodeGen/X86/2008-04-24-MemCpyBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep 120
+; RUN: llc < %s -mtriple=i686-- | not grep 120
; Don't accidentally add the offset twice for trailing bytes.
%struct.S63 = type { [63 x i8] }
diff --git a/test/CodeGen/X86/2008-04-28-CyclicSchedUnit.ll b/test/CodeGen/X86/2008-04-28-CyclicSchedUnit.ll
index 6e8e98d865bd..f819a4cd1396 100644
--- a/test/CodeGen/X86/2008-04-28-CyclicSchedUnit.ll
+++ b/test/CodeGen/X86/2008-04-28-CyclicSchedUnit.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define i64 @t(i64 %maxIdleDuration) nounwind {
call void asm sideeffect "wrmsr", "{cx},A,~{dirflag},~{fpsr},~{flags}"( i32 416, i64 0 ) nounwind
diff --git a/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll b/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll
index 4160b203e36b..42ccb21faf1c 100644
--- a/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll
+++ b/test/CodeGen/X86/2008-05-01-InvalidOrdCompare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-unsafe-fp-math -march=x86 | grep jp
+; RUN: llc < %s -enable-unsafe-fp-math -mtriple=i686-- | grep jp
; rdar://5902801
declare void @test2()
diff --git a/test/CodeGen/X86/2008-05-09-PHIElimBug.ll b/test/CodeGen/X86/2008-05-09-PHIElimBug.ll
index 8ed1b2a759b8..d1e458e38378 100644
--- a/test/CodeGen/X86/2008-05-09-PHIElimBug.ll
+++ b/test/CodeGen/X86/2008-05-09-PHIElimBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
%struct.V = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x i32>, float*, float*, float*, float*, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float, i32, i32, i32, i32, i32, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2008-05-09-ShuffleLoweringBug.ll b/test/CodeGen/X86/2008-05-09-ShuffleLoweringBug.ll
index 0e4ef1c3260a..19faab66fa33 100644
--- a/test/CodeGen/X86/2008-05-09-ShuffleLoweringBug.ll
+++ b/test/CodeGen/X86/2008-05-09-ShuffleLoweringBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define fastcc void @glgVectorFloatConversion() nounwind {
%tmp12745 = load <4 x float>, <4 x float>* null, align 16 ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/2008-05-21-CoalescerBug.ll b/test/CodeGen/X86/2008-05-21-CoalescerBug.ll
index c6709a86d85b..4dc6c4e49172 100644
--- a/test/CodeGen/X86/2008-05-21-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-05-21-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -O0 -fast-isel=false -optimize-regalloc -regalloc=basic | grep mov | count 5
+; RUN: llc < %s -mtriple=i686-- -O0 -fast-isel=false -optimize-regalloc -regalloc=basic | grep mov | count 5
; PR2343
%llvm.dbg.anchor.type = type { i32, i32 }
diff --git a/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll b/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll
index a91a422f55d1..9c3ab06d8cab 100644
--- a/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll
+++ b/test/CodeGen/X86/2008-05-22-FoldUnalignedLoad.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=penryn | FileCheck %s
define void @a(<4 x float>* %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll b/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
index 581fae269021..e2b2af2ebb0c 100644
--- a/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
+++ b/test/CodeGen/X86/2008-06-13-NotVolatileLoadStore.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -fixup-byte-word-insts=0 | FileCheck %s -check-prefix=CHECK -check-prefix=BWOFF
-; RUN: llc < %s -march=x86 -fixup-byte-word-insts=1 | FileCheck %s -check-prefix=CHECK -check-prefix=BWON
+; RUN: llc < %s -mtriple=i686-- -fixup-byte-word-insts=0 | FileCheck %s -check-prefix=CHECK -check-prefix=BWOFF
+; RUN: llc < %s -mtriple=i686-- -fixup-byte-word-insts=1 | FileCheck %s -check-prefix=CHECK -check-prefix=BWON
; These transforms are turned off for load volatiles and stores.
; Check that they weren't turned off for all loads and stores!
; CHECK-LABEL: f:
diff --git a/test/CodeGen/X86/2008-06-25-VecISelBug.ll b/test/CodeGen/X86/2008-06-25-VecISelBug.ll
index 72d190758f8d..7836316b0ade 100644
--- a/test/CodeGen/X86/2008-06-25-VecISelBug.ll
+++ b/test/CodeGen/X86/2008-06-25-VecISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep pslldq
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | not grep pslldq
define void @t() nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-07-11-SHLBy1.ll b/test/CodeGen/X86/2008-07-11-SHLBy1.ll
index ff2b05fb08eb..ef1594603921 100644
--- a/test/CodeGen/X86/2008-07-11-SHLBy1.ll
+++ b/test/CodeGen/X86/2008-07-11-SHLBy1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -o - | not grep shr
+; RUN: llc < %s -mtriple=x86_64-- -o - | not grep shr
define i128 @sl(i128 %x) {
%t = shl i128 %x, 1
ret i128 %t
diff --git a/test/CodeGen/X86/2008-07-22-CombinerCrash.ll b/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
index 719baf5cc945..1d142e05b18c 100644
--- a/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
+++ b/test/CodeGen/X86/2008-07-22-CombinerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
; PR2566
@0 = external global i16 ; <i16*>:0 [#uses=1]
diff --git a/test/CodeGen/X86/2008-07-23-VSetCC.ll b/test/CodeGen/X86/2008-07-23-VSetCC.ll
index 684ca5c89fd2..e0b7c004fc2a 100644
--- a/test/CodeGen/X86/2008-07-23-VSetCC.ll
+++ b/test/CodeGen/X86/2008-07-23-VSetCC.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium
+; RUN: llc < %s -mtriple=i686-- -mcpu=pentium
; PR2575
define void @entry(i32 %m_task_id, i32 %start_x, i32 %end_x) nounwind {
diff --git a/test/CodeGen/X86/2008-08-06-CmpStride.ll b/test/CodeGen/X86/2008-08-06-CmpStride.ll
index a030fbeed513..1b6bd7ec0a4f 100644
--- a/test/CodeGen/X86/2008-08-06-CmpStride.ll
+++ b/test/CodeGen/X86/2008-08-06-CmpStride.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s -o - | grep "cmpl \$[1], %"
+; RUN: llc -mtriple=x86_64-- < %s -o - | grep "cmpl \$[1], %"
@.str = internal constant [4 x i8] c"%d\0A\00"
diff --git a/test/CodeGen/X86/2008-08-06-RewriterBug.ll b/test/CodeGen/X86/2008-08-06-RewriterBug.ll
index f9c5467713fc..201be38d24ee 100644
--- a/test/CodeGen/X86/2008-08-06-RewriterBug.ll
+++ b/test/CodeGen/X86/2008-08-06-RewriterBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR2596
@data = external global [400 x i64] ; <[400 x i64]*> [#uses=5]
diff --git a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
index 53402c04511c..703fba5247ae 100644
--- a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
+++ b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
%struct.DrawHelper = type { void (i32, %struct.QT_FT_Span*, i8*)*, void (i32, %struct.QT_FT_Span*, i8*)*, void (%struct.QRasterBuffer*, i32, i32, i32, i8*, i32, i32, i32)*, void (%struct.QRasterBuffer*, i32, i32, i32, i8*, i32, i32, i32)*, void (%struct.QRasterBuffer*, i32, i32, i32, i32, i32)* }
%struct.QBasicAtomic = type { i32 }
diff --git a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index 84d373d70a2d..34a5d9999a6b 100644
--- a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,+mmx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2,+mmx | FileCheck %s
; originally from PR2687, but things don't work that way any more.
; there are no MMX instructions here; we use XMM.
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
index 8c46bb3ec8b7..1822dbebb12b 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR2783
@g_15 = external global i16 ; <i16*> [#uses=2]
diff --git a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
index a9875521fb18..4d35d65431ab 100644
--- a/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
+++ b/test/CodeGen/X86/2008-09-11-CoalescerBug2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; RUN: llc -pre-RA-sched=source < %s -mtriple=i686-unknown-linux -mcpu=corei7 | FileCheck %s --check-prefix=SOURCE-SCHED
; PR2748
diff --git a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
index 3edd72bdba90..1ba17254c3c3 100644
--- a/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
+++ b/test/CodeGen/X86/2008-09-17-inline-asm-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 | FileCheck %s
; %0 must not be put in EAX or EDX.
; In the first asm, $0 and $2 must not be put in EAX.
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 0058d979a2fa..51f2dfbfdb58 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 -no-integrated-as | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=basic -no-integrated-as | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=greedy -no-integrated-as | FileCheck %s
+; RUN: llc < %s -regalloc=fast -optimize-regalloc=0 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -regalloc=basic -no-integrated-as | FileCheck %s
+; RUN: llc < %s -regalloc=greedy -no-integrated-as | FileCheck %s
; The 1st, 2nd, 3rd and 5th registers must all be different. The registers
; referenced in the 4th and 6th operands must not be the same as the 1st or 5th
diff --git a/test/CodeGen/X86/2008-09-25-sseregparm-1.ll b/test/CodeGen/X86/2008-09-25-sseregparm-1.ll
index fc3e35ed1f9b..9373c7986199 100644
--- a/test/CodeGen/X86/2008-09-25-sseregparm-1.ll
+++ b/test/CodeGen/X86/2008-09-25-sseregparm-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep movs | count 2
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep fld | count 2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | grep movs | count 2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | grep fld | count 2
; check 'inreg' attribute for sse_regparm
define inreg double @foo1() nounwind {
diff --git a/test/CodeGen/X86/2008-09-29-VolatileBug.ll b/test/CodeGen/X86/2008-09-29-VolatileBug.ll
index 6ee8cf2f5e33..c90afa07dd73 100644
--- a/test/CodeGen/X86/2008-09-29-VolatileBug.ll
+++ b/test/CodeGen/X86/2008-09-29-VolatileBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep movz
+; RUN: llc < %s -mtriple=i686-- | not grep movz
; PR2835
@g_407 = internal global i32 0 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/2008-10-06-x87ld-nan-1.ll b/test/CodeGen/X86/2008-10-06-x87ld-nan-1.ll
index a135cd497876..6a78a37c9b20 100644
--- a/test/CodeGen/X86/2008-10-06-x87ld-nan-1.ll
+++ b/test/CodeGen/X86/2008-10-06-x87ld-nan-1.ll
@@ -1,7 +1,7 @@
; ModuleID = 'nan.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-f80:32:32-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-apple-darwin8"
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3,-sse | grep fldl
+; RUN: llc < %s -mattr=-sse2,-sse3,-sse | grep fldl
; This NaN should be shortened to a double (not a float).
declare x86_stdcallcc void @_D3nan5printFeZv(x86_fp80 %f)
diff --git a/test/CodeGen/X86/2008-10-06-x87ld-nan-2.ll b/test/CodeGen/X86/2008-10-06-x87ld-nan-2.ll
index 34c9857b00ea..e659eea4cfbb 100644
--- a/test/CodeGen/X86/2008-10-06-x87ld-nan-2.ll
+++ b/test/CodeGen/X86/2008-10-06-x87ld-nan-2.ll
@@ -1,7 +1,7 @@
; ModuleID = 'nan.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-f80:32:32-v64:64:64-v128:128:128-a0:0:64"
target triple = "i686-apple-darwin8"
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3,-sse | grep fldt | count 3
+; RUN: llc < %s -mattr=-sse2,-sse3,-sse | grep fldt | count 3
; it is not safe to shorten any of these NaNs.
declare x86_stdcallcc void @_D3nan5printFeZv(x86_fp80 %f)
diff --git a/test/CodeGen/X86/2008-10-07-SSEISelBug.ll b/test/CodeGen/X86/2008-10-07-SSEISelBug.ll
index 26e802ac05f9..258c999d3f38 100644
--- a/test/CodeGen/X86/2008-10-07-SSEISelBug.ll
+++ b/test/CodeGen/X86/2008-10-07-SSEISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,-sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,-sse2
define <4 x float> @f(float %w) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-10-13-CoalescerBug.ll b/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
index c285ae4fdd28..b8f2c1f47b0b 100644
--- a/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
+++ b/test/CodeGen/X86/2008-10-13-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR2775
define i32 @func_77(i8 zeroext %p_79) nounwind {
diff --git a/test/CodeGen/X86/2008-10-16-VecUnaryOp.ll b/test/CodeGen/X86/2008-10-16-VecUnaryOp.ll
index ac6fa0dc9b26..fef7d7de0d7f 100644
--- a/test/CodeGen/X86/2008-10-16-VecUnaryOp.ll
+++ b/test/CodeGen/X86/2008-10-16-VecUnaryOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
; PR2762
define void @foo(<4 x i32>* %p, <4 x double>* %q) {
%n = load <4 x i32>, <4 x i32>* %p
diff --git a/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll b/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
index 2b2f704349b6..29ed15fbd179 100644
--- a/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
+++ b/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -no-integrated-as
-; RUN: llc < %s -march=x86-64 -no-integrated-as
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as
+; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as
define void @test(i64 %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll b/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
index e23dfe5a6a1d..45d702ec368a 100644
--- a/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
+++ b/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -no-integrated-as
-; RUN: llc < %s -march=x86-64 -no-integrated-as
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as
+; RUN: llc < %s -mtriple=x86_64-- -no-integrated-as
; from gcc.c-torture/compile/920520-1.c
diff --git a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
index e504bc3e776c..6c29bfaf33f8 100644
--- a/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
+++ b/test/CodeGen/X86/2008-10-24-FlippedCompare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o - | not grep "ucomiss[^,]*esp"
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -o - | not grep "ucomiss[^,]*esp"
define void @f(float %wt) {
entry:
diff --git a/test/CodeGen/X86/2008-10-29-ExpandVAARG.ll b/test/CodeGen/X86/2008-10-29-ExpandVAARG.ll
index 7ad94f149e1f..6a89fbd04859 100644
--- a/test/CodeGen/X86/2008-10-29-ExpandVAARG.ll
+++ b/test/CodeGen/X86/2008-10-29-ExpandVAARG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR2977
define i8* @ap_php_conv_p2(){
entry:
diff --git a/test/CodeGen/X86/2008-11-03-F80VAARG.ll b/test/CodeGen/X86/2008-11-03-F80VAARG.ll
index 97c046c86426..331b9dba9849 100644
--- a/test/CodeGen/X86/2008-11-03-F80VAARG.ll
+++ b/test/CodeGen/X86/2008-11-03-F80VAARG.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -o - | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -o - | FileCheck %s
declare void @llvm.va_start(i8*) nounwind
diff --git a/test/CodeGen/X86/2008-12-02-dagcombine-1.ll b/test/CodeGen/X86/2008-12-02-dagcombine-1.ll
index 004adc08091e..3c7227c87ddc 100644
--- a/test/CodeGen/X86/2008-12-02-dagcombine-1.ll
+++ b/test/CodeGen/X86/2008-12-02-dagcombine-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "(%esp)" | count 2
+; RUN: llc < %s | grep "(%esp)" | count 2
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"
; a - a should be found and removed, leaving refs to only L and P
diff --git a/test/CodeGen/X86/2008-12-02-dagcombine-2.ll b/test/CodeGen/X86/2008-12-02-dagcombine-2.ll
index 6622bc26692b..42846f05e266 100644
--- a/test/CodeGen/X86/2008-12-02-dagcombine-2.ll
+++ b/test/CodeGen/X86/2008-12-02-dagcombine-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "(%esp)" | count 2
+; RUN: llc < %s | grep "(%esp)" | count 2
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"
; a - a should be found and removed, leaving refs to only L and P
diff --git a/test/CodeGen/X86/2008-12-02-dagcombine-3.ll b/test/CodeGen/X86/2008-12-02-dagcombine-3.ll
index d5a676a7dbba..013732315bec 100644
--- a/test/CodeGen/X86/2008-12-02-dagcombine-3.ll
+++ b/test/CodeGen/X86/2008-12-02-dagcombine-3.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep add | count 2
-; RUN: llc < %s -march=x86 | grep sub | grep -v subsections | count 1
+; RUN: llc < %s | grep add | count 2
+; RUN: llc < %s | grep sub | grep -v subsections | count 1
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"
; this should be rearranged to have two +s and one -
diff --git a/test/CodeGen/X86/2008-12-16-dagcombine-4.ll b/test/CodeGen/X86/2008-12-16-dagcombine-4.ll
index 3080d0855727..76141825ded9 100644
--- a/test/CodeGen/X86/2008-12-16-dagcombine-4.ll
+++ b/test/CodeGen/X86/2008-12-16-dagcombine-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "(%esp)" | count 2
+; RUN: llc < %s | grep "(%esp)" | count 2
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"
; a - a should be found and removed, leaving refs to only L and P
diff --git a/test/CodeGen/X86/2008-12-22-dagcombine-5.ll b/test/CodeGen/X86/2008-12-22-dagcombine-5.ll
index 75773e0959c2..884afae92d1b 100644
--- a/test/CodeGen/X86/2008-12-22-dagcombine-5.ll
+++ b/test/CodeGen/X86/2008-12-22-dagcombine-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "(%esp)" | count 2
+; RUN: llc < %s | grep "(%esp)" | count 2
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"
; -(-a) - a should be found and removed, leaving refs to only L and P
diff --git a/test/CodeGen/X86/2008-12-23-crazy-address.ll b/test/CodeGen/X86/2008-12-23-crazy-address.ll
index b80f4731f8ab..902b3afcf29e 100644
--- a/test/CodeGen/X86/2008-12-23-crazy-address.ll
+++ b/test/CodeGen/X86/2008-12-23-crazy-address.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | grep "lea.*X.*esp" | count 2
+; RUN: llc < %s -mtriple=i686-- -relocation-model=static | grep "lea.*X.*esp" | count 2
@X = external global [0 x i32]
diff --git a/test/CodeGen/X86/2008-12-23-dagcombine-6.ll b/test/CodeGen/X86/2008-12-23-dagcombine-6.ll
index bae928336baa..1f588934b4c9 100644
--- a/test/CodeGen/X86/2008-12-23-dagcombine-6.ll
+++ b/test/CodeGen/X86/2008-12-23-dagcombine-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "(%esp)" | count 4
+; RUN: llc < %s | grep "(%esp)" | count 4
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin9.5"
; a - a should be found and removed, leaving refs to only L and P
diff --git a/test/CodeGen/X86/2009-01-13-DoubleUpdate.ll b/test/CodeGen/X86/2009-01-13-DoubleUpdate.ll
index 4feb764bec6b..e8dd814b2167 100644
--- a/test/CodeGen/X86/2009-01-13-DoubleUpdate.ll
+++ b/test/CodeGen/X86/2009-01-13-DoubleUpdate.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -enable-legalize-types-checking
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -enable-legalize-types-checking
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/2009-01-16-UIntToFP.ll b/test/CodeGen/X86/2009-01-16-UIntToFP.ll
index 2eab5f1773ac..bc7df2a60323 100644
--- a/test/CodeGen/X86/2009-01-16-UIntToFP.ll
+++ b/test/CodeGen/X86/2009-01-16-UIntToFP.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/2009-01-25-NoSSE.ll b/test/CodeGen/X86/2009-01-25-NoSSE.ll
index c655f2c374ac..58bce75fc738 100644
--- a/test/CodeGen/X86/2009-01-25-NoSSE.ll
+++ b/test/CodeGen/X86/2009-01-25-NoSSE.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=-sse,-sse2 | FileCheck %s
+; RUN: llc < %s -mattr=-sse,-sse2 | FileCheck %s
; PR3402
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-01-26-WrongCheck.ll b/test/CodeGen/X86/2009-01-26-WrongCheck.ll
index 117ff47657f4..39ebec5b866f 100644
--- a/test/CodeGen/X86/2009-01-26-WrongCheck.ll
+++ b/test/CodeGen/X86/2009-01-26-WrongCheck.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -enable-legalize-types-checking
+; RUN: llc < %s -mtriple=i686-- -enable-legalize-types-checking
; PR3393
define void @foo(i32 inreg %x) {
diff --git a/test/CodeGen/X86/2009-01-31-BigShift.ll b/test/CodeGen/X86/2009-01-31-BigShift.ll
index 4eb0ec1485b7..158839743d79 100644
--- a/test/CodeGen/X86/2009-01-31-BigShift.ll
+++ b/test/CodeGen/X86/2009-01-31-BigShift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep and
+; RUN: llc < %s -mtriple=i686-- | not grep and
; PR3401
define void @x(i288 %i) nounwind {
diff --git a/test/CodeGen/X86/2009-01-31-BigShift2.ll b/test/CodeGen/X86/2009-01-31-BigShift2.ll
index 90d14e7b707d..a6209280e4b7 100644
--- a/test/CodeGen/X86/2009-01-31-BigShift2.ll
+++ b/test/CodeGen/X86/2009-01-31-BigShift2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "mov.*56"
+; RUN: llc < %s -mtriple=i686-- | grep "mov.*56"
; PR3449
define void @test(<8 x double>* %P, i64* %Q) nounwind {
diff --git a/test/CodeGen/X86/2009-01-31-BigShift3.ll b/test/CodeGen/X86/2009-01-31-BigShift3.ll
index 1b531e370437..5d0405106ebb 100644
--- a/test/CodeGen/X86/2009-01-31-BigShift3.ll
+++ b/test/CodeGen/X86/2009-01-31-BigShift3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR3450
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-02-01-LargeMask.ll b/test/CodeGen/X86/2009-02-01-LargeMask.ll
index e91208d5b3a1..5084f6fe85a6 100644
--- a/test/CodeGen/X86/2009-02-01-LargeMask.ll
+++ b/test/CodeGen/X86/2009-02-01-LargeMask.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR3453
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/test/CodeGen/X86/2009-02-03-AnalyzedTwice.ll b/test/CodeGen/X86/2009-02-03-AnalyzedTwice.ll
index 592a7e33b196..a18275a8fc8f 100644
--- a/test/CodeGen/X86/2009-02-03-AnalyzedTwice.ll
+++ b/test/CodeGen/X86/2009-02-03-AnalyzedTwice.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR3411
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
diff --git a/test/CodeGen/X86/2009-02-08-CoalescerBug.ll b/test/CodeGen/X86/2009-02-08-CoalescerBug.ll
index 908cc08991d8..c1a7823ad45d 100644
--- a/test/CodeGen/X86/2009-02-08-CoalescerBug.ll
+++ b/test/CodeGen/X86/2009-02-08-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3486
define i32 @foo(i8 signext %p_26) nounwind {
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index 8edaf3f1fa34..d144bfaab662 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin9
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -stack-symbol-ordering=0 -verify-machineinstrs | FileCheck %s
; PR3538
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin9"
define signext i8 @foo(i8* %s1) nounwind ssp {
; Make sure we generate:
diff --git a/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll b/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
index 5004f04bf8fd..5aef1f69f1ad 100644
--- a/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
+++ b/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -no-integrated-as | FileCheck %s
; ModuleID = 'shant.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-02-12-SpillerBug.ll b/test/CodeGen/X86/2009-02-12-SpillerBug.ll
index 4f8a5e7b3e30..acf2f6d65122 100644
--- a/test/CodeGen/X86/2009-02-12-SpillerBug.ll
+++ b/test/CodeGen/X86/2009-02-12-SpillerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin8
+; RUN: llc < %s -mtriple=i386-apple-darwin8
; PR3561
define hidden void @__mulxc3({ x86_fp80, x86_fp80 }* noalias nocapture sret %agg.result, x86_fp80 %a, x86_fp80 %b, x86_fp80 %c, x86_fp80 %d) nounwind {
diff --git a/test/CodeGen/X86/2009-02-25-CommuteBug.ll b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
index 5bec179534b9..d555f127375a 100644
--- a/test/CodeGen/X86/2009-02-25-CommuteBug.ll
+++ b/test/CodeGen/X86/2009-02-25-CommuteBug.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -stats 2>&1 | FileCheck %s
; rdar://6608609
; CHECK-NOT: commuted
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 89cb71a52c04..0b392232b8d9 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm"
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm"
+; RUN: llc < %s -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
; rdar://6627786
; rdar://7792037
diff --git a/test/CodeGen/X86/2009-03-03-BTHang.ll b/test/CodeGen/X86/2009-03-03-BTHang.ll
index d6d24cda295f..12e667e0b25b 100644
--- a/test/CodeGen/X86/2009-03-03-BTHang.ll
+++ b/test/CodeGen/X86/2009-03-03-BTHang.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; rdar://6642541
%struct.HandleBlock = type { [30 x i32], [990 x i8*], %struct.HandleBlockTrailer }
diff --git a/test/CodeGen/X86/2009-03-03-BitcastLongDouble.ll b/test/CodeGen/X86/2009-03-03-BitcastLongDouble.ll
index 9deecebe9453..3dff4f7bfc9f 100644
--- a/test/CodeGen/X86/2009-03-03-BitcastLongDouble.ll
+++ b/test/CodeGen/X86/2009-03-03-BitcastLongDouble.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3686
; rdar://6661799
diff --git a/test/CodeGen/X86/2009-03-07-FPConstSelect.ll b/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
index 39caddcf9342..298b81b90e46 100644
--- a/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
+++ b/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep xmm
+; RUN: llc < %s -mcpu=yonah | not grep xmm
; This should do a single load into the fp stack for the return, not diddle with xmm registers.
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-03-09-APIntCrash.ll b/test/CodeGen/X86/2009-03-09-APIntCrash.ll
index 3bff7dc76561..1c4d71e38a95 100644
--- a/test/CodeGen/X86/2009-03-09-APIntCrash.ll
+++ b/test/CodeGen/X86/2009-03-09-APIntCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
; PR3763
%struct.__block_descriptor = type { i64, i64 }
diff --git a/test/CodeGen/X86/2009-03-25-TestBug.ll b/test/CodeGen/X86/2009-03-25-TestBug.ll
index 367a6d2a3b84..7293511eda4a 100644
--- a/test/CodeGen/X86/2009-03-25-TestBug.ll
+++ b/test/CodeGen/X86/2009-03-25-TestBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; rdar://6661955
; CHECK-NOT: and
diff --git a/test/CodeGen/X86/2009-03-26-NoImplicitFPBug.ll b/test/CodeGen/X86/2009-03-26-NoImplicitFPBug.ll
index f4864793ba2f..eb37665a41e0 100644
--- a/test/CodeGen/X86/2009-03-26-NoImplicitFPBug.ll
+++ b/test/CodeGen/X86/2009-03-26-NoImplicitFPBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define double @t(double %x) nounwind ssp noimplicitfloat {
entry:
diff --git a/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll b/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll
index 4f8df0533aa3..363053fe341e 100644
--- a/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll
+++ b/test/CodeGen/X86/2009-04-12-FastIselOverflowCrash.ll
@@ -11,7 +11,7 @@ declare %0 @llvm.sadd.with.overflow.i32(i32, i32) nounwind
define fastcc i32 @test() nounwind {
entry:
; CHECK-LABEL: test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: addl $0, %eax
; CHECK-NEXT: seto %cl
diff --git a/test/CodeGen/X86/2009-04-12-picrel.ll b/test/CodeGen/X86/2009-04-12-picrel.ll
index 037dee95717f..697f14e223c9 100644
--- a/test/CodeGen/X86/2009-04-12-picrel.ll
+++ b/test/CodeGen/X86/2009-04-12-picrel.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small > %t
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -code-model=small > %t
; RUN: grep leaq %t | count 1
@dst = external global [131072 x i32]
diff --git a/test/CodeGen/X86/2009-04-24.ll b/test/CodeGen/X86/2009-04-24.ll
index 7647dcc7febd..2f4cc588ccd3 100644
--- a/test/CodeGen/X86/2009-04-24.ll
+++ b/test/CodeGen/X86/2009-04-24.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -regalloc=fast -optimize-regalloc=0 -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -regalloc=fast -optimize-regalloc=0 -relocation-model=pic | FileCheck %s
; PR4004
; CHECK: {{leaq.*TLSGD}}
diff --git a/test/CodeGen/X86/2009-04-25-CoalescerBug.ll b/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
index 151f0ffc751a..f6b5ecef6e3b 100644
--- a/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
+++ b/test/CodeGen/X86/2009-04-25-CoalescerBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mov | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep mov | count 1
; rdar://6806252
define i64 @test(i32* %tmp13) nounwind {
diff --git a/test/CodeGen/X86/2009-04-scale.ll b/test/CodeGen/X86/2009-04-scale.ll
index 1fc5f2b234fe..4238d6eb0614 100644
--- a/test/CodeGen/X86/2009-04-scale.ll
+++ b/test/CodeGen/X86/2009-04-scale.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-unknown-linux-gnu
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu
; PR3995
%struct.vtable = type { i32 (...)** }
diff --git a/test/CodeGen/X86/2009-05-11-tailmerge-crash.ll b/test/CodeGen/X86/2009-05-11-tailmerge-crash.ll
index e9d15583e562..a7d9aea74c67 100644
--- a/test/CodeGen/X86/2009-05-11-tailmerge-crash.ll
+++ b/test/CodeGen/X86/2009-05-11-tailmerge-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR4188
; ModuleID = '<stdin>'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll b/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
index 89cd24d7dcfe..b2786ab6167e 100644
--- a/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
+++ b/test/CodeGen/X86/2009-05-19-SingleElementExtractElement.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
; PR3886
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind {
diff --git a/test/CodeGen/X86/2009-05-28-DAGCombineCrash.ll b/test/CodeGen/X86/2009-05-28-DAGCombineCrash.ll
index 019d5dfb1fea..80aafb2dba82 100644
--- a/test/CodeGen/X86/2009-05-28-DAGCombineCrash.ll
+++ b/test/CodeGen/X86/2009-05-28-DAGCombineCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
define fastcc void @S_next_symbol(i448* %P) nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2009-05-30-ISelBug.ll b/test/CodeGen/X86/2009-05-30-ISelBug.ll
index e01fe9f89308..5aa8fe804945 100644
--- a/test/CodeGen/X86/2009-05-30-ISelBug.ll
+++ b/test/CodeGen/X86/2009-05-30-ISelBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | not grep "movzbl %[abcd]h,"
+; RUN: llc < %s -mtriple=x86_64-- | not grep "movzbl %[abcd]h,"
define void @BZ2_bzDecompress_bb5_2E_outer_bb35_2E_i_bb54_2E_i(i32*, i32 %c_nblock_used.2.i, i32 %.reload51, i32* %.out, i32* %.out1, i32* %.out2, i32* %.out3) nounwind {
newFuncRoot:
diff --git a/test/CodeGen/X86/2009-06-04-VirtualLiveIn.ll b/test/CodeGen/X86/2009-06-04-VirtualLiveIn.ll
index 29795492d89c..baa99f15183b 100644
--- a/test/CodeGen/X86/2009-06-04-VirtualLiveIn.ll
+++ b/test/CodeGen/X86/2009-06-04-VirtualLiveIn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
%0 = type { %struct.GAP } ; type %0
%1 = type { i16, i8, i8 } ; type %1
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 9c7eb6d633db..d1d05a190015 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=core2 | FileCheck %s
define <4 x i16> @a(i32* %x1) nounwind {
; CHECK-LABEL: a:
diff --git a/test/CodeGen/X86/2009-06-05-sitofpCrash.ll b/test/CodeGen/X86/2009-06-05-sitofpCrash.ll
index e361804d61ba..bc19edd23da8 100644
--- a/test/CodeGen/X86/2009-06-05-sitofpCrash.ll
+++ b/test/CodeGen/X86/2009-06-05-sitofpCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse
; PR2598
define <2 x float> @a(<2 x i32> %i) nounwind {
diff --git a/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll b/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll
index 673e936e2178..5989e69bdc66 100644
--- a/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll
+++ b/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -tailcallopt -march=x86-64 -mattr=+sse2 -mtriple=x86_64-apple-darwin | grep fstpt
-; RUN: llc < %s -tailcallopt -march=x86-64 -mattr=+sse2 -mtriple=x86_64-apple-darwin | grep xmm
+; RUN: llc < %s -tailcallopt -mattr=+sse2 -mtriple=x86_64-apple-darwin | grep fstpt
+; RUN: llc < %s -tailcallopt -mattr=+sse2 -mtriple=x86_64-apple-darwin | grep xmm
; Check that x86-64 tail calls support x86_fp80 and v2f32 types. (Tail call
; calling convention out of sync with standard c calling convention on x86_64)
diff --git a/test/CodeGen/X86/2009-06-15-not-a-tail-call.ll b/test/CodeGen/X86/2009-06-15-not-a-tail-call.ll
index feb578098cae..a927b87cb866 100644
--- a/test/CodeGen/X86/2009-06-15-not-a-tail-call.ll
+++ b/test/CodeGen/X86/2009-06-15-not-a-tail-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | not grep TAILCALL
+; RUN: llc < %s -mtriple=i686-- -tailcallopt | not grep TAILCALL
; Bug 4396. This tail call can NOT be optimized.
diff --git a/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll b/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll
index 4c4552da16a5..7cb1b1e95de4 100644
--- a/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll
+++ b/test/CodeGen/X86/2009-06-18-movlp-shuffle-register.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,-sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,-sse2 | FileCheck %s
; PR2484
define <4 x float> @f4523(<4 x float> %a,<4 x float> %b) nounwind {
diff --git a/test/CodeGen/X86/2009-07-06-TwoAddrAssert.ll b/test/CodeGen/X86/2009-07-06-TwoAddrAssert.ll
index fcc71aef23ae..494423d23023 100644
--- a/test/CodeGen/X86/2009-07-06-TwoAddrAssert.ll
+++ b/test/CodeGen/X86/2009-07-06-TwoAddrAssert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=x86_64-unknown-freebsd7.2
+; RUN: llc < %s -mtriple=x86_64-unknown-freebsd7.2
; PR4478
%struct.sockaddr = type <{ i8, i8, [14 x i8] }>
diff --git a/test/CodeGen/X86/2009-07-07-SplitICmp.ll b/test/CodeGen/X86/2009-07-07-SplitICmp.ll
index 366985678e54..bf683ba159f5 100644
--- a/test/CodeGen/X86/2009-07-07-SplitICmp.ll
+++ b/test/CodeGen/X86/2009-07-07-SplitICmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define void @test2(<2 x i32> %A, <2 x i32> %B, <2 x i32>* %C) nounwind {
%D = icmp sgt <2 x i32> %A, %B
diff --git a/test/CodeGen/X86/2009-07-09-ExtractBoolFromVector.ll b/test/CodeGen/X86/2009-07-09-ExtractBoolFromVector.ll
index 0fdfdcb8a30a..5614a9d4aa5f 100644
--- a/test/CodeGen/X86/2009-07-09-ExtractBoolFromVector.ll
+++ b/test/CodeGen/X86/2009-07-09-ExtractBoolFromVector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3037
define void @entry(<4 x i8>* %dest) {
diff --git a/test/CodeGen/X86/2009-07-19-AsmExtraOperands.ll b/test/CodeGen/X86/2009-07-19-AsmExtraOperands.ll
index a0095ab2064c..2f5c898ce221 100644
--- a/test/CodeGen/X86/2009-07-19-AsmExtraOperands.ll
+++ b/test/CodeGen/X86/2009-07-19-AsmExtraOperands.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
; PR4583
define i32 @atomic_cmpset_long(i64* %dst, i64 %exp, i64 %src) nounwind ssp noredzone noimplicitfloat {
diff --git a/test/CodeGen/X86/2009-07-20-DAGCombineBug.ll b/test/CodeGen/X86/2009-07-20-DAGCombineBug.ll
index 045e89e15856..258ff35d30f1 100644
--- a/test/CodeGen/X86/2009-07-20-DAGCombineBug.ll
+++ b/test/CodeGen/X86/2009-07-20-DAGCombineBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
@bsBuff = internal global i32 0 ; <i32*> [#uses=1]
@llvm.used = appending global [1 x i8*] [i8* bitcast (i32 ()* @bsGetUInt32 to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
diff --git a/test/CodeGen/X86/2009-08-19-LoadNarrowingMiscompile.ll b/test/CodeGen/X86/2009-08-19-LoadNarrowingMiscompile.ll
index 5926ab4b5c72..058ffcf152e1 100644
--- a/test/CodeGen/X86/2009-08-19-LoadNarrowingMiscompile.ll
+++ b/test/CodeGen/X86/2009-08-19-LoadNarrowingMiscompile.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-pc-linux | FileCheck %s
+; RUN: llc < %s -mtriple=i386-pc-linux | FileCheck %s
@a = external global i96, align 4
@b = external global i64, align 8
diff --git a/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll b/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
index 45e770f8121b..0169de758008 100644
--- a/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
+++ b/test/CodeGen/X86/2009-08-23-SubRegReuseUndo.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR4753
; This function has a sub-register reuse undone.
diff --git a/test/CodeGen/X86/20090313-signext.ll b/test/CodeGen/X86/20090313-signext.ll
index 3ea13164112e..4162ee66838a 100644
--- a/test/CodeGen/X86/20090313-signext.ll
+++ b/test/CodeGen/X86/20090313-signext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -relocation-model=pic > %t
+; RUN: llc < %s -mtriple=x86_64-- -relocation-model=pic > %t
; RUN: grep "movswl %ax, %edi" %t
; RUN: grep "movw (%rax), %ax" %t
; XFAIL: *
diff --git a/test/CodeGen/X86/2010-01-05-ZExt-Shl.ll b/test/CodeGen/X86/2010-01-05-ZExt-Shl.ll
index e7004e28752e..73b9179919a9 100644
--- a/test/CodeGen/X86/2010-01-05-ZExt-Shl.ll
+++ b/test/CodeGen/X86/2010-01-05-ZExt-Shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s
; <rdar://problem/7499313>
-target triple = "i686-apple-darwin8"
+target triple = "x86_64-apple-darwin8"
declare void @func2(i16 zeroext)
diff --git a/test/CodeGen/X86/2010-01-15-SelectionDAGCycle.ll b/test/CodeGen/X86/2010-01-15-SelectionDAGCycle.ll
index 6aba39e04bca..759bbcef709a 100644
--- a/test/CodeGen/X86/2010-01-15-SelectionDAGCycle.ll
+++ b/test/CodeGen/X86/2010-01-15-SelectionDAGCycle.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s
; ModuleID = 'bugpoint-reduced-simplified.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index ae60d57bbf49..e95a443a1cc5 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -1,12 +1,11 @@
-; RUN: llc -march=x86 -O0 < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s
+; RUN: llc -mtriple=i686-- -O0 < %s -filetype=obj | llvm-dwarfdump -v - | FileCheck %s
; CHECK-LABEL: .debug_info contents:
; CHECK-LABEL: DW_TAG_subprogram
; CHECK: DW_AT_name [DW_FORM_strp] ( {{.*}}"foo")
; CHECK: DW_TAG_formal_parameter
-; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 {{..}} )
-; DW_OP_fbreg ??
+; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc] (DW_OP_fbreg {{[^ ]*}})
; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"my_r0")
%struct.Pt = type { double, double }
diff --git a/test/CodeGen/X86/2010-02-03-DualUndef.ll b/test/CodeGen/X86/2010-02-03-DualUndef.ll
index d116ecc6bde8..d8616f06c61d 100644
--- a/test/CodeGen/X86/2010-02-03-DualUndef.ll
+++ b/test/CodeGen/X86/2010-02-03-DualUndef.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
; PR6086
define fastcc void @prepOutput() nounwind {
bb: ; preds = %output.exit
diff --git a/test/CodeGen/X86/2010-02-11-NonTemporal.ll b/test/CodeGen/X86/2010-02-11-NonTemporal.ll
index 5d74db1160c5..4734b3695cf7 100644
--- a/test/CodeGen/X86/2010-02-11-NonTemporal.ll
+++ b/test/CodeGen/X86/2010-02-11-NonTemporal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; CHECK: movnt
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2010-02-12-CoalescerBug-Impdef.ll b/test/CodeGen/X86/2010-02-12-CoalescerBug-Impdef.ll
index 193f8cfcd52b..11ac376a8932 100644
--- a/test/CodeGen/X86/2010-02-12-CoalescerBug-Impdef.ll
+++ b/test/CodeGen/X86/2010-02-12-CoalescerBug-Impdef.ll
@@ -4,7 +4,7 @@
; Tricky coalescer bug:
; After coalescing %RAX with a virtual register, this instruction was rematted:
;
-; %EAX<def> = MOV32rr %reg1070<kill>
+; %EAX = MOV32rr killed %reg1070
;
; This instruction silently defined %RAX, and when rematting removed the
; instruction, the live interval for %RAX was not properly updated. The valno
@@ -12,7 +12,7 @@
;
; The fix is to implicitly define %RAX when coalescing:
;
-; %EAX<def> = MOV32rr %reg1070<kill>, %RAX<imp-def>
+; %EAX = MOV32rr killed %reg1070, implicit-def %RAX
;
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/2010-02-23-DAGCombineBug.ll b/test/CodeGen/X86/2010-02-23-DAGCombineBug.ll
index a8c87fa2074a..f38471246146 100644
--- a/test/CodeGen/X86/2010-02-23-DAGCombineBug.ll
+++ b/test/CodeGen/X86/2010-02-23-DAGCombineBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i32* @t() nounwind optsize ssp {
entry:
diff --git a/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll b/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
index 6fe31b6d1672..19182ab2cb55 100644
--- a/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
+++ b/test/CodeGen/X86/2010-02-23-RematImplicitSubreg.ll
@@ -3,7 +3,7 @@
;
; This test produces a move instruction with an implicitly defined super-register:
;
-; %DL<def> = MOV8rr %reg1038<kill>, %RDX<imp-def>
+; %DL = MOV8rr killed %reg1038, implicit-def %RDX
;
; When %DL is rematerialized, we must remember to update live intervals for
; sub-registers %DX and %EDX.
diff --git a/test/CodeGen/X86/2010-03-05-EFLAGS-Redef.ll b/test/CodeGen/X86/2010-03-05-EFLAGS-Redef.ll
index 3cca10e268cb..3040a26b046e 100644
--- a/test/CodeGen/X86/2010-03-05-EFLAGS-Redef.ll
+++ b/test/CodeGen/X86/2010-03-05-EFLAGS-Redef.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -verify-machineinstrs
;
; This test case is transformed into a single basic block by the machine
-; branch folding pass. That makes a complete mess of the %EFLAGS liveness, but
+; branch folding pass. That makes a complete mess of the %eflags liveness, but
; we don't care about liveness this late anyway.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
index 5adf99e3e47b..c3dfbfc15ecd 100644
--- a/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
+++ b/test/CodeGen/X86/2010-04-08-CoalescerBug.ll
@@ -2,8 +2,8 @@
; rdar://7842028
; Do not delete partially dead copy instructions.
-; %RDI<def,dead> = MOV64rr %RAX<kill>, %EDI<imp-def>
-; REP_MOVSD %ECX<imp-def,dead>, %EDI<imp-def,dead>, %ESI<imp-def,dead>, %ECX<imp-use,kill>, %EDI<imp-use,kill>, %ESI<imp-use,kill>
+; dead %rdi = MOV64rr killed %rax, implicit-def %edi
+; REP_MOVSD implicit dead %ecx, implicit dead %edi, implicit dead %esi, implicit killed %ecx, implicit killed %edi, implicit killed %esi
%struct.F = type { %struct.FC*, i32, i32, i8, i32, i32, i32 }
diff --git a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
index eb0b150378d6..790ccb3ded38 100644
--- a/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
+++ b/test/CodeGen/X86/2010-05-12-FastAllocKills.ll
@@ -3,28 +3,28 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-darwin"
; This test causes a virtual FP register to be redefined while it is live:
-;BB#5: derived from LLVM BB %bb10
-; Predecessors according to CFG: BB#4 BB#5
-; %reg1024<def> = MOV_Fp8080 %reg1034
-; %reg1025<def> = MUL_Fp80m32 %reg1024, %RIP, 1, %reg0, <cp#0>, %reg0; mem:LD4[ConstantPool]
-; %reg1034<def> = MOV_Fp8080 %reg1025
-; FP_REG_KILL %FP0<imp-def>, %FP1<imp-def>, %FP2<imp-def>, %FP3<imp-def>, %FP4<imp-def>, %FP5<imp-def>, %FP6<imp-def>
-; JMP_4 <BB#5>
-; Successors according to CFG: BB#5
+;%bb.5: derived from LLVM BB %bb10
+; Predecessors according to CFG: %bb.4 %bb.5
+; %reg1024 = MOV_Fp8080 %reg1034
+; %reg1025 = MUL_Fp80m32 %reg1024, %rip, 1, %reg0, %const.0, %reg0; mem:LD4[ConstantPool]
+; %reg1034 = MOV_Fp8080 %reg1025
+; FP_REG_KILL implicit-def %fp0, implicit-def %fp1, implicit-def %fp2, implicit-def %fp3, implicit-def %fp4, implicit-def %fp5, implicit-def %fp6
+; JMP_4 <%bb.5>
+; Successors according to CFG: %bb.5
;
-; The X86FP pass needs good kill flags, like on %FP0 representing %reg1034:
-;BB#5: derived from LLVM BB %bb10
-; Predecessors according to CFG: BB#4 BB#5
-; %FP0<def> = LD_Fp80m <fi#3>, 1, %reg0, 0, %reg0; mem:LD10[FixedStack3](align=4)
-; %FP1<def> = MOV_Fp8080 %FP0<kill>
-; %FP2<def> = MUL_Fp80m32 %FP1, %RIP, 1, %reg0, <cp#0>, %reg0; mem:LD4[ConstantPool]
-; %FP0<def> = MOV_Fp8080 %FP2
-; ST_FpP80m <fi#3>, 1, %reg0, 0, %reg0, %FP0<kill>; mem:ST10[FixedStack3](align=4)
-; ST_FpP80m <fi#4>, 1, %reg0, 0, %reg0, %FP1<kill>; mem:ST10[FixedStack4](align=4)
-; ST_FpP80m <fi#5>, 1, %reg0, 0, %reg0, %FP2<kill>; mem:ST10[FixedStack5](align=4)
-; FP_REG_KILL %FP0<imp-def>, %FP1<imp-def>, %FP2<imp-def>, %FP3<imp-def>, %FP4<imp-def>, %FP5<imp-def>, %FP6<imp-def>
-; JMP_4 <BB#5>
-; Successors according to CFG: BB#5
+; The X86FP pass needs good kill flags, like on %fp0 representing %reg1034:
+;%bb.5: derived from LLVM BB %bb10
+; Predecessors according to CFG: %bb.4 %bb.5
+; %fp0 = LD_Fp80m %stack.3, 1, %reg0, 0, %reg0; mem:LD10[FixedStack3](align=4)
+; %fp1 = MOV_Fp8080 killed %fp0
+; %fp2 = MUL_Fp80m32 %fp1, %rip, 1, %reg0, %const.0, %reg0; mem:LD4[ConstantPool]
+; %fp0 = MOV_Fp8080 %fp2
+; ST_FpP80m %stack.3, 1, %reg0, 0, %reg0, killed %fp0; mem:ST10[FixedStack3](align=4)
+; ST_FpP80m %stack.4, 1, %reg0, 0, %reg0, killed %fp1; mem:ST10[FixedStack4](align=4)
+; ST_FpP80m %stack.5, 1, %reg0, 0, %reg0, killed %fp2; mem:ST10[FixedStack5](align=4)
+; FP_REG_KILL implicit-def %fp0, implicit-def %fp1, implicit-def %fp2, implicit-def %fp3, implicit-def %fp4, implicit-def %fp5, implicit-def %fp6
+; JMP_4 <%bb.5>
+; Successors according to CFG: %bb.5
define fastcc i32 @sqlite3AtoF(i8* %z, double* nocapture %pResult) nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index 84756f0cd311..9f36a2833877 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -35,7 +35,7 @@ attributes #1 = { nounwind readnone }
!1 = !DIFile(filename: "foo.c", directory: "/tmp/")
!2 = !{}
!3 = !{!4}
-!4 = !DIGlobalVariableExpression(var: !5)
+!4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
!5 = !DIGlobalVariable(name: "ret", scope: !1, file: !1, line: 7, type: !6, isLocal: false, isDefinition: true)
!6 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!7 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index 38bbe4e367b1..fbb0b1079bde 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -45,7 +45,7 @@ entry:
!18 = !DIFile(filename: "f.c", directory: "/tmp")
!19 = !{}
-;CHECK: DEBUG_VALUE: bar:x <- %E
+;CHECK: DEBUG_VALUE: bar:x <- %e
;CHECK: Ltmp
;CHECK: DEBUG_VALUE: foo:y <- 1{{$}}
!20 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 4b019abf5d50..435582e9b9c7 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-darwin10.2"
; Function Attrs: noinline nounwind optsize readnone ssp
define i32 @_ZN3foo3bazEi(%struct.foo* nocapture %this, i32 %x) #0 align 2 !dbg !4 {
entry:
- ; CHECK: DEBUG_VALUE: baz:this <- %RDI{{$}}
+ ; CHECK: DEBUG_VALUE: baz:this <- %rdi{{$}}
tail call void @llvm.dbg.value(metadata %struct.foo* %this, i64 0, metadata !13, metadata !16), !dbg !17
tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !18, metadata !16), !dbg !17
%0 = mul nsw i32 %x, 7, !dbg !19
diff --git a/test/CodeGen/X86/2010-06-14-fast-isel-fs-load.ll b/test/CodeGen/X86/2010-06-14-fast-isel-fs-load.ll
index 5a4b389acb3d..8805402b2228 100644
--- a/test/CodeGen/X86/2010-06-14-fast-isel-fs-load.ll
+++ b/test/CodeGen/X86/2010-06-14-fast-isel-fs-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -fast-isel -march=x86 < %s | FileCheck %s
+; RUN: llc -fast-isel -mtriple=i686-- < %s | FileCheck %s
; CHECK: %fs:
define i32 @test1(i32 addrspace(257)* %arg) nounwind {
diff --git a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
index 7cffdc545e02..96ceb1985810 100644
--- a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
+++ b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -O0 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -O0 -no-integrated-as | FileCheck %s
; PR7509
target triple = "i386-apple-darwin10"
%asmtype = type { i32, i8*, i32, i32 }
diff --git a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
index 66d3f3108ec4..9bbd86ca646a 100644
--- a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
+++ b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -8,22 +8,20 @@
define i32 @main() nounwind {
; CHECK-LABEL: main:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: cmpq $0, {{.*}}(%rip)
-; CHECK-NEXT: movb $-106, %al
-; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: .LBB0_2: # %entry
+; CHECK-NEXT: cmpq {{.*}}(%rip), %rax
+; CHECK-NEXT: sbbl %eax, %eax
+; CHECK-NEXT: andl $150, %eax
; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jle .LBB0_3
-; CHECK-NEXT: # BB#4: # %if.then
+; CHECK-NEXT: jle .LBB0_1
+; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: movl $1, {{.*}}(%rip)
; CHECK-NEXT: movl $1, %esi
-; CHECK-NEXT: jmp .LBB0_5
-; CHECK-NEXT: .LBB0_3: # %entry.if.end_crit_edge
+; CHECK-NEXT: jmp .LBB0_3
+; CHECK-NEXT: .LBB0_1: # %entry.if.end_crit_edge
; CHECK-NEXT: movl {{.*}}(%rip), %esi
-; CHECK-NEXT: .LBB0_5: # %if.end
+; CHECK-NEXT: .LBB0_3: # %if.end
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl $.L.str, %edi
; CHECK-NEXT: xorl %eax, %eax
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
index 54a7763eb696..9e33d2bf6ac6 100644
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mcpu=core2 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.4"
diff --git a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
index ee50cb13e634..153b6dc4e02b 100644
--- a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
+++ b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
; PR8297
;
; On i386, i64 cmpxchg is lowered during legalize types to extract the
diff --git a/test/CodeGen/X86/2010-11-09-MOVLPS.ll b/test/CodeGen/X86/2010-11-09-MOVLPS.ll
index 4b937333c8e9..6e2e3aed0875 100644
--- a/test/CodeGen/X86/2010-11-09-MOVLPS.ll
+++ b/test/CodeGen/X86/2010-11-09-MOVLPS.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -O0
+; RUN: llc < %s -O0
; PR8211
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2010-11-18-SelectOfExtload.ll b/test/CodeGen/X86/2010-11-18-SelectOfExtload.ll
index 331e83bb5067..1764922dfd26 100644
--- a/test/CodeGen/X86/2010-11-18-SelectOfExtload.ll
+++ b/test/CodeGen/X86/2010-11-18-SelectOfExtload.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+cmov | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+cmov | FileCheck %s
; Both values were being zero extended.
@u = external global i8
@s = external global i8
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index d4f4e9057105..e27441864b02 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
-; RUN: llc < %s -filetype=obj -regalloc=basic | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: llc < %s -filetype=obj | llvm-dwarfdump -v -debug-info - | FileCheck %s
+; RUN: llc < %s -filetype=obj -regalloc=basic | llvm-dwarfdump -v -debug-info - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
@@ -9,7 +9,8 @@ target triple = "x86_64-apple-darwin10.0.0"
; CHECK: DW_TAG_variable
; CHECK: DW_TAG_variable
; CHECK-NEXT: DW_AT_location
-; CHECK-NEXT: DW_AT_name {{.*}} "z_s"
+; CHECK-NOT: DW_{{TAG|AT}}
+; CHECK: DW_AT_name {{.*}} "z_s"
; CHECK-NEXT: DW_AT_decl_file
; CHECK-NEXT: DW_AT_decl_line
; CHECK-NEXT: DW_AT_type{{.*}}{[[TYPE:.*]]}
diff --git a/test/CodeGen/X86/2011-02-21-VirtRegRewriter-KillSubReg.ll b/test/CodeGen/X86/2011-02-21-VirtRegRewriter-KillSubReg.ll
index 7821f0537e70..dc7f613880bf 100644
--- a/test/CodeGen/X86/2011-02-21-VirtRegRewriter-KillSubReg.ll
+++ b/test/CodeGen/X86/2011-02-21-VirtRegRewriter-KillSubReg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O2 -march=x86 -mtriple=i386-pc-linux-gnu -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=i386-pc-linux-gnu -relocation-model=pic | FileCheck %s
; PR9237: Assertion in VirtRegRewriter.cpp, ResurrectConfirmedKill
; `KillOps[*SR] == KillOp && "invalid subreg kill flags"'
diff --git a/test/CodeGen/X86/2011-03-02-DAGCombiner.ll b/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
index d25fbf7b71f1..9ee19a7a0a93 100644
--- a/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
+++ b/test/CodeGen/X86/2011-03-02-DAGCombiner.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin11.0.0"
diff --git a/test/CodeGen/X86/2011-03-30-CreateFixedObjCrash.ll b/test/CodeGen/X86/2011-03-30-CreateFixedObjCrash.ll
index 38a9b3d4f5cc..f829c05fc56e 100644
--- a/test/CodeGen/X86/2011-03-30-CreateFixedObjCrash.ll
+++ b/test/CodeGen/X86/2011-03-30-CreateFixedObjCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; rdar://7983260
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
index a94a9812431e..2212c5dc86e8 100644
--- a/test/CodeGen/X86/2011-05-09-loaduse.ll
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 | FileCheck %s
;CHECK-LABEL: test:
;CHECK-NOT: pshufd
diff --git a/test/CodeGen/X86/2011-06-01-fildll.ll b/test/CodeGen/X86/2011-06-01-fildll.ll
index 30c743441c36..54aa71c8ee93 100644
--- a/test/CodeGen/X86/2011-06-01-fildll.ll
+++ b/test/CodeGen/X86/2011-06-01-fildll.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; ModuleID = '<stdin>'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "i386-apple-macosx10.6.6"
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
index c78e8e38a567..7ab93ff5fd68 100644
--- a/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- -mattr=+sse | FileCheck %s
define float @chainfail1(i64* nocapture %a, i64* nocapture %b, i32 %x, i32 %y, float* nocapture %f) nounwind uwtable noinline ssp {
entry:
diff --git a/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll b/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
index d93414890570..2899b8b51ef7 100644
--- a/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
+++ b/test/CodeGen/X86/2011-06-06-fgetsign80bit.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -mtriple=x86_64-- < %s
define i32 @signbitl(x86_fp80 %x) nounwind uwtable readnone {
entry:
%tmp4 = bitcast x86_fp80 %x to i80
diff --git a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
index c9b3df83613d..3ac21048dce6 100644
--- a/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
+++ b/test/CodeGen/X86/2011-06-14-PreschedRegalias.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -stress-sched | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -stress-sched | FileCheck %s
; REQUIRES: asserts
; Test interference between physreg aliases during preRAsched.
; mul wants an operand in AL, but call clobbers it.
diff --git a/test/CodeGen/X86/2011-07-13-BadFrameIndexDisplacement.ll b/test/CodeGen/X86/2011-07-13-BadFrameIndexDisplacement.ll
index f38ebf1da85a..f1b1a7077bd6 100644
--- a/test/CodeGen/X86/2011-07-13-BadFrameIndexDisplacement.ll
+++ b/test/CodeGen/X86/2011-07-13-BadFrameIndexDisplacement.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s -disable-fp-elim | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s -disable-fp-elim | FileCheck %s
; This test is checking that we don't crash and we don't incorrectly fold
; a large displacement and a frame index into a single lea.
diff --git a/test/CodeGen/X86/2011-08-23-PerformSubCombine128.ll b/test/CodeGen/X86/2011-08-23-PerformSubCombine128.ll
index 12171ac8b679..3740c05ecb01 100644
--- a/test/CodeGen/X86/2011-08-23-PerformSubCombine128.ll
+++ b/test/CodeGen/X86/2011-08-23-PerformSubCombine128.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -O2 < %s
+; RUN: llc -mtriple=x86_64-- -O2 < %s
define void @test(i64 %add127.tr.i2686) {
entry:
diff --git a/test/CodeGen/X86/2011-08-23-Trampoline.ll b/test/CodeGen/X86/2011-08-23-Trampoline.ll
index 7a5a0f81490f..3fd0936dd197 100644
--- a/test/CodeGen/X86/2011-08-23-Trampoline.ll
+++ b/test/CodeGen/X86/2011-08-23-Trampoline.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=i686--
+; RUN: llc < %s -mtriple=x86_64--
%struct.FRAME.gnat__perfect_hash_generators__select_char_position__build_identical_keys_sets = type { i32, i32, void (i32, i32)*, i8 (i32, i32)* }
diff --git a/test/CodeGen/X86/2011-08-29-BlockConstant.ll b/test/CodeGen/X86/2011-08-29-BlockConstant.ll
index 83e4bcc6093b..f98b19fe52ac 100644
--- a/test/CodeGen/X86/2011-08-29-BlockConstant.ll
+++ b/test/CodeGen/X86/2011-08-29-BlockConstant.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-09-14-valcoalesce.ll b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
index 812628bf0e70..6e4fab50ca18 100644
--- a/test/CodeGen/X86/2011-09-14-valcoalesce.ll
+++ b/test/CodeGen/X86/2011-09-14-valcoalesce.ll
@@ -1,18 +1,18 @@
-; RUN: llc < %s -march=x86 -disable-block-placement | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -disable-block-placement | FileCheck %s
;
; Test RegistersDefinedFromSameValue. We have multiple copies of the same vreg:
; while.body85.i:
-; vreg1 = copy vreg2
-; vreg2 = add
+; %1 = copy %2
+; %2 = add
; critical edge from land.lhs.true.i -> if.end117.i:
-; vreg27 = vreg2
+; %27 = %2
; critical edge from land.lhs.true103.i -> if.end117.i:
-; vreg27 = vreg2
+; %27 = %2
; if.then108.i:
-; vreg27 = vreg1
+; %27 = %1
;
; Prior to fixing PR10920 401.bzip miscompile, the coalescer would
-; consider vreg1 and vreg27 to be copies of the same value. It would
+; consider %1 and %27 to be copies of the same value. It would
; then remove one of the critical edge copes, which cannot safely be removed.
; There are two obvious ways the register-allocator could go here, either
diff --git a/test/CodeGen/X86/2011-09-18-sse2cmp.ll b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
index 89de648ca966..7c75feb9375a 100644
--- a/test/CodeGen/X86/2011-09-18-sse2cmp.ll
+++ b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
+;RUN: llc < %s -mtriple=i686-- -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
;CHECK: @max
;CHECK: cmplepd
diff --git a/test/CodeGen/X86/2011-09-21-setcc-bug.ll b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
index e61715a4813d..235d5f04f05a 100644
--- a/test/CodeGen/X86/2011-09-21-setcc-bug.ll
+++ b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 -mattr=+sse4.1
; Make sure we are not crashing on this code.
diff --git a/test/CodeGen/X86/2011-10-18-FastISel-VectorParams.ll b/test/CodeGen/X86/2011-10-18-FastISel-VectorParams.ll
index c9dc050d0b4e..0cfe89ce7eb2 100644
--- a/test/CodeGen/X86/2011-10-18-FastISel-VectorParams.ll
+++ b/test/CodeGen/X86/2011-10-18-FastISel-VectorParams.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -fast-isel -mattr=+sse < %s | FileCheck %s
+; RUN: llc -fast-isel -mattr=+sse < %s | FileCheck %s
; <rdar://problem/10215997>
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index bf1f029847ea..2ae3d389d055 100644
--- a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i8:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index e812cbe3270a..c98bafcd565e 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -7,13 +7,13 @@
define void @simple_widen(<2 x float> %a, <2 x float> %b) {
; X32-LABEL: simple_widen:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: extractps $1, %xmm1, (%eax)
; X32-NEXT: movss %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: simple_widen:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movlps %xmm1, (%rax)
; X64-NEXT: retq
entry:
@@ -24,7 +24,7 @@ entry:
define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
; X32-LABEL: complex_inreg_work:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movaps %xmm0, %xmm2
; X32-NEXT: cmpordps %xmm0, %xmm0
; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -33,7 +33,7 @@ define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
; X32-NEXT: retl
;
; X64-LABEL: complex_inreg_work:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movaps %xmm0, %xmm2
; X64-NEXT: cmpordps %xmm0, %xmm0
; X64-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -48,14 +48,14 @@ entry:
define void @zero_test() {
; X32-LABEL: zero_test:
-; X32: # BB#0: # %entry
-; X32-NEXT: pxor %xmm0, %xmm0
-; X32-NEXT: pextrd $1, %xmm0, (%eax)
-; X32-NEXT: movd %xmm0, (%eax)
+; X32: # %bb.0: # %entry
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: extractps $1, %xmm0, (%eax)
+; X32-NEXT: movss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: zero_test:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movlps %xmm0, (%rax)
; X64-NEXT: retq
@@ -67,9 +67,8 @@ entry:
define void @full_test() {
; X32-LABEL: full_test:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: subl $60, %esp
-; X32-NEXT: .Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 64
; X32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT: cvttps2dq %xmm2, %xmm0
@@ -92,7 +91,7 @@ define void @full_test() {
; X32-NEXT: retl
;
; X64-LABEL: full_test:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; X64-NEXT: cvttps2dq %xmm2, %xmm0
; X64-NEXT: cvtdq2ps %xmm0, %xmm1
diff --git a/test/CodeGen/X86/2011-10-21-widen-cmp.ll b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
index 9232eba213bf..812faaf473d0 100644
--- a/test/CodeGen/X86/2011-10-21-widen-cmp.ll
+++ b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -6,7 +6,7 @@
define void @cmp_2_floats(<2 x float> %a, <2 x float> %b) {
; CHECK-LABEL: cmp_2_floats:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, %xmm2
; CHECK-NEXT: cmpordps %xmm0, %xmm0
; CHECK-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -21,7 +21,7 @@ entry:
define void @cmp_2_doubles(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: cmp_2_doubles:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movapd %xmm0, %xmm2
; CHECK-NEXT: cmpordpd %xmm0, %xmm0
; CHECK-NEXT: blendvpd %xmm0, %xmm2, %xmm1
@@ -36,7 +36,7 @@ entry:
define void @mp_11193(<8 x float> * nocapture %aFOO, <8 x float>* nocapture %RET) nounwind {
; CHECK-LABEL: mp_11193:
-; CHECK: # BB#0: # %allocas
+; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: movl $-1082130432, (%rsi) # imm = 0xBF800000
; CHECK-NEXT: retq
allocas:
diff --git a/test/CodeGen/X86/2011-10-27-tstore.ll b/test/CodeGen/X86/2011-10-27-tstore.ll
index 290b4d0cb00b..295e00932753 100644
--- a/test/CodeGen/X86/2011-10-27-tstore.ll
+++ b/test/CodeGen/X86/2011-10-27-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 | FileCheck %s
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-10-30-padd.ll b/test/CodeGen/X86/2011-10-30-padd.ll
index 1b8c12bc8e38..655b6f3c2074 100644
--- a/test/CodeGen/X86/2011-10-30-padd.ll
+++ b/test/CodeGen/X86/2011-10-30-padd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 | FileCheck %s
;CHECK-LABEL: addXX_test:
;CHECK: padd
diff --git a/test/CodeGen/X86/2011-11-07-LegalizeBuildVector.ll b/test/CodeGen/X86/2011-11-07-LegalizeBuildVector.ll
index d3164707a35d..2f085300202b 100644
--- a/test/CodeGen/X86/2011-11-07-LegalizeBuildVector.ll
+++ b/test/CodeGen/X86/2011-11-07-LegalizeBuildVector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s
; We don't really care what this outputs; just make sure it's somewhat sane.
; CHECK: legalize_test
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 5c324a423923..48cd86bfb23b 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
target triple = "x86_64-apple-macosx10.6.6"
diff --git a/test/CodeGen/X86/2011-12-06-BitcastVectorGlobal.ll b/test/CodeGen/X86/2011-12-06-BitcastVectorGlobal.ll
index 7a4126f4ae2a..4ea59bb4c505 100644
--- a/test/CodeGen/X86/2011-12-06-BitcastVectorGlobal.ll
+++ b/test/CodeGen/X86/2011-12-06-BitcastVectorGlobal.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; PR11495
; CHECK: 1311768467463790320
diff --git a/test/CodeGen/X86/2011-12-08-AVXISelBugs.ll b/test/CodeGen/X86/2011-12-08-AVXISelBugs.ll
index ab1b46c99d97..f8b6f3705d96 100644
--- a/test/CodeGen/X86/2011-12-08-AVXISelBugs.ll
+++ b/test/CodeGen/X86/2011-12-08-AVXISelBugs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mattr=+avx
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx -mattr=+avx
; Various missing patterns causing crashes.
; rdar://10538793
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index 70783509bb7f..c6454530b8ac 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mattr=+sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
-; RUN: llc -march=x86-64 -mattr=-sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
+; RUN: llc -mattr=+sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
+; RUN: llc -mattr=-sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
; Test case for r146671
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
index 3439ebcf9deb..ad52d58bde1c 100644
--- a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
+++ b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
@@ -9,13 +9,13 @@
define <4 x i32> @test(<4 x i32>* %p) {
; CHECK-LABEL: test:
-; CHECK: # BB#0:
-; CHECK-NEXT: movdqa (%rdi), %xmm0
-; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: movaps (%rdi), %xmm0
+; CHECK-NEXT: extractps $2, %xmm0, %eax
; CHECK-NEXT: cmpl $3, %eax
; CHECK-NEXT: je .LBB0_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: retq
%v = load <4 x i32>, <4 x i32>* %p
diff --git a/test/CodeGen/X86/2011-12-8-bitcastintprom.ll b/test/CodeGen/X86/2011-12-8-bitcastintprom.ll
index e2ccaa1b8378..36ef1bac1a7c 100644
--- a/test/CodeGen/X86/2011-12-8-bitcastintprom.ll
+++ b/test/CodeGen/X86/2011-12-8-bitcastintprom.ll
@@ -5,7 +5,7 @@
; Make sure that the conversion between v4i8 to v2i16 is not a simple bitcast.
define void @prom_bug(<4 x i8> %t, i16* %p) {
; SSE2-LABEL: prom_bug:
-; SSE2: ## BB#0:
+; SSE2: ## %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -16,7 +16,7 @@ define void @prom_bug(<4 x i8> %t, i16* %p) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: prom_bug:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-NEXT: retq
diff --git a/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
index 539d5547d5f1..f1543d5262f6 100644
--- a/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
+++ b/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -6,7 +6,7 @@
define void @ui_to_fp_conv(<8 x float> * nocapture %aFOO, <8 x float>* nocapture %RET) nounwind {
; CHECK-LABEL: ui_to_fp_conv:
-; CHECK: # BB#0: # %allocas
+; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,0.000000e+00,0.000000e+00]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rsi)
diff --git a/test/CodeGen/X86/2012-01-11-split-cv.ll b/test/CodeGen/X86/2012-01-11-split-cv.ll
index 34ec48a02517..c8424fa69aaf 100644
--- a/test/CodeGen/X86/2012-01-11-split-cv.ll
+++ b/test/CodeGen/X86/2012-01-11-split-cv.ll
@@ -3,7 +3,7 @@
define void @add18i16(<18 x i16>* nocapture sret %ret, <18 x i16>* %bp) nounwind {
; CHECK-LABEL: add18i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: vmovups (%ecx), %ymm0
diff --git a/test/CodeGen/X86/2012-01-12-extract-sv.ll b/test/CodeGen/X86/2012-01-12-extract-sv.ll
index 9bc4b5f55b64..156e373a5af0 100644
--- a/test/CodeGen/X86/2012-01-12-extract-sv.ll
+++ b/test/CodeGen/X86/2012-01-12-extract-sv.ll
@@ -2,13 +2,13 @@
define void @endless_loop() {
; CHECK-LABEL: endless_loop:
-; CHECK-NEXT: # BB#0:
+; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: vmovaps (%eax), %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7]
; CHECK-NEXT: vmovaps %ymm0, (%eax)
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 9eb59e41ef7d..647f15730c56 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-pc-win32 | FileCheck %s
;CHECK-LABEL: vcast:
define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
diff --git a/test/CodeGen/X86/2012-02-23-mmx-inlineasm.ll b/test/CodeGen/X86/2012-02-23-mmx-inlineasm.ll
index a55c77bd2266..790bc1f329f1 100644
--- a/test/CodeGen/X86/2012-02-23-mmx-inlineasm.ll
+++ b/test/CodeGen/X86/2012-02-23-mmx-inlineasm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -mcpu=i686 -mattr=+mmx < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -mcpu=i686 -mattr=+mmx < %s | FileCheck %s
; <rdar://problem/10106006>
define void @func() nounwind ssp {
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 2a76e1a66b2b..8066b76f3bf5 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -6,9 +6,9 @@
define void @func() nounwind ssp {
; CHECK-LABEL: func:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups 0, %xmm0
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
; CHECK-NEXT: vbroadcastss 32, %xmm3
@@ -26,6 +26,7 @@ define void @func() nounwind ssp {
; CHECK-NEXT: vmovaps %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
+; CHECK-NEXT: ## -- End function
%tmp = load <4 x float>, <4 x float>* null, align 1
%tmp14 = getelementptr <4 x float>, <4 x float>* null, i32 2
%tmp15 = load <4 x float>, <4 x float>* %tmp14, align 1
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index a41123e40a58..e1f9839340c3 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -3,7 +3,7 @@
define void @load_store(<4 x i16>* %in) {
; CHECK-LABEL: load_store:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; CHECK-NEXT: paddw %xmm0, %xmm0
@@ -20,7 +20,7 @@ entry:
; Make sure that we store a 64bit value, even on 32bit systems.
define void @store_64(<2 x i32>* %ptr) {
; CHECK-LABEL: store_64:
-; CHECK: # BB#0: # %BB
+; CHECK: # %bb.0: # %BB
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: movlps %xmm0, (%eax)
@@ -32,7 +32,7 @@ BB:
define <2 x i32> @load_64(<2 x i32>* %ptr) {
; CHECK-LABEL: load_64:
-; CHECK: # BB#0: # %BB
+; CHECK: # %bb.0: # %BB
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/2012-07-10-shufnorm.ll b/test/CodeGen/X86/2012-07-10-shufnorm.ll
index e39df58877f6..9ce9d14f42ef 100644
--- a/test/CodeGen/X86/2012-07-10-shufnorm.ll
+++ b/test/CodeGen/X86/2012-07-10-shufnorm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 -mattr=+avx | FileCheck %s
; CHECK: ocl
define void @ocl() {
diff --git a/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll b/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
index 078f1b05c3fc..2d72d2b2575c 100644
--- a/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
+++ b/test/CodeGen/X86/2012-07-15-BuildVectorPromote.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7
; We don't care about the output, just that it doesn't crash
define <1 x i1> @buildvec_promote() {
diff --git a/test/CodeGen/X86/2012-07-15-broadcastfold.ll b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
index 7c8c2f28348a..d3106c10e88a 100644
--- a/test/CodeGen/X86/2012-07-15-broadcastfold.ll
+++ b/test/CodeGen/X86/2012-07-15-broadcastfold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 -mattr=+avx2 | FileCheck %s
declare x86_fastcallcc i64 @barrier()
diff --git a/test/CodeGen/X86/2012-07-15-tconst_shl.ll b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
index 46eca7644ebb..0ac26c3b402f 100644
--- a/test/CodeGen/X86/2012-07-15-tconst_shl.ll
+++ b/test/CodeGen/X86/2012-07-15-tconst_shl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+avx2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 -mattr=+avx2
; make sure that we are not crashing.
define <16 x i32> @autogen_SD34717() {
diff --git a/test/CodeGen/X86/2012-07-15-vshl.ll b/test/CodeGen/X86/2012-07-15-vshl.ll
index cd0fef469e6a..b8b6f06bc0b0 100644
--- a/test/CodeGen/X86/2012-07-15-vshl.ll
+++ b/test/CodeGen/X86/2012-07-15-vshl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 -mattr=+avx
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 -mattr=+avx
; PR13352
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/2012-07-16-LeaUndef.ll b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
index 9e5cbd2f3373..194d9e63b9fd 100644
--- a/test/CodeGen/X86/2012-07-16-LeaUndef.ll
+++ b/test/CodeGen/X86/2012-07-16-LeaUndef.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7
define void @autogen_SD2543() {
A:
diff --git a/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
index 17533a1e1649..7e78d70921f9 100644
--- a/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
+++ b/test/CodeGen/X86/2012-07-16-fp2ui-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7
define void @autogen_SD3100() {
BB:
diff --git a/test/CodeGen/X86/2012-07-17-vtrunc.ll b/test/CodeGen/X86/2012-07-17-vtrunc.ll
index 2de2f97d7d2d..764ccfe66e86 100644
--- a/test/CodeGen/X86/2012-07-17-vtrunc.ll
+++ b/test/CodeGen/X86/2012-07-17-vtrunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7
define void @autogen_SD33189483() {
BB:
diff --git a/test/CodeGen/X86/2012-07-23-select_cc.ll b/test/CodeGen/X86/2012-07-23-select_cc.ll
index 33fcb120e162..1838dc97dda3 100644
--- a/test/CodeGen/X86/2012-07-23-select_cc.ll
+++ b/test/CodeGen/X86/2012-07-23-select_cc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7
+; RUN: llc < %s -mcpu=corei7
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll
index cba208e62a14..a31b651b3e3d 100644
--- a/test/CodeGen/X86/2012-08-16-setcc.ll
+++ b/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -5,7 +5,7 @@
define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
; CHECK-LABEL: and_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: andb %dil, %sil
; CHECK-NEXT: cmovnel %edx, %eax
@@ -18,7 +18,7 @@ define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
; CHECK-LABEL: and_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andb %dil, %sil
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
@@ -29,7 +29,7 @@ define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
; CHECK-LABEL: xor_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorb %dil, %sil
; CHECK-NEXT: cmovnel %edx, %eax
@@ -42,7 +42,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
; CHECK-LABEL: xor_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorb %dil, %sil
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
index 7b9bab97be6f..010f0934b4ae 100644
--- a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
+++ b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mcpu=corei7 < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/CodeGen/X86/2012-1-10-buildvector.ll b/test/CodeGen/X86/2012-1-10-buildvector.ll
index 32212343869f..03044ac3722e 100644
--- a/test/CodeGen/X86/2012-1-10-buildvector.ll
+++ b/test/CodeGen/X86/2012-1-10-buildvector.ll
@@ -3,11 +3,10 @@
define void @bad_cast() {
; CHECK-LABEL: bad_cast:
-; CHECK: # BB#0:
-; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, (%eax)
; CHECK-NEXT: movl $0, (%eax)
-; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
%vext.i = shufflevector <2 x i64> undef, <2 x i64> undef, <3 x i32> <i32 0, i32 1, i32 undef>
%vecinit8.i = shufflevector <3 x i64> zeroinitializer, <3 x i64> %vext.i, <3 x i32> <i32 0, i32 3, i32 4>
@@ -17,7 +16,7 @@ define void @bad_cast() {
define void @bad_insert(i32 %t) {
; CHECK-LABEL: bad_insert:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vmovaps %ymm0, (%eax)
; CHECK-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
index fb29241035ef..8b1fd74dba1e 100644
--- a/test/CodeGen/X86/2012-10-18-crash-dagco.ll
+++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
@@ -1,9 +1,9 @@
-; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s
+; RUN: llc -mcpu=corei7 -disable-cgp-select2branch < %s
; We should not crash on this test.
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-target triple = "i386-apple-darwin9.0.0"
+target triple = "x86_64-apple-darwin9.0.0"
@global = external constant [411 x i8], align 1
diff --git a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
index 2e8206a75916..2bd755328ade 100644
--- a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
+++ b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
; CHECK: merge_stores_can
; CHECK: callq foo
diff --git a/test/CodeGen/X86/2012-12-1-merge-multiple.ll b/test/CodeGen/X86/2012-12-1-merge-multiple.ll
index 9be8b5bbb427..83a719742204 100644
--- a/test/CodeGen/X86/2012-12-1-merge-multiple.ll
+++ b/test/CodeGen/X86/2012-12-1-merge-multiple.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mtriple=x86_64-pc-win64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-; CHECK: multiple_stores_on_chain
-; CHECK: movabsq
-; CHECK: movq
-; CHECK: movabsq
-; CHECK: movq
-; CHECK: ret
define void @multiple_stores_on_chain(i16 * %A) {
+; CHECK-LABEL: multiple_stores_on_chain:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $844433520132096, %rax # imm = 0x3000200010000
+; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: movabsq $1970350607106052, %rax # imm = 0x7000600050004
+; CHECK-NEXT: movq %rax, 8(%rdi)
+; CHECK-NEXT: retq
entry:
%a0 = getelementptr inbounds i16, i16* %A, i64 0
%a1 = getelementptr inbounds i16, i16* %A, i64 1
diff --git a/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll b/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll
index 8cef2c8201c6..8420d8871feb 100644
--- a/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll
+++ b/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -mtriple=i686-apple-ios -mcpu=yonah < %s
+; RUN: llc -mtriple=i686-apple-ios -mcpu=yonah < %s
; rdar://12868039
define void @t() nounwind ssp {
diff --git a/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll b/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll
index c465527bd867..e9daa39c5ee8 100644
--- a/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll
+++ b/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32
+; RUN: llc < %s -mcpu=corei7 -mtriple=i686-pc-win32
; Make sure we don't crash on this testcase.
diff --git a/test/CodeGen/X86/2013-05-06-ConactVectorCrash.ll b/test/CodeGen/X86/2013-05-06-ConactVectorCrash.ll
index 920341799d63..55b8a9956d24 100644
--- a/test/CodeGen/X86/2013-05-06-ConactVectorCrash.ll
+++ b/test/CodeGen/X86/2013-05-06-ConactVectorCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; Make sure this doesn't crash
diff --git a/test/CodeGen/X86/2014-05-29-factorial.ll b/test/CodeGen/X86/2014-05-29-factorial.ll
index 987a21d34eab..6d7bc3918f2a 100644
--- a/test/CodeGen/X86/2014-05-29-factorial.ll
+++ b/test/CodeGen/X86/2014-05-29-factorial.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK: decq [[X:%rdi|%rcx]]
; CHECK-NOT: testq [[X]], [[X]]
diff --git a/test/CodeGen/X86/3dnow-intrinsics.ll b/test/CodeGen/X86/3dnow-intrinsics.ll
index fe8b95ec4655..6817df4b1d1d 100644
--- a/test/CodeGen/X86/3dnow-intrinsics.ll
+++ b/test/CodeGen/X86/3dnow-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+3dnow | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s
define <8 x i8> @test_pavgusb(x86_mmx %a.coerce, x86_mmx %b.coerce) nounwind readnone {
; CHECK: pavgusb
diff --git a/test/CodeGen/X86/3dnow-schedule.ll b/test/CodeGen/X86/3dnow-schedule.ll
new file mode 100644
index 000000000000..d8ecfb8114a3
--- /dev/null
+++ b/test/CodeGen/X86/3dnow-schedule.ll
@@ -0,0 +1,394 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+3dnowa | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+
+define void @test_femms() optsize {
+; CHECK-LABEL: test_femms:
+; CHECK: # %bb.0:
+; CHECK-NEXT: femms
+; CHECK-NEXT: retq # sched: [1:1.00]
+ call void @llvm.x86.mmx.femms()
+ ret void
+}
+declare void @llvm.x86.mmx.femms() nounwind readnone
+
+define i64 @test_pavgusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pavgusb:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pavgusb %mm1, %mm0 # sched: [5:1.00]
+; CHECK-NEXT: pavgusb (%rdi), %mm0 # sched: [9:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pf2id(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pf2id:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pf2id (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: pf2id %mm0, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pf2id(x86_mmx) nounwind readnone
+
+define i64 @test_pf2iw(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pf2iw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pf2iw (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: pf2iw %mm0, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnowa.pf2iw(x86_mmx) nounwind readnone
+
+define i64 @test_pfacc(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfacc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfacc %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfacc (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfacc(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfadd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfadd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfadd %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfadd (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfcmpeq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfcmpeq:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfcmpeq %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfcmpeq (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfcmpge(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfcmpge:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfcmpge %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfcmpge (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpge(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfcmpgt(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfcmpgt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfcmpgt %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfcmpgt (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpgt(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfmax(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfmax:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfmax %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfmax (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfmin(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfmin:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfmin %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfmin (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfmul(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfmul:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfmul %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfmul (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfnacc(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfnacc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfnacc %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfnacc (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnowa.pfnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfpnacc(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfpnacc:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfpnacc %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfpnacc (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnowa.pfpnacc(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfrcp(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pfrcp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfrcp (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: pfrcp %mm0, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcp(x86_mmx) nounwind readnone
+
+define i64 @test_pfrcpit1(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfrcpit1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfrcpit1 %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfrcpit1 (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcpit1(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfrcpit2(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfrcpit2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfrcpit2 %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfrcpit2 (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfrcpit2(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfrsqit1(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfrsqit1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfrsqit1 %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfrsqit1 (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfrsqit1(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfrsqrt(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pfrsqrt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfrsqrt (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: pfrsqrt %mm0, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfrsqrt(x86_mmx) nounwind readnone
+
+define i64 @test_pfsub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfsub:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfsub %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfsub (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pfsubr(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pfsubr:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pfsubr %mm1, %mm0 # sched: [3:1.00]
+; CHECK-NEXT: pfsubr (%rdi), %mm0 # sched: [7:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pi2fd(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pi2fd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pi2fd (%rdi), %mm0 # sched: [8:1.00]
+; CHECK-NEXT: pi2fd %mm0, %mm0 # sched: [4:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pi2fd(x86_mmx) nounwind readnone
+
+define i64 @test_pi2fw(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pi2fw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pi2fw (%rdi), %mm0 # sched: [8:1.00]
+; CHECK-NEXT: pi2fw %mm0, %mm0 # sched: [4:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnowa.pi2fw(x86_mmx) nounwind readnone
+
+define i64 @test_pmulhrw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; CHECK-LABEL: test_pmulhrw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pmulhrw %mm1, %mm0 # sched: [5:1.00]
+; CHECK-NEXT: pmulhrw (%rdi), %mm0 # sched: [9:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx) nounwind readnone
+
+define void @test_prefetch(i8* %a0) optsize {
+; CHECK-LABEL: test_prefetch:
+; CHECK: # %bb.0:
+; CHECK-NEXT: #APP
+; CHECK-NEXT: prefetch (%rdi) # sched: [5:0.50]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: retq # sched: [1:1.00]
+ tail call void asm sideeffect "prefetch $0", "*m"(i8 *%a0) nounwind
+ ret void
+}
+
+define void @test_prefetchw(i8* %a0) optsize {
+; CHECK-LABEL: test_prefetchw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: #APP
+; CHECK-NEXT: prefetchw (%rdi) # sched: [5:0.50]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: retq # sched: [1:1.00]
+ tail call void asm sideeffect "prefetchw $0", "*m"(i8 *%a0) nounwind
+ ret void
+}
+
+define i64 @test_pswapd(x86_mmx* %a0) optsize {
+; CHECK-LABEL: test_pswapd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pswapd (%rdi), %mm0 # mm0 = mem[1,0] sched: [5:1.00]
+; CHECK-NEXT: pswapd %mm0, %mm0 # mm0 = mm0[1,0] sched: [1:1.00]
+; CHECK-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; CHECK-NEXT: retq # sched: [1:1.00]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.3dnowa.pswapd(x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/4char-promote.ll b/test/CodeGen/X86/4char-promote.ll
index 4f1a859fd436..bfe025eaa914 100644
--- a/test/CodeGen/X86/4char-promote.ll
+++ b/test/CodeGen/X86/4char-promote.ll
@@ -1,5 +1,5 @@
; A test for checking PR 9623
-; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+; RUN: llc -mcpu=corei7 < %s | FileCheck %s
target triple = "x86_64-apple-darwin"
diff --git a/test/CodeGen/X86/AppendingLinkage.ll b/test/CodeGen/X86/AppendingLinkage.ll
index 1a49287d1b38..5ab49a28e96d 100644
--- a/test/CodeGen/X86/AppendingLinkage.ll
+++ b/test/CodeGen/X86/AppendingLinkage.ll
@@ -1,4 +1,4 @@
-; RUN: not llc < %s -march=x86 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=i686-- 2>&1 | FileCheck %s
; CHECK: unknown special variable
@foo = appending constant [1 x i32 ]zeroinitializer
diff --git a/test/CodeGen/X86/Atomics-64.ll b/test/CodeGen/X86/Atomics-64.ll
index 6d367a71d015..b5748730548e 100644
--- a/test/CodeGen/X86/Atomics-64.ll
+++ b/test/CodeGen/X86/Atomics-64.ll
@@ -1,7 +1,6 @@
-; RUN: llc < %s -march=x86-64 > %t.x86-64
-; RUN: llc < %s -march=x86 -mattr=cx16 > %t.x86
+; RUN: llc < %s -mtriple=x86_64-apple-darwin8 > %t.x86-64
+; RUN: llc < %s -mtriple=i686-apple-darwin8 -mattr=cx16 > %t.x86
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-apple-darwin8"
@sc = common global i8 0
@uc = common global i8 0
diff --git a/test/CodeGen/X86/DbgValueOtherTargets.test b/test/CodeGen/X86/DbgValueOtherTargets.test
index 7b4d431c93b1..02f56823fd1d 100644
--- a/test/CodeGen/X86/DbgValueOtherTargets.test
+++ b/test/CodeGen/X86/DbgValueOtherTargets.test
@@ -1,2 +1,2 @@
-RUN: llc -O0 -march=x86 -asm-verbose < %S/../Inputs/DbgValueOtherTargets.ll | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
-RUN: llc -O0 -march=x86-64 -asm-verbose < %S/../Inputs/DbgValueOtherTargets.ll | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
+RUN: llc -O0 -mtriple=i686-- -asm-verbose < %S/../Inputs/DbgValueOtherTargets.ll | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
+RUN: llc -O0 -mtriple=x86_64-- -asm-verbose < %S/../Inputs/DbgValueOtherTargets.ll | FileCheck %S/../Inputs/DbgValueOtherTargets.ll
diff --git a/test/CodeGen/X86/GlobalISel/GV.ll b/test/CodeGen/X86/GlobalISel/GV.ll
index 44862ab5a96e..09a2fe665c40 100644
--- a/test/CodeGen/X86/GlobalISel/GV.ll
+++ b/test/CodeGen/X86/GlobalISel/GV.ll
@@ -9,22 +9,22 @@
; Function Attrs: noinline nounwind optnone uwtable
define i32* @test_global_ptrv() #3 {
; X64-LABEL: test_global_ptrv:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: leaq g_int, %rax
; X64-NEXT: retq
;
; X64_DARWIN_PIC-LABEL: test_global_ptrv:
-; X64_DARWIN_PIC: ## BB#0: ## %entry
+; X64_DARWIN_PIC: ## %bb.0: ## %entry
; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax
; X64_DARWIN_PIC-NEXT: retq
;
; X32-LABEL: test_global_ptrv:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: leal g_int, %eax
; X32-NEXT: retl
;
; X32ABI-LABEL: test_global_ptrv:
-; X32ABI: # BB#0: # %entry
+; X32ABI: # %bb.0: # %entry
; X32ABI-NEXT: leal g_int, %eax
; X32ABI-NEXT: retq
entry:
@@ -34,25 +34,25 @@ entry:
; Function Attrs: noinline nounwind optnone uwtable
define i32 @test_global_valv() #3 {
; X64-LABEL: test_global_valv:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: leaq g_int, %rax
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: retq
;
; X64_DARWIN_PIC-LABEL: test_global_valv:
-; X64_DARWIN_PIC: ## BB#0: ## %entry
+; X64_DARWIN_PIC: ## %bb.0: ## %entry
; X64_DARWIN_PIC-NEXT: leaq _g_int(%rip), %rax
; X64_DARWIN_PIC-NEXT: movl (%rax), %eax
; X64_DARWIN_PIC-NEXT: retq
;
; X32-LABEL: test_global_valv:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: leal g_int, %eax
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
;
; X32ABI-LABEL: test_global_valv:
-; X32ABI: # BB#0: # %entry
+; X32ABI: # %bb.0: # %entry
; X32ABI-NEXT: leal g_int, %eax
; X32ABI-NEXT: movl (%eax), %eax
; X32ABI-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll
index a5dc7906363c..0ef7c956d493 100644
--- a/test/CodeGen/X86/GlobalISel/add-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll
@@ -4,19 +4,16 @@
define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
; X64-LABEL: test_add_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq (%rsi,%rdi), %rax
; X64-NEXT: retq
;
; X32-LABEL: test_add_i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
-; X32-NEXT: .Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .Lcfi1:
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .Lcfi2:
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: movl 16(%ebp), %eax
; X32-NEXT: movl 20(%ebp), %edx
@@ -30,16 +27,16 @@ define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
; X64-LABEL: test_add_i32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: leal (%rsi,%rdi), %eax
; X64-NEXT: retq
;
; X32-LABEL: test_add_i32:
-; X32: # BB#0:
-; X32-NEXT: movl 8(%esp), %eax
-; X32-NEXT: addl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
%ret = add i32 %arg1, %arg2
ret i32 %ret
@@ -47,17 +44,17 @@ define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
; X64-LABEL: test_add_i16:
-; X64: # BB#0:
-; X64-NEXT: # kill: %DI<def> %DI<kill> %RDI<def>
-; X64-NEXT: # kill: %SI<def> %SI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: leal (%rsi,%rdi), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
;
; X32-LABEL: test_add_i16:
-; X32: # BB#0:
-; X32-NEXT: movzwl 8(%esp), %eax
-; X32-NEXT: addw 4(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addw {{[0-9]+}}(%esp), %ax
; X32-NEXT: retl
%ret = add i16 %arg1, %arg2
ret i16 %ret
@@ -65,16 +62,41 @@ define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
; X64-LABEL: test_add_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addb %dil, %sil
; X64-NEXT: movl %esi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_add_i8:
-; X32: # BB#0:
-; X32-NEXT: movb 8(%esp), %al
-; X32-NEXT: addb 4(%esp), %al
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: addb {{[0-9]+}}(%esp), %al
; X32-NEXT: retl
%ret = add i8 %arg1, %arg2
ret i8 %ret
}
+
+define i32 @test_add_i1(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_add_i1:
+; X64: # %bb.0:
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: sete %al
+; X64-NEXT: addb %al, %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_add_i1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %al
+; X32-NEXT: addb %al, %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: retl
+ %c = icmp eq i32 %arg1, %arg2
+ %x = add i1 %c , %c
+ %ret = zext i1 %x to i32
+ ret i32 %ret
+}
diff --git a/test/CodeGen/X86/GlobalISel/add-vec.ll b/test/CodeGen/X86/GlobalISel/add-vec.ll
index 0ea1cf820c0f..6bebf09b26b8 100644
--- a/test/CodeGen/X86/GlobalISel/add-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/add-vec.ll
@@ -6,7 +6,7 @@
define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; ALL-LABEL: test_add_v16i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ret = add <16 x i8> %arg1, %arg2
@@ -15,7 +15,7 @@ define <16 x i8> @test_add_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; ALL-LABEL: test_add_v8i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ret = add <8 x i16> %arg1, %arg2
@@ -24,7 +24,7 @@ define <8 x i16> @test_add_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; ALL-LABEL: test_add_v4i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ret = add <4 x i32> %arg1, %arg2
@@ -33,7 +33,7 @@ define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
; ALL-LABEL: test_add_v2i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ret = add <2 x i64> %arg1, %arg2
@@ -42,17 +42,17 @@ define <2 x i64> @test_add_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
; SKX-LABEL: test_add_v32i8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -65,17 +65,17 @@ define <32 x i8> @test_add_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
define <16 x i16> @test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
; SKX-LABEL: test_add_v16i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -88,17 +88,17 @@ define <16 x i16> @test_add_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
; SKX-LABEL: test_add_v8i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -111,17 +111,17 @@ define <8 x i32> @test_add_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
; SKX-LABEL: test_add_v4i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
@@ -134,18 +134,18 @@ define <4 x i64> @test_add_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
; SKX-LABEL: test_add_v64i8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
@@ -163,18 +163,18 @@ define <64 x i8> @test_add_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
; SKX-LABEL: test_add_v32i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
@@ -192,18 +192,18 @@ define <32 x i16> @test_add_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
; SKX-LABEL: test_add_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
@@ -221,18 +221,18 @@ define <16 x i32> @test_add_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
define <8 x i64> @test_add_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
; SKX-LABEL: test_add_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
;
; AVX2-LABEL: test_add_v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX1-LABEL: test_add_v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
diff --git a/test/CodeGen/X86/GlobalISel/and-scalar.ll b/test/CodeGen/X86/GlobalISel/and-scalar.ll
index b19321421087..b23701394148 100644
--- a/test/CodeGen/X86/GlobalISel/and-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/and-scalar.ll
@@ -1,9 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
+define i32 @test_and_i1(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_and_i1:
+; ALL: # %bb.0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: sete %al
+; ALL-NEXT: andb %al, %al
+; ALL-NEXT: movzbl %al, %eax
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %c = icmp eq i32 %arg1, %arg2
+ %x = and i1 %c , %c
+ %ret = zext i1 %x to i32
+ ret i32 %ret
+}
+
define i8 @test_and_i8(i8 %arg1, i8 %arg2) {
; ALL-LABEL: test_and_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: andb %dil, %sil
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -13,7 +28,7 @@ define i8 @test_and_i8(i8 %arg1, i8 %arg2) {
define i16 @test_and_i16(i16 %arg1, i16 %arg2) {
; ALL-LABEL: test_and_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: andw %di, %si
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -23,7 +38,7 @@ define i16 @test_and_i16(i16 %arg1, i16 %arg2) {
define i32 @test_and_i32(i32 %arg1, i32 %arg2) {
; ALL-LABEL: test_and_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: andl %edi, %esi
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -33,7 +48,7 @@ define i32 @test_and_i32(i32 %arg1, i32 %arg2) {
define i64 @test_and_i64(i64 %arg1, i64 %arg2) {
; ALL-LABEL: test_and_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: andq %rdi, %rsi
; ALL-NEXT: movq %rsi, %rax
; ALL-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll
index d7ae4435682f..a0efcffa66f7 100644
--- a/test/CodeGen/X86/GlobalISel/binop.ll
+++ b/test/CodeGen/X86/GlobalISel/binop.ll
@@ -6,7 +6,7 @@
define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
; ALL-LABEL: test_sub_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: subq %rsi, %rdi
; ALL-NEXT: movq %rdi, %rax
; ALL-NEXT: retq
@@ -16,7 +16,7 @@ define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
; ALL-LABEL: test_sub_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: subl %esi, %edi
; ALL-NEXT: movl %edi, %eax
; ALL-NEXT: retq
@@ -26,12 +26,12 @@ define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
define float @test_add_float(float %arg1, float %arg2) {
; SSE-LABEL: test_add_float:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_add_float:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = fadd float %arg1, %arg2
@@ -40,12 +40,12 @@ define float @test_add_float(float %arg1, float %arg2) {
define double @test_add_double(double %arg1, double %arg2) {
; SSE-LABEL: test_add_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_add_double:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = fadd double %arg1, %arg2
@@ -54,12 +54,12 @@ define double @test_add_double(double %arg1, double %arg2) {
define float @test_sub_float(float %arg1, float %arg2) {
; SSE-LABEL: test_sub_float:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_sub_float:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = fsub float %arg1, %arg2
@@ -68,12 +68,12 @@ define float @test_sub_float(float %arg1, float %arg2) {
define double @test_sub_double(double %arg1, double %arg2) {
; SSE-LABEL: test_sub_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_sub_double:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = fsub double %arg1, %arg2
@@ -82,12 +82,12 @@ define double @test_sub_double(double %arg1, double %arg2) {
define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; SSE-LABEL: test_add_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_add_v4i32:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = add <4 x i32> %arg1, %arg2
@@ -96,12 +96,12 @@ define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; SSE-LABEL: test_sub_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_sub_v4i32:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = sub <4 x i32> %arg1, %arg2
@@ -110,12 +110,12 @@ define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) {
; SSE-LABEL: test_add_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_add_v4f32:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = fadd <4 x float> %arg1, %arg2
@@ -124,12 +124,12 @@ define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) {
define <4 x float> @test_sub_v4f32(<4 x float> %arg1, <4 x float> %arg2) {
; SSE-LABEL: test_sub_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_sub_v4f32:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0
; ALL_AVX-NEXT: retq
%ret = fsub <4 x float> %arg1, %arg2
@@ -138,12 +138,12 @@ define <4 x float> @test_sub_v4f32(<4 x float> %arg1, <4 x float> %arg2) {
define i32 @test_copy_float(float %val) {
; SSE-LABEL: test_copy_float:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_copy_float:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vmovd %xmm0, %eax
; ALL_AVX-NEXT: retq
%r = bitcast float %val to i32
@@ -152,12 +152,12 @@ define i32 @test_copy_float(float %val) {
define float @test_copy_i32(i32 %val) {
; SSE-LABEL: test_copy_i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: retq
;
; ALL_AVX-LABEL: test_copy_i32:
-; ALL_AVX: # BB#0:
+; ALL_AVX: # %bb.0:
; ALL_AVX-NEXT: vmovd %edi, %xmm0
; ALL_AVX-NEXT: retq
%r = bitcast i32 %val to float
diff --git a/test/CodeGen/X86/GlobalISel/br.ll b/test/CodeGen/X86/GlobalISel/br.ll
index 387e8797f0cd..2c07a4d326e4 100644
--- a/test/CodeGen/X86/GlobalISel/br.ll
+++ b/test/CodeGen/X86/GlobalISel/br.ll
@@ -3,7 +3,7 @@
define void @uncondbr() {
; CHECK-LABEL: uncondbr:
-; CHECK: # BB#1: # %entry
+; CHECK: # %bb.1: # %entry
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %end
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/brcond.ll b/test/CodeGen/X86/GlobalISel/brcond.ll
new file mode 100644
index 000000000000..e92573115c3d
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/brcond.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+define i32 @test_1(i32 %a, i32 %b, i32 %tValue, i32 %fValue) {
+; X64-LABEL: test_1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cmpl %esi, %edi
+; X64-NEXT: setl %al
+; X64-NEXT: testb $1, %al
+; X64-NEXT: je .LBB0_2
+; X64-NEXT: # %bb.1: # %if.then
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB0_2: # %if.else
+; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: cmpl %eax, {{[0-9]+}}(%esp)
+; X32-NEXT: setl %al
+; X32-NEXT: testb $1, %al
+; X32-NEXT: je .LBB0_2
+; X32-NEXT: # %bb.1: # %if.then
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: jmp .LBB0_3
+; X32-NEXT: .LBB0_2: # %if.else
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: .LBB0_3: # %return
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: popl %ecx
+; X32-NEXT: retl
+entry:
+ %retval = alloca i32, align 4
+ %cmp = icmp slt i32 %a, %b
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ store i32 %tValue, i32* %retval, align 4
+ br label %return
+
+if.else:
+ store i32 %fValue, i32* %retval, align 4
+ br label %return
+
+return:
+ %0 = load i32, i32* %retval, align 4
+ ret i32 %0
+}
+
+define i32 @test_2(i32 %a) {
+; X64-LABEL: test_2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: je .LBB1_2
+; X64-NEXT: # %bb.1: # %if.then
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+; X64-NEXT: .LBB1_2: # %if.else
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: retq
+;
+; X32-LABEL: test_2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: testb $1, %al
+; X32-NEXT: je .LBB1_2
+; X32-NEXT: # %bb.1: # %if.then
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: retl
+; X32-NEXT: .LBB1_2: # %if.else
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: retl
+entry:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ ret i32 0
+if.else:
+ ret i32 1
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/callingconv.ll b/test/CodeGen/X86/GlobalISel/callingconv.ll
index 8a14436e29d9..238f1fa21cfa 100644
--- a/test/CodeGen/X86/GlobalISel/callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/callingconv.ll
@@ -1,37 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_GISEL
-; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 --check-prefix=X32_ISEL
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_GISEL
-; RUN: llc -mtriple=x86_64-linux-gnu -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --check-prefix=X64_ISEL
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define i32 @test_ret_i32() {
; X32-LABEL: test_ret_i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $20, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_ret_i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $20, %eax
; X64-NEXT: retq
ret i32 20
}
define i64 @test_ret_i64() {
-; X32_GISEL-LABEL: test_ret_i64:
-; X32_GISEL: # BB#0:
-; X32_GISEL-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
-; X32_GISEL-NEXT: movl $15, %edx
-; X32_GISEL-NEXT: retl
-;
-; X32_ISEL-LABEL: test_ret_i64:
-; X32_ISEL: # BB#0:
-; X32_ISEL-NEXT: movl $-1, %eax
-; X32_ISEL-NEXT: movl $15, %edx
-; X32_ISEL-NEXT: retl
+; X32-LABEL: test_ret_i64:
+; X32: # %bb.0:
+; X32-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X32-NEXT: movl $15, %edx
+; X32-NEXT: retl
;
; X64-LABEL: test_ret_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movabsq $68719476735, %rax # imm = 0xFFFFFFFFF
; X64-NEXT: retq
ret i64 68719476735
@@ -39,12 +31,12 @@ define i64 @test_ret_i64() {
define i8 @test_arg_i8(i8 %a) {
; X32-LABEL: test_arg_i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb 4(%esp), %al
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
ret i8 %a
@@ -52,12 +44,12 @@ define i8 @test_arg_i8(i8 %a) {
define i16 @test_arg_i16(i16 %a) {
; X32-LABEL: test_arg_i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl 4(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
ret i16 %a
@@ -65,12 +57,12 @@ define i16 @test_arg_i16(i16 %a) {
define i32 @test_arg_i32(i32 %a) {
; X32-LABEL: test_arg_i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
ret i32 %a
@@ -78,13 +70,13 @@ define i32 @test_arg_i32(i32 %a) {
define i64 @test_arg_i64(i64 %a) {
; X32-LABEL: test_arg_i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: movl 8(%esp), %edx
; X32-NEXT: retl
;
; X64-LABEL: test_arg_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
ret i64 %a
@@ -92,40 +84,331 @@ define i64 @test_arg_i64(i64 %a) {
define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8) {
; X32-LABEL: test_i64_args_8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl 60(%esp), %eax
; X32-NEXT: movl 64(%esp), %edx
; X32-NEXT: retl
;
; X64-LABEL: test_i64_args_8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq 16(%rsp), %rax
; X64-NEXT: retq
-
ret i64 %arg8
}
define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) {
; X32-LABEL: test_v4i32_args:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_v4i32_args:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
ret <4 x i32> %arg2
}
-define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) {
+define <8 x i32> @test_v8i32_args(<8 x i32> %arg1, <8 x i32> %arg2) {
; X32-LABEL: test_v8i32_args:
-; X32: # BB#0:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: movups 16(%esp), %xmm1
+; X32-NEXT: movaps %xmm2, %xmm0
+; X32-NEXT: addl $12, %esp
; X32-NEXT: retl
;
; X64-LABEL: test_v8i32_args:
-; X64: # BB#0:
+; X64: # %bb.0:
+; X64-NEXT: movaps %xmm2, %xmm0
+; X64-NEXT: movaps %xmm3, %xmm1
+; X64-NEXT: retq
+ ret <8 x i32> %arg2
+}
+
+declare void @trivial_callee()
+define void @test_trivial_call() {
+; X32-LABEL: test_trivial_call:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: calll trivial_callee
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_trivial_call:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: callq trivial_callee
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+ call void @trivial_callee()
+ ret void
+}
+
+declare void @simple_arg_callee(i32 %in0, i32 %in1)
+define void @test_simple_arg_call(i32 %in0, i32 %in1) {
+; X32-LABEL: test_simple_arg_call:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: movl 16(%esp), %eax
+; X32-NEXT: movl 20(%esp), %ecx
+; X32-NEXT: movl %ecx, (%esp)
+; X32-NEXT: movl %eax, 4(%esp)
+; X32-NEXT: calll simple_arg_callee
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_simple_arg_call:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: movl %esi, %edi
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: callq simple_arg_callee
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+ call void @simple_arg_callee(i32 %in1, i32 %in0)
+ ret void
+}
+
+declare void @simple_arg8_callee(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8)
+define void @test_simple_arg8_call(i32 %in0) {
+; X32-LABEL: test_simple_arg8_call:
+; X32: # %bb.0:
+; X32-NEXT: subl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 48
+; X32-NEXT: movl 48(%esp), %eax
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: movl %eax, 4(%esp)
+; X32-NEXT: movl %eax, 8(%esp)
+; X32-NEXT: movl %eax, 12(%esp)
+; X32-NEXT: movl %eax, 16(%esp)
+; X32-NEXT: movl %eax, 20(%esp)
+; X32-NEXT: movl %eax, 24(%esp)
+; X32-NEXT: movl %eax, 28(%esp)
+; X32-NEXT: calll simple_arg8_callee
+; X32-NEXT: addl $44, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_simple_arg8_call:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 32
+; X64-NEXT: movl %edi, (%rsp)
+; X64-NEXT: movl %edi, 8(%rsp)
+; X64-NEXT: movl %edi, %esi
+; X64-NEXT: movl %edi, %edx
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: movl %edi, %r8d
+; X64-NEXT: movl %edi, %r9d
+; X64-NEXT: callq simple_arg8_callee
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
+ call void @simple_arg8_callee(i32 %in0, i32 %in0, i32 %in0, i32 %in0,i32 %in0, i32 %in0, i32 %in0, i32 %in0)
+ ret void
+}
+
+declare i32 @simple_return_callee(i32 %in0)
+define i32 @test_simple_return_callee() {
+; X32-LABEL: test_simple_return_callee:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: movl $5, %eax
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: calll simple_return_callee
+; X32-NEXT: addl %eax, %eax
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_simple_return_callee:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movl $5, %edi
+; X64-NEXT: callq simple_return_callee
+; X64-NEXT: addl %eax, %eax
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+ %call = call i32 @simple_return_callee(i32 5)
+ %r = add i32 %call, %call
+ ret i32 %r
+}
+
+declare <8 x i32> @split_return_callee(<8 x i32> %in0)
+define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
+; X32-LABEL: test_split_return_callee:
+; X32: # %bb.0:
+; X32-NEXT: subl $44, %esp
+; X32-NEXT: .cfi_def_cfa_offset 48
+; X32-NEXT: movaps %xmm0, (%esp) # 16-byte Spill
+; X32-NEXT: movaps %xmm1, 16(%esp) # 16-byte Spill
+; X32-NEXT: movdqu 48(%esp), %xmm1
+; X32-NEXT: movdqa %xmm2, %xmm0
+; X32-NEXT: calll split_return_callee
+; X32-NEXT: paddd (%esp), %xmm0 # 16-byte Folded Reload
+; X32-NEXT: paddd 16(%esp), %xmm1 # 16-byte Folded Reload
+; X32-NEXT: addl $44, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_split_return_callee:
+; X64: # %bb.0:
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: .cfi_def_cfa_offset 48
+; X64-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; X64-NEXT: movaps %xmm1, 16(%rsp) # 16-byte Spill
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: movdqa %xmm3, %xmm1
+; X64-NEXT: callq split_return_callee
+; X64-NEXT: paddd (%rsp), %xmm0 # 16-byte Folded Reload
+; X64-NEXT: paddd 16(%rsp), %xmm1 # 16-byte Folded Reload
+; X64-NEXT: addq $40, %rsp
+; X64-NEXT: retq
+ %call = call <8 x i32> @split_return_callee(<8 x i32> %arg2)
+ %r = add <8 x i32> %arg1, %call
+ ret <8 x i32> %r
+}
+
+define void @test_indirect_call(void()* %func) {
+; X32-LABEL: test_indirect_call:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: calll *16(%esp)
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_indirect_call:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: callq *%rdi
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+ call void %func()
+ ret void
+}
+
+declare void @take_char(i8)
+define void @test_abi_exts_call(i8* %addr) {
+; X32-LABEL: test_abi_exts_call:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %eax
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %ebx, -8
+; X32-NEXT: movl 16(%esp), %eax
+; X32-NEXT: movb (%eax), %bl
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: calll take_char
+; X32-NEXT: movsbl %bl, %eax
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: calll take_char
+; X32-NEXT: movl %esi, (%esp)
+; X32-NEXT: calll take_char
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_abi_exts_call:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rbx
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: .cfi_offset %rbx, -16
+; X64-NEXT: movb (%rdi), %al
+; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movl %ebx, %edi
+; X64-NEXT: callq take_char
+; X64-NEXT: movsbl %bl, %edi
+; X64-NEXT: callq take_char
+; X64-NEXT: movl %ebx, %edi
+; X64-NEXT: callq take_char
+; X64-NEXT: popq %rbx
+; X64-NEXT: retq
+ %val = load i8, i8* %addr
+ call void @take_char(i8 %val)
+ call void @take_char(i8 signext %val)
+ call void @take_char(i8 zeroext %val)
+ ret void
+}
+
+declare void @variadic_callee(i8*, ...)
+define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
+; X32-LABEL: test_variadic_call_1:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: movl 16(%esp), %eax
+; X32-NEXT: movl 20(%esp), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: movl (%ecx), %ecx
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: movl %ecx, 4(%esp)
+; X32-NEXT: calll variadic_callee
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_variadic_call_1:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movq (%rdi), %rdi
+; X64-NEXT: movl (%rsi), %esi
+; X64-NEXT: movb $0, %al
+; X64-NEXT: callq variadic_callee
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
+
+ %addr = load i8*, i8** %addr_ptr
+ %val = load i32, i32* %val_ptr
+ call void (i8*, ...) @variadic_callee(i8* %addr, i32 %val)
+ ret void
+}
+
+define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
+; X32-LABEL: test_variadic_call_2:
+; X32: # %bb.0:
+; X32-NEXT: subl $12, %esp
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: movl 16(%esp), %eax
+; X32-NEXT: movl 20(%esp), %ecx
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: movl (%ecx), %edx
+; X32-NEXT: movl 4(%ecx), %ecx
+; X32-NEXT: movl %eax, (%esp)
+; X32-NEXT: movl $4, %eax
+; X32-NEXT: leal (%esp,%eax), %eax
+; X32-NEXT: movl %edx, 4(%esp)
+; X32-NEXT: movl %ecx, 4(%eax)
+; X32-NEXT: calll variadic_callee
+; X32-NEXT: addl $12, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: test_variadic_call_2:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: .cfi_def_cfa_offset 16
+; X64-NEXT: movq (%rdi), %rdi
+; X64-NEXT: movq (%rsi), %rcx
+; X64-NEXT: movb $1, %al
+; X64-NEXT: movq %rcx, %xmm0
+; X64-NEXT: callq variadic_callee
+; X64-NEXT: popq %rax
; X64-NEXT: retq
- ret <8 x i32> %arg1
+ %addr = load i8*, i8** %addr_ptr
+ %val = load double, double* %val_ptr
+ call void (i8*, ...) @variadic_callee(i8* %addr, double %val)
+ ret void
}
diff --git a/test/CodeGen/X86/GlobalISel/cmp.ll b/test/CodeGen/X86/GlobalISel/cmp.ll
index 39fee409d785..085f5e326754 100644
--- a/test/CodeGen/X86/GlobalISel/cmp.ll
+++ b/test/CodeGen/X86/GlobalISel/cmp.ll
@@ -3,7 +3,7 @@
define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
; ALL-LABEL: test_icmp_eq_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpb %sil, %dil
; ALL-NEXT: sete %al
; ALL-NEXT: andl $1, %eax
@@ -15,7 +15,7 @@ define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
define i32 @test_icmp_eq_i16(i16 %a, i16 %b) {
; ALL-LABEL: test_icmp_eq_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpw %si, %di
; ALL-NEXT: sete %al
; ALL-NEXT: andl $1, %eax
@@ -27,7 +27,7 @@ define i32 @test_icmp_eq_i16(i16 %a, i16 %b) {
define i32 @test_icmp_eq_i64(i64 %a, i64 %b) {
; ALL-LABEL: test_icmp_eq_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpq %rsi, %rdi
; ALL-NEXT: sete %al
; ALL-NEXT: andl $1, %eax
@@ -39,7 +39,7 @@ define i32 @test_icmp_eq_i64(i64 %a, i64 %b) {
define i32 @test_icmp_eq_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_eq_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: sete %al
; ALL-NEXT: andl $1, %eax
@@ -51,7 +51,7 @@ define i32 @test_icmp_eq_i32(i32 %a, i32 %b) {
define i32 @test_icmp_ne_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_ne_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setne %al
; ALL-NEXT: andl $1, %eax
@@ -63,7 +63,7 @@ define i32 @test_icmp_ne_i32(i32 %a, i32 %b) {
define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_ugt_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: seta %al
; ALL-NEXT: andl $1, %eax
@@ -75,7 +75,7 @@ define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) {
define i32 @test_icmp_uge_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_uge_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setae %al
; ALL-NEXT: andl $1, %eax
@@ -87,7 +87,7 @@ define i32 @test_icmp_uge_i32(i32 %a, i32 %b) {
define i32 @test_icmp_ult_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_ult_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setb %al
; ALL-NEXT: andl $1, %eax
@@ -99,7 +99,7 @@ define i32 @test_icmp_ult_i32(i32 %a, i32 %b) {
define i32 @test_icmp_ule_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_ule_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setbe %al
; ALL-NEXT: andl $1, %eax
@@ -111,7 +111,7 @@ define i32 @test_icmp_ule_i32(i32 %a, i32 %b) {
define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_sgt_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setg %al
; ALL-NEXT: andl $1, %eax
@@ -123,7 +123,7 @@ define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) {
define i32 @test_icmp_sge_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_sge_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setge %al
; ALL-NEXT: andl $1, %eax
@@ -135,7 +135,7 @@ define i32 @test_icmp_sge_i32(i32 %a, i32 %b) {
define i32 @test_icmp_slt_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_slt_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setl %al
; ALL-NEXT: andl $1, %eax
@@ -147,7 +147,7 @@ define i32 @test_icmp_slt_i32(i32 %a, i32 %b) {
define i32 @test_icmp_sle_i32(i32 %a, i32 %b) {
; ALL-LABEL: test_icmp_sle_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: setle %al
; ALL-NEXT: andl $1, %eax
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
index 5b512f9ce937..f6ebb70fcf50 100644
--- a/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -3,7 +3,7 @@
define i8 @const_i8() {
; ALL-LABEL: const_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movb $2, %al
; ALL-NEXT: retq
ret i8 2
@@ -11,7 +11,7 @@ define i8 @const_i8() {
define i16 @const_i16() {
; ALL-LABEL: const_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movw $3, %ax
; ALL-NEXT: retq
ret i16 3
@@ -19,7 +19,7 @@ define i16 @const_i16() {
define i32 @const_i32() {
; ALL-LABEL: const_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl $4, %eax
; ALL-NEXT: retq
ret i32 4
@@ -27,7 +27,7 @@ define i32 @const_i32() {
define i64 @const_i64() {
; ALL-LABEL: const_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movabsq $68719476720, %rax # imm = 0xFFFFFFFF0
; ALL-NEXT: retq
ret i64 68719476720
@@ -36,7 +36,7 @@ define i64 @const_i64() {
 ;i64 value fits into u32
define i64 @const_i64_u32() {
; ALL-LABEL: const_i64_u32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq $1879048192, %rax # imm = 0x70000000
; ALL-NEXT: retq
ret i64 1879048192
@@ -45,7 +45,7 @@ define i64 @const_i64_u32() {
 ;i64 value fits into i32
define i64 @const_i64_i32() {
; ALL-LABEL: const_i64_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq $-1, %rax
; ALL-NEXT: retq
ret i64 -1
@@ -53,7 +53,7 @@ define i64 @const_i64_i32() {
define void @main(i32 ** %data) {
; ALL-LABEL: main:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq $0, %rax
; ALL-NEXT: movq %rax, (%rdi)
; ALL-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
index 11b03bd56110..6b93a2b9de23 100644
--- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
+++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll
@@ -5,8 +5,8 @@
define i64 @test_zext_i1(i8 %a) {
; X64-LABEL: test_zext_i1:
-; X64: # BB#0:
-; X64-NEXT: # kill: %DIL<def> %DIL<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: andq $1, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
@@ -17,7 +17,7 @@ define i64 @test_zext_i1(i8 %a) {
define i64 @test_sext_i8(i8 %val) {
; X64-LABEL: test_sext_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsbq %dil, %rax
; X64-NEXT: retq
%r = sext i8 %val to i64
@@ -26,7 +26,7 @@ define i64 @test_sext_i8(i8 %val) {
define i64 @test_sext_i16(i16 %val) {
; X64-LABEL: test_sext_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movswq %di, %rax
; X64-NEXT: retq
%r = sext i16 %val to i64
diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll
index d9a09678cf4b..51cee2b51d38 100644
--- a/test/CodeGen/X86/GlobalISel/ext.ll
+++ b/test/CodeGen/X86/GlobalISel/ext.ll
@@ -4,16 +4,16 @@
define i8 @test_zext_i1toi8(i32 %a) {
; X64-LABEL: test_zext_i1toi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $1, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_zext_i1toi8:
-; X32: # BB#0:
-; X32-NEXT: movl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andb $1, %al
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
%val = trunc i32 %a to i1
%r = zext i1 %val to i8
@@ -22,16 +22,16 @@ define i8 @test_zext_i1toi8(i32 %a) {
define i16 @test_zext_i1toi16(i32 %a) {
; X64-LABEL: test_zext_i1toi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andw $1, %di
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_zext_i1toi16:
-; X32: # BB#0:
-; X32-NEXT: movl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andw $1, %ax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
%val = trunc i32 %a to i1
%r = zext i1 %val to i16
@@ -40,14 +40,14 @@ define i16 @test_zext_i1toi16(i32 %a) {
define i32 @test_zext_i1(i32 %a) {
; X64-LABEL: test_zext_i1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $1, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_zext_i1:
-; X32: # BB#0:
-; X32-NEXT: movl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: retl
%val = trunc i32 %a to i1
@@ -57,13 +57,13 @@ define i32 @test_zext_i1(i32 %a) {
define i32 @test_zext_i8(i8 %val) {
; X64-LABEL: test_zext_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_zext_i8:
-; X32: # BB#0:
-; X32-NEXT: movzbl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
%r = zext i8 %val to i32
ret i32 %r
@@ -71,13 +71,13 @@ define i32 @test_zext_i8(i8 %val) {
define i32 @test_zext_i16(i16 %val) {
; X64-LABEL: test_zext_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_zext_i16:
-; X32: # BB#0:
-; X32-NEXT: movzwl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
%r = zext i16 %val to i32
ret i32 %r
@@ -85,13 +85,13 @@ define i32 @test_zext_i16(i16 %val) {
define i32 @test_sext_i8(i8 %val) {
; X64-LABEL: test_sext_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_sext_i8:
-; X32: # BB#0:
-; X32-NEXT: movsbl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
%r = sext i8 %val to i32
ret i32 %r
@@ -99,13 +99,13 @@ define i32 @test_sext_i8(i8 %val) {
define i32 @test_sext_i16(i16 %val) {
; X64-LABEL: test_sext_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movswl %di, %eax
; X64-NEXT: retq
;
; X32-LABEL: test_sext_i16:
-; X32: # BB#0:
-; X32-NEXT: movswl 4(%esp), %eax
+; X32: # %bb.0:
+; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
%r = sext i16 %val to i32
ret i32 %r
diff --git a/test/CodeGen/X86/GlobalISel/fadd-scalar.ll b/test/CodeGen/X86/GlobalISel/fadd-scalar.ll
index 6aee06a75f6a..0fa1142c30a6 100644
--- a/test/CodeGen/X86/GlobalISel/fadd-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/fadd-scalar.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define float @test_fadd_float(float %arg1, float %arg2) {
; ALL-LABEL: test_fadd_float:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: addss %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fadd float %arg1, %arg2
@@ -11,7 +11,7 @@ define float @test_fadd_float(float %arg1, float %arg2) {
define double @test_fadd_double(double %arg1, double %arg2) {
; ALL-LABEL: test_fadd_double:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: addsd %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fadd double %arg1, %arg2
diff --git a/test/CodeGen/X86/GlobalISel/fconstant.ll b/test/CodeGen/X86/GlobalISel/fconstant.ll
new file mode 100644
index 000000000000..6c3586acd377
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/fconstant.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK_SMALL --check-prefix=CHECK_SMALL64 --check-prefix=CHECK_NOPIC64
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -code-model=large -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK_LARGE --check-prefix=CHECK_LARGE64
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK32 --check-prefix=CHECK_SMALL --check-prefix=CHECK_SMALL32
+; RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -code-model=large -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK32 --check-prefix=CHECK_LARGE --check-prefix=CHECK_LARGE32
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -relocation-model=pic -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK_SMALL --check-prefix=CHECK_SMALL64 --check-prefix=CHECK_PIC64
+
+define void @test_float(float* %a , float %b) {
+; CHECK_SMALL64-LABEL: test_float:
+; CHECK_SMALL64: # %bb.0: # %entry
+; CHECK_SMALL64-NEXT: movss .LCPI0_0(%rip), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; CHECK_SMALL64-NEXT: addss %xmm0, %xmm1
+; CHECK_SMALL64-NEXT: movd %xmm1, %eax
+; CHECK_SMALL64-NEXT: movl %eax, (%rdi)
+; CHECK_SMALL64-NEXT: retq
+;
+; CHECK_LARGE64-LABEL: test_float:
+; CHECK_LARGE64: # %bb.0: # %entry
+; CHECK_LARGE64-NEXT: movabsq $.LCPI0_0, %rax
+; CHECK_LARGE64-NEXT: addss (%rax), %xmm0
+; CHECK_LARGE64-NEXT: movd %xmm0, %eax
+; CHECK_LARGE64-NEXT: movl %eax, (%rdi)
+; CHECK_LARGE64-NEXT: retq
+;
+; CHECK32-LABEL: test_float:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movl 4(%esp), %eax
+; CHECK32-NEXT: movl 8(%esp), %ecx
+; CHECK32-NEXT: movss .LCPI0_0, %xmm0 # xmm0 = mem[0],zero,zero,zero
+; CHECK32-NEXT: movd %ecx, %xmm1
+; CHECK32-NEXT: addss %xmm0, %xmm1
+; CHECK32-NEXT: movd %xmm1, %ecx
+; CHECK32-NEXT: movl %ecx, (%eax)
+; CHECK32-NEXT: retl
+entry:
+ %aa = fadd float 5.500000e+00, %b
+ store float %aa, float* %a
+ ret void
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/fdiv-scalar.ll b/test/CodeGen/X86/GlobalISel/fdiv-scalar.ll
index 268802dc06aa..e05a36c4997e 100644
--- a/test/CodeGen/X86/GlobalISel/fdiv-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/fdiv-scalar.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define float @test_fdiv_float(float %arg1, float %arg2) {
; ALL-LABEL: test_fdiv_float:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: divss %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fdiv float %arg1, %arg2
@@ -11,7 +11,7 @@ define float @test_fdiv_float(float %arg1, float %arg2) {
define double @test_fdiv_double(double %arg1, double %arg2) {
; ALL-LABEL: test_fdiv_double:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: divsd %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fdiv double %arg1, %arg2
diff --git a/test/CodeGen/X86/GlobalISel/fmul-scalar.ll b/test/CodeGen/X86/GlobalISel/fmul-scalar.ll
index c7a37a14c33c..c2244cb8a5c3 100644
--- a/test/CodeGen/X86/GlobalISel/fmul-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/fmul-scalar.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define float @test_fmul_float(float %arg1, float %arg2) {
; ALL-LABEL: test_fmul_float:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: mulss %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fmul float %arg1, %arg2
@@ -11,7 +11,7 @@ define float @test_fmul_float(float %arg1, float %arg2) {
define double @test_fmul_double(double %arg1, double %arg2) {
; ALL-LABEL: test_fmul_double:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: mulsd %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fmul double %arg1, %arg2
diff --git a/test/CodeGen/X86/GlobalISel/fpext-scalar.ll b/test/CodeGen/X86/GlobalISel/fpext-scalar.ll
new file mode 100644
index 000000000000..8501009e2915
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/fpext-scalar.ll
@@ -0,0 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
+
+define double @test(float %a) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cvtss2sd %xmm0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %conv = fpext float %a to double
+ ret double %conv
+}
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll
index a9ec94defea8..1faa82b37c19 100644
--- a/test/CodeGen/X86/GlobalISel/frameIndex.ll
+++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll
@@ -8,21 +8,20 @@
define i32* @allocai32() {
; X64-LABEL: allocai32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -4(%rsp), %rax
; X64-NEXT: retq
;
; X32-LABEL: allocai32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movl %esp, %eax
; X32-NEXT: popl %ecx
; X32-NEXT: retl
;
; X32ABI-LABEL: allocai32:
-; X32ABI: # BB#0:
+; X32ABI: # %bb.0:
; X32ABI-NEXT: leal -4(%rsp), %eax
; X32ABI-NEXT: retq
%ptr1 = alloca i32
diff --git a/test/CodeGen/X86/GlobalISel/fsub-scalar.ll b/test/CodeGen/X86/GlobalISel/fsub-scalar.ll
index 32c25a3a0822..7fc9dd31490e 100644
--- a/test/CodeGen/X86/GlobalISel/fsub-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/fsub-scalar.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
define float @test_fsub_float(float %arg1, float %arg2) {
; ALL-LABEL: test_fsub_float:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: subss %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fsub float %arg1, %arg2
@@ -11,7 +11,7 @@ define float @test_fsub_float(float %arg1, float %arg2) {
define double @test_fsub_double(double %arg1, double %arg2) {
; ALL-LABEL: test_fsub_double:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: subsd %xmm1, %xmm0
; ALL-NEXT: retq
%ret = fsub double %arg1, %arg2
diff --git a/test/CodeGen/X86/GlobalISel/gep.ll b/test/CodeGen/X86/GlobalISel/gep.ll
index 94da9fb46761..97a986e27d23 100644
--- a/test/CodeGen/X86/GlobalISel/gep.ll
+++ b/test/CodeGen/X86/GlobalISel/gep.ll
@@ -4,7 +4,7 @@
define i32* @test_gep_i8(i32 *%arr, i8 %ind) {
; X64_GISEL-LABEL: test_gep_i8:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $4, %rax
; X64_GISEL-NEXT: movsbq %sil, %rcx
; X64_GISEL-NEXT: imulq %rax, %rcx
@@ -12,8 +12,8 @@ define i32* @test_gep_i8(i32 *%arr, i8 %ind) {
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i8:
-; X64: # BB#0:
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: movsbq %sil, %rax
; X64-NEXT: leaq (%rdi,%rax,4), %rax
; X64-NEXT: retq
@@ -23,13 +23,13 @@ define i32* @test_gep_i8(i32 *%arr, i8 %ind) {
define i32* @test_gep_i8_const(i32 *%arr) {
; X64_GISEL-LABEL: test_gep_i8_const:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $80, %rax
; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i8_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 80(%rdi), %rax
; X64-NEXT: retq
%arrayidx = getelementptr i32, i32* %arr, i8 20
@@ -38,7 +38,7 @@ define i32* @test_gep_i8_const(i32 *%arr) {
define i32* @test_gep_i16(i32 *%arr, i16 %ind) {
; X64_GISEL-LABEL: test_gep_i16:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $4, %rax
; X64_GISEL-NEXT: movswq %si, %rcx
; X64_GISEL-NEXT: imulq %rax, %rcx
@@ -46,8 +46,8 @@ define i32* @test_gep_i16(i32 *%arr, i16 %ind) {
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i16:
-; X64: # BB#0:
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: movswq %si, %rax
; X64-NEXT: leaq (%rdi,%rax,4), %rax
; X64-NEXT: retq
@@ -57,13 +57,13 @@ define i32* @test_gep_i16(i32 *%arr, i16 %ind) {
define i32* @test_gep_i16_const(i32 *%arr) {
; X64_GISEL-LABEL: test_gep_i16_const:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $80, %rax
; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i16_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 80(%rdi), %rax
; X64-NEXT: retq
%arrayidx = getelementptr i32, i32* %arr, i16 20
@@ -72,7 +72,7 @@ define i32* @test_gep_i16_const(i32 *%arr) {
define i32* @test_gep_i32(i32 *%arr, i32 %ind) {
; X64_GISEL-LABEL: test_gep_i32:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $4, %rax
; X64_GISEL-NEXT: movslq %esi, %rcx
; X64_GISEL-NEXT: imulq %rax, %rcx
@@ -80,7 +80,7 @@ define i32* @test_gep_i32(i32 *%arr, i32 %ind) {
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movslq %esi, %rax
; X64-NEXT: leaq (%rdi,%rax,4), %rax
; X64-NEXT: retq
@@ -90,13 +90,13 @@ define i32* @test_gep_i32(i32 *%arr, i32 %ind) {
define i32* @test_gep_i32_const(i32 *%arr) {
; X64_GISEL-LABEL: test_gep_i32_const:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $20, %rax
; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i32_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 20(%rdi), %rax
; X64-NEXT: retq
%arrayidx = getelementptr i32, i32* %arr, i32 5
@@ -105,14 +105,14 @@ define i32* @test_gep_i32_const(i32 *%arr) {
define i32* @test_gep_i64(i32 *%arr, i64 %ind) {
; X64_GISEL-LABEL: test_gep_i64:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $4, %rax
; X64_GISEL-NEXT: imulq %rsi, %rax
; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq (%rdi,%rsi,4), %rax
; X64-NEXT: retq
%arrayidx = getelementptr i32, i32* %arr, i64 %ind
@@ -121,13 +121,13 @@ define i32* @test_gep_i64(i32 *%arr, i64 %ind) {
define i32* @test_gep_i64_const(i32 *%arr) {
; X64_GISEL-LABEL: test_gep_i64_const:
-; X64_GISEL: # BB#0:
+; X64_GISEL: # %bb.0:
; X64_GISEL-NEXT: movq $20, %rax
; X64_GISEL-NEXT: leaq (%rdi,%rax), %rax
; X64_GISEL-NEXT: retq
;
; X64-LABEL: test_gep_i64_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 20(%rdi), %rax
; X64-NEXT: retq
%arrayidx = getelementptr i32, i32* %arr, i64 5
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
index 00aa7cf84e55..0b906e7a9859 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
@@ -11,49 +11,75 @@ define i8 @test_i8_args_8(i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4,
; ALL-LABEL: name: test_i8_args_8
; X64: fixedStack:
-; X64: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 1, alignment: 8, isImmutable: true,
-; X64: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16, isImmutable: true,
+; X64: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 1, alignment: 8,
+; X64-NEXT: isImmutable: true,
+
+; X64: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16,
+; X64-NEXT: isImmutable: true,
+
; X64: liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d
-; X64: [[ARG1:%[0-9]+]](s8) = COPY %edi
-; X64-NEXT: %{{[0-9]+}}(s8) = COPY %esi
-; X64-NEXT: %{{[0-9]+}}(s8) = COPY %edx
-; X64-NEXT: %{{[0-9]+}}(s8) = COPY %ecx
-; X64-NEXT: %{{[0-9]+}}(s8) = COPY %r8d
-; X64-NEXT: %{{[0-9]+}}(s8) = COPY %r9d
-; X64-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X64-NEXT: [[ARG7:%[0-9]+]](s8) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK0]], align 0)
-; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X64-NEXT: [[ARG8:%[0-9]+]](s8) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK8]], align 0)
+; X64: [[ARG1_TMP:%[0-9]+]]:_(s32) = COPY %edi
+; X64: [[ARG1:%[0-9]+]]:_(s8) = G_TRUNC [[ARG1_TMP]](s32)
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %esi
+; X64-NEXT: %{{[0-9]+}}:_(s8) = G_TRUNC %{{[0-9]+}}(s32)
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %edx
+; X64-NEXT: %{{[0-9]+}}:_(s8) = G_TRUNC %{{[0-9]+}}(s32)
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %ecx
+; X64-NEXT: %{{[0-9]+}}:_(s8) = G_TRUNC %{{[0-9]+}}(s32)
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %r8d
+; X64-NEXT: %{{[0-9]+}}:_(s8) = G_TRUNC %{{[0-9]+}}(s32)
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %r9d
+; X64-NEXT: %{{[0-9]+}}:_(s8) = G_TRUNC %{{[0-9]+}}(s32)
+; X64-NEXT: [[ARG7_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X64-NEXT: [[ARG7:%[0-9]+]]:_(s8) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK0]], align 0)
+; X64-NEXT: [[ARG8_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X64-NEXT: [[ARG8:%[0-9]+]]:_(s8) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK8]], align 0)
; X32: fixedStack:
-; X32: id: [[STACK28:[0-9]+]], type: default, offset: 28, size: 1, alignment: 4, isImmutable: true,
-; X32: id: [[STACK24:[0-9]+]], type: default, offset: 24, size: 1, alignment: 8, isImmutable: true,
-; X32: id: [[STACK20:[0-9]+]], type: default, offset: 20, size: 1, alignment: 4, isImmutable: true,
-; X32: id: [[STACK16:[0-9]+]], type: default, offset: 16, size: 1, alignment: 16, isImmutable: true,
-; X32: id: [[STACK12:[0-9]+]], type: default, offset: 12, size: 1, alignment: 4, isImmutable: true,
-; X32: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 1, alignment: 8, isImmutable: true,
-; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 1, alignment: 4, isImmutable: true,
-; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16, isImmutable: true,
-; X32: [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X32-NEXT: [[ARG1:%[0-9]+]](s8) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK0]], align 0)
-; X32-NEXT: [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
-; X32-NEXT: [[ARG2:%[0-9]+]](s8) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK4]], align 0)
-; X32-NEXT: [[ARG3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X32-NEXT: [[ARG3:%[0-9]+]](s8) = G_LOAD [[ARG3_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK8]], align 0)
-; X32-NEXT: [[ARG4_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
-; X32-NEXT: [[ARG4:%[0-9]+]](s8) = G_LOAD [[ARG4_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK12]], align 0)
-; X32-NEXT: [[ARG5_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
-; X32-NEXT: [[ARG5:%[0-9]+]](s8) = G_LOAD [[ARG5_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK16]], align 0)
-; X32-NEXT: [[ARG6_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
-; X32-NEXT: [[ARG6:%[0-9]+]](s8) = G_LOAD [[ARG6_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK20]], align 0)
-; X32-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
-; X32-NEXT: [[ARG7:%[0-9]+]](s8) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK24]], align 0)
-; X32-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
-; X32-NEXT: [[ARG8:%[0-9]+]](s8) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK28]], align 0)
-
-; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_8bit
-; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_8bit
-; ALL-NEXT: [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_8bit
+; X32: id: [[STACK28:[0-9]+]], type: default, offset: 28, size: 1, alignment: 4,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK24:[0-9]+]], type: default, offset: 24, size: 1, alignment: 8,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK20:[0-9]+]], type: default, offset: 20, size: 1, alignment: 4,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK16:[0-9]+]], type: default, offset: 16, size: 1, alignment: 16,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK12:[0-9]+]], type: default, offset: 12, size: 1, alignment: 4,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 1, alignment: 8,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 1, alignment: 4,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 1, alignment: 16,
+; X32-NEXT: isImmutable: true,
+
+; X32: [[ARG1_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT: [[ARG1:%[0-9]+]]:_(s8) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT: [[ARG2_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT: [[ARG2:%[0-9]+]]:_(s8) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK4]], align 0)
+; X32-NEXT: [[ARG3_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X32-NEXT: [[ARG3:%[0-9]+]]:_(s8) = G_LOAD [[ARG3_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK8]], align 0)
+; X32-NEXT: [[ARG4_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
+; X32-NEXT: [[ARG4:%[0-9]+]]:_(s8) = G_LOAD [[ARG4_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK12]], align 0)
+; X32-NEXT: [[ARG5_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
+; X32-NEXT: [[ARG5:%[0-9]+]]:_(s8) = G_LOAD [[ARG5_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK16]], align 0)
+; X32-NEXT: [[ARG6_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
+; X32-NEXT: [[ARG6:%[0-9]+]]:_(s8) = G_LOAD [[ARG6_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK20]], align 0)
+; X32-NEXT: [[ARG7_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
+; X32-NEXT: [[ARG7:%[0-9]+]]:_(s8) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK24]], align 0)
+; X32-NEXT: [[ARG8_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
+; X32-NEXT: [[ARG8:%[0-9]+]]:_(s8) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK28]], align 0)
+
+; ALL-NEXT: [[GADDR_A1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_8bit
+; ALL-NEXT: [[GADDR_A7:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_8bit
+; ALL-NEXT: [[GADDR_A8:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_8bit
; ALL-NEXT: G_STORE [[ARG1]](s8), [[GADDR_A1]](p0) :: (store 1 into @a1_8bit)
; ALL-NEXT: G_STORE [[ARG7]](s8), [[GADDR_A7]](p0) :: (store 1 into @a7_8bit)
; ALL-NEXT: G_STORE [[ARG8]](s8), [[GADDR_A8]](p0) :: (store 1 into @a8_8bit)
@@ -77,49 +103,66 @@ define i32 @test_i32_args_8(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4,
; ALL-LABEL: name: test_i32_args_8
; X64: fixedStack:
-; X64: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 4, alignment: 8, isImmutable: true,
-; X64: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16, isImmutable: true,
+; X64: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 4, alignment: 8,
+; X64-NEXT: isImmutable: true,
+; X64: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16,
+; X64-NEXT: isImmutable: true,
; X64: liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d
-; X64: [[ARG1:%[0-9]+]](s32) = COPY %edi
-; X64-NEXT: %{{[0-9]+}}(s32) = COPY %esi
-; X64-NEXT: %{{[0-9]+}}(s32) = COPY %edx
-; X64-NEXT: %{{[0-9]+}}(s32) = COPY %ecx
-; X64-NEXT: %{{[0-9]+}}(s32) = COPY %r8d
-; X64-NEXT: %{{[0-9]+}}(s32) = COPY %r9d
-; X64-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X64-NEXT: [[ARG7:%[0-9]+]](s32) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
-; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X64-NEXT: [[ARG8:%[0-9]+]](s32) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
+; X64: [[ARG1:%[0-9]+]]:_(s32) = COPY %edi
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %esi
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %edx
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %ecx
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %r8d
+; X64-NEXT: %{{[0-9]+}}:_(s32) = COPY %r9d
+; X64-NEXT: [[ARG7_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X64-NEXT: [[ARG7:%[0-9]+]]:_(s32) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+; X64-NEXT: [[ARG8_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X64-NEXT: [[ARG8:%[0-9]+]]:_(s32) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
; X32: fixedStack:
-; X32: id: [[STACK28:[0-9]+]], type: default, offset: 28, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK24:[0-9]+]], type: default, offset: 24, size: 4, alignment: 8, isImmutable: true,
-; X32: id: [[STACK20:[0-9]+]], type: default, offset: 20, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK16:[0-9]+]], type: default, offset: 16, size: 4, alignment: 16, isImmutable: true,
-; X32: id: [[STACK12:[0-9]+]], type: default, offset: 12, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 4, alignment: 8, isImmutable: true,
-; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16, isImmutable: true,
-; X32: [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X32-NEXT: [[ARG1:%[0-9]+]](s32) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
-; X32-NEXT: [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
-; X32-NEXT: [[ARG2:%[0-9]+]](s32) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
-; X32-NEXT: [[ARG3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X32-NEXT: [[ARG3:%[0-9]+]](s32) = G_LOAD [[ARG3_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
-; X32-NEXT: [[ARG4_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
-; X32-NEXT: [[ARG4:%[0-9]+]](s32) = G_LOAD [[ARG4_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK12]], align 0)
-; X32-NEXT: [[ARG5_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
-; X32-NEXT: [[ARG5:%[0-9]+]](s32) = G_LOAD [[ARG5_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK16]], align 0)
-; X32-NEXT: [[ARG6_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
-; X32-NEXT: [[ARG6:%[0-9]+]](s32) = G_LOAD [[ARG6_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK20]], align 0)
-; X32-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
-; X32-NEXT: [[ARG7:%[0-9]+]](s32) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK24]], align 0)
-; X32-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
-; X32-NEXT: [[ARG8:%[0-9]+]](s32) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK28]], align 0)
-
-; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_32bit
-; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_32bit
-; ALL-NEXT: [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_32bit
+; X32: id: [[STACK28:[0-9]+]], type: default, offset: 28, size: 4, alignment: 4,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK24:[0-9]+]], type: default, offset: 24, size: 4, alignment: 8
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK20:[0-9]+]], type: default, offset: 20, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK16:[0-9]+]], type: default, offset: 16, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK12:[0-9]+]], type: default, offset: 12, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 4, alignment: 8
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+
+; X32: [[ARG1_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT: [[ARG1:%[0-9]+]]:_(s32) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT: [[ARG2_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT: [[ARG2:%[0-9]+]]:_(s32) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
+; X32-NEXT: [[ARG3_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X32-NEXT: [[ARG3:%[0-9]+]]:_(s32) = G_LOAD [[ARG3_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
+; X32-NEXT: [[ARG4_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
+; X32-NEXT: [[ARG4:%[0-9]+]]:_(s32) = G_LOAD [[ARG4_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK12]], align 0)
+; X32-NEXT: [[ARG5_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
+; X32-NEXT: [[ARG5:%[0-9]+]]:_(s32) = G_LOAD [[ARG5_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK16]], align 0)
+; X32-NEXT: [[ARG6_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
+; X32-NEXT: [[ARG6:%[0-9]+]]:_(s32) = G_LOAD [[ARG6_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK20]], align 0)
+; X32-NEXT: [[ARG7_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
+; X32-NEXT: [[ARG7:%[0-9]+]]:_(s32) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK24]], align 0)
+; X32-NEXT: [[ARG8_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
+; X32-NEXT: [[ARG8:%[0-9]+]]:_(s32) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK28]], align 0)
+
+; ALL-NEXT: [[GADDR_A1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_32bit
+; ALL-NEXT: [[GADDR_A7:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_32bit
+; ALL-NEXT: [[GADDR_A8:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_32bit
; ALL-NEXT: G_STORE [[ARG1]](s32), [[GADDR_A1]](p0) :: (store 4 into @a1_32bit)
; ALL-NEXT: G_STORE [[ARG7]](s32), [[GADDR_A7]](p0) :: (store 4 into @a7_32bit)
; ALL-NEXT: G_STORE [[ARG8]](s32), [[GADDR_A8]](p0) :: (store 4 into @a8_32bit)
@@ -142,84 +185,102 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4,
; ALL-LABEL: name: test_i64_args_8
; X64: fixedStack:
-; X64: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 8, alignment: 8, isImmutable: true,
-; X64: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 8, alignment: 16, isImmutable: true,
+; X64: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 8, alignment: 8,
+; X64-NEXT: isImmutable: true,
+; X64: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 8, alignment: 16,
+; X64-NEXT: isImmutable: true,
; X64: liveins: %rcx, %rdi, %rdx, %rsi, %r8, %r9
-; X64: [[ARG1:%[0-9]+]](s64) = COPY %rdi
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rsi
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rdx
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rcx
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %r8
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %r9
-; X64-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X64-NEXT: [[ARG7:%[0-9]+]](s64) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
-; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X64-NEXT: [[ARG8:%[0-9]+]](s64) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK8]], align 0)
+; X64: [[ARG1:%[0-9]+]]:_(s64) = COPY %rdi
+; X64-NEXT: %{{[0-9]+}}:_(s64) = COPY %rsi
+; X64-NEXT: %{{[0-9]+}}:_(s64) = COPY %rdx
+; X64-NEXT: %{{[0-9]+}}:_(s64) = COPY %rcx
+; X64-NEXT: %{{[0-9]+}}:_(s64) = COPY %r8
+; X64-NEXT: %{{[0-9]+}}:_(s64) = COPY %r9
+; X64-NEXT: [[ARG7_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X64-NEXT: [[ARG7:%[0-9]+]]:_(s64) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
+; X64-NEXT: [[ARG8_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X64-NEXT: [[ARG8:%[0-9]+]]:_(s64) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK8]], align 0)
; X32: fixedStack:
-; X32: id: [[STACK60:[0-9]+]], type: default, offset: 60, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK56:[0-9]+]], type: default, offset: 56, size: 4, alignment: 8, isImmutable: true,
-; X32: id: [[STACK52:[0-9]+]], type: default, offset: 52, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK48:[0-9]+]], type: default, offset: 48, size: 4, alignment: 16, isImmutable: true,
-; X32: id: [[STACK44:[0-9]+]], type: default, offset: 44, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK40:[0-9]+]], type: default, offset: 40, size: 4, alignment: 8, isImmutable: true,
-; X32: id: [[STACK36:[0-9]+]], type: default, offset: 36, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK32:[0-9]+]], type: default, offset: 32, size: 4, alignment: 16, isImmutable: true,
-; X32: id: [[STACK28:[0-9]+]], type: default, offset: 28, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK24:[0-9]+]], type: default, offset: 24, size: 4, alignment: 8, isImmutable: true,
-; X32: id: [[STACK20:[0-9]+]], type: default, offset: 20, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK16:[0-9]+]], type: default, offset: 16, size: 4, alignment: 16, isImmutable: true,
-; X32: id: [[STACK12:[0-9]+]], type: default, offset: 12, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 4, alignment: 8, isImmutable: true,
-; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16, isImmutable: true,
-
-; X32: [[ARG1L_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X32-NEXT: [[ARG1L:%[0-9]+]](s32) = G_LOAD [[ARG1L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
-; X32-NEXT: [[ARG1H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
-; X32-NEXT: [[ARG1H:%[0-9]+]](s32) = G_LOAD [[ARG1H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK12]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK16]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK20]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK24]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK28]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK32]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK32]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK36]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK36]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK40]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK40]], align 0)
-; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK44]]
-; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK44]], align 0)
-; X32-NEXT: [[ARG7L_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK48]]
-; X32-NEXT: [[ARG7L:%[0-9]+]](s32) = G_LOAD [[ARG7L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK48]], align 0)
-; X32-NEXT: [[ARG7H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK52]]
-; X32-NEXT: [[ARG7H:%[0-9]+]](s32) = G_LOAD [[ARG7H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK52]], align 0)
-; X32-NEXT: [[ARG8L_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK56]]
-; X32-NEXT: [[ARG8L:%[0-9]+]](s32) = G_LOAD [[ARG8L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK56]], align 0)
-; X32-NEXT: [[ARG8H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK60]]
-; X32-NEXT: [[ARG8H:%[0-9]+]](s32) = G_LOAD [[ARG8H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK60]], align 0)
-
-; X32-NEXT: [[ARG1:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG1L]](s32), [[ARG1H]](s32)
+; X32: id: [[STACK60:[0-9]+]], type: default, offset: 60, size: 4, alignment: 4,
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK56:[0-9]+]], type: default, offset: 56, size: 4, alignment: 8,
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK52:[0-9]+]], type: default, offset: 52, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK48:[0-9]+]], type: default, offset: 48, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK44:[0-9]+]], type: default, offset: 44, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK40:[0-9]+]], type: default, offset: 40, size: 4, alignment: 8
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK36:[0-9]+]], type: default, offset: 36, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK32:[0-9]+]], type: default, offset: 32, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK28:[0-9]+]], type: default, offset: 28, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK24:[0-9]+]], type: default, offset: 24, size: 4, alignment: 8
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK20:[0-9]+]], type: default, offset: 20, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK16:[0-9]+]], type: default, offset: 16, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK12:[0-9]+]], type: default, offset: 12, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK8:[0-9]+]], type: default, offset: 8, size: 4, alignment: 8
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 4, alignment: 4
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+
+; X32: [[ARG1L_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT: [[ARG1L:%[0-9]+]]:_(s32) = G_LOAD [[ARG1L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT: [[ARG1H_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT: [[ARG1H:%[0-9]+]]:_(s32) = G_LOAD [[ARG1H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK12]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK16]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK20]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK24]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK28]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK32]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK32]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK36]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK36]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK40]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK40]], align 0)
+; X32-NEXT: %{{[0-9]+}}:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK44]]
+; X32-NEXT: %{{[0-9]+}}:_(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK44]], align 0)
+; X32-NEXT: [[ARG7L_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK48]]
+; X32-NEXT: [[ARG7L:%[0-9]+]]:_(s32) = G_LOAD [[ARG7L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK48]], align 0)
+; X32-NEXT: [[ARG7H_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK52]]
+; X32-NEXT: [[ARG7H:%[0-9]+]]:_(s32) = G_LOAD [[ARG7H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK52]], align 0)
+; X32-NEXT: [[ARG8L_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK56]]
+; X32-NEXT: [[ARG8L:%[0-9]+]]:_(s32) = G_LOAD [[ARG8L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK56]], align 0)
+; X32-NEXT: [[ARG8H_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK60]]
+; X32-NEXT: [[ARG8H:%[0-9]+]]:_(s32) = G_LOAD [[ARG8H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK60]], align 0)
+
+; X32-NEXT: [[ARG1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ARG1L]](s32), [[ARG1H]](s32)
; ... a bunch more that we don't track ...
; X32-NEXT: G_MERGE_VALUES
; X32-NEXT: G_MERGE_VALUES
; X32-NEXT: G_MERGE_VALUES
; X32-NEXT: G_MERGE_VALUES
; X32-NEXT: G_MERGE_VALUES
-; X32-NEXT: [[ARG7:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG7L]](s32), [[ARG7H]](s32)
-; X32-NEXT: [[ARG8:%[0-9]+]](s64) = G_MERGE_VALUES [[ARG8L]](s32), [[ARG8H]](s32)
+; X32-NEXT: [[ARG7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ARG7L]](s32), [[ARG7H]](s32)
+; X32-NEXT: [[ARG8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[ARG8L]](s32), [[ARG8H]](s32)
-; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_64bit
-; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_64bit
-; ALL-NEXT: [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_64bit
+; ALL-NEXT: [[GADDR_A1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a1_64bit
+; ALL-NEXT: [[GADDR_A7:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a7_64bit
+; ALL-NEXT: [[GADDR_A8:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @a8_64bit
; ALL-NEXT: G_STORE [[ARG1]](s64), [[GADDR_A1]](p0) :: (store 8 into @a1_64bit
; ALL-NEXT: G_STORE [[ARG7]](s64), [[GADDR_A7]](p0) :: (store 8 into @a7_64bit
; ALL-NEXT: G_STORE [[ARG8]](s64), [[GADDR_A8]](p0) :: (store 8 into @a8_64bit
@@ -227,7 +288,7 @@ define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4,
; X64-NEXT: %rax = COPY [[ARG1]](s64)
; X64-NEXT: RET 0, implicit %rax
-; X32-NEXT: [[RETL:%[0-9]+]](s32), [[RETH:%[0-9]+]](s32) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](s64)
+; X32-NEXT: [[RETL:%[0-9]+]]:_(s32), [[RETH:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](s64)
; X32-NEXT: %eax = COPY [[RETL:%[0-9]+]](s32)
; X32-NEXT: %edx = COPY [[RETH:%[0-9]+]](s32)
; X32-NEXT: RET 0, implicit %eax, implicit %edx
@@ -243,18 +304,20 @@ define float @test_float_args(float %arg1, float %arg2) {
; ALL-LABEL:name: test_float_args
; X64: liveins: %xmm0, %xmm1
-; X64: [[ARG1:%[0-9]+]](s32) = COPY %xmm0
-; X64-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %xmm1
+; X64: [[ARG1:%[0-9]+]]:_(s32) = COPY %xmm0
+; X64-NEXT: [[ARG2:%[0-9]+]]:_(s32) = COPY %xmm1
; X64-NEXT: %xmm0 = COPY [[ARG2:%[0-9]+]](s32)
; X64-NEXT: RET 0, implicit %xmm0
; X32: fixedStack:
-; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 4, alignment: 4, isImmutable: true,
-; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16, isImmutable: true,
-; X32: [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X32-NEXT: [[ARG1:%[0-9]+]](s32) = G_LOAD [[ARG1_ADDR:%[0-9]+]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
-; X32-NEXT: [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
-; X32-NEXT: [[ARG2:%[0-9]+]](s32) = G_LOAD [[ARG2_ADDR:%[0-9]+]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
+; X32: id: [[STACK4:[0-9]+]], type: default, offset: 4, size: 4, alignment: 4,
+; X32-NEXT: isImmutable: true,
+; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16
+; X32-NEXT: isImmutable: true,
+; X32: [[ARG1_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT: [[ARG1:%[0-9]+]]:_(s32) = G_LOAD [[ARG1_ADDR:%[0-9]+]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT: [[ARG2_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT: [[ARG2:%[0-9]+]]:_(s32) = G_LOAD [[ARG2_ADDR:%[0-9]+]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
; X32-NEXT: %fp0 = COPY [[ARG2:%[0-9]+]](s32)
; X32-NEXT: RET 0, implicit %fp0
@@ -264,18 +327,22 @@ define float @test_float_args(float %arg1, float %arg2) {
define double @test_double_args(double %arg1, double %arg2) {
; ALL-LABEL:name: test_double_args
; X64: liveins: %xmm0, %xmm1
-; X64: [[ARG1:%[0-9]+]](s64) = COPY %xmm0
-; X64-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %xmm1
+; X64: [[ARG1:%[0-9]+]]:_(s64) = COPY %xmm0
+; X64-NEXT: [[ARG2:%[0-9]+]]:_(s64) = COPY %xmm1
; X64-NEXT: %xmm0 = COPY [[ARG2:%[0-9]+]](s64)
; X64-NEXT: RET 0, implicit %xmm0
; X32: fixedStack:
-; X32: id: [[STACK4:[0-9]+]], type: default, offset: 8, size: 8, alignment: 8, isImmutable: true,
-; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 8, alignment: 16, isImmutable: true,
-; X32: [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X32-NEXT: [[ARG1:%[0-9]+]](s64) = G_LOAD [[ARG1_ADDR:%[0-9]+]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
-; X32-NEXT: [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
-; X32-NEXT: [[ARG2:%[0-9]+]](s64) = G_LOAD [[ARG2_ADDR:%[0-9]+]](p0) :: (invariant load 8 from %fixed-stack.[[STACK4]], align 0)
+; X32: id: [[STACK4:[0-9]+]], type: default, offset: 8, size: 8, alignment: 8,
+; X32-NEXT: isImmutable: true,
+
+; X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 8, alignment: 16,
+; X32-NEXT: isImmutable: true,
+
+; X32: [[ARG1_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT: [[ARG1:%[0-9]+]]:_(s64) = G_LOAD [[ARG1_ADDR:%[0-9]+]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT: [[ARG2_ADDR:%[0-9]+]]:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT: [[ARG2:%[0-9]+]]:_(s64) = G_LOAD [[ARG2_ADDR:%[0-9]+]](p0) :: (invariant load 8 from %fixed-stack.[[STACK4]], align 0)
; X32-NEXT: %fp0 = COPY [[ARG2:%[0-9]+]](s64)
; X32-NEXT: RET 0, implicit %fp0
@@ -285,8 +352,8 @@ define double @test_double_args(double %arg1, double %arg2) {
define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) {
; ALL: name: test_v4i32_args
; ALL: liveins: %xmm0, %xmm1
-; ALL: [[ARG1:%[0-9]+]](<4 x s32>) = COPY %xmm0
-; ALL-NEXT: [[ARG2:%[0-9]+]](<4 x s32>) = COPY %xmm1
+; ALL: [[ARG1:%[0-9]+]]:_(<4 x s32>) = COPY %xmm0
+; ALL-NEXT: [[ARG2:%[0-9]+]]:_(<4 x s32>) = COPY %xmm1
; ALL-NEXT: %xmm0 = COPY [[ARG2:%[0-9]+]](<4 x s32>)
; ALL-NEXT: RET 0, implicit %xmm0
ret <4 x i32> %arg2
@@ -295,10 +362,10 @@ define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) {
define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) {
; ALL: name: test_v8i32_args
; ALL: liveins: %xmm0, %xmm1
-; ALL: [[ARG1L:%[0-9]+]](<4 x s32>) = COPY %xmm0
-; ALL-NEXT: [[ARG1H:%[0-9]+]](<4 x s32>) = COPY %xmm1
-; ALL-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = G_MERGE_VALUES [[ARG1L]](<4 x s32>), [[ARG1H]](<4 x s32>)
-; ALL-NEXT: [[RETL:%[0-9]+]](<4 x s32>), [[RETH:%[0-9]+]](<4 x s32>) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](<8 x s32>)
+; ALL: [[ARG1L:%[0-9]+]]:_(<4 x s32>) = COPY %xmm0
+; ALL-NEXT: [[ARG1H:%[0-9]+]]:_(<4 x s32>) = COPY %xmm1
+; ALL-NEXT: [[ARG1:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[ARG1L]](<4 x s32>), [[ARG1H]](<4 x s32>)
+; ALL-NEXT: [[RETL:%[0-9]+]]:_(<4 x s32>), [[RETH:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[ARG1:%[0-9]+]](<8 x s32>)
; ALL-NEXT: %xmm0 = COPY [[RETL:%[0-9]+]](<4 x s32>)
; ALL-NEXT: %xmm1 = COPY [[RETH:%[0-9]+]](<4 x s32>)
; ALL-NEXT: RET 0, implicit %xmm0, implicit %xmm1
@@ -317,16 +384,413 @@ entry:
define i32 * @test_memop_i32(i32 * %p1) {
; ALL-LABEL:name: test_memop_i32
;X64 liveins: %rdi
-;X64: %0(p0) = COPY %rdi
+;X64: %0:_(p0) = COPY %rdi
;X64-NEXT: %rax = COPY %0(p0)
;X64-NEXT: RET 0, implicit %rax
;X32: fixedStack:
-;X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16, isImmutable: true,
-;X32: %1(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-;X32-NEXT: %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+;X32: id: [[STACK0:[0-9]+]], type: default, offset: 0, size: 4, alignment: 16,
+;X32-NEXT: isImmutable: true,
+;X32: %1:_(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+;X32-NEXT: %0:_(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
;X32-NEXT: %eax = COPY %0(p0)
;X32-NEXT: RET 0, implicit %eax
ret i32 * %p1;
-}
\ No newline at end of file
+}
+
+declare void @trivial_callee()
+define void @test_trivial_call() {
+; ALL-LABEL: name: test_trivial_call
+
+; X32: ADJCALLSTACKDOWN32 0, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: CALLpcrel32 @trivial_callee, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: CALL64pcrel32 @trivial_callee, csr_64, implicit %rsp
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ call void @trivial_callee()
+ ret void
+}
+
+declare void @simple_arg_callee(i32 %in0, i32 %in1)
+define void @test_simple_arg(i32 %in0, i32 %in1) {
+; ALL-LABEL: name: test_simple_arg
+
+; X32: fixedStack:
+; X32: - { id: 0, type: default, offset: 4, size: 4, alignment: 4,
+; X32-NEXT: isImmutable: true,
+; X32: - { id: 1, type: default, offset: 0, size: 4, alignment: 16,
+; X32-NEXT: isImmutable: true,
+; X32: body: |
+; X32-NEXT: bb.1 (%ir-block.0):
+; X32-NEXT: %2:_(p0) = G_FRAME_INDEX %fixed-stack.1
+; X32-NEXT: %0:_(s32) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0)
+; X32-NEXT: %3:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %1:_(s32) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
+; X32-NEXT: ADJCALLSTACKDOWN32 8, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %4:_(p0) = COPY %esp
+; X32-NEXT: %5:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %6:_(p0) = G_GEP %4, %5(s32)
+; X32-NEXT: G_STORE %1(s32), %6(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: %7:_(p0) = COPY %esp
+; X32-NEXT: %8:_(s32) = G_CONSTANT i32 4
+; X32-NEXT: %9:_(p0) = G_GEP %7, %8(s32)
+; X32-NEXT: G_STORE %0(s32), %9(p0) :: (store 4 into stack + 4, align 0)
+; X32-NEXT: CALLpcrel32 @simple_arg_callee, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 8, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: %0:_(s32) = COPY %edi
+; X64-NEXT: %1:_(s32) = COPY %esi
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %edi = COPY %1(s32)
+; X64-NEXT: %esi = COPY %0(s32)
+; X64-NEXT: CALL64pcrel32 @simple_arg_callee, csr_64, implicit %rsp, implicit %ssp, implicit %edi, implicit %esi
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ call void @simple_arg_callee(i32 %in1, i32 %in0)
+ ret void
+}
+
+declare void @simple_arg8_callee(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8)
+define void @test_simple_arg8_call(i32 %in0) {
+; ALL-LABEL: name: test_simple_arg8_call
+
+; X32: fixedStack:
+; X32: - { id: 0, type: default, offset: 0, size: 4, alignment: 16,
+; X32-NEXT: isImmutable: true,
+; X32: body: |
+; X32-NEXT: bb.1 (%ir-block.0):
+; X32-NEXT: %1:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %0:_(s32) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
+; X32-NEXT: ADJCALLSTACKDOWN32 32, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %2:_(p0) = COPY %esp
+; X32-NEXT: %3:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %4:_(p0) = G_GEP %2, %3(s32)
+; X32-NEXT: G_STORE %0(s32), %4(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: %5:_(p0) = COPY %esp
+; X32-NEXT: %6:_(s32) = G_CONSTANT i32 4
+; X32-NEXT: %7:_(p0) = G_GEP %5, %6(s32)
+; X32-NEXT: G_STORE %0(s32), %7(p0) :: (store 4 into stack + 4, align 0)
+; X32-NEXT: %8:_(p0) = COPY %esp
+; X32-NEXT: %9:_(s32) = G_CONSTANT i32 8
+; X32-NEXT: %10:_(p0) = G_GEP %8, %9(s32)
+; X32-NEXT: G_STORE %0(s32), %10(p0) :: (store 4 into stack + 8, align 0)
+; X32-NEXT: %11:_(p0) = COPY %esp
+; X32-NEXT: %12:_(s32) = G_CONSTANT i32 12
+; X32-NEXT: %13:_(p0) = G_GEP %11, %12(s32)
+; X32-NEXT: G_STORE %0(s32), %13(p0) :: (store 4 into stack + 12, align 0)
+; X32-NEXT: %14:_(p0) = COPY %esp
+; X32-NEXT: %15:_(s32) = G_CONSTANT i32 16
+; X32-NEXT: %16:_(p0) = G_GEP %14, %15(s32)
+; X32-NEXT: G_STORE %0(s32), %16(p0) :: (store 4 into stack + 16, align 0)
+; X32-NEXT: %17:_(p0) = COPY %esp
+; X32-NEXT: %18:_(s32) = G_CONSTANT i32 20
+; X32-NEXT: %19:_(p0) = G_GEP %17, %18(s32)
+; X32-NEXT: G_STORE %0(s32), %19(p0) :: (store 4 into stack + 20, align 0)
+; X32-NEXT: %20:_(p0) = COPY %esp
+; X32-NEXT: %21:_(s32) = G_CONSTANT i32 24
+; X32-NEXT: %22:_(p0) = G_GEP %20, %21(s32)
+; X32-NEXT: G_STORE %0(s32), %22(p0) :: (store 4 into stack + 24, align 0)
+; X32-NEXT: %23:_(p0) = COPY %esp
+; X32-NEXT: %24:_(s32) = G_CONSTANT i32 28
+; X32-NEXT: %25:_(p0) = G_GEP %23, %24(s32)
+; X32-NEXT: G_STORE %0(s32), %25(p0) :: (store 4 into stack + 28, align 0)
+; X32-NEXT: CALLpcrel32 @simple_arg8_callee, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 32, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: %0:_(s32) = COPY %edi
+; X64-NEXT: ADJCALLSTACKDOWN64 16, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %edi = COPY %0(s32)
+; X64-NEXT: %esi = COPY %0(s32)
+; X64-NEXT: %edx = COPY %0(s32)
+; X64-NEXT: %ecx = COPY %0(s32)
+; X64-NEXT: %r8d = COPY %0(s32)
+; X64-NEXT: %r9d = COPY %0(s32)
+; X64-NEXT: %1:_(p0) = COPY %rsp
+; X64-NEXT: %2:_(s64) = G_CONSTANT i64 0
+; X64-NEXT: %3:_(p0) = G_GEP %1, %2(s64)
+; X64-NEXT: G_STORE %0(s32), %3(p0) :: (store 4 into stack, align 0)
+; X64-NEXT: %4:_(p0) = COPY %rsp
+; X64-NEXT: %5:_(s64) = G_CONSTANT i64 8
+; X64-NEXT: %6:_(p0) = G_GEP %4, %5(s64)
+; X64-NEXT: G_STORE %0(s32), %6(p0) :: (store 4 into stack + 8, align 0)
+; X64-NEXT: CALL64pcrel32 @simple_arg8_callee, csr_64, implicit %rsp, implicit %ssp, implicit %edi, implicit %esi, implicit %edx, implicit %ecx, implicit %r8d, implicit %r9d
+; X64-NEXT: ADJCALLSTACKUP64 16, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ call void @simple_arg8_callee(i32 %in0, i32 %in0, i32 %in0, i32 %in0,i32 %in0, i32 %in0, i32 %in0, i32 %in0)
+ ret void
+}
+
+declare i32 @simple_return_callee(i32 %in0)
+define i32 @test_simple_return_callee() {
+; ALL-LABEL: name: test_simple_return_callee
+
+; X32: %1:_(s32) = G_CONSTANT i32 5
+; X32-NEXT: ADJCALLSTACKDOWN32 4, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %2:_(p0) = COPY %esp
+; X32-NEXT: %3:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %4:_(p0) = G_GEP %2, %3(s32)
+; X32-NEXT: G_STORE %1(s32), %4(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: CALLpcrel32 @simple_return_callee, csr_32, implicit %esp, implicit %ssp, implicit-def %eax
+; X32-NEXT: %0:_(s32) = COPY %eax
+; X32-NEXT: ADJCALLSTACKUP32 4, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %5:_(s32) = G_ADD %0, %0
+; X32-NEXT: %eax = COPY %5(s32)
+; X32-NEXT: RET 0, implicit %eax
+
+; X64: %1:_(s32) = G_CONSTANT i32 5
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %edi = COPY %1(s32)
+; X64-NEXT: CALL64pcrel32 @simple_return_callee, csr_64, implicit %rsp, implicit %ssp, implicit %edi, implicit-def %eax
+; X64-NEXT: %0:_(s32) = COPY %eax
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %2:_(s32) = G_ADD %0, %0
+; X64-NEXT: %eax = COPY %2(s32)
+; X64-NEXT: RET 0, implicit %eax
+
+ %call = call i32 @simple_return_callee(i32 5)
+ %r = add i32 %call, %call
+ ret i32 %r
+}
+
+declare <8 x i32> @split_return_callee(<8 x i32> %in0)
+define <8 x i32> @test_split_return_callee(<8 x i32> %arg1, <8 x i32> %arg2) {
+; ALL-LABEL: name: test_split_return_callee
+
+; X32: fixedStack:
+; X32-NEXT: - { id: 0, type: default, offset: 0, size: 16, alignment: 16,
+; X32-NEXT: isImmutable: true,
+; X32: %2:_(<4 x s32>) = COPY %xmm0
+; X32-NEXT: %3:_(<4 x s32>) = COPY %xmm1
+; X32-NEXT: %4:_(<4 x s32>) = COPY %xmm2
+; X32-NEXT: %6:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %5:_(<4 x s32>) = G_LOAD %6(p0) :: (invariant load 16 from %fixed-stack.0, align 0)
+; X32-NEXT: %0:_(<8 x s32>) = G_MERGE_VALUES %2(<4 x s32>), %3(<4 x s32>)
+; X32-NEXT: %1:_(<8 x s32>) = G_MERGE_VALUES %4(<4 x s32>), %5(<4 x s32>)
+; X32-NEXT: ADJCALLSTACKDOWN32 0, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %8:_(<4 x s32>), %9:_(<4 x s32>) = G_UNMERGE_VALUES %1(<8 x s32>)
+; X32-NEXT: %xmm0 = COPY %8(<4 x s32>)
+; X32-NEXT: %xmm1 = COPY %9(<4 x s32>)
+; X32-NEXT: CALLpcrel32 @split_return_callee, csr_32, implicit %esp, implicit %ssp, implicit %xmm0, implicit %xmm1, implicit-def %xmm0, implicit-def %xmm1
+; X32-NEXT: %10:_(<4 x s32>) = COPY %xmm0
+; X32-NEXT: %11:_(<4 x s32>) = COPY %xmm1
+; X32-NEXT: %7:_(<8 x s32>) = G_MERGE_VALUES %10(<4 x s32>), %11(<4 x s32>)
+; X32-NEXT: ADJCALLSTACKUP32 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %12:_(<8 x s32>) = G_ADD %0, %7
+; X32-NEXT: %13:_(<4 x s32>), %14:_(<4 x s32>) = G_UNMERGE_VALUES %12(<8 x s32>)
+; X32-NEXT: %xmm0 = COPY %13(<4 x s32>)
+; X32-NEXT: %xmm1 = COPY %14(<4 x s32>)
+; X32-NEXT: RET 0, implicit %xmm0, implicit %xmm1
+
+; X64: %2:_(<4 x s32>) = COPY %xmm0
+; X64-NEXT: %3:_(<4 x s32>) = COPY %xmm1
+; X64-NEXT: %4:_(<4 x s32>) = COPY %xmm2
+; X64-NEXT: %5:_(<4 x s32>) = COPY %xmm3
+; X64-NEXT: %0:_(<8 x s32>) = G_MERGE_VALUES %2(<4 x s32>), %3(<4 x s32>)
+; X64-NEXT: %1:_(<8 x s32>) = G_MERGE_VALUES %4(<4 x s32>), %5(<4 x s32>)
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %7:_(<4 x s32>), %8:_(<4 x s32>) = G_UNMERGE_VALUES %1(<8 x s32>)
+; X64-NEXT: %xmm0 = COPY %7(<4 x s32>)
+; X64-NEXT: %xmm1 = COPY %8(<4 x s32>)
+; X64-NEXT: CALL64pcrel32 @split_return_callee, csr_64, implicit %rsp, implicit %ssp, implicit %xmm0, implicit %xmm1, implicit-def %xmm0, implicit-def %xmm1
+; X64-NEXT: %9:_(<4 x s32>) = COPY %xmm0
+; X64-NEXT: %10:_(<4 x s32>) = COPY %xmm1
+; X64-NEXT: %6:_(<8 x s32>) = G_MERGE_VALUES %9(<4 x s32>), %10(<4 x s32>)
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %11:_(<8 x s32>) = G_ADD %0, %6
+; X64-NEXT: %12:_(<4 x s32>), %13:_(<4 x s32>) = G_UNMERGE_VALUES %11(<8 x s32>)
+; X64-NEXT: %xmm0 = COPY %12(<4 x s32>)
+; X64-NEXT: %xmm1 = COPY %13(<4 x s32>)
+; X64-NEXT: RET 0, implicit %xmm0, implicit %xmm1
+
+ %call = call <8 x i32> @split_return_callee(<8 x i32> %arg2)
+ %r = add <8 x i32> %arg1, %call
+ ret <8 x i32> %r
+}
+
+define void @test_indirect_call(void()* %func) {
+; ALL-LABEL: name: test_indirect_call
+
+; X32: registers:
+; X32-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+; X32-NEXT: - { id: 1, class: _, preferred-register: '' }
+; X32: %1:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %0:gr32(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
+; X32-NEXT: ADJCALLSTACKDOWN32 0, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: CALL32r %0(p0), csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: registers:
+; X64-NEXT: - { id: 0, class: gr64, preferred-register: '' }
+; X64: %0:gr64(p0) = COPY %rdi
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: CALL64r %0(p0), csr_64, implicit %rsp
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ call void %func()
+ ret void
+}
+
+
+declare void @take_char(i8)
+define void @test_abi_exts_call(i8* %addr) {
+; ALL-LABEL: name: test_abi_exts_call
+
+; X32: fixedStack:
+; X32-NEXT: - { id: 0, type: default, offset: 0, size: 4, alignment: 16,
+; X32-NEXT: isImmutable: true,
+; X32: %1:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %0:_(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
+; X32-NEXT: %2:_(s8) = G_LOAD %0(p0) :: (load 1 from %ir.addr)
+; X32-NEXT: ADJCALLSTACKDOWN32 4, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %3:_(p0) = COPY %esp
+; X32-NEXT: %4:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %5:_(p0) = G_GEP %3, %4(s32)
+; X32-NEXT: %6:_(s32) = G_ANYEXT %2(s8)
+; X32-NEXT: G_STORE %6(s32), %5(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: CALLpcrel32 @take_char, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 4, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: ADJCALLSTACKDOWN32 4, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %7:_(p0) = COPY %esp
+; X32-NEXT: %8:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %9:_(p0) = G_GEP %7, %8(s32)
+; X32-NEXT: %10:_(s32) = G_SEXT %2(s8)
+; X32-NEXT: G_STORE %10(s32), %9(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: CALLpcrel32 @take_char, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 4, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: ADJCALLSTACKDOWN32 4, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %11:_(p0) = COPY %esp
+; X32-NEXT: %12:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %13:_(p0) = G_GEP %11, %12(s32)
+; X32-NEXT: %14:_(s32) = G_ZEXT %2(s8)
+; X32-NEXT: G_STORE %14(s32), %13(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: CALLpcrel32 @take_char, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 4, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: %0:_(p0) = COPY %rdi
+; X64-NEXT: %1:_(s8) = G_LOAD %0(p0) :: (load 1 from %ir.addr)
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %2:_(s32) = G_ANYEXT %1(s8)
+; X64-NEXT: %edi = COPY %2(s32)
+; X64-NEXT: CALL64pcrel32 @take_char, csr_64, implicit %rsp, implicit %ssp, implicit %edi
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %3:_(s32) = G_SEXT %1(s8)
+; X64-NEXT: %edi = COPY %3(s32)
+; X64-NEXT: CALL64pcrel32 @take_char, csr_64, implicit %rsp, implicit %ssp, implicit %edi
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %4:_(s32) = G_ZEXT %1(s8)
+; X64-NEXT: %edi = COPY %4(s32)
+; X64-NEXT: CALL64pcrel32 @take_char, csr_64, implicit %rsp, implicit %ssp, implicit %edi
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ %val = load i8, i8* %addr
+ call void @take_char(i8 %val)
+ call void @take_char(i8 signext %val)
+ call void @take_char(i8 zeroext %val)
+ ret void
+}
+
+declare void @variadic_callee(i8*, ...)
+define void @test_variadic_call_1(i8** %addr_ptr, i32* %val_ptr) {
+; ALL-LABEL: name: test_variadic_call_1
+
+; X32: fixedStack:
+; X32-NEXT: - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: 0,
+; X32-NEXT: isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
+; X32-NEXT: - { id: 1, type: default, offset: 0, size: 4, alignment: 16, stack-id: 0,
+; X32-NEXT: isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
+; X32: %2:_(p0) = G_FRAME_INDEX %fixed-stack.1
+; X32-NEXT: %0:_(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0)
+; X32-NEXT: %3:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %1:_(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
+; X32-NEXT: %4:_(p0) = G_LOAD %0(p0) :: (load 4 from %ir.addr_ptr)
+; X32-NEXT: %5:_(s32) = G_LOAD %1(p0) :: (load 4 from %ir.val_ptr)
+; X32-NEXT: ADJCALLSTACKDOWN32 8, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %6:_(p0) = COPY %esp
+; X32-NEXT: %7:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %8:_(p0) = G_GEP %6, %7(s32)
+; X32-NEXT: G_STORE %4(p0), %8(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: %9:_(p0) = COPY %esp
+; X32-NEXT: %10:_(s32) = G_CONSTANT i32 4
+; X32-NEXT: %11:_(p0) = G_GEP %9, %10(s32)
+; X32-NEXT: G_STORE %5(s32), %11(p0) :: (store 4 into stack + 4, align 0)
+; X32-NEXT: CALLpcrel32 @variadic_callee, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 8, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: %0:_(p0) = COPY %rdi
+; X64-NEXT: %1:_(p0) = COPY %rsi
+; X64-NEXT: %2:_(p0) = G_LOAD %0(p0) :: (load 8 from %ir.addr_ptr)
+; X64-NEXT: %3:_(s32) = G_LOAD %1(p0) :: (load 4 from %ir.val_ptr)
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %rdi = COPY %2(p0)
+; X64-NEXT: %esi = COPY %3(s32)
+; X64-NEXT: %al = MOV8ri 0
+; X64-NEXT: CALL64pcrel32 @variadic_callee, csr_64, implicit %rsp, implicit %ssp, implicit %rdi, implicit %esi, implicit %al
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ %addr = load i8*, i8** %addr_ptr
+ %val = load i32, i32* %val_ptr
+ call void (i8*, ...) @variadic_callee(i8* %addr, i32 %val)
+ ret void
+}
+
+define void @test_variadic_call_2(i8** %addr_ptr, double* %val_ptr) {
+; ALL-LABEL: name: test_variadic_call_2
+
+; X32: fixedStack:
+; X32-NEXT: - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: 0,
+; X32-NEXT: isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
+; X32-NEXT: - { id: 1, type: default, offset: 0, size: 4, alignment: 16, stack-id: 0,
+; X32-NEXT: isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true }
+; X32: %2:_(p0) = G_FRAME_INDEX %fixed-stack.1
+; X32-NEXT: %0:_(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0)
+; X32-NEXT: %3:_(p0) = G_FRAME_INDEX %fixed-stack.0
+; X32-NEXT: %1:_(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
+; X32-NEXT: %4:_(p0) = G_LOAD %0(p0) :: (load 4 from %ir.addr_ptr)
+; X32-NEXT: %5:_(s64) = G_LOAD %1(p0) :: (load 8 from %ir.val_ptr, align 4)
+; X32-NEXT: ADJCALLSTACKDOWN32 12, 0, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: %6:_(p0) = COPY %esp
+; X32-NEXT: %7:_(s32) = G_CONSTANT i32 0
+; X32-NEXT: %8:_(p0) = G_GEP %6, %7(s32)
+; X32-NEXT: G_STORE %4(p0), %8(p0) :: (store 4 into stack, align 0)
+; X32-NEXT: %9:_(p0) = COPY %esp
+; X32-NEXT: %10:_(s32) = G_CONSTANT i32 4
+; X32-NEXT: %11:_(p0) = G_GEP %9, %10(s32)
+; X32-NEXT: G_STORE %5(s64), %11(p0) :: (store 8 into stack + 4, align 0)
+; X32-NEXT: CALLpcrel32 @variadic_callee, csr_32, implicit %esp
+; X32-NEXT: ADJCALLSTACKUP32 12, 0, implicit-def %esp, implicit-def %eflags, implicit-def %ssp, implicit %esp, implicit %ssp
+; X32-NEXT: RET 0
+
+; X64: %1:_(p0) = COPY %rsi
+; X64-NEXT: %2:_(p0) = G_LOAD %0(p0) :: (load 8 from %ir.addr_ptr)
+; X64-NEXT: %3:_(s64) = G_LOAD %1(p0) :: (load 8 from %ir.val_ptr)
+; X64-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: %rdi = COPY %2(p0)
+; X64-NEXT: %xmm0 = COPY %3(s64)
+; X64-NEXT: %al = MOV8ri 1
+; X64-NEXT: CALL64pcrel32 @variadic_callee, csr_64, implicit %rsp, implicit %ssp, implicit %rdi, implicit %xmm0, implicit %al
+; X64-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def %rsp, implicit-def %eflags, implicit-def %ssp, implicit %rsp, implicit %ssp
+; X64-NEXT: RET 0
+
+ %addr = load i8*, i8** %addr_ptr
+ %val = load double, double* %val_ptr
+ call void (i8*, ...) @variadic_callee(i8* %addr, double %val)
+ ret void
+}
diff --git a/test/CodeGen/X86/GlobalISel/legalize-GV.mir b/test/CodeGen/X86/GlobalISel/legalize-GV.mir
index 7f9971e4c70a..60ca303d0384 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-GV.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-GV.mir
@@ -19,7 +19,7 @@ regBankSelected: false
# ALL-NEXT: - { id: 0, class: _, preferred-register: '' }
registers:
- { id: 0, class: _, preferred-register: '' }
-# ALL: %0(p0) = G_GLOBAL_VALUE @g_int
+# ALL: %0:_(p0) = G_GLOBAL_VALUE @g_int
# ALL-NEXT: %rax = COPY %0(p0)
# ALL-NEXT: RET 0, implicit %rax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir
index feba33ac91be..abd88f41bd34 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v128.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
--- |
@@ -23,7 +24,6 @@
...
---
name: test_add_v16i8
-# ALL-LABEL: name: test_add_v16i8
alignment: 4
legalized: false
regBankSelected: false
@@ -31,23 +31,25 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s8>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<16 x s8>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<16 x s8>) = G_ADD %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_add_v16i8
+ ; ALL: [[DEF:%[0-9]+]]:_(<16 x s8>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<16 x s8>) = IMPLICIT_DEF
+ ; ALL: [[ADD:%[0-9]+]]:_(<16 x s8>) = G_ADD [[DEF]], [[DEF1]]
+ ; ALL: %xmm0 = COPY [[ADD]](<16 x s8>)
+ ; ALL: RET 0
%0(<16 x s8>) = IMPLICIT_DEF
%1(<16 x s8>) = IMPLICIT_DEF
%2(<16 x s8>) = G_ADD %0, %1
+ %xmm0 = COPY %2
RET 0
...
---
name: test_add_v8i16
-# ALL-LABEL: name: test_add_v8i16
alignment: 4
legalized: false
regBankSelected: false
@@ -55,23 +57,25 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s16>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<8 x s16>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<8 x s16>) = G_ADD %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_add_v8i16
+ ; ALL: [[DEF:%[0-9]+]]:_(<8 x s16>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<8 x s16>) = IMPLICIT_DEF
+ ; ALL: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[DEF]], [[DEF1]]
+ ; ALL: %xmm0 = COPY [[ADD]](<8 x s16>)
+ ; ALL: RET 0
%0(<8 x s16>) = IMPLICIT_DEF
%1(<8 x s16>) = IMPLICIT_DEF
%2(<8 x s16>) = G_ADD %0, %1
+ %xmm0 = COPY %2
RET 0
...
---
name: test_add_v4i32
-# ALL-LABEL: name: test_add_v4i32
alignment: 4
legalized: false
regBankSelected: false
@@ -79,23 +83,25 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<4 x s32>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<4 x s32>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<4 x s32>) = G_ADD %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_add_v4i32
+ ; ALL: [[DEF:%[0-9]+]]:_(<4 x s32>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<4 x s32>) = IMPLICIT_DEF
+ ; ALL: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[DEF]], [[DEF1]]
+ ; ALL: %xmm0 = COPY [[ADD]](<4 x s32>)
+ ; ALL: RET 0
%0(<4 x s32>) = IMPLICIT_DEF
%1(<4 x s32>) = IMPLICIT_DEF
%2(<4 x s32>) = G_ADD %0, %1
+ %xmm0 = COPY %2
RET 0
...
---
name: test_add_v2i64
-# ALL-LABEL: name: test_add_v2i64
alignment: 4
legalized: false
regBankSelected: false
@@ -103,17 +109,20 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<2 x s64>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<2 x s64>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<2 x s64>) = G_ADD %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_add_v2i64
+ ; ALL: [[DEF:%[0-9]+]]:_(<2 x s64>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<2 x s64>) = IMPLICIT_DEF
+ ; ALL: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[DEF]], [[DEF1]]
+ ; ALL: %xmm0 = COPY [[ADD]](<2 x s64>)
+ ; ALL: RET 0
%0(<2 x s64>) = IMPLICIT_DEF
%1(<2 x s64>) = IMPLICIT_DEF
%2(<2 x s64>) = G_ADD %0, %1
+ %xmm0 = COPY %2
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
index 9f918c404b12..bea9161428cf 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v256.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOT_AVX2 --check-prefix=SSE2
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOT_AVX2 --check-prefix=AVX1
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
@@ -26,7 +27,6 @@
...
---
name: test_add_v32i8
-# ALL-LABEL: name: test_add_v32i8
alignment: 4
legalized: false
regBankSelected: false
@@ -34,32 +34,37 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# NOT_AVX2: %0(<32 x s8>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %1(<32 x s8>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %3(<16 x s8>), %4(<16 x s8>) = G_UNMERGE_VALUES %0(<32 x s8>)
-# NOT_AVX2-NEXT: %5(<16 x s8>), %6(<16 x s8>) = G_UNMERGE_VALUES %1(<32 x s8>)
-# NOT_AVX2-NEXT: %7(<16 x s8>) = G_ADD %3, %5
-# NOT_AVX2-NEXT: %8(<16 x s8>) = G_ADD %4, %6
-# NOT_AVX2-NEXT: %2(<32 x s8>) = G_MERGE_VALUES %7(<16 x s8>), %8(<16 x s8>)
-# NOT_AVX2-NEXT: RET 0
-#
-# AVX2: %0(<32 x s8>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<32 x s8>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<32 x s8>) = G_ADD %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; CHECK-LABEL: name: test_add_v32i8
+ ; ALL: [[DEF:%[0-9]+]]:_(<32 x s8>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<32 x s8>) = IMPLICIT_DEF
+ ; SSE2: [[UV:%[0-9]+]]:_(<16 x s8>), [[UV1:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[DEF]](<32 x s8>)
+ ; SSE2: [[UV2:%[0-9]+]]:_(<16 x s8>), [[UV3:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[DEF1]](<32 x s8>)
+ ; AVX1: [[UV:%[0-9]+]]:_(<16 x s8>), [[UV1:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[DEF]](<32 x s8>)
+ ; AVX1: [[UV2:%[0-9]+]]:_(<16 x s8>), [[UV3:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[DEF1]](<32 x s8>)
+ ; SSE2: [[ADD:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV]], [[UV2]]
+ ; SSE2: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV3]]
+ ; AVX1: [[ADD:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV]], [[UV2]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV3]]
+ ; SSE2: [[MV:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
+ ; AVX1: [[MV:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
+ ; SSE2: %ymm0 = COPY [[MV]](<32 x s8>)
+ ; AVX1: %ymm0 = COPY [[MV]](<32 x s8>)
+ ; AVX2: [[ADD:%[0-9]+]]:_(<32 x s8>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX2: %ymm0 = COPY [[ADD]](<32 x s8>)
+ ; ALL: RET 0
%0(<32 x s8>) = IMPLICIT_DEF
%1(<32 x s8>) = IMPLICIT_DEF
%2(<32 x s8>) = G_ADD %0, %1
+ %ymm0 = COPY %2
RET 0
...
---
name: test_add_v16i16
-# ALL-LABEL: name: test_add_v16i16
alignment: 4
legalized: false
regBankSelected: false
@@ -67,32 +72,37 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# NOT_AVX2: %0(<16 x s16>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %1(<16 x s16>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %3(<8 x s16>), %4(<8 x s16>) = G_UNMERGE_VALUES %0(<16 x s16>)
-# NOT_AVX2-NEXT: %5(<8 x s16>), %6(<8 x s16>) = G_UNMERGE_VALUES %1(<16 x s16>)
-# NOT_AVX2-NEXT: %7(<8 x s16>) = G_ADD %3, %5
-# NOT_AVX2-NEXT: %8(<8 x s16>) = G_ADD %4, %6
-# NOT_AVX2-NEXT: %2(<16 x s16>) = G_MERGE_VALUES %7(<8 x s16>), %8(<8 x s16>)
-# NOT_AVX2-NEXT: RET 0
-#
-# AVX2: %0(<16 x s16>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<16 x s16>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<16 x s16>) = G_ADD %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_add_v16i16
+ ; ALL: [[DEF:%[0-9]+]]:_(<16 x s16>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<16 x s16>) = IMPLICIT_DEF
+ ; SSE2: [[UV:%[0-9]+]]:_(<8 x s16>), [[UV1:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF]](<16 x s16>)
+ ; SSE2: [[UV2:%[0-9]+]]:_(<8 x s16>), [[UV3:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF1]](<16 x s16>)
+ ; SSE2: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV]], [[UV2]]
+ ; SSE2: [[ADD1:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV1]], [[UV3]]
+ ; SSE2: [[MV:%[0-9]+]]:_(<16 x s16>) = G_MERGE_VALUES [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>)
+ ; AVX1: [[UV:%[0-9]+]]:_(<8 x s16>), [[UV1:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF]](<16 x s16>)
+ ; AVX1: [[UV2:%[0-9]+]]:_(<8 x s16>), [[UV3:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF1]](<16 x s16>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV]], [[UV2]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV1]], [[UV3]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<16 x s16>) = G_MERGE_VALUES [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>)
+ ; SSE2: %ymm0 = COPY [[MV]](<16 x s16>)
+ ; AVX1: %ymm0 = COPY [[MV]](<16 x s16>)
+ ; AVX2: [[ADD:%[0-9]+]]:_(<16 x s16>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX2: %ymm0 = COPY [[ADD]](<16 x s16>)
+ ; ALL: RET 0
%0(<16 x s16>) = IMPLICIT_DEF
%1(<16 x s16>) = IMPLICIT_DEF
%2(<16 x s16>) = G_ADD %0, %1
+ %ymm0 = COPY %2
RET 0
...
---
name: test_add_v8i32
-# ALL-LABEL: name: test_add_v8i32
alignment: 4
legalized: false
regBankSelected: false
@@ -100,32 +110,37 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# NOT_AVX2: %0(<8 x s32>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %1(<8 x s32>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %3(<4 x s32>), %4(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>)
-# NOT_AVX2-NEXT: %5(<4 x s32>), %6(<4 x s32>) = G_UNMERGE_VALUES %1(<8 x s32>)
-# NOT_AVX2-NEXT: %7(<4 x s32>) = G_ADD %3, %5
-# NOT_AVX2-NEXT: %8(<4 x s32>) = G_ADD %4, %6
-# NOT_AVX2-NEXT: %2(<8 x s32>) = G_MERGE_VALUES %7(<4 x s32>), %8(<4 x s32>)
-# NOT_AVX2-NEXT: RET 0
-#
-# AVX2: %0(<8 x s32>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<8 x s32>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<8 x s32>) = G_ADD %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_add_v8i32
+ ; ALL: [[DEF:%[0-9]+]]:_(<8 x s32>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<8 x s32>) = IMPLICIT_DEF
+ ; SSE2: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<8 x s32>)
+ ; SSE2: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF1]](<8 x s32>)
+ ; SSE2: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV]], [[UV2]]
+ ; SSE2: [[ADD1:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV1]], [[UV3]]
+ ; SSE2: [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>)
+ ; SSE2: %ymm0 = COPY [[MV]](<8 x s32>)
+ ; AVX1: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<8 x s32>)
+ ; AVX1: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF1]](<8 x s32>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV]], [[UV2]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV1]], [[UV3]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<8 x s32>) = G_MERGE_VALUES [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>)
+ ; AVX1: %ymm0 = COPY [[MV]](<8 x s32>)
+ ; AVX2: [[ADD:%[0-9]+]]:_(<8 x s32>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX2: %ymm0 = COPY [[ADD]](<8 x s32>)
+ ; ALL: RET 0
%0(<8 x s32>) = IMPLICIT_DEF
%1(<8 x s32>) = IMPLICIT_DEF
%2(<8 x s32>) = G_ADD %0, %1
+ %ymm0 = COPY %2
RET 0
...
---
name: test_add_v4i64
-# ALL-LABEL: name: test_add_v4i64
alignment: 4
legalized: false
regBankSelected: false
@@ -133,26 +148,32 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# NOT_AVX2: %0(<4 x s64>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %1(<4 x s64>) = IMPLICIT_DEF
-# NOT_AVX2-NEXT: %3(<2 x s64>), %4(<2 x s64>) = G_UNMERGE_VALUES %0(<4 x s64>)
-# NOT_AVX2-NEXT: %5(<2 x s64>), %6(<2 x s64>) = G_UNMERGE_VALUES %1(<4 x s64>)
-# NOT_AVX2-NEXT: %7(<2 x s64>) = G_ADD %3, %5
-# NOT_AVX2-NEXT: %8(<2 x s64>) = G_ADD %4, %6
-# NOT_AVX2-NEXT: %2(<4 x s64>) = G_MERGE_VALUES %7(<2 x s64>), %8(<2 x s64>)
-# NOT_AVX2-NEXT: RET 0
-#
-# AVX2: %0(<4 x s64>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<4 x s64>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<4 x s64>) = G_ADD %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_add_v4i64
+ ; ALL: [[DEF:%[0-9]+]]:_(<4 x s64>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<4 x s64>) = IMPLICIT_DEF
+ ; SSE2: [[UV:%[0-9]+]]:_(<2 x s64>), [[UV1:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+ ; SSE2: [[UV2:%[0-9]+]]:_(<2 x s64>), [[UV3:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF1]](<4 x s64>)
+ ; SSE2: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV]], [[UV2]]
+ ; SSE2: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV1]], [[UV3]]
+ ; SSE2: [[MV:%[0-9]+]]:_(<4 x s64>) = G_MERGE_VALUES [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>)
+ ; AVX1: [[UV:%[0-9]+]]:_(<2 x s64>), [[UV1:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+ ; AVX1: [[UV2:%[0-9]+]]:_(<2 x s64>), [[UV3:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF1]](<4 x s64>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV]], [[UV2]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV1]], [[UV3]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<4 x s64>) = G_MERGE_VALUES [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>)
+ ; SSE2: %ymm0 = COPY [[MV]](<4 x s64>)
+ ; AVX1: %ymm0 = COPY [[MV]](<4 x s64>)
+ ; AVX2: [[ADD:%[0-9]+]]:_(<4 x s64>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX2: %ymm0 = COPY [[ADD]](<4 x s64>)
+ ; ALL: RET 0
%0(<4 x s64>) = IMPLICIT_DEF
%1(<4 x s64>) = IMPLICIT_DEF
%2(<4 x s64>) = G_ADD %0, %1
+ %ymm0 = COPY %2
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir b/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
index 5b7532ea5d00..81b66d17a979 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add-v512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512bw -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
@@ -30,7 +31,6 @@
...
---
name: test_add_v64i8
-# ALL-LABEL: name: test_add_v64i8
alignment: 4
legalized: false
regBankSelected: false
@@ -38,43 +38,39 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX1: %0(<64 x s8>) = IMPLICIT_DEF
-# AVX1-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
-# AVX1-NEXT: %3(<16 x s8>), %4(<16 x s8>), %5(<16 x s8>), %6(<16 x s8>) = G_UNMERGE_VALUES %0(<64 x s8>)
-# AVX1-NEXT: %7(<16 x s8>), %8(<16 x s8>), %9(<16 x s8>), %10(<16 x s8>) = G_UNMERGE_VALUES %1(<64 x s8>)
-# AVX1-NEXT: %11(<16 x s8>) = G_ADD %3, %7
-# AVX1-NEXT: %12(<16 x s8>) = G_ADD %4, %8
-# AVX1-NEXT: %13(<16 x s8>) = G_ADD %5, %9
-# AVX1-NEXT: %14(<16 x s8>) = G_ADD %6, %10
-# AVX1-NEXT: %2(<64 x s8>) = G_MERGE_VALUES %11(<16 x s8>), %12(<16 x s8>), %13(<16 x s8>), %14(<16 x s8>)
-# AVX1-NEXT: RET 0
-#
-# AVX512F: %0(<64 x s8>) = IMPLICIT_DEF
-# AVX512F-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
-# AVX512F-NEXT: %3(<32 x s8>), %4(<32 x s8>) = G_UNMERGE_VALUES %0(<64 x s8>)
-# AVX512F-NEXT: %5(<32 x s8>), %6(<32 x s8>) = G_UNMERGE_VALUES %1(<64 x s8>)
-# AVX512F-NEXT: %7(<32 x s8>) = G_ADD %3, %5
-# AVX512F-NEXT: %8(<32 x s8>) = G_ADD %4, %6
-# AVX512F-NEXT: %2(<64 x s8>) = G_MERGE_VALUES %7(<32 x s8>), %8(<32 x s8>)
-# AVX512F-NEXT: RET 0
-#
-# AVX512BW: %0(<64 x s8>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %2(<64 x s8>) = G_ADD %0, %1
-# AVX512BW-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v64i8
+ ; ALL: [[DEF:%[0-9]+]]:_(<64 x s8>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<64 x s8>) = IMPLICIT_DEF
+ ; AVX1: [[UV:%[0-9]+]]:_(<16 x s8>), [[UV1:%[0-9]+]]:_(<16 x s8>), [[UV2:%[0-9]+]]:_(<16 x s8>), [[UV3:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[DEF]](<64 x s8>)
+ ; AVX1: [[UV4:%[0-9]+]]:_(<16 x s8>), [[UV5:%[0-9]+]]:_(<16 x s8>), [[UV6:%[0-9]+]]:_(<16 x s8>), [[UV7:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[DEF1]](<64 x s8>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV]], [[UV4]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV5]]
+ ; AVX1: [[ADD2:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV2]], [[UV6]]
+ ; AVX1: [[ADD3:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV3]], [[UV7]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>), [[ADD2]](<16 x s8>), [[ADD3]](<16 x s8>)
+ ; AVX1: %zmm0 = COPY [[MV]](<64 x s8>)
+ ; AVX512F: [[UV:%[0-9]+]]:_(<32 x s8>), [[UV1:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[DEF]](<64 x s8>)
+ ; AVX512F: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[DEF1]](<64 x s8>)
+ ; AVX512F: [[ADD:%[0-9]+]]:_(<32 x s8>) = G_ADD [[UV]], [[UV2]]
+ ; AVX512F: [[ADD1:%[0-9]+]]:_(<32 x s8>) = G_ADD [[UV1]], [[UV3]]
+ ; AVX512F: [[MV:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[ADD]](<32 x s8>), [[ADD1]](<32 x s8>)
+ ; AVX512F: %zmm0 = COPY [[MV]](<64 x s8>)
+ ; AVX512BW: [[ADD:%[0-9]+]]:_(<64 x s8>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX512BW: %zmm0 = COPY [[ADD]](<64 x s8>)
+ ; ALL: RET 0
%0(<64 x s8>) = IMPLICIT_DEF
%1(<64 x s8>) = IMPLICIT_DEF
%2(<64 x s8>) = G_ADD %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_add_v32i16
-# ALL-LABEL: name: test_add_v32i16
alignment: 4
legalized: false
regBankSelected: false
@@ -82,43 +78,39 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX1: %0(<32 x s16>) = IMPLICIT_DEF
-# AVX1-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
-# AVX1-NEXT: %3(<8 x s16>), %4(<8 x s16>), %5(<8 x s16>), %6(<8 x s16>) = G_UNMERGE_VALUES %0(<32 x s16>)
-# AVX1-NEXT: %7(<8 x s16>), %8(<8 x s16>), %9(<8 x s16>), %10(<8 x s16>) = G_UNMERGE_VALUES %1(<32 x s16>)
-# AVX1-NEXT: %11(<8 x s16>) = G_ADD %3, %7
-# AVX1-NEXT: %12(<8 x s16>) = G_ADD %4, %8
-# AVX1-NEXT: %13(<8 x s16>) = G_ADD %5, %9
-# AVX1-NEXT: %14(<8 x s16>) = G_ADD %6, %10
-# AVX1-NEXT: %2(<32 x s16>) = G_MERGE_VALUES %11(<8 x s16>), %12(<8 x s16>), %13(<8 x s16>), %14(<8 x s16>)
-# AVX1-NEXT: RET 0
-#
-# AVX512F: %0(<32 x s16>) = IMPLICIT_DEF
-# AVX512F-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
-# AVX512F-NEXT: %3(<16 x s16>), %4(<16 x s16>) = G_UNMERGE_VALUES %0(<32 x s16>)
-# AVX512F-NEXT: %5(<16 x s16>), %6(<16 x s16>) = G_UNMERGE_VALUES %1(<32 x s16>)
-# AVX512F-NEXT: %7(<16 x s16>) = G_ADD %3, %5
-# AVX512F-NEXT: %8(<16 x s16>) = G_ADD %4, %6
-# AVX512F-NEXT: %2(<32 x s16>) = G_MERGE_VALUES %7(<16 x s16>), %8(<16 x s16>)
-# AVX512F-NEXT: RET 0
-#
-# AVX512BW: %0(<32 x s16>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %2(<32 x s16>) = G_ADD %0, %1
-# AVX512BW-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v32i16
+ ; ALL: [[DEF:%[0-9]+]]:_(<32 x s16>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<32 x s16>) = IMPLICIT_DEF
+ ; AVX1: [[UV:%[0-9]+]]:_(<8 x s16>), [[UV1:%[0-9]+]]:_(<8 x s16>), [[UV2:%[0-9]+]]:_(<8 x s16>), [[UV3:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF]](<32 x s16>)
+ ; AVX1: [[UV4:%[0-9]+]]:_(<8 x s16>), [[UV5:%[0-9]+]]:_(<8 x s16>), [[UV6:%[0-9]+]]:_(<8 x s16>), [[UV7:%[0-9]+]]:_(<8 x s16>) = G_UNMERGE_VALUES [[DEF1]](<32 x s16>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV]], [[UV4]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV1]], [[UV5]]
+ ; AVX1: [[ADD2:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV2]], [[UV6]]
+ ; AVX1: [[ADD3:%[0-9]+]]:_(<8 x s16>) = G_ADD [[UV3]], [[UV7]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<32 x s16>) = G_MERGE_VALUES [[ADD]](<8 x s16>), [[ADD1]](<8 x s16>), [[ADD2]](<8 x s16>), [[ADD3]](<8 x s16>)
+ ; AVX1: %zmm0 = COPY [[MV]](<32 x s16>)
+ ; AVX512F: [[UV:%[0-9]+]]:_(<16 x s16>), [[UV1:%[0-9]+]]:_(<16 x s16>) = G_UNMERGE_VALUES [[DEF]](<32 x s16>)
+ ; AVX512F: [[UV2:%[0-9]+]]:_(<16 x s16>), [[UV3:%[0-9]+]]:_(<16 x s16>) = G_UNMERGE_VALUES [[DEF1]](<32 x s16>)
+ ; AVX512F: [[ADD:%[0-9]+]]:_(<16 x s16>) = G_ADD [[UV]], [[UV2]]
+ ; AVX512F: [[ADD1:%[0-9]+]]:_(<16 x s16>) = G_ADD [[UV1]], [[UV3]]
+ ; AVX512F: [[MV:%[0-9]+]]:_(<32 x s16>) = G_MERGE_VALUES [[ADD]](<16 x s16>), [[ADD1]](<16 x s16>)
+ ; AVX512F: %zmm0 = COPY [[MV]](<32 x s16>)
+ ; AVX512BW: [[ADD:%[0-9]+]]:_(<32 x s16>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX512BW: %zmm0 = COPY [[ADD]](<32 x s16>)
+ ; ALL: RET 0
%0(<32 x s16>) = IMPLICIT_DEF
%1(<32 x s16>) = IMPLICIT_DEF
%2(<32 x s16>) = G_ADD %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_add_v16i32
-# ALL-LABEL: name: test_add_v16i32
alignment: 4
legalized: false
regBankSelected: false
@@ -126,39 +118,35 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX1: %0(<16 x s32>) = IMPLICIT_DEF
-# AVX1-NEXT: %1(<16 x s32>) = IMPLICIT_DEF
-# AVX1-NEXT: %3(<4 x s32>), %4(<4 x s32>), %5(<4 x s32>), %6(<4 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
-# AVX1-NEXT: %7(<4 x s32>), %8(<4 x s32>), %9(<4 x s32>), %10(<4 x s32>) = G_UNMERGE_VALUES %1(<16 x s32>)
-# AVX1-NEXT: %11(<4 x s32>) = G_ADD %3, %7
-# AVX1-NEXT: %12(<4 x s32>) = G_ADD %4, %8
-# AVX1-NEXT: %13(<4 x s32>) = G_ADD %5, %9
-# AVX1-NEXT: %14(<4 x s32>) = G_ADD %6, %10
-# AVX1-NEXT: %2(<16 x s32>) = G_MERGE_VALUES %11(<4 x s32>), %12(<4 x s32>), %13(<4 x s32>), %14(<4 x s32>)
-# AVX1-NEXT: RET 0
-#
-# AVX512F: %0(<16 x s32>) = IMPLICIT_DEF
-# AVX512F-NEXT: %1(<16 x s32>) = IMPLICIT_DEF
-# AVX512F-NEXT: %2(<16 x s32>) = G_ADD %0, %1
-# AVX512F-NEXT: RET 0
-#
-# AVX512BW: %0(<16 x s32>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %1(<16 x s32>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %2(<16 x s32>) = G_ADD %0, %1
-# AVX512BW-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v16i32
+ ; ALL: [[DEF:%[0-9]+]]:_(<16 x s32>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<16 x s32>) = IMPLICIT_DEF
+ ; AVX1: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>), [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+ ; AVX1: [[UV4:%[0-9]+]]:_(<4 x s32>), [[UV5:%[0-9]+]]:_(<4 x s32>), [[UV6:%[0-9]+]]:_(<4 x s32>), [[UV7:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF1]](<16 x s32>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV]], [[UV4]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV1]], [[UV5]]
+ ; AVX1: [[ADD2:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV2]], [[UV6]]
+ ; AVX1: [[ADD3:%[0-9]+]]:_(<4 x s32>) = G_ADD [[UV3]], [[UV7]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<16 x s32>) = G_MERGE_VALUES [[ADD]](<4 x s32>), [[ADD1]](<4 x s32>), [[ADD2]](<4 x s32>), [[ADD3]](<4 x s32>)
+ ; AVX1: %zmm0 = COPY [[MV]](<16 x s32>)
+ ; AVX512F: [[ADD:%[0-9]+]]:_(<16 x s32>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX512F: %zmm0 = COPY [[ADD]](<16 x s32>)
+ ; AVX512BW: [[ADD:%[0-9]+]]:_(<16 x s32>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX512BW: %zmm0 = COPY [[ADD]](<16 x s32>)
+ ; ALL: RET 0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<16 x s32>) = IMPLICIT_DEF
%2(<16 x s32>) = G_ADD %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_add_v8i64
-# ALL-LABEL: name: test_add_v8i64
alignment: 4
legalized: false
regBankSelected: false
@@ -166,39 +154,35 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX1: %0(<8 x s64>) = IMPLICIT_DEF
-# AVX1-NEXT: %1(<8 x s64>) = IMPLICIT_DEF
-# AVX1-NEXT: %3(<2 x s64>), %4(<2 x s64>), %5(<2 x s64>), %6(<2 x s64>) = G_UNMERGE_VALUES %0(<8 x s64>)
-# AVX1-NEXT: %7(<2 x s64>), %8(<2 x s64>), %9(<2 x s64>), %10(<2 x s64>) = G_UNMERGE_VALUES %1(<8 x s64>)
-# AVX1-NEXT: %11(<2 x s64>) = G_ADD %3, %7
-# AVX1-NEXT: %12(<2 x s64>) = G_ADD %4, %8
-# AVX1-NEXT: %13(<2 x s64>) = G_ADD %5, %9
-# AVX1-NEXT: %14(<2 x s64>) = G_ADD %6, %10
-# AVX1-NEXT: %2(<8 x s64>) = G_MERGE_VALUES %11(<2 x s64>), %12(<2 x s64>), %13(<2 x s64>), %14(<2 x s64>)
-# AVX1-NEXT: RET 0
-#
-# AVX512F: %0(<8 x s64>) = IMPLICIT_DEF
-# AVX512F-NEXT: %1(<8 x s64>) = IMPLICIT_DEF
-# AVX512F-NEXT: %2(<8 x s64>) = G_ADD %0, %1
-# AVX512F-NEXT: RET 0
-#
-# AVX512BW: %0(<8 x s64>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %1(<8 x s64>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %2(<8 x s64>) = G_ADD %0, %1
-# AVX512BW-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v8i64
+ ; ALL: [[DEF:%[0-9]+]]:_(<8 x s64>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<8 x s64>) = IMPLICIT_DEF
+ ; AVX1: [[UV:%[0-9]+]]:_(<2 x s64>), [[UV1:%[0-9]+]]:_(<2 x s64>), [[UV2:%[0-9]+]]:_(<2 x s64>), [[UV3:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF]](<8 x s64>)
+ ; AVX1: [[UV4:%[0-9]+]]:_(<2 x s64>), [[UV5:%[0-9]+]]:_(<2 x s64>), [[UV6:%[0-9]+]]:_(<2 x s64>), [[UV7:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[DEF1]](<8 x s64>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV]], [[UV4]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV1]], [[UV5]]
+ ; AVX1: [[ADD2:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV2]], [[UV6]]
+ ; AVX1: [[ADD3:%[0-9]+]]:_(<2 x s64>) = G_ADD [[UV3]], [[UV7]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<8 x s64>) = G_MERGE_VALUES [[ADD]](<2 x s64>), [[ADD1]](<2 x s64>), [[ADD2]](<2 x s64>), [[ADD3]](<2 x s64>)
+ ; AVX1: %zmm0 = COPY [[MV]](<8 x s64>)
+ ; AVX512F: [[ADD:%[0-9]+]]:_(<8 x s64>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX512F: %zmm0 = COPY [[ADD]](<8 x s64>)
+ ; AVX512BW: [[ADD:%[0-9]+]]:_(<8 x s64>) = G_ADD [[DEF]], [[DEF1]]
+ ; AVX512BW: %zmm0 = COPY [[ADD]](<8 x s64>)
+ ; ALL: RET 0
%0(<8 x s64>) = IMPLICIT_DEF
%1(<8 x s64>) = IMPLICIT_DEF
%2(<8 x s64>) = G_ADD %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_add_v64i8_2
-# ALL-LABEL: name: test_add_v64i8_2
alignment: 4
legalized: false
regBankSelected: false
@@ -212,49 +196,40 @@ registers:
- { id: 6, class: _ }
- { id: 7, class: _ }
- { id: 8, class: _ }
-# AVX1: %2(<32 x s8>) = COPY %ymm0
-# AVX1-NEXT: %3(<32 x s8>) = COPY %ymm1
-# AVX1-NEXT: %4(<32 x s8>) = COPY %ymm2
-# AVX1-NEXT: %5(<32 x s8>) = COPY %ymm3
-# AVX1-NEXT: %9(<16 x s8>), %10(<16 x s8>) = G_UNMERGE_VALUES %2(<32 x s8>)
-# AVX1-NEXT: %11(<16 x s8>), %12(<16 x s8>) = G_UNMERGE_VALUES %3(<32 x s8>)
-# AVX1-NEXT: %13(<16 x s8>), %14(<16 x s8>) = G_UNMERGE_VALUES %4(<32 x s8>)
-# AVX1-NEXT: %15(<16 x s8>), %16(<16 x s8>) = G_UNMERGE_VALUES %5(<32 x s8>)
-# AVX1-NEXT: %17(<16 x s8>) = G_ADD %9, %13
-# AVX1-NEXT: %18(<16 x s8>) = G_ADD %10, %14
-# AVX1-NEXT: %19(<16 x s8>) = G_ADD %11, %15
-# AVX1-NEXT: %20(<16 x s8>) = G_ADD %12, %16
-# AVX1-NEXT: %7(<32 x s8>) = G_MERGE_VALUES %17(<16 x s8>), %18(<16 x s8>)
-# AVX1-NEXT: %8(<32 x s8>) = G_MERGE_VALUES %19(<16 x s8>), %20(<16 x s8>)
-# AVX1-NEXT: %ymm0 = COPY %7(<32 x s8>)
-# AVX1-NEXT: %ymm1 = COPY %8(<32 x s8>)
-# AVX1-NEXT: RET 0, implicit %ymm0, implicit %ymm1
#
-# AVX512F: %2(<32 x s8>) = COPY %ymm0
-# AVX512F-NEXT: %3(<32 x s8>) = COPY %ymm1
-# AVX512F-NEXT: %4(<32 x s8>) = COPY %ymm2
-# AVX512F-NEXT: %5(<32 x s8>) = COPY %ymm3
-# AVX512F-NEXT: %13(<32 x s8>) = G_ADD %2, %4
-# AVX512F-NEXT: %14(<32 x s8>) = G_ADD %3, %5
-# AVX512F-NEXT: %ymm0 = COPY %13(<32 x s8>)
-# AVX512F-NEXT: %ymm1 = COPY %14(<32 x s8>)
-# AVX512F-NEXT: RET 0, implicit %ymm0, implicit %ymm1
#
-# AVX512BW: %2(<32 x s8>) = COPY %ymm0
-# AVX512BW-NEXT: %3(<32 x s8>) = COPY %ymm1
-# AVX512BW-NEXT: %4(<32 x s8>) = COPY %ymm2
-# AVX512BW-NEXT: %5(<32 x s8>) = COPY %ymm3
-# AVX512BW-NEXT: %0(<64 x s8>) = G_MERGE_VALUES %2(<32 x s8>), %3(<32 x s8>)
-# AVX512BW-NEXT: %1(<64 x s8>) = G_MERGE_VALUES %4(<32 x s8>), %5(<32 x s8>)
-# AVX512BW-NEXT: %6(<64 x s8>) = G_ADD %0, %1
-# AVX512BW-NEXT: %7(<32 x s8>), %8(<32 x s8>) = G_UNMERGE_VALUES %6(<64 x s8>)
-# AVX512BW-NEXT: %ymm0 = COPY %7(<32 x s8>)
-# AVX512BW-NEXT: %ymm1 = COPY %8(<32 x s8>)
-# AVX512BW-NEXT: RET 0, implicit %ymm0, implicit %ymm1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1, %ymm2, %ymm3
+ ; ALL-LABEL: name: test_add_v64i8_2
+ ; ALL: [[COPY:%[0-9]+]]:_(<32 x s8>) = COPY %ymm0
+ ; ALL: [[COPY1:%[0-9]+]]:_(<32 x s8>) = COPY %ymm1
+ ; ALL: [[COPY2:%[0-9]+]]:_(<32 x s8>) = COPY %ymm2
+ ; ALL: [[COPY3:%[0-9]+]]:_(<32 x s8>) = COPY %ymm3
+ ; AVX1: [[UV:%[0-9]+]]:_(<16 x s8>), [[UV1:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[COPY]](<32 x s8>)
+ ; AVX1: [[UV2:%[0-9]+]]:_(<16 x s8>), [[UV3:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[COPY1]](<32 x s8>)
+ ; AVX1: [[UV4:%[0-9]+]]:_(<16 x s8>), [[UV5:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[COPY2]](<32 x s8>)
+ ; AVX1: [[UV6:%[0-9]+]]:_(<16 x s8>), [[UV7:%[0-9]+]]:_(<16 x s8>) = G_UNMERGE_VALUES [[COPY3]](<32 x s8>)
+ ; AVX1: [[ADD:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV]], [[UV4]]
+ ; AVX1: [[ADD1:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV1]], [[UV5]]
+ ; AVX1: [[ADD2:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV2]], [[UV6]]
+ ; AVX1: [[ADD3:%[0-9]+]]:_(<16 x s8>) = G_ADD [[UV3]], [[UV7]]
+ ; AVX1: [[MV:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD]](<16 x s8>), [[ADD1]](<16 x s8>)
+ ; AVX1: [[MV1:%[0-9]+]]:_(<32 x s8>) = G_MERGE_VALUES [[ADD2]](<16 x s8>), [[ADD3]](<16 x s8>)
+ ; AVX1: %ymm0 = COPY [[MV]](<32 x s8>)
+ ; AVX1: %ymm1 = COPY [[MV1]](<32 x s8>)
+ ; AVX512F: [[ADD:%[0-9]+]]:_(<32 x s8>) = G_ADD [[COPY]], [[COPY2]]
+ ; AVX512F: [[ADD1:%[0-9]+]]:_(<32 x s8>) = G_ADD [[COPY1]], [[COPY3]]
+ ; AVX512F: %ymm0 = COPY [[ADD]](<32 x s8>)
+ ; AVX512F: %ymm1 = COPY [[ADD1]](<32 x s8>)
+ ; AVX512BW: [[MV:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[COPY]](<32 x s8>), [[COPY1]](<32 x s8>)
+ ; AVX512BW: [[MV1:%[0-9]+]]:_(<64 x s8>) = G_MERGE_VALUES [[COPY2]](<32 x s8>), [[COPY3]](<32 x s8>)
+ ; AVX512BW: [[ADD:%[0-9]+]]:_(<64 x s8>) = G_ADD [[MV]], [[MV1]]
+ ; AVX512BW: [[UV:%[0-9]+]]:_(<32 x s8>), [[UV1:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[ADD]](<64 x s8>)
+ ; AVX512BW: %ymm0 = COPY [[UV]](<32 x s8>)
+ ; AVX512BW: %ymm1 = COPY [[UV1]](<32 x s8>)
+ ; ALL: RET 0, implicit %ymm0, implicit %ymm1
%2(<32 x s8>) = COPY %ymm0
%3(<32 x s8>) = COPY %ymm1
%4(<32 x s8>) = COPY %ymm2
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add.mir b/test/CodeGen/X86/GlobalISel/legalize-add.mir
index 6a03388da947..9d50fc445c79 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-add.mir
@@ -1,18 +1,56 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
--- |
- define void @test_add_i32() {
- ret void
- }
- define void @test_add_i64() {
- ret void
- }
+ define void @test_add_i1() { ret void}
+ define void @test_add_i32() { ret void }
+ define void @test_add_i64() { ret void }
...
---
+name: test_add_i1
+# CHECK-LABEL: name: test_add_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+# CHECK: %0(s32) = COPY %edx
+# CHECK-NEXT: %3(s8) = G_TRUNC %0(s32)
+# CHECK-NEXT: %4(s8) = G_TRUNC %0(s32)
+# CHECK-NEXT: %5(s8) = G_ADD %3, %4
+# CHECK: RET 0
+body: |
+ bb.1 (%ir-block.0):
+
+ ; X64-LABEL: name: test_add_i1
+ ; X64: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; X64: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; X64: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; X64: [[ADD:%[0-9]+]]:_(s8) = G_ADD [[TRUNC]], [[TRUNC1]]
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s8)
+ ; X64: %eax = COPY [[ANYEXT]](s32)
+ ; X64: RET 0
+ ; X32-LABEL: name: test_add_i1
+ ; X32: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; X32: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; X32: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; X32: [[ADD:%[0-9]+]]:_(s8) = G_ADD [[TRUNC]], [[TRUNC1]]
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ADD]](s8)
+ ; X32: %eax = COPY [[ANYEXT]](s32)
+ ; X32: RET 0
+ %0(s32) = COPY %edx
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s1) = G_ADD %1, %1
+ %3:_(s32) = G_ANYEXT %2
+ %eax = COPY %3
+ RET 0
+...
+---
name: test_add_i32
-# ALL-LABEL: name: test_add_i32
alignment: 4
legalized: false
regBankSelected: false
@@ -20,21 +58,29 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(s32) = IMPLICIT_DEF
-# ALL-NEXT: %1(s32) = IMPLICIT_DEF
-# ALL-NEXT: %2(s32) = G_ADD %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; X64-LABEL: name: test_add_i32
+ ; X64: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; X64: [[DEF1:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; X64: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[DEF]], [[DEF1]]
+ ; X64: %eax = COPY [[ADD]](s32)
+ ; X64: RET 0
+ ; X32-LABEL: name: test_add_i32
+ ; X32: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; X32: [[DEF1:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; X32: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[DEF]], [[DEF1]]
+ ; X32: %eax = COPY [[ADD]](s32)
+ ; X32: RET 0
%0(s32) = IMPLICIT_DEF
%1(s32) = IMPLICIT_DEF
%2(s32) = G_ADD %0, %1
+ %eax = COPY %2
RET 0
...
---
name: test_add_i64
-# ALL-LABEL: name: test_add_i64
alignment: 4
legalized: false
regBankSelected: false
@@ -42,26 +88,30 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# X64: %0(s64) = IMPLICIT_DEF
-# X64-NEXT: %1(s64) = IMPLICIT_DEF
-# X64-NEXT: %2(s64) = G_ADD %0, %1
-# X64-NEXT: RET 0
-#
-# X32: %0(s64) = IMPLICIT_DEF
-# X32-NEXT: %1(s64) = IMPLICIT_DEF
-# X32-NEXT: %3(s32), %4(s32) = G_UNMERGE_VALUES %0(s64)
-# X32-NEXT: %5(s32), %6(s32) = G_UNMERGE_VALUES %1(s64)
-# X32-NEXT: %12(s8) = G_CONSTANT i8 0
-# X32-NEXT: %7(s1) = G_TRUNC %12(s8)
-# X32-NEXT: %8(s32), %9(s1) = G_UADDE %3, %5, %7
-# X32-NEXT: %10(s32), %11(s1) = G_UADDE %4, %6, %9
-# X32-NEXT: %2(s64) = G_MERGE_VALUES %8(s32), %10(s32)
-# X32-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; X64-LABEL: name: test_add_i64
+ ; X64: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; X64: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; X64: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[DEF]], [[DEF1]]
+ ; X64: %rax = COPY [[ADD]](s64)
+ ; X64: RET 0
+ ; X32-LABEL: name: test_add_i64
+ ; X32: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; X32: [[DEF1:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; X32: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](s64)
+ ; X32: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF1]](s64)
+ ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+ ; X32: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[C]](s8)
+ ; X32: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV]], [[UV2]], [[TRUNC]]
+ ; X32: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV1]], [[UV3]], [[UADDE1]]
+ ; X32: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDE]](s32), [[UADDE2]](s32)
+ ; X32: %rax = COPY [[MV]](s64)
+ ; X32: RET 0
%0(s64) = IMPLICIT_DEF
%1(s64) = IMPLICIT_DEF
%2(s64) = G_ADD %0, %1
+ %rax = COPY %2
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-and-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-and-scalar.mir
index b57db15d4646..44ccdd834e25 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-and-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-and-scalar.mir
@@ -1,6 +1,12 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
+ define i1 @test_and_i1() {
+ %ret = and i1 undef, undef
+ ret i1 %ret
+ }
+
define i8 @test_and_i8() {
%ret = and i8 undef, undef
ret i8 %ret
@@ -23,8 +29,34 @@
...
---
+name: test_and_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+
+ ; CHECK-LABEL: name: test_and_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[TRUNC]], [[TRUNC1]]
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[AND]](s8)
+ ; CHECK: %eax = COPY [[ANYEXT]](s32)
+ ; CHECK: RET 0
+ %0(s32) = COPY %edx
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s1) = G_AND %1, %1
+ %3:_(s32) = G_ANYEXT %2
+ %eax = COPY %3
+ RET 0
+...
+---
name: test_and_i8
-# CHECK-LABEL: name: test_and_i8
alignment: 4
legalized: false
regBankSelected: false
@@ -35,12 +67,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s8) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s8) = G_AND %0, %0
-# CHECK-NEXT: %al = COPY %1(s8)
-# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_and_i8
+ ; CHECK: [[DEF:%[0-9]+]]:_(s8) = IMPLICIT_DEF
+ ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[DEF]], [[DEF]]
+ ; CHECK: %al = COPY [[AND]](s8)
+ ; CHECK: RET 0, implicit %al
%0(s8) = IMPLICIT_DEF
%1(s8) = G_AND %0, %0
%al = COPY %1(s8)
@@ -49,7 +82,6 @@ body: |
...
---
name: test_and_i16
-# CHECK-LABEL: name: test_and_i16
alignment: 4
legalized: false
regBankSelected: false
@@ -60,12 +92,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s16) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s16) = G_AND %0, %0
-# CHECK-NEXT: %ax = COPY %1(s16)
-# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_and_i16
+ ; CHECK: [[DEF:%[0-9]+]]:_(s16) = IMPLICIT_DEF
+ ; CHECK: [[AND:%[0-9]+]]:_(s16) = G_AND [[DEF]], [[DEF]]
+ ; CHECK: %ax = COPY [[AND]](s16)
+ ; CHECK: RET 0, implicit %ax
%0(s16) = IMPLICIT_DEF
%1(s16) = G_AND %0, %0
%ax = COPY %1(s16)
@@ -74,7 +107,6 @@ body: |
...
---
name: test_and_i32
-# CHECK-LABEL: name: test_and_i32
alignment: 4
legalized: false
regBankSelected: false
@@ -85,12 +117,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s32) = G_AND %0, %0
-# CHECK-NEXT: %eax = COPY %1(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_and_i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[DEF]], [[DEF]]
+ ; CHECK: %eax = COPY [[AND]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s32) = IMPLICIT_DEF
%1(s32) = G_AND %0, %0
%eax = COPY %1(s32)
@@ -99,7 +132,6 @@ body: |
...
---
name: test_and_i64
-# CHECK-LABEL: name: test_and_i64
alignment: 4
legalized: false
regBankSelected: false
@@ -110,12 +142,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s64) = G_AND %0, %0
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_and_i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[DEF]], [[DEF]]
+ ; CHECK: %rax = COPY [[AND]](s64)
+ ; CHECK: RET 0, implicit %rax
%0(s64) = IMPLICIT_DEF
%1(s64) = G_AND %0, %0
%rax = COPY %1(s64)
diff --git a/test/CodeGen/X86/GlobalISel/legalize-brcond.mir b/test/CodeGen/X86/GlobalISel/legalize-brcond.mir
new file mode 100644
index 000000000000..946e7385f380
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-brcond.mir
@@ -0,0 +1,58 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %if.then, label %if.else
+
+ if.then: ; preds = %entry
+ ret i32 0
+
+ if.else: ; preds = %entry
+ ret i32 1
+ }
+...
+---
+name: test
+# ALL-LABEL: name: test
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+# ALL: %1:_(s1) = G_TRUNC %0(s32)
+# ALL-NEXT: G_BRCOND %1(s1), %[[TRUE:bb.[0-9]+]]
+# ALL-NEXT: G_BR %[[FALSE:bb.[0-9]+]]
+# ALL: [[TRUE]].{{[a-zA-Z0-9.]+}}:
+# ALL-NEXT: %eax = COPY %2(s32)
+# ALL-NEXT: RET 0, implicit %eax
+# ALL: [[FALSE]].{{[a-zA-Z0-9.]+}}:
+# ALL-NEXT: %eax = COPY %3(s32)
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %2(s32) = G_CONSTANT i32 0
+ %3(s32) = G_CONSTANT i32 1
+ %1(s1) = G_TRUNC %0(s32)
+ G_BRCOND %1(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.if.then:
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+
+ bb.3.if.else:
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-cmp.mir b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir
index 68ccbbba0a73..c3e7b77aa3e7 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-cmp.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
@@ -34,7 +35,6 @@
...
---
name: test_cmp_i8
-# CHECK-LABEL: name: test_cmp_i8
alignment: 4
legalized: false
regBankSelected: false
@@ -43,18 +43,19 @@ registers:
- { id: 1, class: _ }
- { id: 2, class: _ }
- { id: 3, class: _ }
-# CHECK: %0(s8) = COPY %edi
-# CHECK-NEXT: %1(s8) = COPY %esi
-# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s8), %1
-# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
-# CHECK-NEXT: %eax = COPY %3(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ ; CHECK-LABEL: name: test_cmp_i8
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s8) = COPY %sil
+ ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s8), [[COPY1]]
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK: %eax = COPY [[ZEXT]](s32)
+ ; CHECK: RET 0, implicit %eax
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s1) = G_ICMP intpred(ult), %0(s8), %1
%3(s32) = G_ZEXT %2(s1)
%eax = COPY %3(s32)
@@ -63,7 +64,6 @@ body: |
...
---
name: test_cmp_i16
-# CHECK-LABEL: name: test_cmp_i16
alignment: 4
legalized: false
regBankSelected: false
@@ -72,18 +72,19 @@ registers:
- { id: 1, class: _ }
- { id: 2, class: _ }
- { id: 3, class: _ }
-# CHECK: %0(s16) = COPY %edi
-# CHECK-NEXT: %1(s16) = COPY %esi
-# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s16), %1
-# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
-# CHECK-NEXT: %eax = COPY %3(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; CHECK-LABEL: name: test_cmp_i16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s16) = COPY %si
+ ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s16), [[COPY1]]
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK: %eax = COPY [[ZEXT]](s32)
+ ; CHECK: RET 0, implicit %eax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s1) = G_ICMP intpred(ult), %0(s16), %1
%3(s32) = G_ZEXT %2(s1)
%eax = COPY %3(s32)
@@ -92,7 +93,6 @@ body: |
...
---
name: test_cmp_i32
-# CHECK-LABEL: name: test_cmp_i32
alignment: 4
legalized: false
regBankSelected: false
@@ -101,16 +101,17 @@ registers:
- { id: 1, class: _ }
- { id: 2, class: _ }
- { id: 3, class: _ }
-# CHECK: %0(s32) = COPY %edi
-# CHECK-NEXT: %1(s32) = COPY %esi
-# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s32), %1
-# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
-# CHECK-NEXT: %eax = COPY %3(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_cmp_i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %esi
+ ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s32), [[COPY1]]
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK: %eax = COPY [[ZEXT]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(ult), %0(s32), %1
@@ -121,7 +122,6 @@ body: |
...
---
name: test_cmp_i64
-# CHECK-LABEL: name: test_cmp_i64
alignment: 4
legalized: false
regBankSelected: false
@@ -130,16 +130,17 @@ registers:
- { id: 1, class: _ }
- { id: 2, class: _ }
- { id: 3, class: _ }
-# CHECK: %0(s64) = COPY %rdi
-# CHECK-NEXT: %1(s64) = COPY %rsi
-# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s64), %1
-# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
-# CHECK-NEXT: %eax = COPY %3(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; CHECK-LABEL: name: test_cmp_i64
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %rsi
+ ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](s64), [[COPY1]]
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK: %eax = COPY [[ZEXT]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s1) = G_ICMP intpred(ult), %0(s64), %1
@@ -150,7 +151,6 @@ body: |
...
---
name: test_cmp_p0
-# CHECK-LABEL: name: test_cmp_p0
alignment: 4
legalized: false
regBankSelected: false
@@ -159,16 +159,17 @@ registers:
- { id: 1, class: _ }
- { id: 2, class: _ }
- { id: 3, class: _ }
-# CHECK: %0(p0) = COPY %rdi
-# CHECK-NEXT: %1(p0) = COPY %rsi
-# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(p0), %1
-# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1)
-# CHECK-NEXT: %eax = COPY %3(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; CHECK-LABEL: name: test_cmp_p0
+ ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY %rsi
+ ; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY]](p0), [[COPY1]]
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+ ; CHECK: %eax = COPY [[ZEXT]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(p0) = COPY %rdi
%1(p0) = COPY %rsi
%2(s1) = G_ICMP intpred(ult), %0(p0), %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-constant.mir b/test/CodeGen/X86/GlobalISel/legalize-constant.mir
index 612d33a77fc9..1697afb87b0d 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-constant.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-constant.mir
@@ -1,16 +1,15 @@
-# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
---- |
- define void @constInt_check() {
- ret void
- }
+--- |
+ define void @test_constant() { ret void }
+ define void @test_fconstant() { ret void }
...
---
-name: constInt_check
-# ALL-LABEL: name: constInt_check
-registers:
+name: test_constant
+registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
@@ -18,26 +17,73 @@ registers:
- { id: 4, class: _ }
body: |
bb.1 (%ir-block.0):
- ; ALL: %5(s8) = G_CONSTANT i8 -1
- ; ALL: %0(s1) = G_TRUNC %5(s8)
+ ; X32-LABEL: name: test_constant
+ ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -1
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s8)
+ ; X32: %eax = COPY [[ANYEXT]](s32)
+ ; X32: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 8
+ ; X32: %al = COPY [[C1]](s8)
+ ; X32: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16
+ ; X32: %ax = COPY [[C2]](s16)
+ ; X32: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+ ; X32: %eax = COPY [[C3]](s32)
+ ; X32: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+ ; X32: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; X32: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C4]](s32), [[C5]](s32)
+ ; X32: %rax = COPY [[MV]](s64)
+ ; X32: RET 0
+ ; X64-LABEL: name: test_constant
+ ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 -1
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C]](s8)
+ ; X64: %eax = COPY [[ANYEXT]](s32)
+ ; X64: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 8
+ ; X64: %al = COPY [[C1]](s8)
+ ; X64: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16
+ ; X64: %ax = COPY [[C2]](s16)
+ ; X64: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+ ; X64: %eax = COPY [[C3]](s32)
+ ; X64: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+ ; X64: %rax = COPY [[C4]](s64)
+ ; X64: RET 0
%0(s1) = G_CONSTANT i1 1
+ %5:_(s32) = G_ANYEXT %0
+ %eax = COPY %5
- ; ALL: %1(s8) = G_CONSTANT i8 8
- %1(s8) = G_CONSTANT i8 8
-
- ; ALL: %2(s16) = G_CONSTANT i16 16
+ %1(s8) = G_CONSTANT i8 8
+ %al = COPY %1
%2(s16) = G_CONSTANT i16 16
+ %ax = COPY %2
- ; ALL: %3(s32) = G_CONSTANT i32 32
%3(s32) = G_CONSTANT i32 32
+ %eax = COPY %3
+
- ; X64: %4(s64) = G_CONSTANT i64 64
-
- ; X32: %6(s32) = G_CONSTANT i32 64
- ; X32: %7(s32) = G_CONSTANT i32 0
- ; X32: %4(s64) = G_MERGE_VALUES %6(s32), %7(s32)
%4(s64) = G_CONSTANT i64 64
+ %rax = COPY %4
RET 0
...
+---
+name: test_fconstant
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+
+ ; X32-LABEL: name: test_fconstant
+ ; X32: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; X32: %eax = COPY [[C]](s32)
+ ; X32: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00
+ ; X32: %rax = COPY [[C1]](s64)
+ ; X64-LABEL: name: test_fconstant
+ ; X64: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; X64: %eax = COPY [[C]](s32)
+ ; X64: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00
+ ; X64: %rax = COPY [[C1]](s64)
+ %0(s32) = G_FCONSTANT float 1.0
+ %eax = COPY %0
+ %1(s64) = G_FCONSTANT double 2.0
+ %rax = COPY %1
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir
index 6f051f1b6ea5..eb8c1da4b15e 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
@@ -43,10 +44,26 @@
ret i64 %r
}
+ define void @test_anyext_i1(i8 %a) {
+ ret void
+ }
+
+ define void @test_anyext_i8(i8 %val) {
+ ret void
+ }
+
+ define void @test_anyext_i16(i16 %val) {
+ ret void
+ }
+
+ define void @test_anyext_i32(i32 %val) {
+ ret void
+ }
+
+
...
---
name: test_sext_i1
-# CHECK-LABEL: name: test_sext_i1
alignment: 4
legalized: false
regBankSelected: false
@@ -54,16 +71,17 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: %0(s8) = COPY %edi
-# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8)
-# CHECK-NEXT: %2(s64) = G_SEXT %1(s1)
-# CHECK-NEXT: %rax = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; CHECK-LABEL: name: test_sext_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY]](s8)
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s1)
+ ; CHECK: %rax = COPY [[SEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s8) = COPY %dil
%1(s1) = G_TRUNC %0(s8)
%2(s64) = G_SEXT %1(s1)
%rax = COPY %2(s64)
@@ -72,22 +90,22 @@ body: |
...
---
name: test_sext_i8
-# CHECK-LABEL: name: test_sext_i8
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# CHECK: %0(s8) = COPY %edi
-# CHECK-NEXT: %1(s64) = G_SEXT %0(s8)
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; CHECK-LABEL: name: test_sext_i8
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s8)
+ ; CHECK: %rax = COPY [[SEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s8) = COPY %dil
%1(s64) = G_SEXT %0(s8)
%rax = COPY %1(s64)
RET 0, implicit %rax
@@ -95,22 +113,22 @@ body: |
...
---
name: test_sext_i16
-# CHECK-LABEL: name: test_sext_i16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# CHECK: %0(s16) = COPY %edi
-# CHECK-NEXT: %1(s64) = G_SEXT %0(s16)
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ ; CHECK-LABEL: name: test_sext_i16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s16)
+ ; CHECK: %rax = COPY [[SEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s16) = COPY %di
%1(s64) = G_SEXT %0(s16)
%rax = COPY %1(s64)
RET 0, implicit %rax
@@ -118,21 +136,21 @@ body: |
...
---
name: test_sext_i32
-# CHECK-LABEL: name: test_sext_i32
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# CHECK: %0(s32) = COPY %edi
-# CHECK-NEXT: %1(s64) = G_SEXT %0(s32)
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; CHECK-LABEL: name: test_sext_i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY]](s32)
+ ; CHECK: %rax = COPY [[SEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
%0(s32) = COPY %edi
%1(s64) = G_SEXT %0(s32)
%rax = COPY %1(s64)
@@ -141,7 +159,6 @@ body: |
...
---
name: test_zext_i1
-# CHECK-LABEL: name: test_zext_i1
alignment: 4
legalized: false
regBankSelected: false
@@ -149,16 +166,18 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: %0(s8) = COPY %edi
-# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8)
-# CHECK-NEXT: %2(s64) = G_ZEXT %1(s1)
-# CHECK-NEXT: %rax = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; CHECK-LABEL: name: test_zext_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s8)
+ ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]]
+ ; CHECK: %rax = COPY [[AND]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s8) = COPY %dil
%1(s1) = G_TRUNC %0(s8)
%2(s64) = G_ZEXT %1(s1)
%rax = COPY %2(s64)
@@ -167,22 +186,22 @@ body: |
...
---
name: test_zext_i8
-# CHECK-LABEL: name: test_zext_i8
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# CHECK: %0(s8) = COPY %edi
-# CHECK-NEXT: %1(s64) = G_ZEXT %0(s8)
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; CHECK-LABEL: name: test_zext_i8
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s8)
+ ; CHECK: %rax = COPY [[ZEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s8) = COPY %dil
%1(s64) = G_ZEXT %0(s8)
%rax = COPY %1(s64)
RET 0, implicit %rax
@@ -190,22 +209,22 @@ body: |
...
---
name: test_zext_i16
-# CHECK-LABEL: name: test_zext_i16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# CHECK: %0(s16) = COPY %edi
-# CHECK-NEXT: %1(s64) = G_ZEXT %0(s16)
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ ; CHECK-LABEL: name: test_zext_i16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s16)
+ ; CHECK: %rax = COPY [[ZEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s16) = COPY %di
%1(s64) = G_ZEXT %0(s16)
%rax = COPY %1(s64)
RET 0, implicit %rax
@@ -213,24 +232,119 @@ body: |
...
---
name: test_zext_i32
-# CHECK-LABEL: name: test_zext_i32
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# CHECK: %0(s32) = COPY %edi
-# CHECK-NEXT: %1(s64) = G_ZEXT %0(s32)
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; CHECK-LABEL: name: test_zext_i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+ ; CHECK: %rax = COPY [[ZEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
%0(s32) = COPY %edi
%1(s64) = G_ZEXT %0(s32)
%rax = COPY %1(s64)
RET 0, implicit %rax
...
+---
+name: test_anyext_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_anyext_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s8)
+ ; CHECK: %rax = COPY [[ANYEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s8) = COPY %dil
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s64) = G_ANYEXT %1(s1)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+
+...
+---
+name: test_anyext_i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_anyext_i8
+ ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s8)
+ ; CHECK: %rax = COPY [[ANYEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s8) = COPY %dil
+ %1(s64) = G_ANYEXT %0(s8)
+ %rax = COPY %1(s64)
+ RET 0, implicit %rax
+
+...
+---
+name: test_anyext_i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_anyext_i16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s16)
+ ; CHECK: %rax = COPY [[ANYEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s16) = COPY %di
+ %1(s64) = G_ANYEXT %0(s16)
+ %rax = COPY %1(s64)
+ RET 0, implicit %rax
+
+...
+---
+name: test_anyext_i32
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_anyext_i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; CHECK: %rax = COPY [[ANYEXT]](s64)
+ ; CHECK: RET 0, implicit %rax
+ %0(s32) = COPY %edi
+ %1(s64) = G_ANYEXT %0(s32)
+ %rax = COPY %1(s64)
+ RET 0, implicit %rax
+
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
index c86bfd9ee96d..ac5dcc0e3001 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
@@ -64,24 +65,53 @@
ret i32 %r
}
+ define void @test_anyext_i1toi8(i1 %a) {
+ ret void
+ }
+
+ define void @test_anyext_i1toi16(i1 %a) {
+ ret void
+ }
+
+ define void @test_anyext_i1(i8 %a) {
+ ret void
+ }
+
+ define void @test_anyext_i8toi16(i8 %val) {
+ ret void
+ }
+
+ define void @test_anyext_i8(i8 %val) {
+ ret void
+ }
+
+ define void @test_anyext_i16(i16 %val) {
+ ret void
+ }
+
...
---
name: test_zext_i1toi8
-# ALL-LABEL: name: test_zext_i1toi8
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
-# ALL: %0(s1) = COPY %edi
-# ALL-NEXT: %1(s8) = G_ZEXT %0(s1)
-# ALL-NEXT: %al = COPY %1(s8)
-# ALL-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; X32-LABEL: name: test_zext_i1toi8
+ ; X32: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X32: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT [[COPY]](s1)
+ ; X32: %al = COPY [[ZEXT]](s8)
+ ; X32: RET 0, implicit %al
+ ; X64-LABEL: name: test_zext_i1toi8
+ ; X64: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X64: [[ZEXT:%[0-9]+]]:_(s8) = G_ZEXT [[COPY]](s1)
+ ; X64: %al = COPY [[ZEXT]](s8)
+ ; X64: RET 0, implicit %al
%0(s1) = COPY %edi
%1(s8) = G_ZEXT %0(s1)
%al = COPY %1(s8)
@@ -90,21 +120,26 @@ body: |
...
---
name: test_zext_i1toi16
-# ALL-LABEL: name: test_zext_i1toi16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
-# ALL: %0(s1) = COPY %edi
-# ALL-NEXT: %1(s16) = G_ZEXT %0(s1)
-# ALL-NEXT: %ax = COPY %1(s16)
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; X32-LABEL: name: test_zext_i1toi16
+ ; X32: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X32: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[COPY]](s1)
+ ; X32: %ax = COPY [[ZEXT]](s16)
+ ; X32: RET 0, implicit %ax
+ ; X64-LABEL: name: test_zext_i1toi16
+ ; X64: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X64: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[COPY]](s1)
+ ; X64: %ax = COPY [[ZEXT]](s16)
+ ; X64: RET 0, implicit %ax
%0(s1) = COPY %edi
%1(s16) = G_ZEXT %0(s1)
%ax = COPY %1(s16)
@@ -113,7 +148,6 @@ body: |
...
---
name: test_zext_i1
-# ALL-LABEL: name: test_zext_i1
alignment: 4
legalized: false
regBankSelected: false
@@ -121,16 +155,25 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(s8) = COPY %edi
-# ALL-NEXT: %1(s1) = G_TRUNC %0(s8)
-# ALL-NEXT: %2(s32) = G_ZEXT %1(s1)
-# ALL-NEXT: %eax = COPY %2(s32)
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; X32-LABEL: name: test_zext_i1
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s8)
+ ; X32: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C]]
+ ; X32: %eax = COPY [[AND]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_zext_i1
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s8)
+ ; X64: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C]]
+ ; X64: %eax = COPY [[AND]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s8) = COPY %dil
%1(s1) = G_TRUNC %0(s8)
%2(s32) = G_ZEXT %1(s1)
%eax = COPY %2(s32)
@@ -139,22 +182,27 @@ body: |
...
---
name: test_zext_i8toi16
-# ALL-LABEL: name: test_zext_i8toi16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
-# ALL: %0(s8) = COPY %edi
-# ALL-NEXT: %1(s16) = G_ZEXT %0(s8)
-# ALL-NEXT: %ax = COPY %1(s16)
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; X32-LABEL: name: test_zext_i8toi16
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[COPY]](s8)
+ ; X32: %ax = COPY [[ZEXT]](s16)
+ ; X32: RET 0, implicit %ax
+ ; X64-LABEL: name: test_zext_i8toi16
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[ZEXT:%[0-9]+]]:_(s16) = G_ZEXT [[COPY]](s8)
+ ; X64: %ax = COPY [[ZEXT]](s16)
+ ; X64: RET 0, implicit %ax
+ %0(s8) = COPY %dil
%1(s16) = G_ZEXT %0(s8)
%ax = COPY %1(s16)
RET 0, implicit %ax
@@ -162,22 +210,27 @@ body: |
...
---
name: test_zext_i8
-# ALL-LABEL: name: test_zext_i8
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# ALL: %0(s8) = COPY %edi
-# ALL-NEXT: %1(s32) = G_ZEXT %0(s8)
-# ALL-NEXT: %eax = COPY %1(s32)
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; X32-LABEL: name: test_zext_i8
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s8)
+ ; X32: %eax = COPY [[ZEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_zext_i8
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s8)
+ ; X64: %eax = COPY [[ZEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s8) = COPY %dil
%1(s32) = G_ZEXT %0(s8)
%eax = COPY %1(s32)
RET 0, implicit %eax
@@ -185,22 +238,27 @@ body: |
...
---
name: test_zext_i16
-# ALL-LABEL: name: test_zext_i16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# ALL: %0(s16) = COPY %edi
-# ALL-NEXT: %1(s32) = G_ZEXT %0(s16)
-# ALL-NEXT: %eax = COPY %1(s32)
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ ; X32-LABEL: name: test_zext_i16
+ ; X32: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; X32: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16)
+ ; X32: %eax = COPY [[ZEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_zext_i16
+ ; X64: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; X64: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[COPY]](s16)
+ ; X64: %eax = COPY [[ZEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s16) = COPY %di
%1(s32) = G_ZEXT %0(s16)
%eax = COPY %1(s32)
RET 0, implicit %eax
@@ -208,22 +266,25 @@ body: |
...
---
name: test_sext_i1toi8
-# ALL-LABEL: name: test_sext_i1toi8
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
-# ALL: %0(s1) = COPY %edi
-# ALL-NEXT: %1(s8) = G_SEXT %0(s1)
-# ALL-NEXT: %al = COPY %1(s8)
-# ALL-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s1) = COPY %edi
+ ; X32-LABEL: name: test_sext_i1toi8
+ ; X32: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; X32: %al = COPY [[DEF]](s8)
+ ; X32: RET 0, implicit %al
+ ; X64-LABEL: name: test_sext_i1toi8
+ ; X64: [[DEF:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; X64: %al = COPY [[DEF]](s8)
+ ; X64: RET 0, implicit %al
+ %0(s1) = G_IMPLICIT_DEF
%1(s8) = G_SEXT %0(s1)
%al = COPY %1(s8)
RET 0, implicit %al
@@ -231,22 +292,25 @@ body: |
...
---
name: test_sext_i1toi16
-# ALL-LABEL: name: test_sext_i1toi16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
-# ALL: %0(s1) = COPY %edi
-# ALL-NEXT: %1(s16) = G_SEXT %0(s1)
-# ALL-NEXT: %ax = COPY %1(s16)
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s1) = COPY %edi
+ ; X32-LABEL: name: test_sext_i1toi16
+ ; X32: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; X32: %ax = COPY [[DEF]](s16)
+ ; X32: RET 0, implicit %ax
+ ; X64-LABEL: name: test_sext_i1toi16
+ ; X64: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; X64: %ax = COPY [[DEF]](s16)
+ ; X64: RET 0, implicit %ax
+ %0(s1) = G_IMPLICIT_DEF
%1(s16) = G_SEXT %0(s1)
%ax = COPY %1(s16)
RET 0, implicit %ax
@@ -254,7 +318,6 @@ body: |
...
---
name: test_sext_i1
-# ALL-LABEL: name: test_sext_i1
alignment: 4
legalized: false
regBankSelected: false
@@ -262,40 +325,47 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(s8) = COPY %edi
-# ALL-NEXT: %1(s1) = G_TRUNC %0(s8)
-# ALL-NEXT: %2(s32) = G_SEXT %1(s1)
-# ALL-NEXT: %eax = COPY %2(s32)
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
- %1(s1) = G_TRUNC %0(s8)
- %2(s32) = G_SEXT %1(s1)
+ ; X32-LABEL: name: test_sext_i1
+ ; X32: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; X32: %eax = COPY [[DEF]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_sext_i1
+ ; X64: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; X64: %eax = COPY [[DEF]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s1) = G_IMPLICIT_DEF
+ %2(s32) = G_SEXT %0(s1)
%eax = COPY %2(s32)
RET 0, implicit %eax
...
---
name: test_sext_i8toi16
-# ALL-LABEL: name: test_sext_i8toi16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _, preferred-register: '' }
- { id: 1, class: _, preferred-register: '' }
-# ALL: %0(s8) = COPY %edi
-# ALL-NEXT: %1(s16) = G_SEXT %0(s8)
-# ALL-NEXT: %ax = COPY %1(s16)
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; X32-LABEL: name: test_sext_i8toi16
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[COPY]](s8)
+ ; X32: %ax = COPY [[SEXT]](s16)
+ ; X32: RET 0, implicit %ax
+ ; X64-LABEL: name: test_sext_i8toi16
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[SEXT:%[0-9]+]]:_(s16) = G_SEXT [[COPY]](s8)
+ ; X64: %ax = COPY [[SEXT]](s16)
+ ; X64: RET 0, implicit %ax
+ %0(s8) = COPY %dil
%1(s16) = G_SEXT %0(s8)
%ax = COPY %1(s16)
RET 0, implicit %ax
@@ -303,22 +373,27 @@ body: |
...
---
name: test_sext_i8
-# ALL-LABEL: name: test_sext_i8
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# ALL: %0(s8) = COPY %edi
-# ALL-NEXT: %1(s32) = G_SEXT %0(s8)
-# ALL-NEXT: %eax = COPY %1(s32)
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; X32-LABEL: name: test_sext_i8
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s8)
+ ; X32: %eax = COPY [[SEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_sext_i8
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s8)
+ ; X64: %eax = COPY [[SEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s8) = COPY %dil
%1(s32) = G_SEXT %0(s8)
%eax = COPY %1(s32)
RET 0, implicit %eax
@@ -326,24 +401,199 @@ body: |
...
---
name: test_sext_i16
-# ALL-LABEL: name: test_sext_i16
alignment: 4
legalized: false
regBankSelected: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
-# ALL: %0(s16) = COPY %edi
-# ALL-NEXT: %1(s32) = G_SEXT %0(s16)
-# ALL-NEXT: %eax = COPY %1(s32)
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ ; X32-LABEL: name: test_sext_i16
+ ; X32: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; X32: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16)
+ ; X32: %eax = COPY [[SEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_sext_i16
+ ; X64: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; X64: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[COPY]](s16)
+ ; X64: %eax = COPY [[SEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s16) = COPY %di
%1(s32) = G_SEXT %0(s16)
%eax = COPY %1(s32)
RET 0, implicit %eax
...
+---
+name: test_anyext_i1toi8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; X32-LABEL: name: test_anyext_i1toi8
+ ; X32: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s8) = G_ANYEXT [[COPY]](s1)
+ ; X32: %al = COPY [[ANYEXT]](s8)
+ ; X32: RET 0, implicit %al
+ ; X64-LABEL: name: test_anyext_i1toi8
+ ; X64: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s8) = G_ANYEXT [[COPY]](s1)
+ ; X64: %al = COPY [[ANYEXT]](s8)
+ ; X64: RET 0, implicit %al
+ %0(s1) = COPY %edi
+ %1(s8) = G_ANYEXT %0(s1)
+ %al = COPY %1(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_anyext_i1toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; X32-LABEL: name: test_anyext_i1toi16
+ ; X32: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[COPY]](s1)
+ ; X32: %ax = COPY [[ANYEXT]](s16)
+ ; X32: RET 0, implicit %ax
+ ; X64-LABEL: name: test_anyext_i1toi16
+ ; X64: [[COPY:%[0-9]+]]:_(s1) = COPY %edi
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[COPY]](s1)
+ ; X64: %ax = COPY [[ANYEXT]](s16)
+ ; X64: RET 0, implicit %ax
+ %0(s1) = COPY %edi
+ %1(s16) = G_ANYEXT %0(s1)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
+name: test_anyext_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+ - { id: 2, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; X32-LABEL: name: test_anyext_i1
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s8)
+ ; X32: %eax = COPY [[ANYEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_anyext_i1
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s8)
+ ; X64: %eax = COPY [[ANYEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s8) = COPY %dil
+ %1(s1) = G_TRUNC %0(s8)
+ %2(s32) = G_ANYEXT %1(s1)
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_anyext_i8toi16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; X32-LABEL: name: test_anyext_i8toi16
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[COPY]](s8)
+ ; X32: %ax = COPY [[ANYEXT]](s16)
+ ; X32: RET 0, implicit %ax
+ ; X64-LABEL: name: test_anyext_i8toi16
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[COPY]](s8)
+ ; X64: %ax = COPY [[ANYEXT]](s16)
+ ; X64: RET 0, implicit %ax
+ %0(s8) = COPY %dil
+ %1(s16) = G_ANYEXT %0(s8)
+ %ax = COPY %1(s16)
+ RET 0, implicit %ax
+
+...
+---
+name: test_anyext_i8
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; X32-LABEL: name: test_anyext_i8
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s8)
+ ; X32: %eax = COPY [[ANYEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_anyext_i8
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY %dil
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s8)
+ ; X64: %eax = COPY [[ANYEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s8) = COPY %dil
+ %1(s32) = G_ANYEXT %0(s8)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_anyext_i16
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; X32-LABEL: name: test_anyext_i16
+ ; X32: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; X32: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
+ ; X32: %eax = COPY [[ANYEXT]](s32)
+ ; X32: RET 0, implicit %eax
+ ; X64-LABEL: name: test_anyext_i16
+ ; X64: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; X64: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[COPY]](s16)
+ ; X64: %eax = COPY [[ANYEXT]](s32)
+ ; X64: RET 0, implicit %eax
+ %0(s16) = COPY %di
+ %1(s32) = G_ANYEXT %0(s16)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-fadd-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-fadd-scalar.mir
index 353a26ca2c8a..407c42567acc 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-fadd-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-fadd-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
@@ -14,7 +15,6 @@
...
---
name: test_fadd_float
-# CHECK-LABEL: name: test_fadd_float
alignment: 4
legalized: false
regBankSelected: false
@@ -26,15 +26,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = COPY %xmm0
-# CHECK-NEXT: %1(s32) = COPY %xmm1
-# CHECK-NEXT: %2(s32) = G_FADD %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s32)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fadd_float
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %xmm1
+ ; CHECK: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FADD]](s32)
+ ; CHECK: RET 0, implicit %xmm0
%0(s32) = COPY %xmm0
%1(s32) = COPY %xmm1
%2(s32) = G_FADD %0, %1
@@ -44,7 +45,6 @@ body: |
...
---
name: test_fadd_double
-# CHECK-LABEL: name: test_fadd_double
alignment: 4
legalized: false
regBankSelected: false
@@ -56,15 +56,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = COPY %xmm0
-# CHECK-NEXT: %1(s64) = COPY %xmm1
-# CHECK-NEXT: %2(s64) = G_FADD %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fadd_double
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %xmm1
+ ; CHECK: [[FADD:%[0-9]+]]:_(s64) = G_FADD [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FADD]](s64)
+ ; CHECK: RET 0, implicit %xmm0
%0(s64) = COPY %xmm0
%1(s64) = COPY %xmm1
%2(s64) = G_FADD %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-fdiv-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-fdiv-scalar.mir
index 102d95c6390c..128ab9b0ee89 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-fdiv-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-fdiv-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
@@ -14,7 +15,6 @@
...
---
name: test_fdiv_float
-# CHECK-LABEL: name: test_fdiv_float
alignment: 4
legalized: false
regBankSelected: false
@@ -26,15 +26,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = COPY %xmm0
-# CHECK-NEXT: %1(s32) = COPY %xmm1
-# CHECK-NEXT: %2(s32) = G_FDIV %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s32)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fdiv_float
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %xmm1
+ ; CHECK: [[FDIV:%[0-9]+]]:_(s32) = G_FDIV [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FDIV]](s32)
+ ; CHECK: RET 0, implicit %xmm0
%0(s32) = COPY %xmm0
%1(s32) = COPY %xmm1
%2(s32) = G_FDIV %0, %1
@@ -44,7 +45,6 @@ body: |
...
---
name: test_fdiv_double
-# CHECK-LABEL: name: test_fdiv_double
alignment: 4
legalized: false
regBankSelected: false
@@ -56,15 +56,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = COPY %xmm0
-# CHECK-NEXT: %1(s64) = COPY %xmm1
-# CHECK-NEXT: %2(s64) = G_FDIV %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fdiv_double
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %xmm1
+ ; CHECK: [[FDIV:%[0-9]+]]:_(s64) = G_FDIV [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FDIV]](s64)
+ ; CHECK: RET 0, implicit %xmm0
%0(s64) = COPY %xmm0
%1(s64) = COPY %xmm1
%2(s64) = G_FDIV %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-fmul-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-fmul-scalar.mir
index eeacbfcf07b2..73e04d0fcf8a 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-fmul-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-fmul-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
@@ -14,7 +15,6 @@
...
---
name: test_fmul_float
-# CHECK-LABEL: name: test_fmul_float
alignment: 4
legalized: false
regBankSelected: false
@@ -26,15 +26,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = COPY %xmm0
-# CHECK-NEXT: %1(s32) = COPY %xmm1
-# CHECK-NEXT: %2(s32) = G_FMUL %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s32)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fmul_float
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %xmm1
+ ; CHECK: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FMUL]](s32)
+ ; CHECK: RET 0, implicit %xmm0
%0(s32) = COPY %xmm0
%1(s32) = COPY %xmm1
%2(s32) = G_FMUL %0, %1
@@ -44,7 +45,6 @@ body: |
...
---
name: test_fmul_double
-# CHECK-LABEL: name: test_fmul_double
alignment: 4
legalized: false
regBankSelected: false
@@ -56,15 +56,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = COPY %xmm0
-# CHECK-NEXT: %1(s64) = COPY %xmm1
-# CHECK-NEXT: %2(s64) = G_FMUL %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fmul_double
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %xmm1
+ ; CHECK: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FMUL]](s64)
+ ; CHECK: RET 0, implicit %xmm0
%0(s64) = COPY %xmm0
%1(s64) = COPY %xmm1
%2(s64) = G_FMUL %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-fpext-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-fpext-scalar.mir
new file mode 100644
index 000000000000..25d1fbc564ef
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-fpext-scalar.mir
@@ -0,0 +1,34 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL
+--- |
+
+ define double @test(float %a) {
+ entry:
+ %conv = fpext float %a to double
+ ret double %conv
+ }
+
+...
+---
+name: test
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+body: |
+ bb.1.entry:
+ liveins: %xmm0
+
+ ; ALL-LABEL: name: test
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %xmm0
+ ; ALL: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[COPY]](s32)
+ ; ALL: %xmm0 = COPY [[FPEXT]](s64)
+ ; ALL: RET 0, implicit %xmm0
+ %0(s32) = COPY %xmm0
+ %1(s64) = G_FPEXT %0(s32)
+ %xmm0 = COPY %1(s64)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-fsub-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-fsub-scalar.mir
index 3b3ee4aa0afb..253d1fb49a3a 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-fsub-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-fsub-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
@@ -14,7 +15,6 @@
...
---
name: test_fsub_float
-# CHECK-LABEL: name: test_fsub_float
alignment: 4
legalized: false
regBankSelected: false
@@ -26,15 +26,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = COPY %xmm0
-# CHECK-NEXT: %1(s32) = COPY %xmm1
-# CHECK-NEXT: %2(s32) = G_FSUB %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s32)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fsub_float
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %xmm1
+ ; CHECK: [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FSUB]](s32)
+ ; CHECK: RET 0, implicit %xmm0
%0(s32) = COPY %xmm0
%1(s32) = COPY %xmm1
%2(s32) = G_FSUB %0, %1
@@ -44,7 +45,6 @@ body: |
...
---
name: test_fsub_double
-# CHECK-LABEL: name: test_fsub_double
alignment: 4
legalized: false
regBankSelected: false
@@ -56,15 +56,16 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = COPY %xmm0
-# CHECK-NEXT: %1(s64) = COPY %xmm1
-# CHECK-NEXT: %2(s64) = G_FSUB %0, %1
-# CHECK-NEXT: %xmm0 = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_fsub_double
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %xmm1
+ ; CHECK: [[FSUB:%[0-9]+]]:_(s64) = G_FSUB [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[FSUB]](s64)
+ ; CHECK: RET 0, implicit %xmm0
%0(s64) = COPY %xmm0
%1(s64) = COPY %xmm1
%2(s64) = G_FSUB %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-gep.mir b/test/CodeGen/X86/GlobalISel/legalize-gep.mir
index 4fdb9b910ad7..349ac9097cf4 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-gep.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-gep.mir
@@ -1,101 +1,110 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
- define void @test_gep_i8() {
+ define void @test_gep_i8(i8* %addr) {
%arrayidx = getelementptr i32, i32* undef, i8 5
ret void
}
- define void @test_gep_i16() {
+ define void @test_gep_i16(i8* %addr) {
%arrayidx = getelementptr i32, i32* undef, i16 5
ret void
}
- define void @test_gep_i32() {
+ define void @test_gep_i32(i8* %addr) {
%arrayidx = getelementptr i32, i32* undef, i32 5
ret void
}
- define void @test_gep_i64() {
+ define void @test_gep_i64(i8* %addr) {
%arrayidx = getelementptr i32, i32* undef, i64 5
ret void
}
...
---
name: test_gep_i8
-# CHECK-LABEL: name: test_gep_i8
legalized: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: %0(p0) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s8) = G_CONSTANT i8 20
-# CHECK-NEXT: %3(s32) = G_SEXT %1(s8)
-# CHECK-NEXT: %2(p0) = G_GEP %0, %3(s32)
-# CHECK-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_gep_i8
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 20
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[C]](s8)
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[SEXT]](s32)
+ ; CHECK: G_STORE [[GEP]](p0), [[DEF]](p0) :: (store 1 into %ir.addr)
+ ; CHECK: RET 0
%0(p0) = IMPLICIT_DEF
%1(s8) = G_CONSTANT i8 20
%2(p0) = G_GEP %0, %1(s8)
+ G_STORE %2, %0 :: (store 1 into %ir.addr)
RET 0
...
---
name: test_gep_i16
-# CHECK-LABEL: name: test_gep_i16
legalized: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: %0(p0) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s16) = G_CONSTANT i16 20
-# CHECK-NEXT: %3(s32) = G_SEXT %1(s16)
-# CHECK-NEXT: %2(p0) = G_GEP %0, %3(s32)
-# CHECK-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_gep_i16
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 20
+ ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[C]](s16)
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[SEXT]](s32)
+ ; CHECK: G_STORE [[GEP]](p0), [[DEF]](p0) :: (store 1 into %ir.addr)
+ ; CHECK: RET 0
%0(p0) = IMPLICIT_DEF
%1(s16) = G_CONSTANT i16 20
%2(p0) = G_GEP %0, %1(s16)
+ G_STORE %2, %0 :: (store 1 into %ir.addr)
RET 0
...
---
name: test_gep_i32
-# CHECK-LABEL: name: test_gep_i32
legalized: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: %0(p0) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s32) = G_CONSTANT i32 20
-# CHECK-NEXT: %2(p0) = G_GEP %0, %1(s32)
-# CHECK-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_gep_i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s32)
+ ; CHECK: G_STORE [[GEP]](p0), [[DEF]](p0) :: (store 1 into %ir.addr)
+ ; CHECK: RET 0
%0(p0) = IMPLICIT_DEF
%1(s32) = G_CONSTANT i32 20
%2(p0) = G_GEP %0, %1(s32)
+ G_STORE %2, %0 :: (store 1 into %ir.addr)
RET 0
...
---
name: test_gep_i64
-# CHECK-LABEL: name: test_gep_i64
legalized: false
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: %0(p0) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s64) = G_CONSTANT i64 20
-# CHECK-NEXT: %2(p0) = G_GEP %0, %1(s64)
-# CHECK-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_gep_i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 20
+ ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s64)
+ ; CHECK: G_STORE [[GEP]](p0), [[DEF]](p0) :: (store 1 into %ir.addr)
+ ; CHECK: RET 0
%0(p0) = IMPLICIT_DEF
%1(s64) = G_CONSTANT i64 20
%2(p0) = G_GEP %0, %1(s64)
+ G_STORE %2, %0 :: (store 1 into %ir.addr)
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-insert-vec256.mir b/test/CodeGen/X86/GlobalISel/legalize-insert-vec256.mir
index 8989fb69b415..613f2a794b05 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-insert-vec256.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-insert-vec256.mir
@@ -15,9 +15,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s32>) = COPY %ymm0
-# ALL-NEXT: %1(<4 x s32>) = COPY %xmm1
-# ALL-NEXT: %2(<8 x s32>) = G_INSERT %0, %1(<4 x s32>), 0
+# ALL: %0:_(<8 x s32>) = COPY %ymm0
+# ALL-NEXT: %1:_(<4 x s32>) = COPY %xmm1
+# ALL-NEXT: %2:_(<8 x s32>) = G_INSERT %0, %1(<4 x s32>), 0
# ALL-NEXT: %ymm0 = COPY %2(<8 x s32>)
# ALL-NEXT: RET 0, implicit %ymm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/legalize-insert-vec512.mir b/test/CodeGen/X86/GlobalISel/legalize-insert-vec512.mir
index 777531da4d93..d9fb35e44dc3 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-insert-vec512.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-insert-vec512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=legalizer -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -11,7 +12,6 @@
...
---
name: test_insert_128
-# ALL-LABEL: name: test_insert_128
alignment: 4
legalized: false
regBankSelected: false
@@ -19,25 +19,25 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s32>) = COPY %zmm0
-# ALL-NEXT: %1(<4 x s32>) = COPY %xmm1
-# ALL-NEXT: %2(<16 x s32>) = G_INSERT %0, %1(<4 x s32>), 0
-# ALL-NEXT: %ymm0 = COPY %2(<16 x s32>)
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %ymm1
+ ; ALL-LABEL: name: test_insert_128
+ ; ALL: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY %xmm1
+ ; ALL: [[INSERT:%[0-9]+]]:_(<16 x s32>) = G_INSERT [[COPY]], [[COPY1]](<4 x s32>), 0
+ ; ALL: %zmm0 = COPY [[INSERT]](<16 x s32>)
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = COPY %zmm0
%1(<4 x s32>) = COPY %xmm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<4 x s32>), 0
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_256
-# ALL-LABEL: name: test_insert_256
alignment: 4
legalized: false
regBankSelected: false
@@ -45,19 +45,20 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s32>) = COPY %zmm0
-# ALL-NEXT: %1(<8 x s32>) = COPY %ymm1
-# ALL-NEXT: %2(<16 x s32>) = G_INSERT %0, %1(<8 x s32>), 0
-# ALL-NEXT: %ymm0 = COPY %2(<16 x s32>)
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %ymm1
+ ; ALL-LABEL: name: test_insert_256
+ ; ALL: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY %ymm1
+ ; ALL: [[INSERT:%[0-9]+]]:_(<16 x s32>) = G_INSERT [[COPY]], [[COPY1]](<8 x s32>), 0
+ ; ALL: %zmm0 = COPY [[INSERT]](<16 x s32>)
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = COPY %zmm0
%1(<8 x s32>) = COPY %ymm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<8 x s32>), 0
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
index 60d9fc63c14a..ee572a3f7a12 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-memop-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
@@ -12,7 +13,6 @@
...
---
name: test_memop_s8tos32
-# ALL-LABEL: name: test_memop_s8tos32
alignment: 4
legalized: false
regBankSelected: false
@@ -28,28 +28,40 @@ registers:
- { id: 8, class: _, preferred-register: '' }
- { id: 9, class: _, preferred-register: '' }
- { id: 10, class: _, preferred-register: '' }
-# ALL: %0(p0) = IMPLICIT_DEF
-# ALL-NEXT: %11(s8) = G_LOAD %0(p0) :: (load 1)
-# ALL-NEXT: %9(s1) = G_TRUNC %11(s8)
-# ALL-NEXT: %1(s8) = G_LOAD %0(p0) :: (load 1)
-# ALL-NEXT: %2(s16) = G_LOAD %0(p0) :: (load 2)
-# ALL-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 4)
-# ALL-NEXT: %4(p0) = G_LOAD %0(p0) :: (load 8)
-# ALL-NEXT: %10(s1) = IMPLICIT_DEF
-# ALL-NEXT: %12(s8) = G_ZEXT %10(s1)
-# ALL-NEXT: G_STORE %12(s8), %0(p0) :: (store 1)
-# ALL-NEXT: %5(s8) = IMPLICIT_DEF
-# ALL-NEXT: G_STORE %5(s8), %0(p0) :: (store 1)
-# ALL-NEXT: %6(s16) = IMPLICIT_DEF
-# ALL-NEXT: G_STORE %6(s16), %0(p0) :: (store 2)
-# ALL-NEXT: %7(s32) = IMPLICIT_DEF
-# ALL-NEXT: G_STORE %7(s32), %0(p0) :: (store 4)
-# ALL-NEXT: %8(p0) = IMPLICIT_DEF
-# ALL-NEXT: G_STORE %8(p0), %0(p0) :: (store 8)
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; X64-LABEL: name: test_memop_s8tos32
+ ; X64: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; X64: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load 1)
+ ; X64: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load 1)
+ ; X64: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p0) :: (load 2)
+ ; X64: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load 4)
+ ; X64: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[DEF]](p0) :: (load 8)
+ ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; X64: [[COPY:%[0-9]+]]:_(s8) = COPY [[LOAD]](s8)
+ ; X64: [[AND:%[0-9]+]]:_(s8) = G_AND [[COPY]], [[C]]
+ ; X64: G_STORE [[AND]](s8), [[DEF]](p0) :: (store 1)
+ ; X64: G_STORE [[LOAD1]](s8), [[DEF]](p0) :: (store 1)
+ ; X64: G_STORE [[LOAD2]](s16), [[DEF]](p0) :: (store 2)
+ ; X64: G_STORE [[LOAD3]](s32), [[DEF]](p0) :: (store 4)
+ ; X64: G_STORE [[LOAD4]](p0), [[DEF]](p0) :: (store 8)
+ ; X32-LABEL: name: test_memop_s8tos32
+ ; X32: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; X32: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load 1)
+ ; X32: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p0) :: (load 1)
+ ; X32: [[LOAD2:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p0) :: (load 2)
+ ; X32: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load 4)
+ ; X32: [[LOAD4:%[0-9]+]]:_(p0) = G_LOAD [[DEF]](p0) :: (load 8)
+ ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; X32: [[COPY:%[0-9]+]]:_(s8) = COPY [[LOAD]](s8)
+ ; X32: [[AND:%[0-9]+]]:_(s8) = G_AND [[COPY]], [[C]]
+ ; X32: G_STORE [[AND]](s8), [[DEF]](p0) :: (store 1)
+ ; X32: G_STORE [[LOAD1]](s8), [[DEF]](p0) :: (store 1)
+ ; X32: G_STORE [[LOAD2]](s16), [[DEF]](p0) :: (store 2)
+ ; X32: G_STORE [[LOAD3]](s32), [[DEF]](p0) :: (store 4)
+ ; X32: G_STORE [[LOAD4]](p0), [[DEF]](p0) :: (store 8)
%0(p0) = IMPLICIT_DEF
%9(s1) = G_LOAD %0(p0) :: (load 1)
%1(s8) = G_LOAD %0(p0) :: (load 1)
@@ -57,20 +69,14 @@ body: |
%3(s32) = G_LOAD %0(p0) :: (load 4)
%4(p0) = G_LOAD %0(p0) :: (load 8)
- %10(s1) = IMPLICIT_DEF
- G_STORE %10, %0 :: (store 1)
- %5(s8) = IMPLICIT_DEF
- G_STORE %5, %0 :: (store 1)
- %6(s16) = IMPLICIT_DEF
- G_STORE %6, %0 :: (store 2)
- %7(s32) = IMPLICIT_DEF
- G_STORE %7, %0 :: (store 4)
- %8(p0) = IMPLICIT_DEF
- G_STORE %8, %0 :: (store 8)
+ G_STORE %9, %0 :: (store 1)
+ G_STORE %1, %0 :: (store 1)
+ G_STORE %2, %0 :: (store 2)
+ G_STORE %3, %0 :: (store 4)
+ G_STORE %4, %0 :: (store 8)
...
---
name: test_memop_s64
-# ALL-LABEL: name: test_memop_s64
alignment: 4
legalized: false
regBankSelected: false
@@ -79,32 +85,28 @@ registers:
- { id: 1, class: _, preferred-register: '' }
- { id: 2, class: _, preferred-register: '' }
liveins:
-# X64: %0(p0) = IMPLICIT_DEF
-# X64-NEXT: %1(s64) = G_LOAD %0(p0) :: (load 8)
-# X64-NEXT: %2(s64) = IMPLICIT_DEF
-# X64-NEXT: G_STORE %2(s64), %0(p0) :: (store 8)
#
-# X32: %0(p0) = IMPLICIT_DEF
-# X32-NEXT: %3(s32) = G_LOAD %0(p0) :: (load 8)
-# X32-NEXT: %6(s32) = G_CONSTANT i32 4
-# X32-NEXT: %5(p0) = G_GEP %0, %6(s32)
-# X32-NEXT: %4(s32) = G_LOAD %5(p0) :: (load 8)
-# X32-NEXT: %1(s64) = G_MERGE_VALUES %3(s32), %4(s32)
-# X32-NEXT: %2(s64) = IMPLICIT_DEF
-# X32-NEXT: %7(s32), %8(s32) = G_UNMERGE_VALUES %2(s64)
-# X32-NEXT: G_STORE %7(s32), %0(p0) :: (store 8)
-# X32-NEXT: %10(s32) = G_CONSTANT i32 4
-# X32-NEXT: %9(p0) = G_GEP %0, %10(s32)
-# X32-NEXT: G_STORE %8(s32), %9(p0) :: (store 8)
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; X64-LABEL: name: test_memop_s64
+ ; X64: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; X64: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[DEF]](p0) :: (load 8)
+ ; X64: G_STORE [[LOAD]](s64), [[DEF]](p0) :: (store 8)
+ ; X32-LABEL: name: test_memop_s64
+ ; X32: [[DEF:%[0-9]+]]:_(p0) = IMPLICIT_DEF
+ ; X32: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF]](p0) :: (load 8)
+ ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s32)
+ ; X32: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP]](p0) :: (load 8)
+ ; X32: G_STORE [[LOAD]](s32), [[DEF]](p0) :: (store 8)
+ ; X32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; X32: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C1]](s32)
+ ; X32: G_STORE [[LOAD1]](s32), [[GEP1]](p0) :: (store 8)
%0(p0) = IMPLICIT_DEF
%1(s64) = G_LOAD %0(p0) :: (load 8)
- %2(s64) = IMPLICIT_DEF
- G_STORE %2, %0 :: (store 8)
+ G_STORE %1, %0 :: (store 8)
...
-
diff --git a/test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir
index 682d01e66fa0..6f844886e09e 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-mul-scalar.mir
@@ -1,6 +1,9 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
+ define void @test_mul_i1() { ret void}
+
define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
%ret = mul i16 %arg1, %arg2
ret i16 %ret
@@ -18,32 +21,56 @@
...
---
+name: test_mul_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+
+ ; CHECK-LABEL: name: test_mul_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[MUL:%[0-9]+]]:_(s8) = G_MUL [[TRUNC]], [[TRUNC1]]
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s8) = COPY [[MUL]](s8)
+ ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[COPY1]], [[C]]
+ ; CHECK: G_STORE [[AND]](s8), [[DEF]](p0) :: (store 1)
+ ; CHECK: RET 0
+ %0(s32) = COPY %edx
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s1) = G_MUL %1, %1
+ %3:_(p0) = G_IMPLICIT_DEF
+ G_STORE %2, %3 :: (store 1)
+ RET 0
+...
+---
name: test_mul_i16
-# CHECK-LABEL: name: test_mul_i16
alignment: 4
legalized: false
regBankSelected: false
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: _, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: _, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: _, preferred-register: '' }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: body: |
-# CHECK-NEXT: bb.0 (%ir-block.0):
-# CHECK-NEXT: %0(s16) = COPY %edi
-# CHECK-NEXT: %1(s16) = COPY %esi
-# CHECK-NEXT: %2(s16) = G_MUL %0, %1
-# CHECK-NEXT: %ax = COPY %2(s16)
-# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; CHECK-LABEL: name: test_mul_i16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY %di
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s16) = COPY %si
+ ; CHECK: [[MUL:%[0-9]+]]:_(s16) = G_MUL [[COPY]], [[COPY1]]
+ ; CHECK: %ax = COPY [[MUL]](s16)
+ ; CHECK: RET 0, implicit %ax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_MUL %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -51,29 +78,23 @@ body: |
...
---
name: test_mul_i32
-# CHECK-LABEL: name: test_mul_i32
alignment: 4
legalized: false
regBankSelected: false
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: _, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: _, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: _, preferred-register: '' }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: body: |
-# CHECK-NEXT: bb.0 (%ir-block.0):
-# CHECK-NEXT: %0(s32) = COPY %edi
-# CHECK-NEXT: %1(s32) = COPY %esi
-# CHECK-NEXT: %2(s32) = G_MUL %0, %1
-# CHECK-NEXT: %eax = COPY %2(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_mul_i32
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %esi
+ ; CHECK: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[COPY]], [[COPY1]]
+ ; CHECK: %eax = COPY [[MUL]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_MUL %0, %1
@@ -83,29 +104,23 @@ body: |
...
---
name: test_mul_i64
-# CHECK-LABEL: name: test_mul_i64
alignment: 4
legalized: false
regBankSelected: false
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: _, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: _, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: _, preferred-register: '' }
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# CHECK: body: |
-# CHECK-NEXT: bb.0 (%ir-block.0):
-# CHECK-NEXT: %0(s64) = COPY %rdi
-# CHECK-NEXT: %1(s64) = COPY %rsi
-# CHECK-NEXT: %2(s64) = G_MUL %0, %1
-# CHECK-NEXT: %rax = COPY %2(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; CHECK-LABEL: name: test_mul_i64
+ ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY %rsi
+ ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[COPY1]]
+ ; CHECK: %rax = COPY [[MUL]](s64)
+ ; CHECK: RET 0, implicit %rax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s64) = G_MUL %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-mul-v128.mir b/test/CodeGen/X86/GlobalISel/legalize-mul-v128.mir
index effd26e9866d..f14b6eb2ebb5 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-mul-v128.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-mul-v128.mir
@@ -33,9 +33,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s16>) = COPY %xmm0
-# ALL-NEXT: %1(<8 x s16>) = COPY %xmm1
-# ALL-NEXT: %2(<8 x s16>) = G_MUL %0, %1
+# ALL: %0:_(<8 x s16>) = COPY %xmm0
+# ALL-NEXT: %1:_(<8 x s16>) = COPY %xmm1
+# ALL-NEXT: %2:_(<8 x s16>) = G_MUL %0, %1
# ALL-NEXT: %xmm0 = COPY %2(<8 x s16>)
# ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -63,9 +63,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<4 x s32>) = COPY %xmm0
-# ALL-NEXT: %1(<4 x s32>) = COPY %xmm1
-# ALL-NEXT: %2(<4 x s32>) = G_MUL %0, %1
+# ALL: %0:_(<4 x s32>) = COPY %xmm0
+# ALL-NEXT: %1:_(<4 x s32>) = COPY %xmm1
+# ALL-NEXT: %2:_(<4 x s32>) = G_MUL %0, %1
# ALL-NEXT: %xmm0 = COPY %2(<4 x s32>)
# ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -93,9 +93,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<2 x s64>) = COPY %xmm0
-# ALL-NEXT: %1(<2 x s64>) = COPY %xmm1
-# ALL-NEXT: %2(<2 x s64>) = G_MUL %0, %1
+# ALL: %0:_(<2 x s64>) = COPY %xmm0
+# ALL-NEXT: %1:_(<2 x s64>) = COPY %xmm1
+# ALL-NEXT: %2:_(<2 x s64>) = G_MUL %0, %1
# ALL-NEXT: %xmm0 = COPY %2(<2 x s64>)
# ALL-NEXT: RET 0, implicit %xmm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/legalize-mul-v256.mir b/test/CodeGen/X86/GlobalISel/legalize-mul-v256.mir
index 5ae8132156d5..b0921a9b0749 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-mul-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-mul-v256.mir
@@ -33,9 +33,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s16>) = COPY %ymm0
-# ALL-NEXT: %1(<16 x s16>) = COPY %ymm1
-# ALL-NEXT: %2(<16 x s16>) = G_MUL %0, %1
+# ALL: %0:_(<16 x s16>) = COPY %ymm0
+# ALL-NEXT: %1:_(<16 x s16>) = COPY %ymm1
+# ALL-NEXT: %2:_(<16 x s16>) = G_MUL %0, %1
# ALL-NEXT: %ymm0 = COPY %2(<16 x s16>)
# ALL-NEXT: RET 0, implicit %ymm0
body: |
@@ -63,9 +63,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s32>) = COPY %ymm0
-# ALL-NEXT: %1(<8 x s32>) = COPY %ymm1
-# ALL-NEXT: %2(<8 x s32>) = G_MUL %0, %1
+# ALL: %0:_(<8 x s32>) = COPY %ymm0
+# ALL-NEXT: %1:_(<8 x s32>) = COPY %ymm1
+# ALL-NEXT: %2:_(<8 x s32>) = G_MUL %0, %1
# ALL-NEXT: %ymm0 = COPY %2(<8 x s32>)
# ALL-NEXT: RET 0, implicit %ymm0
body: |
@@ -93,9 +93,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<4 x s64>) = COPY %ymm0
-# ALL-NEXT: %1(<4 x s64>) = COPY %ymm1
-# ALL-NEXT: %2(<4 x s64>) = G_MUL %0, %1
+# ALL: %0:_(<4 x s64>) = COPY %ymm0
+# ALL-NEXT: %1:_(<4 x s64>) = COPY %ymm1
+# ALL-NEXT: %2:_(<4 x s64>) = G_MUL %0, %1
# ALL-NEXT: %ymm0 = COPY %2(<4 x s64>)
# ALL-NEXT: RET 0, implicit %ymm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/legalize-mul-v512.mir b/test/CodeGen/X86/GlobalISel/legalize-mul-v512.mir
index 71ea313c4c72..79d65f2fe7dc 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-mul-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-mul-v512.mir
@@ -35,9 +35,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<32 x s16>) = COPY %zmm0
-# ALL-NEXT: %1(<32 x s16>) = COPY %zmm1
-# ALL-NEXT: %2(<32 x s16>) = G_MUL %0, %1
+# ALL: %0:_(<32 x s16>) = COPY %zmm0
+# ALL-NEXT: %1:_(<32 x s16>) = COPY %zmm1
+# ALL-NEXT: %2:_(<32 x s16>) = G_MUL %0, %1
# ALL-NEXT: %zmm0 = COPY %2(<32 x s16>)
# ALL-NEXT: RET 0, implicit %zmm0
body: |
@@ -65,9 +65,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s32>) = COPY %zmm0
-# ALL-NEXT: %1(<16 x s32>) = COPY %zmm1
-# ALL-NEXT: %2(<16 x s32>) = G_MUL %0, %1
+# ALL: %0:_(<16 x s32>) = COPY %zmm0
+# ALL-NEXT: %1:_(<16 x s32>) = COPY %zmm1
+# ALL-NEXT: %2:_(<16 x s32>) = G_MUL %0, %1
# ALL-NEXT: %zmm0 = COPY %2(<16 x s32>)
# ALL-NEXT: RET 0, implicit %zmm0
body: |
@@ -95,9 +95,9 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s64>) = COPY %zmm0
-# ALL-NEXT: %1(<8 x s64>) = COPY %zmm1
-# ALL-NEXT: %2(<8 x s64>) = G_MUL %0, %1
+# ALL: %0:_(<8 x s64>) = COPY %zmm0
+# ALL-NEXT: %1:_(<8 x s64>) = COPY %zmm1
+# ALL-NEXT: %2:_(<8 x s64>) = G_MUL %0, %1
# ALL-NEXT: %zmm0 = COPY %2(<8 x s64>)
# ALL-NEXT: RET 0, implicit %zmm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/legalize-or-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-or-scalar.mir
index a014f56a3588..df6a3fd28275 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-or-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-or-scalar.mir
@@ -1,6 +1,12 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
+ define i1 @test_or_i1() {
+ %ret = or i1 undef, undef
+ ret i1 %ret
+ }
+
define i8 @test_or_i8() {
%ret = or i8 undef, undef
ret i8 %ret
@@ -23,8 +29,37 @@
...
---
+name: test_or_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+
+ ; CHECK-LABEL: name: test_or_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[OR:%[0-9]+]]:_(s8) = G_OR [[TRUNC]], [[TRUNC1]]
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s8) = COPY [[OR]](s8)
+ ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[COPY1]], [[C]]
+ ; CHECK: G_STORE [[AND]](s8), [[DEF]](p0) :: (store 1)
+ ; CHECK: RET 0
+ %0(s32) = COPY %edx
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s1) = G_OR %1, %1
+ %3:_(p0) = G_IMPLICIT_DEF
+ G_STORE %2, %3 :: (store 1)
+ RET 0
+...
+---
name: test_or_i8
-# CHECK-LABEL: name: test_or_i8
alignment: 4
legalized: false
regBankSelected: false
@@ -35,12 +70,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s8) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s8) = G_OR %0, %0
-# CHECK-NEXT: %al = COPY %1(s8)
-# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_or_i8
+ ; CHECK: [[DEF:%[0-9]+]]:_(s8) = IMPLICIT_DEF
+ ; CHECK: [[OR:%[0-9]+]]:_(s8) = G_OR [[DEF]], [[DEF]]
+ ; CHECK: %al = COPY [[OR]](s8)
+ ; CHECK: RET 0, implicit %al
%0(s8) = IMPLICIT_DEF
%1(s8) = G_OR %0, %0
%al = COPY %1(s8)
@@ -49,7 +85,6 @@ body: |
...
---
name: test_or_i16
-# CHECK-LABEL: name: test_or_i16
alignment: 4
legalized: false
regBankSelected: false
@@ -60,12 +95,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s16) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s16) = G_OR %0, %0
-# CHECK-NEXT: %ax = COPY %1(s16)
-# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_or_i16
+ ; CHECK: [[DEF:%[0-9]+]]:_(s16) = IMPLICIT_DEF
+ ; CHECK: [[OR:%[0-9]+]]:_(s16) = G_OR [[DEF]], [[DEF]]
+ ; CHECK: %ax = COPY [[OR]](s16)
+ ; CHECK: RET 0, implicit %ax
%0(s16) = IMPLICIT_DEF
%1(s16) = G_OR %0, %0
%ax = COPY %1(s16)
@@ -74,7 +110,6 @@ body: |
...
---
name: test_or_i32
-# CHECK-LABEL: name: test_or_i32
alignment: 4
legalized: false
regBankSelected: false
@@ -85,12 +120,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s32) = G_OR %0, %0
-# CHECK-NEXT: %eax = COPY %1(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_or_i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[DEF]], [[DEF]]
+ ; CHECK: %eax = COPY [[OR]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s32) = IMPLICIT_DEF
%1(s32) = G_OR %0, %0
%eax = COPY %1(s32)
@@ -99,7 +135,6 @@ body: |
...
---
name: test_or_i64
-# CHECK-LABEL: name: test_or_i64
alignment: 4
legalized: false
regBankSelected: false
@@ -110,12 +145,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s64) = G_OR %0, %0
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_or_i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; CHECK: [[OR:%[0-9]+]]:_(s64) = G_OR [[DEF]], [[DEF]]
+ ; CHECK: %rax = COPY [[OR]](s64)
+ ; CHECK: RET 0, implicit %rax
%0(s64) = IMPLICIT_DEF
%1(s64) = G_OR %0, %0
%rax = COPY %1(s64)
diff --git a/test/CodeGen/X86/GlobalISel/legalize-phi.mir b/test/CodeGen/X86/GlobalISel/legalize-phi.mir
new file mode 100644
index 000000000000..44db405f1650
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-phi.mir
@@ -0,0 +1,599 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL
+
+--- |
+
+ define zeroext i1 @test_i1(i32 %a, i1 zeroext %f, i1 zeroext %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i1 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i1 %cond
+ }
+
+ define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i8 %cond
+ }
+
+ define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i16 %cond
+ }
+
+ define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i32 %cond
+ }
+
+ define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i64 %cond
+ }
+
+ define float @test_float(i32 %a, float %f, float %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
+ ret float %cond
+ }
+
+ define double @test_double(i32 %a, double %f, double %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
+ ret double %cond
+ }
+
+...
+---
+name: test_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+ - { id: 6, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+
+body: |
+ ; ALL-LABEL: name: test_i1
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %edx, %esi
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s1) = COPY %esi
+ ; ALL: [[COPY2:%[0-9]+]]:_(s1) = COPY %edx
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: [[ANYEXT:%[0-9]+]]:_(s8) = G_ANYEXT [[COPY1]](s1)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: [[ANYEXT1:%[0-9]+]]:_(s8) = G_ANYEXT [[COPY2]](s1)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s8) = G_PHI [[ANYEXT]](s8), %bb.1, [[ANYEXT1]](s8), %bb.2
+ ; ALL: %al = COPY
+ ; ALL: RET 0, implicit %al
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s1) = COPY %esi
+ %2(s1) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s1) = G_PHI %1(s1), %bb.2, %2(s1), %bb.3
+ %6(s8) = G_ZEXT %5(s1)
+ %al = COPY %6(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_i8
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+body: |
+ ; ALL-LABEL: name: test_i8
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %edx, %esi
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s8) = COPY %sil
+ ; ALL: [[COPY2:%[0-9]+]]:_(s8) = COPY %edx
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s8) = G_PHI [[COPY1]](s8), %bb.1, [[COPY2]](s8), %bb.2
+ ; ALL: %al = COPY [[PHI]](s8)
+ ; ALL: RET 0, implicit %al
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s8) = COPY %sil
+ %2(s8) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s8) = G_PHI %1(s8), %bb.2, %2(s8), %bb.3
+ %al = COPY %5(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_i16
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+body: |
+ ; ALL-LABEL: name: test_i16
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %edx, %esi
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s16) = COPY %si
+ ; ALL: [[COPY2:%[0-9]+]]:_(s16) = COPY %edx
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s16) = G_PHI [[COPY1]](s16), %bb.1, [[COPY2]](s16), %bb.2
+ ; ALL: %ax = COPY [[PHI]](s16)
+ ; ALL: RET 0, implicit %ax
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s16) = COPY %si
+ %2(s16) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s16) = G_PHI %1(s16), %bb.2, %2(s16), %bb.3
+ %ax = COPY %5(s16)
+ RET 0, implicit %ax
+
+...
+---
+name: test_i32
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+body: |
+ ; ALL-LABEL: name: test_i32
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %edx, %esi
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s32) = COPY %esi
+ ; ALL: [[COPY2:%[0-9]+]]:_(s32) = COPY %edx
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[COPY2]](s32), %bb.2
+ ; ALL: %eax = COPY [[PHI]](s32)
+ ; ALL: RET 0, implicit %eax
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s32) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s32) = G_PHI %1(s32), %bb.2, %2(s32), %bb.3
+ %eax = COPY %5(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_i64
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+body: |
+ ; ALL-LABEL: name: test_i64
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %rdx, %rsi
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s64) = COPY %rsi
+ ; ALL: [[COPY2:%[0-9]+]]:_(s64) = COPY %rdx
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s64) = G_PHI [[COPY1]](s64), %bb.1, [[COPY2]](s64), %bb.2
+ ; ALL: %rax = COPY [[PHI]](s64)
+ ; ALL: RET 0, implicit %rax
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %rdx, %rsi
+
+ %0(s32) = COPY %edi
+ %1(s64) = COPY %rsi
+ %2(s64) = COPY %rdx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s64) = G_PHI %1(s64), %bb.2, %2(s64), %bb.3
+ %rax = COPY %5(s64)
+ RET 0, implicit %rax
+
+...
+---
+name: test_float
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+body: |
+ ; ALL-LABEL: name: test_float
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %xmm0, %xmm1
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s32) = COPY %xmm0
+ ; ALL: [[COPY2:%[0-9]+]]:_(s32) = COPY %xmm1
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[COPY2]](s32), %bb.2
+ ; ALL: %xmm0 = COPY [[PHI]](s32)
+ ; ALL: RET 0, implicit %xmm0
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %xmm0, %xmm1
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %xmm0
+ %2(s32) = COPY %xmm1
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s32) = G_PHI %1(s32), %bb.2, %2(s32), %bb.3
+ %xmm0 = COPY %5(s32)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_double
+alignment: 4
+legalized: false
+regBankSelected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+
+
+
+body: |
+ ; ALL-LABEL: name: test_double
+ ; ALL: bb.0.{{[a-zA-Z0-9]+}}:
+ ; ALL: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; ALL: liveins: %edi, %xmm0, %xmm1
+ ; ALL: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:_(s64) = COPY %xmm0
+ ; ALL: [[COPY2:%[0-9]+]]:_(s64) = COPY %xmm1
+ ; ALL: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; ALL: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; ALL: G_BRCOND [[ICMP]](s1), %bb.1
+ ; ALL: G_BR %bb.2
+ ; ALL: bb.1.cond.true:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: G_BR %bb.3
+ ; ALL: bb.2.cond.false:
+ ; ALL: successors: %bb.3(0x80000000)
+ ; ALL: bb.3.cond.end:
+ ; ALL: [[PHI:%[0-9]+]]:_(s64) = G_PHI [[COPY1]](s64), %bb.1, [[COPY2]](s64), %bb.2
+ ; ALL: %xmm0 = COPY [[PHI]](s64)
+ ; ALL: RET 0, implicit %xmm0
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %xmm0, %xmm1
+
+ %0(s32) = COPY %edi
+ %1(s64) = COPY %xmm0
+ %2(s64) = COPY %xmm1
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s64) = G_PHI %1(s64), %bb.2, %2(s64), %bb.3
+ %xmm0 = COPY %5(s64)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir b/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir
index 2f90fc9a3c90..c4314dd8f764 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub-v128.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
--- |
@@ -23,7 +24,6 @@
...
---
name: test_sub_v16i8
-# ALL-LABEL: name: test_sub_v16i8
alignment: 4
legalized: false
regBankSelected: false
@@ -31,23 +31,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s8>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<16 x s8>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<16 x s8>) = G_SUB %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_sub_v16i8
+ ; ALL: [[DEF:%[0-9]+]]:_(<16 x s8>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<16 x s8>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<16 x s8>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<16 x s8>) = IMPLICIT_DEF
%1(<16 x s8>) = IMPLICIT_DEF
%2(<16 x s8>) = G_SUB %0, %1
+ %xmm0 = COPY %2
RET 0
...
---
name: test_sub_v8i16
-# ALL-LABEL: name: test_sub_v8i16
alignment: 4
legalized: false
regBankSelected: false
@@ -55,23 +56,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s16>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<8 x s16>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<8 x s16>) = G_SUB %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_sub_v8i16
+ ; ALL: [[DEF:%[0-9]+]]:_(<8 x s16>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<8 x s16>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<8 x s16>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<8 x s16>) = IMPLICIT_DEF
%1(<8 x s16>) = IMPLICIT_DEF
%2(<8 x s16>) = G_SUB %0, %1
+ %xmm0 = COPY %2
RET 0
...
---
name: test_sub_v4i32
-# ALL-LABEL: name: test_sub_v4i32
alignment: 4
legalized: false
regBankSelected: false
@@ -79,23 +81,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<4 x s32>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<4 x s32>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<4 x s32>) = G_SUB %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_sub_v4i32
+ ; ALL: [[DEF:%[0-9]+]]:_(<4 x s32>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<4 x s32>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<4 x s32>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<4 x s32>) = IMPLICIT_DEF
%1(<4 x s32>) = IMPLICIT_DEF
%2(<4 x s32>) = G_SUB %0, %1
+ %xmm0 = COPY %2
RET 0
...
---
name: test_sub_v2i64
-# ALL-LABEL: name: test_sub_v2i64
alignment: 4
legalized: false
regBankSelected: false
@@ -103,17 +106,19 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<2 x s64>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<2 x s64>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<2 x s64>) = G_SUB %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; ALL-LABEL: name: test_sub_v2i64
+ ; ALL: [[DEF:%[0-9]+]]:_(<2 x s64>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<2 x s64>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<2 x s64>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<2 x s64>) = IMPLICIT_DEF
%1(<2 x s64>) = IMPLICIT_DEF
%2(<2 x s64>) = G_SUB %0, %1
+ %xmm0 = COPY %2
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir b/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir
index 9d07787b8ecb..7f4a6d7dc32a 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub-v256.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx2 -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
# TODO: add tests for additional configuration after the legalization supported
--- |
@@ -24,7 +25,6 @@
...
---
name: test_sub_v32i8
-# ALL-LABEL: name: test_sub_v32i8
alignment: 4
legalized: false
regBankSelected: false
@@ -32,23 +32,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX2: %0(<32 x s8>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<32 x s8>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<32 x s8>) = G_SUB %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_sub_v32i8
+ ; ALL: [[DEF:%[0-9]+]]:_(<32 x s8>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<32 x s8>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<32 x s8>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<32 x s8>) = IMPLICIT_DEF
%1(<32 x s8>) = IMPLICIT_DEF
%2(<32 x s8>) = G_SUB %0, %1
+ %ymm0 = COPY %2
RET 0
...
---
name: test_sub_v16i16
-# ALL-LABEL: name: test_sub_v16i16
alignment: 4
legalized: false
regBankSelected: false
@@ -56,23 +57,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX2: %0(<16 x s16>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<16 x s16>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<16 x s16>) = G_SUB %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_sub_v16i16
+ ; ALL: [[DEF:%[0-9]+]]:_(<16 x s16>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<16 x s16>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<16 x s16>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<16 x s16>) = IMPLICIT_DEF
%1(<16 x s16>) = IMPLICIT_DEF
%2(<16 x s16>) = G_SUB %0, %1
+ %ymm0 = COPY %2
RET 0
...
---
name: test_sub_v8i32
-# ALL-LABEL: name: test_sub_v8i32
alignment: 4
legalized: false
regBankSelected: false
@@ -80,23 +82,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX2: %0(<8 x s32>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<8 x s32>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<8 x s32>) = G_SUB %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_sub_v8i32
+ ; ALL: [[DEF:%[0-9]+]]:_(<8 x s32>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<8 x s32>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<8 x s32>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<8 x s32>) = IMPLICIT_DEF
%1(<8 x s32>) = IMPLICIT_DEF
%2(<8 x s32>) = G_SUB %0, %1
+ %ymm0 = COPY %2
RET 0
...
---
name: test_sub_v4i64
-# ALL-LABEL: name: test_sub_v4i64
alignment: 4
legalized: false
regBankSelected: false
@@ -104,17 +107,19 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX2: %0(<4 x s64>) = IMPLICIT_DEF
-# AVX2-NEXT: %1(<4 x s64>) = IMPLICIT_DEF
-# AVX2-NEXT: %2(<4 x s64>) = G_SUB %0, %1
-# AVX2-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_sub_v4i64
+ ; ALL: [[DEF:%[0-9]+]]:_(<4 x s64>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<4 x s64>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<4 x s64>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<4 x s64>) = IMPLICIT_DEF
%1(<4 x s64>) = IMPLICIT_DEF
%2(<4 x s64>) = G_SUB %0, %1
+ %ymm0 = COPY %2
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir b/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir
index c88e074ca413..2271d364a3fb 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub-v512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512bw -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
# TODO: add tests for additional configuration after the legalization supported
--- |
@@ -24,7 +25,6 @@
...
---
name: test_sub_v64i8
-# ALL-LABEL: name: test_sub_v64i8
alignment: 4
legalized: false
regBankSelected: false
@@ -32,23 +32,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX512BW: %0(<64 x s8>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %1(<64 x s8>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %2(<64 x s8>) = G_SUB %0, %1
-# AVX512BW-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v64i8
+ ; ALL: [[DEF:%[0-9]+]]:_(<64 x s8>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<64 x s8>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<64 x s8>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<64 x s8>) = IMPLICIT_DEF
%1(<64 x s8>) = IMPLICIT_DEF
%2(<64 x s8>) = G_SUB %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_sub_v32i16
-# ALL-LABEL: name: test_sub_v32i16
alignment: 4
legalized: false
regBankSelected: false
@@ -56,23 +57,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# AVX512BW: %0(<32 x s16>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %1(<32 x s16>) = IMPLICIT_DEF
-# AVX512BW-NEXT: %2(<32 x s16>) = G_SUB %0, %1
-# AVX512BW-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v32i16
+ ; ALL: [[DEF:%[0-9]+]]:_(<32 x s16>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<32 x s16>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<32 x s16>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<32 x s16>) = IMPLICIT_DEF
%1(<32 x s16>) = IMPLICIT_DEF
%2(<32 x s16>) = G_SUB %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_sub_v16i32
-# ALL-LABEL: name: test_sub_v16i32
alignment: 4
legalized: false
regBankSelected: false
@@ -80,23 +82,24 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<16 x s32>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<16 x s32>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<16 x s32>) = G_SUB %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v16i32
+ ; ALL: [[DEF:%[0-9]+]]:_(<16 x s32>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<16 x s32>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<16 x s32>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<16 x s32>) = IMPLICIT_DEF
%2(<16 x s32>) = G_SUB %0, %1
+ %zmm0 = COPY %2
RET 0
...
---
name: test_sub_v8i64
-# ALL-LABEL: name: test_sub_v8i64
alignment: 4
legalized: false
regBankSelected: false
@@ -104,17 +107,19 @@ registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
- { id: 2, class: _ }
-# ALL: %0(<8 x s64>) = IMPLICIT_DEF
-# ALL-NEXT: %1(<8 x s64>) = IMPLICIT_DEF
-# ALL-NEXT: %2(<8 x s64>) = G_SUB %0, %1
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v8i64
+ ; ALL: [[DEF:%[0-9]+]]:_(<8 x s64>) = IMPLICIT_DEF
+ ; ALL: [[DEF1:%[0-9]+]]:_(<8 x s64>) = IMPLICIT_DEF
+ ; ALL: [[SUB:%[0-9]+]]:_(<8 x s64>) = G_SUB [[DEF]], [[DEF1]]
+ ; ALL: RET 0
%0(<8 x s64>) = IMPLICIT_DEF
%1(<8 x s64>) = IMPLICIT_DEF
%2(<8 x s64>) = G_SUB %0, %1
+ %zmm0 = COPY %2
RET 0
...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub.mir b/test/CodeGen/X86/GlobalISel/legalize-sub.mir
index 26ef285929a6..406967f66d81 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-sub.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub.mir
@@ -1,10 +1,9 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
- ; ModuleID = '<stdin>'
- source_filename = "<stdin>"
- target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
- target triple = "x86_64--linux-gnu"
+
+ define void @test_sub_i1() { ret void}
define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
%ret = sub i32 %arg1, %arg2
@@ -13,11 +12,40 @@
...
---
+name: test_sub_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+
+ ; CHECK-LABEL: name: test_sub_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[SUB:%[0-9]+]]:_(s8) = G_SUB [[TRUNC]], [[TRUNC1]]
+ ; CHECK: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; CHECK: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s8) = COPY [[SUB]](s8)
+ ; CHECK: [[AND:%[0-9]+]]:_(s8) = G_AND [[COPY1]], [[C]]
+ ; CHECK: G_STORE [[AND]](s8), [[DEF]](p0) :: (store 1)
+ ; CHECK: RET 0
+ %0(s32) = COPY %edx
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s1) = G_SUB %1, %1
+ %3:_(p0) = G_IMPLICIT_DEF
+ G_STORE %2, %3 :: (store 1)
+ RET 0
+...
+---
name: test_sub_i32
alignment: 4
legalized: false
regBankSelected: false
-selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: _ }
@@ -26,11 +54,14 @@ registers:
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- ; CHECK-LABEL: name: test_sub_i32
- ; CHECK: [[VAL1:%.*]](s32) = COPY %edi
- ; CHECK: [[VAL2:%.*]](s32) = COPY %esi
- ; CHECK: [[RES:%.*]](s32) = G_SUB [[VAL1:%.*]], [[VAL2:%.*]]
+ ; CHECK-LABEL: name: test_sub_i32
+ ; CHECK: liveins: %edi, %esi
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY %esi
+ ; CHECK: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[COPY]], [[COPY1]]
+ ; CHECK: %eax = COPY [[SUB]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_SUB %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/legalize-trunc.mir b/test/CodeGen/X86/GlobalISel/legalize-trunc.mir
index 6b390d990ecf..7fbd15c45635 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-trunc.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-trunc.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
@@ -8,7 +9,6 @@
...
---
name: trunc_check
-# ALL-LABEL: name: trunc_check
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
@@ -16,16 +16,40 @@ registers:
- { id: 3, class: _ }
body: |
bb.1 (%ir-block.0):
+ ; X32-LABEL: name: trunc_check
+ ; X32: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; X32: [[DEF1:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; X32: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; X32: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s32)
+ ; X32: [[AND:%[0-9]+]]:_(s8) = G_AND [[TRUNC]], [[C]]
+ ; X32: G_STORE [[AND]](s8), [[DEF1]](p0) :: (store 1)
+ ; X32: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s32)
+ ; X32: G_STORE [[TRUNC1]](s8), [[DEF1]](p0) :: (store 8)
+ ; X32: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
+ ; X32: G_STORE [[TRUNC2]](s16), [[DEF1]](p0) :: (store 16)
+ ; X32: RET 0
+ ; X64-LABEL: name: trunc_check
+ ; X64: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; X64: [[DEF1:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; X64: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 1
+ ; X64: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s32)
+ ; X64: [[AND:%[0-9]+]]:_(s8) = G_AND [[TRUNC]], [[C]]
+ ; X64: G_STORE [[AND]](s8), [[DEF1]](p0) :: (store 1)
+ ; X64: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[DEF]](s32)
+ ; X64: G_STORE [[TRUNC1]](s8), [[DEF1]](p0) :: (store 8)
+ ; X64: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
+ ; X64: G_STORE [[TRUNC2]](s16), [[DEF1]](p0) :: (store 16)
+ ; X64: RET 0
%0(s32) = IMPLICIT_DEF
- ; ALL: %1(s1) = G_TRUNC %0(s32)
%1(s1) = G_TRUNC %0(s32)
+ %4:_(p0) = G_IMPLICIT_DEF
+ G_STORE %1, %4 :: (store 1)
- ; ALL: %2(s8) = G_TRUNC %0(s32)
%2(s8) = G_TRUNC %0(s32)
+ G_STORE %2, %4 :: (store 8)
- ; ALL: %3(s16) = G_TRUNC %0(s32)
%3(s16) = G_TRUNC %0(s32)
+ G_STORE %3, %4 :: (store 16)
RET 0
...
-
diff --git a/test/CodeGen/X86/GlobalISel/legalize-undef.mir b/test/CodeGen/X86/GlobalISel/legalize-undef.mir
new file mode 100644
index 000000000000..8b7b8f594a22
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-undef.mir
@@ -0,0 +1,51 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+
+---
+name: test_implicit_def
+registers:
+#
+body: |
+ bb.0.entry:
+ liveins:
+ ; X64-LABEL: name: test_implicit_def
+ ; X64: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; X64: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; X64: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1)
+ ; X64: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; X64: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 8)
+ ; X64: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; X64: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 16)
+ ; X64: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; X64: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 32)
+ ; X64: [[DEF5:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+ ; X64: G_STORE [[DEF5]](s64), [[DEF]](p0) :: (store 64)
+ ; X32-LABEL: name: test_implicit_def
+ ; X32: [[DEF:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
+ ; X32: [[DEF1:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; X32: G_STORE [[DEF1]](s8), [[DEF]](p0) :: (store 1)
+ ; X32: [[DEF2:%[0-9]+]]:_(s8) = G_IMPLICIT_DEF
+ ; X32: G_STORE [[DEF2]](s8), [[DEF]](p0) :: (store 8)
+ ; X32: [[DEF3:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; X32: G_STORE [[DEF3]](s16), [[DEF]](p0) :: (store 16)
+ ; X32: [[DEF4:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; X32: G_STORE [[DEF4]](s32), [[DEF]](p0) :: (store 32)
+ ; X32: [[DEF5:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; X32: [[DEF6:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; X32: G_STORE [[DEF5]](s32), [[DEF]](p0) :: (store 64)
+ ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+ ; X32: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[DEF]], [[C]](s32)
+ ; X32: G_STORE [[DEF6]](s32), [[GEP]](p0) :: (store 64)
+ %5:_(p0) = G_IMPLICIT_DEF
+ %0:_(s1) = G_IMPLICIT_DEF
+ G_STORE %0, %5 ::(store 1)
+ %1:_(s8) = G_IMPLICIT_DEF
+ G_STORE %1, %5 ::(store 8)
+ %2:_(s16) = G_IMPLICIT_DEF
+ G_STORE %2, %5 ::(store 16)
+ %3:_(s32) = G_IMPLICIT_DEF
+ G_STORE %3, %5 ::(store 32)
+ %4:_(s64) = G_IMPLICIT_DEF
+ G_STORE %4, %5 ::(store 64)
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-xor-scalar.mir b/test/CodeGen/X86/GlobalISel/legalize-xor-scalar.mir
index e2af91283026..3213ce1bc075 100644
--- a/test/CodeGen/X86/GlobalISel/legalize-xor-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/legalize-xor-scalar.mir
@@ -1,6 +1,12 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
--- |
+ define i1 @test_xor_i1() {
+ %ret = xor i1 undef, undef
+ ret i1 %ret
+ }
+
define i8 @test_xor_i8() {
%ret = xor i8 undef, undef
ret i8 %ret
@@ -23,8 +29,32 @@
...
---
+name: test_xor_i1
+alignment: 4
+legalized: false
+regBankSelected: false
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+body: |
+ bb.1 (%ir-block.0):
+
+ ; CHECK-LABEL: name: test_xor_i1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY %edx
+ ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32)
+ ; CHECK: [[XOR:%[0-9]+]]:_(s8) = G_XOR [[TRUNC]], [[TRUNC1]]
+ ; CHECK: RET 0
+ %0(s32) = COPY %edx
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s1) = G_XOR %1, %1
+ %3:_(p0) = G_IMPLICIT_DEF
+ G_STORE %2, %3 ::(store 1)
+ RET 0
+...
+---
name: test_xor_i8
-# CHECK-LABEL: name: test_xor_i8
alignment: 4
legalized: false
regBankSelected: false
@@ -35,12 +65,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s8) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s8) = G_XOR %0, %0
-# CHECK-NEXT: %al = COPY %1(s8)
-# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_xor_i8
+ ; CHECK: [[DEF:%[0-9]+]]:_(s8) = IMPLICIT_DEF
+ ; CHECK: [[XOR:%[0-9]+]]:_(s8) = G_XOR [[DEF]], [[DEF]]
+ ; CHECK: %al = COPY [[XOR]](s8)
+ ; CHECK: RET 0, implicit %al
%0(s8) = IMPLICIT_DEF
%1(s8) = G_XOR %0, %0
%al = COPY %1(s8)
@@ -49,7 +80,6 @@ body: |
...
---
name: test_xor_i16
-# CHECK-LABEL: name: test_xor_i16
alignment: 4
legalized: false
regBankSelected: false
@@ -60,12 +90,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s16) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s16) = G_XOR %0, %0
-# CHECK-NEXT: %ax = COPY %1(s16)
-# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_xor_i16
+ ; CHECK: [[DEF:%[0-9]+]]:_(s16) = IMPLICIT_DEF
+ ; CHECK: [[XOR:%[0-9]+]]:_(s16) = G_XOR [[DEF]], [[DEF]]
+ ; CHECK: %ax = COPY [[XOR]](s16)
+ ; CHECK: RET 0, implicit %ax
%0(s16) = IMPLICIT_DEF
%1(s16) = G_XOR %0, %0
%ax = COPY %1(s16)
@@ -74,7 +105,6 @@ body: |
...
---
name: test_xor_i32
-# CHECK-LABEL: name: test_xor_i32
alignment: 4
legalized: false
regBankSelected: false
@@ -85,12 +115,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s32) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s32) = G_XOR %0, %0
-# CHECK-NEXT: %eax = COPY %1(s32)
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_xor_i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = IMPLICIT_DEF
+ ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[DEF]], [[DEF]]
+ ; CHECK: %eax = COPY [[XOR]](s32)
+ ; CHECK: RET 0, implicit %eax
%0(s32) = IMPLICIT_DEF
%1(s32) = G_XOR %0, %0
%eax = COPY %1(s32)
@@ -99,7 +130,6 @@ body: |
...
---
name: test_xor_i64
-# CHECK-LABEL: name: test_xor_i64
alignment: 4
legalized: false
regBankSelected: false
@@ -110,12 +140,13 @@ liveins:
fixedStack:
stack:
constants:
-# CHECK: %0(s64) = IMPLICIT_DEF
-# CHECK-NEXT: %1(s64) = G_XOR %0, %0
-# CHECK-NEXT: %rax = COPY %1(s64)
-# CHECK-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: test_xor_i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(s64) = IMPLICIT_DEF
+ ; CHECK: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[DEF]], [[DEF]]
+ ; CHECK: %rax = COPY [[XOR]](s64)
+ ; CHECK: RET 0, implicit %rax
%0(s64) = IMPLICIT_DEF
%1(s64) = G_XOR %0, %0
%rax = COPY %1(s64)
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
index 1c719b1bf74d..31fb11179bb8 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll
@@ -6,7 +6,7 @@
define i1 @test_load_i1(i1 * %p1) {
; ALL-LABEL: test_load_i1:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %eax
; ALL-NEXT: movb (%eax), %al
; ALL-NEXT: retl
@@ -16,7 +16,7 @@ define i1 @test_load_i1(i1 * %p1) {
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %eax
; ALL-NEXT: movb (%eax), %al
; ALL-NEXT: retl
@@ -26,7 +26,7 @@ define i8 @test_load_i8(i8 * %p1) {
define i16 @test_load_i16(i16 * %p1) {
; ALL-LABEL: test_load_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %eax
; ALL-NEXT: movzwl (%eax), %eax
; ALL-NEXT: retl
@@ -36,7 +36,7 @@ define i16 @test_load_i16(i16 * %p1) {
define i32 @test_load_i32(i32 * %p1) {
; ALL-LABEL: test_load_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %eax
; ALL-NEXT: movl (%eax), %eax
; ALL-NEXT: retl
@@ -46,7 +46,7 @@ define i32 @test_load_i32(i32 * %p1) {
define i1 * @test_store_i1(i1 %val, i1 * %p1) {
; ALL-LABEL: test_store_i1:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movb 4(%esp), %cl
; ALL-NEXT: movl 8(%esp), %eax
; ALL-NEXT: andb $1, %cl
@@ -58,7 +58,7 @@ define i1 * @test_store_i1(i1 %val, i1 * %p1) {
define i8 * @test_store_i8(i8 %val, i8 * %p1) {
; ALL-LABEL: test_store_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movb 4(%esp), %cl
; ALL-NEXT: movl 8(%esp), %eax
; ALL-NEXT: movb %cl, (%eax)
@@ -69,7 +69,7 @@ define i8 * @test_store_i8(i8 %val, i8 * %p1) {
define i16 * @test_store_i16(i16 %val, i16 * %p1) {
; ALL-LABEL: test_store_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movzwl 4(%esp), %ecx
; ALL-NEXT: movl 8(%esp), %eax
; ALL-NEXT: movw %cx, (%eax)
@@ -80,7 +80,7 @@ define i16 * @test_store_i16(i16 %val, i16 * %p1) {
define i32 * @test_store_i32(i32 %val, i32 * %p1) {
; ALL-LABEL: test_store_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %ecx
; ALL-NEXT: movl 8(%esp), %eax
; ALL-NEXT: movl %ecx, (%eax)
@@ -91,7 +91,7 @@ define i32 * @test_store_i32(i32 %val, i32 * %p1) {
define i32* @test_load_ptr(i32** %ptr1) {
; ALL-LABEL: test_load_ptr:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %eax
; ALL-NEXT: movl (%eax), %eax
; ALL-NEXT: retl
@@ -101,7 +101,7 @@ define i32* @test_load_ptr(i32** %ptr1) {
define void @test_store_ptr(i32** %ptr1, i32* %a) {
; ALL-LABEL: test_store_ptr:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl 4(%esp), %eax
; ALL-NEXT: movl 8(%esp), %ecx
; ALL-NEXT: movl %ecx, (%eax)
diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
index 2097a3b0bfc9..0355c395780b 100644
--- a/test/CodeGen/X86/GlobalISel/memop-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll
@@ -4,7 +4,7 @@
define i1 @test_load_i1(i1 * %p1) {
; ALL-LABEL: test_load_i1:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movb (%rdi), %al
; ALL-NEXT: retq
%r = load i1, i1* %p1
@@ -13,7 +13,7 @@ define i1 @test_load_i1(i1 * %p1) {
define i8 @test_load_i8(i8 * %p1) {
; ALL-LABEL: test_load_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movb (%rdi), %al
; ALL-NEXT: retq
%r = load i8, i8* %p1
@@ -22,7 +22,7 @@ define i8 @test_load_i8(i8 * %p1) {
define i16 @test_load_i16(i16 * %p1) {
; ALL-LABEL: test_load_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movzwl (%rdi), %eax
; ALL-NEXT: retq
%r = load i16, i16* %p1
@@ -31,7 +31,7 @@ define i16 @test_load_i16(i16 * %p1) {
define i32 @test_load_i32(i32 * %p1) {
; ALL-LABEL: test_load_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl (%rdi), %eax
; ALL-NEXT: retq
%r = load i32, i32* %p1
@@ -40,7 +40,7 @@ define i32 @test_load_i32(i32 * %p1) {
define i64 @test_load_i64(i64 * %p1) {
; ALL-LABEL: test_load_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: retq
%r = load i64, i64* %p1
@@ -49,13 +49,13 @@ define i64 @test_load_i64(i64 * %p1) {
define float @test_load_float(float * %p1) {
; SSE-LABEL: test_load_float:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; ALL-LABEL: test_load_float:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl (%rdi), %eax
; ALL-NEXT: movd %eax, %xmm0
; ALL-NEXT: retq
@@ -65,13 +65,13 @@ define float @test_load_float(float * %p1) {
define double @test_load_double(double * %p1) {
; SSE-LABEL: test_load_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq (%rdi), %rax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; ALL-LABEL: test_load_double:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: movq %rax, %xmm0
; ALL-NEXT: retq
@@ -81,7 +81,7 @@ define double @test_load_double(double * %p1) {
define i1 * @test_store_i1(i1 %val, i1 * %p1) {
; ALL-LABEL: test_store_i1:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: andb $1, %dil
; ALL-NEXT: movb %dil, (%rsi)
; ALL-NEXT: movq %rsi, %rax
@@ -92,7 +92,7 @@ define i1 * @test_store_i1(i1 %val, i1 * %p1) {
define i32 * @test_store_i32(i32 %val, i32 * %p1) {
; ALL-LABEL: test_store_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl %edi, (%rsi)
; ALL-NEXT: movq %rsi, %rax
; ALL-NEXT: retq
@@ -102,7 +102,7 @@ define i32 * @test_store_i32(i32 %val, i32 * %p1) {
define i64 * @test_store_i64(i64 %val, i64 * %p1) {
; ALL-LABEL: test_store_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq %rdi, (%rsi)
; ALL-NEXT: movq %rsi, %rax
; ALL-NEXT: retq
@@ -113,14 +113,14 @@ define i64 * @test_store_i64(i64 %val, i64 * %p1) {
define float * @test_store_float(float %val, float * %p1) {
;
; SSE_FAST-LABEL: test_store_float:
-; SSE_FAST: # BB#0:
+; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: movd %xmm0, %eax
; SSE_FAST-NEXT: movl %eax, (%rdi)
; SSE_FAST-NEXT: movq %rdi, %rax
; SSE_FAST-NEXT: retq
;
; SSE_GREEDY-LABEL: test_store_float:
-; SSE_GREEDY: # BB#0:
+; SSE_GREEDY: # %bb.0:
; SSE_GREEDY-NEXT: movss %xmm0, (%rdi)
; SSE_GREEDY-NEXT: movq %rdi, %rax
; SSE_GREEDY-NEXT: retq
@@ -131,14 +131,14 @@ define float * @test_store_float(float %val, float * %p1) {
define double * @test_store_double(double %val, double * %p1) {
;
; SSE_FAST-LABEL: test_store_double:
-; SSE_FAST: # BB#0:
+; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: movq %xmm0, %rax
; SSE_FAST-NEXT: movq %rax, (%rdi)
; SSE_FAST-NEXT: movq %rdi, %rax
; SSE_FAST-NEXT: retq
;
; SSE_GREEDY-LABEL: test_store_double:
-; SSE_GREEDY: # BB#0:
+; SSE_GREEDY: # %bb.0:
; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi)
; SSE_GREEDY-NEXT: movq %rdi, %rax
; SSE_GREEDY-NEXT: retq
@@ -148,7 +148,7 @@ define double * @test_store_double(double %val, double * %p1) {
define i32* @test_load_ptr(i32** %ptr1) {
; ALL-LABEL: test_load_ptr:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq (%rdi), %rax
; ALL-NEXT: retq
%p = load i32*, i32** %ptr1
@@ -157,7 +157,7 @@ define i32* @test_load_ptr(i32** %ptr1) {
define void @test_store_ptr(i32** %ptr1, i32* %a) {
; ALL-LABEL: test_store_ptr:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq %rsi, (%rdi)
; ALL-NEXT: retq
store i32* %a, i32** %ptr1
@@ -166,7 +166,7 @@ define void @test_store_ptr(i32** %ptr1, i32* %a) {
define i32 @test_gep_folding(i32* %arr, i32 %val) {
; ALL-LABEL: test_gep_folding:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl %esi, 20(%rdi)
; ALL-NEXT: movl 20(%rdi), %eax
; ALL-NEXT: retq
@@ -179,7 +179,7 @@ define i32 @test_gep_folding(i32* %arr, i32 %val) {
; check that gep index doesn't folded into memory operand
define i32 @test_gep_folding_largeGepIndex(i32* %arr, i32 %val) {
; ALL-LABEL: test_gep_folding_largeGepIndex:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movabsq $228719476720, %rax # imm = 0x3540BE3FF0
; ALL-NEXT: leaq (%rdi,%rax), %rax
; ALL-NEXT: movl %esi, (%rax)
diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll
index 870e812bbb69..a5fbd6c76a4b 100644
--- a/test/CodeGen/X86/GlobalISel/memop-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll
@@ -4,7 +4,7 @@
define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
; SKX-LABEL: test_load_v4i32_noalign:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups (%rdi), %xmm0
; SKX-NEXT: retq
%r = load <4 x i32>, <4 x i32>* %p1, align 1
@@ -13,7 +13,7 @@ define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) {
; SKX-LABEL: test_load_v4i32_align:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovaps (%rdi), %xmm0
; SKX-NEXT: retq
%r = load <4 x i32>, <4 x i32>* %p1, align 16
@@ -22,7 +22,7 @@ define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) {
define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) {
; SKX-LABEL: test_load_v8i32_noalign:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups (%rdi), %ymm0
; SKX-NEXT: retq
%r = load <8 x i32>, <8 x i32>* %p1, align 1
@@ -31,7 +31,7 @@ define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) {
define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) {
; SKX-LABEL: test_load_v8i32_align:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovaps (%rdi), %ymm0
; SKX-NEXT: retq
%r = load <8 x i32>, <8 x i32>* %p1, align 32
@@ -40,7 +40,7 @@ define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) {
define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) {
; SKX-LABEL: test_load_v16i32_noalign:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups (%rdi), %zmm0
; SKX-NEXT: retq
%r = load <16 x i32>, <16 x i32>* %p1, align 1
@@ -49,7 +49,7 @@ define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) {
define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) {
; SKX-LABEL: test_load_v16i32_align:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups (%rdi), %zmm0
; SKX-NEXT: retq
%r = load <16 x i32>, <16 x i32>* %p1, align 32
@@ -58,7 +58,7 @@ define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) {
define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
; SKX-LABEL: test_store_v4i32_noalign:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: retq
store <4 x i32> %val, <4 x i32>* %p1, align 1
@@ -67,7 +67,7 @@ define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
; SKX-LABEL: test_store_v4i32_align:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: retq
store <4 x i32> %val, <4 x i32>* %p1, align 16
@@ -76,7 +76,7 @@ define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) {
; SKX-LABEL: test_store_v8i32_noalign:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -86,7 +86,7 @@ define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) {
define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) {
; SKX-LABEL: test_store_v8i32_align:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -96,7 +96,7 @@ define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) {
define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) {
; SKX-LABEL: test_store_v16i32_noalign:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovups %zmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -106,7 +106,7 @@ define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) {
define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) {
; SKX-LABEL: test_store_v16i32_align:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovaps %zmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/mul-scalar.ll b/test/CodeGen/X86/GlobalISel/mul-scalar.ll
index 450c3839797c..5fd64c4bcce6 100644
--- a/test/CodeGen/X86/GlobalISel/mul-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/mul-scalar.ll
@@ -9,7 +9,7 @@
define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
; X64-LABEL: test_mul_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: imulw %di, %si
; X64-NEXT: movl %esi, %eax
; X64-NEXT: retq
@@ -19,7 +19,7 @@ define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
define i32 @test_mul_i32(i32 %arg1, i32 %arg2) {
; X64-LABEL: test_mul_i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: imull %edi, %esi
; X64-NEXT: movl %esi, %eax
; X64-NEXT: retq
@@ -29,7 +29,7 @@ define i32 @test_mul_i32(i32 %arg1, i32 %arg2) {
define i64 @test_mul_i64(i64 %arg1, i64 %arg2) {
; X64-LABEL: test_mul_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: imulq %rdi, %rsi
; X64-NEXT: movq %rsi, %rax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/mul-vec.ll b/test/CodeGen/X86/GlobalISel/mul-vec.ll
index b2e211470f39..37e174531156 100644
--- a/test/CodeGen/X86/GlobalISel/mul-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/mul-vec.ll
@@ -3,7 +3,7 @@
define <8 x i16> @test_mul_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; SKX-LABEL: test_mul_v8i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = mul <8 x i16> %arg1, %arg2
@@ -12,7 +12,7 @@ define <8 x i16> @test_mul_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
define <4 x i32> @test_mul_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; SKX-LABEL: test_mul_v4i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = mul <4 x i32> %arg1, %arg2
@@ -21,7 +21,7 @@ define <4 x i32> @test_mul_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <2 x i64> @test_mul_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
; SKX-LABEL: test_mul_v2i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullq %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = mul <2 x i64> %arg1, %arg2
@@ -30,7 +30,7 @@ define <2 x i64> @test_mul_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
define <16 x i16> @test_mul_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
; SKX-LABEL: test_mul_v16i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = mul <16 x i16> %arg1, %arg2
@@ -39,7 +39,7 @@ define <16 x i16> @test_mul_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
define <8 x i32> @test_mul_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
; SKX-LABEL: test_mul_v8i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = mul <8 x i32> %arg1, %arg2
@@ -48,7 +48,7 @@ define <8 x i32> @test_mul_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
define <4 x i64> @test_mul_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
; SKX-LABEL: test_mul_v4i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = mul <4 x i64> %arg1, %arg2
@@ -57,7 +57,7 @@ define <4 x i64> @test_mul_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
define <32 x i16> @test_mul_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
; SKX-LABEL: test_mul_v32i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = mul <32 x i16> %arg1, %arg2
@@ -66,7 +66,7 @@ define <32 x i16> @test_mul_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
define <16 x i32> @test_mul_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
; SKX-LABEL: test_mul_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = mul <16 x i32> %arg1, %arg2
@@ -75,7 +75,7 @@ define <16 x i32> @test_mul_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
define <8 x i64> @test_mul_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
; SKX-LABEL: test_mul_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = mul <8 x i64> %arg1, %arg2
diff --git a/test/CodeGen/X86/GlobalISel/or-scalar.ll b/test/CodeGen/X86/GlobalISel/or-scalar.ll
index b0371457f76e..397deaaf9060 100644
--- a/test/CodeGen/X86/GlobalISel/or-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/or-scalar.ll
@@ -1,9 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
+define i32 @test_or_i1(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_or_i1:
+; ALL: # %bb.0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: sete %al
+; ALL-NEXT: orb %al, %al
+; ALL-NEXT: movzbl %al, %eax
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %c = icmp eq i32 %arg1, %arg2
+ %x = or i1 %c , %c
+ %ret = zext i1 %x to i32
+ ret i32 %ret
+}
+
define i8 @test_or_i8(i8 %arg1, i8 %arg2) {
; ALL-LABEL: test_or_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: orb %dil, %sil
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -13,7 +28,7 @@ define i8 @test_or_i8(i8 %arg1, i8 %arg2) {
define i16 @test_or_i16(i16 %arg1, i16 %arg2) {
; ALL-LABEL: test_or_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: orw %di, %si
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -23,7 +38,7 @@ define i16 @test_or_i16(i16 %arg1, i16 %arg2) {
define i32 @test_or_i32(i32 %arg1, i32 %arg2) {
; ALL-LABEL: test_or_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: orl %edi, %esi
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -33,7 +48,7 @@ define i32 @test_or_i32(i32 %arg1, i32 %arg2) {
define i64 @test_or_i64(i64 %arg1, i64 %arg2) {
; ALL-LABEL: test_or_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: orq %rdi, %rsi
; ALL-NEXT: movq %rsi, %rax
; ALL-NEXT: retq
diff --git a/test/CodeGen/X86/GlobalISel/phi.ll b/test/CodeGen/X86/GlobalISel/phi.ll
new file mode 100644
index 000000000000..21570819884c
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/phi.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
+
+define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
+; ALL-LABEL: test_i8:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %eax, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: testb $1, %al
+; ALL-NEXT: jne .LBB0_2
+; ALL-NEXT: # %bb.1: # %cond.false
+; ALL-NEXT: movl %edx, %esi
+; ALL-NEXT: .LBB0_2: # %cond.end
+; ALL-NEXT: movl %esi, %eax
+; ALL-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i8 %cond
+}
+
+define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
+; ALL-LABEL: test_i16:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %eax, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: testb $1, %al
+; ALL-NEXT: jne .LBB1_2
+; ALL-NEXT: # %bb.1: # %cond.false
+; ALL-NEXT: movl %edx, %esi
+; ALL-NEXT: .LBB1_2: # %cond.end
+; ALL-NEXT: movl %esi, %eax
+; ALL-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i16 %cond
+}
+
+define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
+; ALL-LABEL: test_i32:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %eax, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: testb $1, %al
+; ALL-NEXT: jne .LBB2_2
+; ALL-NEXT: # %bb.1: # %cond.false
+; ALL-NEXT: movl %edx, %esi
+; ALL-NEXT: .LBB2_2: # %cond.end
+; ALL-NEXT: movl %esi, %eax
+; ALL-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i32 %cond
+}
+
+define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
+; ALL-LABEL: test_i64:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %eax, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: testb $1, %al
+; ALL-NEXT: jne .LBB3_2
+; ALL-NEXT: # %bb.1: # %cond.false
+; ALL-NEXT: movq %rdx, %rsi
+; ALL-NEXT: .LBB3_2: # %cond.end
+; ALL-NEXT: movq %rsi, %rax
+; ALL-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i64 %cond
+}
+
+define float @test_float(i32 %a, float %f, float %t) {
+; ALL-LABEL: test_float:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %eax, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: testb $1, %al
+; ALL-NEXT: jne .LBB4_2
+; ALL-NEXT: # %bb.1: # %cond.false
+; ALL-NEXT: movaps %xmm1, %xmm0
+; ALL-NEXT: .LBB4_2: # %cond.end
+; ALL-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
+ ret float %cond
+}
+
+define double @test_double(i32 %a, double %f, double %t) {
+; ALL-LABEL: test_double:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: xorl %eax, %eax
+; ALL-NEXT: cmpl %eax, %edi
+; ALL-NEXT: setg %al
+; ALL-NEXT: testb $1, %al
+; ALL-NEXT: jne .LBB5_2
+; ALL-NEXT: # %bb.1: # %cond.false
+; ALL-NEXT: movaps %xmm1, %xmm0
+; ALL-NEXT: .LBB5_2: # %cond.end
+; ALL-NEXT: retq
+entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true: ; preds = %entry
+ br label %cond.end
+
+cond.false: ; preds = %entry
+ br label %cond.end
+
+cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
+ ret double %cond
+}
diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
index 95ef15ceb689..d6c881c12199 100644
--- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
+++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir
@@ -35,7 +35,7 @@
%ret = fadd double %arg1, %arg2
ret double %ret
}
-
+
define void @test_fsub_float() {
%ret1 = fsub float undef, undef
%ret2 = fsub double undef, undef
@@ -53,7 +53,7 @@
%ret2 = fdiv double undef, undef
ret void
}
-
+
define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
%ret = add <4 x i32> %arg1, %arg2
@@ -180,7 +180,67 @@
entry:
ret i32* @g_int
}
-
+
+ define i8 @test_undef() {
+ ret i8 undef
+ }
+
+ define i8 @test_undef2(i8 %a) {
+ %r = add i8 %a, undef
+ ret i8 %r
+ }
+
+ define float @test_undef3() {
+ ret float undef
+ }
+
+ define float @test_undef4(float %a) {
+ %r = fadd float %a, undef
+ ret float %r
+ }
+
+ define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i32 %cond
+ }
+
+ define float @test_float(i32 %a, float %f, float %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
+ ret float %cond
+ }
+
+ define double @test_fpext(float %a) {
+ entry:
+ %conv = fpext float %a to double
+ ret double %conv
+ }
+
+ define void @test_fconstant() {
+ ret void
+ }
+
...
---
name: test_add_i8
@@ -202,8 +262,8 @@ body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s8) = G_ADD %0, %1
%al = COPY %2(s8)
RET 0, implicit %al
@@ -229,8 +289,8 @@ body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_ADD %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -315,10 +375,10 @@ registers:
- { id: 4, class: _ }
- { id: 5, class: _ }
- { id: 6, class: _ }
- - { id: 7, class: _ }
+ - { id: 7, class: _ }
body: |
bb.1 (%ir-block.0):
-
+
%0(s64) = IMPLICIT_DEF
%1(s32) = IMPLICIT_DEF
%2(s16) = IMPLICIT_DEF
@@ -771,9 +831,9 @@ body: |
%0(s32) = COPY %xmm0
%1(p0) = COPY %rdi
- ; CHECK: %1(p0) = COPY %rdi
+ ; CHECK: %1:gpr(p0) = COPY %rdi
- ; FAST-NEXT: %2(s32) = COPY %0(s32)
+ ; FAST-NEXT: %2:gpr(s32) = COPY %0(s32)
; FAST-NEXT: G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1)
; GREEDY-NEXT: G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1)
@@ -809,9 +869,9 @@ body: |
%0(s64) = COPY %xmm0
%1(p0) = COPY %rdi
- ; CHECK: %1(p0) = COPY %rdi
+ ; CHECK: %1:gpr(p0) = COPY %rdi
- ; FAST-NEXT: %2(s64) = COPY %0(s64)
+ ; FAST-NEXT: %2:gpr(s64) = COPY %0(s64)
; FAST-NEXT: G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1)
; GREEDY-NEXT: G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1)
@@ -913,8 +973,8 @@ body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s1) = G_ICMP intpred(eq), %0(s8), %1
%al = COPY %2(s1)
RET 0, implicit %al
@@ -938,8 +998,8 @@ body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s1) = G_ICMP intpred(eq), %0(s16), %1
%al = COPY %2(s1)
RET 0, implicit %al
@@ -1101,7 +1161,7 @@ regBankSelected: false
# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
registers:
- { id: 0, class: _, preferred-register: '' }
-# CHECK: %0(p0) = G_GLOBAL_VALUE @g_int
+# CHECK: %0:gpr(p0) = G_GLOBAL_VALUE @g_int
# CHECK-NEXT: %rax = COPY %0(p0)
# CHECK-NEXT: RET 0, implicit %rax
body: |
@@ -1111,4 +1171,262 @@ body: |
RET 0, implicit %rax
...
+---
+name: test_undef
+# CHECK-LABEL: name: test_undef
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+# CHECK: %0:gpr(s8) = G_IMPLICIT_DEF
+# CHECK-NEXT: %al = COPY %0(s8)
+# CHECK-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ %0(s8) = G_IMPLICIT_DEF
+ %al = COPY %0(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_undef2
+# CHECK-LABEL: name: test_undef2
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: gpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+# CHECK: %0:gpr(s8) = COPY %dil
+# CHECK-NEXT: %1:gpr(s8) = G_IMPLICIT_DEF
+# CHECK-NEXT: %2:gpr(s8) = G_ADD %0, %1
+# CHECK-NEXT: %al = COPY %2(s8)
+# CHECK-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s8) = COPY %dil
+ %1(s8) = G_IMPLICIT_DEF
+ %2(s8) = G_ADD %0, %1
+ %al = COPY %2(s8)
+ RET 0, implicit %al
+...
+---
+name: test_undef3
+# CHECK-LABEL: name: test_undef3
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+# CHECK: %0:gpr(s32) = G_IMPLICIT_DEF
+# CHECK-NEXT: %xmm0 = COPY %0(s32)
+# CHECK-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1 (%ir-block.0):
+ %0(s32) = G_IMPLICIT_DEF
+ %xmm0 = COPY %0(s32)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_undef4
+# CHECK-LABEL: name: test_undef4
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: vecr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: vecr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: vecr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+# CHECK: %0:vecr(s32) = COPY %xmm0
+# CHECK-NEXT: %1:gpr(s32) = G_IMPLICIT_DEF
+# CHECK-NEXT: %3:vecr(s32) = COPY %1(s32)
+# CHECK-NEXT: %2:vecr(s32) = G_FADD %0, %3
+# CHECK-NEXT: %xmm0 = COPY %2(s32)
+# CHECK-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %xmm0
+
+ %0(s32) = COPY %xmm0
+ %1(s32) = G_IMPLICIT_DEF
+ %2(s32) = G_FADD %0, %1
+ %xmm0 = COPY %2(s32)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_i32
+# CHECK-LABEL: name: test_i32
+alignment: 4
+legalized: true
+regBankSelected: false
+tracksRegLiveness: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 4, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 5, class: gpr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+# CHECK: bb.3.cond.end:
+# CHECK-NEXT: %5:gpr(s32) = G_PHI %1(s32), %bb.1, %2(s32), %bb.2
+# CHECK-NEXT: %eax = COPY %5(s32)
+# CHECK-NEXT: RET 0, implicit %eax
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s32) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.1
+ G_BR %bb.2
+
+ bb.1.cond.true:
+ successors: %bb.3(0x80000000)
+
+ G_BR %bb.3
+
+ bb.2.cond.false:
+ successors: %bb.3(0x80000000)
+
+ bb.3.cond.end:
+ %5(s32) = G_PHI %1(s32), %bb.1, %2(s32), %bb.2
+ %eax = COPY %5(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_float
+# CHECK-LABEL: name: test_float
+alignment: 4
+legalized: true
+regBankSelected: false
+tracksRegLiveness: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: vecr, preferred-register: '' }
+# CHECK-NEXT: - { id: 2, class: vecr, preferred-register: '' }
+# CHECK-NEXT: - { id: 3, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 4, class: gpr, preferred-register: '' }
+# CHECK-NEXT: - { id: 5, class: vecr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+ - { id: 3, class: _, preferred-register: '' }
+ - { id: 4, class: _, preferred-register: '' }
+ - { id: 5, class: _, preferred-register: '' }
+# CHECK: bb.3.cond.end:
+# CHECK-NEXT: %5:vecr(s32) = G_PHI %1(s32), %bb.1, %2(s32), %bb.2
+# CHECK-NEXT: %xmm0 = COPY %5(s32)
+# CHECK-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: %edi, %xmm0, %xmm1
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %xmm0
+ %2(s32) = COPY %xmm1
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.1
+ G_BR %bb.2
+
+ bb.1.cond.true:
+ successors: %bb.3(0x80000000)
+
+ G_BR %bb.3
+
+ bb.2.cond.false:
+ successors: %bb.3(0x80000000)
+
+ bb.3.cond.end:
+ %5(s32) = G_PHI %1(s32), %bb.1, %2(s32), %bb.2
+ %xmm0 = COPY %5(s32)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_fpext
+# CHECK-LABEL: name: test_fpext
+alignment: 4
+legalized: true
+regBankSelected: false
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: vecr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: vecr, preferred-register: '' }
+registers:
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+body: |
+ bb.1.entry:
+ liveins: %xmm0
+
+ %0(s32) = COPY %xmm0
+ %1(s64) = G_FPEXT %0(s32)
+ %xmm0 = COPY %1(s64)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_fconstant
+# CHECK-LABEL: name: test_fconstant
+legalized: true
+# CHECK: registers:
+# CHECK-NEXT: - { id: 0, class: vecr, preferred-register: '' }
+# CHECK-NEXT: - { id: 1, class: vecr, preferred-register: '' }
+registers:
+ - { id: 0, class: _ }
+ - { id: 1, class: _ }
+body: |
+ bb.0:
+ %0(s32) = G_FCONSTANT float 1.0
+ %1(s64) = G_FCONSTANT double 2.0
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-GV.mir b/test/CodeGen/X86/GlobalISel/select-GV.mir
index 2f2fd51d99d1..0248ca289596 100644
--- a/test/CodeGen/X86/GlobalISel/select-GV.mir
+++ b/test/CodeGen/X86/GlobalISel/select-GV.mir
@@ -7,9 +7,10 @@
@g_int = global i32 0, align 4
- define i32* @test_global_ptrv() {
+ define void @test_global_ptrv() {
entry:
- ret i32* @g_int
+ store i32* @g_int, i32** undef
+ ret void
}
define i32 @test_global_valv() {
@@ -27,31 +28,45 @@ legalized: true
regBankSelected: true
# X64ALL: registers:
# X64ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
+# X64ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
#
-# X32ALL: registers:
-# X32ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X32-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+#
+# X32ABI: registers:
+# X32ABI-NEXT: - { id: 0, class: low32_addr_access, preferred-register: '' }
+# X32ABI-NEXT: - { id: 1, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
-# X64: %0 = LEA64r _, 1, _, @g_int, _
-# X64-NEXT: %rax = COPY %0
-# X64-NEXT: RET 0, implicit %rax
+ - { id: 1, class: gpr, preferred-register: '' }
+# X64: %0:gr64 = IMPLICIT_DEF
+# X64-NEXT: %1:gr64 = LEA64r %noreg, 1, %noreg, @g_int, %noreg
+# X64-NEXT: MOV64mr %0, 1, %noreg, 0, %noreg, %1 :: (store 8 into `i32** undef`)
+# X64-NEXT: RET 0
#
-# X64_DARWIN_PIC: %0 = LEA64r %rip, 1, _, @g_int, _
-# X64_DARWIN_PIC-NEXT: %rax = COPY %0
-# X64_DARWIN_PIC-NEXT: RET 0, implicit %rax
+# X64_DARWIN_PIC: %0:gr64 = IMPLICIT_DEF
+# X64_DARWIN_PIC-NEXT: %1:gr64 = LEA64r %rip, 1, %noreg, @g_int, %noreg
+# X64_DARWIN_PIC-NEXT: MOV64mr %0, 1, %noreg, 0, %noreg, %1 :: (store 8 into `i32** undef`)
+# X64_DARWIN_PIC-NEXT: RET 0
#
-# X32: %0 = LEA32r _, 1, _, @g_int, _
-# X32-NEXT: %rax = COPY %0
-# X32-NEXT: RET 0, implicit %rax
+# X32: %0:gr32 = IMPLICIT_DEF
+# X32-NEXT: %1:gr32 = LEA32r %noreg, 1, %noreg, @g_int, %noreg
+# X32-NEXT: MOV32mr %0, 1, %noreg, 0, %noreg, %1 :: (store 8 into `i32** undef`)
+# X32-NEXT: RET 0
#
-# X32ABI: %0 = LEA64_32r _, 1, _, @g_int, _
-# X32ABI-NEXT: %rax = COPY %0
-# X32ABI-NEXT: RET 0, implicit %rax
+# X32ABI: %0:low32_addr_access = IMPLICIT_DEF
+# X32ABI-NEXT: %1:gr32 = LEA64_32r %noreg, 1, %noreg, @g_int, %noreg
+# X32ABI-NEXT: MOV32mr %0, 1, %noreg, 0, %noreg, %1 :: (store 8 into `i32** undef`)
+# X32ABI-NEXT: RET 0
body: |
bb.1.entry:
- %0(p0) = G_GLOBAL_VALUE @g_int
- %rax = COPY %0(p0)
- RET 0, implicit %rax
+ liveins: %rdi
+
+ %0(p0) = IMPLICIT_DEF
+ %1(p0) = G_GLOBAL_VALUE @g_int
+ G_STORE %1(p0), %0(p0) :: (store 8 into `i32** undef`)
+ RET 0
...
---
@@ -70,23 +85,23 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# X64: %1 = LEA64r _, 1, _, @g_int, _
-# X64-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X64: %1:gr64 = LEA64r %noreg, 1, %noreg, @g_int, %noreg
+# X64-NEXT: %0:gr32 = MOV32rm %1, 1, %noreg, 0, %noreg :: (load 4 from @g_int)
# X64-NEXT: %eax = COPY %0
# X64-NEXT: RET 0, implicit %eax
#
-# X64_DARWIN_PIC: %1 = LEA64r %rip, 1, _, @g_int, _
-# X64_DARWIN_PIC-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X64_DARWIN_PIC: %1:gr64 = LEA64r %rip, 1, %noreg, @g_int, %noreg
+# X64_DARWIN_PIC-NEXT: %0:gr32 = MOV32rm %1, 1, %noreg, 0, %noreg :: (load 4 from @g_int)
# X64_DARWIN_PIC-NEXT: %eax = COPY %0
# X64_DARWIN_PIC-NEXT: RET 0, implicit %eax
#
-# X32: %1 = LEA32r _, 1, _, @g_int, _
-# X32-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X32: %1:gr32 = LEA32r %noreg, 1, %noreg, @g_int, %noreg
+# X32-NEXT: %0:gr32 = MOV32rm %1, 1, %noreg, 0, %noreg :: (load 4 from @g_int)
# X32-NEXT: %eax = COPY %0
# X32-NEXT: RET 0, implicit %eax
#
-# X32ABI: %1 = LEA64_32r _, 1, _, @g_int, _
-# X32ABI-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (load 4 from @g_int)
+# X32ABI: %1:gr32 = LEA64_32r %noreg, 1, %noreg, @g_int, %noreg
+# X32ABI-NEXT: %0:gr32 = MOV32rm %1, 1, %noreg, 0, %noreg :: (load 4 from @g_int)
# X32ABI-NEXT: %eax = COPY %0
# X32ABI-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-add-v128.mir b/test/CodeGen/X86/GlobalISel/select-add-v128.mir
index 4f7b6ec72d52..7a2f606a45af 100644
--- a/test/CodeGen/X86/GlobalISel/select-add-v128.mir
+++ b/test/CodeGen/X86/GlobalISel/select-add-v128.mir
@@ -49,13 +49,13 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PADDBrr %0, %1
+# SSE2: %2:vr128 = PADDBrr %0, %1
#
-# AVX1: %2 = VPADDBrr %0, %1
+# AVX1: %2:vr128 = VPADDBrr %0, %1
#
-# AVX512VL: %2 = VPADDBrr %0, %1
+# AVX512VL: %2:vr128 = VPADDBrr %0, %1
#
-# AVX512BWVL: %2 = VPADDBZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPADDBZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -91,13 +91,13 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PADDWrr %0, %1
+# SSE2: %2:vr128 = PADDWrr %0, %1
#
-# AVX1: %2 = VPADDWrr %0, %1
+# AVX1: %2:vr128 = VPADDWrr %0, %1
#
-# AVX512VL: %2 = VPADDWrr %0, %1
+# AVX512VL: %2:vr128 = VPADDWrr %0, %1
#
-# AVX512BWVL: %2 = VPADDWZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPADDWZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -133,13 +133,13 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PADDDrr %0, %1
+# SSE2: %2:vr128 = PADDDrr %0, %1
#
-# AVX1: %2 = VPADDDrr %0, %1
+# AVX1: %2:vr128 = VPADDDrr %0, %1
#
-# AVX512VL: %2 = VPADDDZ128rr %0, %1
+# AVX512VL: %2:vr128x = VPADDDZ128rr %0, %1
#
-# AVX512BWVL: %2 = VPADDDZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPADDDZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -175,13 +175,13 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PADDQrr %0, %1
+# SSE2: %2:vr128 = PADDQrr %0, %1
#
-# AVX1: %2 = VPADDQrr %0, %1
+# AVX1: %2:vr128 = VPADDQrr %0, %1
#
-# AVX512VL: %2 = VPADDQZ128rr %0, %1
+# AVX512VL: %2:vr128x = VPADDQZ128rr %0, %1
#
-# AVX512BWVL: %2 = VPADDQZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPADDQZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
diff --git a/test/CodeGen/X86/GlobalISel/select-add-v256.mir b/test/CodeGen/X86/GlobalISel/select-add-v256.mir
index 143fd9422974..8a98a6d87648 100644
--- a/test/CodeGen/X86/GlobalISel/select-add-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-add-v256.mir
@@ -47,11 +47,11 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPADDBYrr %0, %1
+# AVX2: %2:vr256 = VPADDBYrr %0, %1
#
-# AVX512VL: %2 = VPADDBYrr %0, %1
+# AVX512VL: %2:vr256 = VPADDBYrr %0, %1
#
-# AVX512BWVL: %2 = VPADDBZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPADDBZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -87,11 +87,11 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPADDWYrr %0, %1
+# AVX2: %2:vr256 = VPADDWYrr %0, %1
#
-# AVX512VL: %2 = VPADDWYrr %0, %1
+# AVX512VL: %2:vr256 = VPADDWYrr %0, %1
#
-# AVX512BWVL: %2 = VPADDWZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPADDWZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -127,11 +127,11 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPADDDYrr %0, %1
+# AVX2: %2:vr256 = VPADDDYrr %0, %1
#
-# AVX512VL: %2 = VPADDDZ256rr %0, %1
+# AVX512VL: %2:vr256x = VPADDDZ256rr %0, %1
#
-# AVX512BWVL: %2 = VPADDDZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPADDDZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -167,11 +167,11 @@ registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPADDQYrr %0, %1
+# AVX2: %2:vr256 = VPADDQYrr %0, %1
#
-# AVX512VL: %2 = VPADDQZ256rr %0, %1
+# AVX512VL: %2:vr256x = VPADDQZ256rr %0, %1
#
-# AVX512BWVL: %2 = VPADDQZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPADDQZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
diff --git a/test/CodeGen/X86/GlobalISel/select-add-v512.mir b/test/CodeGen/X86/GlobalISel/select-add-v512.mir
index 6a0cd32eefd5..392d22c09003 100644
--- a/test/CodeGen/X86/GlobalISel/select-add-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-add-v512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -26,23 +27,23 @@
...
---
name: test_add_v64i8
-# ALL-LABEL: name: test_add_v64i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPADDBZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v64i8
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPADDBZrr:%[0-9]+]]:vr512 = VPADDBZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPADDBZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<64 x s8>) = COPY %zmm0
%1(<64 x s8>) = COPY %zmm1
%2(<64 x s8>) = G_ADD %0, %1
@@ -52,23 +53,23 @@ body: |
...
---
name: test_add_v32i16
-# ALL-LABEL: name: test_add_v32i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPADDWZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v32i16
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPADDWZrr:%[0-9]+]]:vr512 = VPADDWZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPADDWZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<32 x s16>) = COPY %zmm0
%1(<32 x s16>) = COPY %zmm1
%2(<32 x s16>) = G_ADD %0, %1
@@ -78,23 +79,23 @@ body: |
...
---
name: test_add_v16i32
-# ALL-LABEL: name: test_add_v16i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPADDDZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v16i32
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPADDDZrr:%[0-9]+]]:vr512 = VPADDDZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPADDDZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<16 x s32>) = COPY %zmm0
%1(<16 x s32>) = COPY %zmm1
%2(<16 x s32>) = G_ADD %0, %1
@@ -104,23 +105,23 @@ body: |
...
---
name: test_add_v8i64
-# ALL-LABEL: name: test_add_v8i64
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPADDQZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_add_v8i64
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPADDQZrr:%[0-9]+]]:vr512 = VPADDQZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPADDQZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<8 x s64>) = COPY %zmm0
%1(<8 x s64>) = COPY %zmm1
%2(<8 x s64>) = G_ADD %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/select-add-x32.mir b/test/CodeGen/X86/GlobalISel/select-add-x32.mir
index 0b864f417367..4f04bc58ae6d 100644
--- a/test/CodeGen/X86/GlobalISel/select-add-x32.mir
+++ b/test/CodeGen/X86/GlobalISel/select-add-x32.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=X32
--- |
define i64 @test_add_i64(i64 %a, i64 %b) {
@@ -8,21 +9,9 @@
...
---
name: test_add_i64
-# X32-LABEL: name: test_add_i64
alignment: 4
legalized: true
regBankSelected: true
-# X32: registers:
-# X32-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 2, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 4, class: gpr, preferred-register: '' }
-# X32-NEXT: - { id: 5, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 6, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 7, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 8, class: gr32, preferred-register: '' }
-# X32-NEXT: - { id: 9, class: gpr, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
@@ -34,20 +23,21 @@ registers:
- { id: 7, class: gpr }
- { id: 8, class: gpr }
- { id: 9, class: gpr }
-# X32: %0 = IMPLICIT_DEF
-# X32-NEXT: %1 = IMPLICIT_DEF
-# X32-NEXT: %2 = IMPLICIT_DEF
-# X32-NEXT: %3 = IMPLICIT_DEF
-# X32-NEXT: %5 = ADD32rr %0, %2, implicit-def %eflags
-# X32-NEXT: %6 = COPY %eflags
-# X32-NEXT: %eflags = COPY %6
-# X32-NEXT: %7 = ADC32rr %1, %3, implicit-def %eflags, implicit %eflags
-# X32-NEXT: %8 = COPY %eflags
-# X32-NEXT: %eax = COPY %5
-# X32-NEXT: %edx = COPY %7
-# X32-NEXT: RET 0, implicit %eax, implicit %edx
body: |
bb.0 (%ir-block.0):
+ ; X32-LABEL: name: test_add_i64
+ ; X32: [[DEF:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32: [[DEF1:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32: [[DEF2:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32: [[DEF3:%[0-9]+]]:gr32 = IMPLICIT_DEF
+ ; X32: [[ADD32rr:%[0-9]+]]:gr32 = ADD32rr [[DEF]], [[DEF2]], implicit-def %eflags
+ ; X32: [[COPY:%[0-9]+]]:gr32 = COPY %eflags
+ ; X32: %eflags = COPY [[COPY]]
+ ; X32: [[ADC32rr:%[0-9]+]]:gr32 = ADC32rr [[DEF1]], [[DEF3]], implicit-def %eflags, implicit %eflags
+ ; X32: [[COPY1:%[0-9]+]]:gr32 = COPY %eflags
+ ; X32: %eax = COPY [[ADD32rr]]
+ ; X32: %edx = COPY [[ADC32rr]]
+ ; X32: RET 0, implicit %eax, implicit %edx
%0(s32) = IMPLICIT_DEF
%1(s32) = IMPLICIT_DEF
%2(s32) = IMPLICIT_DEF
diff --git a/test/CodeGen/X86/GlobalISel/select-add.mir b/test/CodeGen/X86/GlobalISel/select-add.mir
index 45811c5cdc26..1f42fd4fc859 100644
--- a/test/CodeGen/X86/GlobalISel/select-add.mir
+++ b/test/CodeGen/X86/GlobalISel/select-add.mir
@@ -40,17 +40,13 @@ name: test_add_i64
# ALL-LABEL: name: test_add_i64
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %rsi
-# ALL-NEXT: %2 = ADD64rr %0, %1
+# ALL: %0:gr64 = COPY %rdi
+# ALL-NEXT: %1:gr64 = COPY %rsi
+# ALL-NEXT: %2:gr64 = ADD64rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
@@ -67,17 +63,13 @@ name: test_add_i32
# ALL-LABEL: name: test_add_i32
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %edi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = ADD32rr %0, %1
+# ALL: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr32 = COPY %esi
+# ALL-NEXT: %2:gr32 = ADD32rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
@@ -85,7 +77,7 @@ body: |
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_ADD %0, %1
- %rax = COPY %2(s32)
+ %eax = COPY %2(s32)
...
---
@@ -95,23 +87,19 @@ alignment: 4
legalized: true
regBankSelected: true
selected: false
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %di
-# ALL: %1 = COPY %si
-# ALL: %2 = ADD16rr %0, %1, implicit-def %eflags
+# ALL: %0:gr16 = COPY %di
+# ALL: %1:gr16 = COPY %si
+# ALL: %2:gr16 = ADD16rr %0, %1, implicit-def %eflags
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_ADD %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -124,23 +112,19 @@ alignment: 4
legalized: true
regBankSelected: true
selected: false
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %dil
-# ALL: %1 = COPY %sil
-# ALL: %2 = ADD8rr %0, %1, implicit-def %eflags
+# ALL: %0:gr8 = COPY %dil
+# ALL: %1:gr8 = COPY %sil
+# ALL: %2:gr8 = ADD8rr %0, %1, implicit-def %eflags
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s8) = G_ADD %0, %1
%al = COPY %2(s8)
RET 0, implicit %al
@@ -154,23 +138,18 @@ legalized: true
regBankSelected: true
selected: false
tracksRegLiveness: true
-# ALL: registers:
-# NO_AVX512VL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %xmm0
-# ALL-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = PADDDrr %0, %1
-# AVX-NEXT: %2 = VPADDDrr %0, %1
-# AVX512F-NEXT: %2 = VPADDDrr %0, %1
-# AVX512VL-NEXT: %2 = VPADDDZ128rr %0, %1
+# NO_AVX512VL: %0:vr128 = COPY %xmm0
+# NO_AVX512VL: %1:vr128 = COPY %xmm1
+# SSE-NEXT: %2:vr128 = PADDDrr %0, %1
+# AVX-NEXT: %2:vr128 = VPADDDrr %0, %1
+# AVX512F-NEXT: %2:vr128 = VPADDDrr %0, %1
+# AVX512VL: %0:vr128x = COPY %xmm0
+# AVX512VL: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: %2:vr128x = VPADDDZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -190,23 +169,22 @@ legalized: true
regBankSelected: true
selected: false
tracksRegLiveness: true
-# ALL: registers:
-# NO_AVX512VL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %xmm0
-# ALL-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = ADDPSrr %0, %1
-# AVX-NEXT: %2 = VADDPSrr %0, %1
-# AVX512F-NEXT: %2 = VADDPSrr %0, %1
-# AVX512VL-NEXT: %2 = VADDPSZ128rr %0, %1
+# SSE: %0:vr128 = COPY %xmm0
+# SSE-NEXT: %1:vr128 = COPY %xmm1
+# SSE-NEXT: %2:vr128 = ADDPSrr %0, %1
+# AVX: %0:vr128 = COPY %xmm0
+# AVX-NEXT: %1:vr128 = COPY %xmm1
+# AVX-NEXT: %2:vr128 = VADDPSrr %0, %1
+# AVX512F: %0:vr128 = COPY %xmm0
+# AVX512F-NEXT: %1:vr128 = COPY %xmm1
+# AVX512F-NEXT: %2:vr128 = VADDPSrr %0, %1
+# AVX512VL: %0:vr128x = COPY %xmm0
+# AVX512VL-NEXT: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: %2:vr128x = VADDPSZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
diff --git a/test/CodeGen/X86/GlobalISel/select-and-scalar.mir b/test/CodeGen/X86/GlobalISel/select-and-scalar.mir
index c40cc224d50e..0ecb8816d794 100644
--- a/test/CodeGen/X86/GlobalISel/select-and-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-and-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -24,14 +25,9 @@
...
---
name: test_and_i8
-# ALL-LABEL: name: test_and_i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -40,17 +36,18 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = COPY %sil
-# ALL-NEXT: %2 = AND8rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %al = COPY %2
-# ALL-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ ; ALL-LABEL: name: test_and_i8
+ ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
+ ; ALL: [[AND8rr:%[0-9]+]]:gr8 = AND8rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %al = COPY [[AND8rr]]
+ ; ALL: RET 0, implicit %al
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s8) = G_AND %0, %1
%al = COPY %2(s8)
RET 0, implicit %al
@@ -58,14 +55,9 @@ body: |
...
---
name: test_and_i16
-# ALL-LABEL: name: test_and_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -74,17 +66,18 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = COPY %si
-# ALL-NEXT: %2 = AND16rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %ax = COPY %2
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; ALL-LABEL: name: test_and_i16
+ ; ALL: [[COPY:%[0-9]+]]:gr16 = COPY %di
+ ; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY %si
+ ; ALL: [[AND16rr:%[0-9]+]]:gr16 = AND16rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %ax = COPY [[AND16rr]]
+ ; ALL: RET 0, implicit %ax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_AND %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -92,14 +85,9 @@ body: |
...
---
name: test_and_i32
-# ALL-LABEL: name: test_and_i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -108,15 +96,16 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %edi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = AND32rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %eax = COPY %2
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; ALL-LABEL: name: test_and_i32
+ ; ALL: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; ALL: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %eax = COPY [[AND32rr]]
+ ; ALL: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_AND %0, %1
@@ -126,14 +115,9 @@ body: |
...
---
name: test_and_i64
-# ALL-LABEL: name: test_and_i64
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -142,15 +126,16 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %rsi
-# ALL-NEXT: %2 = AND64rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %rax = COPY %2
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; ALL-LABEL: name: test_and_i64
+ ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
+ ; ALL: [[AND64rr:%[0-9]+]]:gr64 = AND64rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %rax = COPY [[AND64rr]]
+ ; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s64) = G_AND %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/select-blsi.mir b/test/CodeGen/X86/GlobalISel/select-blsi.mir
new file mode 100644
index 000000000000..024788371341
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-blsi.mir
@@ -0,0 +1,63 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+bmi -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test that rules where multiple operands must be the same operand successfully
+# match. Also test that the rules do not match when they're not the same
+# operand.
+#
+# This test covers the case when OtherInsnID and OtherOpIdx are different in a
+# GIM_CheckIsSameOperand.
+
+---
+name: test_blsi32rr
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# G_SUB and G_AND both use %0 so we should match this.
+body: |
+ bb.1:
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_blsi32rr
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[BLSI32rr:%[0-9]+]]:gr32 = BLSI32rr [[COPY]], implicit-def %eflags
+ ; CHECK: %edi = COPY [[BLSI32rr]]
+ %0(s32) = COPY %edi
+ %1(s32) = G_CONSTANT i32 0
+ %2(s32) = G_SUB %1, %0
+ %3(s32) = G_AND %2, %0
+ %edi = COPY %3
+
+...
+---
+name: test_blsi32rr_nomatch
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# G_SUB and G_AND use different operands so we shouldn't match this.
+body: |
+ bb.1:
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_blsi32rr_nomatch
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def %eflags
+ ; CHECK: [[SUB32ri:%[0-9]+]]:gr32 = SUB32ri [[MOV32r0_]], 0, implicit-def %eflags
+ ; CHECK: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[SUB32ri]], [[COPY]], implicit-def %eflags
+ ; CHECK: %edi = COPY [[AND32rr]]
+ %0(s32) = COPY %edi
+ %1(s32) = G_CONSTANT i32 0
+ %2(s32) = G_SUB %1, %1
+ %3(s32) = G_AND %2, %0
+ %edi = COPY %3
+...
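(Aside for readers unfamiliar with BLSI: the (0 - x) & x dataflow that the G_SUB/G_AND pair above feeds into BLSI32rr is the standard "isolate lowest set bit" idiom. A minimal C sketch of the scalar equivalent follows; it is illustrative only, and the helper name blsi_equiv is made up for this sketch rather than taken from the patch.)

    #include <stdint.h>
    #include <stdio.h>

    /* Same dataflow as the matched MIR: %2 = G_SUB 0, %0 ; %3 = G_AND %2, %0 */
    static uint32_t blsi_equiv(uint32_t x) {
      return (0u - x) & x;   /* keeps only the lowest set bit of x */
    }

    int main(void) {
      printf("0x%08x\n", blsi_equiv(0x00f0u)); /* prints 0x00000010 */
      return 0;
    }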
diff --git a/test/CodeGen/X86/GlobalISel/select-blsr.mir b/test/CodeGen/X86/GlobalISel/select-blsr.mir
new file mode 100644
index 000000000000..95c6cfdef807
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-blsr.mir
@@ -0,0 +1,60 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+bmi -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test that rules where multiple operands must be the same operand successfully
+# match. Also test that the rules do not match when they're not the same
+# operand.
+
+---
+name: test_blsr32rr
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# G_ADD and G_AND both use %0 so we should match this.
+body: |
+ bb.1:
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_blsr32rr
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[BLSR32rr:%[0-9]+]]:gr32 = BLSR32rr [[COPY]], implicit-def %eflags
+ ; CHECK: %edi = COPY [[BLSR32rr]]
+ %0(s32) = COPY %edi
+ %1(s32) = G_CONSTANT i32 -1
+ %2(s32) = G_ADD %0, %1
+ %3(s32) = G_AND %2, %0
+ %edi = COPY %3
+
+...
+---
+name: test_blsr32rr_nomatch
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+# G_ADD and G_AND use different operands so we shouldn't match this.
+body: |
+ bb.1:
+ liveins: %edi
+
+ ; CHECK-LABEL: name: test_blsr32rr_nomatch
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 4294967295
+ ; CHECK: [[DEC32r:%[0-9]+]]:gr32 = DEC32r [[MOV32ri]], implicit-def %eflags
+ ; CHECK: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[DEC32r]], [[COPY]], implicit-def %eflags
+ ; CHECK: %edi = COPY [[AND32rr]]
+ %0(s32) = COPY %edi
+ %1(s32) = G_CONSTANT i32 -1
+ %2(s32) = G_ADD %1, %1
+ %3(s32) = G_AND %2, %0
+ %edi = COPY %3
+...
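(Similarly, the G_ADD-with--1/G_AND pair selected to BLSR32rr above is the "reset lowest set bit" idiom, x & (x - 1). A minimal C sketch of the scalar equivalent, again illustrative only with a made-up helper name:)

    #include <stdint.h>
    #include <stdio.h>

    /* Same dataflow as the matched MIR: %2 = G_ADD %0, -1 ; %3 = G_AND %2, %0 */
    static uint32_t blsr_equiv(uint32_t x) {
      return (x - 1u) & x;   /* clears the lowest set bit of x */
    }

    int main(void) {
      printf("0x%08x\n", blsr_equiv(0x00f0u)); /* prints 0x000000e0 */
      return 0;
    }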
diff --git a/test/CodeGen/X86/GlobalISel/select-br.mir b/test/CodeGen/X86/GlobalISel/select-br.mir
index 9d2a878e7575..8d231cc26ba5 100644
--- a/test/CodeGen/X86/GlobalISel/select-br.mir
+++ b/test/CodeGen/X86/GlobalISel/select-br.mir
@@ -20,20 +20,20 @@ name: uncondbr
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: JMP_1 %bb.2.bb2
-# CHECK: JMP_1 %bb.1.end
+# CHECK: JMP_1 %bb.2
+# CHECK: JMP_1 %bb.1
body: |
bb.1.entry:
- successors: %bb.3.bb2(0x80000000)
+ successors: %bb.3(0x80000000)
- G_BR %bb.3.bb2
+ G_BR %bb.3
bb.2.end:
RET 0
bb.3.bb2:
- successors: %bb.2.end(0x80000000)
+ successors: %bb.2(0x80000000)
- G_BR %bb.2.end
+ G_BR %bb.2
...
diff --git a/test/CodeGen/X86/GlobalISel/select-brcond.mir b/test/CodeGen/X86/GlobalISel/select-brcond.mir
new file mode 100644
index 000000000000..00a9cc044ea4
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-brcond.mir
@@ -0,0 +1,66 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+
+--- |
+
+ define i32 @test(i32 %a) {
+ entry:
+ %cmp = trunc i32 %a to i1
+ br i1 %cmp, label %true, label %false
+
+ true: ; preds = %entry
+ ret i32 0
+
+ false: ; preds = %entry
+ ret i32 1
+ }
+
+...
+---
+name: test
+# CHECK-LABEL: name: test
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+ - { id: 2, class: gpr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+# X64: %0:gr32 = COPY %edi
+# X32: %0:gr32_abcd = COPY %edi
+# CHECK-NEXT: %2:gr32 = MOV32r0 implicit-def %eflags
+# CHECK-NEXT: %3:gr32 = MOV32ri 1
+# CHECK-NEXT: %1:gr8 = COPY %0.sub_8bit
+# CHECK-NEXT: TEST8ri %1, 1, implicit-def %eflags
+# CHECK-NEXT: JNE_1 %[[TRUE:bb.[0-9]+]], implicit %eflags
+# CHECK-NEXT: JMP_1 %[[FALSE:bb.[0-9]+]]
+# CHECK: [[TRUE]].{{[a-zA-Z0-9]+}}:
+# CHECK-NEXT: %eax = COPY %2
+# CHECK-NEXT: RET 0, implicit %eax
+# CHECK: [[FALSE]].{{[a-zA-Z0-9]+}}:
+# CHECK-NEXT: %eax = COPY %3
+# CHECK-NEXT: RET 0, implicit %eax
+
+
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %2(s32) = G_CONSTANT i32 0
+ %3(s32) = G_CONSTANT i32 1
+ %1(s1) = G_TRUNC %0(s32)
+ G_BRCOND %1(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.true:
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+
+ bb.3.false:
+ %eax = COPY %3(s32)
+ RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir
index 64c8cb6b823a..3457e971b8d4 100644
--- a/test/CodeGen/X86/GlobalISel/select-cmp.mir
+++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
@@ -82,35 +83,29 @@
...
---
name: test_icmp_eq_i8
-# CHECK-LABEL: name: test_icmp_eq_i8
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %dil
-# CHECK-NEXT: %1 = COPY %sil
-# CHECK-NEXT: CMP8rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ ; CHECK-LABEL: name: test_icmp_eq_i8
+ ; CHECK: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
+ ; CHECK: CMP8rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s1) = G_ICMP intpred(eq), %0(s8), %1
%3(s32) = G_ZEXT %2(s1)
%eax = COPY %3(s32)
@@ -119,35 +114,29 @@ body: |
...
---
name: test_icmp_eq_i16
-# CHECK-LABEL: name: test_icmp_eq_i16
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr16, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %di
-# CHECK-NEXT: %1 = COPY %si
-# CHECK-NEXT: CMP16rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; CHECK-LABEL: name: test_icmp_eq_i16
+ ; CHECK: [[COPY:%[0-9]+]]:gr16 = COPY %di
+ ; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY %si
+ ; CHECK: CMP16rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s1) = G_ICMP intpred(eq), %0(s16), %1
%3(s32) = G_ZEXT %2(s1)
%eax = COPY %3(s32)
@@ -156,33 +145,27 @@ body: |
...
---
name: test_icmp_eq_i64
-# CHECK-LABEL: name: test_icmp_eq_i64
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %rdi
-# CHECK-NEXT: %1 = COPY %rsi
-# CHECK-NEXT: CMP64rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; CHECK-LABEL: name: test_icmp_eq_i64
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
+ ; CHECK: CMP64rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s1) = G_ICMP intpred(eq), %0(s64), %1
@@ -193,33 +176,27 @@ body: |
...
---
name: test_icmp_eq_i32
-# CHECK-LABEL: name: test_icmp_eq_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_eq_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETEr:%[0-9]+]]:gr8 = SETEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(eq), %0(s32), %1
@@ -230,33 +207,27 @@ body: |
...
---
name: test_icmp_ne_i32
-# CHECK-LABEL: name: test_icmp_ne_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETNEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_ne_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETNEr:%[0-9]+]]:gr8 = SETNEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETNEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(ne), %0(s32), %1
@@ -267,33 +238,27 @@ body: |
...
---
name: test_icmp_ugt_i32
-# CHECK-LABEL: name: test_icmp_ugt_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETAr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_ugt_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETAr:%[0-9]+]]:gr8 = SETAr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(ugt), %0(s32), %1
@@ -304,33 +269,27 @@ body: |
...
---
name: test_icmp_uge_i32
-# CHECK-LABEL: name: test_icmp_uge_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETAEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_uge_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETAEr:%[0-9]+]]:gr8 = SETAEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETAEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(uge), %0(s32), %1
@@ -341,33 +300,27 @@ body: |
...
---
name: test_icmp_ult_i32
-# CHECK-LABEL: name: test_icmp_ult_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETBr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_ult_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETBr:%[0-9]+]]:gr8 = SETBr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(ult), %0(s32), %1
@@ -378,33 +331,27 @@ body: |
...
---
name: test_icmp_ule_i32
-# CHECK-LABEL: name: test_icmp_ule_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETBEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_ule_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETBEr:%[0-9]+]]:gr8 = SETBEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETBEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(ule), %0(s32), %1
@@ -415,33 +362,27 @@ body: |
...
---
name: test_icmp_sgt_i32
-# CHECK-LABEL: name: test_icmp_sgt_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETGr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_sgt_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETGr:%[0-9]+]]:gr8 = SETGr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(sgt), %0(s32), %1
@@ -452,33 +393,27 @@ body: |
...
---
name: test_icmp_sge_i32
-# CHECK-LABEL: name: test_icmp_sge_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETGEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_sge_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETGEr:%[0-9]+]]:gr8 = SETGEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETGEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(sge), %0(s32), %1
@@ -489,33 +424,27 @@ body: |
...
---
name: test_icmp_slt_i32
-# CHECK-LABEL: name: test_icmp_slt_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETLr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_slt_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETLr:%[0-9]+]]:gr8 = SETLr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(slt), %0(s32), %1
@@ -526,33 +455,27 @@ body: |
...
---
name: test_icmp_sle_i32
-# CHECK-LABEL: name: test_icmp_sle_i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr8, preferred-register: '' }
-# CHECK-NEXT: - { id: 3, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 4, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
- { id: 3, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %esi
-# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags
-# CHECK-NEXT: %2 = SETLEr implicit %eflags
-# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1
-# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags
-# CHECK-NEXT: %eax = COPY %3
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; CHECK-LABEL: name: test_icmp_sle_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; CHECK: CMP32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; CHECK: [[SETLEr:%[0-9]+]]:gr8 = SETLEr implicit %eflags
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr32 = SUBREG_TO_REG 0, [[SETLEr]], %subreg.sub_8bit
+ ; CHECK: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; CHECK: %eax = COPY [[AND32ri8_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s1) = G_ICMP intpred(sle), %0(s32), %1
diff --git a/test/CodeGen/X86/GlobalISel/select-constant.mir b/test/CodeGen/X86/GlobalISel/select-constant.mir
index 30f57418b4ce..b083288781c1 100644
--- a/test/CodeGen/X86/GlobalISel/select-constant.mir
+++ b/test/CodeGen/X86/GlobalISel/select-constant.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
@@ -40,15 +41,14 @@ name: const_i8
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i8
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV8ri 2
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i8
+ ; CHECK: [[MOV8ri:%[0-9]+]]:gr8 = MOV8ri 2
+ ; CHECK: %al = COPY [[MOV8ri]]
+ ; CHECK: RET 0, implicit %al
%0(s8) = G_CONSTANT i8 2
%al = COPY %0(s8)
RET 0, implicit %al
@@ -59,15 +59,14 @@ name: const_i16
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i16
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV16ri 3
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i16
+ ; CHECK: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 3
+ ; CHECK: %ax = COPY [[MOV16ri]]
+ ; CHECK: RET 0, implicit %ax
%0(s16) = G_CONSTANT i16 3
%ax = COPY %0(s16)
RET 0, implicit %ax
@@ -78,15 +77,14 @@ name: const_i32
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i32
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV32ri 4
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i32
+ ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 4
+ ; CHECK: %eax = COPY [[MOV32ri]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = G_CONSTANT i32 4
%eax = COPY %0(s32)
RET 0, implicit %eax
@@ -94,16 +92,16 @@ body: |
...
---
name: const_i32_0
-# CHECK-LABEL: name: const_i32_0
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: %0 = MOV32r0 implicit-def %eflags
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i32_0
+ ; CHECK: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def %eflags
+ ; CHECK: %eax = COPY [[MOV32r0_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = G_CONSTANT i32 0
%eax = COPY %0(s32)
RET 0, implicit %eax
@@ -114,15 +112,14 @@ name: const_i64
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i64
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV64ri 68719476720
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i64
+ ; CHECK: [[MOV64ri:%[0-9]+]]:gr64 = MOV64ri 68719476720
+ ; CHECK: %rax = COPY [[MOV64ri]]
+ ; CHECK: RET 0, implicit %rax
%0(s64) = G_CONSTANT i64 68719476720
%rax = COPY %0(s64)
RET 0, implicit %rax
@@ -134,15 +131,14 @@ alignment: 4
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i64_u32
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV64ri32 1879048192
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i64_u32
+ ; CHECK: [[MOV64ri32_:%[0-9]+]]:gr64 = MOV64ri32 1879048192
+ ; CHECK: %rax = COPY [[MOV64ri32_]]
+ ; CHECK: RET 0, implicit %rax
%0(s64) = G_CONSTANT i64 1879048192
%rax = COPY %0(s64)
RET 0, implicit %rax
@@ -153,15 +149,14 @@ name: const_i64_i32
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i64_i32
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV64ri32 -1
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i64_i32
+ ; CHECK: [[MOV64ri32_:%[0-9]+]]:gr64 = MOV64ri32 -1
+ ; CHECK: %rax = COPY [[MOV64ri32_]]
+ ; CHECK: RET 0, implicit %rax
%0(s64) = G_CONSTANT i64 -1
%rax = COPY %0(s64)
RET 0, implicit %rax
@@ -169,24 +164,21 @@ body: |
...
---
name: main
-# CHECK-LABEL: name: main
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# CHECK: %0 = COPY %rdi
-# CHECK-NEXT: %1 = MOV64ri32 0
-# CHECK-NEXT: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.data)
-# CHECK-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; CHECK-LABEL: name: main
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[MOV64ri32_:%[0-9]+]]:gr64 = MOV64ri32 0
+ ; CHECK: MOV64mr [[COPY]], 1, %noreg, 0, %noreg, [[MOV64ri32_]] :: (store 8 into %ir.data)
+ ; CHECK: RET 0
%0(p0) = COPY %rdi
%1(p0) = G_CONSTANT i64 0
G_STORE %1(p0), %0(p0) :: (store 8 into %ir.data)
diff --git a/test/CodeGen/X86/GlobalISel/select-copy.mir b/test/CodeGen/X86/GlobalISel/select-copy.mir
new file mode 100644
index 000000000000..fccba1f82068
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -0,0 +1,185 @@
+# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+--- |
+
+ define void @test_copy() {
+ ret void
+ }
+
+ define void @test_copy2() {
+ ret void
+ }
+
+ define void @test_copy3() {
+ ret void
+ }
+
+ define void @test_copy4() {
+ ret void
+ }
+
+ define void @test_copy5() {
+ ret void
+ }
+
+ define void @test_copy6() {
+ ret void
+ }
+
+...
+---
+name: test_copy
+# ALL-LABEL: name: test_copy
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0:gr8 = COPY %al
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
+# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %eax
+
+ %0(s1) = COPY %al
+ %1(s32) = G_ZEXT %0(s1)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_copy2
+# ALL-LABEL: name: test_copy2
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0:gr8 = COPY %al
+# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %eax
+
+ %0(s8) = COPY %al
+ %1(s32) = G_ZEXT %0(s8)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_copy3
+# ALL-LABEL: name: test_copy3
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0:gr8 = COPY %al
+# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %eax
+
+ %0(s8) = COPY %ax
+ %1(s32) = G_ZEXT %0(s8)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_copy4
+# ALL-LABEL: name: test_copy4
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+# ALL: %0:gr16 = COPY %ax
+# ALL-NEXT: %1:gr32 = MOVZX32rr16 %0
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %eax
+
+ %0(s16) = COPY %eax
+ %1(s32) = G_ZEXT %0(s16)
+ %eax = COPY %1(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_copy5
+# ALL-LABEL: name: test_copy5
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+# ALL: %0:gr8 = COPY %dl
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %eax,%edx
+
+ %0(s8) = COPY %edx
+ %eax = COPY %0(s8)
+ RET 0, implicit %eax
+
+...
+---
+name: test_copy6
+# ALL-LABEL: name: test_copy6
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+# ALL: %0:gr16 = COPY %dx
+# ALL-NEXT: %1:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_16bit
+# ALL-NEXT: %eax = COPY %1
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %eax,%edx
+
+ %0(s16) = COPY %edx
+ %eax = COPY %0(s16)
+ RET 0, implicit %eax
+
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
index edb467b2bf90..9df24f65b368 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
--- |
@@ -17,33 +18,32 @@
ret i64 %r
}
+ define void @anyext_s64_from_s1() { ret void }
+ define void @anyext_s64_from_s8() { ret void }
+ define void @anyext_s64_from_s16() { ret void }
+ define void @anyext_s64_from_s32() { ret void }
...
---
name: test_zext_i1
-# ALL-LABEL: name: test_zext_i1
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = COPY %0
-# ALL-NEXT: %3 = SUBREG_TO_REG 0, %1, 1
-# ALL-NEXT: %2 = AND64ri8 %3, 1, implicit-def %eflags
-# ALL-NEXT: %rax = COPY %2
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; ALL-LABEL: name: test_zext_i1
+ ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]]
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
+ ; ALL: [[AND64ri8_:%[0-9]+]]:gr64 = AND64ri8 [[SUBREG_TO_REG]], 1, implicit-def %eflags
+ ; ALL: %rax = COPY [[AND64ri8_]]
+ ; ALL: RET 0, implicit %rax
+ %0(s8) = COPY %dil
%1(s1) = G_TRUNC %0(s8)
%2(s64) = G_ZEXT %1(s1)
%rax = COPY %2(s64)
@@ -52,25 +52,22 @@ body: |
...
---
name: test_sext_i8
-# ALL-LABEL: name: test_sext_i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = MOVSX64rr8 %0
-# ALL-NEXT: %rax = COPY %1
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ ; ALL-LABEL: name: test_sext_i8
+ ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; ALL: [[MOVSX64rr8_:%[0-9]+]]:gr64 = MOVSX64rr8 [[COPY]]
+ ; ALL: %rax = COPY [[MOVSX64rr8_]]
+ ; ALL: RET 0, implicit %rax
+ %0(s8) = COPY %dil
%1(s64) = G_SEXT %0(s8)
%rax = COPY %1(s64)
RET 0, implicit %rax
@@ -78,27 +75,124 @@ body: |
...
---
name: test_sext_i16
-# ALL-LABEL: name: test_sext_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = MOVSX64rr16 %0
-# ALL-NEXT: %rax = COPY %1
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ ; ALL-LABEL: name: test_sext_i16
+ ; ALL: [[COPY:%[0-9]+]]:gr16 = COPY %di
+ ; ALL: [[MOVSX64rr16_:%[0-9]+]]:gr64 = MOVSX64rr16 [[COPY]]
+ ; ALL: %rax = COPY [[MOVSX64rr16_]]
+ ; ALL: RET 0, implicit %rax
+ %0(s16) = COPY %di
%1(s64) = G_SEXT %0(s16)
%rax = COPY %1(s64)
RET 0, implicit %rax
...
+---
+name: anyext_s64_from_s1
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; ALL-LABEL: name: anyext_s64_from_s1
+ ; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
+ ; ALL: %rax = COPY [[SUBREG_TO_REG]]
+ ; ALL: RET 0, implicit %rax
+ %0(s64) = COPY %rdi
+ %1(s1) = G_TRUNC %0(s64)
+ %2(s64) = G_ANYEXT %1(s1)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+...
+---
+name: anyext_s64_from_s8
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; ALL-LABEL: name: anyext_s64_from_s8
+ ; ALL: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_8bit
+ ; ALL: %rax = COPY [[SUBREG_TO_REG]]
+ ; ALL: RET 0, implicit %rax
+ %0(s64) = COPY %rdi
+ %1(s8) = G_TRUNC %0(s64)
+ %2(s64) = G_ANYEXT %1(s8)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+...
+---
+name: anyext_s64_from_s16
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; ALL-LABEL: name: anyext_s64_from_s16
+ ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_16bit
+ ; ALL: %rax = COPY [[SUBREG_TO_REG]]
+ ; ALL: RET 0, implicit %rax
+ %0(s64) = COPY %rdi
+ %1(s16) = G_TRUNC %0(s64)
+ %2(s64) = G_ANYEXT %1(s16)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+...
+---
+name: anyext_s64_from_s32
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; ALL-LABEL: name: anyext_s64_from_s32
+ ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]].sub_32bit
+ ; ALL: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.sub_32bit
+ ; ALL: %rax = COPY [[SUBREG_TO_REG]]
+ ; ALL: RET 0, implicit %rax
+ %0(s64) = COPY %rdi
+ %1(s32) = G_TRUNC %0(s64)
+ %2(s64) = G_ANYEXT %1(s32)
+ %rax = COPY %2(s64)
+ RET 0, implicit %rax
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir
index b6734e5aa2b8..90ac0c6763aa 100644
--- a/test/CodeGen/X86/GlobalISel/select-ext.mir
+++ b/test/CodeGen/X86/GlobalISel/select-ext.mir
@@ -37,6 +37,13 @@
ret i32 %r
}
+ define void @test_anyext_i1toi8() { ret void }
+ define void @test_anyext_i1toi16() { ret void }
+ define void @test_anyext_i1toi32() { ret void }
+ define void @test_anyext_i8toi16() { ret void }
+ define void @test_anyext_i8toi32() { ret void }
+ define void @test_anyext_i16toi32() { ret void }
+
...
---
name: test_zext_i1toi8
@@ -50,8 +57,8 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = AND8ri %0, 1, implicit-def %eflags
+# ALL: %0:gr8 = COPY %dil
+# ALL-NEXT: %1:gr8 = AND8ri %0, 1, implicit-def %eflags
# ALL-NEXT: %al = COPY %1
# ALL-NEXT: RET 0, implicit %al
body: |
@@ -77,9 +84,9 @@ regBankSelected: true
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1
-# ALL-NEXT: %1 = AND16ri8 %2, 1, implicit-def %eflags
+# ALL: %0:gr8 = COPY %dil
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
+# ALL-NEXT: %1:gr16 = AND16ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %ax = COPY %1
# ALL-NEXT: RET 0, implicit %ax
body: |
@@ -105,9 +112,9 @@ regBankSelected: true
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1
-# ALL-NEXT: %1 = AND32ri8 %2, 1, implicit-def %eflags
+# ALL: %0:gr8 = COPY %dil
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %0, %subreg.sub_8bit
+# ALL-NEXT: %1:gr32 = AND32ri8 %2, 1, implicit-def %eflags
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -132,15 +139,15 @@ regBankSelected: true
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = MOVZX32rr8 %0
+# ALL: %0:gr8 = COPY %dil
+# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ %0(s8) = COPY %dil
%1(s32) = G_ZEXT %0(s8)
%eax = COPY %1(s32)
RET 0, implicit %eax
@@ -158,15 +165,15 @@ regBankSelected: true
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = MOVZX32rr16 %0
+# ALL: %0:gr16 = COPY %di
+# ALL-NEXT: %1:gr32 = MOVZX32rr16 %0
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ %0(s16) = COPY %di
%1(s32) = G_ZEXT %0(s16)
%eax = COPY %1(s32)
RET 0, implicit %eax
@@ -184,15 +191,15 @@ regBankSelected: true
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = MOVSX32rr8 %0
+# ALL: %0:gr8 = COPY %dil
+# ALL-NEXT: %1:gr32 = MOVSX32rr8 %0
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s8) = COPY %edi
+ %0(s8) = COPY %dil
%1(s32) = G_SEXT %0(s8)
%eax = COPY %1(s32)
RET 0, implicit %eax
@@ -210,17 +217,221 @@ regBankSelected: true
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = MOVSX32rr16 %0
+# ALL: %0:gr16 = COPY %di
+# ALL-NEXT: %1:gr32 = MOVSX32rr16 %0
# ALL-NEXT: %eax = COPY %1
# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi
- %0(s16) = COPY %edi
+ %0(s16) = COPY %di
%1(s32) = G_SEXT %0(s16)
%eax = COPY %1(s32)
RET 0, implicit %eax
...
+---
+name: test_anyext_i1toi8
+# ALL-LABEL: name: test_anyext_i1toi8
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32_abcd, preferred-register: '' }
+# X32-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X32-NEXT: - { id: 2, class: gr8, preferred-register: '' }
+#
+# X64: registers:
+# X64-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X64-NEXT: - { id: 2, class: gr8, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# X32: %0:gr32_abcd = COPY %edi
+# X64: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
+# ALL-NEXT: %2:gr8 = COPY %1
+# ALL-NEXT: %al = COPY %2
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s8) = G_ANYEXT %1(s1)
+ %al = COPY %2(s8)
+ RET 0, implicit %al
+...
+---
+name: test_anyext_i1toi16
+# ALL-LABEL: name: test_anyext_i1toi16
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32_abcd, preferred-register: '' }
+# X32-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X32-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+#
+# X64: registers:
+# X64-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X64-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# X32: %0:gr32_abcd = COPY %edi
+# X64: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
+# ALL-NEXT: %ax = COPY %2
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s16) = G_ANYEXT %1(s1)
+ %ax = COPY %2(s16)
+ RET 0, implicit %ax
+...
+---
+name: test_anyext_i1toi32
+# ALL-LABEL: name: test_anyext_i1toi32
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32_abcd, preferred-register: '' }
+# X32-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X32-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+#
+# X64: registers:
+# X64-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X64-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# X32: %0:gr32_abcd = COPY %edi
+# X64: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
+# ALL-NEXT: %eax = COPY %2
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %1(s1) = G_TRUNC %0(s32)
+ %2(s32) = G_ANYEXT %1(s1)
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+...
+---
+name: test_anyext_i8toi16
+# ALL-LABEL: name: test_anyext_i8toi16
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32_abcd, preferred-register: '' }
+# X32-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X32-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+#
+# X64: registers:
+# X64-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X64-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# X32: %0:gr32_abcd = COPY %edi
+# X64: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
+# ALL-NEXT: %2:gr16 = SUBREG_TO_REG 0, %1, %subreg.sub_8bit
+# ALL-NEXT: %ax = COPY %2
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %1(s8) = G_TRUNC %0(s32)
+ %2(s16) = G_ANYEXT %1(s8)
+ %ax = COPY %2(s16)
+ RET 0, implicit %ax
+...
+---
+name: test_anyext_i8toi32
+# ALL-LABEL: name: test_anyext_i8toi32
+alignment: 4
+legalized: true
+regBankSelected: true
+# X32: registers:
+# X32-NEXT: - { id: 0, class: gr32_abcd, preferred-register: '' }
+# X32-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X32-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+#
+# X64: registers:
+# X64-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# X64-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# X64-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# X32: %0:gr32_abcd = COPY %edi
+# X64: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr8 = COPY %0.sub_8bit
+# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
+# ALL-NEXT: %eax = COPY %2
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %1(s8) = G_TRUNC %0(s32)
+ %2(s32) = G_ANYEXT %1(s8)
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+...
+---
+name: test_anyext_i16toi32
+# ALL-LABEL: name: test_anyext_i16toi32
+alignment: 4
+legalized: true
+regBankSelected: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr }
+ - { id: 1, class: gpr }
+ - { id: 2, class: gpr }
+# ALL: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
+# ALL-NEXT: %2:gr32 = SUBREG_TO_REG 0, %1, %subreg.sub_16bit
+# ALL-NEXT: %eax = COPY %2
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ %0(s32) = COPY %edi
+ %1(s16) = G_TRUNC %0(s32)
+ %2(s32) = G_ANYEXT %1(s16)
+ %eax = COPY %2(s32)
+ RET 0, implicit %eax
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir b/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir
index 89bb84932cc0..01f43be153bc 100644
--- a/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-extract-vec256.mir
@@ -27,8 +27,10 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %ymm1
-# ALL-NEXT: %1 = COPY %0.sub_xmm
+# AVX: %0:vr256 = COPY %ymm1
+# AVX-NEXT: %1:vr128 = COPY %0.sub_xmm
+# AVX512VL: %0:vr256x = COPY %ymm1
+# AVX512VL-NEXT: %1:vr128x = COPY %0.sub_xmm
# ALL-NEXT: %xmm0 = COPY %1
# ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -57,13 +59,13 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# AVX: %0 = COPY %ymm1
-# AVX-NEXT: %1 = VEXTRACTF128rr %0, 1
+# AVX: %0:vr256 = COPY %ymm1
+# AVX-NEXT: %1:vr128 = VEXTRACTF128rr %0, 1
# AVX-NEXT: %xmm0 = COPY %1
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512VL: %0 = COPY %ymm1
-# AVX512VL-NEXT: %1 = VEXTRACTF32x4Z256rr %0, 1
+# AVX512VL: %0:vr256x = COPY %ymm1
+# AVX512VL-NEXT: %1:vr128x = VEXTRACTF32x4Z256rr %0, 1
# AVX512VL-NEXT: %xmm0 = COPY %1
# AVX512VL-NEXT: RET 0, implicit %xmm0
body: |
@@ -76,5 +78,3 @@ body: |
RET 0, implicit %xmm0
...
-
-
diff --git a/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir b/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir
index a0f0d6f39d45..b17b9793d101 100644
--- a/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-extract-vec512.mir
@@ -32,8 +32,8 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %zmm1
-# ALL-NEXT: %1 = COPY %0.sub_xmm
+# ALL: %0:vr512 = COPY %zmm1
+# ALL-NEXT: %1:vr128x = COPY %0.sub_xmm
# ALL-NEXT: %xmm0 = COPY %1
# ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -58,8 +58,8 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %zmm1
-# ALL-NEXT: %1 = VEXTRACTF32x4Zrr %0, 1
+# ALL: %0:vr512 = COPY %zmm1
+# ALL-NEXT: %1:vr128x = VEXTRACTF32x4Zrr %0, 1
# ALL-NEXT: %xmm0 = COPY %1
# ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -84,8 +84,8 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %zmm1
-# ALL-NEXT: %1 = COPY %0.sub_ymm
+# ALL: %0:vr512 = COPY %zmm1
+# ALL-NEXT: %1:vr256x = COPY %0.sub_ymm
# ALL-NEXT: %ymm0 = COPY %1
# ALL-NEXT: RET 0, implicit %ymm0
body: |
@@ -110,8 +110,8 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %zmm1
-# ALL-NEXT: %1 = VEXTRACTF64x4Zrr %0, 1
+# ALL: %0:vr512 = COPY %zmm1
+# ALL-NEXT: %1:vr256x = VEXTRACTF64x4Zrr %0, 1
# ALL-NEXT: %ymm0 = COPY %1
# ALL-NEXT: RET 0, implicit %ymm0
body: |
@@ -124,4 +124,3 @@ body: |
RET 0, implicit %ymm0
...
-
diff --git a/test/CodeGen/X86/GlobalISel/select-fadd-scalar.mir b/test/CodeGen/X86/GlobalISel/select-fadd-scalar.mir
index fa4c529982cc..da8262bc38fc 100644
--- a/test/CodeGen/X86/GlobalISel/select-fadd-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-fadd-scalar.mir
@@ -38,21 +38,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = ADDSSrr %0, %1
+# SSE: %0:fr32 = COPY %xmm0
+# SSE-NEXT: %1:fr32 = COPY %xmm1
+# SSE-NEXT: %2:fr32 = ADDSSrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VADDSSrr %0, %1
+# AVX: %0:fr32 = COPY %xmm0
+# AVX-NEXT: %1:fr32 = COPY %xmm1
+# AVX-NEXT: %2:fr32 = VADDSSrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VADDSSZrr %0, %1
+# AVX512ALL: %0:fr32x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr32x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr32x = VADDSSZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -89,21 +89,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = ADDSDrr %0, %1
+# SSE: %0:fr64 = COPY %xmm0
+# SSE-NEXT: %1:fr64 = COPY %xmm1
+# SSE-NEXT: %2:fr64 = ADDSDrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VADDSDrr %0, %1
+# AVX: %0:fr64 = COPY %xmm0
+# AVX-NEXT: %1:fr64 = COPY %xmm1
+# AVX-NEXT: %2:fr64 = VADDSDrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VADDSDZrr %0, %1
+# AVX512ALL: %0:fr64x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr64x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr64x = VADDSDZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-fconstant.mir b/test/CodeGen/X86/GlobalISel/select-fconstant.mir
new file mode 100644
index 000000000000..8855d2be68c5
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-fconstant.mir
@@ -0,0 +1,85 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK_SMALL --check-prefix=CHECK_SMALL64 --check-prefix=CHECK_NOPIC64
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -code-model=large -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK_LARGE --check-prefix=CHECK_LARGE64
+# RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK32 --check-prefix=CHECK_SMALL --check-prefix=CHECK_SMALL32
+# RUN: llc -mtriple=i386-linux-gnu -mattr=+sse2 -global-isel -code-model=large -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK32 --check-prefix=CHECK_LARGE --check-prefix=CHECK_LARGE32
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sse2 -global-isel -relocation-model=pic -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK64 --check-prefix=CHECK_SMALL --check-prefix=CHECK_SMALL64 --check-prefix=CHECK_PIC64
+
+--- |
+ define float @test_float() {
+ entry:
+ ret float 5.500000e+00
+ }
+
+ define double @test_double() {
+ entry:
+ ret double 5.500000e+00
+ }
+---
+name: test_float
+# CHECK64-LABEL: name: test_float
+#
+# CHECK32-LABEL: name: test_float
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vecr, preferred-register: '' }
+# CHECK_SMALL64: %0:fr32 = MOVSSrm %rip, 1, %noreg, %const.0, %noreg
+# CHECK_SMALL64-NEXT: %xmm0 = COPY %0
+# CHECK_SMALL64-NEXT: RET 0, implicit %xmm0
+#
+# CHECK_LARGE64: %1:gr64 = MOV64ri %const.0
+# CHECK_LARGE64-NEXT: %0:fr32 = MOVSSrm %1, 1, %noreg, 0, %noreg :: (load 8 from constant-pool, align 32)
+# CHECK_LARGE64-NEXT: %xmm0 = COPY %0
+# CHECK_LARGE64-NEXT: RET 0, implicit %xmm0
+#
+# CHECK32: %0:fr32 = MOVSSrm %noreg, 1, %noreg, %const.0, %noreg
+# CHECK32-NEXT: %xmm0 = COPY %0
+# CHECK32-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1.entry:
+ %0(s32) = G_FCONSTANT float 5.500000e+00
+ %xmm0 = COPY %0(s32)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_double
+# CHECK64-LABEL: name: test_double
+#
+# CHECK32-LABEL: name: test_double
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# CHECK_SMALL64: registers:
+# CHECK_SMALL64-NEXT: - { id: 0, class: fr64, preferred-register: '' }
+#
+# CHECK_LARGE64: registers:
+# CHECK_LARGE64-NEXT: - { id: 0, class: fr64, preferred-register: '' }
+# CHECK_LARGE64-NEXT: - { id: 1, class: gr64, preferred-register: '' }
+#
+# CHECK32: registers:
+# CHECK32-NEXT: - { id: 0, class: fr64, preferred-register: '' }
+registers:
+ - { id: 0, class: vecr, preferred-register: '' }
+# CHECK_SMALL64: %0:fr64 = MOVSDrm %rip, 1, %noreg, %const.0, %noreg
+# CHECK_SMALL64-NEXT: %xmm0 = COPY %0
+# CHECK_SMALL64-NEXT: RET 0, implicit %xmm0
+#
+# CHECK_LARGE64: %1:gr64 = MOV64ri %const.0
+# CHECK_LARGE64-NEXT: %0:fr64 = MOVSDrm %1, 1, %noreg, 0, %noreg :: (load 8 from constant-pool, align 64)
+# CHECK_LARGE64-NEXT: %xmm0 = COPY %0
+# CHECK_LARGE64-NEXT: RET 0, implicit %xmm0
+#
+# CHECK32: %0:fr64 = MOVSDrm %noreg, 1, %noreg, %const.0, %noreg
+# CHECK32-NEXT: %xmm0 = COPY %0
+# CHECK32-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1.entry:
+ %0(s64) = G_FCONSTANT double 5.500000e+00
+ %xmm0 = COPY %0(s64)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-fdiv-scalar.mir b/test/CodeGen/X86/GlobalISel/select-fdiv-scalar.mir
index d2c1d1528652..7dec4c5dffd7 100644
--- a/test/CodeGen/X86/GlobalISel/select-fdiv-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-fdiv-scalar.mir
@@ -38,21 +38,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = DIVSSrr %0, %1
+# SSE: %0:fr32 = COPY %xmm0
+# SSE-NEXT: %1:fr32 = COPY %xmm1
+# SSE-NEXT: %2:fr32 = DIVSSrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VDIVSSrr %0, %1
+# AVX: %0:fr32 = COPY %xmm0
+# AVX-NEXT: %1:fr32 = COPY %xmm1
+# AVX-NEXT: %2:fr32 = VDIVSSrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VDIVSSZrr %0, %1
+# AVX512ALL: %0:fr32x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr32x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr32x = VDIVSSZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -89,21 +89,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = DIVSDrr %0, %1
+# SSE: %0:fr64 = COPY %xmm0
+# SSE-NEXT: %1:fr64 = COPY %xmm1
+# SSE-NEXT: %2:fr64 = DIVSDrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VDIVSDrr %0, %1
+# AVX: %0:fr64 = COPY %xmm0
+# AVX-NEXT: %1:fr64 = COPY %xmm1
+# AVX-NEXT: %2:fr64 = VDIVSDrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VDIVSDZrr %0, %1
+# AVX512ALL: %0:fr64x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr64x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr64x = VDIVSDZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-fmul-scalar.mir b/test/CodeGen/X86/GlobalISel/select-fmul-scalar.mir
index 98e5d303d7b1..ef4195d5d74d 100644
--- a/test/CodeGen/X86/GlobalISel/select-fmul-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-fmul-scalar.mir
@@ -38,21 +38,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = MULSSrr %0, %1
+# SSE: %0:fr32 = COPY %xmm0
+# SSE-NEXT: %1:fr32 = COPY %xmm1
+# SSE-NEXT: %2:fr32 = MULSSrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VMULSSrr %0, %1
+# AVX: %0:fr32 = COPY %xmm0
+# AVX-NEXT: %1:fr32 = COPY %xmm1
+# AVX-NEXT: %2:fr32 = VMULSSrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VMULSSZrr %0, %1
+# AVX512ALL: %0:fr32x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr32x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr32x = VMULSSZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -89,21 +89,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = MULSDrr %0, %1
+# SSE: %0:fr64 = COPY %xmm0
+# SSE-NEXT: %1:fr64 = COPY %xmm1
+# SSE-NEXT: %2:fr64 = MULSDrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VMULSDrr %0, %1
+# AVX: %0:fr64 = COPY %xmm0
+# AVX-NEXT: %1:fr64 = COPY %xmm1
+# AVX-NEXT: %2:fr64 = VMULSDrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VMULSDZrr %0, %1
+# AVX512ALL: %0:fr64x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr64x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr64x = VMULSDZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-fpext-scalar.mir b/test/CodeGen/X86/GlobalISel/select-fpext-scalar.mir
new file mode 100644
index 000000000000..00dfa6ae726f
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-fpext-scalar.mir
@@ -0,0 +1,38 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+--- |
+
+ define double @test(float %a) {
+ entry:
+ %conv = fpext float %a to double
+ ret double %conv
+ }
+
+...
+---
+name: test
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: vecr, preferred-register: '' }
+ - { id: 1, class: vecr, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+body: |
+ bb.1.entry:
+ liveins: %xmm0
+
+ ; ALL-LABEL: name: test
+ ; ALL: [[COPY:%[0-9]+]]:fr32 = COPY %xmm0
+ ; ALL: [[CVTSS2SDrr:%[0-9]+]]:fr64 = CVTSS2SDrr [[COPY]]
+ ; ALL: %xmm0 = COPY [[CVTSS2SDrr]]
+ ; ALL: RET 0, implicit %xmm0
+ %0(s32) = COPY %xmm0
+ %1(s64) = G_FPEXT %0(s32)
+ %xmm0 = COPY %1(s64)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-frameIndex.mir b/test/CodeGen/X86/GlobalISel/select-frameIndex.mir
index 1d641ba279af..5d0573ecc49d 100644
--- a/test/CodeGen/X86/GlobalISel/select-frameIndex.mir
+++ b/test/CodeGen/X86/GlobalISel/select-frameIndex.mir
@@ -24,9 +24,9 @@ registers:
stack:
- { id: 0, name: ptr1, offset: 0, size: 4, alignment: 4 }
-# CHECK-X32: %0 = LEA32r %stack.0.ptr1, 1, _, 0, _
-# CHECK-X32ABI: %0 = LEA64_32r %stack.0.ptr1, 1, _, 0, _
-# CHECK-X64: %0 = LEA64r %stack.0.ptr1, 1, _, 0, _
+# CHECK-X32: %0 = LEA32r %stack.0.ptr1, 1, %noreg, 0, %noreg
+# CHECK-X32ABI: %0 = LEA64_32r %stack.0.ptr1, 1, %noreg, 0, %noreg
+# CHECK-X64: %0 = LEA64r %stack.0.ptr1, 1, %noreg, 0, %noreg
body: |
bb.1 (%ir-block.0):
%0(p0) = G_FRAME_INDEX %stack.0.ptr1
diff --git a/test/CodeGen/X86/GlobalISel/select-fsub-scalar.mir b/test/CodeGen/X86/GlobalISel/select-fsub-scalar.mir
index 9f58327d9bb6..e72bf4bac19f 100644
--- a/test/CodeGen/X86/GlobalISel/select-fsub-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-fsub-scalar.mir
@@ -38,21 +38,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = SUBSSrr %0, %1
+# SSE: %0:fr32 = COPY %xmm0
+# SSE-NEXT: %1:fr32 = COPY %xmm1
+# SSE-NEXT: %2:fr32 = SUBSSrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VSUBSSrr %0, %1
+# AVX: %0:fr32 = COPY %xmm0
+# AVX-NEXT: %1:fr32 = COPY %xmm1
+# AVX-NEXT: %2:fr32 = VSUBSSrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VSUBSSZrr %0, %1
+# AVX512ALL: %0:fr32x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr32x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr32x = VSUBSSZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
@@ -89,21 +89,21 @@ liveins:
fixedStack:
stack:
constants:
-# SSE: %0 = COPY %xmm0
-# SSE-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = SUBSDrr %0, %1
+# SSE: %0:fr64 = COPY %xmm0
+# SSE-NEXT: %1:fr64 = COPY %xmm1
+# SSE-NEXT: %2:fr64 = SUBSDrr %0, %1
# SSE-NEXT: %xmm0 = COPY %2
# SSE-NEXT: RET 0, implicit %xmm0
#
-# AVX: %0 = COPY %xmm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VSUBSDrr %0, %1
+# AVX: %0:fr64 = COPY %xmm0
+# AVX-NEXT: %1:fr64 = COPY %xmm1
+# AVX-NEXT: %2:fr64 = VSUBSDrr %0, %1
# AVX-NEXT: %xmm0 = COPY %2
# AVX-NEXT: RET 0, implicit %xmm0
#
-# AVX512ALL: %0 = COPY %xmm0
-# AVX512ALL-NEXT: %1 = COPY %xmm1
-# AVX512ALL-NEXT: %2 = VSUBSDZrr %0, %1
+# AVX512ALL: %0:fr64x = COPY %xmm0
+# AVX512ALL-NEXT: %1:fr64x = COPY %xmm1
+# AVX512ALL-NEXT: %2:fr64x = VSUBSDZrr %0, %1
# AVX512ALL-NEXT: %xmm0 = COPY %2
# AVX512ALL-NEXT: RET 0, implicit %xmm0
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-gep.mir b/test/CodeGen/X86/GlobalISel/select-gep.mir
index 61c766230035..b78afd2803aa 100644
--- a/test/CodeGen/X86/GlobalISel/select-gep.mir
+++ b/test/CodeGen/X86/GlobalISel/select-gep.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
@@ -12,22 +13,20 @@ alignment: 4
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: test_gep_i32
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr64_nosp, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# CHECK: body:
-# CHECK: %1 = MOV64ri32 20
-# CHECK-NEXT: %2 = LEA64r %0, 1, %1, 0, _
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; CHECK-LABEL: name: test_gep_i32
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[MOV64ri32_:%[0-9]+]]:gr64_nosp = MOV64ri32 20
+ ; CHECK: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 1, [[MOV64ri32_]], 0, %noreg
+ ; CHECK: %rax = COPY [[LEA64r]]
+ ; CHECK: RET 0, implicit %rax
%0(p0) = COPY %rdi
%1(s64) = G_CONSTANT i64 20
%2(p0) = G_GEP %0, %1(s64)
diff --git a/test/CodeGen/X86/GlobalISel/select-inc.mir b/test/CodeGen/X86/GlobalISel/select-inc.mir
index 47fe6ef672ba..b2cfa4724b26 100644
--- a/test/CodeGen/X86/GlobalISel/select-inc.mir
+++ b/test/CodeGen/X86/GlobalISel/select-inc.mir
@@ -15,16 +15,15 @@ regBankSelected: true
# ALL: registers:
# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
# INC-NEXT: - { id: 1, class: gpr, preferred-register: '' }
-# ADD-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# ADD-NEXT: - { id: 1, class: gpr, preferred-register: '' }
# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %al
-# INC-NEXT: %2 = INC8r %0
-# ADD-NEXT: %1 = MOV8ri 1
-# ADD-NEXT: %2 = ADD8rr %0, %1
+# ALL: %0:gr8 = COPY %al
+# INC-NEXT: %2:gr8 = INC8r %0
+# ADD-NEXT: %2:gr8 = ADD8ri %0, 1
body: |
bb.1 (%ir-block.0):
liveins: %al
diff --git a/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir b/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir
index 923dc22678fb..744dfd6c8200 100644
--- a/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-insert-vec256.mir
@@ -24,28 +24,19 @@ name: test_insert_128_idx0
alignment: 4
legalized: true
regBankSelected: true
-# AVX: registers:
-# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX: %0 = COPY %ymm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VINSERTF128rr %0, %1, 0
+# AVX: %0:vr256 = COPY %ymm0
+# AVX-NEXT: %1:vr128 = COPY %xmm1
+# AVX-NEXT: %2:vr256 = VINSERTF128rr %0, %1, 0
# AVX-NEXT: %ymm0 = COPY %2
# AVX-NEXT: RET 0, implicit %ymm0
#
-# AVX512VL: %0 = COPY %ymm0
-# AVX512VL-NEXT: %1 = COPY %xmm1
-# AVX512VL-NEXT: %2 = VINSERTF32x4Z256rr %0, %1, 0
+# AVX512VL: %0:vr256x = COPY %ymm0
+# AVX512VL-NEXT: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: %2:vr256x = VINSERTF32x4Z256rr %0, %1, 0
# AVX512VL-NEXT: %ymm0 = COPY %2
# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
@@ -65,23 +56,19 @@ name: test_insert_128_idx0_undef
alignment: 4
legalized: true
regBankSelected: true
-# AVX: registers:
-# AVX-NEXT: - { id: 0, class: vecr, preferred-register: '' }
-# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vecr, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %1 = COPY %xmm1
-# ALL-NEXT: undef %2.sub_xmm = COPY %1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
+# AVX: %1:vr128 = COPY %xmm1
+# AVX-NEXT: undef %2.sub_xmm:vr256 = COPY %1
+# AVX-NEXT: %ymm0 = COPY %2
+# AVX-NEXT: RET 0, implicit %ymm0
+#
+# AVX512VL: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: undef %2.sub_xmm:vr256x = COPY %1
+# AVX512VL-NEXT: %ymm0 = COPY %2
+# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -99,28 +86,19 @@ name: test_insert_128_idx1
alignment: 4
legalized: true
regBankSelected: true
-# AVX: registers:
-# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX: %0 = COPY %ymm0
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VINSERTF128rr %0, %1, 1
+# AVX: %0:vr256 = COPY %ymm0
+# AVX-NEXT: %1:vr128 = COPY %xmm1
+# AVX-NEXT: %2:vr256 = VINSERTF128rr %0, %1, 1
# AVX-NEXT: %ymm0 = COPY %2
# AVX-NEXT: RET 0, implicit %ymm0
#
-# AVX512VL: %0 = COPY %ymm0
-# AVX512VL-NEXT: %1 = COPY %xmm1
-# AVX512VL-NEXT: %2 = VINSERTF32x4Z256rr %0, %1, 1
+# AVX512VL: %0:vr256x = COPY %ymm0
+# AVX512VL-NEXT: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: %2:vr256x = VINSERTF32x4Z256rr %0, %1, 1
# AVX512VL-NEXT: %ymm0 = COPY %2
# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
@@ -139,28 +117,19 @@ name: test_insert_128_idx1_undef
alignment: 4
legalized: true
regBankSelected: true
-# AVX: registers:
-# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX: %0 = IMPLICIT_DEF
-# AVX-NEXT: %1 = COPY %xmm1
-# AVX-NEXT: %2 = VINSERTF128rr %0, %1, 1
+# AVX: %0:vr256 = IMPLICIT_DEF
+# AVX-NEXT: %1:vr128 = COPY %xmm1
+# AVX-NEXT: %2:vr256 = VINSERTF128rr %0, %1, 1
# AVX-NEXT: %ymm0 = COPY %2
# AVX-NEXT: RET 0, implicit %ymm0
#
-# AVX512VL: %0 = IMPLICIT_DEF
-# AVX512VL-NEXT: %1 = COPY %xmm1
-# AVX512VL-NEXT: %2 = VINSERTF32x4Z256rr %0, %1, 1
+# AVX512VL: %0:vr256x = IMPLICIT_DEF
+# AVX512VL-NEXT: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: %2:vr256x = VINSERTF32x4Z256rr %0, %1, 1
# AVX512VL-NEXT: %ymm0 = COPY %2
# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
@@ -173,4 +142,3 @@ body: |
%ymm0 = COPY %2(<8 x s32>)
RET 0, implicit %ymm0
...
-
diff --git a/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir b/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir
index 3eddc083805a..45ed7289494f 100644
--- a/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-insert-vec512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -36,236 +37,203 @@
...
---
name: test_insert_128_idx0
-# ALL-LABEL: name: test_insert_128_idx0
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %zmm0
-# ALL-NEXT: %1 = COPY %xmm1
-# ALL-NEXT: %2 = VINSERTF32x4Zrr %0, %1, 0
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %ymm1
+ ; ALL-LABEL: name: test_insert_128_idx0
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr128x = COPY %xmm1
+ ; ALL: [[VINSERTF32x4Zrr:%[0-9]+]]:vr512 = VINSERTF32x4Zrr [[COPY]], [[COPY1]], 0
+ ; ALL: %zmm0 = COPY [[VINSERTF32x4Zrr]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = COPY %zmm0
%1(<4 x s32>) = COPY %xmm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<4 x s32>), 0
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_128_idx0_undef
-# ALL-LABEL: name: test_insert_128_idx0_undef
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vecr, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %1 = COPY %xmm1
-# ALL-NEXT: undef %2.sub_xmm = COPY %1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_insert_128_idx0_undef
+ ; ALL: [[COPY:%[0-9]+]]:vr128x = COPY %xmm1
+ ; ALL: undef %2.sub_xmm:vr512 = COPY [[COPY]]
+ ; ALL: %zmm0 = COPY %2
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<4 x s32>) = COPY %xmm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<4 x s32>), 0
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_128_idx1
-# ALL-LABEL: name: test_insert_128_idx1
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %zmm0
-# ALL-NEXT: %1 = COPY %xmm1
-# ALL-NEXT: %2 = VINSERTF32x4Zrr %0, %1, 1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_insert_128_idx1
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr128x = COPY %xmm1
+ ; ALL: [[VINSERTF32x4Zrr:%[0-9]+]]:vr512 = VINSERTF32x4Zrr [[COPY]], [[COPY1]], 1
+ ; ALL: %zmm0 = COPY [[VINSERTF32x4Zrr]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = COPY %zmm0
%1(<4 x s32>) = COPY %xmm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<4 x s32>), 128
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_128_idx1_undef
-# ALL-LABEL: name: test_insert_128_idx1_undef
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = IMPLICIT_DEF
-# ALL-NEXT: %1 = COPY %xmm1
-# ALL-NEXT: %2 = VINSERTF32x4Zrr %0, %1, 1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_insert_128_idx1_undef
+ ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
+ ; ALL: [[COPY:%[0-9]+]]:vr128x = COPY %xmm1
+ ; ALL: [[VINSERTF32x4Zrr:%[0-9]+]]:vr512 = VINSERTF32x4Zrr [[DEF]], [[COPY]], 1
+ ; ALL: %zmm0 = COPY [[VINSERTF32x4Zrr]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<4 x s32>) = COPY %xmm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<4 x s32>), 128
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_256_idx0
-# ALL-LABEL: name: test_insert_256_idx0
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %zmm0
-# ALL-NEXT: %1 = COPY %ymm1
-# ALL-NEXT: %2 = VINSERTF64x4Zrr %0, %1, 0
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %ymm1
+ ; ALL-LABEL: name: test_insert_256_idx0
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr256x = COPY %ymm1
+ ; ALL: [[VINSERTF64x4Zrr:%[0-9]+]]:vr512 = VINSERTF64x4Zrr [[COPY]], [[COPY1]], 0
+ ; ALL: %zmm0 = COPY [[VINSERTF64x4Zrr]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = COPY %zmm0
%1(<8 x s32>) = COPY %ymm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<8 x s32>), 0
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_256_idx0_undef
-# ALL-LABEL: name: test_insert_256_idx0_undef
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vecr, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %1 = COPY %ymm1
-# ALL-NEXT: undef %2.sub_ymm = COPY %1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_insert_256_idx0_undef
+ ; ALL: [[COPY:%[0-9]+]]:vr256x = COPY %ymm1
+ ; ALL: undef %2.sub_ymm:vr512 = COPY [[COPY]]
+ ; ALL: %zmm0 = COPY %2
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<8 x s32>) = COPY %ymm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<8 x s32>), 0
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_256_idx1
-# ALL-LABEL: name: test_insert_256_idx1
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %zmm0
-# ALL-NEXT: %1 = COPY %ymm1
-# ALL-NEXT: %2 = VINSERTF64x4Zrr %0, %1, 1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_insert_256_idx1
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr256x = COPY %ymm1
+ ; ALL: [[VINSERTF64x4Zrr:%[0-9]+]]:vr512 = VINSERTF64x4Zrr [[COPY]], [[COPY1]], 1
+ ; ALL: %zmm0 = COPY [[VINSERTF64x4Zrr]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = COPY %zmm0
%1(<8 x s32>) = COPY %ymm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<8 x s32>), 256
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
---
name: test_insert_256_idx1_undef
-# ALL-LABEL: name: test_insert_256_idx1_undef
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = IMPLICIT_DEF
-# ALL-NEXT: %1 = COPY %ymm1
-# ALL-NEXT: %2 = VINSERTF64x4Zrr %0, %1, 1
-# ALL-NEXT: %ymm0 = COPY %2
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; ALL-LABEL: name: test_insert_256_idx1_undef
+ ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
+ ; ALL: [[COPY:%[0-9]+]]:vr256x = COPY %ymm1
+ ; ALL: [[VINSERTF64x4Zrr:%[0-9]+]]:vr512 = VINSERTF64x4Zrr [[DEF]], [[COPY]], 1
+ ; ALL: %zmm0 = COPY [[VINSERTF64x4Zrr]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<8 x s32>) = COPY %ymm1
%2(<16 x s32>) = G_INSERT %0(<16 x s32>), %1(<8 x s32>), 256
- %ymm0 = COPY %2(<16 x s32>)
+ %zmm0 = COPY %2(<16 x s32>)
RET 0, implicit %ymm0
...
-
diff --git a/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
new file mode 100644
index 000000000000..628ab3bac4ab
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-intrinsic-x86-flags-read-u32.mir
@@ -0,0 +1,27 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=i386-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+ define void @read_flags() { ret void }
+...
+
+---
+# Check that we select the x86.flags.read.u32 intrinsic into a RDFLAGS
+# instruction. Also check that we constrain the register class of the COPY to
+# gr32.
+name: read_flags
+legalized: true
+regBankSelected: true
+
+registers:
+ - { id: 0, class: gpr }
+
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: read_flags
+ ; CHECK: [[RDFLAGS32_:%[0-9]+]]:gr32 = RDFLAGS32 implicit-def %esp, implicit %esp
+ ; CHECK: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, [[RDFLAGS32_]], %subreg.sub_32bit
+ ; CHECK: %rax = COPY [[SUBREG_TO_REG]]
+ %0(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.x86.flags.read.u32)
+ %rax = COPY %0(s32)
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir
index 9128f19b1d24..897f9a56a202 100644
--- a/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir
+++ b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=i586-linux-gnu -mcpu=haswell -mattr=-slow-incdec -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
#
# This is necessary to test that attribute-based rule predicates work and that
@@ -27,15 +28,14 @@ name: const_i32_1
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i32_1
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV32ri 1
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i32_1
+ ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
+ ; CHECK: %eax = COPY [[MOV32ri]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = G_CONSTANT i32 1
%eax = COPY %0(s32)
RET 0, implicit %eax
@@ -45,15 +45,14 @@ name: const_i32_1_optsize
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i32_1_optsize
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV32r1
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i32_1_optsize
+ ; CHECK: [[MOV32r1_:%[0-9]+]]:gr32 = MOV32r1 implicit-def %eflags
+ ; CHECK: %eax = COPY [[MOV32r1_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = G_CONSTANT i32 1
%eax = COPY %0(s32)
RET 0, implicit %eax
@@ -63,15 +62,14 @@ name: const_i32_1b
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i32_1b
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV32ri 1
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i32_1b
+ ; CHECK: [[MOV32ri:%[0-9]+]]:gr32 = MOV32ri 1
+ ; CHECK: %eax = COPY [[MOV32ri]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = G_CONSTANT i32 1
%eax = COPY %0(s32)
RET 0, implicit %eax
@@ -81,15 +79,14 @@ name: const_i32_1_optsizeb
legalized: true
regBankSelected: true
selected: false
-# CHECK-LABEL: name: const_i32_1_optsizeb
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
-# CHECK: body:
-# CHECK: %0 = MOV32r1
body: |
bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i32_1_optsizeb
+ ; CHECK: [[MOV32r1_:%[0-9]+]]:gr32 = MOV32r1 implicit-def %eflags
+ ; CHECK: %eax = COPY [[MOV32r1_]]
+ ; CHECK: RET 0, implicit %eax
%0(s32) = G_CONSTANT i32 1
%eax = COPY %0(s32)
RET 0, implicit %eax
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir
index af09ea049295..804d7bce671b 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -44,26 +45,22 @@
...
---
name: test_load_i8
-# ALL-LABEL: name: test_load_i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
fixedStack:
- { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV32rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0)
-# ALL-NEXT: %2 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1)
-# ALL-NEXT: %al = COPY %2
-# ALL-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_load_i8
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV8rm:%[0-9]+]]:gr8 = MOV8rm [[MOV32rm]], 1, %noreg, 0, %noreg :: (load 1 from %ir.p1)
+ ; ALL: %al = COPY [[MOV8rm]]
+ ; ALL: RET 0, implicit %al
%1(p0) = G_FRAME_INDEX %fixed-stack.0
%0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
%2(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1)
@@ -73,26 +70,22 @@ body: |
...
---
name: test_load_i16
-# ALL-LABEL: name: test_load_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
fixedStack:
- { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV32rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0)
-# ALL-NEXT: %2 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1)
-# ALL-NEXT: %ax = COPY %2
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_load_i16
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV16rm:%[0-9]+]]:gr16 = MOV16rm [[MOV32rm]], 1, %noreg, 0, %noreg :: (load 2 from %ir.p1)
+ ; ALL: %ax = COPY [[MOV16rm]]
+ ; ALL: RET 0, implicit %ax
%1(p0) = G_FRAME_INDEX %fixed-stack.0
%0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
%2(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1)
@@ -102,26 +95,22 @@ body: |
...
---
name: test_load_i32
-# ALL-LABEL: name: test_load_i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
fixedStack:
- { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV32rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0)
-# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
-# ALL-NEXT: %eax = COPY %2
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_load_i32
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[MOV32rm]], 1, %noreg, 0, %noreg :: (load 4 from %ir.p1)
+ ; ALL: %eax = COPY [[MOV32rm1]]
+ ; ALL: RET 0, implicit %eax
%1(p0) = G_FRAME_INDEX %fixed-stack.0
%0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
%2(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1)
@@ -131,15 +120,9 @@ body: |
...
---
name: test_store_i8
-# ALL-LABEL: name: test_store_i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: gpr, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
@@ -148,13 +131,14 @@ registers:
fixedStack:
- { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false }
- { id: 1, offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV8rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 1 from %fixed-stack.0, align 0)
-# ALL-NEXT: %1 = MOV32rm %fixed-stack.1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0)
-# ALL-NEXT: MOV8mr %1, 1, _, 0, _, %0 :: (store 1 into %ir.p1)
-# ALL-NEXT: %eax = COPY %1
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_store_i8
+ ; ALL: [[MOV8rm:%[0-9]+]]:gr8 = MOV8rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 1 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.1, align 0)
+ ; ALL: MOV8mr [[MOV32rm]], 1, %noreg, 0, %noreg, [[MOV8rm]] :: (store 1 into %ir.p1)
+ ; ALL: %eax = COPY [[MOV32rm]]
+ ; ALL: RET 0, implicit %eax
%2(p0) = G_FRAME_INDEX %fixed-stack.1
%0(s8) = G_LOAD %2(p0) :: (invariant load 1 from %fixed-stack.1, align 0)
%3(p0) = G_FRAME_INDEX %fixed-stack.0
@@ -166,15 +150,9 @@ body: |
...
---
name: test_store_i16
-# ALL-LABEL: name: test_store_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: gpr, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
@@ -183,13 +161,14 @@ registers:
fixedStack:
- { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false }
- { id: 1, offset: 0, size: 2, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV16rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 2 from %fixed-stack.0, align 0)
-# ALL-NEXT: %1 = MOV32rm %fixed-stack.1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0)
-# ALL-NEXT: MOV16mr %1, 1, _, 0, _, %0 :: (store 2 into %ir.p1)
-# ALL-NEXT: %eax = COPY %1
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_store_i16
+ ; ALL: [[MOV16rm:%[0-9]+]]:gr16 = MOV16rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 2 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.1, align 0)
+ ; ALL: MOV16mr [[MOV32rm]], 1, %noreg, 0, %noreg, [[MOV16rm]] :: (store 2 into %ir.p1)
+ ; ALL: %eax = COPY [[MOV32rm]]
+ ; ALL: RET 0, implicit %eax
%2(p0) = G_FRAME_INDEX %fixed-stack.1
%0(s16) = G_LOAD %2(p0) :: (invariant load 2 from %fixed-stack.1, align 0)
%3(p0) = G_FRAME_INDEX %fixed-stack.0
@@ -201,15 +180,9 @@ body: |
...
---
name: test_store_i32
-# ALL-LABEL: name: test_store_i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: gpr, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
@@ -218,13 +191,14 @@ registers:
fixedStack:
- { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false }
- { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV32rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0)
-# ALL-NEXT: %1 = MOV32rm %fixed-stack.1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0)
-# ALL-NEXT: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
-# ALL-NEXT: %eax = COPY %1
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_store_i32
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.1, align 0)
+ ; ALL: MOV32mr [[MOV32rm1]], 1, %noreg, 0, %noreg, [[MOV32rm]] :: (store 4 into %ir.p1)
+ ; ALL: %eax = COPY [[MOV32rm1]]
+ ; ALL: RET 0, implicit %eax
%2(p0) = G_FRAME_INDEX %fixed-stack.1
%0(s32) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0)
%3(p0) = G_FRAME_INDEX %fixed-stack.0
@@ -236,26 +210,22 @@ body: |
...
---
name: test_load_ptr
-# ALL-LABEL: name: test_load_ptr
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
fixedStack:
- { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV32rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0)
-# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.ptr1)
-# ALL-NEXT: %eax = COPY %2
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_load_ptr
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[MOV32rm]], 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr1)
+ ; ALL: %eax = COPY [[MOV32rm1]]
+ ; ALL: RET 0, implicit %eax
%1(p0) = G_FRAME_INDEX %fixed-stack.0
%0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0)
%2(p0) = G_LOAD %0(p0) :: (load 4 from %ir.ptr1)
@@ -265,15 +235,9 @@ body: |
...
---
name: test_store_ptr
-# ALL-LABEL: name: test_store_ptr
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gpr, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: gpr, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
@@ -282,12 +246,13 @@ registers:
fixedStack:
- { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false }
- { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
-# ALL: %0 = MOV32rm %fixed-stack.0, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0)
-# ALL-NEXT: %1 = MOV32rm %fixed-stack.1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0)
-# ALL-NEXT: MOV32mr %0, 1, _, 0, _, %1 :: (store 4 into %ir.ptr1)
-# ALL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_store_ptr
+ ; ALL: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.0, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.0, align 0)
+ ; ALL: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.1, 1, %noreg, 0, %noreg :: (invariant load 4 from %fixed-stack.1, align 0)
+ ; ALL: MOV32mr [[MOV32rm]], 1, %noreg, 0, %noreg, [[MOV32rm1]] :: (store 4 into %ir.ptr1)
+ ; ALL: RET 0
%2(p0) = G_FRAME_INDEX %fixed-stack.1
%0(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0)
%3(p0) = G_FRAME_INDEX %fixed-stack.0
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
index de79aac9f300..35e1659a53c1 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
@@ -109,8 +109,8 @@ registers:
# ALL: - { id: 1, class: gr8, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr8 = MOV8rm %0, 1, %noreg, 0, %noreg :: (load 1 from %ir.p1)
# ALL: %al = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -133,8 +133,8 @@ registers:
# ALL: - { id: 1, class: gr16, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr16 = MOV16rm %0, 1, %noreg, 0, %noreg :: (load 2 from %ir.p1)
# ALL: %ax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -157,8 +157,8 @@ registers:
# ALL: - { id: 1, class: gr32, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr32 = MOV32rm %0, 1, %noreg, 0, %noreg :: (load 4 from %ir.p1)
# ALL: %eax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -181,8 +181,8 @@ registers:
# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr64 = MOV64rm %0, 1, %noreg, 0, %noreg :: (load 8 from %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -205,8 +205,8 @@ registers:
# ALL: - { id: 1, class: gr32, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr32 = MOV32rm %0, 1, %noreg, 0, %noreg :: (load 4 from %ir.p1)
# ALL: %xmm0 = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -225,15 +225,12 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# ALL: - { id: 0, class: gr64, preferred-register: '' }
-# NO_AVX512F: - { id: 1, class: fr32, preferred-register: '' }
-# AVX512ALL: - { id: 1, class: fr32x, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %rdi
-# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
-# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
-# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# SSE: %1:fr32 = MOVSSrm %0, 1, %noreg, 0, %noreg :: (load 4 from %ir.p1)
+# AVX: %1:fr32 = VMOVSSrm %0, 1, %noreg, 0, %noreg :: (load 4 from %ir.p1)
+# AVX512ALL: %1:fr32x = VMOVSSZrm %0, 1, %noreg, 0, %noreg :: (load 4 from %ir.p1)
# ALL: %xmm0 = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -256,8 +253,8 @@ registers:
# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr64 = MOV64rm %0, 1, %noreg, 0, %noreg :: (load 8 from %ir.p1)
# ALL: %xmm0 = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -276,15 +273,12 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# ALL: - { id: 0, class: gr64, preferred-register: '' }
-# NO_AVX512F: - { id: 1, class: fr64, preferred-register: '' }
-# AVX512ALL: - { id: 1, class: fr64x, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %rdi
-# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
-# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
-# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# SSE: %1:fr64 = MOVSDrm %0, 1, %noreg, 0, %noreg :: (load 8 from %ir.p1)
+# AVX: %1:fr64 = VMOVSDrm %0, 1, %noreg, 0, %noreg :: (load 8 from %ir.p1)
+# AVX512ALL: %1:fr64x = VMOVSDZrm %0, 1, %noreg, 0, %noreg :: (load 8 from %ir.p1)
# ALL: %xmm0 = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -307,9 +301,9 @@ registers:
# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %edi
-# ALL: %1 = COPY %rsi
-# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
+# ALL: %0:gr32 = COPY %edi
+# ALL: %1:gr64 = COPY %rsi
+# ALL: MOV32mr %1, 1, %noreg, 0, %noreg, %0 :: (store 4 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -333,9 +327,9 @@ registers:
# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL: %1 = COPY %rsi
-# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# ALL: %1:gr64 = COPY %rsi
+# ALL: MOV64mr %1, 1, %noreg, 0, %noreg, %0 :: (store 8 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -354,23 +348,15 @@ name: test_store_float
alignment: 4
legalized: true
regBankSelected: true
-# NO_AVX512F: registers:
-# NO_AVX512F-NEXT: - { id: 0, class: fr32, preferred-register: '' }
-# NO_AVX512F-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# NO_AVX512F-NEXT: - { id: 2, class: gr32, preferred-register: '' }
-#
-# AVX512ALL: registers:
-# AVX512ALL-NEXT: - { id: 0, class: fr32x, preferred-register: '' }
-# AVX512ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# AVX512ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# ALL: %2 = COPY %0
-# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1)
+# NO_AVX512F: %0:fr32 = COPY %xmm0
+# AVX512ALL: %0:fr32x = COPY %xmm0
+# ALL: %1:gr64 = COPY %rdi
+# ALL: %2:gr32 = COPY %0
+# ALL: MOV32mr %1, 1, %noreg, 0, %noreg, %2 :: (store 4 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -391,16 +377,14 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# NO_AVX512F: - { id: 0, class: fr32, preferred-register: '' }
-# AVX512ALL: - { id: 0, class: fr32x, preferred-register: '' }
-# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
-# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
-# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
+# NO_AVX512F: %0:fr32 = COPY %xmm0
+# AVX512ALL: %0:fr32x = COPY %xmm0
+# ALL: %1:gr64 = COPY %rdi
+# SSE: MOVSSmr %1, 1, %noreg, 0, %noreg, %0 :: (store 4 into %ir.p1)
+# AVX: VMOVSSmr %1, 1, %noreg, 0, %noreg, %0 :: (store 4 into %ir.p1)
+# AVX512ALL: VMOVSSZmr %1, 1, %noreg, 0, %noreg, %0 :: (store 4 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -419,23 +403,15 @@ name: test_store_double
alignment: 4
legalized: true
regBankSelected: true
-# NO_AVX512F: registers:
-# NO_AVX512F-NEXT: - { id: 0, class: fr64, preferred-register: '' }
-# NO_AVX512F-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# NO_AVX512F-NEXT: - { id: 2, class: gr64, preferred-register: '' }
-#
-# AVX512ALL: registers:
-# AVX512ALL-NEXT: - { id: 0, class: fr64x, preferred-register: '' }
-# AVX512ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# AVX512ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# ALL: %2 = COPY %0
-# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1)
+# NO_AVX512F: %0:fr64 = COPY %xmm0
+# AVX512ALL: %0:fr64x = COPY %xmm0
+# ALL: %1:gr64 = COPY %rdi
+# ALL: %2:gr64 = COPY %0
+# ALL: MOV64mr %1, 1, %noreg, 0, %noreg, %2 :: (store 8 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -456,16 +432,14 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# NO_AVX512F: - { id: 0, class: fr64, preferred-register: '' }
-# AVX512ALL: - { id: 0, class: fr64x, preferred-register: '' }
-# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
-# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
-# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
+# NO_AVX512F: %0:fr64 = COPY %xmm0
+# AVX512ALL: %0:fr64x = COPY %xmm0
+# ALL: %1:gr64 = COPY %rdi
+# SSE: MOVSDmr %1, 1, %noreg, 0, %noreg, %0 :: (store 8 into %ir.p1)
+# AVX: VMOVSDmr %1, 1, %noreg, 0, %noreg, %0 :: (store 8 into %ir.p1)
+# AVX512ALL: VMOVSDZmr %1, 1, %noreg, 0, %noreg, %0 :: (store 8 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -490,7 +464,7 @@ registers:
# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.ptr1)
+# ALL: %1:gr64 = MOV64rm %0, 1, %noreg, 0, %noreg :: (load 8 from %ir.ptr1)
body: |
bb.1 (%ir-block.0):
liveins: %rdi
@@ -513,7 +487,7 @@ registers:
# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# ALL: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.ptr1)
+# ALL: MOV64mr %0, 1, %noreg, 0, %noreg, %1 :: (store 8 into %ir.ptr1)
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
@@ -542,10 +516,10 @@ registers:
- { id: 2, class: gpr }
- { id: 3, class: gpr }
- { id: 4, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: MOV32mr %0, 1, _, 20, _, %1 :: (store 4 into %ir.arrayidx)
-# ALL-NEXT: %4 = MOV32rm %0, 1, _, 20, _ :: (load 4 from %ir.arrayidx)
+# ALL: %0:gr64 = COPY %rdi
+# ALL-NEXT: %1:gr32 = COPY %esi
+# ALL-NEXT: MOV32mr %0, 1, %noreg, 20, %noreg, %1 :: (store 4 into %ir.arrayidx)
+# ALL-NEXT: %4:gr32 = MOV32rm %0, 1, %noreg, 20, %noreg :: (load 4 from %ir.arrayidx)
# ALL-NEXT: %eax = COPY %4
# ALL-NEXT: RET 0, implicit %eax
body: |
@@ -580,12 +554,12 @@ registers:
- { id: 2, class: gpr }
- { id: 3, class: gpr }
- { id: 4, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = MOV64ri 228719476720
-# ALL-NEXT: %3 = LEA64r %0, 1, %2, 0, _
-# ALL-NEXT: MOV32mr %3, 1, _, 0, _, %1 :: (store 4 into %ir.arrayidx)
-# ALL-NEXT: %4 = MOV32rm %3, 1, _, 0, _ :: (load 4 from %ir.arrayidx)
+# ALL: %0:gr64 = COPY %rdi
+# ALL-NEXT: %1:gr32 = COPY %esi
+# ALL-NEXT: %2:gr64_nosp = MOV64ri 228719476720
+# ALL-NEXT: %3:gr64 = LEA64r %0, 1, %2, 0, %noreg
+# ALL-NEXT: MOV32mr %3, 1, %noreg, 0, %noreg, %1 :: (store 4 into %ir.arrayidx)
+# ALL-NEXT: %4:gr32 = MOV32rm %3, 1, %noreg, 0, %noreg :: (load 4 from %ir.arrayidx)
# ALL-NEXT: %eax = COPY %4
# ALL-NEXT: RET 0, implicit %eax
body: |
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v128.mir b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir
index 08844657e2a2..7a3647c3e5c3 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop-v128.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir
@@ -32,16 +32,13 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# ALL: - { id: 0, class: gr64, preferred-register: '' }
-# NO_AVX512F: - { id: 1, class: vr128, preferred-register: '' }
-# AVX512ALL: - { id: 1, class: vr128x, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %rdi
-# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
-# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# ALL: %0:gr64 = COPY %rdi
+# SSE: %1:vr128 = MOVUPSrm %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1, align 1)
+# AVX: %1:vr128 = VMOVUPSrm %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1, align 1)
+# AVX512F: %1:vr128x = VMOVUPSZ128rm_NOVLX %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1, align 1)
+# AVX512VL: %1:vr128x = VMOVUPSZ128rm %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1, align 1)
# ALL: %xmm0 = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -60,16 +57,13 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# ALL: - { id: 0, class: gr64, preferred-register: '' }
-# NO_AVX512F: - { id: 1, class: vr128, preferred-register: '' }
-# AVX512ALL: - { id: 1, class: vr128x, preferred-register: '' }
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# ALL: %0 = COPY %rdi
-# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1)
-# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# ALL: %0:gr64 = COPY %rdi
+# SSE: %1:vr128 = MOVAPSrm %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1)
+# AVX: %1:vr128 = VMOVAPSrm %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1)
+# AVX512F: %1:vr128x = VMOVAPSZ128rm_NOVLX %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1)
+# AVX512VL: %1:vr128x = VMOVAPSZ128rm %0, 1, %noreg, 0, %noreg :: (load 16 from %ir.p1)
# ALL: %xmm0 = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -88,17 +82,15 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# NO_AVX512F: - { id: 0, class: vr128, preferred-register: '' }
-# AVX512ALL: - { id: 0, class: vr128x, preferred-register: '' }
-# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
-# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# NO_AVX512F: %0:vr128 = COPY %xmm0
+# AVX512ALL: %0:vr128x = COPY %xmm0
+# ALL: %1:gr64 = COPY %rdi
+# SSE: MOVAPSmr %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1)
+# AVX: VMOVAPSmr %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1)
+# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1)
+# AVX512VL: VMOVAPSZ128mr %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
@@ -118,17 +110,15 @@ alignment: 4
legalized: true
regBankSelected: true
registers:
-# NO_AVX512F: - { id: 0, class: vr128, preferred-register: '' }
-# AVX512ALL: - { id: 0, class: vr128x, preferred-register: '' }
-# ALL: - { id: 1, class: gr64, preferred-register: '' }
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# ALL: %0 = COPY %xmm0
-# ALL: %1 = COPY %rdi
-# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
-# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# NO_AVX512F: %0:vr128 = COPY %xmm0
+# AVX512ALL: %0:vr128x = COPY %xmm0
+# ALL: %1:gr64 = COPY %rdi
+# SSE: MOVUPSmr %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1, align 1)
+# AVX: VMOVUPSmr %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1, align 1)
+# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1, align 1)
+# AVX512VL: VMOVUPSZ128mr %1, 1, %noreg, 0, %noreg, %0 :: (store 16 into %ir.p1, align 1)
# ALL: %rax = COPY %1
body: |
bb.1 (%ir-block.0):
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v256.mir b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir
index ff371ad9989f..962201f5f54d 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir
@@ -42,18 +42,18 @@ regBankSelected: true
registers:
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# NO_AVX512F: %0 = COPY %rdi
-# NO_AVX512F-NEXT: %1 = VMOVUPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1)
+# NO_AVX512F: %0:gr64 = COPY %rdi
+# NO_AVX512F-NEXT: %1:vr256 = VMOVUPSYrm %0, 1, %noreg, 0, %noreg :: (load 32 from %ir.p1, align 1)
# NO_AVX512F-NEXT: %ymm0 = COPY %1
# NO_AVX512F-NEXT: RET 0, implicit %ymm0
#
-# AVX512F: %0 = COPY %rdi
-# AVX512F-NEXT: %1 = VMOVUPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1)
+# AVX512F: %0:gr64 = COPY %rdi
+# AVX512F-NEXT: %1:vr256x = VMOVUPSZ256rm_NOVLX %0, 1, %noreg, 0, %noreg :: (load 32 from %ir.p1, align 1)
# AVX512F-NEXT: %ymm0 = COPY %1
# AVX512F-NEXT: RET 0, implicit %ymm0
#
-# AVX512VL: %0 = COPY %rdi
-# AVX512VL-NEXT: %1 = VMOVUPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1)
+# AVX512VL: %0:gr64 = COPY %rdi
+# AVX512VL-NEXT: %1:vr256x = VMOVUPSZ256rm %0, 1, %noreg, 0, %noreg :: (load 32 from %ir.p1, align 1)
# AVX512VL-NEXT: %ymm0 = COPY %1
# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
@@ -72,28 +72,21 @@ name: test_load_v8i32_align
alignment: 4
legalized: true
regBankSelected: true
-# NO_AVX512F: registers:
-# NO_AVX512F-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# NO_AVX512F-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-#
-# AVX512ALL: registers:
-# AVX512ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# AVX512ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# NO_AVX512F: %0 = COPY %rdi
-# NO_AVX512F-NEXT: %1 = VMOVAPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1)
+# NO_AVX512F: %0:gr64 = COPY %rdi
+# NO_AVX512F-NEXT: %1:vr256 = VMOVAPSYrm %0, 1, %noreg, 0, %noreg :: (load 32 from %ir.p1)
# NO_AVX512F-NEXT: %ymm0 = COPY %1
# NO_AVX512F-NEXT: RET 0, implicit %ymm0
#
-# AVX512F: %0 = COPY %rdi
-# AVX512F-NEXT: %1 = VMOVAPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1)
+# AVX512F: %0:gr64 = COPY %rdi
+# AVX512F-NEXT: %1:vr256x = VMOVAPSZ256rm_NOVLX %0, 1, %noreg, 0, %noreg :: (load 32 from %ir.p1)
# AVX512F-NEXT: %ymm0 = COPY %1
# AVX512F-NEXT: RET 0, implicit %ymm0
#
-# AVX512VL: %0 = COPY %rdi
-# AVX512VL-NEXT: %1 = VMOVAPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1)
+# AVX512VL: %0:gr64 = COPY %rdi
+# AVX512VL-NEXT: %1:vr256x = VMOVAPSZ256rm %0, 1, %noreg, 0, %noreg :: (load 32 from %ir.p1)
# AVX512VL-NEXT: %ymm0 = COPY %1
# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
@@ -122,19 +115,19 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# NO_AVX512F: %0 = COPY %ymm0
-# NO_AVX512F-NEXT: %1 = COPY %rdi
-# NO_AVX512F-NEXT: VMOVUPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1)
+# NO_AVX512F: %0:vr256 = COPY %ymm0
+# NO_AVX512F-NEXT: %1:gr64 = COPY %rdi
+# NO_AVX512F-NEXT: VMOVUPSYmr %1, 1, %noreg, 0, %noreg, %0 :: (store 32 into %ir.p1, align 1)
# NO_AVX512F-NEXT: RET 0
#
-# AVX512F: %0 = COPY %ymm0
-# AVX512F-NEXT: %1 = COPY %rdi
-# AVX512F-NEXT: VMOVUPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1)
+# AVX512F: %0:vr256x = COPY %ymm0
+# AVX512F-NEXT: %1:gr64 = COPY %rdi
+# AVX512F-NEXT: VMOVUPSZ256mr_NOVLX %1, 1, %noreg, 0, %noreg, %0 :: (store 32 into %ir.p1, align 1)
# AVX512F-NEXT: RET 0
#
-# AVX512VL: %0 = COPY %ymm0
-# AVX512VL-NEXT: %1 = COPY %rdi
-# AVX512VL-NEXT: VMOVUPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1)
+# AVX512VL: %0:vr256x = COPY %ymm0
+# AVX512VL-NEXT: %1:gr64 = COPY %rdi
+# AVX512VL-NEXT: VMOVUPSZ256mr %1, 1, %noreg, 0, %noreg, %0 :: (store 32 into %ir.p1, align 1)
# AVX512VL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
@@ -162,19 +155,19 @@ regBankSelected: true
registers:
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# NO_AVX512F: %0 = COPY %ymm0
-# NO_AVX512F-NEXT: %1 = COPY %rdi
-# NO_AVX512F-NEXT: VMOVAPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1)
+# NO_AVX512F: %0:vr256 = COPY %ymm0
+# NO_AVX512F-NEXT: %1:gr64 = COPY %rdi
+# NO_AVX512F-NEXT: VMOVAPSYmr %1, 1, %noreg, 0, %noreg, %0 :: (store 32 into %ir.p1)
# NO_AVX512F-NEXT: RET 0
#
-# AVX512F: %0 = COPY %ymm0
-# AVX512F-NEXT: %1 = COPY %rdi
-# AVX512F-NEXT: VMOVAPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1)
+# AVX512F: %0:vr256x = COPY %ymm0
+# AVX512F-NEXT: %1:gr64 = COPY %rdi
+# AVX512F-NEXT: VMOVAPSZ256mr_NOVLX %1, 1, %noreg, 0, %noreg, %0 :: (store 32 into %ir.p1)
# AVX512F-NEXT: RET 0
#
-# AVX512VL: %0 = COPY %ymm0
-# AVX512VL-NEXT: %1 = COPY %rdi
-# AVX512VL-NEXT: VMOVAPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1)
+# AVX512VL: %0:vr256x = COPY %ymm0
+# AVX512VL-NEXT: %1:gr64 = COPY %rdi
+# AVX512VL-NEXT: VMOVAPSZ256mr %1, 1, %noreg, 0, %noreg, %0 :: (store 32 into %ir.p1)
# AVX512VL-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v512.mir b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir
index 131902d81a00..8be5c940effa 100644
--- a/test/CodeGen/X86/GlobalISel/select-memop-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512F
--- |
define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) {
@@ -23,24 +24,21 @@
...
---
name: test_load_v16i32_noalign
-# AVX512F-LABEL: name: test_load_v16i32_noalign
alignment: 4
legalized: true
regBankSelected: true
-# AVX512F: registers:
-# AVX512F-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# AVX512F-NEXT: - { id: 1, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# AVX512F: %0 = COPY %rdi
-# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 1)
-# AVX512F-NEXT: %zmm0 = COPY %1
-# AVX512F-NEXT: RET 0, implicit %zmm0
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; AVX512F-LABEL: name: test_load_v16i32_noalign
+ ; AVX512F: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; AVX512F: [[VMOVUPSZrm:%[0-9]+]]:vr512 = VMOVUPSZrm [[COPY]], 1, %noreg, 0, %noreg :: (load 64 from %ir.p1, align 1)
+ ; AVX512F: %zmm0 = COPY [[VMOVUPSZrm]]
+ ; AVX512F: RET 0, implicit %zmm0
%0(p0) = COPY %rdi
%1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1)
%zmm0 = COPY %1(<16 x s32>)
@@ -49,24 +47,21 @@ body: |
...
---
name: test_load_v16i32_align
-# AVX512F-LABEL: name: test_load_v16i32_align
alignment: 4
legalized: true
regBankSelected: true
-# AVX512F: registers:
-# AVX512F-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# AVX512F-NEXT: - { id: 1, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: vecr }
-# AVX512F: %0 = COPY %rdi
-# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 32)
-# AVX512F-NEXT: %zmm0 = COPY %1
-# AVX512F-NEXT: RET 0, implicit %zmm0
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; AVX512F-LABEL: name: test_load_v16i32_align
+ ; AVX512F: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; AVX512F: [[VMOVUPSZrm:%[0-9]+]]:vr512 = VMOVUPSZrm [[COPY]], 1, %noreg, 0, %noreg :: (load 64 from %ir.p1, align 32)
+ ; AVX512F: %zmm0 = COPY [[VMOVUPSZrm]]
+ ; AVX512F: RET 0, implicit %zmm0
%0(p0) = COPY %rdi
%1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 32)
%zmm0 = COPY %1(<16 x s32>)
@@ -75,24 +70,21 @@ body: |
...
---
name: test_store_v16i32_noalign
-# AVX512F-LABEL: name: test_store_v16i32_noalign
alignment: 4
legalized: true
regBankSelected: true
-# AVX512F: registers:
-# AVX512F-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# AVX512F-NEXT: - { id: 1, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# AVX512F: %0 = COPY %zmm0
-# AVX512F-NEXT: %1 = COPY %rdi
-# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 1)
-# AVX512F-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %zmm0
+ ; AVX512F-LABEL: name: test_store_v16i32_noalign
+ ; AVX512F: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; AVX512F: [[COPY1:%[0-9]+]]:gr64 = COPY %rdi
+ ; AVX512F: VMOVUPSZmr [[COPY1]], 1, %noreg, 0, %noreg, [[COPY]] :: (store 64 into %ir.p1, align 1)
+ ; AVX512F: RET 0
%0(<16 x s32>) = COPY %zmm0
%1(p0) = COPY %rdi
G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1)
@@ -101,24 +93,21 @@ body: |
...
---
name: test_store_v16i32_align
-# AVX512F-LABEL: name: test_store_v16i32_align
alignment: 4
legalized: true
regBankSelected: true
-# AVX512F: registers:
-# AVX512F-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# AVX512F-NEXT: - { id: 1, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: gpr }
-# AVX512F: %0 = COPY %zmm0
-# AVX512F-NEXT: %1 = COPY %rdi
-# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 32)
-# AVX512F-NEXT: RET 0
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %zmm0
+ ; AVX512F-LABEL: name: test_store_v16i32_align
+ ; AVX512F: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; AVX512F: [[COPY1:%[0-9]+]]:gr64 = COPY %rdi
+ ; AVX512F: VMOVUPSZmr [[COPY1]], 1, %noreg, 0, %noreg, [[COPY]] :: (store 64 into %ir.p1, align 32)
+ ; AVX512F: RET 0
%0(<16 x s32>) = COPY %zmm0
%1(p0) = COPY %rdi
G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 32)
diff --git a/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir b/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
index 8e31a904e360..0dfb678479f8 100644
--- a/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-merge-vec256.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512VL
--- |
@@ -7,42 +8,32 @@
...
---
name: test_merge
-# AVX-LABEL: name: test_merge
#
-# AVX512VL-LABEL: name: test_merge
alignment: 4
legalized: true
regBankSelected: true
-# AVX: registers:
-# AVX-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# AVX-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-# AVX-NEXT: - { id: 3, class: vr256, preferred-register: '' }
#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 3, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# AVX: %0 = IMPLICIT_DEF
-# AVX-NEXT: undef %2.sub_xmm = COPY %0
-# AVX-NEXT: %3 = VINSERTF128rr %2, %0, 1
-# AVX-NEXT: %1 = COPY %3
-# AVX-NEXT: %ymm0 = COPY %1
-# AVX-NEXT: RET 0, implicit %ymm0
#
-# AVX512VL: %0 = IMPLICIT_DEF
-# AVX512VL-NEXT: undef %2.sub_xmm = COPY %0
-# AVX512VL-NEXT: %3 = VINSERTF32x4Z256rr %2, %0, 1
-# AVX512VL-NEXT: %1 = COPY %3
-# AVX512VL-NEXT: %ymm0 = COPY %1
-# AVX512VL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
+ ; AVX-LABEL: name: test_merge
+ ; AVX: [[DEF:%[0-9]+]]:vr128 = IMPLICIT_DEF
+ ; AVX: undef %2.sub_xmm:vr256 = COPY [[DEF]]
+ ; AVX: [[VINSERTF128rr:%[0-9]+]]:vr256 = VINSERTF128rr %2, [[DEF]], 1
+ ; AVX: [[COPY:%[0-9]+]]:vr256 = COPY [[VINSERTF128rr]]
+ ; AVX: %ymm0 = COPY [[COPY]]
+ ; AVX: RET 0, implicit %ymm0
+ ; AVX512VL-LABEL: name: test_merge
+ ; AVX512VL: [[DEF:%[0-9]+]]:vr128x = IMPLICIT_DEF
+ ; AVX512VL: undef %2.sub_xmm:vr256x = COPY [[DEF]]
+ ; AVX512VL: [[VINSERTF32x4Z256rr:%[0-9]+]]:vr256x = VINSERTF32x4Z256rr %2, [[DEF]], 1
+ ; AVX512VL: [[COPY:%[0-9]+]]:vr256x = COPY [[VINSERTF32x4Z256rr]]
+ ; AVX512VL: %ymm0 = COPY [[COPY]]
+ ; AVX512VL: RET 0, implicit %ymm0
%0(<4 x s32>) = IMPLICIT_DEF
%1(<8 x s32>) = G_MERGE_VALUES %0(<4 x s32>), %0(<4 x s32>)
%ymm0 = COPY %1(<8 x s32>)
diff --git a/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir b/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
index a072d582e505..5de38e4ce1f1 100644
--- a/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-merge-vec512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
define void @test_merge_v128() {
@@ -11,31 +12,24 @@
...
---
name: test_merge_v128
-# ALL-LABEL: name: test_merge_v128
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 4, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 5, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = IMPLICIT_DEF
-# ALL-NEXT: undef %2.sub_xmm = COPY %0
-# ALL-NEXT: %3 = VINSERTF32x4Zrr %2, %0, 1
-# ALL-NEXT: %4 = VINSERTF32x4Zrr %3, %0, 2
-# ALL-NEXT: %5 = VINSERTF32x4Zrr %4, %0, 3
-# ALL-NEXT: %1 = COPY %5
-# ALL-NEXT: %zmm0 = COPY %1
-# ALL-NEXT: RET 0, implicit %zmm0
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_merge_v128
+ ; ALL: [[DEF:%[0-9]+]]:vr128x = IMPLICIT_DEF
+ ; ALL: undef %2.sub_xmm:vr512 = COPY [[DEF]]
+ ; ALL: [[VINSERTF32x4Zrr:%[0-9]+]]:vr512 = VINSERTF32x4Zrr %2, [[DEF]], 1
+ ; ALL: [[VINSERTF32x4Zrr1:%[0-9]+]]:vr512 = VINSERTF32x4Zrr [[VINSERTF32x4Zrr]], [[DEF]], 2
+ ; ALL: [[VINSERTF32x4Zrr2:%[0-9]+]]:vr512 = VINSERTF32x4Zrr [[VINSERTF32x4Zrr1]], [[DEF]], 3
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY [[VINSERTF32x4Zrr2]]
+ ; ALL: %zmm0 = COPY [[COPY]]
+ ; ALL: RET 0, implicit %zmm0
%0(<4 x s32>) = IMPLICIT_DEF
%1(<16 x s32>) = G_MERGE_VALUES %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>), %0(<4 x s32>)
%zmm0 = COPY %1(<16 x s32>)
@@ -44,27 +38,22 @@ body: |
...
---
name: test_merge_v256
-# ALL-LABEL: name: test_merge_v256
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
-# ALL: %0 = IMPLICIT_DEF
-# ALL-NEXT: undef %2.sub_ymm = COPY %0
-# ALL-NEXT: %3 = VINSERTF64x4Zrr %2, %0, 1
-# ALL-NEXT: %1 = COPY %3
-# ALL-NEXT: %zmm0 = COPY %1
-# ALL-NEXT: RET 0, implicit %zmm0
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_merge_v256
+ ; ALL: [[DEF:%[0-9]+]]:vr256x = IMPLICIT_DEF
+ ; ALL: undef %2.sub_ymm:vr512 = COPY [[DEF]]
+ ; ALL: [[VINSERTF64x4Zrr:%[0-9]+]]:vr512 = VINSERTF64x4Zrr %2, [[DEF]], 1
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY [[VINSERTF64x4Zrr]]
+ ; ALL: %zmm0 = COPY [[COPY]]
+ ; ALL: RET 0, implicit %zmm0
%0(<8 x s32>) = IMPLICIT_DEF
%1(<16 x s32>) = G_MERGE_VALUES %0(<8 x s32>), %0(<8 x s32>)
%zmm0 = COPY %1(<16 x s32>)
diff --git a/test/CodeGen/X86/GlobalISel/select-mul-scalar.mir b/test/CodeGen/X86/GlobalISel/select-mul-scalar.mir
index 453557c08469..f0766ff7eb59 100644
--- a/test/CodeGen/X86/GlobalISel/select-mul-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-mul-scalar.mir
@@ -1,4 +1,5 @@
-# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
define i16 @test_mul_i16(i16 %arg1, i16 %arg2) {
@@ -19,30 +20,25 @@
...
---
name: test_mul_i16
-# ALL-LABEL: name: test_mul_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: body: |
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = COPY %si
-# ALL-NEXT: %2 = IMUL16rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %ax = COPY %2
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; ALL-LABEL: name: test_mul_i16
+ ; ALL: [[COPY:%[0-9]+]]:gr16 = COPY %di
+ ; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY %si
+ ; ALL: [[IMUL16rr:%[0-9]+]]:gr16 = IMUL16rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %ax = COPY [[IMUL16rr]]
+ ; ALL: RET 0, implicit %ax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_MUL %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -50,28 +46,23 @@ body: |
...
---
name: test_mul_i32
-# ALL-LABEL: name: test_mul_i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: body: |
-# ALL: %0 = COPY %edi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = IMUL32rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %eax = COPY %2
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; ALL-LABEL: name: test_mul_i32
+ ; ALL: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; ALL: [[IMUL32rr:%[0-9]+]]:gr32 = IMUL32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %eax = COPY [[IMUL32rr]]
+ ; ALL: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_MUL %0, %1
@@ -81,28 +72,23 @@ body: |
...
---
name: test_mul_i64
-# ALL-LABEL: name: test_mul_i64
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: body: |
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %rsi
-# ALL-NEXT: %2 = IMUL64rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %rax = COPY %2
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; ALL-LABEL: name: test_mul_i64
+ ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
+ ; ALL: [[IMUL64rr:%[0-9]+]]:gr64 = IMUL64rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %rax = COPY [[IMUL64rr]]
+ ; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s64) = G_MUL %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/select-mul-vec.mir b/test/CodeGen/X86/GlobalISel/select-mul-vec.mir
index d3651ccd1ab9..afc40815af0d 100644
--- a/test/CodeGen/X86/GlobalISel/select-mul-vec.mir
+++ b/test/CodeGen/X86/GlobalISel/select-mul-vec.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
--- |
@@ -90,23 +91,23 @@
...
---
name: test_mul_v8i16
-# CHECK-LABEL: name: test_mul_v8i16
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = PMULLWrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v8i16
+ ; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %xmm1
+ ; CHECK: [[PMULLWrr:%[0-9]+]]:vr128 = PMULLWrr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[PMULLWrr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<8 x s16>) = COPY %xmm0
%1(<8 x s16>) = COPY %xmm1
%2(<8 x s16>) = G_MUL %0, %1
@@ -116,23 +117,23 @@ body: |
...
---
name: test_mul_v8i16_avx
-# CHECK-LABEL: name: test_mul_v8i16_avx
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLWrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v8i16_avx
+ ; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %xmm1
+ ; CHECK: [[VPMULLWrr:%[0-9]+]]:vr128 = VPMULLWrr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[VPMULLWrr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<8 x s16>) = COPY %xmm0
%1(<8 x s16>) = COPY %xmm1
%2(<8 x s16>) = G_MUL %0, %1
@@ -142,23 +143,23 @@ body: |
...
---
name: test_mul_v8i16_avx512bwvl
-# CHECK-LABEL: name: test_mul_v8i16_avx512bwvl
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLWZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v8i16_avx512bwvl
+ ; CHECK: [[COPY:%[0-9]+]]:vr128x = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128x = COPY %xmm1
+ ; CHECK: [[VPMULLWZ128rr:%[0-9]+]]:vr128x = VPMULLWZ128rr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[VPMULLWZ128rr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<8 x s16>) = COPY %xmm0
%1(<8 x s16>) = COPY %xmm1
%2(<8 x s16>) = G_MUL %0, %1
@@ -168,23 +169,23 @@ body: |
...
---
name: test_mul_v4i32
-# CHECK-LABEL: name: test_mul_v4i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = PMULLDrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v4i32
+ ; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %xmm1
+ ; CHECK: [[PMULLDrr:%[0-9]+]]:vr128 = PMULLDrr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[PMULLDrr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<4 x s32>) = COPY %xmm0
%1(<4 x s32>) = COPY %xmm1
%2(<4 x s32>) = G_MUL %0, %1
@@ -194,23 +195,23 @@ body: |
...
---
name: test_mul_v4i32_avx
-# CHECK-LABEL: name: test_mul_v4i32_avx
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLDrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v4i32_avx
+ ; CHECK: [[COPY:%[0-9]+]]:vr128 = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %xmm1
+ ; CHECK: [[VPMULLDrr:%[0-9]+]]:vr128 = VPMULLDrr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[VPMULLDrr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<4 x s32>) = COPY %xmm0
%1(<4 x s32>) = COPY %xmm1
%2(<4 x s32>) = G_MUL %0, %1
@@ -220,23 +221,23 @@ body: |
...
---
name: test_mul_v4i32_avx512vl
-# CHECK-LABEL: name: test_mul_v4i32_avx512vl
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLDZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v4i32_avx512vl
+ ; CHECK: [[COPY:%[0-9]+]]:vr128x = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128x = COPY %xmm1
+ ; CHECK: [[VPMULLDZ128rr:%[0-9]+]]:vr128x = VPMULLDZ128rr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[VPMULLDZ128rr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<4 x s32>) = COPY %xmm0
%1(<4 x s32>) = COPY %xmm1
%2(<4 x s32>) = G_MUL %0, %1
@@ -246,23 +247,23 @@ body: |
...
---
name: test_mul_v2i64
-# CHECK-LABEL: name: test_mul_v2i64
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLQZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
+ ; CHECK-LABEL: name: test_mul_v2i64
+ ; CHECK: [[COPY:%[0-9]+]]:vr128x = COPY %xmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr128x = COPY %xmm1
+ ; CHECK: [[VPMULLQZ128rr:%[0-9]+]]:vr128x = VPMULLQZ128rr [[COPY]], [[COPY1]]
+ ; CHECK: %xmm0 = COPY [[VPMULLQZ128rr]]
+ ; CHECK: RET 0, implicit %xmm0
%0(<2 x s64>) = COPY %xmm0
%1(<2 x s64>) = COPY %xmm1
%2(<2 x s64>) = G_MUL %0, %1
@@ -272,23 +273,23 @@ body: |
...
---
name: test_mul_v16i16
-# CHECK-LABEL: name: test_mul_v16i16
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr256, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLWYrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; CHECK-LABEL: name: test_mul_v16i16
+ ; CHECK: [[COPY:%[0-9]+]]:vr256 = COPY %ymm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr256 = COPY %ymm1
+ ; CHECK: [[VPMULLWYrr:%[0-9]+]]:vr256 = VPMULLWYrr [[COPY]], [[COPY1]]
+ ; CHECK: %ymm0 = COPY [[VPMULLWYrr]]
+ ; CHECK: RET 0, implicit %ymm0
%0(<16 x s16>) = COPY %ymm0
%1(<16 x s16>) = COPY %ymm1
%2(<16 x s16>) = G_MUL %0, %1
@@ -298,23 +299,23 @@ body: |
...
---
name: test_mul_v16i16_avx512bwvl
-# CHECK-LABEL: name: test_mul_v16i16_avx512bwvl
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLWZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; CHECK-LABEL: name: test_mul_v16i16_avx512bwvl
+ ; CHECK: [[COPY:%[0-9]+]]:vr256x = COPY %ymm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr256x = COPY %ymm1
+ ; CHECK: [[VPMULLWZ256rr:%[0-9]+]]:vr256x = VPMULLWZ256rr [[COPY]], [[COPY1]]
+ ; CHECK: %ymm0 = COPY [[VPMULLWZ256rr]]
+ ; CHECK: RET 0, implicit %ymm0
%0(<16 x s16>) = COPY %ymm0
%1(<16 x s16>) = COPY %ymm1
%2(<16 x s16>) = G_MUL %0, %1
@@ -324,23 +325,23 @@ body: |
...
---
name: test_mul_v8i32
-# CHECK-LABEL: name: test_mul_v8i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr256, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLDYrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; CHECK-LABEL: name: test_mul_v8i32
+ ; CHECK: [[COPY:%[0-9]+]]:vr256 = COPY %ymm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr256 = COPY %ymm1
+ ; CHECK: [[VPMULLDYrr:%[0-9]+]]:vr256 = VPMULLDYrr [[COPY]], [[COPY1]]
+ ; CHECK: %ymm0 = COPY [[VPMULLDYrr]]
+ ; CHECK: RET 0, implicit %ymm0
%0(<8 x s32>) = COPY %ymm0
%1(<8 x s32>) = COPY %ymm1
%2(<8 x s32>) = G_MUL %0, %1
@@ -350,23 +351,23 @@ body: |
...
---
name: test_mul_v8i32_avx512vl
-# CHECK-LABEL: name: test_mul_v8i32_avx512vl
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLDZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; CHECK-LABEL: name: test_mul_v8i32_avx512vl
+ ; CHECK: [[COPY:%[0-9]+]]:vr256x = COPY %ymm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr256x = COPY %ymm1
+ ; CHECK: [[VPMULLDZ256rr:%[0-9]+]]:vr256x = VPMULLDZ256rr [[COPY]], [[COPY1]]
+ ; CHECK: %ymm0 = COPY [[VPMULLDZ256rr]]
+ ; CHECK: RET 0, implicit %ymm0
%0(<8 x s32>) = COPY %ymm0
%1(<8 x s32>) = COPY %ymm1
%2(<8 x s32>) = G_MUL %0, %1
@@ -376,23 +377,23 @@ body: |
...
---
name: test_mul_v4i64
-# CHECK-LABEL: name: test_mul_v4i64
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLQZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
+ ; CHECK-LABEL: name: test_mul_v4i64
+ ; CHECK: [[COPY:%[0-9]+]]:vr256x = COPY %ymm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr256x = COPY %ymm1
+ ; CHECK: [[VPMULLQZ256rr:%[0-9]+]]:vr256x = VPMULLQZ256rr [[COPY]], [[COPY1]]
+ ; CHECK: %ymm0 = COPY [[VPMULLQZ256rr]]
+ ; CHECK: RET 0, implicit %ymm0
%0(<4 x s64>) = COPY %ymm0
%1(<4 x s64>) = COPY %ymm1
%2(<4 x s64>) = G_MUL %0, %1
@@ -402,23 +403,23 @@ body: |
...
---
name: test_mul_v32i16
-# CHECK-LABEL: name: test_mul_v32i16
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLWZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; CHECK-LABEL: name: test_mul_v32i16
+ ; CHECK: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[VPMULLWZrr:%[0-9]+]]:vr512 = VPMULLWZrr [[COPY]], [[COPY1]]
+ ; CHECK: %zmm0 = COPY [[VPMULLWZrr]]
+ ; CHECK: RET 0, implicit %zmm0
%0(<32 x s16>) = COPY %zmm0
%1(<32 x s16>) = COPY %zmm1
%2(<32 x s16>) = G_MUL %0, %1
@@ -428,23 +429,23 @@ body: |
...
---
name: test_mul_v16i32
-# CHECK-LABEL: name: test_mul_v16i32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLDZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; CHECK-LABEL: name: test_mul_v16i32
+ ; CHECK: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[VPMULLDZrr:%[0-9]+]]:vr512 = VPMULLDZrr [[COPY]], [[COPY1]]
+ ; CHECK: %zmm0 = COPY [[VPMULLDZrr]]
+ ; CHECK: RET 0, implicit %zmm0
%0(<16 x s32>) = COPY %zmm0
%1(<16 x s32>) = COPY %zmm1
%2(<16 x s32>) = G_MUL %0, %1
@@ -454,23 +455,23 @@ body: |
...
---
name: test_mul_v8i64
-# CHECK-LABEL: name: test_mul_v8i64
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# CHECK-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# CHECK: %2 = VPMULLQZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; CHECK-LABEL: name: test_mul_v8i64
+ ; CHECK: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; CHECK: [[VPMULLQZrr:%[0-9]+]]:vr512 = VPMULLQZrr [[COPY]], [[COPY1]]
+ ; CHECK: %zmm0 = COPY [[VPMULLQZrr]]
+ ; CHECK: RET 0, implicit %zmm0
%0(<8 x s64>) = COPY %zmm0
%1(<8 x s64>) = COPY %zmm1
%2(<8 x s64>) = G_MUL %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/select-or-scalar.mir b/test/CodeGen/X86/GlobalISel/select-or-scalar.mir
index 4f7e48207838..21c6ed50d3b3 100644
--- a/test/CodeGen/X86/GlobalISel/select-or-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-or-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -24,14 +25,9 @@
...
---
name: test_or_i8
-# ALL-LABEL: name: test_or_i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -40,17 +36,18 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = COPY %sil
-# ALL-NEXT: %2 = OR8rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %al = COPY %2
-# ALL-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ ; ALL-LABEL: name: test_or_i8
+ ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
+ ; ALL: [[OR8rr:%[0-9]+]]:gr8 = OR8rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %al = COPY [[OR8rr]]
+ ; ALL: RET 0, implicit %al
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s8) = G_OR %0, %1
%al = COPY %2(s8)
RET 0, implicit %al
@@ -58,14 +55,9 @@ body: |
...
---
name: test_or_i16
-# ALL-LABEL: name: test_or_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -74,17 +66,18 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = COPY %si
-# ALL-NEXT: %2 = OR16rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %ax = COPY %2
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; ALL-LABEL: name: test_or_i16
+ ; ALL: [[COPY:%[0-9]+]]:gr16 = COPY %di
+ ; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY %si
+ ; ALL: [[OR16rr:%[0-9]+]]:gr16 = OR16rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %ax = COPY [[OR16rr]]
+ ; ALL: RET 0, implicit %ax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_OR %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -92,14 +85,9 @@ body: |
...
---
name: test_or_i32
-# ALL-LABEL: name: test_or_i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -108,15 +96,16 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %edi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = OR32rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %eax = COPY %2
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; ALL-LABEL: name: test_or_i32
+ ; ALL: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; ALL: [[OR32rr:%[0-9]+]]:gr32 = OR32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %eax = COPY [[OR32rr]]
+ ; ALL: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_OR %0, %1
@@ -126,14 +115,9 @@ body: |
...
---
name: test_or_i64
-# ALL-LABEL: name: test_or_i64
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -142,15 +126,16 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %rsi
-# ALL-NEXT: %2 = OR64rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %rax = COPY %2
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; ALL-LABEL: name: test_or_i64
+ ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
+ ; ALL: [[OR64rr:%[0-9]+]]:gr64 = OR64rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %rax = COPY [[OR64rr]]
+ ; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s64) = G_OR %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/select-phi.mir b/test/CodeGen/X86/GlobalISel/select-phi.mir
new file mode 100644
index 000000000000..7792d8c208df
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-phi.mir
@@ -0,0 +1,423 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+
+--- |
+
+ define i8 @test_i8(i32 %a, i8 %f, i8 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i8 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i8 %cond
+ }
+
+ define i16 @test_i16(i32 %a, i16 %f, i16 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i16 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i16 %cond
+ }
+
+ define i32 @test_i32(i32 %a, i32 %f, i32 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i32 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i32 %cond
+ }
+
+ define i64 @test_i64(i32 %a, i64 %f, i64 %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi i64 [ %f, %cond.true ], [ %t, %cond.false ]
+ ret i64 %cond
+ }
+
+ define float @test_float(i32 %a, float %f, float %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi float [ %f, %cond.true ], [ %t, %cond.false ]
+ ret float %cond
+ }
+
+ define double @test_double(i32 %a, double %f, double %t) {
+ entry:
+ %cmp = icmp sgt i32 %a, 0
+ br i1 %cmp, label %cond.true, label %cond.false
+
+ cond.true: ; preds = %entry
+ br label %cond.end
+
+ cond.false: ; preds = %entry
+ br label %cond.end
+
+ cond.end: ; preds = %cond.false, %cond.true
+ %cond = phi double [ %f, %cond.true ], [ %t, %cond.false ]
+ ret double %cond
+ }
+
+...
+---
+name: test_i8
+# ALL-LABEL: name: test_i8
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 5, class: gr8, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+ - { id: 2, class: gpr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+ - { id: 4, class: gpr, preferred-register: '' }
+ - { id: 5, class: gpr, preferred-register: '' }
+# ALL-LABEL: bb.3.cond.end:
+# ALL: %5:gr8 = PHI %1, %bb.1, %2, %bb.2
+# ALL-NEXT: %al = COPY %5
+# ALL-NEXT: RET 0, implicit %al
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s8) = COPY %sil
+ %2(s8) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s8) = G_PHI %1(s8), %bb.2, %2(s8), %bb.3
+ %al = COPY %5(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test_i16
+# ALL-LABEL: name: test_i16
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 5, class: gr16, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+ - { id: 2, class: gpr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+ - { id: 4, class: gpr, preferred-register: '' }
+ - { id: 5, class: gpr, preferred-register: '' }
+# ALL-LABEL: bb.3.cond.end:
+# ALL: %5:gr16 = PHI %1, %bb.1, %2, %bb.2
+# ALL-NEXT: %ax = COPY %5
+# ALL-NEXT: RET 0, implicit %ax
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s16) = COPY %si
+ %2(s16) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s16) = G_PHI %1(s16), %bb.2, %2(s16), %bb.3
+ %ax = COPY %5(s16)
+ RET 0, implicit %ax
+
+...
+---
+name: test_i32
+# ALL-LABEL: name: test_i32
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 5, class: gr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+ - { id: 2, class: gpr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+ - { id: 4, class: gpr, preferred-register: '' }
+ - { id: 5, class: gpr, preferred-register: '' }
+# ALL-LABEL: bb.3.cond.end:
+# ALL: %5:gr32 = PHI %1, %bb.1, %2, %bb.2
+# ALL-NEXT: %eax = COPY %5
+# ALL-NEXT: RET 0, implicit %eax
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %edx, %esi
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %esi
+ %2(s32) = COPY %edx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s32) = G_PHI %1(s32), %bb.2, %2(s32), %bb.3
+ %eax = COPY %5(s32)
+ RET 0, implicit %eax
+
+...
+---
+name: test_i64
+# ALL-LABEL: name: test_i64
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 5, class: gr64, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+ - { id: 2, class: gpr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+ - { id: 4, class: gpr, preferred-register: '' }
+ - { id: 5, class: gpr, preferred-register: '' }
+# ALL-LABEL: bb.3.cond.end:
+# ALL: %5:gr64 = PHI %1, %bb.1, %2, %bb.2
+# ALL-NEXT: %rax = COPY %5
+# ALL-NEXT: RET 0, implicit %rax
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %rdx, %rsi
+
+ %0(s32) = COPY %edi
+ %1(s64) = COPY %rsi
+ %2(s64) = COPY %rdx
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s64) = G_PHI %1(s64), %bb.2, %2(s64), %bb.3
+ %rax = COPY %5(s64)
+ RET 0, implicit %rax
+
+...
+---
+name: test_float
+# ALL-LABEL: name: test_float
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: fr32, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: fr32, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 5, class: fr32, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: vecr, preferred-register: '' }
+ - { id: 2, class: vecr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+ - { id: 4, class: gpr, preferred-register: '' }
+ - { id: 5, class: vecr, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+# ALL-LABEL: bb.3.cond.end:
+# ALL: %5:fr32 = PHI %1, %bb.1, %2, %bb.2
+# ALL-NEXT: %xmm0 = COPY %5
+# ALL-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %xmm0, %xmm1
+
+ %0(s32) = COPY %edi
+ %1(s32) = COPY %xmm0
+ %2(s32) = COPY %xmm1
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s32) = G_PHI %1(s32), %bb.2, %2(s32), %bb.3
+ %xmm0 = COPY %5(s32)
+ RET 0, implicit %xmm0
+
+...
+---
+name: test_double
+# ALL-LABEL: name: test_double
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+# ALL: registers:
+# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 1, class: fr64, preferred-register: '' }
+# ALL-NEXT: - { id: 2, class: fr64, preferred-register: '' }
+# ALL-NEXT: - { id: 3, class: gr32, preferred-register: '' }
+# ALL-NEXT: - { id: 4, class: gr8, preferred-register: '' }
+# ALL-NEXT: - { id: 5, class: fr64, preferred-register: '' }
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: vecr, preferred-register: '' }
+ - { id: 2, class: vecr, preferred-register: '' }
+ - { id: 3, class: gpr, preferred-register: '' }
+ - { id: 4, class: gpr, preferred-register: '' }
+ - { id: 5, class: vecr, preferred-register: '' }
+# ALL-LABEL: bb.3.cond.end:
+# ALL: %5:fr64 = PHI %1, %bb.1, %2, %bb.2
+# ALL-NEXT: %xmm0 = COPY %5
+# ALL-NEXT: RET 0, implicit %xmm0
+body: |
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ liveins: %edi, %xmm0, %xmm1
+
+ %0(s32) = COPY %edi
+ %1(s64) = COPY %xmm0
+ %2(s64) = COPY %xmm1
+ %3(s32) = G_CONSTANT i32 0
+ %4(s1) = G_ICMP intpred(sgt), %0(s32), %3
+ G_BRCOND %4(s1), %bb.2
+ G_BR %bb.3
+
+ bb.2.cond.true:
+ successors: %bb.4(0x80000000)
+
+ G_BR %bb.4
+
+ bb.3.cond.false:
+ successors: %bb.4(0x80000000)
+
+
+ bb.4.cond.end:
+ %5(s64) = G_PHI %1(s64), %bb.2, %2(s64), %bb.3
+ %xmm0 = COPY %5(s64)
+ RET 0, implicit %xmm0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-sub-v128.mir b/test/CodeGen/X86/GlobalISel/select-sub-v128.mir
index f77879d93009..bb0500751942 100644
--- a/test/CodeGen/X86/GlobalISel/select-sub-v128.mir
+++ b/test/CodeGen/X86/GlobalISel/select-sub-v128.mir
@@ -31,31 +31,17 @@ name: test_sub_v16i8
alignment: 4
legalized: true
regBankSelected: true
-# NOVL: registers:
-# NOVL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PSUBBrr %0, %1
+# SSE2: %2:vr128 = PSUBBrr %0, %1
#
-# AVX1: %2 = VPSUBBrr %0, %1
+# AVX1: %2:vr128 = VPSUBBrr %0, %1
#
-# AVX512VL: %2 = VPSUBBrr %0, %1
+# AVX512VL: %2:vr128 = VPSUBBrr %0, %1
#
-# AVX512BWVL: %2 = VPSUBBZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPSUBBZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -73,31 +59,17 @@ name: test_sub_v8i16
alignment: 4
legalized: true
regBankSelected: true
-# NOVL: registers:
-# NOVL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PSUBWrr %0, %1
+# SSE2: %2:vr128 = PSUBWrr %0, %1
#
-# AVX1: %2 = VPSUBWrr %0, %1
+# AVX1: %2:vr128 = VPSUBWrr %0, %1
#
-# AVX512VL: %2 = VPSUBWrr %0, %1
+# AVX512VL: %2:vr128 = VPSUBWrr %0, %1
#
-# AVX512BWVL: %2 = VPSUBWZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPSUBWZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -115,31 +87,17 @@ name: test_sub_v4i32
alignment: 4
legalized: true
regBankSelected: true
-# NOVL: registers:
-# NOVL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PSUBDrr %0, %1
+# SSE2: %2:vr128 = PSUBDrr %0, %1
#
-# AVX1: %2 = VPSUBDrr %0, %1
+# AVX1: %2:vr128 = VPSUBDrr %0, %1
#
-# AVX512VL: %2 = VPSUBDZ128rr %0, %1
+# AVX512VL: %2:vr128x = VPSUBDZ128rr %0, %1
#
-# AVX512BWVL: %2 = VPSUBDZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPSUBDZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -157,31 +115,17 @@ name: test_sub_v2i64
alignment: 4
legalized: true
regBankSelected: true
-# NOVL: registers:
-# NOVL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NOVL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# SSE2: %2 = PSUBQrr %0, %1
+# SSE2: %2:vr128 = PSUBQrr %0, %1
#
-# AVX1: %2 = VPSUBQrr %0, %1
+# AVX1: %2:vr128 = VPSUBQrr %0, %1
#
-# AVX512VL: %2 = VPSUBQZ128rr %0, %1
+# AVX512VL: %2:vr128x = VPSUBQZ128rr %0, %1
#
-# AVX512BWVL: %2 = VPSUBQZ128rr %0, %1
+# AVX512BWVL: %2:vr128x = VPSUBQZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
diff --git a/test/CodeGen/X86/GlobalISel/select-sub-v256.mir b/test/CodeGen/X86/GlobalISel/select-sub-v256.mir
index d6bde7fbb691..614d13169f33 100644
--- a/test/CodeGen/X86/GlobalISel/select-sub-v256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-sub-v256.mir
@@ -29,29 +29,15 @@ name: test_sub_v32i8
alignment: 4
legalized: true
regBankSelected: true
-# AVX2: registers:
-# AVX2-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPSUBBYrr %0, %1
+# AVX2: %2:vr256 = VPSUBBYrr %0, %1
#
-# AVX512VL: %2 = VPSUBBYrr %0, %1
+# AVX512VL: %2:vr256 = VPSUBBYrr %0, %1
#
-# AVX512BWVL: %2 = VPSUBBZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPSUBBZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -69,29 +55,15 @@ name: test_sub_v16i16
alignment: 4
legalized: true
regBankSelected: true
-# AVX2: registers:
-# AVX2-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPSUBWYrr %0, %1
+# AVX2: %2:vr256 = VPSUBWYrr %0, %1
#
-# AVX512VL: %2 = VPSUBWYrr %0, %1
+# AVX512VL: %2:vr256 = VPSUBWYrr %0, %1
#
-# AVX512BWVL: %2 = VPSUBWZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPSUBWZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -109,29 +81,15 @@ name: test_sub_v8i32
alignment: 4
legalized: true
regBankSelected: true
-# AVX2: registers:
-# AVX2-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPSUBDYrr %0, %1
+# AVX2: %2:vr256 = VPSUBDYrr %0, %1
#
-# AVX512VL: %2 = VPSUBDZ256rr %0, %1
+# AVX512VL: %2:vr256x = VPSUBDZ256rr %0, %1
#
-# AVX512BWVL: %2 = VPSUBDZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPSUBDZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
@@ -149,29 +107,15 @@ name: test_sub_v4i64
alignment: 4
legalized: true
regBankSelected: true
-# AVX2: registers:
-# AVX2-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 1, class: vr256, preferred-register: '' }
-# AVX2-NEXT: - { id: 2, class: vr256, preferred-register: '' }
-#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
-#
-# AVX512BWVL: registers:
-# AVX512BWVL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# AVX512BWVL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX2: %2 = VPSUBQYrr %0, %1
+# AVX2: %2:vr256 = VPSUBQYrr %0, %1
#
-# AVX512VL: %2 = VPSUBQZ256rr %0, %1
+# AVX512VL: %2:vr256x = VPSUBQZ256rr %0, %1
#
-# AVX512BWVL: %2 = VPSUBQZ256rr %0, %1
+# AVX512BWVL: %2:vr256x = VPSUBQZ256rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %ymm0, %ymm1
diff --git a/test/CodeGen/X86/GlobalISel/select-sub-v512.mir b/test/CodeGen/X86/GlobalISel/select-sub-v512.mir
index 828a243b2656..67949219ba43 100644
--- a/test/CodeGen/X86/GlobalISel/select-sub-v512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-sub-v512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -26,23 +27,23 @@
...
---
name: test_sub_v64i8
-# ALL-LABEL: name: test_sub_v64i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPSUBBZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v64i8
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPSUBBZrr:%[0-9]+]]:vr512 = VPSUBBZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPSUBBZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<64 x s8>) = COPY %zmm0
%1(<64 x s8>) = COPY %zmm1
%2(<64 x s8>) = G_SUB %0, %1
@@ -52,23 +53,23 @@ body: |
...
---
name: test_sub_v32i16
-# ALL-LABEL: name: test_sub_v32i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPSUBWZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v32i16
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPSUBWZrr:%[0-9]+]]:vr512 = VPSUBWZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPSUBWZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<32 x s16>) = COPY %zmm0
%1(<32 x s16>) = COPY %zmm1
%2(<32 x s16>) = G_SUB %0, %1
@@ -78,23 +79,23 @@ body: |
...
---
name: test_sub_v16i32
-# ALL-LABEL: name: test_sub_v16i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPSUBDZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v16i32
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPSUBDZrr:%[0-9]+]]:vr512 = VPSUBDZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPSUBDZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<16 x s32>) = COPY %zmm0
%1(<16 x s32>) = COPY %zmm1
%2(<16 x s32>) = G_SUB %0, %1
@@ -104,23 +105,23 @@ body: |
...
---
name: test_sub_v8i64
-# ALL-LABEL: name: test_sub_v8i64
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr512, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %2 = VPSUBQZrr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %zmm0, %zmm1
+ ; ALL-LABEL: name: test_sub_v8i64
+ ; ALL: [[COPY:%[0-9]+]]:vr512 = COPY %zmm0
+ ; ALL: [[COPY1:%[0-9]+]]:vr512 = COPY %zmm1
+ ; ALL: [[VPSUBQZrr:%[0-9]+]]:vr512 = VPSUBQZrr [[COPY]], [[COPY1]]
+ ; ALL: %zmm0 = COPY [[VPSUBQZrr]]
+ ; ALL: RET 0, implicit %zmm0
%0(<8 x s64>) = COPY %zmm0
%1(<8 x s64>) = COPY %zmm1
%2(<8 x s64>) = G_SUB %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/select-sub.mir b/test/CodeGen/X86/GlobalISel/select-sub.mir
index d47f77828c9b..d2f99d12ae54 100644
--- a/test/CodeGen/X86/GlobalISel/select-sub.mir
+++ b/test/CodeGen/X86/GlobalISel/select-sub.mir
@@ -29,17 +29,13 @@
name: test_sub_i64
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %rsi
-# ALL-NEXT: %2 = SUB64rr %0, %1
+# ALL: %0:gr64 = COPY %rdi
+# ALL-NEXT: %1:gr64 = COPY %rsi
+# ALL-NEXT: %2:gr64 = SUB64rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
@@ -55,17 +51,13 @@ body: |
name: test_sub_i32
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
- { id: 2, class: gpr }
-# ALL: %0 = COPY %edi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = SUB32rr %0, %1
+# ALL: %0:gr32 = COPY %edi
+# ALL-NEXT: %1:gr32 = COPY %esi
+# ALL-NEXT: %2:gr32 = SUB32rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
@@ -73,7 +65,7 @@ body: |
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_SUB %0, %1
- %rax = COPY %2(s32)
+ %eax = COPY %2(s32)
...
---
@@ -83,23 +75,18 @@ legalized: true
regBankSelected: true
selected: false
tracksRegLiveness: true
-# ALL: registers:
-# NO_AVX512VL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %xmm0
-# ALL-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = PSUBDrr %0, %1
-# AVX-NEXT: %2 = VPSUBDrr %0, %1
-# AVX512F-NEXT: %2 = VPSUBDrr %0, %1
-# AVX512VL-NEXT: %2 = VPSUBDZ128rr %0, %1
+# NO_AVX512VL: %0:vr128 = COPY %xmm0
+# AVX512VL: %0:vr128x = COPY %xmm0
+# NO_AVX512VL: %1:vr128 = COPY %xmm1
+# AVX512VL: %1:vr128x = COPY %xmm1
+# SSE-NEXT: %2:vr128 = PSUBDrr %0, %1
+# AVX-NEXT: %2:vr128 = VPSUBDrr %0, %1
+# AVX512F-NEXT: %2:vr128 = VPSUBDrr %0, %1
+# AVX512VL-NEXT: %2:vr128x = VPSUBDZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
@@ -118,23 +105,19 @@ legalized: true
regBankSelected: true
selected: false
tracksRegLiveness: true
-# ALL: registers:
-# NO_AVX512VL-NEXT: - { id: 0, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# NO_AVX512VL-NEXT: - { id: 2, class: vr128, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 0, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = COPY %xmm0
-# ALL-NEXT: %1 = COPY %xmm1
-# SSE-NEXT: %2 = SUBPSrr %0, %1
-# AVX-NEXT: %2 = VSUBPSrr %0, %1
-# AVX512F-NEXT: %2 = VSUBPSrr %0, %1
-# AVX512VL-NEXT: %2 = VSUBPSZ128rr %0, %1
+# NO_AVX512VL: %0:vr128 = COPY %xmm0
+# NO_AVX512VL: %1:vr128 = COPY %xmm1
+# SSE-NEXT: %2:vr128 = SUBPSrr %0, %1
+# AVX-NEXT: %2:vr128 = VSUBPSrr %0, %1
+# AVX512F-NEXT: %2:vr128 = VSUBPSrr %0, %1
+#
+# AVX512VL: %0:vr128x = COPY %xmm0
+# AVX512VL: %1:vr128x = COPY %xmm1
+# AVX512VL-NEXT: %2:vr128x = VSUBPSZ128rr %0, %1
body: |
bb.1 (%ir-block.0):
liveins: %xmm0, %xmm1
diff --git a/test/CodeGen/X86/GlobalISel/select-trunc.mir b/test/CodeGen/X86/GlobalISel/select-trunc.mir
index 4df585628ddc..3ebecafc05de 100644
--- a/test/CodeGen/X86/GlobalISel/select-trunc.mir
+++ b/test/CodeGen/X86/GlobalISel/select-trunc.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
--- |
define i1 @trunc_i32toi1(i32 %a) {
@@ -33,24 +34,21 @@
...
---
name: trunc_i32toi1
-# CHECK-LABEL: name: trunc_i32toi1
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %0.sub_8bit
-# CHECK-NEXT: %al = COPY %1
-# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; CHECK-LABEL: name: trunc_i32toi1
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; CHECK: %al = COPY [[COPY1]]
+ ; CHECK: RET 0, implicit %al
%0(s32) = COPY %edi
%1(s1) = G_TRUNC %0(s32)
%al = COPY %1(s1)
@@ -59,24 +57,21 @@ body: |
...
---
name: trunc_i32toi8
-# CHECK-LABEL: name: trunc_i32toi8
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %0.sub_8bit
-# CHECK-NEXT: %al = COPY %1
-# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; CHECK-LABEL: name: trunc_i32toi8
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; CHECK: %al = COPY [[COPY1]]
+ ; CHECK: RET 0, implicit %al
%0(s32) = COPY %edi
%1(s8) = G_TRUNC %0(s32)
%al = COPY %1(s8)
@@ -85,24 +80,21 @@ body: |
...
---
name: trunc_i32toi16
-# CHECK-LABEL: name: trunc_i32toi16
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: %0 = COPY %edi
-# CHECK-NEXT: %1 = COPY %0.sub_16bit
-# CHECK-NEXT: %ax = COPY %1
-# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi
+ ; CHECK-LABEL: name: trunc_i32toi16
+ ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK: %ax = COPY [[COPY1]]
+ ; CHECK: RET 0, implicit %ax
%0(s32) = COPY %edi
%1(s16) = G_TRUNC %0(s32)
%ax = COPY %1(s16)
@@ -111,24 +103,21 @@ body: |
...
---
name: trunc_i64toi8
-# CHECK-LABEL: name: trunc_i64toi8
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64_with_sub_8bit, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: %0 = COPY %rdi
-# CHECK-NEXT: %1 = COPY %0.sub_8bit
-# CHECK-NEXT: %al = COPY %1
-# CHECK-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; CHECK-LABEL: name: trunc_i64toi8
+ ; CHECK: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; CHECK: %al = COPY [[COPY1]]
+ ; CHECK: RET 0, implicit %al
%0(s64) = COPY %rdi
%1(s8) = G_TRUNC %0(s64)
%al = COPY %1(s8)
@@ -137,24 +126,21 @@ body: |
...
---
name: trunc_i64toi16
-# CHECK-LABEL: name: trunc_i64toi16
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: %0 = COPY %rdi
-# CHECK-NEXT: %1 = COPY %0.sub_16bit
-# CHECK-NEXT: %ax = COPY %1
-# CHECK-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; CHECK-LABEL: name: trunc_i64toi16
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK: %ax = COPY [[COPY1]]
+ ; CHECK: RET 0, implicit %ax
%0(s64) = COPY %rdi
%1(s16) = G_TRUNC %0(s64)
%ax = COPY %1(s16)
@@ -163,24 +149,21 @@ body: |
...
---
name: trunc_i64toi32
-# CHECK-LABEL: name: trunc_i64toi32
alignment: 4
legalized: true
regBankSelected: true
-# CHECK: registers:
-# CHECK-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# CHECK-NEXT: - { id: 1, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr }
- { id: 1, class: gpr }
-# CHECK: %0 = COPY %rdi
-# CHECK-NEXT: %1 = COPY %0.sub_32bit
-# CHECK-NEXT: %eax = COPY %1
-# CHECK-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %rdi
+ ; CHECK-LABEL: name: trunc_i64toi32
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:gr32 = COPY [[COPY]].sub_32bit
+ ; CHECK: %eax = COPY [[COPY1]]
+ ; CHECK: RET 0, implicit %eax
%0(s64) = COPY %rdi
%1(s32) = G_TRUNC %0(s64)
%eax = COPY %1(s32)
diff --git a/test/CodeGen/X86/GlobalISel/select-undef.mir b/test/CodeGen/X86/GlobalISel/select-undef.mir
new file mode 100644
index 000000000000..897ed8550e18
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-undef.mir
@@ -0,0 +1,66 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
+--- |
+
+ define i8 @test() {
+ ret i8 undef
+ }
+
+ define i8 @test2(i8 %a) {
+ %r = add i8 %a, undef
+ ret i8 %r
+ }
+
+...
+---
+name: test
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+body: |
+ bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test
+ ; ALL: [[DEF:%[0-9]+]]:gr8 = IMPLICIT_DEF
+ ; ALL: %al = COPY [[DEF]]
+ ; ALL: RET 0, implicit %al
+ %0(s8) = G_IMPLICIT_DEF
+ %al = COPY %0(s8)
+ RET 0, implicit %al
+
+...
+---
+name: test2
+alignment: 4
+legalized: true
+regBankSelected: true
+registers:
+ - { id: 0, class: gpr, preferred-register: '' }
+ - { id: 1, class: gpr, preferred-register: '' }
+ - { id: 2, class: gpr, preferred-register: '' }
+liveins:
+fixedStack:
+stack:
+constants:
+body: |
+ bb.1 (%ir-block.0):
+ liveins: %edi
+
+ ; ALL-LABEL: name: test2
+ ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; ALL: [[DEF:%[0-9]+]]:gr8 = IMPLICIT_DEF
+ ; ALL: [[ADD8rr:%[0-9]+]]:gr8 = ADD8rr [[COPY]], [[DEF]], implicit-def %eflags
+ ; ALL: %al = COPY [[ADD8rr]]
+ ; ALL: RET 0, implicit %al
+ %0(s8) = COPY %dil
+ %1(s8) = G_IMPLICIT_DEF
+ %2(s8) = G_ADD %0, %1
+ %al = COPY %2(s8)
+ RET 0, implicit %al
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
index 09dc5344796f..55a3428c0557 100644
--- a/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
+++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec256.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f,+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512VL
--- |
@@ -8,41 +9,33 @@
...
---
name: test_unmerge
-# AVX-LABEL: name: test_unmerge
#
-# AVX512VL-LABEL: name: test_unmerge
alignment: 4
legalized: true
regBankSelected: true
-# AVX: registers:
-# AVX-NEXT: - { id: 0, class: vr256, preferred-register: '' }
-# AVX-NEXT: - { id: 1, class: vr128, preferred-register: '' }
-# AVX-NEXT: - { id: 2, class: vr128, preferred-register: '' }
#
-# AVX512VL: registers:
-# AVX512VL-NEXT: - { id: 0, class: vr256x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# AVX512VL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# AVX: %0 = IMPLICIT_DEF
-# AVX-NEXT: %1 = COPY %0.sub_xmm
-# AVX-NEXT: %2 = VEXTRACTF128rr %0, 1
-# AVX-NEXT: %xmm0 = COPY %1
-# AVX-NEXT: %xmm1 = COPY %2
-# AVX-NEXT: RET 0, implicit %xmm0, implicit %xmm1
#
-# AVX512VL: %0 = IMPLICIT_DEF
-# AVX512VL-NEXT: %1 = COPY %0.sub_xmm
-# AVX512VL-NEXT: %2 = VEXTRACTF32x4Z256rr %0, 1
-# AVX512VL-NEXT: %xmm0 = COPY %1
-# AVX512VL-NEXT: %xmm1 = COPY %2
-# AVX512VL-NEXT: RET 0, implicit %xmm0, implicit %xmm1
body: |
bb.1 (%ir-block.0):
+ ; AVX-LABEL: name: test_unmerge
+ ; AVX: [[DEF:%[0-9]+]]:vr256 = IMPLICIT_DEF
+ ; AVX: [[COPY:%[0-9]+]]:vr128 = COPY [[DEF]].sub_xmm
+ ; AVX: [[VEXTRACTF128rr:%[0-9]+]]:vr128 = VEXTRACTF128rr [[DEF]], 1
+ ; AVX: %xmm0 = COPY [[COPY]]
+ ; AVX: %xmm1 = COPY [[VEXTRACTF128rr]]
+ ; AVX: RET 0, implicit %xmm0, implicit %xmm1
+ ; AVX512VL-LABEL: name: test_unmerge
+ ; AVX512VL: [[DEF:%[0-9]+]]:vr256x = IMPLICIT_DEF
+ ; AVX512VL: [[COPY:%[0-9]+]]:vr128x = COPY [[DEF]].sub_xmm
+ ; AVX512VL: [[VEXTRACTF32x4Z256rr:%[0-9]+]]:vr128x = VEXTRACTF32x4Z256rr [[DEF]], 1
+ ; AVX512VL: %xmm0 = COPY [[COPY]]
+ ; AVX512VL: %xmm1 = COPY [[VEXTRACTF32x4Z256rr]]
+ ; AVX512VL: RET 0, implicit %xmm0, implicit %xmm1
%0(<8 x s32>) = IMPLICIT_DEF
%1(<4 x s32>), %2(<4 x s32>) = G_UNMERGE_VALUES %0(<8 x s32>)
%xmm0 = COPY %1(<4 x s32>)
diff --git a/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
index a63733d07f6a..4446ab5de998 100644
--- a/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
+++ b/test/CodeGen/X86/GlobalISel/select-unmerge-vec512.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
define void @test_unmerge_v128() {
@@ -11,32 +12,26 @@
...
---
name: test_unmerge_v128
-# ALL-LABEL: name: test_unmerge_v128
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 3, class: vr128x, preferred-register: '' }
-# ALL-NEXT: - { id: 4, class: vr128x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
- { id: 3, class: vecr }
- { id: 4, class: vecr }
-# ALL: %0 = IMPLICIT_DEF
-# ALL-NEXT: %1 = COPY %0.sub_xmm
-# ALL-NEXT: %2 = VEXTRACTF32x4Zrr %0, 1
-# ALL-NEXT: %3 = VEXTRACTF32x4Zrr %0, 2
-# ALL-NEXT: %4 = VEXTRACTF32x4Zrr %0, 3
-# ALL-NEXT: %xmm0 = COPY %1
-# ALL-NEXT: RET 0, implicit %xmm0
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_unmerge_v128
+ ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
+ ; ALL: [[COPY:%[0-9]+]]:vr128x = COPY [[DEF]].sub_xmm
+ ; ALL: [[VEXTRACTF32x4Zrr:%[0-9]+]]:vr128x = VEXTRACTF32x4Zrr [[DEF]], 1
+ ; ALL: [[VEXTRACTF32x4Zrr1:%[0-9]+]]:vr128x = VEXTRACTF32x4Zrr [[DEF]], 2
+ ; ALL: [[VEXTRACTF32x4Zrr2:%[0-9]+]]:vr128x = VEXTRACTF32x4Zrr [[DEF]], 3
+ ; ALL: %xmm0 = COPY [[COPY]]
+ ; ALL: RET 0, implicit %xmm0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<4 x s32>), %2(<4 x s32>), %3(<4 x s32>), %4(<4 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
%xmm0 = COPY %1(<4 x s32>)
@@ -45,30 +40,25 @@ body: |
...
---
name: test_unmerge_v256
-# ALL-LABEL: name: test_unmerge_v256
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: vr512, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: vr256x, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: vr256x, preferred-register: '' }
registers:
- { id: 0, class: vecr }
- { id: 1, class: vecr }
- { id: 2, class: vecr }
-# ALL: %0 = IMPLICIT_DEF
-# ALL-NEXT: %1 = COPY %0.sub_ymm
-# ALL-NEXT: %2 = VEXTRACTF64x4Zrr %0, 1
-# ALL-NEXT: %xmm0 = COPY %1
-# ALL-NEXT: RET 0, implicit %ymm0
body: |
bb.1 (%ir-block.0):
+ ; ALL-LABEL: name: test_unmerge_v256
+ ; ALL: [[DEF:%[0-9]+]]:vr512 = IMPLICIT_DEF
+ ; ALL: [[COPY:%[0-9]+]]:vr256x = COPY [[DEF]].sub_ymm
+ ; ALL: [[VEXTRACTF64x4Zrr:%[0-9]+]]:vr256x = VEXTRACTF64x4Zrr [[DEF]], 1
+ ; ALL: %ymm0 = COPY [[COPY]]
+ ; ALL: RET 0, implicit %ymm0
%0(<16 x s32>) = IMPLICIT_DEF
%1(<8 x s32>), %2(<8 x s32>) = G_UNMERGE_VALUES %0(<16 x s32>)
- %xmm0 = COPY %1(<8 x s32>)
+ %ymm0 = COPY %1(<8 x s32>)
RET 0, implicit %ymm0
...
-
diff --git a/test/CodeGen/X86/GlobalISel/select-xor-scalar.mir b/test/CodeGen/X86/GlobalISel/select-xor-scalar.mir
index 9d03c6a3f1a8..26b07db83c3a 100644
--- a/test/CodeGen/X86/GlobalISel/select-xor-scalar.mir
+++ b/test/CodeGen/X86/GlobalISel/select-xor-scalar.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL
--- |
@@ -24,14 +25,9 @@
...
---
name: test_xor_i8
-# ALL-LABEL: name: test_xor_i8
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr8, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr8, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -40,17 +36,18 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %dil
-# ALL-NEXT: %1 = COPY %sil
-# ALL-NEXT: %2 = XOR8rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %al = COPY %2
-# ALL-NEXT: RET 0, implicit %al
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s8) = COPY %edi
- %1(s8) = COPY %esi
+ ; ALL-LABEL: name: test_xor_i8
+ ; ALL: [[COPY:%[0-9]+]]:gr8 = COPY %dil
+ ; ALL: [[COPY1:%[0-9]+]]:gr8 = COPY %sil
+ ; ALL: [[XOR8rr:%[0-9]+]]:gr8 = XOR8rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %al = COPY [[XOR8rr]]
+ ; ALL: RET 0, implicit %al
+ %0(s8) = COPY %dil
+ %1(s8) = COPY %sil
%2(s8) = G_XOR %0, %1
%al = COPY %2(s8)
RET 0, implicit %al
@@ -58,14 +55,9 @@ body: |
...
---
name: test_xor_i16
-# ALL-LABEL: name: test_xor_i16
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr16, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -74,17 +66,18 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %di
-# ALL-NEXT: %1 = COPY %si
-# ALL-NEXT: %2 = XOR16rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %ax = COPY %2
-# ALL-NEXT: RET 0, implicit %ax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
- %0(s16) = COPY %edi
- %1(s16) = COPY %esi
+ ; ALL-LABEL: name: test_xor_i16
+ ; ALL: [[COPY:%[0-9]+]]:gr16 = COPY %di
+ ; ALL: [[COPY1:%[0-9]+]]:gr16 = COPY %si
+ ; ALL: [[XOR16rr:%[0-9]+]]:gr16 = XOR16rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %ax = COPY [[XOR16rr]]
+ ; ALL: RET 0, implicit %ax
+ %0(s16) = COPY %di
+ %1(s16) = COPY %si
%2(s16) = G_XOR %0, %1
%ax = COPY %2(s16)
RET 0, implicit %ax
@@ -92,14 +85,9 @@ body: |
...
---
name: test_xor_i32
-# ALL-LABEL: name: test_xor_i32
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -108,15 +96,16 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %edi
-# ALL-NEXT: %1 = COPY %esi
-# ALL-NEXT: %2 = XOR32rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %eax = COPY %2
-# ALL-NEXT: RET 0, implicit %eax
body: |
bb.1 (%ir-block.0):
liveins: %edi, %esi
+ ; ALL-LABEL: name: test_xor_i32
+ ; ALL: [[COPY:%[0-9]+]]:gr32 = COPY %edi
+ ; ALL: [[COPY1:%[0-9]+]]:gr32 = COPY %esi
+ ; ALL: [[XOR32rr:%[0-9]+]]:gr32 = XOR32rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %eax = COPY [[XOR32rr]]
+ ; ALL: RET 0, implicit %eax
%0(s32) = COPY %edi
%1(s32) = COPY %esi
%2(s32) = G_XOR %0, %1
@@ -126,14 +115,9 @@ body: |
...
---
name: test_xor_i64
-# ALL-LABEL: name: test_xor_i64
alignment: 4
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 1, class: gr64, preferred-register: '' }
-# ALL-NEXT: - { id: 2, class: gr64, preferred-register: '' }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
@@ -142,15 +126,16 @@ liveins:
fixedStack:
stack:
constants:
-# ALL: %0 = COPY %rdi
-# ALL-NEXT: %1 = COPY %rsi
-# ALL-NEXT: %2 = XOR64rr %0, %1, implicit-def %eflags
-# ALL-NEXT: %rax = COPY %2
-# ALL-NEXT: RET 0, implicit %rax
body: |
bb.1 (%ir-block.0):
liveins: %rdi, %rsi
+ ; ALL-LABEL: name: test_xor_i64
+ ; ALL: [[COPY:%[0-9]+]]:gr64 = COPY %rdi
+ ; ALL: [[COPY1:%[0-9]+]]:gr64 = COPY %rsi
+ ; ALL: [[XOR64rr:%[0-9]+]]:gr64 = XOR64rr [[COPY]], [[COPY1]], implicit-def %eflags
+ ; ALL: %rax = COPY [[XOR64rr]]
+ ; ALL: RET 0, implicit %rax
%0(s64) = COPY %rdi
%1(s64) = COPY %rsi
%2(s64) = G_XOR %0, %1
diff --git a/test/CodeGen/X86/GlobalISel/sub-scalar.ll b/test/CodeGen/X86/GlobalISel/sub-scalar.ll
new file mode 100644
index 000000000000..f8d825dff389
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/sub-scalar.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=X64
+
+define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
+; X64-LABEL: test_sub_i64:
+; X64: # %bb.0:
+; X64-NEXT: subq %rsi, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %ret = sub i64 %arg1, %arg2
+ ret i64 %ret
+}
+
+define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_sub_i32:
+; X64: # %bb.0:
+; X64-NEXT: subl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %ret = sub i32 %arg1, %arg2
+ ret i32 %ret
+}
+
+define i16 @test_sub_i16(i16 %arg1, i16 %arg2) {
+; X64-LABEL: test_sub_i16:
+; X64: # %bb.0:
+; X64-NEXT: subw %si, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %ret = sub i16 %arg1, %arg2
+ ret i16 %ret
+}
+
+define i8 @test_sub_i8(i8 %arg1, i8 %arg2) {
+; X64-LABEL: test_sub_i8:
+; X64: # %bb.0:
+; X64-NEXT: subb %sil, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %ret = sub i8 %arg1, %arg2
+ ret i8 %ret
+}
+
+define i32 @test_sub_i1(i32 %arg1, i32 %arg2) {
+; X64-LABEL: test_sub_i1:
+; X64: # %bb.0:
+; X64-NEXT: subb %sil, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: andl $1, %eax
+; X64-NEXT: retq
+ %a1 = trunc i32 %arg1 to i1
+ %a2 = trunc i32 %arg2 to i1
+ %x = sub i1 %a1 , %a2
+ %ret = zext i1 %x to i32
+ ret i32 %ret
+}
diff --git a/test/CodeGen/X86/GlobalISel/sub-vec.ll b/test/CodeGen/X86/GlobalISel/sub-vec.ll
index 9caf18f0c0c7..8186026836f3 100644
--- a/test/CodeGen/X86/GlobalISel/sub-vec.ll
+++ b/test/CodeGen/X86/GlobalISel/sub-vec.ll
@@ -3,7 +3,7 @@
define <16 x i8> @test_sub_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
; SKX-LABEL: test_sub_v16i8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = sub <16 x i8> %arg1, %arg2
@@ -12,7 +12,7 @@ define <16 x i8> @test_sub_v16i8(<16 x i8> %arg1, <16 x i8> %arg2) {
define <8 x i16> @test_sub_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
; SKX-LABEL: test_sub_v8i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = sub <8 x i16> %arg1, %arg2
@@ -21,7 +21,7 @@ define <8 x i16> @test_sub_v8i16(<8 x i16> %arg1, <8 x i16> %arg2) {
define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
; SKX-LABEL: test_sub_v4i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = sub <4 x i32> %arg1, %arg2
@@ -30,7 +30,7 @@ define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
define <2 x i64> @test_sub_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
; SKX-LABEL: test_sub_v2i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%ret = sub <2 x i64> %arg1, %arg2
@@ -39,7 +39,7 @@ define <2 x i64> @test_sub_v2i64(<2 x i64> %arg1, <2 x i64> %arg2) {
define <32 x i8> @test_sub_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
; SKX-LABEL: test_sub_v32i8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = sub <32 x i8> %arg1, %arg2
@@ -48,7 +48,7 @@ define <32 x i8> @test_sub_v32i8(<32 x i8> %arg1, <32 x i8> %arg2) {
define <16 x i16> @test_sub_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
; SKX-LABEL: test_sub_v16i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = sub <16 x i16> %arg1, %arg2
@@ -57,7 +57,7 @@ define <16 x i16> @test_sub_v16i16(<16 x i16> %arg1, <16 x i16> %arg2) {
define <8 x i32> @test_sub_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
; SKX-LABEL: test_sub_v8i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = sub <8 x i32> %arg1, %arg2
@@ -66,7 +66,7 @@ define <8 x i32> @test_sub_v8i32(<8 x i32> %arg1, <8 x i32> %arg2) {
define <4 x i64> @test_sub_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
; SKX-LABEL: test_sub_v4i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%ret = sub <4 x i64> %arg1, %arg2
@@ -75,7 +75,7 @@ define <4 x i64> @test_sub_v4i64(<4 x i64> %arg1, <4 x i64> %arg2) {
define <64 x i8> @test_sub_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
; SKX-LABEL: test_sub_v64i8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = sub <64 x i8> %arg1, %arg2
@@ -84,7 +84,7 @@ define <64 x i8> @test_sub_v64i8(<64 x i8> %arg1, <64 x i8> %arg2) {
define <32 x i16> @test_sub_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
; SKX-LABEL: test_sub_v32i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubw %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = sub <32 x i16> %arg1, %arg2
@@ -93,7 +93,7 @@ define <32 x i16> @test_sub_v32i16(<32 x i16> %arg1, <32 x i16> %arg2) {
define <16 x i32> @test_sub_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
; SKX-LABEL: test_sub_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = sub <16 x i32> %arg1, %arg2
@@ -102,7 +102,7 @@ define <16 x i32> @test_sub_v16i32(<16 x i32> %arg1, <16 x i32> %arg2) {
define <8 x i64> @test_sub_v8i64(<8 x i64> %arg1, <8 x i64> %arg2) {
; SKX-LABEL: test_sub_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%ret = sub <8 x i64> %arg1, %arg2
diff --git a/test/CodeGen/X86/GlobalISel/trunc.ll b/test/CodeGen/X86/GlobalISel/trunc.ll
index 6c0f01673afc..6c4729f3021c 100644
--- a/test/CodeGen/X86/GlobalISel/trunc.ll
+++ b/test/CodeGen/X86/GlobalISel/trunc.ll
@@ -3,7 +3,7 @@
define i1 @trunc_i32toi1(i32 %a) {
; CHECK-LABEL: trunc_i32toi1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%r = trunc i32 %a to i1
@@ -12,7 +12,7 @@ define i1 @trunc_i32toi1(i32 %a) {
define i8 @trunc_i32toi8(i32 %a) {
; CHECK-LABEL: trunc_i32toi8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%r = trunc i32 %a to i8
@@ -21,7 +21,7 @@ define i8 @trunc_i32toi8(i32 %a) {
define i16 @trunc_i32toi16(i32 %a) {
; CHECK-LABEL: trunc_i32toi16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%r = trunc i32 %a to i16
@@ -30,7 +30,7 @@ define i16 @trunc_i32toi16(i32 %a) {
define i8 @trunc_i64toi8(i64 %a) {
; CHECK-LABEL: trunc_i64toi8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%r = trunc i64 %a to i8
@@ -39,7 +39,7 @@ define i8 @trunc_i64toi8(i64 %a) {
define i16 @trunc_i64toi16(i64 %a) {
; CHECK-LABEL: trunc_i64toi16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%r = trunc i64 %a to i16
@@ -48,7 +48,7 @@ define i16 @trunc_i64toi16(i64 %a) {
define i32 @trunc_i64toi32(i64 %a) {
; CHECK-LABEL: trunc_i64toi32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%r = trunc i64 %a to i32
diff --git a/test/CodeGen/X86/GlobalISel/undef.ll b/test/CodeGen/X86/GlobalISel/undef.ll
new file mode 100644
index 000000000000..6edd0bfed501
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/undef.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
+
+define i8 @test() {
+; ALL-LABEL: test:
+; ALL: # %bb.0:
+; ALL-NEXT: retq
+ ret i8 undef
+}
+
+define i8 @test2(i8 %a) {
+; ALL-LABEL: test2:
+; ALL: # %bb.0:
+; ALL-NEXT: addb %al, %dil
+; ALL-NEXT: movl %edi, %eax
+; ALL-NEXT: retq
+ %r = add i8 %a, undef
+ ret i8 %r
+}
+
+
+define float @test3() {
+; ALL-LABEL: test3:
+; ALL: # %bb.0:
+; ALL-NEXT: retq
+ ret float undef
+}
+
+define float @test4(float %a) {
+; ALL-LABEL: test4:
+; ALL: # %bb.0:
+; ALL-NEXT: addss %xmm0, %xmm0
+; ALL-NEXT: retq
+ %r = fadd float %a, undef
+ ret float %r
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
index 2743f882b2e4..f4d359a20651 100644
--- a/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
+++ b/test/CodeGen/X86/GlobalISel/x86_64-fallback.ll
@@ -8,7 +8,7 @@
; the fallback path.
; Check that we fallback on invoke translation failures.
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s80) = G_FCONSTANT x86_fp80 0xK4002A000000000000000
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: G_STORE %1(s80), %0(p0); mem:ST10[%ptr](align=16) (in function: test_x86_fp80_dump)
; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_x86_fp80_dump
; FALLBACK-WITH-REPORT-OUT-LABEL: test_x86_fp80_dump:
define void @test_x86_fp80_dump(x86_fp80* %ptr){
@@ -16,3 +16,14 @@ define void @test_x86_fp80_dump(x86_fp80* %ptr){
ret void
}
+; Check that we fall back on byval arguments
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: call: ' call void @ScaleObjectOverwrite_3(%struct.PointListStruct* %index, %struct.PointListStruct* byval %index)' (in function: ScaleObjectOverwrite_2)
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for ScaleObjectOverwrite_2
+; FALLBACK-WITH-REPORT-OUT-LABEL: ScaleObjectOverwrite_2:
+%struct.PointListStruct = type { i8*, i8* }
+declare void @ScaleObjectOverwrite_3(%struct.PointListStruct* %index, %struct.PointListStruct* byval %index2)
+define void @ScaleObjectOverwrite_2(%struct.PointListStruct* %index) {
+entry:
+ call void @ScaleObjectOverwrite_3(%struct.PointListStruct* %index, %struct.PointListStruct* byval %index)
+ ret void
+}
diff --git a/test/CodeGen/X86/GlobalISel/xor-scalar.ll b/test/CodeGen/X86/GlobalISel/xor-scalar.ll
index 9941db8abd9c..069edaadee98 100644
--- a/test/CodeGen/X86/GlobalISel/xor-scalar.ll
+++ b/test/CodeGen/X86/GlobalISel/xor-scalar.ll
@@ -1,9 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL
+define i32 @test_xor_i1(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_xor_i1:
+; ALL: # %bb.0:
+; ALL-NEXT: cmpl %esi, %edi
+; ALL-NEXT: sete %al
+; ALL-NEXT: xorb %al, %al
+; ALL-NEXT: movzbl %al, %eax
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: retq
+ %c = icmp eq i32 %arg1, %arg2
+ %x = xor i1 %c , %c
+ %ret = zext i1 %x to i32
+ ret i32 %ret
+}
+
define i8 @test_xor_i8(i8 %arg1, i8 %arg2) {
; ALL-LABEL: test_xor_i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: xorb %dil, %sil
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -13,7 +28,7 @@ define i8 @test_xor_i8(i8 %arg1, i8 %arg2) {
define i16 @test_xor_i16(i16 %arg1, i16 %arg2) {
; ALL-LABEL: test_xor_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: xorw %di, %si
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -23,7 +38,7 @@ define i16 @test_xor_i16(i16 %arg1, i16 %arg2) {
define i32 @test_xor_i32(i32 %arg1, i32 %arg2) {
; ALL-LABEL: test_xor_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: xorl %edi, %esi
; ALL-NEXT: movl %esi, %eax
; ALL-NEXT: retq
@@ -33,7 +48,7 @@ define i32 @test_xor_i32(i32 %arg1, i32 %arg2) {
define i64 @test_xor_i64(i64 %arg1, i64 %arg2) {
; ALL-LABEL: test_xor_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: xorq %rdi, %rsi
; ALL-NEXT: movq %rsi, %rax
; ALL-NEXT: retq
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
index ee1c658d4c55..e6a56651837b 100644
--- a/test/CodeGen/X86/MachineBranchProb.ll
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -17,10 +17,10 @@ for.cond2: ; preds = %for.inc, %for.cond
%cmp4 = icmp eq i32 %i.1, %v3
%or.cond = or i1 %tobool, %cmp4
br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0
-; CHECK: BB#1: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.53%) BB#4({{[0-9a-fx/= ]+}}98.47%)
-; CHECK: BB#4: derived from LLVM BB %for.cond2
-; CHECK: Successors according to CFG: BB#3({{[0-9a-fx/= ]+}}1.55%) BB#2({{[0-9a-fx/= ]+}}98.45%)
+; CHECK: %bb.1: derived from LLVM BB %for.cond2
+; CHECK: Successors according to CFG: %bb.3({{[0-9a-fx/= ]+}}1.53%) %bb.4({{[0-9a-fx/= ]+}}98.47%)
+; CHECK: %bb.4: derived from LLVM BB %for.cond2
+; CHECK: Successors according to CFG: %bb.3({{[0-9a-fx/= ]+}}1.55%) %bb.2({{[0-9a-fx/= ]+}}98.45%)
for.inc: ; preds = %for.cond2
%shl = shl i32 %bit.0, 1
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 55ff14988069..fd4e9891bacb 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,14 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
-; CHECK-LABEL: merge_const_store:
; save 1,2,3 ... as one big integer.
-; CHECK: movabsq $578437695752307201
-; CHECK: ret
define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+; CHECK-LABEL: merge_const_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB0_3
+; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
+; CHECK-NEXT: movabsq $578437695752307201, %rax # imm = 0x807060504030201
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_2: # %.lr.ph
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %rax, (%rsi)
+; CHECK-NEXT: addq $8, %rsi
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: .LBB0_3: # %._crit_edge
+; CHECK-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -39,10 +52,23 @@ define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwt
}
; No vectors because we use noimplicitfloat
-; CHECK-LABEL: merge_const_store_no_vec:
-; CHECK-NOT: vmovups
-; CHECK: ret
define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimplicitfloat{
+; CHECK-LABEL: merge_const_store_no_vec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB1_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB1_1: # %.lr.ph
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq $0, (%rsi)
+; CHECK-NEXT: movq $0, 8(%rsi)
+; CHECK-NEXT: movq $0, 16(%rsi)
+; CHECK-NEXT: movq $0, 24(%rsi)
+; CHECK-NEXT: addq $32, %rsi
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB1_1
+; CHECK-NEXT: .LBB1_2: # %._crit_edge
+; CHECK-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -73,10 +99,23 @@ define void @merge_const_store_no_vec(i32 %count, %struct.B* nocapture %p) noimp
}
; Move the constants using a single vector store.
-; CHECK-LABEL: merge_const_store_vec:
-; CHECK: vmovups
-; CHECK: ret
define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
+; CHECK-LABEL: merge_const_store_vec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB2_3
+; CHECK-NEXT: # %bb.1: # %.lr.ph.preheader
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB2_2: # %.lr.ph
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovups %ymm0, (%rsi)
+; CHECK-NEXT: addq $32, %rsi
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB2_2
+; CHECK-NEXT: .LBB2_3: # %._crit_edge
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -107,13 +146,23 @@ define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind
}
; Move the first 4 constants as a single vector. Move the rest as scalars.
-; CHECK-LABEL: merge_nonconst_store:
-; CHECK: movl $67305985
-; CHECK: movb
-; CHECK: movw
-; CHECK: movb
-; CHECK: ret
define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+; CHECK-LABEL: merge_nonconst_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB3_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB3_1: # %.lr.ph
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl $67305985, (%rdx) # imm = 0x4030201
+; CHECK-NEXT: movb %sil, 4(%rdx)
+; CHECK-NEXT: movw $1798, 5(%rdx) # imm = 0x706
+; CHECK-NEXT: movb $8, 7(%rdx)
+; CHECK-NEXT: addq $8, %rdx
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB3_1
+; CHECK-NEXT: .LBB3_2: # %._crit_edge
+; CHECK-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
.lr.ph:
@@ -143,15 +192,34 @@ define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) n
ret void
}
-
-; CHECK-LABEL: merge_loads_i16:
-; load:
-; BWON: movzwl
-; BWOFF: movw
-; store:
-; CHECK: movw
-; CHECK: ret
define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+; BWON-LABEL: merge_loads_i16:
+; BWON: # %bb.0:
+; BWON-NEXT: testl %edi, %edi
+; BWON-NEXT: jle .LBB4_2
+; BWON-NEXT: .p2align 4, 0x90
+; BWON-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
+; BWON-NEXT: movzwl (%rsi), %eax
+; BWON-NEXT: movw %ax, (%rdx)
+; BWON-NEXT: addq $8, %rdx
+; BWON-NEXT: decl %edi
+; BWON-NEXT: jne .LBB4_1
+; BWON-NEXT: .LBB4_2: # %._crit_edge
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: merge_loads_i16:
+; BWOFF: # %bb.0:
+; BWOFF-NEXT: testl %edi, %edi
+; BWOFF-NEXT: jle .LBB4_2
+; BWOFF-NEXT: .p2align 4, 0x90
+; BWOFF-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
+; BWOFF-NEXT: movw (%rsi), %ax
+; BWOFF-NEXT: movw %ax, (%rdx)
+; BWOFF-NEXT: addq $8, %rdx
+; BWOFF-NEXT: decl %edi
+; BWOFF-NEXT: jne .LBB4_1
+; BWOFF-NEXT: .LBB4_2: # %._crit_edge
+; BWOFF-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -179,15 +247,40 @@ define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struc
}
; The loads and the stores are interleaved. Can't merge them.
-; CHECK-LABEL: no_merge_loads:
-; BWON: movzbl
-; BWOFF: movb
-; CHECK: movb
-; BWON: movzbl
-; BWOFF: movb
-; CHECK: movb
-; CHECK: ret
define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+; BWON-LABEL: no_merge_loads:
+; BWON: # %bb.0:
+; BWON-NEXT: testl %edi, %edi
+; BWON-NEXT: jle .LBB5_2
+; BWON-NEXT: .p2align 4, 0x90
+; BWON-NEXT: .LBB5_1: # %a4
+; BWON-NEXT: # =>This Inner Loop Header: Depth=1
+; BWON-NEXT: movzbl (%rsi), %eax
+; BWON-NEXT: movb %al, (%rdx)
+; BWON-NEXT: movzbl 1(%rsi), %eax
+; BWON-NEXT: movb %al, 1(%rdx)
+; BWON-NEXT: addq $8, %rdx
+; BWON-NEXT: decl %edi
+; BWON-NEXT: jne .LBB5_1
+; BWON-NEXT: .LBB5_2: # %._crit_edge
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: no_merge_loads:
+; BWOFF: # %bb.0:
+; BWOFF-NEXT: testl %edi, %edi
+; BWOFF-NEXT: jle .LBB5_2
+; BWOFF-NEXT: .p2align 4, 0x90
+; BWOFF-NEXT: .LBB5_1: # %a4
+; BWOFF-NEXT: # =>This Inner Loop Header: Depth=1
+; BWOFF-NEXT: movb (%rsi), %al
+; BWOFF-NEXT: movb %al, (%rdx)
+; BWOFF-NEXT: movb 1(%rsi), %al
+; BWOFF-NEXT: movb %al, 1(%rdx)
+; BWOFF-NEXT: addq $8, %rdx
+; BWOFF-NEXT: decl %edi
+; BWOFF-NEXT: jne .LBB5_1
+; BWOFF-NEXT: .LBB5_2: # %._crit_edge
+; BWOFF-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -214,14 +307,20 @@ a4: ; preds = %4, %.lr.ph
ret void
}
-
-; CHECK-LABEL: merge_loads_integer:
-; load:
-; CHECK: movq
-; store:
-; CHECK: movq
-; CHECK: ret
define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+; CHECK-LABEL: merge_loads_integer:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB6_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq (%rsi), %rax
+; CHECK-NEXT: movq %rax, (%rdx)
+; CHECK-NEXT: addq $32, %rdx
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB6_1
+; CHECK-NEXT: .LBB6_2: # %._crit_edge
+; CHECK-NEXT: retq
%1 = icmp sgt i32 %count, 0
br i1 %1, label %.lr.ph, label %._crit_edge
@@ -248,14 +347,21 @@ define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %s
ret void
}
-
-; CHECK-LABEL: merge_loads_vector:
-; load:
-; CHECK: movups
-; store:
-; CHECK: movups
-; CHECK: ret
define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+; CHECK-LABEL: merge_loads_vector:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB7_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB7_1: # %block4
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovups (%rsi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, (%rdx)
+; CHECK-NEXT: addq $32, %rdx
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB7_1
+; CHECK-NEXT: .LBB7_2: # %._crit_edge
+; CHECK-NEXT: retq
%a1 = icmp sgt i32 %count, 0
br i1 %a1, label %.lr.ph, label %._crit_edge
@@ -290,14 +396,22 @@ block4: ; preds = %4, %.lr.ph
ret void
}
-;; On x86, even unaligned copies can be merged to vector ops.
-; CHECK-LABEL: merge_loads_no_align:
-; load:
-; CHECK: vmovups
-; store:
-; CHECK: vmovups
-; CHECK: ret
+; On x86, even unaligned copies can be merged to vector ops.
define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+; CHECK-LABEL: merge_loads_no_align:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jle .LBB8_2
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB8_1: # %block4
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vmovups (%rsi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, (%rdx)
+; CHECK-NEXT: addq $32, %rdx
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: jne .LBB8_1
+; CHECK-NEXT: .LBB8_2: # %._crit_edge
+; CHECK-NEXT: retq
%a1 = icmp sgt i32 %count, 0
br i1 %a1, label %.lr.ph, label %._crit_edge
@@ -334,11 +448,36 @@ block4: ; preds = %4, %.lr.ph
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy.
-; CHECK-LABEL: MergeLoadStoreBaseIndexOffset:
-; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
-; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
-; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
+; BWON-LABEL: MergeLoadStoreBaseIndexOffset:
+; BWON: # %bb.0:
+; BWON-NEXT: movl %ecx, %r8d
+; BWON-NEXT: xorl %ecx, %ecx
+; BWON-NEXT: .p2align 4, 0x90
+; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; BWON-NEXT: movq (%rdi,%rcx,8), %rax
+; BWON-NEXT: movzwl (%rdx,%rax), %eax
+; BWON-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWON-NEXT: incq %rcx
+; BWON-NEXT: cmpl %ecx, %r8d
+; BWON-NEXT: jne .LBB9_1
+; BWON-NEXT: # %bb.2:
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset:
+; BWOFF: # %bb.0:
+; BWOFF-NEXT: movl %ecx, %r8d
+; BWOFF-NEXT: xorl %ecx, %ecx
+; BWOFF-NEXT: .p2align 4, 0x90
+; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1
+; BWOFF-NEXT: movq (%rdi,%rcx,8), %rax
+; BWOFF-NEXT: movw (%rdx,%rax), %ax
+; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWOFF-NEXT: incq %rcx
+; BWOFF-NEXT: cmpl %ecx, %r8d
+; BWOFF-NEXT: jne .LBB9_1
+; BWOFF-NEXT: # %bb.2:
+; BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -366,12 +505,36 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) {
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy for complicated address calculation.
-; .
-; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
-; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
-; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
-; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i64 %n) {
+; BWON-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; BWON: # %bb.0:
+; BWON-NEXT: xorl %r8d, %r8d
+; BWON-NEXT: .p2align 4, 0x90
+; BWON-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
+; BWON-NEXT: movsbq (%rsi), %rax
+; BWON-NEXT: movzwl (%rdx,%rax), %eax
+; BWON-NEXT: movw %ax, (%rdi,%r8)
+; BWON-NEXT: incq %rsi
+; BWON-NEXT: addq $2, %r8
+; BWON-NEXT: cmpq %rcx, %r8
+; BWON-NEXT: jl .LBB10_1
+; BWON-NEXT: # %bb.2:
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetComplicated:
+; BWOFF: # %bb.0:
+; BWOFF-NEXT: xorl %r8d, %r8d
+; BWOFF-NEXT: .p2align 4, 0x90
+; BWOFF-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
+; BWOFF-NEXT: movsbq (%rsi), %rax
+; BWOFF-NEXT: movw (%rdx,%rax), %ax
+; BWOFF-NEXT: movw %ax, (%rdi,%r8)
+; BWOFF-NEXT: incq %rsi
+; BWOFF-NEXT: addq $2, %r8
+; BWOFF-NEXT: cmpq %rcx, %r8
+; BWOFF-NEXT: jl .LBB10_1
+; BWOFF-NEXT: # %bb.2:
+; BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -401,11 +564,36 @@ define void @MergeLoadStoreBaseIndexOffsetComplicated(i8* %a, i8* %b, i8* %c, i6
; Make sure that we merge the consecutive load/store sequence below and use a
; word (16 bit) instead of a byte copy even if there are intermediate sign
; extensions.
-; CHECK-LABEL: MergeLoadStoreBaseIndexOffsetSext:
-; BWON: movzwl (%{{.*}},%{{.*}}), %e[[REG:[a-z]+]]
-; BWOFF: movw (%{{.*}},%{{.*}}), %[[REG:[a-z]+]]
-; CHECK: movw %[[REG]], (%{{.*}})
define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
+; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext:
+; BWON: # %bb.0:
+; BWON-NEXT: movl %ecx, %r8d
+; BWON-NEXT: xorl %ecx, %ecx
+; BWON-NEXT: .p2align 4, 0x90
+; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; BWON-NEXT: movsbq (%rdi,%rcx), %rax
+; BWON-NEXT: movzwl (%rdx,%rax), %eax
+; BWON-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWON-NEXT: incq %rcx
+; BWON-NEXT: cmpl %ecx, %r8d
+; BWON-NEXT: jne .LBB11_1
+; BWON-NEXT: # %bb.2:
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext:
+; BWOFF: # %bb.0:
+; BWOFF-NEXT: movl %ecx, %r8d
+; BWOFF-NEXT: xorl %ecx, %ecx
+; BWOFF-NEXT: .p2align 4, 0x90
+; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax
+; BWOFF-NEXT: movw (%rdx,%rax), %ax
+; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2)
+; BWOFF-NEXT: incq %rcx
+; BWOFF-NEXT: cmpl %ecx, %r8d
+; BWOFF-NEXT: jne .LBB11_1
+; BWOFF-NEXT: # %bb.2:
+; BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -434,10 +622,44 @@ define void @MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) {
; However, we can only merge while ignoring sign extensions when they are on all
; memory computations.
-; CHECK-LABEL: loadStoreBaseIndexOffsetSextNoSex:
-; CHECK-NOT: movw (%{{.*}},%{{.*}}), [[REG:%[a-z]+]]
-; CHECK-NOT: movw [[REG]], (%{{.*}})
define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
+; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex:
+; BWON: # %bb.0:
+; BWON-NEXT: movl %ecx, %r8d
+; BWON-NEXT: xorl %ecx, %ecx
+; BWON-NEXT: .p2align 4, 0x90
+; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; BWON-NEXT: movsbq (%rdi,%rcx), %rax
+; BWON-NEXT: movzbl (%rdx,%rax), %r9d
+; BWON-NEXT: incb %al
+; BWON-NEXT: movsbq %al, %rax
+; BWON-NEXT: movzbl (%rdx,%rax), %eax
+; BWON-NEXT: movb %r9b, (%rsi,%rcx,2)
+; BWON-NEXT: movb %al, 1(%rsi,%rcx,2)
+; BWON-NEXT: incq %rcx
+; BWON-NEXT: cmpl %ecx, %r8d
+; BWON-NEXT: jne .LBB12_1
+; BWON-NEXT: # %bb.2:
+; BWON-NEXT: retq
+;
+; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex:
+; BWOFF: # %bb.0:
+; BWOFF-NEXT: movl %ecx, %r8d
+; BWOFF-NEXT: xorl %ecx, %ecx
+; BWOFF-NEXT: .p2align 4, 0x90
+; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
+; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax
+; BWOFF-NEXT: movb (%rdx,%rax), %r9b
+; BWOFF-NEXT: incb %al
+; BWOFF-NEXT: movsbq %al, %rax
+; BWOFF-NEXT: movb (%rdx,%rax), %al
+; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2)
+; BWOFF-NEXT: movb %al, 1(%rsi,%rcx,2)
+; BWOFF-NEXT: incq %rcx
+; BWOFF-NEXT: cmpl %ecx, %r8d
+; BWOFF-NEXT: jne .LBB12_1
+; BWOFF-NEXT: # %bb.2:
+; BWOFF-NEXT: retq
br label %1
; <label>:1
@@ -467,6 +689,11 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) {
; PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 )
define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
+; CHECK-LABEL: merge_vec_element_store:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%vecext0 = extractelement <8 x float> %v, i32 0
%vecext1 = extractelement <8 x float> %v, i32 1
%vecext2 = extractelement <8 x float> %v, i32 2
@@ -492,15 +719,17 @@ define void @merge_vec_element_store(<8 x float> %v, float* %ptr) {
store float %vecext7, float* %arrayidx7, align 4
ret void
-; CHECK-LABEL: merge_vec_element_store
-; CHECK: vmovups
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
}
; PR21711 - Merge vector stores into wider vector stores.
; These should be merged into 32-byte stores.
define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x float>* %ptr) {
+; CHECK-LABEL: merge_vec_extract_stores:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, 48(%rdi)
+; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%idx0 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
%idx2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
@@ -515,15 +744,16 @@ define void @merge_vec_extract_stores(<8 x float> %v1, <8 x float> %v2, <4 x flo
store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
ret void
-; CHECK-LABEL: merge_vec_extract_stores
-; CHECK: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
}
-; Merging vector stores when sourced from vector loads is not currently handled.
+; Merging vector stores when sourced from vector loads is now handled.
define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
+; CHECK-LABEL: merge_vec_stores_from_loads:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0
+; CHECK-NEXT: vmovups %ymm0, (%rsi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%load_idx0 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 0
%load_idx1 = getelementptr inbounds <4 x float>, <4 x float>* %v, i64 1
%v0 = load <4 x float>, <4 x float>* %load_idx0
@@ -534,33 +764,32 @@ define void @merge_vec_stores_from_loads(<4 x float>* %v, <4 x float>* %ptr) {
store <4 x float> %v1, <4 x float>* %store_idx1, align 16
ret void
-; CHECK-LABEL: merge_vec_stores_from_loads
-; CHECK: vmovaps
-; CHECK-NEXT: vmovaps
-; CHECK-NEXT: vmovaps
-; CHECK-NEXT: vmovaps
-; CHECK-NEXT: retq
}
-; Merging vector stores when sourced from a constant vector is not currently handled.
+; Merging vector stores when sourced from a constant vector is not currently handled.
define void @merge_vec_stores_of_constants(<4 x i32>* %ptr) {
+; CHECK-LABEL: merge_vec_stores_of_constants:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, 48(%rdi)
+; CHECK-NEXT: vmovaps %xmm0, 64(%rdi)
+; CHECK-NEXT: retq
%idx0 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 3
%idx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 4
store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx0, align 16
store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>* %idx1, align 16
ret void
-; CHECK-LABEL: merge_vec_stores_of_constants
-; CHECK: vxorps
-; CHECK-NEXT: vmovaps
-; CHECK-NEXT: vmovaps
-; CHECK-NEXT: retq
}
; This is a minimized test based on real code that was failing.
-; We could merge stores (and loads) like this...
-
+; This should now be merged.
define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
+; CHECK-LABEL: merge_vec_element_and_scalar_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
+; CHECK-NEXT: retq
%idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
%idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
%idx4 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 4
@@ -575,16 +804,16 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
store i64 %a1, i64* %idx5, align 8
ret void
-; CHECK-LABEL: merge_vec_element_and_scalar_load
-; CHECK: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: movq %rcx, 40(%rdi)
-; CHECK-NEXT: retq
}
; Don't let a non-consecutive store thwart merging of the last two.
define void @almost_consecutive_stores(i8* %p) {
+; CHECK-LABEL: almost_consecutive_stores:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movb $0, (%rdi)
+; CHECK-NEXT: movb $1, 42(%rdi)
+; CHECK-NEXT: movw $770, 2(%rdi) # imm = 0x302
+; CHECK-NEXT: retq
store i8 0, i8* %p
%p1 = getelementptr i8, i8* %p, i64 42
store i8 1, i8* %p1
@@ -593,17 +822,15 @@ define void @almost_consecutive_stores(i8* %p) {
%p3 = getelementptr i8, i8* %p, i64 3
store i8 3, i8* %p3
ret void
-; CHECK-LABEL: almost_consecutive_stores
-; CHECK-DAG: movb $0, (%rdi)
-; CHECK-DAG: movb $1, 42(%rdi)
-; CHECK-DAG: movw $770, 2(%rdi)
-; CHECK: retq
}
; We should be able to merge these.
define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
+; CHECK-LABEL: merge_bitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
%fv = bitcast <4 x i32> %v to <4 x float>
-
%vecext1 = extractelement <4 x i32> %v, i32 1
%vecext2 = extractelement <4 x i32> %v, i32 2
%vecext3 = extractelement <4 x i32> %v, i32 3
@@ -620,11 +847,4 @@ define void @merge_bitcast(<4 x i32> %v, float* %ptr) {
store float %f2, float* %idx2, align 4
store float %f3, float* %idx3, align 4
ret void
-
-; CHECK-LABEL: merge_bitcast
-; CHECK: vmovd %xmm0, (%rdi)
-; CHECK-NEXT: vpextrd $1, %xmm0, 4(%rdi)
-; CHECK-NEXT: vpextrd $2, %xmm0, 8(%rdi)
-; CHECK-NEXT: vpextrd $3, %xmm0, 12(%rdi)
-; CHECK-NEXT: retq
}
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
index 5e375cc42e01..cb7dabefe45a 100644
--- a/test/CodeGen/X86/O0-pipeline.ll
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -22,7 +22,7 @@
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
; CHECK-NEXT: Remove unreachable blocks from the CFG
-; CHECK-NEXT: Inserts calls to mcount-like functions
+; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Rewrite Symbols
@@ -42,6 +42,8 @@
; CHECK-NEXT: Fast Register Allocator
; CHECK-NEXT: Bundle Machine CFG Edges
; CHECK-NEXT: X86 FP Stackifier
+; CHECK-NEXT: Lazy Machine Block Frequency Analysis
+; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
; CHECK-NEXT: Post-RA pseudo instruction expansion pass
; CHECK-NEXT: X86 pseudo instruction expansion pass
diff --git a/test/CodeGen/X86/PR34565.ll b/test/CodeGen/X86/PR34565.ll
new file mode 100644
index 000000000000..21ea13869229
--- /dev/null
+++ b/test/CodeGen/X86/PR34565.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+; Test for PR34565: check that DBG instructions are ignored while optimizing
+; X86 CMOV instructions.
+; In this case, we check that there is no 'cmov' generated.
+
+; CHECK-NOT: cmov
+
+@main.buf = private unnamed_addr constant [10 x i64] [i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9], align 8
+
+define i32 @main() #0 !dbg !5 {
+entry:
+ br label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %a.010 = phi i32 [ 0, %entry ], [ %add.a.0, %while.body ]
+ %b.09 = phi i32 [ 10, %entry ], [ %b.0.add, %while.body ]
+ %add = add i32 %a.010, %b.09
+ %call = tail call i32 @rand()
+ %conv = sext i32 %call to i64
+ %arrayidx = getelementptr inbounds [10 x i64], [10 x i64]* @main.buf, i32 0, i32 %add
+ %0 = load i64, i64* %arrayidx, align 8
+ %cmp1 = icmp ult i64 %0, %conv
+ %b.0.add = select i1 %cmp1, i32 %b.09, i32 %add
+ %add.a.0 = select i1 %cmp1, i32 %add, i32 %a.010
+ tail call void @llvm.dbg.value(metadata i32 %add.a.0, metadata !10, metadata !DIExpression()), !dbg !13
+ tail call void @llvm.dbg.value(metadata i32 %b.0.add, metadata !12, metadata !DIExpression()), !dbg !14
+ tail call void @llvm.dbg.value(metadata i32 %add.a.0, metadata !10, metadata !DIExpression()), !dbg !13
+ tail call void @llvm.dbg.value(metadata i32 %b.0.add, metadata !12, metadata !DIExpression()), !dbg !14
+ %cmp = icmp ult i32 %add.a.0, %b.0.add
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body
+ ret i32 0
+}
+
+declare i32 @rand()
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+attributes #0 = { "target-cpu"="x86-64" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 (trunk)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "PR34565.c", directory: "\5C")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !6, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !9)
+!6 = !DISubroutineType(types: !7)
+!7 = !{!8}
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{!10, !12}
+!10 = !DILocalVariable(name: "a", scope: !5, file: !1, line: 6, type: !11)
+!11 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!12 = !DILocalVariable(name: "b", scope: !5, file: !1, line: 7, type: !11)
+!13 = !DILocation(line: 6, column: 16, scope: !5)
+!14 = !DILocation(line: 7, column: 16, scope: !5)
diff --git a/test/CodeGen/X86/SwitchLowering.ll b/test/CodeGen/X86/SwitchLowering.ll
index 5f17d9d85726..6ee5152c040f 100644
--- a/test/CodeGen/X86/SwitchLowering.ll
+++ b/test/CodeGen/X86/SwitchLowering.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep cmp | count 1
+; RUN: llc < %s -mtriple=i686-- | grep cmp | count 1
; PR964
define i8* @FindChar(i8* %CurPtr) {
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index ba21c32e56eb..e6519a60a4b4 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -5,7 +5,7 @@
define void @pull_bitcast(<4 x i8>* %pA, <4 x i8>* %pB) {
; CHECK-LABEL: pull_bitcast:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: xorl %eax, (%rdi)
; CHECK-NEXT: retq
@@ -18,7 +18,7 @@ define void @pull_bitcast(<4 x i8>* %pA, <4 x i8>* %pB) {
define <4 x i32> @multi_use_swizzle(<4 x i32>* %pA, <4 x i32>* %pB) {
; CHECK-LABEL: multi_use_swizzle:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2]
; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,2]
@@ -36,7 +36,7 @@ define <4 x i32> @multi_use_swizzle(<4 x i32>* %pA, <4 x i32>* %pB) {
define <4 x i8> @pull_bitcast2(<4 x i8>* %pA, <4 x i8>* %pB, <4 x i8>* %pC) {
; CHECK-LABEL: pull_bitcast2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl (%rdi), %eax
; CHECK-NEXT: movl %eax, (%rdx)
; CHECK-NEXT: xorl (%rsi), %eax
@@ -53,7 +53,7 @@ define <4 x i8> @pull_bitcast2(<4 x i8>* %pA, <4 x i8>* %pB, <4 x i8>* %pC) {
define <4 x i32> @reverse_1(<4 x i32>* %pA, <4 x i32>* %pB) {
; CHECK-LABEL: reverse_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: retq
%A = load <4 x i32>, <4 x i32>* %pA
@@ -65,8 +65,8 @@ define <4 x i32> @reverse_1(<4 x i32>* %pA, <4 x i32>* %pB) {
define <4 x i32> @no_reverse_shuff(<4 x i32>* %pA, <4 x i32>* %pB) {
; CHECK-LABEL: no_reverse_shuff:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; CHECK-NEXT: retq
%A = load <4 x i32>, <4 x i32>* %pA
%B = load <4 x i32>, <4 x i32>* %pB
diff --git a/test/CodeGen/X86/TruncAssertSext.ll b/test/CodeGen/X86/TruncAssertSext.ll
new file mode 100644
index 000000000000..9ab7622ef9dc
--- /dev/null
+++ b/test/CodeGen/X86/TruncAssertSext.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown | FileCheck %s
+; Checks that a zeroing mov is inserted for the trunc/zext pair even when
+; the source of the zext is an AssertSext node
+; PR20494
+
+define i64 @main(i64 %a) {
+; CHECK-LABEL: main:
+; CHECK: # %bb.0:
+; CHECK-NEXT: orq $-2, %rdi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %or = or i64 %a, -2
+ %trunc = trunc i64 %or to i32
+ br label %l
+l:
+ %ext = zext i32 %trunc to i64
+ trunc i64 %or to i32 ; to keep the or from being narrowed
+ ret i64 %ext
+}
diff --git a/test/CodeGen/X86/TruncAssertZext.ll b/test/CodeGen/X86/TruncAssertZext.ll
index 8c664127f92a..80f8e0f647a0 100644
--- a/test/CodeGen/X86/TruncAssertZext.ll
+++ b/test/CodeGen/X86/TruncAssertZext.ll
@@ -1,16 +1,41 @@
-; RUN: llc < %s -O2 -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown | FileCheck %s
; Checks that a zeroing mov is inserted for the trunc/zext pair even when
-; the source of the zext is an AssertSext node
-; PR20494
+; the source of the zext is an AssertZext node
+; PR28540
-define i64 @main(i64 %a) {
-; CHECK-LABEL: main
-; CHECK: movl %e{{..}}, %eax
-; CHECK: ret
- %or = or i64 %a, -2
+define i64 @foo() {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq $-1, %rax
+; CHECK-NEXT: retq
+ ret i64 -1
+}
+
+define i64 @main() {
+; CHECK-LABEL: main:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq foo
+; CHECK-NEXT: movabsq $-4294967041, %rcx # imm = 0xFFFFFFFF000000FF
+; CHECK-NEXT: andq %rax, %rcx
+; CHECK-NEXT: movl %ecx, %ecx
+; CHECK-NEXT: leaq (,%rcx,8), %rax
+; CHECK-NEXT: subq %rcx, %rax
+; CHECK-NEXT: shrq $32, %rax
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+ %b = call i64 @foo()
+ %or = and i64 %b, 18446744069414584575 ; this is 0xffffffff000000ff
%trunc = trunc i64 %or to i32
br label %l
l:
%ext = zext i32 %trunc to i64
- ret i64 %ext
+ %mul = mul i64 %ext, 7
+ br label %m
+m: ; keeps dag combine from seeing the multiply and the shift together
+ %shr = lshr i64 %mul, 32
+ trunc i64 %or to i32 ; keeps the and alive so it doesn't simplify
+ ret i64 %shr
}
diff --git a/test/CodeGen/X86/WidenArith.ll b/test/CodeGen/X86/WidenArith.ll
index 7470416ba7e6..cb9bf03b64c2 100644
--- a/test/CodeGen/X86/WidenArith.ll
+++ b/test/CodeGen/X86/WidenArith.ll
@@ -4,7 +4,7 @@
define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm2
; X86-NEXT: vmulps %ymm0, %ymm1, %ymm1
; X86-NEXT: vsubps %ymm2, %ymm1, %ymm3
@@ -15,7 +15,7 @@ define <8 x i32> @test(<8 x float> %a, <8 x float> %b) {
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm2
; X64-NEXT: vmulps %ymm0, %ymm1, %ymm1
; X64-NEXT: vsubps %ymm2, %ymm1, %ymm3
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index 742041a974b3..47f95fde57ba 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin9 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC
@src = external global [131072 x i32]
@dst = external global [131072 x i32]
diff --git a/test/CodeGen/X86/absolute-bit-mask.ll b/test/CodeGen/X86/absolute-bit-mask.ll
index 6e119494ac38..818952dc4538 100644
--- a/test/CodeGen/X86/absolute-bit-mask.ll
+++ b/test/CodeGen/X86/absolute-bit-mask.ll
@@ -43,7 +43,7 @@ f:
define void @foo64(i64* %ptr) {
%load = load i64, i64* %ptr
; CHECK: movabsq $bit_mask64, %rax
- ; CHECK: testq (%rdi), %rax
+ ; CHECK: testq %rax, (%rdi)
%and = and i64 %load, ptrtoint (i8* @bit_mask64 to i64)
%icmp = icmp eq i64 %and, 0
br i1 %icmp, label %t, label %f
diff --git a/test/CodeGen/X86/add-ext.ll b/test/CodeGen/X86/add-ext.ll
index 7a157ecd3fe6..16646fa71ca2 100644
--- a/test/CodeGen/X86/add-ext.ll
+++ b/test/CodeGen/X86/add-ext.ll
@@ -8,7 +8,7 @@
define i64 @add_nsw_consts(i32 %i) {
; CHECK-LABEL: add_nsw_consts:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: addq $12, %rax
; CHECK-NEXT: retq
@@ -24,7 +24,7 @@ define i64 @add_nsw_consts(i32 %i) {
define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_add:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
; CHECK-NEXT: retq
@@ -40,7 +40,7 @@ define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_lsh_add:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax
; CHECK-NEXT: retq
@@ -57,7 +57,7 @@ define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) {
define i64 @add_nsw_sext(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addl $5, %edi
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: retq
@@ -71,7 +71,7 @@ define i64 @add_nsw_sext(i32 %i, i64 %x) {
define i8* @gep8(i32 %i, i8* %x) {
; CHECK-LABEL: gep8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
; CHECK-NEXT: retq
@@ -84,7 +84,7 @@ define i8* @gep8(i32 %i, i8* %x) {
define i16* @gep16(i32 %i, i16* %x) {
; CHECK-LABEL: gep16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: leaq -10(%rsi,%rax,2), %rax
; CHECK-NEXT: retq
@@ -97,7 +97,7 @@ define i16* @gep16(i32 %i, i16* %x) {
define i32* @gep32(i32 %i, i32* %x) {
; CHECK-LABEL: gep32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: leaq 20(%rsi,%rax,4), %rax
; CHECK-NEXT: retq
@@ -110,7 +110,7 @@ define i32* @gep32(i32 %i, i32* %x) {
define i64* @gep64(i32 %i, i64* %x) {
; CHECK-LABEL: gep64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax
; CHECK-NEXT: retq
@@ -125,7 +125,7 @@ define i64* @gep64(i32 %i, i64* %x) {
define i128* @gep128(i32 %i, i128* %x) {
; CHECK-LABEL: gep128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %edi, %rax
; CHECK-NEXT: shlq $4, %rax
; CHECK-NEXT: leaq 80(%rsi,%rax), %rax
@@ -143,7 +143,7 @@ define i128* @gep128(i32 %i, i128* %x) {
define void @PR20134(i32* %a, i32 %i) {
; CHECK-LABEL: PR20134:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movslq %esi, %rax
; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
@@ -169,7 +169,7 @@ define void @PR20134(i32* %a, i32 %i) {
; The same as @PR20134 but sign extension is replaced with zero extension
define void @PR20134_zext(i32* %a, i32 %i) {
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index ad82b8cfb775..1149ae575522 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -8,7 +8,7 @@
define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %edx
@@ -26,7 +26,7 @@ define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
define i32 @test2(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
diff --git a/test/CodeGen/X86/add-sub-nsw-nuw.ll b/test/CodeGen/X86/add-sub-nsw-nuw.ll
index d02736de55d3..703860de944d 100644
--- a/test/CodeGen/X86/add-sub-nsw-nuw.ll
+++ b/test/CodeGen/X86/add-sub-nsw-nuw.ll
@@ -7,10 +7,10 @@
define i8 @PR30841(i64 %argc) {
; CHECK-LABEL: PR30841:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: negl %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retl
entry:
%or = or i64 %argc, -4294967296
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index df1bc9b6ee7e..3511bae6a617 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -1,31 +1,111 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=generic -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,X64-LINUX
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s --check-prefixes=X64,X64-WIN32
+
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
; The immediate can be encoded in a smaller way if the
; instruction is a sub instead of an add.
-
define i32 @test1(i32 inreg %a) nounwind {
+; X32-LABEL: test1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: subl $-128, %eax
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test1:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: subl $-128, %edi
+; X64-LINUX-NEXT: movl %edi, %eax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test1:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: subl $-128, %ecx
+; X64-WIN32-NEXT: movl %ecx, %eax
+; X64-WIN32-NEXT: retq
+entry:
%b = add i32 %a, 128
ret i32 %b
-; X32: subl $-128, %eax
-; X64: subl $-128,
}
define i64 @test2(i64 inreg %a) nounwind {
+; X32-LABEL: test2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: addl $-2147483648, %eax # imm = 0x80000000
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test2:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: subq $-2147483648, %rdi # imm = 0x80000000
+; X64-LINUX-NEXT: movq %rdi, %rax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test2:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: subq $-2147483648, %rcx # imm = 0x80000000
+; X64-WIN32-NEXT: movq %rcx, %rax
+; X64-WIN32-NEXT: retq
+entry:
%b = add i64 %a, 2147483648
ret i64 %b
-; X32: addl $-2147483648, %eax
-; X64: subq $-2147483648,
}
define i64 @test3(i64 inreg %a) nounwind {
+; X32-LABEL: test3:
+; X32: # %bb.0: # %entry
+; X32-NEXT: addl $128, %eax
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test3:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: subq $-128, %rdi
+; X64-LINUX-NEXT: movq %rdi, %rax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test3:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: subq $-128, %rcx
+; X64-WIN32-NEXT: movq %rcx, %rax
+; X64-WIN32-NEXT: retq
+entry:
%b = add i64 %a, 128
ret i64 %b
-
-; X32: addl $128, %eax
-; X64: subq $-128,
}
define i1 @test4(i32 %v1, i32 %v2, i32* %X) nounwind {
+; X32-LABEL: test4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: jo .LBB3_2
+; X32-NEXT: # %bb.1: # %normal
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: .LBB3_2: # %overflow
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test4:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: addl %esi, %edi
+; X64-LINUX-NEXT: jo .LBB3_2
+; X64-LINUX-NEXT: # %bb.1: # %normal
+; X64-LINUX-NEXT: movl $0, (%rdx)
+; X64-LINUX-NEXT: .LBB3_2: # %overflow
+; X64-LINUX-NEXT: xorl %eax, %eax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test4:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: addl %edx, %ecx
+; X64-WIN32-NEXT: jo .LBB3_2
+; X64-WIN32-NEXT: # %bb.1: # %normal
+; X64-WIN32-NEXT: movl $0, (%r8)
+; X64-WIN32-NEXT: .LBB3_2: # %overflow
+; X64-WIN32-NEXT: xorl %eax, %eax
+; X64-WIN32-NEXT: retq
entry:
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
%sum = extractvalue {i32, i1} %t, 0
@@ -38,17 +118,40 @@ normal:
overflow:
ret i1 false
-
-; X32-LABEL: test4:
-; X32: addl
-; X32-NEXT: jo
-
-; X64-LABEL: test4:
-; X64: addl %e[[A1:si|dx]], %e[[A0:di|cx]]
-; X64-NEXT: jo
}
define i1 @test5(i32 %v1, i32 %v2, i32* %X) nounwind {
+; X32-LABEL: test5:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: jb .LBB4_2
+; X32-NEXT: # %bb.1: # %normal
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: .LBB4_2: # %carry
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test5:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: addl %esi, %edi
+; X64-LINUX-NEXT: jb .LBB4_2
+; X64-LINUX-NEXT: # %bb.1: # %normal
+; X64-LINUX-NEXT: movl $0, (%rdx)
+; X64-LINUX-NEXT: .LBB4_2: # %carry
+; X64-LINUX-NEXT: xorl %eax, %eax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test5:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: addl %edx, %ecx
+; X64-WIN32-NEXT: jb .LBB4_2
+; X64-WIN32-NEXT: # %bb.1: # %normal
+; X64-WIN32-NEXT: movl $0, (%r8)
+; X64-WIN32-NEXT: .LBB4_2: # %carry
+; X64-WIN32-NEXT: xorl %eax, %eax
+; X64-WIN32-NEXT: retq
entry:
%t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
%sum = extractvalue {i32, i1} %t, 0
@@ -61,126 +164,221 @@ normal:
carry:
ret i1 false
-
-; X32-LABEL: test5:
-; X32: addl
-; X32-NEXT: jb
-
-; X64-LABEL: test5:
-; X64: addl %e[[A1]], %e[[A0]]
-; X64-NEXT: jb
}
-declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32)
-
-
define i64 @test6(i64 %A, i32 %B) nounwind {
- %tmp12 = zext i32 %B to i64 ; <i64> [#uses=1]
- %tmp3 = shl i64 %tmp12, 32 ; <i64> [#uses=1]
- %tmp5 = add i64 %tmp3, %A ; <i64> [#uses=1]
- ret i64 %tmp5
-
; X32-LABEL: test6:
-; X32: movl 4(%esp), %eax
-; X32-NEXT: movl 12(%esp), %edx
-; X32-NEXT: addl 8(%esp), %edx
-; X32-NEXT: ret
-
-; X64-LABEL: test6:
-; X64: shlq $32, %r[[A1]]
-; X64: leaq (%r[[A1]],%r[[A0]]), %rax
-; X64: ret
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test6:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-LINUX-NEXT: shlq $32, %rsi
+; X64-LINUX-NEXT: leaq (%rsi,%rdi), %rax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test6:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: # kill: def %edx killed %edx def %rdx
+; X64-WIN32-NEXT: shlq $32, %rdx
+; X64-WIN32-NEXT: leaq (%rdx,%rcx), %rax
+; X64-WIN32-NEXT: retq
+entry:
+ %tmp12 = zext i32 %B to i64
+ %tmp3 = shl i64 %tmp12, 32
+ %tmp5 = add i64 %tmp3, %A
+ ret i64 %tmp5
}
define {i32, i1} @test7(i32 %v1, i32 %v2) nounwind {
- %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
- ret {i32, i1} %t
+; X32-LABEL: test7:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: setb %dl
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test7:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: addl %esi, %edi
+; X64-LINUX-NEXT: setb %dl
+; X64-LINUX-NEXT: movl %edi, %eax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test7:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: addl %edx, %ecx
+; X64-WIN32-NEXT: setb %dl
+; X64-WIN32-NEXT: movl %ecx, %eax
+; X64-WIN32-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ ret {i32, i1} %t
}
-; X64-LABEL: test7:
-; X64: addl %e[[A1]], %e
-; X64-NEXT: setb %dl
-; X64: ret
-
; PR5443
define {i64, i1} @test8(i64 %left, i64 %right) nounwind {
+; X32-LABEL: test8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: setb %cl
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test8:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: addq %rsi, %rdi
+; X64-LINUX-NEXT: setb %dl
+; X64-LINUX-NEXT: movq %rdi, %rax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test8:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: addq %rdx, %rcx
+; X64-WIN32-NEXT: setb %dl
+; X64-WIN32-NEXT: movq %rcx, %rax
+; X64-WIN32-NEXT: retq
entry:
- %extleft = zext i64 %left to i65
- %extright = zext i64 %right to i65
- %sum = add i65 %extleft, %extright
- %res.0 = trunc i65 %sum to i64
- %overflow = and i65 %sum, -18446744073709551616
- %res.1 = icmp ne i65 %overflow, 0
- %final0 = insertvalue {i64, i1} undef, i64 %res.0, 0
- %final1 = insertvalue {i64, i1} %final0, i1 %res.1, 1
- ret {i64, i1} %final1
+ %extleft = zext i64 %left to i65
+ %extright = zext i64 %right to i65
+ %sum = add i65 %extleft, %extright
+ %res.0 = trunc i65 %sum to i64
+ %overflow = and i65 %sum, -18446744073709551616
+ %res.1 = icmp ne i65 %overflow, 0
+ %final0 = insertvalue {i64, i1} undef, i64 %res.0, 0
+ %final1 = insertvalue {i64, i1} %final0, i1 %res.1, 1
+ ret {i64, i1} %final1
}
-; X64-LABEL: test8:
-; X64: addq
-; X64-NEXT: setb
-; X64: ret
-
define i32 @test9(i32 %x, i32 %y) nounwind readnone {
+; X32-LABEL: test9:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: cmpl $10, {{[0-9]+}}(%esp)
+; X32-NEXT: sete %cl
+; X32-NEXT: subl %ecx, %eax
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test9:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: xorl %eax, %eax
+; X64-LINUX-NEXT: cmpl $10, %edi
+; X64-LINUX-NEXT: sete %al
+; X64-LINUX-NEXT: subl %eax, %esi
+; X64-LINUX-NEXT: movl %esi, %eax
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test9:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: xorl %eax, %eax
+; X64-WIN32-NEXT: cmpl $10, %ecx
+; X64-WIN32-NEXT: sete %al
+; X64-WIN32-NEXT: subl %eax, %edx
+; X64-WIN32-NEXT: movl %edx, %eax
+; X64-WIN32-NEXT: retq
+entry:
%cmp = icmp eq i32 %x, 10
%sub = sext i1 %cmp to i32
%cond = add i32 %sub, %y
ret i32 %cond
-; X64-LABEL: test9:
-; X64: cmpl $10
-; X64: sete
-; X64: subl
-; X64: ret
}
define i1 @test10(i32 %x) nounwind {
+; X32-LABEL: test10:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: incl %eax
+; X32-NEXT: seto %al
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test10:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: incl %edi
+; X64-LINUX-NEXT: seto %al
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test10:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: incl %ecx
+; X64-WIN32-NEXT: seto %al
+; X64-WIN32-NEXT: retq
entry:
%t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %x, i32 1)
%obit = extractvalue {i32, i1} %t, 1
ret i1 %obit
-
-; X32-LABEL: test10:
-; X32: incl
-; X32-NEXT: seto
-
-; X64-LABEL: test10:
-; X64: incl
-; X64-NEXT: seto
}
define void @test11(i32* inreg %a) nounwind {
+; X32-LABEL: test11:
+; X32: # %bb.0: # %entry
+; X32-NEXT: subl $-128, (%eax)
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test11:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: subl $-128, (%rdi)
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test11:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: subl $-128, (%rcx)
+; X64-WIN32-NEXT: retq
+entry:
%aa = load i32, i32* %a
%b = add i32 %aa, 128
store i32 %b, i32* %a
ret void
-; X32-LABEL: test11:
-; X32: subl $-128, (%
-; X64-LABEL: test11:
-; X64: subl $-128, (%
}
define void @test12(i64* inreg %a) nounwind {
+; X32-LABEL: test12:
+; X32: # %bb.0: # %entry
+; X32-NEXT: addl $-2147483648, (%eax) # imm = 0x80000000
+; X32-NEXT: adcl $0, 4(%eax)
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test12:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: subq $-2147483648, (%rdi) # imm = 0x80000000
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test12:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: subq $-2147483648, (%rcx) # imm = 0x80000000
+; X64-WIN32-NEXT: retq
+entry:
%aa = load i64, i64* %a
%b = add i64 %aa, 2147483648
store i64 %b, i64* %a
ret void
-; X32-LABEL: test12:
-; X32: addl (%
-; X32-NEXT: adcl $0,
-; X64-LABEL: test12:
-; X64: subq $-2147483648, (%
}
define void @test13(i64* inreg %a) nounwind {
+; X32-LABEL: test13:
+; X32: # %bb.0: # %entry
+; X32-NEXT: addl $128, (%eax)
+; X32-NEXT: adcl $0, 4(%eax)
+; X32-NEXT: retl
+;
+; X64-LINUX-LABEL: test13:
+; X64-LINUX: # %bb.0: # %entry
+; X64-LINUX-NEXT: subq $-128, (%rdi)
+; X64-LINUX-NEXT: retq
+;
+; X64-WIN32-LABEL: test13:
+; X64-WIN32: # %bb.0: # %entry
+; X64-WIN32-NEXT: subq $-128, (%rcx)
+; X64-WIN32-NEXT: retq
+entry:
%aa = load i64, i64* %a
%b = add i64 %aa, 128
store i64 %b, i64* %a
ret void
-
-; X32-LABEL: test13:
-; X32: addl (%
-; X32-NEXT: adcl $0,
-; X64-LABEL: test13:
-; X64: subq $-128, (%
}
diff --git a/test/CodeGen/X86/add_shl_constant.ll b/test/CodeGen/X86/add_shl_constant.ll
index 33074e4780e6..b783a51c2eef 100644
--- a/test/CodeGen/X86/add_shl_constant.ll
+++ b/test/CodeGen/X86/add_shl_constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck %s
; CHECK-LABEL: add_shl_add_constant_1_i32
; CHECK: leal 984(%rsi,%rdi,8), %eax
diff --git a/test/CodeGen/X86/addcarry.ll b/test/CodeGen/X86/addcarry.ll
index cffcfd8e8a42..2d76c62bbd01 100644
--- a/test/CodeGen/X86/addcarry.ll
+++ b/test/CodeGen/X86/addcarry.ll
@@ -3,7 +3,7 @@
define void @a(i64* nocapture %s, i64* nocapture %t, i64 %a, i64 %b, i64 %c) nounwind {
; CHECK-LABEL: a:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq %rcx, %rdx
; CHECK-NEXT: adcq $0, %r8
; CHECK-NEXT: movq %r8, (%rdi)
@@ -26,7 +26,7 @@ entry:
define void @b(i32* nocapture %r, i64 %a, i64 %b, i32 %c) nounwind {
; CHECK-LABEL: b:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq %rdx, %rsi
; CHECK-NEXT: adcl $0, %ecx
; CHECK-NEXT: movl %ecx, (%rdi)
@@ -45,7 +45,7 @@ entry:
define void @c(i16* nocapture %r, i64 %a, i64 %b, i16 %c) nounwind {
; CHECK-LABEL: c:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq %rdx, %rsi
; CHECK-NEXT: adcw $0, %cx
; CHECK-NEXT: movw %cx, (%rdi)
@@ -64,7 +64,7 @@ entry:
define void @d(i8* nocapture %r, i64 %a, i64 %b, i8 %c) nounwind {
; CHECK-LABEL: d:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq %rdx, %rsi
; CHECK-NEXT: adcb $0, %cl
; CHECK-NEXT: movb %cl, (%rdi)
@@ -83,8 +83,8 @@ entry:
define i8 @e(i32* nocapture %a, i32 %b) nounwind {
; CHECK-LABEL: e:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
; CHECK-NEXT: movl (%rdi), %ecx
; CHECK-NEXT: leal (%rsi,%rcx), %edx
; CHECK-NEXT: addl %esi, %edx
@@ -109,7 +109,7 @@ define i8 @e(i32* nocapture %a, i32 %b) nounwind {
define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
; CHECK-LABEL: pr31719:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq (%rsi), %rdx
; CHECK-NEXT: adcq 8(%rsi), %rcx
; CHECK-NEXT: adcq 16(%rsi), %r8
@@ -168,11 +168,10 @@ entry:
define void @muladd(%accumulator* nocapture %this, i64 %arg.a, i64 %arg.b) {
; CHECK-LABEL: muladd:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: mulq %rsi
-; CHECK-NEXT: addq (%rdi), %rax
-; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: addq %rax, (%rdi)
; CHECK-NEXT: adcq 8(%rdi), %rdx
; CHECK-NEXT: movq %rdx, 8(%rdi)
; CHECK-NEXT: adcl $0, 16(%rdi)
@@ -206,7 +205,7 @@ entry:
define i64 @shiftadd(i64 %a, i64 %b, i64 %c, i64 %d) {
; CHECK-LABEL: shiftadd:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq %rsi, %rdi
; CHECK-NEXT: adcq %rcx, %rdx
; CHECK-NEXT: movq %rdx, %rax
@@ -226,7 +225,7 @@ entry:
define %S @readd(%S* nocapture readonly %this, %S %arg.b) {
; CHECK-LABEL: readd:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addq (%rsi), %rdx
; CHECK-NEXT: movq 8(%rsi), %r10
; CHECK-NEXT: adcq $0, %r10
diff --git a/test/CodeGen/X86/addr-of-ret-addr.ll b/test/CodeGen/X86/addr-of-ret-addr.ll
index 7423e7161c8a..67ebb7f9d78d 100644
--- a/test/CodeGen/X86/addr-of-ret-addr.ll
+++ b/test/CodeGen/X86/addr-of-ret-addr.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -disable-fp-elim -march=x86 | FileCheck %s --check-prefix=CHECK-X86
-; RUN: llc < %s -disable-fp-elim -march=x86-64 | FileCheck %s --check-prefix=CHECK-X64
+; RUN: llc < %s -disable-fp-elim -mtriple=i686-- | FileCheck %s --check-prefix=CHECK-X86
+; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-- | FileCheck %s --check-prefix=CHECK-X64
define i8* @f() nounwind readnone optsize {
entry:
diff --git a/test/CodeGen/X86/adx-intrinsics.ll b/test/CodeGen/X86/adx-intrinsics.ll
index 819a5df14e63..bc8e2be4de04 100644
--- a/test/CodeGen/X86/adx-intrinsics.ll
+++ b/test/CodeGen/X86/adx-intrinsics.ll
@@ -1,14 +1,25 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding| FileCheck %s --check-prefix=NOADX --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding| FileCheck %s --check-prefix=ADX --check-prefix=CHECK
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 --show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=NOADX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=broadwell --show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=ADX
declare i8 @llvm.x86.addcarryx.u32(i8, i32, i32, i8*)
define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarryx_u32
-; CHECK: addb
-; ADX: adcxl
-; CHECK: setb
-; CHECK: retq
+; NOADX-LABEL: test_addcarryx_u32:
+; NOADX: ## %bb.0:
+; NOADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; NOADX-NEXT: adcl %edx, %esi ## encoding: [0x11,0xd6]
+; NOADX-NEXT: movl %esi, (%rcx) ## encoding: [0x89,0x31]
+; NOADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; NOADX-NEXT: retq ## encoding: [0xc3]
+;
+; ADX-LABEL: test_addcarryx_u32:
+; ADX: ## %bb.0:
+; ADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; ADX-NEXT: adcxl %edx, %esi ## encoding: [0x66,0x0f,0x38,0xf6,0xf2]
+; ADX-NEXT: movl %esi, (%rcx) ## encoding: [0x89,0x31]
+; ADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; ADX-NEXT: retq ## encoding: [0xc3]
%ret = tail call i8 @llvm.x86.addcarryx.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
ret i8 %ret;
}
@@ -16,11 +27,21 @@ define i8 @test_addcarryx_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarryx_u64
-; CHECK: addb
-; ADX: adcxq
-; CHECK: setb
-; CHECK: retq
+; NOADX-LABEL: test_addcarryx_u64:
+; NOADX: ## %bb.0:
+; NOADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; NOADX-NEXT: adcq %rdx, %rsi ## encoding: [0x48,0x11,0xd6]
+; NOADX-NEXT: movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
+; NOADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; NOADX-NEXT: retq ## encoding: [0xc3]
+;
+; ADX-LABEL: test_addcarryx_u64:
+; ADX: ## %bb.0:
+; ADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; ADX-NEXT: adcxq %rdx, %rsi ## encoding: [0x66,0x48,0x0f,0x38,0xf6,0xf2]
+; ADX-NEXT: movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
+; ADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; ADX-NEXT: retq ## encoding: [0xc3]
%ret = tail call i8 @llvm.x86.addcarryx.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
ret i8 %ret;
}
@@ -28,12 +49,21 @@ define i8 @test_addcarryx_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
declare i8 @llvm.x86.addcarry.u32(i8, i32, i32, i8*)
define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarry_u32
-; CHECK: addb
-; ADX: adcxl
-; NOADX: adcl
-; CHECK: setb
-; CHECK: retq
+; NOADX-LABEL: test_addcarry_u32:
+; NOADX: ## %bb.0:
+; NOADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; NOADX-NEXT: adcl %edx, %esi ## encoding: [0x11,0xd6]
+; NOADX-NEXT: movl %esi, (%rcx) ## encoding: [0x89,0x31]
+; NOADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; NOADX-NEXT: retq ## encoding: [0xc3]
+;
+; ADX-LABEL: test_addcarry_u32:
+; ADX: ## %bb.0:
+; ADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; ADX-NEXT: adcxl %edx, %esi ## encoding: [0x66,0x0f,0x38,0xf6,0xf2]
+; ADX-NEXT: movl %esi, (%rcx) ## encoding: [0x89,0x31]
+; ADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; ADX-NEXT: retq ## encoding: [0xc3]
%ret = tail call i8 @llvm.x86.addcarry.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
ret i8 %ret;
}
@@ -41,12 +71,21 @@ define i8 @test_addcarry_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
declare i8 @llvm.x86.addcarry.u64(i8, i64, i64, i8*)
define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
-; CHECK-LABEL: test_addcarry_u64
-; CHECK: addb
-; ADX: adcxq
-; NOADX: adcq
-; CHECK: setb
-; CHECK: retq
+; NOADX-LABEL: test_addcarry_u64:
+; NOADX: ## %bb.0:
+; NOADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; NOADX-NEXT: adcq %rdx, %rsi ## encoding: [0x48,0x11,0xd6]
+; NOADX-NEXT: movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
+; NOADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; NOADX-NEXT: retq ## encoding: [0xc3]
+;
+; ADX-LABEL: test_addcarry_u64:
+; ADX: ## %bb.0:
+; ADX-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; ADX-NEXT: adcxq %rdx, %rsi ## encoding: [0x66,0x48,0x0f,0x38,0xf6,0xf2]
+; ADX-NEXT: movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
+; ADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; ADX-NEXT: retq ## encoding: [0xc3]
%ret = tail call i8 @llvm.x86.addcarry.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
ret i8 %ret;
}
@@ -54,11 +93,13 @@ define i8 @test_addcarry_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
-; CHECK-LABEL: test_subborrow_u32
-; CHECK: addb
-; CHECK: sbbl
-; CHECK: setb
-; CHECK: retq
+; CHECK-LABEL: test_subborrow_u32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; CHECK-NEXT: sbbl %edx, %esi ## encoding: [0x19,0xd6]
+; CHECK-NEXT: movl %esi, (%rcx) ## encoding: [0x89,0x31]
+; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%ret = tail call i8 @llvm.x86.subborrow.u32(i8 %c, i32 %a, i32 %b, i8* %ptr)
ret i8 %ret;
}
@@ -66,22 +107,40 @@ define i8 @test_subborrow_u32(i8 %c, i32 %a, i32 %b, i8* %ptr) {
declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
define i8 @test_subborrow_u64(i8 %c, i64 %a, i64 %b, i8* %ptr) {
-; CHECK-LABEL: test_subborrow_u64
-; CHECK: addb
-; CHECK: sbbq
-; CHECK: setb
-; CHECK: retq
+; CHECK-LABEL: test_subborrow_u64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: addb $-1, %dil ## encoding: [0x40,0x80,0xc7,0xff]
+; CHECK-NEXT: sbbq %rdx, %rsi ## encoding: [0x48,0x19,0xd6]
+; CHECK-NEXT: movq %rsi, (%rcx) ## encoding: [0x48,0x89,0x31]
+; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
%ret = tail call i8 @llvm.x86.subborrow.u64(i8 %c, i64 %a, i64 %b, i8* %ptr)
ret i8 %ret;
}
; Try a version with loads. Previously we crashed on this.
define i32 @load_crash(i64* nocapture readonly %a, i64* nocapture readonly %b, i64* %res) {
-; CHECK-LABEL: load_crash
-; CHECK: addb
-; ADX: adcxq
-; CHECK: setb
-; CHECK: retq
+; NOADX-LABEL: load_crash:
+; NOADX: ## %bb.0:
+; NOADX-NEXT: movq (%rdi), %rax ## encoding: [0x48,0x8b,0x07]
+; NOADX-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9]
+; NOADX-NEXT: addb $-1, %cl ## encoding: [0x80,0xc1,0xff]
+; NOADX-NEXT: adcq (%rsi), %rax ## encoding: [0x48,0x13,0x06]
+; NOADX-NEXT: movq %rax, (%rdx) ## encoding: [0x48,0x89,0x02]
+; NOADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; NOADX-NEXT: movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0]
+; NOADX-NEXT: retq ## encoding: [0xc3]
+;
+; ADX-LABEL: load_crash:
+; ADX: ## %bb.0:
+; ADX-NEXT: movq (%rdi), %rax ## encoding: [0x48,0x8b,0x07]
+; ADX-NEXT: xorl %ecx, %ecx ## encoding: [0x31,0xc9]
+; ADX-NEXT: addb $-1, %cl ## encoding: [0x80,0xc1,0xff]
+; ADX-NEXT: adcxq (%rsi), %rax ## encoding: [0x66,0x48,0x0f,0x38,0xf6,0x06]
+; ADX-NEXT: movq %rax, (%rdx) ## encoding: [0x48,0x89,0x02]
+; ADX-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
+; ADX-NEXT: movzbl %al, %eax ## encoding: [0x0f,0xb6,0xc0]
+; ADX-NEXT: retq ## encoding: [0xc3]
%1 = load i64, i64* %a, align 8
%2 = load i64, i64* %b, align 8
%3 = bitcast i64* %res to i8*
@@ -92,12 +151,14 @@ define i32 @load_crash(i64* nocapture readonly %a, i64* nocapture readonly %b, i
; Try a really simple all zero input case, which also used to crash
define void @allzeros() {
-; CHECK-LABEL: allzeros
-; CHECK: xorl
-; CHECK: addb
-; CHECK: sbbq
-; CHECK: andl
-; CHECK: retq
+; CHECK-LABEL: allzeros:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT: addb $-1, %al ## encoding: [0x04,0xff]
+; CHECK-NEXT: sbbq %rax, %rax ## encoding: [0x48,0x19,0xc0]
+; CHECK-NEXT: andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT: movq %rax, 0 ## encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00]
+; CHECK-NEXT: retq ## encoding: [0xc3]
entry:
%0 = tail call i8 @llvm.x86.addcarryx.u64(i8 0, i64 0, i64 0, i8* null)
ret void
diff --git a/test/CodeGen/X86/adx-schedule.ll b/test/CodeGen/X86/adx-schedule.ll
new file mode 100644
index 000000000000..13166f769b8e
--- /dev/null
+++ b/test/CodeGen/X86/adx-schedule.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+adx | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=SKL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define void @test_adcx(i32 %a0, i32* %a1, i64 %a2, i64* %a3) optsize {
+; GENERIC-LABEL: test_adcx:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: adcxl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: adcxq %rdx, %rdx # sched: [1:0.33]
+; GENERIC-NEXT: adcxl (%rsi), %edi # sched: [5:0.50]
+; GENERIC-NEXT: adcxq (%rcx), %rdx # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BROADWELL-LABEL: test_adcx:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: adcxl %edi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: adcxq %rdx, %rdx # sched: [1:0.50]
+; BROADWELL-NEXT: adcxl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: adcxq (%rcx), %rdx # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_adcx:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: adcxl %edi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: adcxq %rdx, %rdx # sched: [1:0.50]
+; SKYLAKE-NEXT: adcxl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: adcxq (%rcx), %rdx # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_adcx:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: adcxl %edi, %edi # sched: [1:0.25]
+; KNL-NEXT: adcxq %rdx, %rdx # sched: [1:0.25]
+; KNL-NEXT: adcxl (%rsi), %edi # sched: [6:0.50]
+; KNL-NEXT: adcxq (%rcx), %rdx # sched: [6:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_adcx:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: adcxl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: adcxq %rdx, %rdx # sched: [1:0.25]
+; ZNVER1-NEXT: adcxl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: adcxq (%rcx), %rdx # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "adcx $0, $0 \0A\09 adcx $2, $2 \0A\09 adcx $1, $0 \0A\09 adcx $3, $2", "r,*m,r,*m"(i32 %a0, i32* %a1, i64 %a2, i64* %a3) nounwind
+ ret void
+}
+define void @test_adox(i32 %a0, i32* %a1, i64 %a2, i64* %a3) optsize {
+; GENERIC-LABEL: test_adox:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: adoxl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: adoxq %rdx, %rdx # sched: [1:0.33]
+; GENERIC-NEXT: adoxl (%rsi), %edi # sched: [5:0.50]
+; GENERIC-NEXT: adoxq (%rcx), %rdx # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BROADWELL-LABEL: test_adox:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: adoxl %edi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: adoxq %rdx, %rdx # sched: [1:0.50]
+; BROADWELL-NEXT: adoxl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: adoxq (%rcx), %rdx # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_adox:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: adoxl %edi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: adoxq %rdx, %rdx # sched: [1:0.50]
+; SKYLAKE-NEXT: adoxl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: adoxq (%rcx), %rdx # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_adox:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: adoxl %edi, %edi # sched: [1:0.25]
+; KNL-NEXT: adoxq %rdx, %rdx # sched: [1:0.25]
+; KNL-NEXT: adoxl (%rsi), %edi # sched: [6:0.50]
+; KNL-NEXT: adoxq (%rcx), %rdx # sched: [6:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_adox:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: adoxl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: adoxq %rdx, %rdx # sched: [1:0.25]
+; ZNVER1-NEXT: adoxl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: adoxq (%rcx), %rdx # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "adox $0, $0 \0A\09 adox $2, $2 \0A\09 adox $1, $0 \0A\09 adox $3, $2", "r,*m,r,*m"(i32 %a0, i32* %a1, i64 %a2, i64* %a3) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/aes-schedule.ll b/test/CodeGen/X86/aes-schedule.ll
new file mode 100644
index 000000000000..be3b049b9d54
--- /dev/null
+++ b/test/CodeGen/X86/aes-schedule.ll
@@ -0,0 +1,359 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+aes | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define <2 x i64> @test_aesdec(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_aesdec:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: aesdec %xmm1, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: aesdec (%rdi), %xmm0 # sched: [13:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_aesdec:
+; SLM: # %bb.0:
+; SLM-NEXT: aesdec %xmm1, %xmm0 # sched: [8:5.00]
+; SLM-NEXT: aesdec (%rdi), %xmm0 # sched: [8:5.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aesdec:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_aesdec:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aesdec:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_aesdec:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_aesdec:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aesdec:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vaesdec (%rdi), %xmm0, %xmm0 # sched: [11:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %2 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
+ %3 = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %2, <2 x i64> %1)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_aesdeclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_aesdeclast:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: aesdeclast %xmm1, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: aesdeclast (%rdi), %xmm0 # sched: [13:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_aesdeclast:
+; SLM: # %bb.0:
+; SLM-NEXT: aesdeclast %xmm1, %xmm0 # sched: [8:5.00]
+; SLM-NEXT: aesdeclast (%rdi), %xmm0 # sched: [8:5.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aesdeclast:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_aesdeclast:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aesdeclast:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_aesdeclast:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_aesdeclast:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aesdeclast:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vaesdeclast (%rdi), %xmm0, %xmm0 # sched: [11:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %2 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
+ %3 = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %2, <2 x i64> %1)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_aesenc(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_aesenc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: aesenc %xmm1, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: aesenc (%rdi), %xmm0 # sched: [13:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_aesenc:
+; SLM: # %bb.0:
+; SLM-NEXT: aesenc %xmm1, %xmm0 # sched: [8:5.00]
+; SLM-NEXT: aesenc (%rdi), %xmm0 # sched: [8:5.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aesenc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_aesenc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aesenc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_aesenc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_aesenc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aesenc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vaesenc (%rdi), %xmm0, %xmm0 # sched: [11:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %2 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
+ %3 = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %2, <2 x i64> %1)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_aesenclast(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_aesenclast:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: aesenclast %xmm1, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: aesenclast (%rdi), %xmm0 # sched: [13:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_aesenclast:
+; SLM: # %bb.0:
+; SLM-NEXT: aesenclast %xmm1, %xmm0 # sched: [8:5.00]
+; SLM-NEXT: aesenclast (%rdi), %xmm0 # sched: [8:5.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aesenclast:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_aesenclast:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aesenclast:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_aesenclast:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_aesenclast:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aesenclast:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vaesenclast (%rdi), %xmm0, %xmm0 # sched: [11:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %2 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
+ %3 = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %2, <2 x i64> %1)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_aesimc(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_aesimc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: aesimc %xmm0, %xmm1 # sched: [12:2.00]
+; GENERIC-NEXT: aesimc (%rdi), %xmm0 # sched: [18:2.00]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_aesimc:
+; SLM: # %bb.0:
+; SLM-NEXT: aesimc %xmm0, %xmm1 # sched: [8:5.00]
+; SLM-NEXT: aesimc (%rdi), %xmm0 # sched: [8:5.00]
+; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aesimc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vaesimc %xmm0, %xmm0 # sched: [12:2.00]
+; SANDY-NEXT: vaesimc (%rdi), %xmm1 # sched: [18:2.00]
+; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_aesimc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vaesimc %xmm0, %xmm0 # sched: [14:2.00]
+; HASWELL-NEXT: vaesimc (%rdi), %xmm1 # sched: [20:2.00]
+; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aesimc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaesimc (%rdi), %xmm1 # sched: [19:2.00]
+; BROADWELL-NEXT: vaesimc %xmm0, %xmm0 # sched: [14:2.00]
+; BROADWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_aesimc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaesimc %xmm0, %xmm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: vaesimc (%rdi), %xmm1 # sched: [14:2.00]
+; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_aesimc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vaesimc (%rdi), %xmm1 # sched: [7:1.00]
+; BTVER2-NEXT: vaesimc %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aesimc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vaesimc (%rdi), %xmm1 # sched: [11:0.50]
+; ZNVER1-NEXT: vaesimc %xmm0, %xmm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a1, align 16
+ %2 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0)
+ %3 = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %1)
+ %4 = or <2 x i64> %2, %3
+ ret <2 x i64> %4
+}
+declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>)
+
+define <2 x i64> @test_aeskeygenassist(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_aeskeygenassist:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: aeskeygenassist $7, %xmm0, %xmm1 # sched: [8:3.67]
+; GENERIC-NEXT: aeskeygenassist $7, (%rdi), %xmm0 # sched: [8:3.33]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_aeskeygenassist:
+; SLM: # %bb.0:
+; SLM-NEXT: aeskeygenassist $7, %xmm0, %xmm1 # sched: [8:5.00]
+; SLM-NEXT: aeskeygenassist $7, (%rdi), %xmm0 # sched: [8:5.00]
+; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aeskeygenassist:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [8:3.67]
+; SANDY-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [8:3.33]
+; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_aeskeygenassist:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [29:7.00]
+; HASWELL-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [34:7.00]
+; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aeskeygenassist:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [29:7.00]
+; BROADWELL-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [33:7.00]
+; BROADWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_aeskeygenassist:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [20:6.00]
+; SKYLAKE-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [25:6.00]
+; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_aeskeygenassist:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [7:1.00]
+; BTVER2-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aeskeygenassist:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vaeskeygenassist $7, (%rdi), %xmm1 # sched: [11:0.50]
+; ZNVER1-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a1, align 16
+ %2 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7)
+ %3 = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %1, i8 7)
+ %4 = or <2 x i64> %2, %3
+ ret <2 x i64> %4
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8)
diff --git a/test/CodeGen/X86/aes_intrinsics.ll b/test/CodeGen/X86/aes_intrinsics.ll
index fc3d55a05429..442feca3fc19 100644
--- a/test/CodeGen/X86/aes_intrinsics.ll
+++ b/test/CodeGen/X86/aes_intrinsics.ll
@@ -1,17 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,-avx -show-mc-encoding | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,+avx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+aes,-avx -show-mc-encoding | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+aes,+avx -show-mc-encoding | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+aes,-avx -show-mc-encoding | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+aes,+avx -show-mc-encoding | FileCheck %s --check-prefix=AVX
define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdec:
-; CHECK: ## BB#0:
-; CHECK-NEXT: aesdec %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xde,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; SSE-LABEL: test_x86_aesni_aesdec:
+; SSE: # %bb.0:
+; SSE-NEXT: aesdec %xmm1, %xmm0 # encoding: [0x66,0x0f,0x38,0xde,0xc1]
+; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_aesni_aesdec:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vaesdec %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xde,0xc1]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_aesni_aesdec:
+; AVX: # %bb.0:
+; AVX-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xde,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -19,15 +21,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdeclast:
-; CHECK: ## BB#0:
-; CHECK-NEXT: aesdeclast %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdf,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; SSE-LABEL: test_x86_aesni_aesdeclast:
+; SSE: # %bb.0:
+; SSE-NEXT: aesdeclast %xmm1, %xmm0 # encoding: [0x66,0x0f,0x38,0xdf,0xc1]
+; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_aesni_aesdeclast:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdf,0xc1]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_aesni_aesdeclast:
+; AVX: # %bb.0:
+; AVX-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xdf,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -35,15 +37,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind read
define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: aesenc %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdc,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; SSE-LABEL: test_x86_aesni_aesenc:
+; SSE: # %bb.0:
+; SSE-NEXT: aesenc %xmm1, %xmm0 # encoding: [0x66,0x0f,0x38,0xdc,0xc1]
+; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_aesni_aesenc:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vaesenc %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdc,0xc1]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_aesni_aesenc:
+; AVX: # %bb.0:
+; AVX-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xdc,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -51,15 +53,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenclast:
-; CHECK: ## BB#0:
-; CHECK-NEXT: aesenclast %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdd,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; SSE-LABEL: test_x86_aesni_aesenclast:
+; SSE: # %bb.0:
+; SSE-NEXT: aesenclast %xmm1, %xmm0 # encoding: [0x66,0x0f,0x38,0xdd,0xc1]
+; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_aesni_aesenclast:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdd,0xc1]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_aesni_aesenclast:
+; AVX: # %bb.0:
+; AVX-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xdd,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -67,15 +69,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind read
define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aesimc:
-; CHECK: ## BB#0:
-; CHECK-NEXT: aesimc %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdb,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; SSE-LABEL: test_x86_aesni_aesimc:
+; SSE: # %bb.0:
+; SSE-NEXT: aesimc %xmm0, %xmm0 # encoding: [0x66,0x0f,0x38,0xdb,0xc0]
+; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_aesni_aesimc:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vaesimc %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdb,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_aesni_aesimc:
+; AVX: # %bb.0:
+; AVX-NEXT: vaesimc %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xdb,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -83,15 +85,15 @@ declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aeskeygenassist:
-; CHECK: ## BB#0:
-; CHECK-NEXT: aeskeygenassist $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0xdf,0xc0,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; SSE-LABEL: test_x86_aesni_aeskeygenassist:
+; SSE: # %bb.0:
+; SSE-NEXT: aeskeygenassist $7, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x3a,0xdf,0xc0,0x07]
+; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_aesni_aeskeygenassist:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0xdf,0xc0,0x07]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_aesni_aeskeygenassist:
+; AVX: # %bb.0:
+; AVX-NEXT: vaeskeygenassist $7, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0xdf,0xc0,0x07]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
diff --git a/test/CodeGen/X86/aligned-comm.ll b/test/CodeGen/X86/aligned-comm.ll
index eab02cc1f9d5..1192b72c48d2 100644
--- a/test/CodeGen/X86/aligned-comm.ll
+++ b/test/CodeGen/X86/aligned-comm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep "array,16512,7"
; RUN: llc < %s -mtriple=i386-apple-darwin9 | grep "array,16512,7"
diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll
index d0160a5b84df..d64b3d7e29b0 100644
--- a/test/CodeGen/X86/all-ones-vector.ll
+++ b/test/CodeGen/X86/all-ones-vector.ll
@@ -12,22 +12,22 @@
define <16 x i8> @allones_v16i8() nounwind {
; X32-SSE-LABEL: allones_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: allones_v16i8:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: allones_v16i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: allones_v16i8:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: retq
ret <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -35,22 +35,22 @@ define <16 x i8> @allones_v16i8() nounwind {
define <8 x i16> @allones_v8i16() nounwind {
; X32-SSE-LABEL: allones_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: allones_v8i16:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: allones_v8i16:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: allones_v8i16:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: retq
ret <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -58,22 +58,22 @@ define <8 x i16> @allones_v8i16() nounwind {
define <4 x i32> @allones_v4i32() nounwind {
; X32-SSE-LABEL: allones_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: allones_v4i32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: allones_v4i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: allones_v4i32:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: retq
ret <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -81,22 +81,22 @@ define <4 x i32> @allones_v4i32() nounwind {
define <2 x i64> @allones_v2i64() nounwind {
; X32-SSE-LABEL: allones_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: allones_v2i64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: allones_v2i64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: allones_v2i64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: retq
ret <2 x i64> <i64 -1, i64 -1>
@@ -104,22 +104,22 @@ define <2 x i64> @allones_v2i64() nounwind {
define <2 x double> @allones_v2f64() nounwind {
; X32-SSE-LABEL: allones_v2f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: allones_v2f64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: allones_v2f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: allones_v2f64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: retq
ret <2 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff>
@@ -127,22 +127,22 @@ define <2 x double> @allones_v2f64() nounwind {
define <4 x float> @allones_v4f32() nounwind {
; X32-SSE-LABEL: allones_v4f32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: allones_v4f32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: allones_v4f32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: allones_v4f32:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; X64-AVX-NEXT: retq
ret <4 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
@@ -150,36 +150,36 @@ define <4 x float> @allones_v4f32() nounwind {
define <32 x i8> @allones_v32i8() nounwind {
; X32-SSE-LABEL: allones_v32i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v32i8:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v32i8:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v32i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v32i8:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v32i8:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <32 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -187,36 +187,36 @@ define <32 x i8> @allones_v32i8() nounwind {
define <16 x i16> @allones_v16i16() nounwind {
; X32-SSE-LABEL: allones_v16i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v16i16:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v16i16:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v16i16:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v16i16:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v16i16:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -224,36 +224,36 @@ define <16 x i16> @allones_v16i16() nounwind {
define <8 x i32> @allones_v8i32() nounwind {
; X32-SSE-LABEL: allones_v8i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v8i32:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v8i32:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v8i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v8i32:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v8i32:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
@@ -261,36 +261,36 @@ define <8 x i32> @allones_v8i32() nounwind {
define <4 x i64> @allones_v4i64() nounwind {
; X32-SSE-LABEL: allones_v4i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v4i64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v4i64:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v4i64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v4i64:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v4i64:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -298,36 +298,36 @@ define <4 x i64> @allones_v4i64() nounwind {
define <4 x double> @allones_v4f64() nounwind {
; X32-SSE-LABEL: allones_v4f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v4f64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v4f64:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v4f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v4f64:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v4f64:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <4 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff>
@@ -335,36 +335,36 @@ define <4 x double> @allones_v4f64() nounwind {
define <4 x double> @allones_v4f64_optsize() nounwind optsize {
; X32-SSE-LABEL: allones_v4f64_optsize:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v4f64_optsize:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v4f64_optsize:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v4f64_optsize:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v4f64_optsize:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v4f64_optsize:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <4 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff>
@@ -372,36 +372,36 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize {
define <8 x float> @allones_v8f32() nounwind {
; X32-SSE-LABEL: allones_v8f32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v8f32:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v8f32:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v8f32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v8f32:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v8f32:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <8 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
@@ -409,36 +409,36 @@ define <8 x float> @allones_v8f32() nounwind {
define <8 x float> @allones_v8f32_optsize() nounwind optsize {
; X32-SSE-LABEL: allones_v8f32_optsize:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v8f32_optsize:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: retl
;
; X32-AVX256-LABEL: allones_v8f32_optsize:
-; X32-AVX256: # BB#0:
+; X32-AVX256: # %bb.0:
; X32-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX256-NEXT: retl
;
; X64-SSE-LABEL: allones_v8f32_optsize:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v8f32_optsize:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: retq
;
; X64-AVX256-LABEL: allones_v8f32_optsize:
-; X64-AVX256: # BB#0:
+; X64-AVX256: # %bb.0:
; X64-AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX256-NEXT: retq
ret <8 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
@@ -446,7 +446,7 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize {
define <64 x i8> @allones_v64i8() nounwind {
; X32-SSE-LABEL: allones_v64i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -454,31 +454,31 @@ define <64 x i8> @allones_v64i8() nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v64i8:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: allones_v64i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-KNL-LABEL: allones_v64i8:
-; X32-KNL: # BB#0:
+; X32-KNL: # %bb.0:
; X32-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-KNL-NEXT: retl
;
; X32-SKX-LABEL: allones_v64i8:
-; X32-SKX: # BB#0:
+; X32-SKX: # %bb.0:
; X32-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X32-SKX-NEXT: retl
;
; X64-SSE-LABEL: allones_v64i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -486,26 +486,26 @@ define <64 x i8> @allones_v64i8() nounwind {
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v64i8:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: allones_v64i8:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-KNL-LABEL: allones_v64i8:
-; X64-KNL: # BB#0:
+; X64-KNL: # %bb.0:
; X64-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-KNL-NEXT: retq
;
; X64-SKX-LABEL: allones_v64i8:
-; X64-SKX: # BB#0:
+; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X64-SKX-NEXT: retq
ret <64 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -513,7 +513,7 @@ define <64 x i8> @allones_v64i8() nounwind {
define <32 x i16> @allones_v32i16() nounwind {
; X32-SSE-LABEL: allones_v32i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -521,31 +521,31 @@ define <32 x i16> @allones_v32i16() nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v32i16:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: allones_v32i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-KNL-LABEL: allones_v32i16:
-; X32-KNL: # BB#0:
+; X32-KNL: # %bb.0:
; X32-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-KNL-NEXT: retl
;
; X32-SKX-LABEL: allones_v32i16:
-; X32-SKX: # BB#0:
+; X32-SKX: # %bb.0:
; X32-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X32-SKX-NEXT: retl
;
; X64-SSE-LABEL: allones_v32i16:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -553,26 +553,26 @@ define <32 x i16> @allones_v32i16() nounwind {
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v32i16:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: allones_v32i16:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-KNL-LABEL: allones_v32i16:
-; X64-KNL: # BB#0:
+; X64-KNL: # %bb.0:
; X64-KNL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-KNL-NEXT: retq
;
; X64-SKX-LABEL: allones_v32i16:
-; X64-SKX: # BB#0:
+; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X64-SKX-NEXT: retq
ret <32 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -580,7 +580,7 @@ define <32 x i16> @allones_v32i16() nounwind {
define <16 x i32> @allones_v16i32() nounwind {
; X32-SSE-LABEL: allones_v16i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -588,25 +588,25 @@ define <16 x i32> @allones_v16i32() nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v16i32:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: allones_v16i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: allones_v16i32:
-; X32-AVX512: # BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-SSE-LABEL: allones_v16i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -614,20 +614,20 @@ define <16 x i32> @allones_v16i32() nounwind {
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v16i32:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: allones_v16i32:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: allones_v16i32:
-; X64-AVX512: # BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
ret <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
@@ -635,7 +635,7 @@ define <16 x i32> @allones_v16i32() nounwind {
define <8 x i64> @allones_v8i64() nounwind {
; X32-SSE-LABEL: allones_v8i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -643,25 +643,25 @@ define <8 x i64> @allones_v8i64() nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v8i64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: allones_v8i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: allones_v8i64:
-; X32-AVX512: # BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-SSE-LABEL: allones_v8i64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -669,20 +669,20 @@ define <8 x i64> @allones_v8i64() nounwind {
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v8i64:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: allones_v8i64:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: allones_v8i64:
-; X64-AVX512: # BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
ret <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
@@ -690,7 +690,7 @@ define <8 x i64> @allones_v8i64() nounwind {
define <8 x double> @allones_v8f64() nounwind {
; X32-SSE-LABEL: allones_v8f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -698,25 +698,25 @@ define <8 x double> @allones_v8f64() nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v8f64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: allones_v8f64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: allones_v8f64:
-; X32-AVX512: # BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-SSE-LABEL: allones_v8f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -724,20 +724,20 @@ define <8 x double> @allones_v8f64() nounwind {
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v8f64:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: allones_v8f64:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: allones_v8f64:
-; X64-AVX512: # BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
ret <8 x double> <double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff, double 0xffffffffffffffff>
@@ -745,7 +745,7 @@ define <8 x double> @allones_v8f64() nounwind {
define <16 x float> @allones_v16f32() nounwind {
; X32-SSE-LABEL: allones_v16f32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X32-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -753,25 +753,25 @@ define <16 x float> @allones_v16f32() nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX1-LABEL: allones_v16f32:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: allones_v16f32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: allones_v16f32:
-; X32-AVX512: # BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-SSE-LABEL: allones_v16f32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pcmpeqd %xmm0, %xmm0
; X64-SSE-NEXT: pcmpeqd %xmm1, %xmm1
; X64-SSE-NEXT: pcmpeqd %xmm2, %xmm2
@@ -779,20 +779,20 @@ define <16 x float> @allones_v16f32() nounwind {
; X64-SSE-NEXT: retq
;
; X64-AVX1-LABEL: allones_v16f32:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: allones_v16f32:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: allones_v16f32:
-; X64-AVX512: # BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
ret <16 x float> <float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000, float 0xffffffffe0000000>
diff --git a/test/CodeGen/X86/alloca-align-rounding-32.ll b/test/CodeGen/X86/alloca-align-rounding-32.ll
index 2b5a205086e9..002179970531 100644
--- a/test/CodeGen/X86/alloca-align-rounding-32.ll
+++ b/test/CodeGen/X86/alloca-align-rounding-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s
declare void @bar(<2 x i64>* %n)
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 9d8b6cfa6730..7bc485eb2386 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI
+; RUN: llc < %s -mtriple=x86_64-pc-linux -enable-misched=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -enable-misched=false | FileCheck %s -check-prefix=X32ABI
declare void @bar(<2 x i64>* %n)
diff --git a/test/CodeGen/X86/and-sink.ll b/test/CodeGen/X86/and-sink.ll
index 0f877e778c70..6d23d6cfb704 100644
--- a/test/CodeGen/X86/and-sink.ll
+++ b/test/CodeGen/X86/and-sink.ll
@@ -9,15 +9,15 @@
; Test that 'and' is sunk into bb0.
define i32 @and_sink1(i32 %a, i1 %c) {
; CHECK-LABEL: and_sink1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB0_3
-; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: # %bb.1: # %bb0
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl $0, A
; CHECK-NEXT: testb $4, %al
; CHECK-NEXT: jne .LBB0_3
-; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: # %bb.2: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB0_3: # %bb2
@@ -46,11 +46,11 @@ bb2:
; Test that both 'and' and cmp get sunk to bb1.
define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
; CHECK-LABEL: and_sink2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $0, A
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB1_5
-; CHECK-NEXT: # BB#1: # %bb0.preheader
+; CHECK-NEXT: # %bb.1: # %bb0.preheader
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: .p2align 4, 0x90
@@ -59,12 +59,12 @@ define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
; CHECK-NEXT: movl $0, B
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je .LBB1_5
-; CHECK-NEXT: # BB#3: # %bb1
+; CHECK-NEXT: # %bb.3: # %bb1
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: movl $0, C
; CHECK-NEXT: testb $4, %cl
; CHECK-NEXT: jne .LBB1_2
-; CHECK-NEXT: # BB#4: # %bb2
+; CHECK-NEXT: # %bb.4: # %bb2
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB1_5: # %bb3
@@ -100,10 +100,10 @@ bb3:
; Test that CodeGenPrepare doesn't get stuck in a loop sinking and hoisting a masked load.
define i32 @and_sink3(i1 %c, i32* %p) {
; CHECK-LABEL: and_sink3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB2_3
-; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: # %bb.1: # %bb0
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl (%eax), %eax
; CHECK-NEXT: testl %eax, %eax
@@ -138,16 +138,16 @@ bb2:
; Test that CodeGenPrepare sinks/duplicates non-immediate 'and'.
define i32 @and_sink4(i32 %a, i32 %b, i1 %c) {
; CHECK-LABEL: and_sink4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB3_4
-; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: # %bb.1: # %bb0
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: testl %eax, %ecx
; CHECK-NEXT: movl $0, A
; CHECK-NEXT: jne .LBB3_4
-; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: # %bb.2: # %bb1
; CHECK-NEXT: leal (%ecx,%eax), %edx
; CHECK-NEXT: testl %eax, %ecx
; CHECK-NEXT: movl %edx, B
@@ -189,15 +189,15 @@ bb3:
; when it would increase register pressure.
define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) {
; CHECK-LABEL: and_sink5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB4_4
-; CHECK-NEXT: # BB#1: # %bb0
+; CHECK-NEXT: # %bb.1: # %bb0
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl $0, A
; CHECK-NEXT: jne .LBB4_4
-; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: # %bb.2: # %bb1
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: testl %eax, %eax
diff --git a/test/CodeGen/X86/and-su.ll b/test/CodeGen/X86/and-su.ll
index bdbab1535016..7e87e7b4b409 100644
--- a/test/CodeGen/X86/and-su.ll
+++ b/test/CodeGen/X86/and-su.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; Don't duplicate the load.
diff --git a/test/CodeGen/X86/andimm8.ll b/test/CodeGen/X86/andimm8.ll
index d9e676aa66c5..36f634897f99 100644
--- a/test/CodeGen/X86/andimm8.ll
+++ b/test/CodeGen/X86/andimm8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -show-mc-encoding | FileCheck %s
; PR8365
; CHECK: andl $-64, %edi # encoding: [0x83,0xe7,0xc0]
diff --git a/test/CodeGen/X86/anyext.ll b/test/CodeGen/X86/anyext.ll
index 4f4218bdd63d..f0b514343b50 100644
--- a/test/CodeGen/X86/anyext.ll
+++ b/test/CodeGen/X86/anyext.ll
@@ -6,18 +6,18 @@
define i32 @foo(i32 %p, i8 zeroext %x) nounwind {
; X32-LABEL: foo:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb {{[0-9]+}}(%esp)
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: retl
;
; X64-LABEL: foo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: andl $1, %eax
@@ -31,20 +31,20 @@ define i32 @foo(i32 %p, i8 zeroext %x) nounwind {
define i32 @bar(i32 %p, i16 zeroext %x) nounwind {
; X32-LABEL: bar:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: divw {{[0-9]+}}(%esp)
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<def>
+; X32-NEXT: # kill: def %ax killed %ax def %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: retl
;
; X64-LABEL: bar:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: movl %edi, %eax
; X64-NEXT: divw %si
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<def>
+; X64-NEXT: # kill: def %ax killed %ax def %eax
; X64-NEXT: andl $1, %eax
; X64-NEXT: retq
%q = trunc i32 %p to i16
diff --git a/test/CodeGen/X86/asm-global-imm.ll b/test/CodeGen/X86/asm-global-imm.ll
index 82610114ad34..a971882a444f 100644
--- a/test/CodeGen/X86/asm-global-imm.ll
+++ b/test/CodeGen/X86/asm-global-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static -no-integrated-as | FileCheck %s
+; RUN: llc < %s -relocation-model=static -no-integrated-as | FileCheck %s
; PR882
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/asm-modifier-P.ll b/test/CodeGen/X86/asm-modifier-P.ll
index 0aa55556d8f3..ae7155034ccf 100644
--- a/test/CodeGen/X86/asm-modifier-P.ll
+++ b/test/CodeGen/X86/asm-modifier-P.ll
@@ -1,12 +1,11 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-unknown-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-PIC-32
-; RUN: llc < %s -march=x86 -mtriple=i686-unknown-linux-gnu -relocation-model=static | FileCheck %s -check-prefix=CHECK-STATIC-32
-; RUN: llc < %s -march=x86-64 -relocation-model=static | FileCheck %s -check-prefix=CHECK-STATIC-64
-; RUN: llc < %s -march=x86-64 -relocation-model=pic | FileCheck %s -check-prefix=CHECK-PIC-64
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-PIC-32
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -relocation-model=static | FileCheck %s -check-prefix=CHECK-STATIC-32
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=static | FileCheck %s -check-prefix=CHECK-STATIC-64
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic | FileCheck %s -check-prefix=CHECK-PIC-64
; PR3379
; XFAIL: *
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "x86_64-unknown-linux-gnu"
@G = external global i32 ; <i32*> [#uses=1]
declare void @bar(...)
diff --git a/test/CodeGen/X86/atom-cmpb.ll b/test/CodeGen/X86/atom-cmpb.ll
index baf0f5e87fc9..e1894cb58d09 100644
--- a/test/CodeGen/X86/atom-cmpb.ll
+++ b/test/CodeGen/X86/atom-cmpb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=atom | FileCheck %s
; CHECK: movl
; CHECK: movb
; CHECK: movb
diff --git a/test/CodeGen/X86/atom-fixup-lea2.ll b/test/CodeGen/X86/atom-fixup-lea2.ll
index 68b376ea5cc2..9b0b472be0f3 100644
--- a/test/CodeGen/X86/atom-fixup-lea2.ll
+++ b/test/CodeGen/X86/atom-fixup-lea2.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
; RUN: llc < %s -mcpu=goldmont -mtriple=i686-linux | FileCheck %s
-; CHECK:BB#5
+; CHECK:%bb.5
; CHECK-NEXT:leal
; CHECK-NEXT:leal
; CHECK-NEXT:leal
diff --git a/test/CodeGen/X86/atom-fixup-lea3.ll b/test/CodeGen/X86/atom-fixup-lea3.ll
index ed2df277480e..e79d2e69e347 100644
--- a/test/CodeGen/X86/atom-fixup-lea3.ll
+++ b/test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; CHECK: addl ([[reg:%[a-z]+]])
-; CHECK-NEXT: addl $4, [[reg]]
+; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: movl
+; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: incl
; Test for the FixupLEAs pre-emit pass.
; An LEA should NOT be substituted for the ADD instruction
@@ -20,7 +22,7 @@
; return sum;
;}
-define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
+define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
entry:
%cmp7 = icmp sgt i32 %n, 0
br i1 %cmp7, label %for.body.lr.ph, label %for.end
@@ -35,6 +37,9 @@ for.body: ; preds = %for.body, %for.body
%j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
%inc1 = add nsw i32 %j.09, 1
%arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
+ store i32 %0, i32* %m, align 4
+ store i32 %sum.010, i32* %m, align 4
+ store i32 %0, i32* %m, align 4
%1 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %1
store i32 %add, i32* %m, align 4
diff --git a/test/CodeGen/X86/atom-sched.ll b/test/CodeGen/X86/atom-sched.ll
index bddb015a0dd5..19249ab684ac 100644
--- a/test/CodeGen/X86/atom-sched.ll
+++ b/test/CodeGen/X86/atom-sched.ll
@@ -1,7 +1,7 @@
-; RUN: llc <%s -O2 -mcpu=atom -march=x86 -relocation-model=static | FileCheck -check-prefix=atom %s
-; RUN: llc <%s -O2 -mcpu=slm -march=x86 -relocation-model=static | FileCheck -check-prefix=slm %s
-; RUN: llc <%s -O2 -mcpu=goldmont -march=x86 -relocation-model=static | FileCheck -check-prefix=slm %s
-; RUN: llc <%s -O2 -mcpu=core2 -march=x86 -relocation-model=static | FileCheck %s
+; RUN: llc <%s -O2 -mcpu=atom -mtriple=i686-- -relocation-model=static | FileCheck -check-prefix=atom %s
+; RUN: llc <%s -O2 -mcpu=slm -mtriple=i686-- -relocation-model=static | FileCheck -check-prefix=slm %s
+; RUN: llc <%s -O2 -mcpu=goldmont -mtriple=i686-- -relocation-model=static | FileCheck -check-prefix=slm %s
+; RUN: llc <%s -O2 -mcpu=core2 -mtriple=i686-- -relocation-model=static | FileCheck %s
;
@a = common global i32 0, align 4
diff --git a/test/CodeGen/X86/atomic-dagsched.ll b/test/CodeGen/X86/atomic-dagsched.ll
index 97bb1afa47a7..15cf96f473c8 100644
--- a/test/CodeGen/X86/atomic-dagsched.ll
+++ b/test/CodeGen/X86/atomic-dagsched.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s
define void @test(i8** %a, i64* %b, i64 %c, i64 %d) nounwind {
entry:
diff --git a/test/CodeGen/X86/atomic-eflags-reuse.ll b/test/CodeGen/X86/atomic-eflags-reuse.ll
index 9521a2afefcd..fc2b5671e821 100644
--- a/test/CodeGen/X86/atomic-eflags-reuse.ll
+++ b/test/CodeGen/X86/atomic-eflags-reuse.ll
@@ -1,13 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=CHECK --check-prefix=FASTINCDEC
+; RUN: llc < %s -mtriple=x86_64-- -mattr=slow-incdec | FileCheck %s --check-prefix=CHECK --check-prefix=SLOWINCDEC
define i32 @test_add_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_add_1_cmov_slt:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: lock incq (%rdi)
-; CHECK-NEXT: cmovgl %edx, %esi
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: retq
+; FASTINCDEC-LABEL: test_add_1_cmov_slt:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock incq (%rdi)
+; FASTINCDEC-NEXT: cmovgl %edx, %esi
+; FASTINCDEC-NEXT: movl %esi, %eax
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_add_1_cmov_slt:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock addq $1, (%rdi)
+; SLOWINCDEC-NEXT: cmovgl %edx, %esi
+; SLOWINCDEC-NEXT: movl %esi, %eax
+; SLOWINCDEC-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
%tmp1 = icmp slt i64 %tmp0, 0
@@ -16,12 +24,19 @@ entry:
}
define i32 @test_add_1_cmov_sge(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_add_1_cmov_sge:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: lock incq (%rdi)
-; CHECK-NEXT: cmovlel %edx, %esi
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: retq
+; FASTINCDEC-LABEL: test_add_1_cmov_sge:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock incq (%rdi)
+; FASTINCDEC-NEXT: cmovlel %edx, %esi
+; FASTINCDEC-NEXT: movl %esi, %eax
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_add_1_cmov_sge:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock addq $1, (%rdi)
+; SLOWINCDEC-NEXT: cmovlel %edx, %esi
+; SLOWINCDEC-NEXT: movl %esi, %eax
+; SLOWINCDEC-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
%tmp1 = icmp sge i64 %tmp0, 0
@@ -30,12 +45,19 @@ entry:
}
define i32 @test_sub_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_sub_1_cmov_sle:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: lock decq (%rdi)
-; CHECK-NEXT: cmovgel %edx, %esi
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: retq
+; FASTINCDEC-LABEL: test_sub_1_cmov_sle:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock decq (%rdi)
+; FASTINCDEC-NEXT: cmovgel %edx, %esi
+; FASTINCDEC-NEXT: movl %esi, %eax
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_cmov_sle:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT: cmovgel %edx, %esi
+; SLOWINCDEC-NEXT: movl %esi, %eax
+; SLOWINCDEC-NEXT: retq
entry:
%tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
%tmp1 = icmp sle i64 %tmp0, 0
@@ -44,12 +66,19 @@ entry:
}
define i32 @test_sub_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_sub_1_cmov_sgt:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: lock decq (%rdi)
-; CHECK-NEXT: cmovll %edx, %esi
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: retq
+; FASTINCDEC-LABEL: test_sub_1_cmov_sgt:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock decq (%rdi)
+; FASTINCDEC-NEXT: cmovll %edx, %esi
+; FASTINCDEC-NEXT: movl %esi, %eax
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_cmov_sgt:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT: cmovll %edx, %esi
+; SLOWINCDEC-NEXT: movl %esi, %eax
+; SLOWINCDEC-NEXT: retq
entry:
%tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
%tmp1 = icmp sgt i64 %tmp0, 0
@@ -60,11 +89,11 @@ entry:
; FIXME: (setcc slt x, 0) gets combined into shr early.
define i8 @test_add_1_setcc_slt(i64* %p) #0 {
; CHECK-LABEL: test_add_1_setcc_slt:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: shrq $63, %rax
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %RAX<kill>
+; CHECK-NEXT: # kill: def %al killed %al killed %rax
; CHECK-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
@@ -74,11 +103,17 @@ entry:
}
define i8 @test_sub_1_setcc_sgt(i64* %p) #0 {
-; CHECK-LABEL: test_sub_1_setcc_sgt:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: lock decq (%rdi)
-; CHECK-NEXT: setge %al
-; CHECK-NEXT: retq
+; FASTINCDEC-LABEL: test_sub_1_setcc_sgt:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock decq (%rdi)
+; FASTINCDEC-NEXT: setge %al
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_setcc_sgt:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock addq $-1, (%rdi)
+; SLOWINCDEC-NEXT: setge %al
+; SLOWINCDEC-NEXT: retq
entry:
%tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
%tmp1 = icmp sgt i64 %tmp0, 0
@@ -87,16 +122,27 @@ entry:
}
define i32 @test_add_1_brcond_sge(i64* %p, i32 %a0, i32 %a1) #0 {
-; CHECK-LABEL: test_add_1_brcond_sge:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: lock incq (%rdi)
-; CHECK-NEXT: jle .LBB6_2
-; CHECK-NEXT: # BB#1: # %t
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB6_2: # %f
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: retq
+; FASTINCDEC-LABEL: test_add_1_brcond_sge:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock incq (%rdi)
+; FASTINCDEC-NEXT: jle .LBB6_2
+; FASTINCDEC-NEXT: # %bb.1: # %t
+; FASTINCDEC-NEXT: movl %esi, %eax
+; FASTINCDEC-NEXT: retq
+; FASTINCDEC-NEXT: .LBB6_2: # %f
+; FASTINCDEC-NEXT: movl %edx, %eax
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_add_1_brcond_sge:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock addq $1, (%rdi)
+; SLOWINCDEC-NEXT: jle .LBB6_2
+; SLOWINCDEC-NEXT: # %bb.1: # %t
+; SLOWINCDEC-NEXT: movl %esi, %eax
+; SLOWINCDEC-NEXT: retq
+; SLOWINCDEC-NEXT: .LBB6_2: # %f
+; SLOWINCDEC-NEXT: movl %edx, %eax
+; SLOWINCDEC-NEXT: retq
entry:
%tmp0 = atomicrmw add i64* %p, i64 1 seq_cst
%tmp1 = icmp sge i64 %tmp0, 0
@@ -112,7 +158,7 @@ f:
define i32 @test_add_1_cmov_sle(i64* %p, i32 %a0, i32 %a1) #0 {
; CHECK-LABEL: test_add_1_cmov_sle:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: testq %rax, %rax
@@ -128,7 +174,7 @@ entry:
define i32 @test_add_1_cmov_sgt(i64* %p, i32 %a0, i32 %a1) #0 {
; CHECK-LABEL: test_add_1_cmov_sgt:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: testq %rax, %rax
@@ -146,7 +192,7 @@ entry:
define i8 @test_add_1_setcc_sgt_reuse(i64* %p, i64* %p2) #0 {
; CHECK-LABEL: test_add_1_setcc_sgt_reuse:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $1, %ecx
; CHECK-NEXT: lock xaddq %rcx, (%rdi)
; CHECK-NEXT: testq %rcx, %rcx
@@ -163,7 +209,7 @@ entry:
define i8 @test_sub_2_setcc_sgt(i64* %p) #0 {
; CHECK-LABEL: test_sub_2_setcc_sgt:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq $-2, %rax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: testq %rax, %rax
@@ -179,7 +225,7 @@ entry:
define i8 @test_add_1_cmov_cmov(i64* %p, i8* %q) #0 {
; TODO: It's possible to use "lock inc" here, but both cmovs need to be updated.
; CHECK-LABEL: test_add_1_cmov_cmov:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: lock xaddq %rax, (%rdi)
; CHECK-NEXT: testq %rax, %rax
@@ -192,4 +238,102 @@ entry:
ret i8 %s2
}
+define i8 @test_sub_1_cmp_1_setcc_eq(i64* %p) #0 {
+; FASTINCDEC-LABEL: test_sub_1_cmp_1_setcc_eq:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock decq (%rdi)
+; FASTINCDEC-NEXT: sete %al
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_cmp_1_setcc_eq:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock subq $1, (%rdi)
+; SLOWINCDEC-NEXT: sete %al
+; SLOWINCDEC-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp eq i64 %tmp0, 1
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_1_cmp_1_setcc_ne(i64* %p) #0 {
+; FASTINCDEC-LABEL: test_sub_1_cmp_1_setcc_ne:
+; FASTINCDEC: # %bb.0: # %entry
+; FASTINCDEC-NEXT: lock decq (%rdi)
+; FASTINCDEC-NEXT: setne %al
+; FASTINCDEC-NEXT: retq
+;
+; SLOWINCDEC-LABEL: test_sub_1_cmp_1_setcc_ne:
+; SLOWINCDEC: # %bb.0: # %entry
+; SLOWINCDEC-NEXT: lock subq $1, (%rdi)
+; SLOWINCDEC-NEXT: setne %al
+; SLOWINCDEC-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp ne i64 %tmp0, 1
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_1_cmp_1_setcc_ugt(i64* %p) #0 {
+; CHECK-LABEL: test_sub_1_cmp_1_setcc_ugt:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lock subq $1, (%rdi)
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp ugt i64 %tmp0, 1
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+; FIXME: This test canonicalizes in a way that hides the fact that the
+; comparison can be folded into the atomic subtract.
+define i8 @test_sub_1_cmp_1_setcc_sle(i64* %p) #0 {
+; CHECK-LABEL: test_sub_1_cmp_1_setcc_sle:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq $-1, %rax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: cmpq $2, %rax
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 1 seq_cst
+ %tmp1 = icmp sle i64 %tmp0, 1
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+define i8 @test_sub_3_cmp_3_setcc_eq(i64* %p) #0 {
+; CHECK-LABEL: test_sub_3_cmp_3_setcc_eq:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lock subq $3, (%rdi)
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 3 seq_cst
+ %tmp1 = icmp eq i64 %tmp0, 3
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
+; FIXME: This test canonicalizes in a way that hides the fact that the
+; comparison can be folded into the atomic subtract.
+define i8 @test_sub_3_cmp_3_setcc_uge(i64* %p) #0 {
+; CHECK-LABEL: test_sub_3_cmp_3_setcc_uge:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq $-3, %rax
+; CHECK-NEXT: lock xaddq %rax, (%rdi)
+; CHECK-NEXT: cmpq $2, %rax
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: retq
+entry:
+ %tmp0 = atomicrmw sub i64* %p, i64 3 seq_cst
+ %tmp1 = icmp uge i64 %tmp0, 3
+ %tmp2 = zext i1 %tmp1 to i8
+ ret i8 %tmp2
+}
+
attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/atomic-load-store-wide.ll b/test/CodeGen/X86/atomic-load-store-wide.ll
index df0af5f7f27d..6940e8a7d408 100644
--- a/test/CodeGen/X86/atomic-load-store-wide.ll
+++ b/test/CodeGen/X86/atomic-load-store-wide.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 -mtriple=i686-- -verify-machineinstrs | FileCheck %s
; 64-bit load/store on x86-32
; FIXME: The generated code can be substantially improved.
diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll
index 1a6fde371f09..fec740f591f0 100644
--- a/test/CodeGen/X86/atomic-minmax-i6432.ll
+++ b/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -1,52 +1,297 @@
-; RUN: llc -march=x86 -mattr=+cmov,cx16 -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=LINUX
-; RUN: llc -march=x86 -mattr=cx16 -mtriple=i386-macosx -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+cmov,cx16 -mtriple=i386-pc-linux -verify-machineinstrs < %s | FileCheck %s -check-prefix=LINUX
+; RUN: llc -mattr=cx16 -mtriple=i386-macosx -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s -check-prefix=PIC
@sc64 = external global i64
-define void @atomic_maxmin_i6432() {
-; LINUX: atomic_maxmin_i6432
- %1 = atomicrmw max i64* @sc64, i64 5 acquire
-; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; LINUX: cmpl
-; LINUX: sbbl
-; LINUX: jne
-; LINUX: jne
-; LINUX: lock cmpxchg8b
-; LINUX: jne [[LABEL]]
- %2 = atomicrmw min i64* @sc64, i64 6 acquire
-; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; LINUX: cmpl
-; LINUX: sbbl
-; LINUX: jne
-; LINUX: jne
-; LINUX: lock cmpxchg8b
-; LINUX: jne [[LABEL]]
- %3 = atomicrmw umax i64* @sc64, i64 7 acquire
-; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; LINUX: cmpl
-; LINUX: sbbl
-; LINUX: jne
-; LINUX: jne
-; LINUX: lock cmpxchg8b
-; LINUX: jne [[LABEL]]
- %4 = atomicrmw umin i64* @sc64, i64 8 acquire
-; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
-; LINUX: cmpl
-; LINUX: sbbl
-; LINUX: jne
-; LINUX: jne
-; LINUX: lock cmpxchg8b
-; LINUX: jne [[LABEL]]
- ret void
+define i64 @atomic_max_i64() nounwind {
+; LINUX-LABEL: atomic_max_i64:
+; LINUX: # %bb.0: # %entry
+; LINUX-NEXT: pushl %ebx
+; LINUX-NEXT: pushl %esi
+; LINUX-NEXT: movl sc64+4, %edx
+; LINUX-NEXT: movl sc64, %eax
+; LINUX-NEXT: movl $4, %esi
+; LINUX-NEXT: .p2align 4, 0x90
+; LINUX-NEXT: .LBB0_1: # %atomicrmw.start
+; LINUX-NEXT: # =>This Inner Loop Header: Depth=1
+; LINUX-NEXT: cmpl %eax, %esi
+; LINUX-NEXT: movl $0, %ecx
+; LINUX-NEXT: sbbl %edx, %ecx
+; LINUX-NEXT: movl $0, %ecx
+; LINUX-NEXT: cmovll %edx, %ecx
+; LINUX-NEXT: movl $5, %ebx
+; LINUX-NEXT: cmovll %eax, %ebx
+; LINUX-NEXT: lock cmpxchg8b sc64
+; LINUX-NEXT: jne .LBB0_1
+; LINUX-NEXT: # %bb.2: # %atomicrmw.end
+; LINUX-NEXT: popl %esi
+; LINUX-NEXT: popl %ebx
+; LINUX-NEXT: retl
+;
+; PIC-LABEL: atomic_max_i64:
+; PIC: ## %bb.0: ## %entry
+; PIC-NEXT: pushl %ebx
+; PIC-NEXT: pushl %edi
+; PIC-NEXT: pushl %esi
+; PIC-NEXT: calll L0$pb
+; PIC-NEXT: L0$pb:
+; PIC-NEXT: popl %eax
+; PIC-NEXT: movl L_sc64$non_lazy_ptr-L0$pb(%eax), %esi
+; PIC-NEXT: movl (%esi), %eax
+; PIC-NEXT: movl 4(%esi), %edx
+; PIC-NEXT: movl $4, %edi
+; PIC-NEXT: .p2align 4, 0x90
+; PIC-NEXT: LBB0_1: ## %atomicrmw.start
+; PIC-NEXT: ## =>This Inner Loop Header: Depth=1
+; PIC-NEXT: cmpl %eax, %edi
+; PIC-NEXT: movl $0, %ecx
+; PIC-NEXT: sbbl %edx, %ecx
+; PIC-NEXT: movl $0, %ecx
+; PIC-NEXT: cmovll %edx, %ecx
+; PIC-NEXT: movl $5, %ebx
+; PIC-NEXT: cmovll %eax, %ebx
+; PIC-NEXT: lock cmpxchg8b (%esi)
+; PIC-NEXT: jne LBB0_1
+; PIC-NEXT: ## %bb.2: ## %atomicrmw.end
+; PIC-NEXT: popl %esi
+; PIC-NEXT: popl %edi
+; PIC-NEXT: popl %ebx
+; PIC-NEXT: retl
+; PIC-NEXT: ## -- End function
+entry:
+ %max = atomicrmw max i64* @sc64, i64 5 acquire
+ ret i64 %max
+}
+
+define i64 @atomic_min_i64() nounwind {
+; LINUX-LABEL: atomic_min_i64:
+; LINUX: # %bb.0: # %entry
+; LINUX-NEXT: pushl %ebx
+; LINUX-NEXT: movl sc64+4, %edx
+; LINUX-NEXT: movl sc64, %eax
+; LINUX-NEXT: .p2align 4, 0x90
+; LINUX-NEXT: .LBB1_1: # %atomicrmw.start
+; LINUX-NEXT: # =>This Inner Loop Header: Depth=1
+; LINUX-NEXT: cmpl $7, %eax
+; LINUX-NEXT: movl %edx, %ecx
+; LINUX-NEXT: sbbl $0, %ecx
+; LINUX-NEXT: movl $0, %ecx
+; LINUX-NEXT: cmovll %edx, %ecx
+; LINUX-NEXT: movl $6, %ebx
+; LINUX-NEXT: cmovll %eax, %ebx
+; LINUX-NEXT: lock cmpxchg8b sc64
+; LINUX-NEXT: jne .LBB1_1
+; LINUX-NEXT: # %bb.2: # %atomicrmw.end
+; LINUX-NEXT: popl %ebx
+; LINUX-NEXT: retl
+;
+; PIC-LABEL: atomic_min_i64:
+; PIC: ## %bb.0: ## %entry
+; PIC-NEXT: pushl %ebx
+; PIC-NEXT: pushl %esi
+; PIC-NEXT: calll L1$pb
+; PIC-NEXT: L1$pb:
+; PIC-NEXT: popl %eax
+; PIC-NEXT: movl L_sc64$non_lazy_ptr-L1$pb(%eax), %esi
+; PIC-NEXT: movl (%esi), %eax
+; PIC-NEXT: movl 4(%esi), %edx
+; PIC-NEXT: .p2align 4, 0x90
+; PIC-NEXT: LBB1_1: ## %atomicrmw.start
+; PIC-NEXT: ## =>This Inner Loop Header: Depth=1
+; PIC-NEXT: cmpl $7, %eax
+; PIC-NEXT: movl %edx, %ecx
+; PIC-NEXT: sbbl $0, %ecx
+; PIC-NEXT: movl $0, %ecx
+; PIC-NEXT: cmovll %edx, %ecx
+; PIC-NEXT: movl $6, %ebx
+; PIC-NEXT: cmovll %eax, %ebx
+; PIC-NEXT: lock cmpxchg8b (%esi)
+; PIC-NEXT: jne LBB1_1
+; PIC-NEXT: ## %bb.2: ## %atomicrmw.end
+; PIC-NEXT: popl %esi
+; PIC-NEXT: popl %ebx
+; PIC-NEXT: retl
+; PIC-NEXT: ## -- End function
+entry:
+ %min = atomicrmw min i64* @sc64, i64 6 acquire
+ ret i64 %min
+}
+
+define i64 @atomic_umax_i64() nounwind {
+; LINUX-LABEL: atomic_umax_i64:
+; LINUX: # %bb.0: # %entry
+; LINUX-NEXT: pushl %ebx
+; LINUX-NEXT: pushl %esi
+; LINUX-NEXT: movl sc64+4, %edx
+; LINUX-NEXT: movl sc64, %eax
+; LINUX-NEXT: movl $7, %esi
+; LINUX-NEXT: .p2align 4, 0x90
+; LINUX-NEXT: .LBB2_1: # %atomicrmw.start
+; LINUX-NEXT: # =>This Inner Loop Header: Depth=1
+; LINUX-NEXT: cmpl %eax, %esi
+; LINUX-NEXT: movl $0, %ecx
+; LINUX-NEXT: sbbl %edx, %ecx
+; LINUX-NEXT: movl $0, %ecx
+; LINUX-NEXT: cmovbl %edx, %ecx
+; LINUX-NEXT: movl $7, %ebx
+; LINUX-NEXT: cmovbl %eax, %ebx
+; LINUX-NEXT: lock cmpxchg8b sc64
+; LINUX-NEXT: jne .LBB2_1
+; LINUX-NEXT: # %bb.2: # %atomicrmw.end
+; LINUX-NEXT: popl %esi
+; LINUX-NEXT: popl %ebx
+; LINUX-NEXT: retl
+;
+; PIC-LABEL: atomic_umax_i64:
+; PIC: ## %bb.0: ## %entry
+; PIC-NEXT: pushl %ebx
+; PIC-NEXT: pushl %edi
+; PIC-NEXT: pushl %esi
+; PIC-NEXT: calll L2$pb
+; PIC-NEXT: L2$pb:
+; PIC-NEXT: popl %eax
+; PIC-NEXT: movl L_sc64$non_lazy_ptr-L2$pb(%eax), %esi
+; PIC-NEXT: movl (%esi), %eax
+; PIC-NEXT: movl 4(%esi), %edx
+; PIC-NEXT: movl $7, %edi
+; PIC-NEXT: .p2align 4, 0x90
+; PIC-NEXT: LBB2_1: ## %atomicrmw.start
+; PIC-NEXT: ## =>This Inner Loop Header: Depth=1
+; PIC-NEXT: cmpl %eax, %edi
+; PIC-NEXT: movl $0, %ecx
+; PIC-NEXT: sbbl %edx, %ecx
+; PIC-NEXT: movl $0, %ecx
+; PIC-NEXT: cmovbl %edx, %ecx
+; PIC-NEXT: movl $7, %ebx
+; PIC-NEXT: cmovbl %eax, %ebx
+; PIC-NEXT: lock cmpxchg8b (%esi)
+; PIC-NEXT: jne LBB2_1
+; PIC-NEXT: ## %bb.2: ## %atomicrmw.end
+; PIC-NEXT: popl %esi
+; PIC-NEXT: popl %edi
+; PIC-NEXT: popl %ebx
+; PIC-NEXT: retl
+; PIC-NEXT: ## -- End function
+entry:
+ %umax = atomicrmw umax i64* @sc64, i64 7 acquire
+ ret i64 %umax
+}
+
+define i64 @atomic_umin_i64() nounwind {
+; LINUX-LABEL: atomic_umin_i64:
+; LINUX: # %bb.0: # %entry
+; LINUX-NEXT: pushl %ebx
+; LINUX-NEXT: movl sc64+4, %edx
+; LINUX-NEXT: movl sc64, %eax
+; LINUX-NEXT: .p2align 4, 0x90
+; LINUX-NEXT: .LBB3_1: # %atomicrmw.start
+; LINUX-NEXT: # =>This Inner Loop Header: Depth=1
+; LINUX-NEXT: cmpl $9, %eax
+; LINUX-NEXT: movl %edx, %ecx
+; LINUX-NEXT: sbbl $0, %ecx
+; LINUX-NEXT: movl $0, %ecx
+; LINUX-NEXT: cmovbl %edx, %ecx
+; LINUX-NEXT: movl $8, %ebx
+; LINUX-NEXT: cmovbl %eax, %ebx
+; LINUX-NEXT: lock cmpxchg8b sc64
+; LINUX-NEXT: jne .LBB3_1
+; LINUX-NEXT: # %bb.2: # %atomicrmw.end
+; LINUX-NEXT: popl %ebx
+; LINUX-NEXT: retl
+;
+; PIC-LABEL: atomic_umin_i64:
+; PIC: ## %bb.0: ## %entry
+; PIC-NEXT: pushl %ebx
+; PIC-NEXT: pushl %esi
+; PIC-NEXT: calll L3$pb
+; PIC-NEXT: L3$pb:
+; PIC-NEXT: popl %eax
+; PIC-NEXT: movl L_sc64$non_lazy_ptr-L3$pb(%eax), %esi
+; PIC-NEXT: movl (%esi), %eax
+; PIC-NEXT: movl 4(%esi), %edx
+; PIC-NEXT: .p2align 4, 0x90
+; PIC-NEXT: LBB3_1: ## %atomicrmw.start
+; PIC-NEXT: ## =>This Inner Loop Header: Depth=1
+; PIC-NEXT: cmpl $9, %eax
+; PIC-NEXT: movl %edx, %ecx
+; PIC-NEXT: sbbl $0, %ecx
+; PIC-NEXT: movl $0, %ecx
+; PIC-NEXT: cmovbl %edx, %ecx
+; PIC-NEXT: movl $8, %ebx
+; PIC-NEXT: cmovbl %eax, %ebx
+; PIC-NEXT: lock cmpxchg8b (%esi)
+; PIC-NEXT: jne LBB3_1
+; PIC-NEXT: ## %bb.2: ## %atomicrmw.end
+; PIC-NEXT: popl %esi
+; PIC-NEXT: popl %ebx
+; PIC-NEXT: retl
+; PIC-NEXT: ## -- End function
+entry:
+ %umin = atomicrmw umin i64* @sc64, i64 8 acquire
+ ret i64 %umin
}
-; rdar://12453106
@id = internal global i64 0, align 8
define void @tf_bug(i8* %ptr) nounwind {
+; LINUX-LABEL: tf_bug:
+; LINUX: # %bb.0: # %entry
+; LINUX-NEXT: pushl %ebx
+; LINUX-NEXT: pushl %esi
+; LINUX-NEXT: movl {{[0-9]+}}(%esp), %esi
+; LINUX-NEXT: movl id+4, %edx
+; LINUX-NEXT: movl id, %eax
+; LINUX-NEXT: .p2align 4, 0x90
+; LINUX-NEXT: .LBB4_1: # %atomicrmw.start
+; LINUX-NEXT: # =>This Inner Loop Header: Depth=1
+; LINUX-NEXT: movl %eax, %ebx
+; LINUX-NEXT: addl $1, %ebx
+; LINUX-NEXT: movl %edx, %ecx
+; LINUX-NEXT: adcl $0, %ecx
+; LINUX-NEXT: lock cmpxchg8b id
+; LINUX-NEXT: jne .LBB4_1
+; LINUX-NEXT: # %bb.2: # %atomicrmw.end
+; LINUX-NEXT: addl $1, %eax
+; LINUX-NEXT: adcl $0, %edx
+; LINUX-NEXT: movl %eax, (%esi)
+; LINUX-NEXT: movl %edx, 4(%esi)
+; LINUX-NEXT: popl %esi
+; LINUX-NEXT: popl %ebx
+; LINUX-NEXT: retl
+;
; PIC-LABEL: tf_bug:
-; PIC-DAG: movl _id-L1$pb(
-; PIC-DAG: movl (_id-L1$pb)+4(
+; PIC: ## %bb.0: ## %entry
+; PIC-NEXT: pushl %ebx
+; PIC-NEXT: pushl %edi
+; PIC-NEXT: pushl %esi
+; PIC-NEXT: calll L4$pb
+; PIC-NEXT: L4$pb:
+; PIC-NEXT: popl %edi
+; PIC-NEXT: movl {{[0-9]+}}(%esp), %esi
+; PIC-NEXT: movl (_id-L4$pb)+4(%edi), %edx
+; PIC-NEXT: movl _id-L4$pb(%edi), %eax
+; PIC-NEXT: .p2align 4, 0x90
+; PIC-NEXT: LBB4_1: ## %atomicrmw.start
+; PIC-NEXT: ## =>This Inner Loop Header: Depth=1
+; PIC-NEXT: movl %eax, %ebx
+; PIC-NEXT: addl $1, %ebx
+; PIC-NEXT: movl %edx, %ecx
+; PIC-NEXT: adcl $0, %ecx
+; PIC-NEXT: lock cmpxchg8b _id-L4$pb(%edi)
+; PIC-NEXT: jne LBB4_1
+; PIC-NEXT: ## %bb.2: ## %atomicrmw.end
+; PIC-NEXT: addl $1, %eax
+; PIC-NEXT: adcl $0, %edx
+; PIC-NEXT: movl %eax, (%esi)
+; PIC-NEXT: movl %edx, 4(%esi)
+; PIC-NEXT: popl %esi
+; PIC-NEXT: popl %edi
+; PIC-NEXT: popl %ebx
+; PIC-NEXT: retl
+; PIC-NEXT: ## -- End function
+; PIC-NEXT: .zerofill __DATA,__bss,_id,8,3 ## @id
+entry:
%tmp1 = atomicrmw add i64* @id, i64 1 seq_cst
%tmp2 = add i64 %tmp1, 1
%tmp3 = bitcast i8* %ptr to i64*
diff --git a/test/CodeGen/X86/atomic-or.ll b/test/CodeGen/X86/atomic-or.ll
index 60e9968bdc71..690ddd19a1af 100644
--- a/test/CodeGen/X86/atomic-or.ll
+++ b/test/CodeGen/X86/atomic-or.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s
; rdar://9692967
diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll
index c6300708bcc1..896b6d25cf35 100644
--- a/test/CodeGen/X86/atomic128.ll
+++ b/test/CodeGen/X86/atomic128.ll
@@ -8,11 +8,9 @@
; register live-ranges, we end up with a useless copy.
define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
; CHECK-LABEL: val_compare_and_swap:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rcx, %r9
; CHECK-NEXT: movq %rsi, %rax
@@ -28,11 +26,9 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
define void @fetch_and_nand(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_nand:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -48,7 +44,7 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
; CHECK-NEXT: notq %rcx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB1_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -60,11 +56,9 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
define void @fetch_and_or(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_or:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi4:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi5:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -78,7 +72,7 @@ define void @fetch_and_or(i128* %p, i128 %bits) {
; CHECK-NEXT: orq %r8, %rcx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB2_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -90,11 +84,9 @@ define void @fetch_and_or(i128* %p, i128 %bits) {
define void @fetch_and_add(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_add:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi6:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi7:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -108,7 +100,7 @@ define void @fetch_and_add(i128* %p, i128 %bits) {
; CHECK-NEXT: adcq %r8, %rcx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB3_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -120,11 +112,9 @@ define void @fetch_and_add(i128* %p, i128 %bits) {
define void @fetch_and_sub(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_sub:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi8:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi9:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -138,7 +128,7 @@ define void @fetch_and_sub(i128* %p, i128 %bits) {
; CHECK-NEXT: sbbq %r8, %rcx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB4_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -150,11 +140,9 @@ define void @fetch_and_sub(i128* %p, i128 %bits) {
define void @fetch_and_min(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_min:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi10:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi11:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -165,26 +153,13 @@ define void @fetch_and_min(i128* %p, i128 %bits) {
; CHECK-NEXT: cmpq %rax, %rsi
; CHECK-NEXT: movq %r8, %rcx
; CHECK-NEXT: sbbq %rdx, %rcx
-; CHECK-NEXT: setge %cl
-; CHECK-NEXT: andb $1, %cl
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: jne LBB5_3
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: LBB5_3: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: testb %cl, %cl
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: jne LBB5_5
-; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT: movq %r8, %rcx
-; CHECK-NEXT: LBB5_5: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
+; CHECK-NEXT: cmovgeq %rdx, %rcx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: cmovgeq %rax, %rbx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB5_1
-; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -196,11 +171,9 @@ define void @fetch_and_min(i128* %p, i128 %bits) {
define void @fetch_and_max(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_max:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi12:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi13:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -211,26 +184,13 @@ define void @fetch_and_max(i128* %p, i128 %bits) {
; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: sbbq %r8, %rcx
-; CHECK-NEXT: setge %cl
-; CHECK-NEXT: andb $1, %cl
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: jne LBB6_3
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: LBB6_3: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
-; CHECK-NEXT: testb %cl, %cl
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: jne LBB6_5
-; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
; CHECK-NEXT: movq %r8, %rcx
-; CHECK-NEXT: LBB6_5: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
+; CHECK-NEXT: cmovgeq %rdx, %rcx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: cmovgeq %rax, %rbx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB6_1
-; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -242,11 +202,9 @@ define void @fetch_and_max(i128* %p, i128 %bits) {
define void @fetch_and_umin(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_umin:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi14:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi15:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -257,26 +215,13 @@ define void @fetch_and_umin(i128* %p, i128 %bits) {
; CHECK-NEXT: cmpq %rax, %rsi
; CHECK-NEXT: movq %r8, %rcx
; CHECK-NEXT: sbbq %rdx, %rcx
-; CHECK-NEXT: setae %cl
-; CHECK-NEXT: andb $1, %cl
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: jne LBB7_3
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: LBB7_3: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
-; CHECK-NEXT: testb %cl, %cl
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: jne LBB7_5
-; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
; CHECK-NEXT: movq %r8, %rcx
-; CHECK-NEXT: LBB7_5: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT: cmovaeq %rdx, %rcx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: cmovaeq %rax, %rbx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB7_1
-; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -288,11 +233,9 @@ define void @fetch_and_umin(i128* %p, i128 %bits) {
define void @fetch_and_umax(i128* %p, i128 %bits) {
; CHECK-LABEL: fetch_and_umax:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi16:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi17:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %r8
; CHECK-NEXT: movq (%rdi), %rax
@@ -303,26 +246,13 @@ define void @fetch_and_umax(i128* %p, i128 %bits) {
; CHECK-NEXT: cmpq %rax, %rsi
; CHECK-NEXT: movq %r8, %rcx
; CHECK-NEXT: sbbq %rdx, %rcx
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: andb $1, %cl
-; CHECK-NEXT: movq %rax, %rbx
-; CHECK-NEXT: jne LBB8_3
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
-; CHECK-NEXT: movq %rsi, %rbx
-; CHECK-NEXT: LBB8_3: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
-; CHECK-NEXT: testb %cl, %cl
-; CHECK-NEXT: movq %rdx, %rcx
-; CHECK-NEXT: jne LBB8_5
-; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
; CHECK-NEXT: movq %r8, %rcx
-; CHECK-NEXT: LBB8_5: ## %atomicrmw.start
-; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
+; CHECK-NEXT: cmovbq %rdx, %rcx
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: cmovbq %rax, %rbx
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB8_1
-; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
; CHECK-NEXT: popq %rbx
@@ -334,11 +264,9 @@ define void @fetch_and_umax(i128* %p, i128 %bits) {
define i128 @atomic_load_seq_cst(i128* %p) {
; CHECK-LABEL: atomic_load_seq_cst:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi18:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi19:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edx, %edx
@@ -353,11 +281,9 @@ define i128 @atomic_load_seq_cst(i128* %p) {
define i128 @atomic_load_relaxed(i128* %p) {
; CHECK-LABEL: atomic_load_relaxed:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi20:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi21:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edx, %edx
@@ -372,11 +298,9 @@ define i128 @atomic_load_relaxed(i128* %p) {
define void @atomic_store_seq_cst(i128* %p, i128 %in) {
; CHECK-LABEL: atomic_store_seq_cst:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi22:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi23:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rsi, %rbx
@@ -387,7 +311,7 @@ define void @atomic_store_seq_cst(i128* %p, i128 %in) {
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB11_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
store atomic i128 %in, i128* %p seq_cst, align 16
@@ -396,11 +320,9 @@ define void @atomic_store_seq_cst(i128* %p, i128 %in) {
define void @atomic_store_release(i128* %p, i128 %in) {
; CHECK-LABEL: atomic_store_release:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi24:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi25:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rsi, %rbx
@@ -411,7 +333,7 @@ define void @atomic_store_release(i128* %p, i128 %in) {
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB12_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
store atomic i128 %in, i128* %p release, align 16
@@ -420,11 +342,9 @@ define void @atomic_store_release(i128* %p, i128 %in) {
define void @atomic_store_relaxed(i128* %p, i128 %in) {
; CHECK-LABEL: atomic_store_relaxed:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi26:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: Lcfi27:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movq %rdx, %rcx
; CHECK-NEXT: movq %rsi, %rbx
@@ -435,7 +355,7 @@ define void @atomic_store_relaxed(i128* %p, i128 %in) {
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: lock cmpxchg16b (%rdi)
; CHECK-NEXT: jne LBB13_1
-; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT: ## %bb.2: ## %atomicrmw.end
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
store atomic i128 %in, i128* %p unordered, align 16
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index d514ac7b3110..6a18e6a41ca9 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 -verify-machineinstrs | FileCheck %s -check-prefix=WITH-CMOV
-; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 -verify-machineinstrs | FileCheck %s -check-prefix=WITH-CMOV
-; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix NOCMOV
+; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s -check-prefix=WITH-CMOV
+; RUN: llc < %s -O0 -mtriple=i686-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s -check-prefix=WITH-CMOV
+; RUN: llc < %s -O0 -mtriple=i686-- -mcpu=corei7 -mattr=-cmov -verify-machineinstrs | FileCheck %s --check-prefix NOCMOV
@sc32 = external global i32
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
index c6b1c39d35dc..3df34af9bd77 100644
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
@sc64 = external global i64
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
index 5a78934eb3fd..37434ae7b195 100644
--- a/test/CodeGen/X86/atomic6432.ll
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
+; RUN: llc < %s -O0 -mtriple=i686-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
@sc64 = external global i64
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
index 01123ae9b073..c78a919dcb3d 100644
--- a/test/CodeGen/X86/atomic8.ll
+++ b/test/CodeGen/X86/atomic8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
-; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
+; RUN: llc < %s -O0 -mtriple=x86_64-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -mtriple=i686-- -mcpu=corei7 -verify-machineinstrs | FileCheck %s --check-prefix X32
@sc8 = external global i8
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index f60212de5339..86d950abc07d 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=slow-incdec -verify-machineinstrs | FileCheck %s --check-prefix SLOW_INC
; rdar://7103704
diff --git a/test/CodeGen/X86/atomic_idempotent.ll b/test/CodeGen/X86/atomic_idempotent.ll
index 1afc535133d6..cc5563a422a0 100644
--- a/test/CodeGen/X86/atomic_idempotent.ll
+++ b/test/CodeGen/X86/atomic_idempotent.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-; RUN: llc < %s -march=x86 -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=X32
; On x86, an atomic rmw operation that does not modify the value in memory
; (such as atomic add 0) can be replaced by an mfence followed by a mov.
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index aa895dedfe4c..fe724a8f6694 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov,cx16 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- -mattr=+cmov,cx16 -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll
new file mode 100644
index 000000000000..578d7aa75287
--- /dev/null
+++ b/test/CodeGen/X86/avg-mask.ll
@@ -0,0 +1,445 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+
+define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
+; AVX512F-LABEL: avg_v16i8_mask:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v16i8_mask:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512BWVL-NEXT: retq
+ %za = zext <16 x i8> %a to <16 x i16>
+ %zb = zext <16 x i8> %b to <16 x i16>
+ %add = add nuw nsw <16 x i16> %za, %zb
+ %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %trunc = trunc <16 x i16> %lshr to <16 x i8>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
+; AVX512F-LABEL: avg_v16i8_maskz:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v16i8_maskz:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
+; AVX512BWVL-NEXT: retq
+ %za = zext <16 x i8> %a to <16 x i16>
+ %zb = zext <16 x i8> %b to <16 x i16>
+ %add = add nuw nsw <16 x i16> %za, %zb
+ %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %trunc = trunc <16 x i16> %lshr to <16 x i8>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
+; AVX512F-LABEL: avg_v32i8_mask:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v32i8_mask:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512BWVL-NEXT: retq
+ %za = zext <32 x i8> %a to <32 x i16>
+ %zb = zext <32 x i8> %b to <32 x i16>
+ %add = add nuw nsw <32 x i16> %za, %zb
+ %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %trunc = trunc <32 x i16> %lshr to <32 x i8>
+ %mask1 = bitcast i32 %mask to <32 x i1>
+ %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
+; AVX512F-LABEL: avg_v32i8_maskz:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v32i8_maskz:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
+; AVX512BWVL-NEXT: retq
+ %za = zext <32 x i8> %a to <32 x i16>
+ %zb = zext <32 x i8> %b to <32 x i16>
+ %add = add nuw nsw <32 x i16> %za, %zb
+ %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %trunc = trunc <32 x i16> %lshr to <32 x i8>
+ %mask1 = bitcast i32 %mask to <32 x i1>
+ %res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
+; AVX512F-LABEL: avg_v64i8_mask:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: shrq $32, %rax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7
+; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v64i8_mask:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovq %rdi, %k1
+; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BWVL-NEXT: retq
+ %za = zext <64 x i8> %a to <64 x i16>
+ %zb = zext <64 x i8> %b to <64 x i16>
+ %add = add nuw nsw <64 x i16> %za, %zb
+ %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %trunc = trunc <64 x i16> %lshr to <64 x i8>
+ %mask1 = bitcast i64 %mask to <64 x i1>
+ %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
+; AVX512F-LABEL: avg_v64i8_maskz:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: shrq $32, %rax
+; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v64i8_maskz:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovq %rdi, %k1
+; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BWVL-NEXT: retq
+ %za = zext <64 x i8> %a to <64 x i16>
+ %zb = zext <64 x i8> %b to <64 x i16>
+ %add = add nuw nsw <64 x i16> %za, %zb
+ %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %trunc = trunc <64 x i16> %lshr to <64 x i8>
+ %mask1 = bitcast i64 %mask to <64 x i1>
+ %res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
+; AVX512F-LABEL: avg_v8i16_mask:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v8i16_mask:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512BWVL-NEXT: retq
+ %za = zext <8 x i16> %a to <8 x i32>
+ %zb = zext <8 x i16> %b to <8 x i32>
+ %add = add nuw nsw <8 x i32> %za, %zb
+ %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %trunc = trunc <8 x i32> %lshr to <8 x i16>
+ %mask1 = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
+; AVX512F-LABEL: avg_v8i16_maskz:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v8i16_maskz:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
+; AVX512BWVL-NEXT: retq
+ %za = zext <8 x i16> %a to <8 x i32>
+ %zb = zext <8 x i16> %b to <8 x i32>
+ %add = add nuw nsw <8 x i32> %za, %zb
+ %add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %trunc = trunc <8 x i32> %lshr to <8 x i16>
+ %mask1 = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
+; AVX512F-LABEL: avg_v16i16_mask:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v16i16_mask:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512BWVL-NEXT: retq
+ %za = zext <16 x i16> %a to <16 x i32>
+ %zb = zext <16 x i16> %b to <16 x i32>
+ %add = add nuw nsw <16 x i32> %za, %zb
+ %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %trunc = trunc <16 x i32> %lshr to <16 x i16>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
+; AVX512F-LABEL: avg_v16i16_maskz:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v16i16_maskz:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
+; AVX512BWVL-NEXT: retq
+ %za = zext <16 x i16> %a to <16 x i32>
+ %zb = zext <16 x i16> %b to <16 x i32>
+ %add = add nuw nsw <16 x i32> %za, %zb
+ %add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %trunc = trunc <16 x i32> %lshr to <16 x i16>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
+; AVX512F-LABEL: avg_v32i16_mask:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
+; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
+; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v32i16_mask:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BWVL-NEXT: retq
+ %za = zext <32 x i16> %a to <32 x i32>
+ %zb = zext <32 x i16> %b to <32 x i32>
+ %add = add nuw nsw <32 x i32> %za, %zb
+ %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %trunc = trunc <32 x i32> %lshr to <32 x i16>
+ %mask1 = bitcast i32 %mask to <32 x i1>
+ %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
+; AVX512F-LABEL: avg_v32i16_maskz:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512BWVL-LABEL: avg_v32i16_maskz:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: kmovd %edi, %k1
+; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BWVL-NEXT: retq
+ %za = zext <32 x i16> %a to <32 x i32>
+ %zb = zext <32 x i16> %b to <32 x i32>
+ %add = add nuw nsw <32 x i32> %za, %zb
+ %add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %trunc = trunc <32 x i32> %lshr to <32 x i16>
+ %mask1 = bitcast i32 %mask to <32 x i1>
+ %res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 640b5215afe9..d1e26b787f48 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -5,9 +5,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
-define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
+define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
@@ -15,7 +15,7 @@ define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
@@ -33,9 +33,9 @@ define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
ret void
}
-define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
+define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
@@ -43,7 +43,7 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm0, %xmm1, %xmm0
@@ -61,16 +61,16 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
ret void
}
-define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
+define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgb (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgb (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
@@ -87,9 +87,9 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
ret void
}
-define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
+define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa (%rsi), %xmm0
@@ -176,7 +176,7 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -239,7 +239,7 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -247,7 +247,7 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
; AVX512-NEXT: vpavgb (%rdi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
@@ -265,9 +265,9 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
ret void
}
-define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
+define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm6
; SSE2-NEXT: movdqa 16(%rdi), %xmm2
; SSE2-NEXT: movdqa 32(%rdi), %xmm1
@@ -448,10 +448,8 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: subq $24, %rsp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -595,7 +593,7 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -675,7 +673,7 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
@@ -709,10 +707,10 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
@@ -727,9 +725,9 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
ret void
}
-define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
+define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
@@ -737,7 +735,7 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgw %xmm0, %xmm1, %xmm0
@@ -755,16 +753,16 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
ret void
}
-define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
+define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: pavgw (%rdi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rsi), %xmm0
; AVX-NEXT: vpavgw (%rdi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
@@ -781,9 +779,9 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
ret void
}
-define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
+define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm0
@@ -829,7 +827,7 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -864,7 +862,7 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rsi), %ymm0
; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -872,7 +870,7 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rsi), %ymm0
; AVX512-NEXT: vpavgw (%rdi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
@@ -890,9 +888,9 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
ret void
}
-define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
+define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm4
; SSE2-NEXT: movdqa 16(%rdi), %xmm11
; SSE2-NEXT: movdqa 32(%rdi), %xmm10
@@ -978,7 +976,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -1041,7 +1039,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -1080,7 +1078,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
@@ -1098,10 +1096,10 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
@@ -1116,9 +1114,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
ret void
}
-define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
+define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
@@ -1126,7 +1124,7 @@ define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -1144,9 +1142,9 @@ define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
ret void
}
-define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
+define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgb %xmm0, %xmm1
@@ -1154,7 +1152,7 @@ define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
@@ -1172,16 +1170,16 @@ define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
ret void
}
-define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
+define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
@@ -1198,9 +1196,9 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
ret void
}
-define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
+define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: movdqa 16(%rdi), %xmm8
; SSE2-NEXT: movdqa (%rsi), %xmm0
@@ -1287,7 +1285,7 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -1350,7 +1348,7 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -1358,7 +1356,7 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_2:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
@@ -1376,9 +1374,9 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
ret void
}
-define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
+define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rsi), %xmm14
; SSE2-NEXT: movdqa 16(%rsi), %xmm12
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
@@ -1512,7 +1510,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -1629,7 +1627,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -1701,7 +1699,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_2:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
@@ -1731,10 +1729,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_2:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
; AVX512BW-NEXT: vpavgb %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
@@ -1750,9 +1748,9 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
}
-define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
+define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pavgw %xmm0, %xmm1
@@ -1760,7 +1758,7 @@ define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0
@@ -1778,16 +1776,16 @@ define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
ret void
}
-define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
+define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw (%rsi), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw (%rsi), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
@@ -1804,9 +1802,9 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
ret void
}
-define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
+define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm2
; SSE2-NEXT: movdqa 16(%rdi), %xmm4
; SSE2-NEXT: movdqa (%rsi), %xmm0
@@ -1852,7 +1850,7 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -1887,7 +1885,7 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16_2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -1895,7 +1893,7 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16_2:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgw (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
@@ -1913,9 +1911,9 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
ret void
}
-define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
+define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm4
; SSE2-NEXT: movdqa 16(%rdi), %xmm11
; SSE2-NEXT: movdqa 32(%rdi), %xmm10
@@ -2001,7 +1999,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -2064,7 +2062,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -2103,7 +2101,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_2:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
@@ -2121,10 +2119,10 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_2:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
@@ -2139,16 +2137,16 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
ret void
}
-define void @avg_v4i8_const(<4 x i8>* %a) {
+define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v4i8_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movd %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i8_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rax)
@@ -2162,16 +2160,16 @@ define void @avg_v4i8_const(<4 x i8>* %a) {
ret void
}
-define void @avg_v8i8_const(<8 x i8>* %a) {
+define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v8i8_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i8_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
@@ -2185,16 +2183,16 @@ define void @avg_v8i8_const(<8 x i8>* %a) {
ret void
}
-define void @avg_v16i8_const(<16 x i8>* %a) {
+define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v16i8_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgb {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
@@ -2208,69 +2206,60 @@ define void @avg_v16i8_const(<16 x i8>* %a) {
ret void
}
-define void @avg_v32i8_const(<32 x i8>* %a) {
+define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
-; SSE2: # BB#0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [5,6,7,8]
-; SSE2-NEXT: paddd %xmm9, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,3,4]
-; SSE2-NEXT: paddd %xmm3, %xmm7
-; SSE2-NEXT: paddd %xmm9, %xmm6
-; SSE2-NEXT: paddd %xmm3, %xmm4
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,2,3,4]
+; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,6,7,8]
+; SSE2-NEXT: paddd %xmm4, %xmm8
; SSE2-NEXT: paddd %xmm9, %xmm2
-; SSE2-NEXT: paddd %xmm3, %xmm8
+; SSE2-NEXT: paddd %xmm4, %xmm5
+; SSE2-NEXT: paddd %xmm9, %xmm3
+; SSE2-NEXT: paddd %xmm4, %xmm6
; SSE2-NEXT: paddd %xmm9, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: psrld $1, %xmm7
; SSE2-NEXT: psrld $1, %xmm1
-; SSE2-NEXT: psrld $1, %xmm8
-; SSE2-NEXT: psrld $1, %xmm2
-; SSE2-NEXT: psrld $1, %xmm4
+; SSE2-NEXT: packuswb %xmm7, %xmm1
; SSE2-NEXT: psrld $1, %xmm6
-; SSE2-NEXT: psrld $1, %xmm7
+; SSE2-NEXT: psrld $1, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: psrld $1, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: packuswb %xmm5, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm6, %xmm4
-; SSE2-NEXT: packuswb %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm8
-; SSE2-NEXT: packuswb %xmm2, %xmm8
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $1, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: psrld $1, %xmm8
+; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: packuswb %xmm8, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: movdqu %xmm4, (%rax)
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_const:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -2279,9 +2268,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
@@ -2289,36 +2278,27 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm8
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm7
-; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm5
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm3
+; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_const:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -2326,7 +2306,7 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_const:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgb {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
@@ -2341,9 +2321,9 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
ret void
}
-define void @avg_v64i8_const(<64 x i8>* %a) {
+define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v64i8_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
; SSE2-NEXT: movdqa 16(%rdi), %xmm6
; SSE2-NEXT: movdqa 32(%rdi), %xmm15
@@ -2462,7 +2442,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_const:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
@@ -2555,7 +2535,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_const:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -2573,53 +2553,43 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm8
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm9
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm7
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm7, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm7, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
+; AVX2-NEXT: vpackuswb %xmm0, %xmm6, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpackuswb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm8, %xmm3
+; AVX2-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vmovdqu %ymm4, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_const:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
@@ -2646,10 +2616,10 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_const:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgb {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <64 x i8>, <64 x i8>* %a
@@ -2661,16 +2631,16 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
ret void
}
-define void @avg_v4i16_const(<4 x i16>* %a) {
+define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v4i16_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v4i16_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rax)
@@ -2684,16 +2654,16 @@ define void @avg_v4i16_const(<4 x i16>* %a) {
ret void
}
-define void @avg_v8i16_const(<8 x i16>* %a) {
+define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v8i16_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pavgw {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v8i16_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpavgw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqu %xmm0, (%rax)
@@ -2707,9 +2677,9 @@ define void @avg_v8i16_const(<8 x i16>* %a) {
ret void
}
-define void @avg_v16i16_const(<16 x i16>* %a) {
+define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v16i16_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -2744,35 +2714,30 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v16i16_const:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,6,7,8]
; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v16i16_const:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
@@ -2780,7 +2745,7 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v16i16_const:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vpavgw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, (%rax)
@@ -2795,9 +2760,9 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
ret void
}
-define void @avg_v32i16_const(<32 x i16>* %a) {
+define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v32i16_const:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm7
; SSE2-NEXT: movdqa 16(%rdi), %xmm6
; SSE2-NEXT: movdqa 32(%rdi), %xmm4
@@ -2860,7 +2825,7 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i16_const:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -2869,9 +2834,9 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [5,6,7,8]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8]
; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpaddd %xmm0, %xmm5, %xmm5
; AVX1-NEXT: vpaddd %xmm7, %xmm4, %xmm4
@@ -2879,36 +2844,27 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm1
-; AVX1-NEXT: vpsrld $1, %xmm1, %xmm8
+; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT: vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT: vpsrld $1, %xmm9, %xmm7
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2],xmm1[3],xmm7[4],xmm1[5],xmm7[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm1[1],xmm6[2],xmm1[3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
-; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4],xmm1[5],xmm5[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
-; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3],xmm8[4],xmm1[5],xmm8[6],xmm1[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm3, %xmm2
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpsrld $1, %xmm4, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
+; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsrld $1, %xmm6, %xmm3
+; AVX1-NEXT: vpsrld $1, %xmm9, %xmm4
+; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
-; AVX1-NEXT: vmovups %ymm4, (%rax)
+; AVX1-NEXT: vmovups %ymm2, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i16_const:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -2922,24 +2878,23 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2
; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vmovdqu %ymm2, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v32i16_const:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
@@ -2954,10 +2909,10 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v32i16_const:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpavgw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = load <32 x i16>, <32 x i16>* %a
@@ -2968,3 +2923,332 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
store <32 x i16> %5, <32 x i16>* undef, align 4
ret void
}
+
+define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
+; SSE2-LABEL: avg_v16i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pavgb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: avg_v16i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %za = zext <16 x i8> %a to <16 x i16>
+ %zb = zext <16 x i8> %b to <16 x i16>
+ %add = add nuw nsw <16 x i16> %za, %zb
+ %add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %res = trunc <16 x i16> %lshr to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
+; SSE2-LABEL: avg_v32i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
+; SSE2-NEXT: paddw %xmm6, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; SSE2-NEXT: paddw %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; SSE2-NEXT: paddw %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE2-NEXT: paddw %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE2-NEXT: psubw %xmm3, %xmm4
+; SSE2-NEXT: psubw %xmm3, %xmm0
+; SSE2-NEXT: psubw %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm3, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: packuswb %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: avg_v32i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT: vpaddw %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
+; AVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm1, %xmm4, %xmm4
+; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v32i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: avg_v32i8_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %za = zext <32 x i8> %a to <32 x i16>
+ %zb = zext <32 x i8> %b to <32 x i16>
+ %add = add nuw nsw <32 x i16> %za, %zb
+ %add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %res = trunc <32 x i16> %lshr to <32 x i8>
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
+; SSE2-LABEL: avg_v64i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm9, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; SSE2-NEXT: movdqa %xmm1, %xmm11
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; SSE2-NEXT: movdqa %xmm2, %xmm12
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; SSE2-NEXT: movdqa %xmm3, %xmm13
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
+; SSE2-NEXT: paddw %xmm10, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
+; SSE2-NEXT: paddw %xmm4, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; SSE2-NEXT: paddw %xmm11, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE2-NEXT: paddw %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE2-NEXT: paddw %xmm12, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; SSE2-NEXT: paddw %xmm6, %xmm2
+; SSE2-NEXT: movdqa %xmm7, %xmm6
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
+; SSE2-NEXT: paddw %xmm13, %xmm6
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; SSE2-NEXT: paddw %xmm7, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT: psubw %xmm7, %xmm8
+; SSE2-NEXT: psubw %xmm7, %xmm0
+; SSE2-NEXT: psubw %xmm7, %xmm4
+; SSE2-NEXT: psubw %xmm7, %xmm1
+; SSE2-NEXT: psubw %xmm7, %xmm5
+; SSE2-NEXT: psubw %xmm7, %xmm2
+; SSE2-NEXT: psubw %xmm7, %xmm6
+; SSE2-NEXT: psubw %xmm7, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm3
+; SSE2-NEXT: psrlw $1, %xmm6
+; SSE2-NEXT: psrlw $1, %xmm2
+; SSE2-NEXT: psrlw $1, %xmm5
+; SSE2-NEXT: psrlw $1, %xmm1
+; SSE2-NEXT: psrlw $1, %xmm4
+; SSE2-NEXT: psrlw $1, %xmm0
+; SSE2-NEXT: psrlw $1, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm7, %xmm0
+; SSE2-NEXT: packuswb %xmm8, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pand %xmm7, %xmm1
+; SSE2-NEXT: packuswb %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm5
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: packuswb %xmm5, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: packuswb %xmm6, %xmm3
+; SSE2-NEXT: retq
+;
+; AVX1-LABEL: avg_v64i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm12
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm13
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpaddw %xmm4, %xmm6, %xmm14
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm15
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpaddw %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT: vpaddw %xmm2, %xmm11, %xmm2
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpaddw %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
+; AVX1-NEXT: vpaddw %xmm3, %xmm10, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpsubw %xmm5, %xmm12, %xmm8
+; AVX1-NEXT: vpsubw %xmm5, %xmm13, %xmm4
+; AVX1-NEXT: vpsubw %xmm5, %xmm14, %xmm0
+; AVX1-NEXT: vpsubw %xmm5, %xmm15, %xmm1
+; AVX1-NEXT: vpsubw %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm9
+; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm5
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
+; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm7
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: avg_v64i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX2-NEXT: vpaddw %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX2-NEXT: vpaddw %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpsubw %ymm3, %ymm4, %ymm4
+; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: avg_v64i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
+; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
+; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v64i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+ %za = zext <64 x i8> %a to <64 x i16>
+ %zb = zext <64 x i8> %b to <64 x i16>
+ %add = add nuw nsw <64 x i16> %za, %zb
+ %add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %res = trunc <64 x i16> %lshr to <64 x i8>
+ ret <64 x i8> %res
+}
diff --git a/test/CodeGen/X86/avoid-loop-align-2.ll b/test/CodeGen/X86/avoid-loop-align-2.ll
index e02f3569c89d..7e5927e6ee32 100644
--- a/test/CodeGen/X86/avoid-loop-align-2.ll
+++ b/test/CodeGen/X86/avoid-loop-align-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep align | count 4
+; RUN: llc < %s -mtriple=i686-- | grep align | count 4
; TODO: Is it a good idea to align inner loops? It's hard to know without
; knowing what their trip counts are, or other dynamic information. For
diff --git a/test/CodeGen/X86/avx-arith.ll b/test/CodeGen/X86/avx-arith.ll
index 82d890a08cfe..6f535038bb2d 100644
--- a/test/CodeGen/X86/avx-arith.ll
+++ b/test/CodeGen/X86/avx-arith.ll
@@ -3,7 +3,7 @@
define <4 x double> @addpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: addpd256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -13,7 +13,7 @@ entry:
define <4 x double> @addpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: addpd256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -23,7 +23,7 @@ entry:
define <8 x float> @addps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: addps256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -33,7 +33,7 @@ entry:
define <8 x float> @addps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: addps256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -43,7 +43,7 @@ entry:
define <4 x double> @subpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: subpd256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsubpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -53,7 +53,7 @@ entry:
define <4 x double> @subpd256fold(<4 x double> %y, <4 x double>* nocapture %x) nounwind uwtable readonly ssp {
; CHECK-LABEL: subpd256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsubpd (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -64,7 +64,7 @@ entry:
define <8 x float> @subps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: subps256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsubps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -74,7 +74,7 @@ entry:
define <8 x float> @subps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp {
; CHECK-LABEL: subps256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsubps (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -85,7 +85,7 @@ entry:
define <4 x double> @mulpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulpd256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmulpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -95,7 +95,7 @@ entry:
define <4 x double> @mulpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulpd256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -105,7 +105,7 @@ entry:
define <8 x float> @mulps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulps256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmulps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -115,7 +115,7 @@ entry:
define <8 x float> @mulps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: mulps256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -125,7 +125,7 @@ entry:
define <4 x double> @divpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: divpd256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vdivpd %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -135,7 +135,7 @@ entry:
define <4 x double> @divpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: divpd256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vdivpd {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -145,7 +145,7 @@ entry:
define <8 x float> @divps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: divps256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vdivps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -155,7 +155,7 @@ entry:
define <8 x float> @divps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: divps256fold:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vdivps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -165,7 +165,7 @@ entry:
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtA:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -177,7 +177,7 @@ declare double @sqrt(double) readnone
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtB:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -190,7 +190,7 @@ declare float @sqrtf(float) readnone
define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpaddq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm2
@@ -203,7 +203,7 @@ define <4 x i64> @vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpaddd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2
@@ -216,7 +216,7 @@ define <8 x i32> @vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpaddw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddw %xmm2, %xmm3, %xmm2
@@ -229,7 +229,7 @@ define <16 x i16> @vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpaddb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddb %xmm2, %xmm3, %xmm2
@@ -242,7 +242,7 @@ define <32 x i8> @vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpsubq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpsubq %xmm2, %xmm3, %xmm2
@@ -255,7 +255,7 @@ define <4 x i64> @vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpsubd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpsubd %xmm2, %xmm3, %xmm2
@@ -268,7 +268,7 @@ define <8 x i32> @vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpsubw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpsubw %xmm2, %xmm3, %xmm2
@@ -281,7 +281,7 @@ define <16 x i16> @vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: vpsubb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
@@ -294,7 +294,7 @@ define <32 x i8> @vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpmulld:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpmulld %xmm2, %xmm3, %xmm2
@@ -307,7 +307,7 @@ define <8 x i32> @vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-LABEL: vpmullw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpmullw %xmm2, %xmm3, %xmm2
@@ -320,7 +320,7 @@ define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: mul_v4i64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpsrlq $32, %xmm3, %xmm4
@@ -349,7 +349,7 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define <4 x float> @int_sqrt_ss() {
; CHECK-LABEL: int_sqrt_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -361,7 +361,7 @@ define <4 x float> @int_sqrt_ss() {
define <2 x double> @vector_sqrt_scalar_load(double* %a0) optsize {
; CHECK-LABEL: vector_sqrt_scalar_load:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vsqrtpd %xmm0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 6869d088e7cd..d27a641203fd 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -7,7 +7,7 @@
define void @zero128() nounwind ssp {
; CHECK-LABEL: zero128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: movq _z@{{.*}}(%rip), %rax
; CHECK-NEXT: vmovaps %xmm0, (%rax)
@@ -18,9 +18,9 @@ define void @zero128() nounwind ssp {
define void @zero256() nounwind ssp {
; CHECK-LABEL: zero256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq _x@{{.*}}(%rip), %rax
-; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %ymm0, (%rax)
; CHECK-NEXT: movq _y@{{.*}}(%rip), %rax
; CHECK-NEXT: vmovaps %ymm0, (%rax)
@@ -33,8 +33,8 @@ define void @zero256() nounwind ssp {
define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind {
; CHECK-LABEL: ones:
-; CHECK: ## BB#0: ## %allocas
-; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ## %bb.0: ## %allocas
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
@@ -50,8 +50,8 @@ float>* %ptr2vec615, align 32
define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind {
; CHECK-LABEL: ones2:
-; CHECK: ## BB#0: ## %allocas
-; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ## %bb.0: ## %allocas
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
@@ -65,7 +65,7 @@ allocas:
;;; Just make sure this doesn't crash
define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: ISelCrash:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 4>
@@ -75,9 +75,9 @@ define <4 x i64> @ISelCrash(<4 x i64> %a) nounwind uwtable readnone ssp {
;;; Don't crash on movd
define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
; CHECK-LABEL: VMOVZQI2PQI:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; CHECK-NEXT: retq
%ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
%val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
@@ -92,7 +92,7 @@ define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
; rdar://10566486
define <16 x float> @fneg(<16 x float> %a) nounwind {
; CHECK-LABEL: fneg:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1
@@ -104,7 +104,7 @@ define <16 x float> @fneg(<16 x float> %a) nounwind {
;;; Don't crash on build vector
define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
; CHECK-LABEL: build_vec_16x16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movzwl %di, %eax
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: retq
@@ -116,7 +116,7 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
;;; an incorrect mnemonic of "movd" was printed for this instruction.
define i64 @VMOVPQIto64rr(<2 x i64> %a) {
; CHECK-LABEL: VMOVPQIto64rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: retq
%vecext.i = extractelement <2 x i64> %a, i32 0
@@ -126,7 +126,7 @@ define i64 @VMOVPQIto64rr(<2 x i64> %a) {
; PR22685
define <8 x float> @mov00_8f32(float* %ptr) {
; CHECK-LABEL: mov00_8f32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
%val = load float, float* %ptr
diff --git a/test/CodeGen/X86/avx-bitcast.ll b/test/CodeGen/X86/avx-bitcast.ll
index e34c20fcbd73..150c7ccfa0cc 100644
--- a/test/CodeGen/X86/avx-bitcast.ll
+++ b/test/CodeGen/X86/avx-bitcast.ll
@@ -2,7 +2,7 @@
define i64 @bitcasti64tof64() {
; CHECK-LABEL: bitcasti64tof64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx-cast.ll b/test/CodeGen/X86/avx-cast.ll
index 103715c3628e..09bbb564ea85 100644
--- a/test/CodeGen/X86/avx-cast.ll
+++ b/test/CodeGen/X86/avx-cast.ll
@@ -8,9 +8,9 @@
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castA:
-; AVX: ## BB#0:
-; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX: ## %bb.0:
+; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
@@ -19,9 +19,9 @@ define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castB:
-; AVX: ## BB#0:
-; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX: ## %bb.0:
+; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: retq
%shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
@@ -32,17 +32,17 @@ define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
-; AVX1: ## BB#0:
-; AVX1-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: ## %bb.0:
+; AVX1-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castC:
-; AVX2: ## BB#0:
-; AVX2-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2: ## %bb.0:
+; AVX2-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
%shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x i64> %shuffle.i
@@ -53,8 +53,8 @@ define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castD:
-; AVX: ## BB#0:
-; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX: ## %bb.0:
+; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -63,8 +63,8 @@ define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castE:
-; AVX: ## BB#0:
-; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX: ## %bb.0:
+; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
@@ -73,8 +73,8 @@ define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
; AVX-LABEL: castF:
-; AVX: ## BB#0:
-; AVX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX: ## %bb.0:
+; AVX-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/avx-cmp.ll b/test/CodeGen/X86/avx-cmp.ll
index 963878b0f563..968d8e360ecf 100644
--- a/test/CodeGen/X86/avx-cmp.ll
+++ b/test/CodeGen/X86/avx-cmp.ll
@@ -3,7 +3,7 @@
define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind {
; CHECK-LABEL: cmp00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%bincmp = fcmp olt <8 x float> %a, %b
@@ -13,7 +13,7 @@ define <8 x i32> @cmp00(<8 x float> %a, <8 x float> %b) nounwind {
define <4 x i64> @cmp01(<4 x double> %a, <4 x double> %b) nounwind {
; CHECK-LABEL: cmp01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%bincmp = fcmp olt <4 x double> %a, %b
@@ -25,12 +25,12 @@ declare void @scale() nounwind
define void @render() nounwind {
; CHECK-LABEL: render:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB2_6
-; CHECK-NEXT: # BB#1: # %for.cond5.preheader
+; CHECK-NEXT: # %bb.1: # %for.cond5.preheader
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: jmp .LBB2_2
; CHECK-NEXT: .p2align 4, 0x90
@@ -41,11 +41,11 @@ define void @render() nounwind {
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb %bl, %bl
; CHECK-NEXT: jne .LBB2_2
-; CHECK-NEXT: # BB#3: # %for.cond5
+; CHECK-NEXT: # %bb.3: # %for.cond5
; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: testb %bl, %bl
; CHECK-NEXT: je .LBB2_2
-; CHECK-NEXT: # BB#4: # %for.body33
+; CHECK-NEXT: # %bb.4: # %for.body33
; CHECK-NEXT: # in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vucomisd {{\.LCPI.*}}, %xmm0
; CHECK-NEXT: jne .LBB2_5
@@ -78,7 +78,7 @@ for.end52:
define <8 x i32> @int256_cmp(<8 x i32> %i, <8 x i32> %j) nounwind {
; CHECK-LABEL: int256_cmp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
@@ -92,7 +92,7 @@ define <8 x i32> @int256_cmp(<8 x i32> %i, <8 x i32> %j) nounwind {
define <4 x i64> @v4i64_cmp(<4 x i64> %i, <4 x i64> %j) nounwind {
; CHECK-LABEL: v4i64_cmp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
; CHECK-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -106,7 +106,7 @@ define <4 x i64> @v4i64_cmp(<4 x i64> %i, <4 x i64> %j) nounwind {
define <16 x i16> @v16i16_cmp(<16 x i16> %i, <16 x i16> %j) nounwind {
; CHECK-LABEL: v16i16_cmp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
; CHECK-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
@@ -120,7 +120,7 @@ define <16 x i16> @v16i16_cmp(<16 x i16> %i, <16 x i16> %j) nounwind {
define <32 x i8> @v32i8_cmp(<32 x i8> %i, <32 x i8> %j) nounwind {
; CHECK-LABEL: v32i8_cmp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
; CHECK-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
@@ -134,7 +134,7 @@ define <32 x i8> @v32i8_cmp(<32 x i8> %i, <32 x i8> %j) nounwind {
define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind {
; CHECK-LABEL: int256_cmpeq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm3, %xmm2
@@ -148,7 +148,7 @@ define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind {
define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind {
; CHECK-LABEL: v4i64_cmpeq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
@@ -162,7 +162,7 @@ define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind {
define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind {
; CHECK-LABEL: v16i16_cmpeq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2
@@ -176,7 +176,7 @@ define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind {
define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind {
; CHECK-LABEL: v32i8_cmpeq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm2
@@ -192,12 +192,12 @@ define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind {
define i32 @scalarcmpA() uwtable ssp {
; CHECK-LABEL: scalarcmpA:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: # kill: def %eax killed %eax killed %rax
; CHECK-NEXT: retq
%cmp29 = fcmp oeq double undef, 0.000000e+00
%res = zext i1 %cmp29 to i32
@@ -206,7 +206,7 @@ define i32 @scalarcmpA() uwtable ssp {
define i32 @scalarcmpB() uwtable ssp {
; CHECK-LABEL: scalarcmpB:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovd %xmm0, %eax
diff --git a/test/CodeGen/X86/avx-cvt-2.ll b/test/CodeGen/X86/avx-cvt-2.ll
index c849312f2367..7c2df3e99623 100644
--- a/test/CodeGen/X86/avx-cvt-2.ll
+++ b/test/CodeGen/X86/avx-cvt-2.ll
@@ -9,13 +9,10 @@
define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
; CHECK-LABEL: fptoui16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -26,13 +23,10 @@ define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
; CHECK-LABEL: fptosi16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -43,14 +37,11 @@ define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
; CHECK-LABEL: fptoui8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -61,14 +52,11 @@ define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
define void @fptosi8(%f32vec_t %a, %i8vec_t *%p) {
; CHECK-LABEL: fptosi8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovq %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll
index 231334ddcb85..ac99684ab3ab 100644
--- a/test/CodeGen/X86/avx-cvt-3.ll
+++ b/test/CodeGen/X86/avx-cvt-3.ll
@@ -6,15 +6,15 @@
define <8 x float> @sitofp_insert_zero_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_insert_zero_v8i32:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_insert_zero_v8i32:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
@@ -28,15 +28,15 @@ define <8 x float> @sitofp_insert_zero_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_shuffle_zero_v8i32:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_shuffle_zero_v8i32:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
@@ -47,16 +47,16 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_insert_allbits_v8i32:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_insert_allbits_v8i32:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -71,16 +71,16 @@ define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_shuffle_allbits_v8i32:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_shuffle_allbits_v8i32:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -92,8 +92,8 @@ define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_insert_constants_v8i32:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
@@ -107,8 +107,8 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
; X86-NEXT: retl
;
; X64-LABEL: sitofp_insert_constants_v8i32:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
@@ -130,13 +130,13 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
define <8 x float> @sitofp_shuffle_constants_v8i32(<8 x i32> %a0) {
; X86-LABEL: sitofp_shuffle_constants_v8i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: sitofp_shuffle_constants_v8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index f2900dba938a..0a6ba2f84faf 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -3,7 +3,7 @@
define <8 x float> @sitofp00(<8 x i32> %a) nounwind {
; CHECK-LABEL: sitofp00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retq
%b = sitofp <8 x i32> %a to <8 x float>
@@ -12,7 +12,7 @@ define <8 x float> @sitofp00(<8 x i32> %a) nounwind {
define <8 x i32> @fptosi00(<8 x float> %a) nounwind {
; CHECK-LABEL: fptosi00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: retq
%b = fptosi <8 x float> %a to <8 x i32>
@@ -21,7 +21,7 @@ define <8 x i32> @fptosi00(<8 x float> %a) nounwind {
define <4 x double> @sitofp01(<4 x i32> %a) {
; CHECK-LABEL: sitofp01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT: retq
%b = sitofp <4 x i32> %a to <4 x double>
@@ -30,7 +30,7 @@ define <4 x double> @sitofp01(<4 x i32> %a) {
define <8 x float> @sitofp02(<8 x i16> %a) {
; CHECK-LABEL: sitofp02:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
@@ -43,7 +43,7 @@ define <8 x float> @sitofp02(<8 x i16> %a) {
define <4 x i32> @fptosi01(<4 x double> %a) {
; CHECK-LABEL: fptosi01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -53,7 +53,7 @@ define <4 x i32> @fptosi01(<4 x double> %a) {
define <8 x float> @fptrunc00(<8 x double> %b) nounwind {
; CHECK-LABEL: fptrunc00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0
; CHECK-NEXT: vcvtpd2ps %ymm1, %xmm1
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -64,7 +64,7 @@ define <8 x float> @fptrunc00(<8 x double> %b) nounwind {
define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: fptrunc01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%ext = extractelement <2 x double> %a0, i32 0
@@ -75,7 +75,7 @@ define <4 x float> @fptrunc01(<2 x double> %a0, <4 x float> %a1) nounwind {
define <4 x double> @fpext00(<4 x float> %b) nounwind {
; CHECK-LABEL: fpext00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
@@ -84,7 +84,7 @@ define <4 x double> @fpext00(<4 x float> %b) nounwind {
define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: fpext01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
@@ -95,7 +95,7 @@ define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcA:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i64, i64* %e, align 8
@@ -105,7 +105,7 @@ define double @funcA(i64* nocapture %e) nounwind uwtable readonly ssp {
define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcB:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i32, i32* %e, align 4
@@ -115,7 +115,7 @@ define double @funcB(i32* nocapture %e) nounwind uwtable readonly ssp {
define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcC:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i32, i32* %e, align 4
@@ -125,7 +125,7 @@ define float @funcC(i32* nocapture %e) nounwind uwtable readonly ssp {
define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcD:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i64, i64* %e, align 8
@@ -135,7 +135,7 @@ define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp {
define void @fpext() nounwind uwtable {
; CHECK-LABEL: fpext:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
@@ -150,7 +150,7 @@ define void @fpext() nounwind uwtable {
define double @nearbyint_f64(double %a) {
; CHECK-LABEL: nearbyint_f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call double @llvm.nearbyint.f64(double %a)
@@ -160,7 +160,7 @@ declare double @llvm.nearbyint.f64(double %p)
define float @floor_f32(float %a) {
; CHECK-LABEL: floor_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call float @llvm.floor.f32(float %a)
diff --git a/test/CodeGen/X86/avx-gfni-intrinsics.ll b/test/CodeGen/X86/avx-gfni-intrinsics.ll
new file mode 100644
index 000000000000..a59cfcccad24
--- /dev/null
+++ b/test/CodeGen/X86/avx-gfni-intrinsics.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+gfni,+avx -show-mc-encoding | FileCheck %s
+
+declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
+define <16 x i8> @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: test_vgf2p8affineinvqb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vgf2p8affineinvqb $11, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x0b]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 11)
+ ret <16 x i8> %1
+}
+
+declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8)
+define <32 x i8> @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2) {
+; CHECK-LABEL: test_vgf2p8affineinvqb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vgf2p8affineinvqb $11, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x0b]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 11)
+ ret <32 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+define <16 x i8> @test_vgf2p8affineqb(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: test_vgf2p8affineqb:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $11, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x0b]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 11)
+ ret <16 x i8> %1
+}
+
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+define <32 x i8> @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2) {
+; CHECK-LABEL: test_vgf2p8affineqb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vgf2p8affineqb $11, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x0b]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 11)
+ ret <32 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
+define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: test_vgf2p8mulb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xcf,0xc1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2)
+ ret <16 x i8> %1
+}
+
+declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>)
+define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2) {
+; CHECK-LABEL: test_vgf2p8mulb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0xcf,0xc1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2)
+ ret <32 x i8> %1
+}
+
diff --git a/test/CodeGen/X86/avx-insertelt.ll b/test/CodeGen/X86/avx-insertelt.ll
index c159d689451b..284a6d71e2d3 100644
--- a/test/CodeGen/X86/avx-insertelt.ll
+++ b/test/CodeGen/X86/avx-insertelt.ll
@@ -19,13 +19,13 @@ define <4 x double> @insert_f64(<4 x double> %y, double %f, <4 x double> %x) {
define <32 x i8> @insert_i8(<32 x i8> %y, i8 %f, <32 x i8> %x) {
; AVX-LABEL: insert_i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpinsrb $0, %edi, %xmm0, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: retq
;
; AVX2-LABEL: insert_i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpinsrb $0, %edi, %xmm0, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
@@ -35,13 +35,13 @@ define <32 x i8> @insert_i8(<32 x i8> %y, i8 %f, <32 x i8> %x) {
define <16 x i16> @insert_i16(<16 x i16> %y, i16 %f, <16 x i16> %x) {
; AVX-LABEL: insert_i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: retq
;
; AVX2-LABEL: insert_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpinsrw $0, %edi, %xmm0, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
@@ -51,13 +51,13 @@ define <16 x i16> @insert_i16(<16 x i16> %y, i16 %f, <16 x i16> %x) {
define <8 x i32> @insert_i32(<8 x i32> %y, i32 %f, <8 x i32> %x) {
; AVX-LABEL: insert_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: retq
;
; AVX2-LABEL: insert_i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT: retq
@@ -67,13 +67,13 @@ define <8 x i32> @insert_i32(<8 x i32> %y, i32 %f, <8 x i32> %x) {
define <4 x i64> @insert_i64(<4 x i64> %y, i64 %f, <4 x i64> %x) {
; AVX-LABEL: insert_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: retq
;
; AVX2-LABEL: insert_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index ff5a2371a145..297922809ea7 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_add_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_add_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fadd <4 x double> %a0, %a1
@@ -20,12 +20,12 @@ define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounw
define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_add_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_add_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fadd <8 x float> %a0, %a1
@@ -34,12 +34,12 @@ define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_addsub_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_addsub_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
@@ -49,12 +49,12 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou
define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_addsub_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_addsub_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -64,12 +64,12 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_and_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_and_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <4 x double> %a0 to <4 x i64>
@@ -81,12 +81,12 @@ define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounw
define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_and_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_and_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <8 x float> %a0 to <8 x i32>
@@ -98,16 +98,16 @@ define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_andnot_pd:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0
; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_andnot_pd:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0
; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
@@ -122,12 +122,12 @@ define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) no
define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_andnot_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vandnps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_andnot_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vandnps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <8 x float> %a0 to <8 x i32>
@@ -140,12 +140,12 @@ define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounw
define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_blend_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_blend_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
@@ -154,12 +154,12 @@ define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nou
define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_blend_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_blend_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
@@ -168,12 +168,12 @@ define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwi
define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
; X32-LABEL: test_mm256_blendv_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_blendv_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
@@ -183,12 +183,12 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
; X32-LABEL: test_mm256_blendv_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_blendv_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
@@ -198,13 +198,13 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcast_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%ld = load <2 x double>, <2 x double>* %a0
@@ -214,13 +214,13 @@ define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcast_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%ld = load <4 x float>, <4 x float>* %a0
@@ -230,13 +230,13 @@ define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcast_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%ld = load double, double* %a0
@@ -249,13 +249,13 @@ define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_broadcast_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcast_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
%ld = load float, float* %a0
@@ -268,13 +268,13 @@ define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
; X32-LABEL: test_mm256_broadcast_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcast_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%ld = load float, float* %a0
@@ -291,11 +291,11 @@ define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castpd_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <4 x double> %a0 to <8 x float>
ret <8 x float> %res
@@ -303,11 +303,11 @@ define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castpd_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <4 x double> %a0 to <4 x i64>
ret <4 x i64> %res
@@ -315,13 +315,13 @@ define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd128_pd256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castpd128_pd256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
ret <4 x double> %res
@@ -329,14 +329,14 @@ define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_castpd256_pd128:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castpd256_pd128:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
@@ -345,11 +345,11 @@ define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castps_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <8 x float> %a0 to <4 x double>
ret <4 x double> %res
@@ -357,11 +357,11 @@ define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castps_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <8 x float> %a0 to <4 x i64>
ret <4 x i64> %res
@@ -369,13 +369,13 @@ define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps128_ps256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castps128_ps256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x float> %res
@@ -383,14 +383,14 @@ define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_castps256_ps128:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castps256_ps128:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -399,13 +399,13 @@ define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi128_si256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castsi128_si256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
ret <4 x i64> %res
@@ -413,11 +413,11 @@ define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi256_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castsi256_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <4 x i64> %a0 to <4 x double>
ret <4 x double> %res
@@ -425,11 +425,11 @@ define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi256_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castsi256_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <4 x i64> %a0 to <8 x float>
ret <8 x float> %res
@@ -437,14 +437,14 @@ define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_castsi256_si128:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_castsi256_si128:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
@@ -453,12 +453,12 @@ define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_ceil_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vroundpd $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_ceil_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vroundpd $2, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
@@ -468,12 +468,12 @@ declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind read
define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_ceil_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vroundps $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_ceil_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vroundps $2, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
@@ -483,12 +483,12 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmp_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmp_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcmpgepd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
@@ -498,12 +498,12 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_cmp_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cmp_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcmpgepd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
@@ -513,12 +513,12 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no
define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmp_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmp_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcmpgeps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
@@ -528,12 +528,12 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_cmp_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cmp_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcmpgeps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
@@ -543,12 +543,12 @@ declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounw
define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmp_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmp_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcmpgesd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
@@ -558,12 +558,12 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmp_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmp_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcmpgess %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
@@ -573,12 +573,12 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_cvtepi32_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtdq2pd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtepi32_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtdq2pd %xmm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -588,12 +588,12 @@ define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_cvtepi32_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtdq2ps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtepi32_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
@@ -604,13 +604,13 @@ declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_cvtpd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtpd2dq %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtpd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtpd2dq %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -622,13 +622,13 @@ declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_cvtpd_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtpd2ps %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtpd_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtpd2ps %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -639,12 +639,12 @@ declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvtps_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtps2dq %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtps_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtps2dq %ymm0, %ymm0
; X64-NEXT: retq
%cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
@@ -655,12 +655,12 @@ declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvtps_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtps2pd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtps_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtps2pd %xmm0, %ymm0
; X64-NEXT: retq
%res = fpext <4 x float> %a0 to <4 x double>
@@ -669,13 +669,13 @@ define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_cvttpd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvttpd2dq %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvttpd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvttpd2dq %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -687,12 +687,12 @@ declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvttps_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvttps2dq %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvttps_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvttps2dq %ymm0, %ymm0
; X64-NEXT: retq
%cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
@@ -703,12 +703,12 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_div_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_div_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vdivpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fdiv <4 x double> %a0, %a1
@@ -717,12 +717,12 @@ define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounw
define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_div_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vdivps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_div_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vdivps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fdiv <8 x float> %a0, %a1
@@ -731,12 +731,12 @@ define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_dp_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_dp_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
@@ -746,7 +746,7 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi
define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-NEXT: vpextrb $15, %xmm0, %eax
; X32-NEXT: movzbl %al, %eax
@@ -754,7 +754,7 @@ define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extract_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vpextrb $15, %xmm0, %eax
; X64-NEXT: movzbl %al, %eax
@@ -768,7 +768,7 @@ define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-NEXT: vpextrw $3, %xmm0, %eax
; X32-NEXT: movzwl %ax, %eax
@@ -776,7 +776,7 @@ define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extract_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vpextrw $3, %xmm0, %eax
; X64-NEXT: movzwl %ax, %eax
@@ -790,16 +790,16 @@ define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT: vpextrd $1, %xmm0, %eax
+; X32-NEXT: vextractps $1, %xmm0, %eax
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extract_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT: vpextrd $1, %xmm0, %eax
+; X64-NEXT: vextractps $1, %xmm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
@@ -809,15 +809,15 @@ define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extract_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT: vpextrd $2, %xmm0, %eax
-; X32-NEXT: vpextrd $3, %xmm0, %edx
+; X32-NEXT: vextractps $2, %xmm0, %eax
+; X32-NEXT: vextractps $3, %xmm0, %edx
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extract_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vpextrq $1, %xmm0, %rax
; X64-NEXT: vzeroupper
@@ -828,13 +828,13 @@ define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_extractf128_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extractf128_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -844,13 +844,13 @@ define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_extractf128_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extractf128_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -860,13 +860,13 @@ define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_extractf128_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_extractf128_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -876,12 +876,12 @@ define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_floor_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vroundpd $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_floor_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vroundpd $1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
@@ -890,12 +890,12 @@ define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_floor_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vroundps $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_floor_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vroundps $1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
@@ -904,12 +904,12 @@ define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_hadd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_hadd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
@@ -919,12 +919,12 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_hadd_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_hadd_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -934,12 +934,12 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_hsub_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_hsub_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
@@ -949,12 +949,12 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_hsub_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_hsub_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -964,14 +964,14 @@ declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insert_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: vpinsrb $4, %eax, %xmm0, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
@@ -984,15 +984,15 @@ define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insert_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vpinsrw $6, %edi, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1005,13 +1005,13 @@ define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insert_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpinsrd $3, %edi, %xmm0, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT: retq
@@ -1023,7 +1023,7 @@ define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X32-LABEL: test_mm256_insert_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
@@ -1031,7 +1031,7 @@ define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insert_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1042,14 +1042,14 @@ define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm256_insertf128_pd:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insertf128_pd:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X64-NEXT: retq
%ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1059,12 +1059,12 @@ define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1
define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm256_insertf128_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insertf128_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
%ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1074,14 +1074,14 @@ define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) n
define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_insertf128_si256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_insertf128_si256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; X64-NEXT: retq
%ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -1091,13 +1091,13 @@ define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) noun
define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
; X32-LABEL: test_mm256_lddqu_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vlddqu (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_lddqu_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vlddqu (%rdi), %ymm0
; X64-NEXT: retq
%arg0 = bitcast <4 x i64>* %a0 to i8*
@@ -1109,13 +1109,13 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readnone
define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
; X32-LABEL: test_mm256_load_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_load_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <4 x double>*
@@ -1125,13 +1125,13 @@ define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm256_load_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_load_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <8 x float>*
@@ -1141,13 +1141,13 @@ define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
; X32-LABEL: test_mm256_load_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_load_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: retq
%res = load <4 x i64>, <4 x i64>* %a0, align 32
@@ -1156,13 +1156,13 @@ define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
; X32-LABEL: test_mm256_loadu_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovups (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_loadu_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <4 x double>*
@@ -1172,13 +1172,13 @@ define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
; X32-LABEL: test_mm256_loadu_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovups (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_loadu_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <8 x float>*
@@ -1188,13 +1188,13 @@ define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
; X32-LABEL: test_mm256_loadu_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovups (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_loadu_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: retq
%res = load <4 x i64>, <4 x i64>* %a0, align 1
@@ -1203,7 +1203,7 @@ define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
; X32-LABEL: test_mm256_loadu2_m128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups (%eax), %xmm0
@@ -1211,7 +1211,7 @@ define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_loadu2_m128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups (%rsi), %xmm0
; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
@@ -1227,7 +1227,7 @@ define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
; X32-LABEL: test_mm256_loadu2_m128d:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups (%eax), %xmm0
@@ -1235,7 +1235,7 @@ define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_loadu2_m128d:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups (%rsi), %xmm0
; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
@@ -1251,7 +1251,7 @@ define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind
define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
; X32-LABEL: test_mm256_loadu2_m128i:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups (%eax), %xmm0
@@ -1259,7 +1259,7 @@ define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_loadu2_m128i:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups (%rsi), %xmm0
; X64-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
@@ -1275,13 +1275,13 @@ define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskload_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to i8*
@@ -1292,13 +1292,13 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readnone
define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to i8*
@@ -1309,13 +1309,13 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind read
define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_maskload_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovps (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskload_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to i8*
@@ -1327,13 +1327,13 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readnone
define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_maskload_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovps (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskload_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to i8*
@@ -1345,13 +1345,13 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readn
define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to i8*
@@ -1362,14 +1362,14 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind r
define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -1381,13 +1381,13 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
; X32-LABEL: test_mm_maskstore_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovps %xmm1, %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskstore_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to i8*
@@ -1399,14 +1399,14 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind re
define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
; X32-LABEL: test_mm256_maskstore_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmaskmovps %ymm1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskstore_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -1419,12 +1419,12 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwin
define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_max_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
@@ -1434,12 +1434,12 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_max_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_max_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -1449,12 +1449,12 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_min_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
@@ -1464,12 +1464,12 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_min_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vminps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_min_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vminps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -1479,12 +1479,12 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_movedup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movedup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -1493,12 +1493,12 @@ define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -1507,12 +1507,12 @@ define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -1521,13 +1521,13 @@ define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_movemask_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovmskpd %ymm0, %eax
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movemask_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovmskpd %ymm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -1538,13 +1538,13 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_movemask_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovmskps %ymm0, %eax
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movemask_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovmskps %ymm0, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -1555,12 +1555,12 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_mul_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mul_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fmul <4 x double> %a0, %a1
@@ -1569,12 +1569,12 @@ define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounw
define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_mul_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mul_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fmul <8 x float> %a0, %a1
@@ -1583,12 +1583,12 @@ define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_or_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_or_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <4 x double> %a0 to <4 x i64>
@@ -1600,12 +1600,12 @@ define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwi
define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_or_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_or_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <8 x float> %a0 to <8 x i32>
@@ -1617,12 +1617,12 @@ define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_permute_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_permute_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
@@ -1631,12 +1631,12 @@ define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_permute_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -1645,12 +1645,12 @@ define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_permute_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_permute_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -1659,12 +1659,12 @@ define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
; X32-LABEL: test2_mm_permute_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test2_mm_permute_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
@@ -1673,12 +1673,12 @@ define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_permute_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -1687,15 +1687,15 @@ define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_permute2f128_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute2f128_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
; X64-NEXT: retq
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 44)
+ %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
@@ -1703,32 +1703,32 @@ declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>,
; PR26667
define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_permute2f128_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute2f128_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm1, %ymm0
; X64-NEXT: retq
- %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 50)
+ %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_permute2f128_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute2f128_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
; X64-NEXT: retq
%1 = bitcast <4 x i64> %a0 to <8 x i32>
%2 = bitcast <4 x i64> %a1 to <8 x i32>
- %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %1, <8 x i32> %2, i8 35)
+ %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
%bc = bitcast <8 x i32> %res to <4 x i64>
ret <4 x i64> %bc
}
@@ -1736,12 +1736,12 @@ declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) noun
define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_permutevar_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_permutevar_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
@@ -1751,12 +1751,12 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_permutevar_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permutevar_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
@@ -1766,12 +1766,12 @@ declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) no
define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_permutevar_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_permutevar_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
@@ -1782,12 +1782,12 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_permutevar_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permutevar_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
@@ -1798,12 +1798,12 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_rcp_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vrcpps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_rcp_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vrcpps %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
@@ -1813,12 +1813,12 @@ declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_round_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vroundpd $4, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_round_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vroundpd $4, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
@@ -1827,12 +1827,12 @@ define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_round_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vroundps $4, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_round_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vroundps $4, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
@@ -1841,12 +1841,12 @@ define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_rsqrt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vrsqrtps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_rsqrt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vrsqrtps %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
@@ -1856,7 +1856,7 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X32-LABEL: test_mm256_set_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovd %ecx, %xmm0
@@ -1925,7 +1925,7 @@ define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
@@ -2030,47 +2030,47 @@ define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8
define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X32-LABEL: test_mm256_set_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi16:
-; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64: # %bb.0:
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT: vpinsrw $2, %r9d, %xmm0, %xmm0
; X64-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
@@ -2078,21 +2078,21 @@ define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %
; X64-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0
; X64-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
; X64-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
@@ -2118,7 +2118,7 @@ define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %
define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X32-LABEL: test_mm256_set_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -2131,7 +2131,7 @@ define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovd %ecx, %xmm0
; X64-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
; X64-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
@@ -2156,7 +2156,7 @@ define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %
define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X32-LABEL: test_mm256_set_epi64x:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -2169,7 +2169,7 @@ define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) noun
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_epi64x:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vmovq %rsi, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -2187,14 +2187,14 @@ define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) noun
define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm256_set_m128:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_m128:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2203,14 +2203,14 @@ define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwi
define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm256_set_m128d:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_m128d:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x double> %a0 to <4 x float>
@@ -2222,14 +2222,14 @@ define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) no
define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_set_m128i:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_m128i:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x float>
@@ -2241,20 +2241,20 @@ define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X32-LABEL: test_mm256_set_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_pd:
-; X64: # BB#0:
-; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X64: # %bb.0:
+; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x double> undef, double %a3, i32 0
@@ -2266,7 +2266,7 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl
define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X32-LABEL: test_mm256_set_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -2285,7 +2285,7 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
@@ -2307,7 +2307,7 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3
define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X32-LABEL: test_mm256_set1_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -2316,7 +2316,7 @@ define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -2361,8 +2361,8 @@ define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X32-LABEL: test_mm256_set1_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -2370,7 +2370,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -2398,14 +2398,14 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X32-LABEL: test_mm256_set1_epi32:
-; X32: # BB#0:
-; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32: # %bb.0:
+; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -2424,7 +2424,7 @@ define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X32-LABEL: test_mm256_set1_epi64x:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovd %ecx, %xmm0
@@ -2435,7 +2435,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_epi64x:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -2449,14 +2449,14 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
; X32-LABEL: test_mm256_set1_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
@@ -2469,14 +2469,14 @@ define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
; X32-LABEL: test_mm256_set1_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_set1_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
@@ -2493,7 +2493,7 @@ define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X32-LABEL: test_mm256_setr_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovd %ecx, %xmm0
@@ -2562,7 +2562,7 @@ define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
@@ -2667,59 +2667,59 @@ define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i
define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X32-LABEL: test_mm256_setr_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $5, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi16:
-; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64: # %bb.0:
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vmovd %eax, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT: vmovd %edi, %xmm1
; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
@@ -2727,9 +2727,9 @@ define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; X64-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT: vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT: vpinsrw $5, %r9d, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
@@ -2755,7 +2755,7 @@ define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X32-LABEL: test_mm256_setr_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -2768,7 +2768,7 @@ define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovd %r8d, %xmm0
; X64-NEXT: vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT: vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
@@ -2793,7 +2793,7 @@ define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32
define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X32-LABEL: test_mm256_setr_epi64x:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -2806,7 +2806,7 @@ define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nou
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_epi64x:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovq %rcx, %xmm0
; X64-NEXT: vmovq %rdx, %xmm1
; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -2824,14 +2824,14 @@ define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nou
define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm256_setr_m128:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_m128:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2840,14 +2840,14 @@ define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounw
define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm256_setr_m128d:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_m128d:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x double> %a0 to <4 x float>
@@ -2859,14 +2859,14 @@ define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) n
define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_setr_m128i:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_m128i:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x float>
@@ -2878,20 +2878,20 @@ define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X32-LABEL: test_mm256_setr_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0]
+; X32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_pd:
-; X64: # BB#0:
-; X64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x double> undef, double %a0, i32 0
@@ -2903,7 +2903,7 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub
define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X32-LABEL: test_mm256_setr_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -2922,7 +2922,7 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setr_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
@@ -2944,51 +2944,51 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a
define <4 x double> @test_mm256_setzero_pd() nounwind {
; X32-LABEL: test_mm256_setzero_pd:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setzero_pd:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
ret <4 x double> zeroinitializer
}
define <8 x float> @test_mm256_setzero_ps() nounwind {
; X32-LABEL: test_mm256_setzero_ps:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setzero_ps:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
ret <8 x float> zeroinitializer
}
define <4 x i64> @test_mm256_setzero_si256() nounwind {
; X32-LABEL: test_mm256_setzero_si256:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_setzero_si256:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
ret <4 x i64> zeroinitializer
}
define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2997,12 +2997,12 @@ define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) n
define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
@@ -3011,12 +3011,12 @@ define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) noun
define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm256_sqrt_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vsqrtpd %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sqrt_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vsqrtpd %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
@@ -3026,12 +3026,12 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_sqrt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vsqrtps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sqrt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vsqrtps %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
@@ -3041,14 +3041,14 @@ declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_store_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_store_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3059,14 +3059,14 @@ define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_store_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_store_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3077,14 +3077,14 @@ define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_store_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_store_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3094,14 +3094,14 @@ define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_storeu_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_storeu_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3112,14 +3112,14 @@ define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_storeu_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_storeu_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3130,14 +3130,14 @@ define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_storeu_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_storeu_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3147,7 +3147,7 @@ define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
; X32-LABEL: test_mm256_storeu2_m128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups %xmm0, (%ecx)
@@ -3157,7 +3157,7 @@ define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) no
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_storeu2_m128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vmovups %xmm0, (%rsi)
@@ -3174,7 +3174,7 @@ define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) no
define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
; X32-LABEL: test_mm256_storeu2_m128d:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups %xmm0, (%ecx)
@@ -3184,7 +3184,7 @@ define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vmovups %xmm0, (%rsi)
@@ -3201,7 +3201,7 @@ define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2
define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
; X32-LABEL: test_mm256_storeu2_m128i:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups %xmm0, (%ecx)
@@ -3211,7 +3211,7 @@ define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64>
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
; X64-NEXT: vmovups %xmm0, (%rsi)
@@ -3228,14 +3228,14 @@ define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64>
define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_stream_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovntps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_stream_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3246,14 +3246,14 @@ define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_stream_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovntps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_stream_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3264,14 +3264,14 @@ define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_stream_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovntps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_stream_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -3281,12 +3281,12 @@ define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_sub_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sub_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vsubpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fsub <4 x double> %a0, %a1
@@ -3295,12 +3295,12 @@ define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounw
define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_sub_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vsubps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_sub_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vsubps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = fsub <8 x float> %a0, %a1
@@ -3309,14 +3309,14 @@ define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_testc_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestpd %xmm1, %xmm0
; X32-NEXT: setb %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testc_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestpd %xmm1, %xmm0
; X64-NEXT: setb %al
@@ -3328,7 +3328,7 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_testc_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestpd %ymm1, %ymm0
; X32-NEXT: setb %al
@@ -3336,7 +3336,7 @@ define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testc_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestpd %ymm1, %ymm0
; X64-NEXT: setb %al
@@ -3349,14 +3349,14 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_testc_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestps %xmm1, %xmm0
; X32-NEXT: setb %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testc_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestps %xmm1, %xmm0
; X64-NEXT: setb %al
@@ -3368,7 +3368,7 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_testc_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestps %ymm1, %ymm0
; X32-NEXT: setb %al
@@ -3376,7 +3376,7 @@ define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testc_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestps %ymm1, %ymm0
; X64-NEXT: setb %al
@@ -3389,7 +3389,7 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_testc_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vptest %ymm1, %ymm0
; X32-NEXT: setb %al
@@ -3397,7 +3397,7 @@ define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testc_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vptest %ymm1, %ymm0
; X64-NEXT: setb %al
@@ -3410,14 +3410,14 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_testnzc_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestpd %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testnzc_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestpd %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -3429,7 +3429,7 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_testnzc_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestpd %ymm1, %ymm0
; X32-NEXT: seta %al
@@ -3437,7 +3437,7 @@ define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testnzc_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestpd %ymm1, %ymm0
; X64-NEXT: seta %al
@@ -3450,14 +3450,14 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r
define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_testnzc_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestps %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testnzc_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestps %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -3469,7 +3469,7 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_testnzc_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestps %ymm1, %ymm0
; X32-NEXT: seta %al
@@ -3477,7 +3477,7 @@ define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testnzc_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestps %ymm1, %ymm0
; X64-NEXT: seta %al
@@ -3490,7 +3490,7 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea
define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_testnzc_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vptest %ymm1, %ymm0
; X32-NEXT: seta %al
@@ -3498,7 +3498,7 @@ define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testnzc_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vptest %ymm1, %ymm0
; X64-NEXT: seta %al
@@ -3511,14 +3511,14 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_testz_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestpd %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testz_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestpd %xmm1, %xmm0
; X64-NEXT: sete %al
@@ -3530,7 +3530,7 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_testz_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestpd %ymm1, %ymm0
; X32-NEXT: sete %al
@@ -3538,7 +3538,7 @@ define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testz_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestpd %ymm1, %ymm0
; X64-NEXT: sete %al
@@ -3551,14 +3551,14 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_testz_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestps %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testz_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestps %xmm1, %xmm0
; X64-NEXT: sete %al
@@ -3570,7 +3570,7 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_testz_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vtestps %ymm1, %ymm0
; X32-NEXT: sete %al
@@ -3578,7 +3578,7 @@ define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testz_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vtestps %ymm1, %ymm0
; X64-NEXT: sete %al
@@ -3591,7 +3591,7 @@ declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readn
define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_testz_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: vptest %ymm1, %ymm0
; X32-NEXT: sete %al
@@ -3599,7 +3599,7 @@ define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_testz_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: vptest %ymm1, %ymm0
; X64-NEXT: sete %al
@@ -3612,56 +3612,56 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <2 x double> @test_mm_undefined_pd() nounwind {
; X32-LABEL: test_mm_undefined_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <2 x double> undef
}
define <4 x double> @test_mm256_undefined_pd() nounwind {
; X32-LABEL: test_mm256_undefined_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_undefined_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <4 x double> undef
}
define <8 x float> @test_mm256_undefined_ps() nounwind {
; X32-LABEL: test_mm256_undefined_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_undefined_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <8 x float> undef
}
define <4 x i64> @test_mm256_undefined_si256() nounwind {
; X32-LABEL: test_mm256_undefined_si256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_undefined_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <4 x i64> undef
}
define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_unpackhi_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -3670,12 +3670,12 @@ define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1)
define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_unpackhi_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_unpackhi_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -3684,12 +3684,12 @@ define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nou
define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_unpacklo_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -3698,12 +3698,12 @@ define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1)
define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_unpacklo_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_unpacklo_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -3712,12 +3712,12 @@ define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nou
define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; X32-LABEL: test_mm256_xor_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_xor_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <4 x double> %a0 to <4 x i64>
@@ -3729,12 +3729,12 @@ define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounw
define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; X32-LABEL: test_mm256_xor_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_xor_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%1 = bitcast <8 x float> %a0 to <8 x i32>
@@ -3746,12 +3746,12 @@ define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind
define void @test_mm256_zeroall() nounwind {
; X32-LABEL: test_mm256_zeroall:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vzeroall
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_zeroall:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vzeroall
; X64-NEXT: retq
call void @llvm.x86.avx.vzeroall()
@@ -3761,12 +3761,12 @@ declare void @llvm.x86.avx.vzeroall() nounwind readnone
define void @test_mm256_zeroupper() nounwind {
; X32-LABEL: test_mm256_zeroupper:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_zeroupper:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vzeroupper
; X64-NEXT: retq
call void @llvm.x86.avx.vzeroupper()
@@ -3776,17 +3776,13 @@ declare void @llvm.x86.avx.vzeroupper() nounwind readnone
define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm256_zextpd128_pd256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_zextpd128_pd256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %res
@@ -3794,17 +3790,13 @@ define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm256_zextps128_ps256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_zextps128_ps256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
@@ -3812,17 +3804,13 @@ define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_zextsi128_si256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_zextsi128_si256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i64> %res
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 2bcb083f6d4c..70358cdaf9e2 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -1,13 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
ret <4 x double> %res
}
@@ -15,9 +16,9 @@ declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>
define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
ret <8 x float> %res
}
@@ -25,9 +26,9 @@ declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i
define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
ret <8 x i32> %res
}
@@ -37,10 +38,10 @@ define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1
; not a vinsertf128 $1.
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
ret <8 x i32> %res
}
@@ -50,10 +51,10 @@ declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nou
define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
ret <2 x double> %res
}
@@ -61,10 +62,10 @@ declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwin
define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
ret <4 x float> %res
}
@@ -72,10 +73,10 @@ declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind
define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
ret <4 x i32> %res
}
@@ -86,21 +87,26 @@ declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind read
; not a vextractf128 of any kind.
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
ret <2 x double> %res
}
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_vbroadcastf128_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx_vbroadcastf128_pd_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx_vbroadcastf128_pd_256:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -108,11 +114,16 @@ declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly
define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_vbroadcastf128_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx_vbroadcastf128_ps_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx_vbroadcastf128_ps_256:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: ret{{[l|q]}}
%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -121,9 +132,9 @@ declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -132,9 +143,9 @@ declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_blend_ps_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -143,9 +154,9 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) no
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_dp_ps_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -154,9 +165,9 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounw
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -165,9 +176,9 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -176,9 +187,9 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -187,9 +198,9 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blendps:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -198,9 +209,9 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblendw:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -209,9 +220,9 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind rea
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -220,9 +231,9 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -231,9 +242,9 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -242,9 +253,9 @@ declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -253,9 +264,9 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -264,9 +275,9 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -275,9 +286,9 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -286,9 +297,9 @@ declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -297,9 +308,9 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbw:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -308,9 +319,9 @@ declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxdq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -319,9 +330,9 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -330,9 +341,9 @@ declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwq:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -341,9 +352,9 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -352,9 +363,9 @@ declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -363,9 +374,9 @@ declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtps2pd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -374,9 +385,9 @@ declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -385,13 +396,20 @@ declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_sse2_storeu_dq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovdqu %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_sse2_storeu_dq:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmovdqu %xmm0, (%eax)
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_sse2_storeu_dq:
+; X64: # %bb.0:
+; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmovdqu %xmm0, (%rdi)
+; X64-NEXT: ret{{[l|q]}}
%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
ret void
@@ -401,14 +419,22 @@ declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
; fadd operation forces the execution domain.
-; CHECK-LABEL: test_x86_sse2_storeu_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vmovupd %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_sse2_storeu_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vmovupd %xmm0, (%eax)
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_sse2_storeu_pd:
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X64-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmovupd %xmm0, (%rdi)
+; X64-NEXT: ret{{[l|q]}}
%a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
ret void
@@ -417,11 +443,16 @@ declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_storeu_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovups %xmm0, (%eax)
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_sse_storeu_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups %xmm0, (%eax)
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_sse_storeu_ps:
+; X64: # %bb.0:
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: ret{{[l|q]}}
call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
ret void
}
@@ -431,17 +462,28 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
; FIXME: unfortunately the execution domain fix pass changes this to vmovups and it's hard to force with no 256-bit integer instructions
; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpsubb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovups %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx_storeu_dq_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; X86-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vmovups %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx_storeu_dq_256:
+; X64: # %bb.0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: ret{{[l|q]}}
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
ret void
@@ -451,14 +493,22 @@ declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovupd %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx_storeu_pd_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X86-NEXT: vmovupd %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx_storeu_pd_256:
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; X64-NEXT: vmovupd %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: ret{{[l|q]}}
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
ret void
@@ -467,12 +517,18 @@ declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind
define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_storeu_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovups %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx_storeu_ps_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovups %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx_storeu_ps_256:
+; X64: # %bb.0:
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: ret{{[l|q]}}
call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
ret void
}
@@ -481,9 +537,9 @@ declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_pd:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -492,9 +548,9 @@ declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnon
define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -503,9 +559,9 @@ declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind rea
define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_ps:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -514,10 +570,43 @@ declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone
define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
-; CHECK: ## BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
+
+
+define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 3) ; <<4 x double>> [#uses=1]
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+
+define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 3) ; <<8 x float>> [#uses=1]
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
+; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 3) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index c1fa2b1d6b76..748dd6804dd8 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,12 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx,pclmul -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx,pclmul -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=X86 --check-prefix=X86-AVX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx,pclmul -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=X64 --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_addsub_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd0,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xd0,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -15,9 +17,9 @@ declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nou
define <8 x float> @test_x86_avx_addsub_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_addsub_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0xd0,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xff,0xd0,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -26,9 +28,9 @@ declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwi
define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; CHECK-LABEL: test_x86_avx_blendv_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4b,0xc1,0x20]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x4b,0xc1,0x20]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -37,9 +39,9 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; CHECK-LABEL: test_x86_avx_blendv_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4a,0xc1,0x20]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x4a,0xc1,0x20]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -48,9 +50,9 @@ declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x double> @test_x86_avx_cmp_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_cmp_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xc2,0xc1,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcmpordpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xc2,0xc1,0x07]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -59,49 +61,49 @@ declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) no
define <8 x float> @test_x86_avx_cmp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_cmp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0xc2,0xc1,0x07]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
define <8 x float> @test_x86_avx_cmp_ps_256_pseudo_op(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_cmp_ps_256_pseudo_op:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x00]
-; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x01]
-; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x02]
-; CHECK-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x03]
-; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x04]
-; CHECK-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x05]
-; CHECK-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x06]
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x08]
-; CHECK-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x09]
-; CHECK-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0a]
-; CHECK-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0b]
-; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0c]
-; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0d]
-; CHECK-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0e]
-; CHECK-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x0f]
-; CHECK-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x10]
-; CHECK-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x11]
-; CHECK-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x12]
-; CHECK-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x13]
-; CHECK-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x14]
-; CHECK-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x15]
-; CHECK-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x16]
-; CHECK-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x17]
-; CHECK-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x18]
-; CHECK-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x19]
-; CHECK-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1a]
-; CHECK-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1b]
-; CHECK-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1c]
-; CHECK-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1d]
-; CHECK-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1 ## encoding: [0xc5,0xfc,0xc2,0xc9,0x1e]
-; CHECK-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0xc2,0xc1,0x1f]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x00]
+; CHECK-NEXT: vcmpltps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x01]
+; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x02]
+; CHECK-NEXT: vcmpunordps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x03]
+; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x04]
+; CHECK-NEXT: vcmpnltps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x05]
+; CHECK-NEXT: vcmpnleps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x06]
+; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x07]
+; CHECK-NEXT: vcmpeq_uqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x08]
+; CHECK-NEXT: vcmpngeps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x09]
+; CHECK-NEXT: vcmpngtps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x0a]
+; CHECK-NEXT: vcmpfalseps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x0b]
+; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x0c]
+; CHECK-NEXT: vcmpgeps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x0d]
+; CHECK-NEXT: vcmpgtps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x0e]
+; CHECK-NEXT: vcmptrueps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x0f]
+; CHECK-NEXT: vcmpeq_osps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x10]
+; CHECK-NEXT: vcmplt_oqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x11]
+; CHECK-NEXT: vcmple_oqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x12]
+; CHECK-NEXT: vcmpunord_sps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x13]
+; CHECK-NEXT: vcmpneq_usps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x14]
+; CHECK-NEXT: vcmpnlt_uqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x15]
+; CHECK-NEXT: vcmpnle_uqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x16]
+; CHECK-NEXT: vcmpord_sps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x17]
+; CHECK-NEXT: vcmpeq_usps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x18]
+; CHECK-NEXT: vcmpnge_uqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x19]
+; CHECK-NEXT: vcmpngt_uqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x1a]
+; CHECK-NEXT: vcmpfalse_osps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x1b]
+; CHECK-NEXT: vcmpneq_osps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x1c]
+; CHECK-NEXT: vcmpge_oqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x1d]
+; CHECK-NEXT: vcmpgt_oqps %ymm1, %ymm0, %ymm1 # encoding: [0xc5,0xfc,0xc2,0xc9,0x1e]
+; CHECK-NEXT: vcmptrue_usps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0xc2,0xc1,0x1f]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%a2 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 0) ; <<8 x float>> [#uses=1]
%a3 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a2, i8 1) ; <<8 x float>> [#uses=1]
%a4 = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a3, i8 2) ; <<8 x float>> [#uses=1]
@@ -141,16 +143,16 @@ declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounw
define <4 x float> @test_x86_avx_cvt_pd2_ps_256(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_cvt_pd2_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0 ## encoding: [0xc5,0xfd,0x5a,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0 # encoding: [0xc5,0xfd,0x5a,0xc0]
+; AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0]
-; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtpd2ps %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0]
+; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -159,16 +161,16 @@ declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
define <4 x i32> @test_x86_avx_cvt_pd2dq_256(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_cvt_pd2dq_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vcvtpd2dq %ymm0, %xmm0 ## encoding: [0xc5,0xff,0xe6,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtpd2dq %ymm0, %xmm0 # encoding: [0xc5,0xff,0xe6,0xc0]
+; AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0]
-; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0]
+; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -177,9 +179,9 @@ declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
define <8 x i32> @test_x86_avx_cvt_ps2dq_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvt_ps2dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x5b,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x5b,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -188,14 +190,14 @@ declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
define <8 x float> @test_x86_avx_cvtdq2_ps_256(<8 x i32> %a0) {
; AVX-LABEL: test_x86_avx_cvtdq2_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x5b,0xc0]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x5b,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_cvtdq2_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2ps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -204,16 +206,16 @@ declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) nounwind readnone
define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 ## encoding: [0xc5,0xfd,0xe6,0xc0]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 # encoding: [0xc5,0xfd,0xe6,0xc0]
+; AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
-; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
+; AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -222,14 +224,14 @@ declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; AVX-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 ## encoding: [0xc5,0xfe,0x5b,0xc0]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 # encoding: [0xc5,0xfe,0x5b,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_cvtt_ps2dq_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -238,9 +240,9 @@ declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_dp_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x40,0xc1,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x40,0xc1,0x07]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -249,9 +251,9 @@ declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwi
define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_hadd_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7c,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x7c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -260,9 +262,9 @@ declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_hadd_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7c,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xff,0x7c,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -271,9 +273,9 @@ declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_hsub_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x7d,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x7d,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -282,9 +284,9 @@ declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounw
define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_hsub_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xff,0x7d,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xff,0x7d,0xc1]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -292,11 +294,16 @@ declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind
define <32 x i8> @test_x86_avx_ldu_dq_256(i8* %a0) {
-; CHECK-LABEL: test_x86_avx_ldu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vlddqu (%eax), %ymm0 ## encoding: [0xc5,0xff,0xf0,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_ldu_dq_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vlddqu (%eax), %ymm0 # encoding: [0xc5,0xff,0xf0,0x00]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_ldu_dq_256:
+; X64: # %bb.0:
+; X64-NEXT: vlddqu (%rdi), %ymm0 # encoding: [0xc5,0xff,0xf0,0x07]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -304,11 +311,16 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_x86_avx_maskload_pd(i8* %a0, <2 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2d,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskload_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovpd (%eax), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2d,0x00]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskload_pd:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2d,0x07]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -316,11 +328,16 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
define <4 x double> @test_x86_avx_maskload_pd_256(i8* %a0, <4 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2d,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskload_pd_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovpd (%eax), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2d,0x00]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskload_pd_256:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2d,0x07]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -328,11 +345,16 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind read
define <4 x float> @test_x86_avx_maskload_ps(i8* %a0, <4 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2c,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskload_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovps (%eax), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2c,0x00]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskload_ps:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x2c,0x07]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -340,11 +362,16 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
define <8 x float> @test_x86_avx_maskload_ps_256(i8* %a0, <8 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx_maskload_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2c,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskload_ps_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovps (%eax), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2c,0x00]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskload_ps_256:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovps (%rdi), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x2c,0x07]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -352,11 +379,16 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind reado
define void @test_x86_avx_maskstore_pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2f,0x08]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskstore_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovpd %xmm1, %xmm0, (%eax) # encoding: [0xc4,0xe2,0x79,0x2f,0x08]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskstore_pd:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # encoding: [0xc4,0xe2,0x79,0x2f,0x0f]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %mask, <2 x double> %a2)
ret void
}
@@ -364,12 +396,18 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskstore_pd_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovpd %ymm1, %ymm0, (%eax) # encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
+; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskstore_pd_256:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # encoding: [0xc4,0xe2,0x7d,0x2f,0x0f]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2)
ret void
}
@@ -377,11 +415,16 @@ declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwi
define void @test_x86_avx_maskstore_ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x2e,0x08]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskstore_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovps %xmm1, %xmm0, (%eax) # encoding: [0xc4,0xe2,0x79,0x2e,0x08]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskstore_ps:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # encoding: [0xc4,0xe2,0x79,0x2e,0x0f]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %mask, <4 x float> %a2)
ret void
}
@@ -389,12 +432,18 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) {
-; CHECK-LABEL: test_x86_avx_maskstore_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx_maskstore_ps_256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vmaskmovps %ymm1, %ymm0, (%eax) # encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
+; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx_maskstore_ps_256:
+; X64: # %bb.0:
+; X64-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # encoding: [0xc4,0xe2,0x7d,0x2e,0x0f]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: ret{{[l|q]}} # encoding: [0xc3]
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2)
ret void
}
@@ -403,14 +452,14 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwin
define <4 x double> @test_x86_avx_max_pd_256(<4 x double> %a0, <4 x double> %a1) {
; AVX-LABEL: test_x86_avx_max_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x5f,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x5f,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_max_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5f,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5f,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -419,14 +468,14 @@ declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_x86_avx_max_ps_256(<8 x float> %a0, <8 x float> %a1) {
; AVX-LABEL: test_x86_avx_max_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x5f,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x5f,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_max_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -435,14 +484,14 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind
define <4 x double> @test_x86_avx_min_pd_256(<4 x double> %a0, <4 x double> %a1) {
; AVX-LABEL: test_x86_avx_min_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x5d,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x5d,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_min_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vminpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5d,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5d,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -451,14 +500,14 @@ declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwi
define <8 x float> @test_x86_avx_min_ps_256(<8 x float> %a0, <8 x float> %a1) {
; AVX-LABEL: test_x86_avx_min_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x5d,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x5d,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_min_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vminps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -467,10 +516,10 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_movmsk_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovmskpd %ymm0, %eax # encoding: [0xc5,0xfd,0x50,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -479,29 +528,24 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_movmsk_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovmskps %ymm0, %eax # encoding: [0xc5,0xfc,0x50,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; <i32> [#uses=1]
ret i32 %res
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
-
-
-
-
-
define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_x86_avx_ptestc_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -510,12 +554,12 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_x86_avx_ptestnzc_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -524,12 +568,12 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_x86_avx_ptestz_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vptest %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -537,15 +581,10 @@ declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
define <8 x float> @test_x86_avx_rcp_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rcp_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vrcpps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x53,0xc0]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rcp_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vrcp14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rcp_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrcpps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x53,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -553,10 +592,15 @@ declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_round_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_round_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundpd $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_round_pd_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vroundpd $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x09,0xc0,0x07]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_round_pd_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vrndscalepd $7, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x07]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -564,10 +608,15 @@ declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind read
define <8 x float> @test_x86_avx_round_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_round_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vroundps $7, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_round_ps_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vroundps $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x08,0xc0,0x07]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_round_ps_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vrndscaleps $7, %ymm0, %ymm0 # encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x07]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -575,15 +624,10 @@ declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readno
define <8 x float> @test_x86_avx_rsqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vrsqrtps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x52,0xc0]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_rsqrt_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vrsqrt14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_rsqrt_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x52,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -591,10 +635,15 @@ declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x51,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -602,62 +651,31 @@ declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; CHECK-LABEL: test_x86_avx_sqrt_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0 ## encoding: [0xc5,0xfc,0x51,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]
-; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
- %res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_vperm2f128_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]
-; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx_vperm2f128_si_256(<8 x i32> %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx_vperm2f128_si_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x07]
-; CHECK-NEXT: ## ymm0 = ymm1[2,3],ymm0[0,1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
-
-
define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
; AVX-LABEL: test_x86_avx_vpermilvar_pd:
-; AVX: ## BB#0:
-; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0d,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0d,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -666,14 +684,14 @@ declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwi
define <4 x double> @test_x86_avx_vpermilvar_pd_256(<4 x double> %a0, <4 x i64> %a1) {
; AVX-LABEL: test_x86_avx_vpermilvar_pd_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0d,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0d,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
@@ -681,45 +699,55 @@ declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) no
define <4 x double> @test_x86_avx_vpermilvar_pd_256_2(<4 x double> %a0) {
; AVX-LABEL: test_x86_avx_vpermilvar_pd_256_2:
-; AVX: ## BB#0:
-; AVX-NEXT: vpermilpd $9, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]
-; AVX-NEXT: ## ymm0 = ymm0[1,0,2,3]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilpd $9, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]
+; AVX-NEXT: # ymm0 = ymm0[1,0,2,3]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_pd_256_2:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]
-; AVX512VL-NEXT: ## ymm0 = ymm0[1,0,2,3]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermilpd $9, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x09]
+; AVX512VL-NEXT: # ymm0 = ymm0[1,0,2,3]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 0, i64 2>) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
define <4 x float> @test_x86_avx_vpermilvar_ps(<4 x float> %a0, <4 x i32> %a1) {
; AVX-LABEL: test_x86_avx_vpermilvar_ps:
-; AVX: ## BB#0:
-; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0c,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0c,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
define <4 x float> @test_x86_avx_vpermilvar_ps_load(<4 x float> %a0, <4 x i32>* %a1) {
-; AVX-LABEL: test_x86_avx_vpermilvar_ps_load:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vpermilps (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0c,0x00]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x00]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: test_x86_avx_vpermilvar_ps_load:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT: vpermilps (%eax), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0c,0x00]
+; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load:
+; X86-AVX512VL: # %bb.0:
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT: vpermilps (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x00]
+; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx_vpermilvar_ps_load:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0c,0x07]
+; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_load:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0x07]
+; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%a2 = load <4 x i32>, <4 x i32>* %a1
%res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a2) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
@@ -729,14 +757,14 @@ declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind
define <8 x float> @test_x86_avx_vpermilvar_ps_256(<8 x float> %a0, <8 x i32> %a1) {
; AVX-LABEL: test_x86_avx_vpermilvar_ps_256:
-; AVX: ## BB#0:
-; AVX-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0c,0xc1]
-; AVX-NEXT: retl ## encoding: [0xc3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0c,0xc1]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx_vpermilvar_ps_256:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xc1]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -745,11 +773,11 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vtestc_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
-; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
+; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -758,12 +786,12 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vtestc_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -772,11 +800,11 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vtestc_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
-; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
+; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -785,12 +813,12 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vtestc_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; CHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -799,11 +827,11 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
define i32 @test_x86_avx_vtestnzc_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vtestnzc_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
-; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
+; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -812,12 +840,12 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -826,11 +854,11 @@ declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind r
define i32 @test_x86_avx_vtestnzc_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vtestnzc_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
-; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
+; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -839,12 +867,12 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; CHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: seta %al # encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -853,11 +881,11 @@ declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind rea
define i32 @test_x86_avx_vtestz_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vtestz_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
-; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -866,12 +894,12 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vtestz_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestpd %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -880,11 +908,11 @@ declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind rea
define i32 @test_x86_avx_vtestz_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vtestz_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
-; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -893,12 +921,12 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vtestz_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT: vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; CHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: vtestps %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
ret i32 %res
}
@@ -907,9 +935,9 @@ declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readn
define void @test_x86_avx_vzeroall() {
; CHECK-LABEL: test_x86_avx_vzeroall:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vzeroall ## encoding: [0xc5,0xfc,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vzeroall # encoding: [0xc5,0xfc,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
call void @llvm.x86.avx.vzeroall()
ret void
}
@@ -918,32 +946,48 @@ declare void @llvm.x86.avx.vzeroall() nounwind
define void @test_x86_avx_vzeroupper() {
; CHECK-LABEL: test_x86_avx_vzeroupper:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
call void @llvm.x86.avx.vzeroupper()
ret void
}
declare void @llvm.x86.avx.vzeroupper() nounwind
define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
-; AVX-LABEL: movnt_dq:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
-; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfb,0xc1]
-; AVX-NEXT: vmovntdq %ymm0, (%eax) ## encoding: [0xc5,0xfd,0xe7,0x00]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: movnt_dq:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
-; AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
-; AVX512VL-NEXT: vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
-; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: movnt_dq:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1]
+; X86-AVX-NEXT: vmovntdq %ymm0, (%eax) # encoding: [0xc5,0xfd,0xe7,0x00]
+; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: movnt_dq:
+; X86-AVX512VL: # %bb.0:
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X86-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
+; X86-AVX512VL-NEXT: vmovntdq %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
+; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX-LABEL: movnt_dq:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfb,0xc1]
+; X64-AVX-NEXT: vmovntdq %ymm0, (%rdi) # encoding: [0xc5,0xfd,0xe7,0x07]
+; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: movnt_dq:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x76,0xc9]
+; X64-AVX512VL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
+; X64-AVX512VL-NEXT: vmovntdq %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x07]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind
@@ -952,19 +996,31 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
-; AVX-LABEL: movnt_ps:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vmovntps %ymm0, (%eax) ## encoding: [0xc5,0xfc,0x2b,0x00]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: movnt_ps:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00]
-; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: movnt_ps:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT: vmovntps %ymm0, (%eax) # encoding: [0xc5,0xfc,0x2b,0x00]
+; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: movnt_ps:
+; X86-AVX512VL: # %bb.0:
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT: vmovntps %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00]
+; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX-LABEL: movnt_ps:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovntps %ymm0, (%rdi) # encoding: [0xc5,0xfc,0x2b,0x07]
+; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: movnt_ps:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vmovntps %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x07]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
ret void
}
@@ -972,23 +1028,39 @@ declare void @llvm.x86.avx.movnt.ps.256(i8*, <8 x float>) nounwind
define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
; add operation forces the execution domain.
-; AVX-LABEL: movnt_pd:
-; AVX: ## BB#0:
-; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## encoding: [0xc5,0xf5,0x57,0xc9]
-; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x58,0xc1]
-; AVX-NEXT: vmovntpd %ymm0, (%eax) ## encoding: [0xc5,0xfd,0x2b,0x00]
-; AVX-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: movnt_pd:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9]
-; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
-; AVX512VL-NEXT: vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00]
-; AVX512VL-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: movnt_pd:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x57,0xc9]
+; X86-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1]
+; X86-AVX-NEXT: vmovntpd %ymm0, (%eax) # encoding: [0xc5,0xfd,0x2b,0x00]
+; X86-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: movnt_pd:
+; X86-AVX512VL: # %bb.0:
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9]
+; X86-AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; X86-AVX512VL-NEXT: vmovntpd %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00]
+; X86-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX-LABEL: movnt_pd:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x57,0xc9]
+; X64-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x58,0xc1]
+; X64-AVX-NEXT: vmovntpd %ymm0, (%rdi) # encoding: [0xc5,0xfd,0x2b,0x07]
+; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: movnt_pd:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x57,0xc9]
+; X64-AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; X64-AVX512VL-NEXT: vmovntpd %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x07]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
ret void
@@ -999,9 +1071,9 @@ declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind
; Check for pclmulqdq
define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_pclmulqdq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x00]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86_64.ll b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
index 909c69cb9a17..c7039dca27aa 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
@@ -1,22 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
define <4 x double> @test_x86_avx_vzeroall(<4 x double> %a, <4 x double> %b) {
; AVX-LABEL: test_x86_avx_vzeroall:
-; AVX: ## BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill
+; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX-NEXT: vzeroall
-; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload
-; AVX-NEXT: retq
+; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT: ret{{[l|q]}}
;
; AVX512VL-LABEL: test_x86_avx_vzeroall:
-; AVX512VL: ## BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm16
; AVX512VL-NEXT: vzeroall
; AVX512VL-NEXT: vmovapd %ymm16, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VL-NEXT: ret{{[l|q]}}
%c = fadd <4 x double> %a, %b
call void @llvm.x86.avx.vzeroall()
ret <4 x double> %c
@@ -25,19 +25,19 @@ declare void @llvm.x86.avx.vzeroall() nounwind
define <4 x double> @test_x86_avx_vzeroupper(<4 x double> %a, <4 x double> %b) {
; AVX-LABEL: test_x86_avx_vzeroupper:
-; AVX: ## BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill
+; AVX-NEXT: vmovupd %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
; AVX-NEXT: vzeroupper
-; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload
-; AVX-NEXT: retq
+; AVX-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT: ret{{[l|q]}}
;
; AVX512VL-LABEL: test_x86_avx_vzeroupper:
-; AVX512VL: ## BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm16
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: vmovapd %ymm16, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VL-NEXT: ret{{[l|q]}}
%c = fadd <4 x double> %a, %b
call void @llvm.x86.avx.vzeroupper()
ret <4 x double> %c
diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll
index 06aadc476e4c..5a64db043575 100644
--- a/test/CodeGen/X86/avx-load-store.ll
+++ b/test/CodeGen/X86/avx-load-store.ll
@@ -4,7 +4,7 @@
define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>* nocapture %i) nounwind {
; CHECK-LABEL: test_256_load:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
@@ -33,7 +33,7 @@ define void @test_256_load(double* nocapture %d, float* nocapture %f, <4 x i64>*
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: test_256_load:
-; CHECK_O0: # BB#0: # %entry
+; CHECK_O0: # %bb.0: # %entry
; CHECK_O0-NEXT: subq $152, %rsp
; CHECK_O0-NEXT: vmovapd (%rdi), %ymm0
; CHECK_O0-NEXT: vmovaps (%rsi), %ymm1
@@ -78,16 +78,16 @@ declare void @dummy(<4 x double>, <8 x float>, <4 x i64>)
define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
; CHECK-LABEL: mov00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: mov00:
-; CHECK_O0: # BB#0:
+; CHECK_O0: # %bb.0:
; CHECK_O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: # implicit-def: %ymm1
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
-; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK_O0-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; CHECK_O0-NEXT: retq
%val = load float, float* %ptr
@@ -97,16 +97,16 @@ define <8 x float> @mov00(<8 x float> %v, float * %ptr) nounwind {
define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
; CHECK-LABEL: mov01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: mov01:
-; CHECK_O0: # BB#0:
+; CHECK_O0: # %bb.0:
; CHECK_O0-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK_O0-NEXT: # implicit-def: %YMM1
+; CHECK_O0-NEXT: # implicit-def: %ymm1
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
-; CHECK_O0-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK_O0-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK_O0-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm2[1,2,3]
; CHECK_O0-NEXT: retq
%val = load double, double* %ptr
@@ -116,12 +116,12 @@ define <4 x double> @mov01(<4 x double> %v, double * %ptr) nounwind {
define void @storev16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: storev16i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, (%rax)
;
; CHECK_O0-LABEL: storev16i16:
-; CHECK_O0: # BB#0:
-; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0: # %bb.0:
+; CHECK_O0-NEXT: # implicit-def: %rax
; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
store <16 x i16> %a, <16 x i16>* undef, align 32
unreachable
@@ -129,13 +129,13 @@ define void @storev16i16(<16 x i16> %a) nounwind {
define void @storev16i16_01(<16 x i16> %a) nounwind {
; CHECK-LABEL: storev16i16_01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
; CHECK-NEXT: vmovups %xmm0, (%rax)
;
; CHECK_O0-LABEL: storev16i16_01:
-; CHECK_O0: # BB#0:
-; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0: # %bb.0:
+; CHECK_O0-NEXT: # implicit-def: %rax
; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
store <16 x i16> %a, <16 x i16>* undef, align 4
unreachable
@@ -143,12 +143,12 @@ define void @storev16i16_01(<16 x i16> %a) nounwind {
define void @storev32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: storev32i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm0, (%rax)
;
; CHECK_O0-LABEL: storev32i8:
-; CHECK_O0: # BB#0:
-; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0: # %bb.0:
+; CHECK_O0-NEXT: # implicit-def: %rax
; CHECK_O0-NEXT: vmovdqa %ymm0, (%rax)
store <32 x i8> %a, <32 x i8>* undef, align 32
unreachable
@@ -156,30 +156,30 @@ define void @storev32i8(<32 x i8> %a) nounwind {
define void @storev32i8_01(<32 x i8> %a) nounwind {
; CHECK-LABEL: storev32i8_01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, (%rax)
; CHECK-NEXT: vmovups %xmm0, (%rax)
;
; CHECK_O0-LABEL: storev32i8_01:
-; CHECK_O0: # BB#0:
-; CHECK_O0-NEXT: # implicit-def: %RAX
+; CHECK_O0: # %bb.0:
+; CHECK_O0-NEXT: # implicit-def: %rax
; CHECK_O0-NEXT: vmovdqu %ymm0, (%rax)
store <32 x i8> %a, <32 x i8>* undef, align 4
unreachable
}
-; It is faster to make two saves, if the data is already in XMM registers. For
+; It is faster to make two saves, if the data is already in xmm registers. For
; example, after making an integer operation.
define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp {
; CHECK-LABEL: double_save:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
; CHECK-NEXT: vmovaps %xmm0, (%rdi)
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: double_save:
-; CHECK_O0: # BB#0:
-; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0: # %bb.0:
+; CHECK_O0-NEXT: # implicit-def: %ymm2
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
@@ -194,30 +194,30 @@ declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwin
define void @f_f() nounwind {
; CHECK-LABEL: f_f:
-; CHECK: # BB#0: # %allocas
+; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB8_2
-; CHECK-NEXT: # BB#1: # %cif_mask_all
+; CHECK-NEXT: # %bb.1: # %cif_mask_all
; CHECK-NEXT: .LBB8_2: # %cif_mask_mixed
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB8_4
-; CHECK-NEXT: # BB#3: # %cif_mixed_test_all
+; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: vmovd %eax, %xmm0
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB8_4: # %cif_mixed_test_any_check
;
; CHECK_O0-LABEL: f_f:
-; CHECK_O0: # BB#0: # %allocas
-; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0: # %bb.0: # %allocas
+; CHECK_O0-NEXT: # implicit-def: %al
; CHECK_O0-NEXT: testb $1, %al
; CHECK_O0-NEXT: jne .LBB8_1
; CHECK_O0-NEXT: jmp .LBB8_2
; CHECK_O0-NEXT: .LBB8_1: # %cif_mask_all
; CHECK_O0-NEXT: .LBB8_2: # %cif_mask_mixed
-; CHECK_O0-NEXT: # implicit-def: %AL
+; CHECK_O0-NEXT: # implicit-def: %al
; CHECK_O0-NEXT: testb $1, %al
; CHECK_O0-NEXT: jne .LBB8_3
; CHECK_O0-NEXT: jmp .LBB8_4
@@ -225,8 +225,8 @@ define void @f_f() nounwind {
; CHECK_O0-NEXT: movl $-1, %eax
; CHECK_O0-NEXT: vmovd %eax, %xmm0
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm1
-; CHECK_O0-NEXT: # implicit-def: %RCX
-; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: # implicit-def: %rcx
+; CHECK_O0-NEXT: # implicit-def: %ymm2
; CHECK_O0-NEXT: vmaskmovps %ymm2, %ymm1, (%rcx)
; CHECK_O0-NEXT: .LBB8_4: # %cif_mixed_test_any_check
allocas:
@@ -248,7 +248,7 @@ cif_mixed_test_any_check:
define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
; CHECK-LABEL: add8i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovups (%rsi), %xmm0
; CHECK-NEXT: vmovups 16(%rsi), %xmm1
; CHECK-NEXT: vmovups %xmm1, 16(%rdi)
@@ -256,10 +256,10 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: add8i32:
-; CHECK_O0: # BB#0:
+; CHECK_O0: # %bb.0:
; CHECK_O0-NEXT: vmovdqu (%rsi), %xmm0
; CHECK_O0-NEXT: vmovdqu 16(%rsi), %xmm1
-; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: # implicit-def: %ymm2
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
@@ -273,14 +273,14 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind {
define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
; CHECK-LABEL: add4i64a64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rsi), %ymm0
; CHECK-NEXT: vmovaps %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: add4i64a64:
-; CHECK_O0: # BB#0:
+; CHECK_O0: # %bb.0:
; CHECK_O0-NEXT: vmovaps (%rsi), %ymm0
; CHECK_O0-NEXT: vmovdqa %ymm0, (%rdi)
; CHECK_O0-NEXT: vzeroupper
@@ -293,7 +293,7 @@ define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
; CHECK-LABEL: add4i64a16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rsi), %xmm0
; CHECK-NEXT: vmovaps 16(%rsi), %xmm1
; CHECK-NEXT: vmovaps %xmm1, 16(%rdi)
@@ -301,10 +301,10 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind {
; CHECK-NEXT: retq
;
; CHECK_O0-LABEL: add4i64a16:
-; CHECK_O0: # BB#0:
+; CHECK_O0: # %bb.0:
; CHECK_O0-NEXT: vmovdqa (%rsi), %xmm0
; CHECK_O0-NEXT: vmovdqa 16(%rsi), %xmm1
-; CHECK_O0-NEXT: # implicit-def: %YMM2
+; CHECK_O0-NEXT: # implicit-def: %ymm2
; CHECK_O0-NEXT: vmovaps %xmm0, %xmm2
; CHECK_O0-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm2
; CHECK_O0-NEXT: vmovdqu %ymm2, (%rdi)
diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll
index 7c76f2f623dd..ad7ceda9b1f9 100644
--- a/test/CodeGen/X86/avx-logic.ll
+++ b/test/CodeGen/X86/avx-logic.ll
@@ -4,9 +4,9 @@
define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: andpd256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -21,9 +21,9 @@ entry:
define <4 x double> @andpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: andpd256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -37,7 +37,7 @@ entry:
define <8 x float> @andps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: andps256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -50,7 +50,7 @@ entry:
define <8 x float> @andps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: andps256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -62,9 +62,9 @@ entry:
define <4 x double> @xorpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: xorpd256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -79,9 +79,9 @@ entry:
define <4 x double> @xorpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: xorpd256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -95,7 +95,7 @@ entry:
define <8 x float> @xorps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: xorps256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -108,7 +108,7 @@ entry:
define <8 x float> @xorps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: xorps256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -120,9 +120,9 @@ entry:
define <4 x double> @orpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: orpd256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vorpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -137,9 +137,9 @@ entry:
define <4 x double> @orpd256fold(<4 x double> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: orpd256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -153,7 +153,7 @@ entry:
define <8 x float> @orps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: orps256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vorps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -166,7 +166,7 @@ entry:
define <8 x float> @orps256fold(<8 x float> %y) nounwind uwtable readnone ssp {
; CHECK-LABEL: orps256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -178,9 +178,9 @@ entry:
define <4 x double> @andnotpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: andnotpd256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandnpd %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -196,9 +196,9 @@ entry:
define <4 x double> @andnotpd256fold(<4 x double> %y, <4 x double>* nocapture %x) nounwind uwtable readonly ssp {
; CHECK-LABEL: andnotpd256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandnpd (%rdi), %ymm0, %ymm0
-; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -215,7 +215,7 @@ entry:
define <8 x float> @andnotps256(<8 x float> %y, <8 x float> %x) nounwind uwtable readnone ssp {
; CHECK-LABEL: andnotps256:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandnps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -229,7 +229,7 @@ entry:
define <8 x float> @andnotps256fold(<8 x float> %y, <8 x float>* nocapture %x) nounwind uwtable readonly ssp {
; CHECK-LABEL: andnotps256fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -246,7 +246,7 @@ entry:
define <2 x i64> @vpandn(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandn:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0
@@ -261,7 +261,7 @@ entry:
define <2 x i64> @vpand(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpand:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -275,13 +275,13 @@ entry:
define <4 x i32> @and_xor_splat1_v4i32(<4 x i32> %x) nounwind {
; AVX-LABEL: and_xor_splat1_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandnps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: and_xor_splat1_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; AVX512-NEXT: vandnps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
%xor = xor <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
@@ -291,13 +291,13 @@ define <4 x i32> @and_xor_splat1_v4i32(<4 x i32> %x) nounwind {
define <4 x i64> @and_xor_splat1_v4i64(<4 x i64> %x) nounwind {
; AVX-LABEL: and_xor_splat1_v4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: and_xor_splat1_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX512-NEXT: vandnps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%xor = xor <4 x i64> %x, <i64 1, i64 1, i64 1, i64 1>
diff --git a/test/CodeGen/X86/avx-minmax.ll b/test/CodeGen/X86/avx-minmax.ll
index c94962b74ed1..002d99fd8ebd 100644
--- a/test/CodeGen/X86/avx-minmax.ll
+++ b/test/CodeGen/X86/avx-minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
; UNSAFE-LABEL: maxpd:
; UNSAFE: vmaxpd {{.+}}, %xmm
diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll
index 78c88f401cbc..6f63af9c25bc 100644
--- a/test/CodeGen/X86/avx-schedule.ll
+++ b/test/CodeGen/X86/avx-schedule.ll
@@ -1,35 +1,62 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_addpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_addpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = fadd <4 x double> %1, %2
@@ -37,29 +64,53 @@ define <4 x double> @test_addpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_addps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_addps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addps:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = fadd <8 x float> %1, %2
@@ -67,29 +118,53 @@ define <8 x float> @test_addps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_addsubpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_addsubpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addsubpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addsubpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addsubpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addsubpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %1, <4 x double> %2)
@@ -98,29 +173,53 @@ define <4 x double> @test_addsubpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_addsubps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_addsubps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addsubps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addsubps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addsubps:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addsubps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %1, <8 x float> %2)
@@ -129,33 +228,61 @@ define <8 x float> @test_addsubps(<8 x float> %a0, <8 x float> %a1, <8 x float>
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_andnotpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_andnotpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andnotpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andnotpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andnotpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andnotpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandnpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandnpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
%3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -170,33 +297,61 @@ define <4 x double> @test_andnotpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
}
define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_andnotps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_andnotps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andnotps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andnotps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andnotps:
+; SKX: # %bb.0:
+; SKX-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andnotps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandnps %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandnps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
%3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -211,33 +366,61 @@ define <8 x float> @test_andnotps(<8 x float> %a0, <8 x float> %a1, <8 x float>
}
define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_andpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_andpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
%3 = and <4 x i64> %1, %2
@@ -250,33 +433,61 @@ define <4 x double> @test_andpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_andps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_andps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andps:
+; SKX: # %bb.0:
+; SKX-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandps %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
%3 = and <4 x i64> %1, %2
@@ -289,33 +500,61 @@ define <8 x float> @test_andps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_blendpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_blendpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.33]
+; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2],ymm0[3] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = fadd <4 x double> %a1, %1
@@ -324,29 +563,53 @@ define <4 x double> @test_blendpd(<4 x double> %a0, <4 x double> %a1, <4 x doubl
}
define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_blendps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_blendps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
-; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
+; BROADWELL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
+; SKYLAKE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendps:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.33]
+; SKX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
; BTVER2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4,5,6,7] sched: [1:0.50]
; ZNVER1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4,5,6],ymm0[7] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 12, i32 13, i32 14, i32 7>
@@ -354,29 +617,53 @@ define <8 x float> @test_blendps(<8 x float> %a0, <8 x float> %a1, <8 x float> *
}
define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_blendvpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; GENERIC-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_blendvpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendvpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendvpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
+; SKYLAKE-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendvpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
+; SKX-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendvpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BTVER2-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendvpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vblendvpd %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
%2 = load <4 x double>, <4 x double> *%a3, align 32
%3 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %1, <4 x double> %2, <4 x double> %a2)
@@ -385,29 +672,53 @@ define <4 x double> @test_blendvpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; GENERIC-LABEL: test_blendvps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; GENERIC-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_blendvps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendvps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendvps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
+; SKYLAKE-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendvps:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
+; SKX-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendvps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
-; BTVER2-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
+; BTVER2-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:3.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendvps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vblendvps %ymm2, (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
%2 = load <8 x float>, <8 x float> *%a3, align 32
%3 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %1, <8 x float> %2, <8 x float> %a2)
@@ -416,50 +727,90 @@ define <8 x float> @test_blendvps(<8 x float> %a0, <8 x float> %a1, <8 x float>
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_broadcastf128(<4 x float> *%a0) {
+; GENERIC-LABEL: test_broadcastf128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_broadcastf128:
-; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastf128:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastf128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastf128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastf128:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_broadcastf128:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_broadcastf128:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <4 x float>, <4 x float> *%a0, align 32
%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %2
}
define <4 x double> @test_broadcastsd_ymm(double *%a0) {
+; GENERIC-LABEL: test_broadcastsd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_broadcastsd_ymm:
-; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastsd_ymm:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastsd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastsd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastsd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_broadcastsd_ymm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [6:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_broadcastsd_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vbroadcastsd (%rdi), %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load double, double *%a0, align 8
%2 = insertelement <4 x double> undef, double %1, i32 0
%3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> zeroinitializer
@@ -467,25 +818,45 @@ define <4 x double> @test_broadcastsd_ymm(double *%a0) {
}
define <4 x float> @test_broadcastss(float *%a0) {
+; GENERIC-LABEL: test_broadcastss:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_broadcastss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastss:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_broadcastss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_broadcastss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vbroadcastss (%rdi), %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load float, float *%a0, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
%3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
@@ -493,25 +864,45 @@ define <4 x float> @test_broadcastss(float *%a0) {
}
define <8 x float> @test_broadcastss_ymm(float *%a0) {
+; GENERIC-LABEL: test_broadcastss_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_broadcastss_ymm:
-; SANDY: # BB#0:
-; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_broadcastss_ymm:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastss_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastss_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastss_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_broadcastss_ymm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [6:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_broadcastss_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vbroadcastss (%rdi), %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load float, float *%a0, align 4
%2 = insertelement <8 x float> undef, float %1, i32 0
%3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> zeroinitializer
@@ -519,33 +910,63 @@ define <8 x float> @test_broadcastss_ymm(float *%a0) {
}
define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_cmppd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cmppd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
; HASWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmppd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmppd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmppd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vcmpeqpd (%rdi), %ymm0, %k1 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2q %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpmovm2q %k1, %ymm1 # sched: [1:0.25]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cmppd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; BTVER2-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmppd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
; ZNVER1-NEXT: vorpd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fcmp oeq <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = fcmp oeq <4 x double> %a0, %2
@@ -557,33 +978,63 @@ define <4 x double> @test_cmppd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_cmpps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cmpps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
; HASWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpps:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vcmpeqps (%rdi), %ymm0, %k1 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpmovm2d %k1, %ymm1 # sched: [1:0.25]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cmpps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
-; BTVER2-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [2:2.00]
+; BTVER2-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmpps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
; ZNVER1-NEXT: vorps %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fcmp oeq <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = fcmp oeq <8 x float> %a0, %2
@@ -595,33 +1046,61 @@ define <8 x float> @test_cmpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
+; GENERIC-LABEL: test_cvtdq2pd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cvtdq2pd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
-; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtdq2pd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [11:1.00]
+; BROADWELL-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtdq2pd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [13:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtdq2pd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [13:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtdq2pd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:1.00]
-; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2pd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtdq2pd (%rdi), %ymm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [5:1.00]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp <4 x i32> %a0 to <4 x double>
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
%3 = sitofp <4 x i32> %2 to <4 x double>
@@ -630,35 +1109,63 @@ define <4 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
}
define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
+; GENERIC-LABEL: test_cvtdq2ps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cvtdq2ps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [5:1.00]
-; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [4:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm1, %ymm1 # sched: [7:0.50]
+; SANDY-NEXT: vcvtdq2ps %ymm1, %ymm1 # sched: [3:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [10:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtdq2ps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtdq2ps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [11:0.50]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtdq2ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [11:0.50]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtdq2ps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:1.00]
-; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2ps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtdq2ps (%rdi), %ymm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [5:1.00]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp <8 x i32> %a0 to <8 x float>
%2 = load <8 x i32>, <8 x i32> *%a1, align 16
%3 = sitofp <8 x i32> %2 to <8 x float>
@@ -667,33 +1174,125 @@ define <8 x float> @test_cvtdq2ps(<8 x i32> %a0, <8 x i32> *%a1) {
}
define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_cvtpd2dq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
+; GENERIC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cvtpd2dq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
-; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [10:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpd2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpd2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpd2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; SKX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtpd2dq:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
-; BTVER2-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [3:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [11:2.00]
+; BTVER2-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [6:2.00]
; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtpd2dq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtpd2dqy (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtpd2dq %ymm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.67]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %2)
+ %4 = shufflevector <4 x i32> %1, <4 x i32> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %4
+}
+declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
+
+define <8 x i32> @test_cvttpd2dq(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_cvttpd2dq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
+; GENERIC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SANDY-LABEL: test_cvttpd2dq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:1.00]
+; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvttpd2dq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttpd2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttpd2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttpd2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [8:1.00]
+; SKX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvttpd2dq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [11:2.00]
+; BTVER2-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [6:2.00]
+; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttpd2dq:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttpd2dqy (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.67]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi <4 x double> %a0 to <4 x i32>
%2 = load <4 x double>, <4 x double> *%a1, align 32
%3 = fptosi <4 x double> %2 to <4 x i32>
@@ -702,33 +1301,61 @@ define <8 x i32> @test_cvtpd2dq(<4 x double> %a0, <4 x double> *%a1) {
}
define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_cvtpd2ps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
+; GENERIC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cvtpd2ps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpd2ps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpd2ps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpd2ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
+; SKX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtpd2ps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [8:1.00]
-; BTVER2-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [3:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:2.00]
+; BTVER2-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [6:2.00]
; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtpd2ps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtpd2psy (%rdi), %xmm1 # sched: [11:1.00]
; ZNVER1-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # sched: [2:0.67]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptrunc <4 x double> %a0 to <4 x float>
%2 = load <4 x double>, <4 x double> *%a1, align 32
%3 = fptrunc <4 x double> %2 to <4 x float>
@@ -737,33 +1364,125 @@ define <8 x float> @test_cvtpd2ps(<4 x double> %a0, <4 x double> *%a1) {
}
define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_cvtps2dq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; GENERIC-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_cvtps2dq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [7:1.00]
-; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtps2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtps2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [11:0.50]
+; SKYLAKE-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtps2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [11:0.50]
+; SKX-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtps2dq:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:1.00]
-; BTVER2-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtps2dq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtps2dq (%rdi), %ymm1 # sched: [12:1.00]
+; ZNVER1-NEXT: vcvtps2dq %ymm0, %ymm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
+ %2 = load <8 x float>, <8 x float> *%a1, align 32
+ %3 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %2)
+ %4 = or <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
+
+define <8 x i32> @test_cvttps2dq(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_cvttps2dq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; GENERIC-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SANDY-LABEL: test_cvttps2dq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvttps2dq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [10:1.00]
+; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttps2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttps2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [11:0.50]
+; SKYLAKE-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttps2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [11:0.50]
+; SKX-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvttps2dq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttps2dq:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttps2dq (%rdi), %ymm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvttps2dq %ymm0, %ymm0 # sched: [5:1.00]
; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi <8 x float> %a0 to <8 x i32>
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = fptosi <8 x float> %2 to <8 x i32>
@@ -772,29 +1491,53 @@ define <8 x i32> @test_cvtps2dq(<8 x float> %a0, <8 x float> *%a1) {
}
define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_divpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:2.00]
+; GENERIC-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_divpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [45:2.00]
+; SANDY-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [52:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [27:2.00]
-; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [31:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [35:2.00]
+; HASWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [42:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_divpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [23:2.00]
+; BROADWELL-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [29:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_divpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [14:1.00]
+; SKYLAKE-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [21:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_divpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [14:1.00]
+; SKX-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [21:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_divpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:1.00]
-; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vdivpd %ymm1, %ymm0, %ymm0 # sched: [15:15.00]
+; ZNVER1-NEXT: vdivpd (%rdi), %ymm0, %ymm0 # sched: [22:22.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fdiv <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = fdiv <4 x double> %1, %2
@@ -802,29 +1545,53 @@ define <4 x double> @test_divpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_divps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [29:2.00]
+; GENERIC-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [36:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_divps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [29:2.00]
+; SANDY-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [36:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_divps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [17:2.00]
+; BROADWELL-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [23:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_divps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [11:1.00]
+; SKYLAKE-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [18:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_divps:
+; SKX: # %bb.0:
+; SKX-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [11:1.00]
+; SKX-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [18:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_divps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [43:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [15:1.00]
-; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [22:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vdivps %ymm1, %ymm0, %ymm0 # sched: [12:12.00]
+; ZNVER1-NEXT: vdivps (%rdi), %ymm0, %ymm0 # sched: [19:19.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fdiv <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = fdiv <8 x float> %1, %2
@@ -832,29 +1599,53 @@ define <8 x float> @test_divps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_dpps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
+; GENERIC-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_dpps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
-; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [18:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dpps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [14:2.00]
+; BROADWELL-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [20:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dpps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [13:1.33]
+; SKYLAKE-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [20:1.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dpps:
+; SKX: # %bb.0:
+; SKX-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [13:1.33]
+; SKX-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [20:1.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_dpps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [12:6.00]
+; BTVER2-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [17:6.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_dpps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vdpps $7, %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vdpps $7, (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %1, <8 x float> %2, i8 7)
@@ -863,32 +1654,60 @@ define <8 x float> @test_dpps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_extractf128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_extractf128:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_extractf128:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [4:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_extractf128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_extractf128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_extractf128:
+; SKX: # %bb.0:
+; SKX-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_extractf128:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_extractf128:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.50]
-; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vextractf128 $1, %ymm0, %xmm0 # sched: [1:0.33]
+; ZNVER1-NEXT: vextractf128 $1, %ymm1, (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%2 = shufflevector <8 x float> %a1, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
store <4 x float> %2, <4 x float> *%a2
@@ -896,29 +1715,53 @@ define <4 x float> @test_extractf128(<8 x float> %a0, <8 x float> %a1, <4 x floa
}
define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_haddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; GENERIC-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_haddpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_haddpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_haddpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_haddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKX-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_haddpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vhaddpd (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %1, <4 x double> %2)
@@ -927,29 +1770,53 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_haddps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; GENERIC-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_haddps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_haddps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_haddps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_haddps:
+; SKX: # %bb.0:
+; SKX-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKX-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_haddps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vhaddps (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %1, <8 x float> %2)
@@ -958,29 +1825,53 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_hsubpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; GENERIC-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_hsubpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_hsubpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_hsubpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_hsubpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKX-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_hsubpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vhsubpd (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %1, <4 x double> %2)
@@ -989,29 +1880,53 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_hsubps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; GENERIC-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_hsubps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_hsubps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_hsubps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_hsubps:
+; SKX: # %bb.0:
+; SKX-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [6:2.00]
+; SKX-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_hsubps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhsubps %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vhsubps (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %1, <8 x float> %2)
@@ -1020,33 +1935,61 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_insertf128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_insertf128:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
-; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertf128:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
-; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_insertf128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_insertf128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKYLAKE-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_insertf128:
+; SKX: # %bb.0:
+; SKX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_insertf128:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
; BTVER2-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertf128:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:0.50]
-; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 # sched: [2:0.67]
+; ZNVER1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 # sched: [9:0.67]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%2 = shufflevector <8 x float> %a0, <8 x float> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
%3 = load <4 x float>, <4 x float> *%a2, align 16
@@ -1057,58 +2000,106 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float
}
define <32 x i8> @test_lddqu(i8* %a0) {
+; GENERIC-LABEL: test_lddqu:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_lddqu:
-; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lddqu:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lddqu:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vlddqu (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_lddqu:
+; SKX: # %bb.0:
+; SKX-NEXT: vlddqu (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_lddqu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vlddqu (%rdi), %ymm0 # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lddqu:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vlddqu (%rdi), %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %a0)
ret <32 x i8> %1
}
declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
+; GENERIC-LABEL: test_maskmovpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; GENERIC-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_maskmovpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; SANDY-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; HASWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; HASWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maskmovpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
+; BROADWELL-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maskmovpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKYLAKE-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maskmovpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKX-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maskmovpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
+; BTVER2-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [6:2.00]
; BTVER2-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:0.50]
+; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:0.50]
; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1)
call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2)
ret <2 x double> %1
@@ -1117,33 +2108,61 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2) {
+; GENERIC-LABEL: test_maskmovpd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; GENERIC-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_maskmovpd_ymm:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; SANDY-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovpd_ymm:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
+; HASWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
; HASWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maskmovpd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
+; BROADWELL-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maskmovpd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKYLAKE-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maskmovpd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKX-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maskmovpd_ymm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
+; BTVER2-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [6:2.00]
; BTVER2-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovpd_ymm:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:1.00]
+; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1)
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2)
ret <4 x double> %1
@@ -1152,33 +2171,61 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind read
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind
define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
+; GENERIC-LABEL: test_maskmovps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; GENERIC-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_maskmovps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; SANDY-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [13:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; HASWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
; HASWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maskmovps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
+; BROADWELL-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maskmovps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKYLAKE-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maskmovps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKX-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maskmovps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [6:1.00]
+; BTVER2-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [6:2.00]
; BTVER2-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:0.50]
+; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:0.50]
; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1)
call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2)
ret <4 x float> %1
@@ -1187,33 +2234,61 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2) {
+; GENERIC-LABEL: test_maskmovps_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; GENERIC-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_maskmovps_ymm:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [9:1.00]
+; SANDY-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
; SANDY-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovps_ymm:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [4:2.00]
-; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [14:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
+; HASWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
; HASWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maskmovps_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
+; BROADWELL-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maskmovps_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKYLAKE-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maskmovps_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKX-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maskmovps_ymm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [6:2.00]
+; BTVER2-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [6:2.00]
; BTVER2-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovps_ymm:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:1.00]
+; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1)
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
ret <8 x float> %1
@@ -1222,29 +2297,53 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind reado
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_maxpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_maxpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maxpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maxpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maxpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maxpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %1, <4 x double> %2)
@@ -1253,29 +2352,53 @@ define <4 x double> @test_maxpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_maxps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_maxps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maxps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maxps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maxps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maxps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaxps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
@@ -1284,29 +2407,53 @@ define <8 x float> @test_maxps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_minpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_minpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_minpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_minpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_minpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_minpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vminpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %1, <4 x double> %2)
@@ -1315,29 +2462,53 @@ define <4 x double> @test_minpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_minps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_minps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_minps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_minps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_minps:
+; SKX: # %bb.0:
+; SKX-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_minps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; BTVER2-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BTVER2-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vminps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
@@ -1346,33 +2517,61 @@ define <8 x float> @test_minps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_movapd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movapd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movapd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovapd (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movapd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movapd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movapd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovapd (%rdi), %ymm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movapd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovapd (%rdi), %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovapd %ymm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <4 x double>, <4 x double> *%a0, align 32
%2 = fadd <4 x double> %1, %1
store <4 x double> %2, <4 x double> *%a1, align 32
@@ -1380,33 +2579,61 @@ define <4 x double> @test_movapd(<4 x double> *%a0, <4 x double> *%a1) {
}
define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_movaps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movaps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %ymm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movaps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovaps (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movaps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movaps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movaps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps (%rdi), %ymm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movaps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovaps (%rdi), %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovaps %ymm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <8 x float>, <8 x float> *%a0, align 32
%2 = fadd <8 x float> %1, %1
store <8 x float> %2, <8 x float> *%a1, align 32
@@ -1414,33 +2641,61 @@ define <8 x float> @test_movaps(<8 x float> *%a0, <8 x float> *%a1) {
}
define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_movddup:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; GENERIC-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movddup:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movddup:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; BROADWELL-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [6:0.50]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movddup:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; SKYLAKE-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movddup:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:1.00]
+; SKX-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [7:0.50]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movddup:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [5:1.00]
; BTVER2-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movddup:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovddup {{.*#+}} ymm1 = mem[0,0,2,2] sched: [8:0.50]
; ZNVER1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%2 = load <4 x double>, <4 x double> *%a1, align 32
%3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -1449,147 +2704,338 @@ define <4 x double> @test_movddup(<4 x double> %a0, <4 x double> *%a1) {
}
define i32 @test_movmskpd(<4 x double> %a0) {
+; GENERIC-LABEL: test_movmskpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movmskpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movmskpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movmskpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movmskpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovmskpd %ymm0, %eax # sched: [2:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movmskpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovmskpd %ymm0, %eax # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movmskpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovmskpd %ymm0, %eax # sched: [1:1.00]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
ret i32 %1
}
declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
define i32 @test_movmskps(<8 x float> %a0) {
+; GENERIC-LABEL: test_movmskps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movmskps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movmskps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movmskps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movmskps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovmskps %ymm0, %eax # sched: [2:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movmskps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovmskps %ymm0, %eax # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movmskps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovmskps %ymm0, %eax # sched: [1:1.00]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
ret i32 %1
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
+define void @test_movntdq(<4 x i64> %a0, <4 x i64> *%a1) {
+; GENERIC-LABEL: test_movntdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vmovntdq %ymm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SANDY-LABEL: test_movntdq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: vmovntdq %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movntdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntdq:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vmovntdq %ymm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movntdq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: vmovntdq %ymm0, (%rdi) # sched: [3:2.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vmovntdq %ymm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "vmovntdq $0, $1", "x,*m"(<4 x i64> %a0, <4 x i64> *%a1)
+ ret void
+}
+
define <4 x double> @test_movntpd(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_movntpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movntpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movntpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: vmovntpd %ymm0, (%rdi) # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovntpd %ymm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <4 x double> %a0, %a0
store <4 x double> %1, <4 x double> *%a1, align 32, !nontemporal !0
ret <4 x double> %1
}
define <8 x float> @test_movntps(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_movntps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovntps %ymm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movntps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntps %ymm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntps:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movntps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: vmovntps %ymm0, (%rdi) # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovntps %ymm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <8 x float> %a0, %a0
store <8 x float> %1, <8 x float> *%a1, align 32, !nontemporal !0
ret <8 x float> %1
}
define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_movshdup:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movshdup:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
-; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [4:0.50]
+; HASWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movshdup:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; BROADWELL-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movshdup:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; SKYLAKE-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movshdup:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:1.00]
+; SKX-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [7:0.50]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movshdup:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [5:1.00]
; BTVER2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movshdup:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm1 = mem[1,1,3,3,5,5,7,7] sched: [8:0.50]
; ZNVER1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -1598,33 +3044,61 @@ define <8 x float> @test_movshdup(<8 x float> %a0, <8 x float> *%a1) {
}
define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_movsldup:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; GENERIC-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movsldup:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
-; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [4:0.50]
+; HASWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movsldup:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; BROADWELL-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movsldup:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; SKYLAKE-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movsldup:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:1.00]
+; SKX-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [7:0.50]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movsldup:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [5:1.00]
; BTVER2-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsldup:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm1 = mem[0,0,2,2,4,4,6,6] sched: [8:0.50]
; ZNVER1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -1633,35 +3107,63 @@ define <8 x float> @test_movsldup(<8 x float> %a0, <8 x float> *%a1) {
}
define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_movupd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovupd (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovupd %ymm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movupd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [7:0.50]
; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movupd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovupd (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movupd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovupd (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movupd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovupd (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movupd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovupd (%rdi), %ymm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movupd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovupd (%rdi), %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovupd %ymm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <4 x double>, <4 x double> *%a0, align 1
%2 = fadd <4 x double> %1, %1
store <4 x double> %2, <4 x double> *%a1, align 1
@@ -1669,35 +3171,63 @@ define <4 x double> @test_movupd(<4 x double> *%a0, <4 x double> *%a1) {
}
define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_movups:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovups %ymm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_movups:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 # sched: [7:0.50]
; SANDY-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [1:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vextractf128 $1, %ymm0, 16(%rsi) # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [7:0.50]
; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movups:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovups (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movups:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovups (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movups:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movups:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovups (%rdi), %ymm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vmovups %ymm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movups:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovups (%rdi), %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovups %ymm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <8 x float>, <8 x float> *%a0, align 1
%2 = fadd <8 x float> %1, %1
store <8 x float> %2, <8 x float> *%a1, align 1
@@ -1705,29 +3235,53 @@ define <8 x float> @test_movups(<8 x float> *%a0, <8 x float> *%a1) {
}
define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_mulpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_mulpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [3:0.50]
+; BROADWELL-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mulpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mulpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:4.00]
; BTVER2-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [9:4.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmulpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vmulpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fmul <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = fmul <4 x double> %1, %2
@@ -1735,29 +3289,53 @@ define <4 x double> @test_mulpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_mulps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_mulps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [3:0.50]
+; BROADWELL-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mulps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mulps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; ZNVER1-NEXT: vmulps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fmul <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = fmul <8 x float> %1, %2
@@ -1765,33 +3343,61 @@ define <8 x float> @test_mulps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: orpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: orpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: orpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: orpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: orpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: orpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: orpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: orpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
%3 = or <4 x i64> %1, %2
@@ -1804,33 +3410,61 @@ define <4 x double> @orpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2)
}
define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_orps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_orps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_orps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_orps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_orps:
+; SKX: # %bb.0:
+; SKX-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_orps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
%3 = or <4 x i64> %1, %2
@@ -1842,34 +3476,125 @@ define <8 x float> @test_orps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2
ret <8 x float> %8
}
+define <4 x double> @test_perm2f128(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_perm2f128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SANDY-LABEL: test_perm2f128:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; SANDY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_perm2f128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; HASWELL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_perm2f128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; BROADWELL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:1.00]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_perm2f128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKYLAKE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_perm2f128:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_perm2f128:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:0.50]
+; BTVER2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [6:1.00]
+; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_perm2f128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [100:?]
+; ZNVER1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [100:?]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %2 = load <4 x double>, <4 x double> *%a2, align 32
+ %3 = shufflevector <4 x double> %a0, <4 x double> %2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+
define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
+; GENERIC-LABEL: test_permilpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; GENERIC-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
+; GENERIC-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; BROADWELL-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:1.00]
+; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [7:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [6:1.00]
; BTVER2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0] sched: [8:0.50]
; ZNVER1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> <i32 1, i32 0>
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -1878,33 +3603,61 @@ define <2 x double> @test_permilpd(<2 x double> %a0, <2 x double> *%a1) {
}
define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_permilpd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilpd_ymm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
-; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
+; SANDY-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilpd_ymm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [5:1.00]
+; HASWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilpd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; BROADWELL-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilpd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilpd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:1.00]
+; SKX-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilpd_ymm:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [6:1.00]
; BTVER2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilpd_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm1 = mem[1,0,2,3] sched: [8:0.50]
; ZNVER1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
%2 = load <4 x double>, <4 x double> *%a1, align 32
%3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
@@ -1913,33 +3666,61 @@ define <4 x double> @test_permilpd_ymm(<4 x double> %a0, <4 x double> *%a1) {
}
define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
+; GENERIC-LABEL: test_permilps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; GENERIC-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; GENERIC-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
-; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; BROADWELL-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:1.00]
+; SKX-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
; BTVER2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50]
; ZNVER1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -1948,33 +3729,61 @@ define <4 x float> @test_permilps(<4 x float> %a0, <4 x float> *%a1) {
}
define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_permilps_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; GENERIC-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilps_ymm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
-; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
+; SANDY-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilps_ymm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
-; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilps_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; BROADWELL-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [7:1.00]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilps_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilps_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SKX-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:1.00]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilps_ymm:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [6:1.00]
; BTVER2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilps_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm1 = mem[3,2,1,0,7,6,5,4] sched: [8:0.50]
; ZNVER1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = shufflevector <8 x float> %2, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -1983,29 +3792,53 @@ define <8 x float> @test_permilps_ymm(<8 x float> %a0, <8 x float> *%a1) {
}
define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_permilvarpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilvarpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilvarpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilvarpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilvarpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilvarpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilvarpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpermilpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> %2)
@@ -2014,29 +3847,53 @@ define <2 x double> @test_permilvarpd(<2 x double> %a0, <2 x i64> %a1, <2 x i64>
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_permilvarpd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilvarpd_ymm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarpd_ymm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilvarpd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilvarpd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilvarpd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilvarpd_ymm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
; BTVER2-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilvarpd_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpermilpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> %2)
@@ -2045,29 +3902,53 @@ define <4 x double> @test_permilvarpd_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_permilvarps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilvarps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilvarps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilvarps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilvarps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilvarps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilvarps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpermilps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> %2)
@@ -2076,29 +3957,53 @@ define <4 x float> @test_permilvarps(<4 x float> %a0, <4 x i32> %a1, <4 x i32> *
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_permilvarps_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_permilvarps_ymm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_permilvarps_ymm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permilvarps_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permilvarps_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permilvarps_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_permilvarps_ymm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [3:3.00]
; BTVER2-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_permilvarps_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpermilps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> %2)
@@ -2107,33 +4012,61 @@ define <8 x float> @test_permilvarps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i3
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_rcpps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vrcpps (%rdi), %ymm1 # sched: [14:2.00]
+; GENERIC-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_rcpps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps (%rdi), %ymm1 # sched: [14:2.00]
+; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [18:2.00]
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcpps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; BROADWELL-NEXT: vrcpps (%rdi), %ymm1 # sched: [17:2.00]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcpps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcpps:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_rcpps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps (%rdi), %ymm1 # sched: [7:2.00]
; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vrcpps (%rdi), %ymm1 # sched: [12:0.50]
; ZNVER1-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %2)
@@ -2143,33 +4076,61 @@ define <8 x float> @test_rcpps(<8 x float> %a0, <8 x float> *%a1) {
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_roundpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_roundpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:0.50]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [13:2.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_roundpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [12:2.00]
+; BROADWELL-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_roundpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [15:0.67]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_roundpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vrndscalepd $7, %ymm0, %ymm0 # sched: [8:0.67]
+; SKX-NEXT: vrndscalepd $7, (%rdi), %ymm1 # sched: [15:0.67]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_roundpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:1.00]
-; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [10:1.00]
-; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vroundpd $7, (%rdi), %ymm1 # sched: [11:1.00]
+; ZNVER1-NEXT: vroundpd $7, %ymm0, %ymm0 # sched: [4:1.00]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)
%2 = load <4 x double>, <4 x double> *%a1, align 32
%3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %2, i32 7)
@@ -2179,33 +4140,61 @@ define <4 x double> @test_roundpd(<4 x double> %a0, <4 x double> *%a1) {
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_roundps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_roundps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:0.50]
+; HASWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [13:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_roundps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [12:2.00]
+; BROADWELL-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_roundps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [15:0.67]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_roundps:
+; SKX: # %bb.0:
+; SKX-NEXT: vrndscaleps $7, %ymm0, %ymm0 # sched: [8:0.67]
+; SKX-NEXT: vrndscaleps $7, (%rdi), %ymm1 # sched: [15:0.67]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_roundps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:1.00]
-; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [10:1.00]
-; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vroundps $7, (%rdi), %ymm1 # sched: [11:1.00]
+; ZNVER1-NEXT: vroundps $7, %ymm0, %ymm0 # sched: [4:1.00]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %2, i32 7)
@@ -2215,33 +4204,61 @@ define <8 x float> @test_roundps(<8 x float> %a0, <8 x float> *%a1) {
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_rsqrtps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [14:2.00]
+; GENERIC-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_rsqrtps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [9:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [14:2.00]
+; SANDY-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:2.00]
-; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [18:2.00]
+; HASWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [11:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rsqrtps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [11:2.00]
+; BROADWELL-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [17:2.00]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rsqrtps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rsqrtps:
+; SKX: # %bb.0:
+; SKX-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [11:1.00]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_rsqrtps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [7:2.00]
; BTVER2-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vrsqrtps (%rdi), %ymm1 # sched: [12:0.50]
; ZNVER1-NEXT: vrsqrtps %ymm0, %ymm0 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %2)
@@ -2251,33 +4268,61 @@ define <8 x float> @test_rsqrtps(<8 x float> %a0, <8 x float> *%a1) {
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_shufpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; GENERIC-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_shufpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shufpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shufpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shufpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_shufpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[2],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],mem[0],ymm1[2],mem[3] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 4, i32 2, i32 7>
@@ -2286,29 +4331,53 @@ define <4 x double> @test_shufpd(<4 x double> %a0, <4 x double> %a1, <4 x double
}
define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind {
+; GENERIC-LABEL: test_shufps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; GENERIC-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_shufps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
-; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shufps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; BROADWELL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shufps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; SKYLAKE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shufps:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:1.00]
+; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_shufps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50]
; BTVER2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] sched: [1:0.50]
; ZNVER1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],mem[0,0],ymm0[4,7],mem[4,4] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 3, i32 8, i32 8, i32 4, i32 7, i32 12, i32 12>
@@ -2316,33 +4385,61 @@ define <8 x float> @test_shufps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
}
define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_sqrtpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [52:2.00]
+; GENERIC-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [45:2.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_sqrtpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [52:2.00]
+; SANDY-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [45:2.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [32:2.00]
-; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [28:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [42:2.00]
+; HASWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [35:2.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sqrtpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [34:2.00]
+; BROADWELL-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [40:2.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sqrtpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [18:1.00]
+; SKYLAKE-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [25:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sqrtpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [18:1.00]
+; SKX-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [25:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sqrtpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [59:54.00]
; BTVER2-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [54:54.00]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [27:1.00]
-; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [20:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vsqrtpd (%rdi), %ymm1 # sched: [47:47.00]
+; ZNVER1-NEXT: vsqrtpd %ymm0, %ymm0 # sched: [40:40.00]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
%2 = load <4 x double>, <4 x double> *%a1, align 32
%3 = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %2)
@@ -2352,33 +4449,61 @@ define <4 x double> @test_sqrtpd(<4 x double> %a0, <4 x double> *%a1) {
declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
+; GENERIC-LABEL: test_sqrtps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsqrtps (%rdi), %ymm1 # sched: [36:2.00]
+; GENERIC-NEXT: vsqrtps %ymm0, %ymm0 # sched: [29:2.00]
+; GENERIC-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_sqrtps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vsqrtps (%rdi), %ymm1 # sched: [36:2.00]
+; SANDY-NEXT: vsqrtps %ymm0, %ymm0 # sched: [29:2.00]
; SANDY-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [23:2.00]
-; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [19:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [28:2.00]
+; HASWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:2.00]
; HASWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sqrtps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsqrtps %ymm0, %ymm0 # sched: [21:2.00]
+; BROADWELL-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:2.00]
+; BROADWELL-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sqrtps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsqrtps %ymm0, %ymm0 # sched: [12:1.00]
+; SKYLAKE-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SKYLAKE-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sqrtps:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtps %ymm0, %ymm0 # sched: [12:1.00]
+; SKX-NEXT: vsqrtps (%rdi), %ymm1 # sched: [19:1.00]
+; SKX-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sqrtps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsqrtps (%rdi), %ymm1 # sched: [47:42.00]
; BTVER2-NEXT: vsqrtps %ymm0, %ymm0 # sched: [42:42.00]
; BTVER2-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [27:1.00]
-; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [20:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vsqrtps (%rdi), %ymm1 # sched: [35:35.00]
+; ZNVER1-NEXT: vsqrtps %ymm0, %ymm0 # sched: [28:28.00]
; ZNVER1-NEXT: vaddps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
%2 = load <8 x float>, <8 x float> *%a1, align 32
%3 = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %2)
@@ -2388,29 +4513,53 @@ define <8 x float> @test_sqrtps(<8 x float> %a0, <8 x float> *%a1) {
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_subpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_subpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_subpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_subpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_subpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_subpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubpd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fsub <4 x double> %a0, %a1
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = fsub <4 x double> %1, %2
@@ -2418,29 +4567,53 @@ define <4 x double> @test_subpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_subps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_subps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_subps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_subps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_subps:
+; SKX: # %bb.0:
+; SKX-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_subps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsubps %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fsub <8 x float> %a0, %a1
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = fsub <8 x float> %1, %2
@@ -2448,41 +4621,77 @@ define <8 x float> @test_subps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
+; GENERIC-LABEL: test_testpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: adcl $0, %eax # sched: [2:0.67]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_testpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:0.50]
+; SANDY-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: adcl $0, %eax # sched: [2:0.67]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_testpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: setb %al # sched: [1:0.50]
+; BROADWELL-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_testpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: vtestpd %xmm1, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: setb %al # sched: [1:0.50]
+; SKYLAKE-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_testpd:
+; SKX: # %bb.0:
+; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vtestpd %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: setb %al # sched: [1:0.50]
+; SKX-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:1.00]
+; SKX-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_testpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
-; BTVER2-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vtestpd %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setb %al # sched: [1:0.50]
-; BTVER2-NEXT: vtestpd (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_testpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vtestpd %xmm1, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: setb %al # sched: [1:0.25]
; ZNVER1-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %2)
@@ -2492,44 +4701,84 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_testpd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: adcl $0, %eax # sched: [2:0.67]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_testpd_ymm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
-; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:0.50]
+; SANDY-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
+; SANDY-NEXT: adcl $0, %eax # sched: [2:0.67]
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testpd_ymm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_testpd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: setb %al # sched: [1:0.50]
+; BROADWELL-NEXT: vtestpd (%rdi), %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_testpd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: vtestpd %ymm1, %ymm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: setb %al # sched: [1:0.50]
+; SKYLAKE-NEXT: vtestpd (%rdi), %ymm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_testpd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vtestpd %ymm1, %ymm0 # sched: [2:1.00]
+; SKX-NEXT: setb %al # sched: [1:0.50]
+; SKX-NEXT: vtestpd (%rdi), %ymm0 # sched: [9:1.00]
+; SKX-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_testpd_ymm:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
-; BTVER2-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vtestpd %ymm1, %ymm0 # sched: [4:2.00]
; BTVER2-NEXT: setb %al # sched: [1:0.50]
-; BTVER2-NEXT: vtestpd (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vtestpd (%rdi), %ymm0 # sched: [9:2.00]
; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_testpd_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vtestpd %ymm1, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: setb %al # sched: [1:0.25]
; ZNVER1-NEXT: vtestpd (%rdi), %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %2)
@@ -2539,41 +4788,77 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
+; GENERIC-LABEL: test_testps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: adcl $0, %eax # sched: [2:0.67]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_testps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:0.50]
+; SANDY-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: adcl $0, %eax # sched: [2:0.67]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_testps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: vtestps %xmm1, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: setb %al # sched: [1:0.50]
+; BROADWELL-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_testps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: vtestps %xmm1, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: setb %al # sched: [1:0.50]
+; SKYLAKE-NEXT: vtestps (%rdi), %xmm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_testps:
+; SKX: # %bb.0:
+; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vtestps %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: setb %al # sched: [1:0.50]
+; SKX-NEXT: vtestps (%rdi), %xmm0 # sched: [8:1.00]
+; SKX-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_testps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
-; BTVER2-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vtestps %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setb %al # sched: [1:0.50]
-; BTVER2-NEXT: vtestps (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vtestps (%rdi), %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_testps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vtestps %xmm1, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: setb %al # sched: [1:0.25]
; ZNVER1-NEXT: vtestps (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %2)
@@ -2583,44 +4868,84 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_testps_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: adcl $0, %eax # sched: [2:0.67]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_testps_ymm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: xorl %eax, %eax # sched: [1:0.33]
-; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
-; SANDY-NEXT: adcl $0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: setb %al # sched: [1:0.50]
+; SANDY-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
+; SANDY-NEXT: adcl $0, %eax # sched: [2:0.67]
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_testps_ymm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: adcl $0, %eax # sched: [2:0.50]
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_testps_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: vtestps %ymm1, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: setb %al # sched: [1:0.50]
+; BROADWELL-NEXT: vtestps (%rdi), %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: adcl $0, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_testps_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: vtestps %ymm1, %ymm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: setb %al # sched: [1:0.50]
+; SKYLAKE-NEXT: vtestps (%rdi), %ymm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_testps_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vtestps %ymm1, %ymm0 # sched: [2:1.00]
+; SKX-NEXT: setb %al # sched: [1:0.50]
+; SKX-NEXT: vtestps (%rdi), %ymm0 # sched: [9:1.00]
+; SKX-NEXT: adcl $0, %eax # sched: [1:0.50]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_testps_ymm:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50]
-; BTVER2-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.50]
+; BTVER2-NEXT: vtestps %ymm1, %ymm0 # sched: [4:2.00]
; BTVER2-NEXT: setb %al # sched: [1:0.50]
-; BTVER2-NEXT: vtestps (%rdi), %ymm0 # sched: [6:1.00]
+; BTVER2-NEXT: vtestps (%rdi), %ymm0 # sched: [9:2.00]
; BTVER2-NEXT: adcl $0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_testps_ymm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: xorl %eax, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vtestps %ymm1, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: setb %al # sched: [1:0.25]
; ZNVER1-NEXT: vtestps (%rdi), %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: adcl $0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %2)
@@ -2630,33 +4955,61 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2)
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_unpckhpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_unpckhpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
+; SANDY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpckhpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpckhpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpckhpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpckhpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; BTVER2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],mem[1],ymm1[3],mem[3] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2665,29 +5018,53 @@ define <4 x double> @test_unpckhpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
}
define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind {
+; GENERIC-LABEL: test_unpckhps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_unpckhps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpckhps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; BROADWELL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpckhps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpckhps:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpckhps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50]
; BTVER2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -2695,33 +5072,61 @@ define <8 x float> @test_unpckhps(<8 x float> %a0, <8 x float> %a1, <8 x float>
}
define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_unpcklpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_unpcklpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [5:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpcklpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; BROADWELL-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpcklpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpcklpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; BTVER2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
%2 = load <4 x double>, <4 x double> *%a2, align 32
%3 = shufflevector <4 x double> %a1, <4 x double> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -2730,29 +5135,53 @@ define <4 x double> @test_unpcklpd(<4 x double> %a0, <4 x double> %a1, <4 x doub
}
define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) nounwind {
+; GENERIC-LABEL: test_unpcklps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_unpcklps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpcklps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; BROADWELL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpcklps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpcklps:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpcklps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50]
; BTVER2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
%2 = load <8 x float>, <8 x float> *%a2, align 32
%3 = shufflevector <8 x float> %1, <8 x float> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -2760,33 +5189,61 @@ define <8 x float> @test_unpcklps(<8 x float> %a0, <8 x float> %a1, <8 x float>
}
define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a2) {
+; GENERIC-LABEL: test_xorpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_xorpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xorpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xorpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xorpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_xorpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vxorpd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vxorpd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x double> %a0 to <4 x i64>
%2 = bitcast <4 x double> %a1 to <4 x i64>
%3 = xor <4 x i64> %1, %2
@@ -2799,33 +5256,61 @@ define <4 x double> @test_xorpd(<4 x double> %a0, <4 x double> %a1, <4 x double>
}
define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_xorps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_xorps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xorps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xorps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xorps:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_xorps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vxorps %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vxorps (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <8 x float> %a0 to <4 x i64>
%2 = bitcast <8 x float> %a1 to <4 x i64>
%3 = xor <4 x i64> %1, %2
@@ -2838,50 +5323,90 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a
}
define void @test_zeroall() {
+; GENERIC-LABEL: test_zeroall:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vzeroall # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_zeroall:
-; SANDY: # BB#0:
-; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vzeroall # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroall:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vzeroall # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vzeroall # sched: [16:16.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_zeroall:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vzeroall # sched: [16:16.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_zeroall:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vzeroall # sched: [16:4.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_zeroall:
+; SKX: # %bb.0:
+; SKX-NEXT: vzeroall # sched: [16:4.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_zeroall:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vzeroall # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vzeroall # sched: [90:?]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_zeroall:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vzeroall # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
call void @llvm.x86.avx.vzeroall()
ret void
}
declare void @llvm.x86.avx.vzeroall() nounwind
define void @test_zeroupper() {
+; GENERIC-LABEL: test_zeroupper:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; SANDY-LABEL: test_zeroupper:
-; SANDY: # BB#0:
-; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vzeroupper # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_zeroupper:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vzeroupper # sched: [1:0.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_zeroupper:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_zeroupper:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_zeroupper:
+; SKX: # %bb.0:
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_zeroupper:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vzeroupper # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vzeroupper # sched: [46:?]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_zeroupper:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
call void @llvm.x86.avx.vzeroupper()
ret void
}
diff --git a/test/CodeGen/X86/avx-select.ll b/test/CodeGen/X86/avx-select.ll
index cdd3180d6245..ea64973eb9d7 100644
--- a/test/CodeGen/X86/avx-select.ll
+++ b/test/CodeGen/X86/avx-select.ll
@@ -1,17 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
-; CHECK-LABEL: select00:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: cmpl $255, %edi
-; CHECK-NEXT: je LBB0_2
-; CHECK-NEXT: ## BB#1:
-; CHECK-NEXT: vmovaps %ymm0, %ymm1
-; CHECK-NEXT: LBB0_2:
-; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X86-LABEL: select00:
+; X86: # %bb.0:
+; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp)
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: je .LBB0_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovaps %ymm0, %ymm1
+; X86-NEXT: .LBB0_2:
+; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: select00:
+; X64: # %bb.0:
+; X64-NEXT: cmpl $255, %edi
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: je .LBB0_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: vmovaps %ymm0, %ymm1
+; X64-NEXT: .LBB0_2:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i32> zeroinitializer, <8 x i32> %b
%res = xor <8 x i32> %b, %selres
@@ -19,16 +31,27 @@ define <8 x i32> @select00(i32 %a, <8 x i32> %b) nounwind {
}
define <4 x i64> @select01(i32 %a, <4 x i64> %b) nounwind {
-; CHECK-LABEL: select01:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: cmpl $255, %edi
-; CHECK-NEXT: je LBB1_2
-; CHECK-NEXT: ## BB#1:
-; CHECK-NEXT: vmovaps %ymm0, %ymm1
-; CHECK-NEXT: LBB1_2:
-; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X86-LABEL: select01:
+; X86: # %bb.0:
+; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp)
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: je .LBB1_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovaps %ymm0, %ymm1
+; X86-NEXT: .LBB1_2:
+; X86-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: select01:
+; X64: # %bb.0:
+; X64-NEXT: cmpl $255, %edi
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: je .LBB1_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: vmovaps %ymm0, %ymm1
+; X64-NEXT: .LBB1_2:
+; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <4 x i64> zeroinitializer, <4 x i64> %b
%res = xor <4 x i64> %b, %selres
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index b65412d99eb4..ee6ca2224ea4 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -4,7 +4,7 @@
;;; Shift left
define <8 x i32> @vshift00(<8 x i32> %a) {
; CHECK-LABEL: vshift00:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpslld $2, %xmm0, %xmm0
@@ -16,7 +16,7 @@ define <8 x i32> @vshift00(<8 x i32> %a) {
define <16 x i16> @vshift01(<16 x i16> %a) {
; CHECK-LABEL: vshift01:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsllw $2, %xmm0, %xmm0
@@ -28,7 +28,7 @@ define <16 x i16> @vshift01(<16 x i16> %a) {
define <4 x i64> @vshift02(<4 x i64> %a) {
; CHECK-LABEL: vshift02:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsllq $2, %xmm0, %xmm0
@@ -41,7 +41,7 @@ define <4 x i64> @vshift02(<4 x i64> %a) {
;;; Logical Shift right
define <8 x i32> @vshift03(<8 x i32> %a) {
; CHECK-LABEL: vshift03:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsrld $2, %xmm0, %xmm0
@@ -53,7 +53,7 @@ define <8 x i32> @vshift03(<8 x i32> %a) {
define <16 x i16> @vshift04(<16 x i16> %a) {
; CHECK-LABEL: vshift04:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsrlw $2, %xmm0, %xmm0
@@ -65,7 +65,7 @@ define <16 x i16> @vshift04(<16 x i16> %a) {
define <4 x i64> @vshift05(<4 x i64> %a) {
; CHECK-LABEL: vshift05:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsrlq $2, %xmm0, %xmm0
@@ -78,7 +78,7 @@ define <4 x i64> @vshift05(<4 x i64> %a) {
;;; Arithmetic Shift right
define <8 x i32> @vshift06(<8 x i32> %a) {
; CHECK-LABEL: vshift06:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsrad $2, %xmm0, %xmm0
@@ -90,7 +90,7 @@ define <8 x i32> @vshift06(<8 x i32> %a) {
define <16 x i16> @vshift07(<16 x i16> %a) {
; CHECK-LABEL: vshift07:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw $2, %xmm0, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpsraw $2, %xmm0, %xmm0
@@ -102,7 +102,7 @@ define <16 x i16> @vshift07(<16 x i16> %a) {
define <32 x i8> @vshift09(<32 x i8> %a) {
; CHECK-LABEL: vshift09:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsrlw $2, %xmm1, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
@@ -122,7 +122,7 @@ define <32 x i8> @vshift09(<32 x i8> %a) {
define <32 x i8> @vshift10(<32 x i8> %a) {
; CHECK-LABEL: vshift10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
@@ -135,7 +135,7 @@ define <32 x i8> @vshift10(<32 x i8> %a) {
define <32 x i8> @vshift11(<32 x i8> %a) {
; CHECK-LABEL: vshift11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsrlw $2, %xmm1, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
@@ -150,7 +150,7 @@ define <32 x i8> @vshift11(<32 x i8> %a) {
define <32 x i8> @vshift12(<32 x i8> %a) {
; CHECK-LABEL: vshift12:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpsllw $2, %xmm1, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
@@ -166,7 +166,7 @@ define <32 x i8> @vshift12(<32 x i8> %a) {
;;; Support variable shifts
define <8 x i32> @vshift08(<8 x i32> %a) {
; CHECK-LABEL: vshift08:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $23, %xmm0, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -184,7 +184,7 @@ define <8 x i32> @vshift08(<8 x i32> %a) {
; PR15141
define <4 x i32> @vshift13(<4 x i32> %in) {
; CHECK-LABEL: vshift13:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
@@ -194,7 +194,7 @@ define <4 x i32> @vshift13(<4 x i32> %in) {
;;; Uses shifts for sign extension
define <16 x i16> @sext_v16i16(<16 x i16> %a) {
; CHECK-LABEL: sext_v16i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $8, %xmm0, %xmm1
; CHECK-NEXT: vpsraw $8, %xmm1, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -209,7 +209,7 @@ define <16 x i16> @sext_v16i16(<16 x i16> %a) {
define <8 x i32> @sext_v8i32(<8 x i32> %a) {
; CHECK-LABEL: sext_v8i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $16, %xmm0, %xmm1
; CHECK-NEXT: vpsrad $16, %xmm1, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index 6defe7efb941..8c01c440d757 100755
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -4,9 +4,9 @@
; Avoid unnecessary vinsertf128
define <4 x i64> @test1(<4 x i64> %a) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: retl
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
ret <4 x i64>%b
@@ -14,7 +14,7 @@ define <4 x i64> @test1(<4 x i64> %a) nounwind {
define <8 x i16> @test2(<4 x i16>* %v) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 91d1f64c6706..da547397c6ce 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -3,7 +3,7 @@
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcA:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -14,7 +14,7 @@ entry:
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcB:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -26,7 +26,7 @@ entry:
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcC:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovq %rdi, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -41,7 +41,7 @@ entry:
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcD:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -58,12 +58,12 @@ entry:
;
define <8 x float> @funcE() nounwind {
; CHECK-LABEL: funcE:
-; CHECK: # BB#0: # %for_exit499
+; CHECK: # %bb.0: # %for_exit499
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: # implicit-def: %YMM0
; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: # implicit-def: %ymm0
; CHECK-NEXT: jne .LBB4_2
-; CHECK-NEXT: # BB#1: # %load.i1247
+; CHECK-NEXT: # %bb.1: # %load.i1247
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: andq $-32, %rsp
@@ -99,7 +99,7 @@ __load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_ex
define <8 x float> @funcF(i32 %val) nounwind {
; CHECK-LABEL: funcF:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -112,7 +112,7 @@ define <8 x float> @funcF(i32 %val) nounwind {
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcG:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -123,7 +123,7 @@ entry:
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: funcH:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5]
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: retq
@@ -134,7 +134,7 @@ entry:
define <2 x double> @splat_load_2f64_11(<2 x double>* %ptr) {
; CHECK-LABEL: splat_load_2f64_11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %ptr
@@ -144,7 +144,7 @@ define <2 x double> @splat_load_2f64_11(<2 x double>* %ptr) {
define <4 x double> @splat_load_4f64_2222(<4 x double>* %ptr) {
; CHECK-LABEL: splat_load_4f64_2222:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
; CHECK-NEXT: retq
%x = load <4 x double>, <4 x double>* %ptr
@@ -154,7 +154,7 @@ define <4 x double> @splat_load_4f64_2222(<4 x double>* %ptr) {
define <4 x float> @splat_load_4f32_0000(<4 x float>* %ptr) {
; CHECK-LABEL: splat_load_4f32_0000:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
; CHECK-NEXT: retq
%x = load <4 x float>, <4 x float>* %ptr
@@ -164,7 +164,7 @@ define <4 x float> @splat_load_4f32_0000(<4 x float>* %ptr) {
define <8 x float> @splat_load_8f32_77777777(<8 x float>* %ptr) {
; CHECK-LABEL: splat_load_8f32_77777777:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss 28(%rdi), %ymm0
; CHECK-NEXT: retq
%x = load <8 x float>, <8 x float>* %ptr
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 1a9acd007778..f1af384ce473 100755
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -3,7 +3,7 @@
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
; CHECK-LABEL: trunc_64_32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; CHECK-NEXT: vzeroupper
@@ -14,7 +14,7 @@ define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
; CHECK-LABEL: trunc_32_16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -28,7 +28,7 @@ define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
; CHECK-LABEL: trunc_16_8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/avx-unpack.ll b/test/CodeGen/X86/avx-unpack.ll
index 7826bc97eec5..8c7098097566 100644
--- a/test/CodeGen/X86/avx-unpack.ll
+++ b/test/CodeGen/X86/avx-unpack.ll
@@ -3,7 +3,7 @@
define <8 x float> @unpackhips(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhips:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -12,7 +12,7 @@ define <8 x float> @unpackhips(<8 x float> %src1, <8 x float> %src2) nounwind uw
define <4 x double> @unpackhipd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhipd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -21,7 +21,7 @@ define <4 x double> @unpackhipd(<4 x double> %src1, <4 x double> %src2) nounwind
define <8 x float> @unpacklops(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklops:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x float> %src1, <8 x float> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -30,7 +30,7 @@ define <8 x float> @unpacklops(<8 x float> %src1, <8 x float> %src2) nounwind uw
define <4 x double> @unpacklopd(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklopd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -39,7 +39,7 @@ define <4 x double> @unpacklopd(<4 x double> %src1, <4 x double> %src2) nounwind
define <8 x float> @unpacklops_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklops_not:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
@@ -50,9 +50,9 @@ define <8 x float> @unpacklops_not(<8 x float> %src1, <8 x float> %src2) nounwin
define <4 x double> @unpacklopd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklopd_not:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -61,7 +61,7 @@ define <4 x double> @unpacklopd_not(<4 x double> %src1, <4 x double> %src2) noun
define <8 x float> @unpackhips_not(<8 x float> %src1, <8 x float> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhips_not:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,2,u,3,u,4,u,5]
; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,u,3,u,4,u,5,u]
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
@@ -72,11 +72,11 @@ define <8 x float> @unpackhips_not(<8 x float> %src1, <8 x float> %src2) nounwin
define <4 x double> @unpackhipd_not(<4 x double> %src1, <4 x double> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhipd_not:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x double> %src1, <4 x double> %src2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -89,7 +89,7 @@ define <4 x double> @unpackhipd_not(<4 x double> %src1, <4 x double> %src2) noun
define <8 x i32> @unpackhips1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhips1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -98,7 +98,7 @@ define <8 x i32> @unpackhips1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable
define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhips2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; CHECK-NEXT: retq
@@ -110,7 +110,7 @@ define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtab
define <4 x i64> @unpackhipd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhipd1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -119,8 +119,8 @@ define <4 x i64> @unpackhipd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable
define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhipd2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
@@ -131,7 +131,7 @@ define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtab
define <8 x i32> @unpacklops1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklops1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -140,7 +140,7 @@ define <8 x i32> @unpacklops1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable
define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklops2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; CHECK-NEXT: retq
@@ -152,7 +152,7 @@ define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtab
define <4 x i64> @unpacklopd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklopd1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: retq
%shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -161,8 +161,8 @@ define <4 x i64> @unpacklopd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable
define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklopd2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
; CHECK-NEXT: retq
%a = load <4 x i64>, <4 x i64>* %src1
@@ -173,7 +173,7 @@ define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtab
define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhwd_undef:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
@@ -185,7 +185,7 @@ define <16 x i16> @unpackhwd_undef(<16 x i16> %src1) nounwind uwtable readnone s
define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklwd_undef:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -197,7 +197,7 @@ define <16 x i16> @unpacklwd_undef(<16 x i16> %src1) nounwind uwtable readnone s
define <32 x i8> @unpackhbw_undef(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpackhbw_undef:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -209,7 +209,7 @@ define <32 x i8> @unpackhbw_undef(<32 x i8> %src1, <32 x i8> %src2) nounwind uwt
define <32 x i8> @unpacklbw_undef(<32 x i8> %src1) nounwind uwtable readnone ssp {
; CHECK-LABEL: unpacklbw_undef:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 41ea2a8c3677..5a9f23007d86 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -4,7 +4,7 @@
define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: A:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %eax
@@ -16,7 +16,7 @@ define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: A:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -30,7 +30,7 @@ entry:
define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: A2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
@@ -45,7 +45,7 @@ define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: A2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: movq %rax, (%rsi)
@@ -64,13 +64,13 @@ entry:
define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -84,13 +84,13 @@ entry:
define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: B2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -108,22 +108,22 @@ entry:
define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: B3:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %ecx
; X32-NEXT: vmovd %ecx, %xmm0
; X32-NEXT: movl %ecx, (%eax)
-; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: B3:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: movl %eax, (%rsi)
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -142,13 +142,13 @@ entry:
define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: C:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: C:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -162,7 +162,7 @@ entry:
define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: C2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -172,7 +172,7 @@ define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone s
; X32-NEXT: retl
;
; X64-LABEL: C2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd %xmm0, (%rsi)
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
@@ -190,13 +190,13 @@ entry:
define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -210,13 +210,13 @@ entry:
define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: D2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -234,7 +234,7 @@ entry:
define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: D3:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -244,7 +244,7 @@ define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp
; X32-NEXT: retl
;
; X64-LABEL: D3:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
@@ -268,13 +268,13 @@ entry:
define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: e:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: e:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -288,7 +288,7 @@ entry:
define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: e2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -297,7 +297,7 @@ define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp
; X32-NEXT: retl
;
; X64-LABEL: e2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: vmovss %xmm0, (%rsi)
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
@@ -315,12 +315,12 @@ entry:
; Don't broadcast constants on pre-AVX2 hardware.
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.812500e-03,-7.812500e-03,-7.812500e-03,-7.812500e-03]
; X64-NEXT: retq
entry:
@@ -334,13 +334,13 @@ entry:
define <4 x i32> @F(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: F:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: F:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -354,7 +354,7 @@ entry:
define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: F2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %ecx
@@ -364,7 +364,7 @@ define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: F2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl %eax, (%rsi)
; X64-NEXT: vmovd %eax, %xmm0
@@ -384,14 +384,14 @@ entry:
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
-; X64: ## BB#0: ## %entry
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; X64: ## %bb.0: ## %entry
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
; X64-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
@@ -401,13 +401,13 @@ entry:
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -418,13 +418,13 @@ entry:
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -435,13 +435,13 @@ entry:
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 4(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -452,13 +452,13 @@ entry:
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -469,13 +469,13 @@ entry:
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -486,14 +486,14 @@ entry:
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
-; X64: ## BB#0: ## %entry
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
+; X64: ## %bb.0: ## %entry
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; X64-NEXT: retq
entry:
%ld = load <2 x i64>, <2 x i64>* %ptr
@@ -503,13 +503,13 @@ entry:
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -520,13 +520,13 @@ entry:
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -537,13 +537,13 @@ entry:
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
@@ -554,13 +554,13 @@ entry:
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -571,13 +571,13 @@ entry:
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -590,7 +590,7 @@ entry:
define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: G:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %eax
@@ -601,9 +601,9 @@ define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: G:
-; X64: ## BB#0: ## %entry
-; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64: ## %bb.0: ## %entry
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
entry:
%q = load i64, i64* %ptr, align 8
@@ -614,7 +614,7 @@ entry:
define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: G2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl (%ecx), %edx
@@ -628,7 +628,7 @@ define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: G2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq %rax, (%rsi)
; X64-NEXT: vmovq %rax, %xmm0
@@ -644,13 +644,13 @@ entry:
define <4 x i32> @H(<4 x i32> %a) {
; X32-LABEL: H:
-; X32: ## BB#0: ## %entry
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32: ## %bb.0: ## %entry
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: H:
-; X64: ## BB#0: ## %entry
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64: ## %bb.0: ## %entry
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: retq
entry:
%x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -659,13 +659,13 @@ entry:
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: I:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
@@ -677,7 +677,7 @@ entry:
define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
; X32-LABEL: I2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -686,7 +686,7 @@ define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone s
; X32-NEXT: retl
;
; X64-LABEL: I2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: vmovsd %xmm0, (%rsi)
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
@@ -701,7 +701,7 @@ entry:
define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vbroadcastss (%ecx), %xmm0
@@ -710,7 +710,7 @@ define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: _RR:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: movl %eax, (%rax)
@@ -729,13 +729,13 @@ entry:
define <4 x float> @_RR2(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
; X32-LABEL: _RR2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _RR2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -751,13 +751,13 @@ entry:
define <8 x float> @splat_concat1(float* %p) {
; X32-LABEL: splat_concat1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, float* %p, align 4
@@ -771,13 +771,13 @@ define <8 x float> @splat_concat1(float* %p) {
define <8 x float> @splat_concat2(float* %p) {
; X32-LABEL: splat_concat2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
%1 = load float, float* %p, align 4
@@ -795,13 +795,13 @@ define <8 x float> @splat_concat2(float* %p) {
define <4 x double> @splat_concat3(double* %p) {
; X32-LABEL: splat_concat3:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p, align 8
@@ -813,13 +813,13 @@ define <4 x double> @splat_concat3(double* %p) {
define <4 x double> @splat_concat4(double* %p) {
; X32-LABEL: splat_concat4:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat4:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load double, double* %p, align 8
@@ -831,12 +831,48 @@ define <4 x double> @splat_concat4(double* %p) {
ret <4 x double> %6
}
+; PR34041
+define <4 x double> @broadcast_shuffle_1000(double* %p) {
+; X32-LABEL: broadcast_shuffle_1000:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: broadcast_shuffle_1000:
+; X64: ## %bb.0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load double, double* %p
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+ ret <4 x double> %3
+}
+
+define <4 x double> @broadcast_shuffle1032(double* %p) {
+; X32-LABEL: broadcast_shuffle1032:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastsd (%eax), %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: broadcast_shuffle1032:
+; X64: ## %bb.0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
+ %1 = load double, double* %p
+ %2 = insertelement <2 x double> undef, double %1, i32 1
+ %3 = insertelement <2 x double> undef, double %1, i32 0
+ %4 = shufflevector <2 x double> %2, <2 x double> %3, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x double> %4
+}
+
;
; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies.
;
define float @broadcast_lifetime() nounwind {
; X32-LABEL: broadcast_lifetime:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: subl $56, %esp
; X32-NEXT: leal {{[0-9]+}}(%esp), %esi
@@ -858,7 +894,7 @@ define float @broadcast_lifetime() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: broadcast_lifetime:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: subq $40, %rsp
; X64-NEXT: movq %rsp, %rdi
; X64-NEXT: callq _gfunc
diff --git a/test/CodeGen/X86/avx-vbroadcastf128.ll b/test/CodeGen/X86/avx-vbroadcastf128.ll
index 426ff8159590..7fdbf31a9931 100644
--- a/test/CodeGen/X86/avx-vbroadcastf128.ll
+++ b/test/CodeGen/X86/avx-vbroadcastf128.ll
@@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
@@ -20,13 +20,13 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-LABEL: test_broadcast_2i64_4i64:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2i64_4i64:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
@@ -36,13 +36,13 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float> *%p
@@ -52,13 +52,13 @@ define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-LABEL: test_broadcast_4i32_8i32:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4i32_8i32:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32> *%p
@@ -68,13 +68,13 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_8i16_16i16:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p
@@ -84,13 +84,13 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_16i8_32i8:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p
@@ -100,7 +100,7 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
@@ -109,7 +109,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vmovaps %xmm1, (%rsi)
@@ -122,7 +122,7 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
@@ -131,7 +131,7 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vmovaps %xmm1, (%rsi)
@@ -144,7 +144,7 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
@@ -153,7 +153,7 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vmovaps %xmm1, (%rsi)
@@ -166,7 +166,7 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>
define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
@@ -175,7 +175,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vmovaps %xmm1, (%rsi)
@@ -188,7 +188,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1)
define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
@@ -197,7 +197,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vmovaps %xmm1, (%rsi)
@@ -210,7 +210,7 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
@@ -219,7 +219,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vmovaps %xmm1, (%rsi)
@@ -232,19 +232,19 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X32-LABEL: PR29088:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
-; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vmovaps %ymm1, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: PR29088:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovaps %ymm1, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index 2feddddaf780..924e510338ec 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -3,7 +3,7 @@
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: A:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -13,7 +13,7 @@ entry:
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: B:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -23,7 +23,7 @@ entry:
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: t0:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -36,7 +36,7 @@ entry:
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
; CHECK-LABEL: t2:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -49,7 +49,7 @@ entry:
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; CHECK-LABEL: t4:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -63,7 +63,7 @@ entry:
define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: t5:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmovaps %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -76,7 +76,7 @@ entry:
define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
; CHECK-LABEL: t6:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmovaps %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -89,7 +89,7 @@ entry:
define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; CHECK-LABEL: t7:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmovaps %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -103,7 +103,7 @@ entry:
define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; CHECK-LABEL: t8:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmovups %xmm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -118,8 +118,8 @@ entry:
; PR15462
define void @t9(i64* %p) {
; CHECK-LABEL: t9:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %ymm0, (%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx-vinsertf128.ll b/test/CodeGen/X86/avx-vinsertf128.ll
index b7a4d5b5c308..13b47c3d650f 100644
--- a/test/CodeGen/X86/avx-vinsertf128.ll
+++ b/test/CodeGen/X86/avx-vinsertf128.ll
@@ -3,7 +3,7 @@
define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: A:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
@@ -12,7 +12,7 @@ define <8 x float> @A(<8 x float> %a) nounwind uwtable readnone ssp {
define <4 x double> @B(<4 x double> %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: B:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 1>
@@ -24,7 +24,7 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define void @insert_crash() nounwind {
; CHECK-LABEL: insert_crash:
-; CHECK: # BB#0: # %allocas
+; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0
@@ -49,7 +49,7 @@ allocas:
define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
; CHECK-LABEL: DAGCombineA:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%t1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%t2 = shufflevector <8 x i32> %t1, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -58,7 +58,7 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
; CHECK-LABEL: DAGCombineB:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -74,8 +74,8 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
define <4 x double> @insert_undef_pd(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: insert_undef_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> undef, <2 x double> %a1, i8 0)
@@ -85,8 +85,8 @@ declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>
define <8 x float> @insert_undef_ps(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: insert_undef_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> undef, <4 x float> %a1, i8 0)
@@ -96,8 +96,8 @@ declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i
define <8 x i32> @insert_undef_si(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: insert_undef_si:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> undef, <4 x i32> %a1, i8 0)
@@ -108,7 +108,7 @@ declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nou
; rdar://10643481
define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-LABEL: vinsertf128_combine:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
@@ -121,7 +121,7 @@ define <8 x float> @vinsertf128_combine(float* nocapture %f) nounwind uwtable re
; rdar://11076953
define <8 x float> @vinsertf128_ucombine(float* nocapture %f) nounwind uwtable readonly ssp {
; CHECK-LABEL: vinsertf128_ucombine:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%add.ptr = getelementptr inbounds float, float* %f, i64 4
diff --git a/test/CodeGen/X86/avx-vpclmulqdq.ll b/test/CodeGen/X86/avx-vpclmulqdq.ll
new file mode 100644
index 000000000000..2447ff0907c0
--- /dev/null
+++ b/test/CodeGen/X86/avx-vpclmulqdq.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx,vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX_VPCLMULQDQ
+
+; Check for vpclmulqdq
+define <4 x i64> @test_x86_pclmulqdq(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
+; AVX_VPCLMULQDQ: # %bb.0:
+; AVX_VPCLMULQDQ-NEXT: vpclmulqdq $17, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x44,0xc1,0x11]
+; AVX_VPCLMULQDQ-NEXT: retl # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 17)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index 29b9decea429..42bc6b393698 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -1,22 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=ALL --check-prefix=AVX2
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_45670123:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_45670123:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_45670123:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_45670123_mem:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_45670123_mem:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_45670123_mem:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX2-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
@@ -26,7 +36,7 @@ entry:
define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
entry:
@@ -36,13 +46,13 @@ entry:
define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
-; AVX2: ## BB#0: ## %entry
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -50,10 +60,15 @@ entry:
}
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_01230123_mem:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_01230123_mem:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_01230123_mem:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX2-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
@@ -62,20 +77,30 @@ entry:
}
define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_45674567:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_45674567:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_45674567:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_45674567_mem:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_45674567_mem:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_45674567_mem:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,3,2,3]
+; AVX2-NEXT: retq
entry:
%a = load <8 x float>, <8 x float>* %pa
%b = load <8 x float>, <8 x float>* %pb
@@ -84,10 +109,15 @@ entry:
}
define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v32i8_2323:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_2323:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_2323:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <32 x i8> %shuffle
@@ -95,7 +125,7 @@ entry:
define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
@@ -104,10 +134,10 @@ define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
; add forces execution domain
@@ -118,7 +148,7 @@ entry:
define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT: retq
entry:
@@ -128,14 +158,14 @@ entry:
define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
@@ -149,7 +179,7 @@ entry:
define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
@@ -158,7 +188,7 @@ define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uw
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
@@ -172,14 +202,14 @@ entry:
define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -193,7 +223,7 @@ entry:
define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
@@ -201,7 +231,7 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounw
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
@@ -219,7 +249,7 @@ entry:
define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
@@ -228,10 +258,15 @@ entry:
}
define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_uu67uu67:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_uu67uu67:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_uu67uu67:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x float> %shuffle
@@ -239,7 +274,7 @@ entry:
define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
@@ -249,7 +284,7 @@ entry:
define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
entry:
@@ -258,10 +293,15 @@ entry:
}
define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_uu674567:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_uu674567:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_uu674567:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
@@ -269,7 +309,7 @@ entry:
define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
@@ -278,10 +318,15 @@ entry:
}
define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_4567uu67:
-; ALL: ## BB#0: ## %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_4567uu67:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_4567uu67:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x float> %shuffle
@@ -289,7 +334,7 @@ entry:
define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
entry:
@@ -301,7 +346,7 @@ entry:
define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
-; ALL: ## BB#0: ## %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
@@ -317,7 +362,7 @@ entry:
define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
@@ -325,7 +370,7 @@ define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
@@ -334,8 +379,8 @@ define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -343,8 +388,8 @@ define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -353,7 +398,7 @@ define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -361,7 +406,7 @@ define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -370,8 +415,8 @@ define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -379,8 +424,8 @@ define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -389,8 +434,8 @@ define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -398,8 +443,8 @@ define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -408,7 +453,7 @@ define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -416,7 +461,7 @@ define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -425,8 +470,8 @@ define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
@@ -434,8 +479,8 @@ define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
-; ALL: ## BB#0:
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
@@ -444,7 +489,7 @@ define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -452,7 +497,7 @@ define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
}
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; ALL-NEXT: retq
%s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -463,14 +508,14 @@ define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
-; AVX1: ## BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
-; AVX2: ## BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -483,15 +528,15 @@ define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
-; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
@@ -503,15 +548,15 @@ entry:
define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1]
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
@@ -523,15 +568,15 @@ entry:
define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
@@ -543,15 +588,15 @@ entry:
define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
@@ -563,7 +608,7 @@ entry:
define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -572,7 +617,7 @@ define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtab
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -585,7 +630,7 @@ entry:
define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -594,7 +639,7 @@ define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtab
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -607,7 +652,7 @@ entry:
define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
@@ -617,7 +662,7 @@ define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtab
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -630,7 +675,7 @@ entry:
define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readnone ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
-; AVX1: ## BB#0: ## %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,3,4]
@@ -640,7 +685,7 @@ define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtab
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
-; AVX2: ## BB#0: ## %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index 016ddb9c5e78..e69a2905b0b6 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-YMM-ZMM
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-YMM-ZMM
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2
declare i32 @foo()
@@ -15,7 +15,7 @@ declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind
define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
; ALL-LABEL: test00:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rax
; ALL-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL-NEXT: callq do_sse
@@ -30,7 +30,7 @@ define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind {
define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind {
; VZ-LABEL: test01:
-; VZ: # BB#0:
+; VZ: # %bb.0:
; VZ-NEXT: subq $56, %rsp
; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0
@@ -44,7 +44,7 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
; VZ-NEXT: retq
;
; FAST-YMM-ZMM-LABEL: test01:
-; FAST-YMM-ZMM: # BB#0:
+; FAST-YMM-ZMM: # %bb.0:
; FAST-YMM-ZMM-NEXT: subq $56, %rsp
; FAST-YMM-ZMM-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %xmm0
@@ -57,7 +57,7 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
; FAST-YMM-ZMM-NEXT: retq
;
; BTVER2-LABEL: test01:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: subq $56, %rsp
; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0
; BTVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
@@ -80,16 +80,16 @@ define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounw
define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
; VZ-LABEL: test02:
-; VZ: # BB#0:
+; VZ: # %bb.0:
; VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; VZ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; VZ-NEXT: vzeroupper
; VZ-NEXT: jmp do_sse # TAILCALL
;
; NO-VZ-LABEL: test02:
-; NO-VZ: # BB#0:
+; NO-VZ: # %bb.0:
; NO-VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; NO-VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NO-VZ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; NO-VZ-NEXT: jmp do_sse # TAILCALL
%add.i = fadd <8 x float> %a, %b
%add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
@@ -102,7 +102,7 @@ define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind {
define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test03:
-; VZ: # BB#0: # %entry
+; VZ: # %bb.0: # %entry
; VZ-NEXT: pushq %rbx
; VZ-NEXT: subq $16, %rsp
; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0
@@ -113,7 +113,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-NEXT: callq foo
; VZ-NEXT: testl %eax, %eax
; VZ-NEXT: jne .LBB3_1
-; VZ-NEXT: # BB#2: # %for.body.preheader
+; VZ-NEXT: # %bb.2: # %for.body.preheader
; VZ-NEXT: movl $4, %ebx
; VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; VZ-NEXT: .p2align 4, 0x90
@@ -127,13 +127,13 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-NEXT: callq do_sse
; VZ-NEXT: decl %ebx
; VZ-NEXT: jne .LBB3_3
-; VZ-NEXT: # BB#4: # %for.end
+; VZ-NEXT: # %bb.4: # %for.end
; VZ-NEXT: addq $16, %rsp
; VZ-NEXT: popq %rbx
; VZ-NEXT: retq
;
; FAST-YMM-ZMM-LABEL: test03:
-; FAST-YMM-ZMM: # BB#0: # %entry
+; FAST-YMM-ZMM: # %bb.0: # %entry
; FAST-YMM-ZMM-NEXT: pushq %rbx
; FAST-YMM-ZMM-NEXT: subq $16, %rsp
; FAST-YMM-ZMM-NEXT: vaddps %xmm1, %xmm0, %xmm0
@@ -144,7 +144,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; FAST-YMM-ZMM-NEXT: callq foo
; FAST-YMM-ZMM-NEXT: testl %eax, %eax
; FAST-YMM-ZMM-NEXT: jne .LBB3_1
-; FAST-YMM-ZMM-NEXT: # BB#2: # %for.body.preheader
+; FAST-YMM-ZMM-NEXT: # %bb.2: # %for.body.preheader
; FAST-YMM-ZMM-NEXT: movl $4, %ebx
; FAST-YMM-ZMM-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90
@@ -157,13 +157,13 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; FAST-YMM-ZMM-NEXT: callq do_sse
; FAST-YMM-ZMM-NEXT: decl %ebx
; FAST-YMM-ZMM-NEXT: jne .LBB3_3
-; FAST-YMM-ZMM-NEXT: # BB#4: # %for.end
+; FAST-YMM-ZMM-NEXT: # %bb.4: # %for.end
; FAST-YMM-ZMM-NEXT: addq $16, %rsp
; FAST-YMM-ZMM-NEXT: popq %rbx
; FAST-YMM-ZMM-NEXT: retq
;
; BTVER2-LABEL: test03:
-; BTVER2: # BB#0: # %entry
+; BTVER2: # %bb.0: # %entry
; BTVER2-NEXT: pushq %rbx
; BTVER2-NEXT: subq $16, %rsp
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0
@@ -174,7 +174,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; BTVER2-NEXT: callq foo
; BTVER2-NEXT: testl %eax, %eax
; BTVER2-NEXT: jne .LBB3_1
-; BTVER2-NEXT: # BB#2: # %for.body.preheader
+; BTVER2-NEXT: # %bb.2: # %for.body.preheader
; BTVER2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; BTVER2-NEXT: movl $4, %ebx
; BTVER2-NEXT: .p2align 4, 0x90
@@ -187,7 +187,7 @@ define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind {
; BTVER2-NEXT: callq do_sse
; BTVER2-NEXT: decl %ebx
; BTVER2-NEXT: jne .LBB3_3
-; BTVER2-NEXT: # BB#4: # %for.end
+; BTVER2-NEXT: # %bb.4: # %for.end
; BTVER2-NEXT: addq $16, %rsp
; BTVER2-NEXT: popq %rbx
; BTVER2-NEXT: retq
@@ -220,23 +220,23 @@ for.end:
define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind {
; VZ-LABEL: test04:
-; VZ: # BB#0:
+; VZ: # %bb.0:
; VZ-NEXT: pushq %rax
-; VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; VZ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; VZ-NEXT: callq do_avx
-; VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; VZ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; VZ-NEXT: popq %rax
; VZ-NEXT: vzeroupper
; VZ-NEXT: retq
;
; NO-VZ-LABEL: test04:
-; NO-VZ: # BB#0:
+; NO-VZ: # %bb.0:
; NO-VZ-NEXT: pushq %rax
-; NO-VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; NO-VZ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; NO-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NO-VZ-NEXT: callq do_avx
-; NO-VZ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NO-VZ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; NO-VZ-NEXT: popq %rax
; NO-VZ-NEXT: retq
%shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx1-logical-load-folding.ll b/test/CodeGen/X86/avx1-logical-load-folding.ll
index 7073eb224763..88521dedc1c9 100644
--- a/test/CodeGen/X86/avx1-logical-load-folding.ll
+++ b/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -5,7 +5,7 @@
; Function Attrs: nounwind ssp uwtable
define void @test1(float* %A, float* %C) #0 {
; X86-LABEL: test1:
-; X86: ## BB#0:
+; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %ymm0
@@ -15,7 +15,7 @@ define void @test1(float* %A, float* %C) #0 {
; X86-NEXT: retl
;
; X64-LABEL: test1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
@@ -34,7 +34,7 @@ define void @test1(float* %A, float* %C) #0 {
; Function Attrs: nounwind ssp uwtable
define void @test2(float* %A, float* %C) #0 {
; X86-LABEL: test2:
-; X86: ## BB#0:
+; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %ymm0
@@ -44,7 +44,7 @@ define void @test2(float* %A, float* %C) #0 {
; X86-NEXT: retl
;
; X64-LABEL: test2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
@@ -63,7 +63,7 @@ define void @test2(float* %A, float* %C) #0 {
; Function Attrs: nounwind ssp uwtable
define void @test3(float* %A, float* %C) #0 {
; X86-LABEL: test3:
-; X86: ## BB#0:
+; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %ymm0
@@ -73,7 +73,7 @@ define void @test3(float* %A, float* %C) #0 {
; X86-NEXT: retl
;
; X64-LABEL: test3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
@@ -91,7 +91,7 @@ define void @test3(float* %A, float* %C) #0 {
define void @test4(float* %A, float* %C) #0 {
; X86-LABEL: test4:
-; X86: ## BB#0:
+; X86: ## %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: vmovaps (%ecx), %ymm0
@@ -101,7 +101,7 @@ define void @test4(float* %A, float* %C) #0 {
; X86-NEXT: retl
;
; X64-LABEL: test4:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovaps (%rdi), %ymm0
; X64-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovss %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll
index 9918d6680256..aa625be4ded5 100644
--- a/test/CodeGen/X86/avx2-arith.ll
+++ b/test/CodeGen/X86/avx2-arith.ll
@@ -4,12 +4,12 @@
define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: test_vpaddq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpaddq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = add <4 x i64> %i, %j
@@ -18,12 +18,12 @@ define <4 x i64> @test_vpaddq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: test_vpaddd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpaddd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = add <8 x i32> %i, %j
@@ -32,12 +32,12 @@ define <8 x i32> @test_vpaddd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: test_vpaddw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpaddw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = add <16 x i16> %i, %j
@@ -46,12 +46,12 @@ define <16 x i16> @test_vpaddw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: test_vpaddb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpaddb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = add <32 x i8> %i, %j
@@ -60,12 +60,12 @@ define <32 x i8> @test_vpaddb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: test_vpsubq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpsubq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = sub <4 x i64> %i, %j
@@ -74,12 +74,12 @@ define <4 x i64> @test_vpsubq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: test_vpsubd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpsubd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = sub <8 x i32> %i, %j
@@ -88,12 +88,12 @@ define <8 x i32> @test_vpsubd(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: test_vpsubw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpsubw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = sub <16 x i16> %i, %j
@@ -102,12 +102,12 @@ define <16 x i16> @test_vpsubw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: test_vpsubb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpsubb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = sub <32 x i8> %i, %j
@@ -116,12 +116,12 @@ define <32 x i8> @test_vpsubb(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: test_vpmulld:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpmulld:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = mul <8 x i32> %i, %j
@@ -130,12 +130,12 @@ define <8 x i32> @test_vpmulld(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: test_vpmullw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpmullw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%x = mul <16 x i16> %i, %j
@@ -144,7 +144,7 @@ define <16 x i16> @test_vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; X32-LABEL: mul_v16i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovsxbw %xmm1, %ymm1
; X32-NEXT: vpmovsxbw %xmm0, %ymm0
; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -157,7 +157,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
; X32-NEXT: retl
;
; X64-LABEL: mul_v16i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxbw %xmm1, %ymm1
; X64-NEXT: vpmovsxbw %xmm0, %ymm0
; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -174,7 +174,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: mul_v32i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextracti128 $1, %ymm1, %xmm2
; X32-NEXT: vpmovsxbw %xmm2, %ymm2
; X32-NEXT: vextracti128 $1, %ymm0, %xmm3
@@ -196,7 +196,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-NEXT: retl
;
; X64-LABEL: mul_v32i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextracti128 $1, %ymm1, %xmm2
; X64-NEXT: vpmovsxbw %xmm2, %ymm2
; X64-NEXT: vextracti128 $1, %ymm0, %xmm3
@@ -222,7 +222,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: mul_v4i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlq $32, %ymm0, %ymm2
; X32-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; X32-NEXT: vpsrlq $32, %ymm1, %ymm3
@@ -234,7 +234,7 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-NEXT: retl
;
; X64-LABEL: mul_v4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlq $32, %ymm0, %ymm2
; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; X64-NEXT: vpsrlq $32, %ymm1, %ymm3
@@ -250,12 +250,12 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <8 x i32> @mul_const1(<8 x i32> %x) {
; X32-LABEL: mul_const1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
%y = mul <8 x i32> %x, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -264,12 +264,12 @@ define <8 x i32> @mul_const1(<8 x i32> %x) {
define <4 x i64> @mul_const2(<4 x i64> %x) {
; X32-LABEL: mul_const2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllq $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllq $2, %ymm0, %ymm0
; X64-NEXT: retq
%y = mul <4 x i64> %x, <i64 4, i64 4, i64 4, i64 4>
@@ -278,12 +278,12 @@ define <4 x i64> @mul_const2(<4 x i64> %x) {
define <16 x i16> @mul_const3(<16 x i16> %x) {
; X32-LABEL: mul_const3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $3, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $3, %ymm0, %ymm0
; X64-NEXT: retq
%y = mul <16 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -292,14 +292,14 @@ define <16 x i16> @mul_const3(<16 x i16> %x) {
define <4 x i64> @mul_const4(<4 x i64> %x) {
; X32-LABEL: mul_const4:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const4:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%y = mul <4 x i64> %x, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -308,13 +308,13 @@ define <4 x i64> @mul_const4(<4 x i64> %x) {
define <8 x i32> @mul_const5(<8 x i32> %x) {
; X32-LABEL: mul_const5:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const5:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %y
@@ -322,12 +322,12 @@ define <8 x i32> @mul_const5(<8 x i32> %x) {
define <8 x i32> @mul_const6(<8 x i32> %x) {
; X32-LABEL: mul_const6:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmulld {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const6:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%y = mul <8 x i32> %x, <i32 0, i32 0, i32 0, i32 2, i32 0, i32 2, i32 0, i32 0>
@@ -336,13 +336,13 @@ define <8 x i32> @mul_const6(<8 x i32> %x) {
define <8 x i64> @mul_const7(<8 x i64> %x) {
; X32-LABEL: mul_const7:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; X32-NEXT: retl
;
; X64-LABEL: mul_const7:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT: vpaddq %ymm1, %ymm1, %ymm1
; X64-NEXT: retq
@@ -352,12 +352,12 @@ define <8 x i64> @mul_const7(<8 x i64> %x) {
define <8 x i16> @mul_const8(<8 x i16> %x) {
; X32-LABEL: mul_const8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $3, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $3, %xmm0, %xmm0
; X64-NEXT: retq
%y = mul <8 x i16> %x, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -366,14 +366,14 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; X32-LABEL: mul_const9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $2, %eax
; X32-NEXT: vmovd %eax, %xmm1
; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $2, %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
@@ -385,13 +385,13 @@ define <8 x i32> @mul_const9(<8 x i32> %x) {
; %x * 0x01010101
define <4 x i32> @mul_const10(<4 x i32> %x) {
; X32-LABEL: mul_const10:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009]
; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const10:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009]
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
@@ -402,13 +402,13 @@ define <4 x i32> @mul_const10(<4 x i32> %x) {
; %x * 0x80808080
define <4 x i32> @mul_const11(<4 x i32> %x) {
; X32-LABEL: mul_const11:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152]
; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: mul_const11:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152]
; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/avx2-cmp.ll b/test/CodeGen/X86/avx2-cmp.ll
index 2369aa5ac9a0..2d710e40daf5 100644
--- a/test/CodeGen/X86/avx2-cmp.ll
+++ b/test/CodeGen/X86/avx2-cmp.ll
@@ -4,12 +4,12 @@
define <8 x i32> @v8i32_cmpgt(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: v8i32_cmpgt:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v8i32_cmpgt:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%bincmp = icmp slt <8 x i32> %i, %j
@@ -19,12 +19,12 @@ define <8 x i32> @v8i32_cmpgt(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <4 x i64> @v4i64_cmpgt(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: v4i64_cmpgt:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v4i64_cmpgt:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%bincmp = icmp slt <4 x i64> %i, %j
@@ -34,12 +34,12 @@ define <4 x i64> @v4i64_cmpgt(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <16 x i16> @v16i16_cmpgt(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: v16i16_cmpgt:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v16i16_cmpgt:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%bincmp = icmp slt <16 x i16> %i, %j
@@ -49,12 +49,12 @@ define <16 x i16> @v16i16_cmpgt(<16 x i16> %i, <16 x i16> %j) nounwind readnone
define <32 x i8> @v32i8_cmpgt(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: v32i8_cmpgt:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v32i8_cmpgt:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%bincmp = icmp slt <32 x i8> %i, %j
@@ -64,12 +64,12 @@ define <32 x i8> @v32i8_cmpgt(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; X32-LABEL: int256_cmpeq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: int256_cmpeq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%bincmp = icmp eq <8 x i32> %i, %j
@@ -79,12 +79,12 @@ define <8 x i32> @int256_cmpeq(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; X32-LABEL: v4i64_cmpeq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v4i64_cmpeq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%bincmp = icmp eq <4 x i64> %i, %j
@@ -94,12 +94,12 @@ define <4 x i64> @v4i64_cmpeq(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; X32-LABEL: v16i16_cmpeq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v16i16_cmpeq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%bincmp = icmp eq <16 x i16> %i, %j
@@ -109,12 +109,12 @@ define <16 x i16> @v16i16_cmpeq(<16 x i16> %i, <16 x i16> %j) nounwind readnone
define <32 x i8> @v32i8_cmpeq(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; X32-LABEL: v32i8_cmpeq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: v32i8_cmpeq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%bincmp = icmp eq <32 x i8> %i, %j
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index 60cc2cf199e6..1fee5ed56cb9 100755
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -4,18 +4,18 @@
define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
; X32-LABEL: trunc4:
-; X32: # BB#0:
-; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32: # %bb.0:
+; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: trunc4:
-; X64: # BB#0:
-; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64: # %bb.0:
+; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%B = trunc <4 x i64> %A to <4 x i32>
@@ -24,18 +24,18 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
; X32-LABEL: trunc8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: trunc8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%B = trunc <8 x i32> %A to <8 x i16>
@@ -44,12 +44,12 @@ define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
define <4 x i64> @sext4(<4 x i32> %A) nounwind {
; X32-LABEL: sext4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovsxdq %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: sext4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxdq %xmm0, %ymm0
; X64-NEXT: retq
%B = sext <4 x i32> %A to <4 x i64>
@@ -58,12 +58,12 @@ define <4 x i64> @sext4(<4 x i32> %A) nounwind {
define <8 x i32> @sext8(<8 x i16> %A) nounwind {
; X32-LABEL: sext8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: sext8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: retq
%B = sext <8 x i16> %A to <8 x i32>
@@ -72,12 +72,12 @@ define <8 x i32> @sext8(<8 x i16> %A) nounwind {
define <4 x i64> @zext4(<4 x i32> %A) nounwind {
; X32-LABEL: zext4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X32-NEXT: retl
;
; X64-LABEL: zext4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-NEXT: retq
%B = zext <4 x i32> %A to <4 x i64>
@@ -86,12 +86,12 @@ define <4 x i64> @zext4(<4 x i32> %A) nounwind {
define <8 x i32> @zext8(<8 x i16> %A) nounwind {
; X32-LABEL: zext8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: retl
;
; X64-LABEL: zext8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: retq
%B = zext <8 x i16> %A to <8 x i32>
@@ -100,13 +100,13 @@ define <8 x i32> @zext8(<8 x i16> %A) nounwind {
define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
; X32-LABEL: zext_8i8_8i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: retl
;
; X64-LABEL: zext_8i8_8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: retq
@@ -116,12 +116,12 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
define <16 x i16> @zext_16i8_16i16(<16 x i8> %z) {
; X32-LABEL: zext_16i8_16i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X32-NEXT: retl
;
; X64-LABEL: zext_16i8_16i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; X64-NEXT: retq
%t = zext <16 x i8> %z to <16 x i16>
@@ -130,12 +130,12 @@ define <16 x i16> @zext_16i8_16i16(<16 x i8> %z) {
define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
; X32-LABEL: sext_16i8_16i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovsxbw %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: sext_16i8_16i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxbw %xmm0, %ymm0
; X64-NEXT: retq
%t = sext <16 x i8> %z to <16 x i16>
@@ -144,7 +144,7 @@ define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
; X32-LABEL: trunc_16i16_16i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
; X32-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; X32-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -154,7 +154,7 @@ define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
; X32-NEXT: retl
;
; X64-LABEL: trunc_16i16_16i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
; X64-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; X64-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -168,13 +168,13 @@ define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
; X32-LABEL: load_sext_test1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxdq (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_sext_test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxdq (%rdi), %ymm0
; X64-NEXT: retq
%X = load <4 x i32>, <4 x i32>* %ptr
@@ -184,13 +184,13 @@ define <4 x i64> @load_sext_test1(<4 x i32> *%ptr) {
define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
; X32-LABEL: load_sext_test2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbq (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_sext_test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxbq (%rdi), %ymm0
; X64-NEXT: retq
%X = load <4 x i8>, <4 x i8>* %ptr
@@ -200,13 +200,13 @@ define <4 x i64> @load_sext_test2(<4 x i8> *%ptr) {
define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
; X32-LABEL: load_sext_test3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxwq (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_sext_test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxwq (%rdi), %ymm0
; X64-NEXT: retq
%X = load <4 x i16>, <4 x i16>* %ptr
@@ -216,13 +216,13 @@ define <4 x i64> @load_sext_test3(<4 x i16> *%ptr) {
define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
; X32-LABEL: load_sext_test4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxwd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_sext_test4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxwd (%rdi), %ymm0
; X64-NEXT: retq
%X = load <8 x i16>, <8 x i16>* %ptr
@@ -232,13 +232,13 @@ define <8 x i32> @load_sext_test4(<8 x i16> *%ptr) {
define <8 x i32> @load_sext_test5(<8 x i8> *%ptr) {
; X32-LABEL: load_sext_test5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_sext_test5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxbd (%rdi), %ymm0
; X64-NEXT: retq
%X = load <8 x i8>, <8 x i8>* %ptr
diff --git a/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/test/CodeGen/X86/avx2-fma-fneg-combine.ll
index 019593cc0f80..212a3ac4a93d 100644
--- a/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -6,12 +6,12 @@
define <8 x float> @test1(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; X32-LABEL: test1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; X64-NEXT: retq
entry:
@@ -24,12 +24,12 @@ declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x f
define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; X32-LABEL: test2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; X64-NEXT: retq
entry:
@@ -42,14 +42,14 @@ declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x
define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; X32-LABEL: test3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
; X32-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1
; X32-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
; X64-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
@@ -64,12 +64,12 @@ declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4
define <8 x float> @test4(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; X32-LABEL: test4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; X64-NEXT: retq
entry:
@@ -80,14 +80,14 @@ entry:
define <8 x float> @test5(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; X32-LABEL: test5:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vbroadcastss {{\.LCPI.*}}, %ymm3
; X32-NEXT: vxorps %ymm3, %ymm2, %ymm2
; X32-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test5:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2
; X64-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
@@ -103,12 +103,12 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
define <2 x double> @test6(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; X32-LABEL: test6:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test6:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/avx2-gather.ll b/test/CodeGen/X86/avx2-gather.ll
index 64dd6fa00616..a7da2fcc91d0 100644
--- a/test/CodeGen/X86/avx2-gather.ll
+++ b/test/CodeGen/X86/avx2-gather.ll
@@ -7,7 +7,7 @@ declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1, <4 x i32> %idx, <4 x float> %mask) {
; X32-LABEL: test_x86_avx2_gather_d_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherdps %xmm1, (%eax,%xmm0,2), %xmm2
@@ -15,7 +15,7 @@ define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1, <4 x i32> %idx, <4 x floa
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherdps %xmm1, (%rdi,%xmm0,2), %xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
@@ -30,7 +30,7 @@ declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
; X32-LABEL: test_x86_avx2_gather_d_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherdpd %xmm1, (%eax,%xmm0,2), %xmm2
@@ -38,7 +38,7 @@ define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1, <4 x i32> %idx, <2 x dou
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,2), %xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
@@ -53,16 +53,16 @@ declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1, <8 x i32> %idx, <8 x float> %mask) {
; X32-LABEL: test_x86_avx2_gather_d_ps_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
; X32-NEXT: vmovaps %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_ps_256:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherdps %ymm1, (%rdi,%ymm0,4), %ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
@@ -76,16 +76,16 @@ declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, <4 x i32> %idx, <4 x double> %mask) {
; X32-LABEL: test_x86_avx2_gather_d_pd_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherdpd %ymm1, (%eax,%xmm0,8), %ymm2
; X32-NEXT: vmovapd %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_pd_256:
-; X64: # BB#0:
-; X64-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherdpd %ymm1, (%rdi,%xmm0,8), %ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
@@ -96,7 +96,7 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, <4 x i32> %idx, <4 x
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_i32gather_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -105,7 +105,7 @@ define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_i32gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
@@ -122,7 +122,7 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>
define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_i32gather_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
@@ -131,7 +131,7 @@ define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_i32gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index cb0abf3b137f..e0baf8408d03 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1,70 +1,56 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_abs_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpabsb %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_abs_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpabsb %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_abs_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpabsb %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg = bitcast <4 x i64> %a0 to <32 x i8>
- %call = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %arg)
- %res = bitcast <32 x i8> %call to <4 x i64>
+ %sub = sub <32 x i8> zeroinitializer, %arg
+ %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
+ %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
+ %res = bitcast <32 x i8> %sel to <4 x i64>
ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_abs_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpabsw %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_abs_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpabsw %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_abs_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpabsw %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg = bitcast <4 x i64> %a0 to <16 x i16>
- %call = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %arg)
- %res = bitcast <16 x i16> %call to <4 x i64>
+ %sub = sub <16 x i16> zeroinitializer, %arg
+ %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
+ %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
+ %res = bitcast <16 x i16> %sel to <4 x i64>
ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_abs_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpabsd %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_abs_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpabsd %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_abs_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpabsd %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg = bitcast <4 x i64> %a0 to <8 x i32>
- %call = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %arg)
- %res = bitcast <8 x i32> %call to <4 x i64>
+ %sub = sub <8 x i32> zeroinitializer, %arg
+ %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
+ %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
+ %res = bitcast <8 x i32> %sel to <4 x i64>
ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_add_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_add_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_add_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = add <32 x i8> %arg0, %arg1
@@ -73,15 +59,10 @@ define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_add_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_add_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_add_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = add <16 x i16> %arg0, %arg1
@@ -90,15 +71,10 @@ define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_add_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_add_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_add_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = add <8 x i32> %arg0, %arg1
@@ -107,29 +83,19 @@ define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_add_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_add_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_add_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = add <4 x i64> %a0, %a1
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_adds_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_adds_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_adds_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -139,15 +105,10 @@ define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_adds_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_adds_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_adds_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -157,15 +118,10 @@ define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_adds_epu8:
-; X32: # BB#0:
-; X32-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_adds_epu8:
-; X64: # BB#0:
-; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_adds_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -175,15 +131,10 @@ define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_adds_epu16:
-; X32: # BB#0:
-; X32-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_adds_epu16:
-; X64: # BB#0:
-; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_adds_epu16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -193,15 +144,10 @@ define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_alignr_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_alignr_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_alignr_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
@@ -210,15 +156,10 @@ define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test2_mm256_alignr_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
-; X32-NEXT: retl
-;
-; X64-LABEL: test2_mm256_alignr_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
-; X64-NEXT: retq
+; CHECK-LABEL: test2_mm256_alignr_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
@@ -227,84 +168,65 @@ define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_and_si256:
-; X32: # BB#0:
-; X32-NEXT: vandps %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_and_si256:
-; X64: # BB#0:
-; X64-NEXT: vandps %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_and_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = and <4 x i64> %a0, %a1
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_andnot_si256:
-; X32: # BB#0:
-; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_andnot_si256:
-; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X64-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_andnot_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
%res = and <4 x i64> %not, %a1
ret <4 x i64> %res
}
-define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_avg_epu8:
-; X32: # BB#0:
-; X32-NEXT: vpavgb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_avg_epu8:
-; X64: # BB#0:
-; X64-NEXT: vpavgb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; CHECK-LABEL: test_mm256_avg_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
- %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
+ %zext0 = zext <32 x i8> %arg0 to <32 x i16>
+ %zext1 = zext <32 x i8> %arg1 to <32 x i16>
+ %add = add <32 x i16> %zext0, %zext1
+ %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %res = trunc <32 x i16> %lshr to <32 x i8>
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}
-declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
-define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_avg_epu16:
-; X32: # BB#0:
-; X32-NEXT: vpavgw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_avg_epu16:
-; X64: # BB#0:
-; X64-NEXT: vpavgw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
+; CHECK-LABEL: test_mm256_avg_epu16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
- %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
+ %zext0 = zext <16 x i16> %arg0 to <16 x i32>
+ %zext1 = zext <16 x i16> %arg1 to <16 x i32>
+ %add = add <16 x i32> %zext0, %zext1
+ %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %res = trunc <16 x i32> %lshr to <16 x i16>
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
-declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_blend_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_blend_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_blend_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -313,15 +235,10 @@ define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
}
define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_blend_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_blend_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_blend_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -330,15 +247,10 @@ define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
}
define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_blend_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_blend_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_blend_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
@@ -347,15 +259,10 @@ define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; X32-LABEL: test_mm256_blendv_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_blendv_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_blendv_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%arg2 = bitcast <4 x i64> %a2 to <32 x i8>
@@ -366,15 +273,10 @@ define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_broadcastb_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpbroadcastb %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_broadcastb_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpbroadcastb %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_broadcastb_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
%res = bitcast <16 x i8> %shuf to <2 x i64>
@@ -382,15 +284,10 @@ define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastb_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpbroadcastb %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastb_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpbroadcastb %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastb_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
%res = bitcast <32 x i8> %shuf to <4 x i64>
@@ -398,15 +295,10 @@ define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
}
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm_broadcastd_epi32:
-; X32: # BB#0:
-; X32-NEXT: vbroadcastss %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_broadcastd_epi32:
-; X64: # BB#0:
-; X64-NEXT: vbroadcastss %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_broadcastd_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
%res = bitcast <4 x i32> %shuf to <2 x i64>
@@ -414,15 +306,10 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastd_epi32:
-; X32: # BB#0:
-; X32-NEXT: vbroadcastss %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastd_epi32:
-; X64: # BB#0:
-; X64-NEXT: vbroadcastss %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastd_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
%res = bitcast <8 x i32> %shuf to <4 x i64>
@@ -430,131 +317,90 @@ define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
}
define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm_broadcastq_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpbroadcastq %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_broadcastq_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpbroadcastq %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_broadcastq_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
ret <2 x i64> %res
}
define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastq_epi64:
-; X32: # BB#0:
-; X32-NEXT: vbroadcastsd %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastq_epi64:
-; X64: # BB#0:
-; X64-NEXT: vbroadcastsd %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastq_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
ret <4 x i64> %res
}
define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
-; X32-LABEL: test_mm_broadcastsd_pd:
-; X32: # BB#0:
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_broadcastsd_pd:
-; X64: # BB#0:
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_broadcastsd_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
ret <2 x double> %res
}
define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
-; X32-LABEL: test_mm256_broadcastsd_pd:
-; X32: # BB#0:
-; X32-NEXT: vbroadcastsd %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastsd_pd:
-; X64: # BB#0:
-; X64-NEXT: vbroadcastsd %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastsd_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
ret <4 x double> %res
}
define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastsi128_si256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastsi128_si256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastsi128_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
-; X32-LABEL: test_mm256_broadcastsi128_si256_mem:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%a0 = load <2 x i64>, <2 x i64>* %p0
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x i64> %res
}
define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
-; X32-LABEL: test_mm_broadcastss_ps:
-; X32: # BB#0:
-; X32-NEXT: vbroadcastss %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_broadcastss_ps:
-; X64: # BB#0:
-; X64-NEXT: vbroadcastss %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_broadcastss_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
ret <4 x float> %res
}
define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
-; X32-LABEL: test_mm256_broadcastss_ps:
-; X32: # BB#0:
-; X32-NEXT: vbroadcastss %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastss_ps:
-; X64: # BB#0:
-; X64-NEXT: vbroadcastss %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastss_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
ret <8 x float> %res
}
define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_broadcastw_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpbroadcastw %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_broadcastw_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpbroadcastw %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_broadcastw_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
%res = bitcast <8 x i16> %shuf to <2 x i64>
@@ -562,15 +408,10 @@ define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastw_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpbroadcastw %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_broadcastw_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpbroadcastw %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_broadcastw_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
%res = bitcast <16 x i16> %shuf to <4 x i64>
@@ -578,15 +419,10 @@ define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
}
define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_bslli_epi128:
-; X32: # BB#0:
-; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_bslli_epi128:
-; X64: # BB#0:
-; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_bslli_epi128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
%res = bitcast <32 x i8> %shuf to <4 x i64>
@@ -594,15 +430,10 @@ define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
}
define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_bsrli_epi128:
-; X32: # BB#0:
-; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_bsrli_epi128:
-; X64: # BB#0:
-; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_bsrli_epi128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
%res = bitcast <32 x i8> %shuf to <4 x i64>
@@ -610,15 +441,10 @@ define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
}
define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpeq_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpeq_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpeq_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp eq <32 x i8> %arg0, %arg1
@@ -628,15 +454,10 @@ define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpeq_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpeq_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpeq_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp eq <16 x i16> %arg0, %arg1
@@ -646,15 +467,10 @@ define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind
}
define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpeq_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpeq_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpeq_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp eq <8 x i32> %arg0, %arg1
@@ -664,30 +480,20 @@ define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind
}
define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpeq_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpeq_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpeq_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%cmp = icmp eq <4 x i64> %a0, %a1
%res = sext <4 x i1> %cmp to <4 x i64>
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpgt_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpgt_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpgt_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp sgt <32 x i8> %arg0, %arg1
@@ -697,15 +503,10 @@ define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpgt_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpgt_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpgt_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp sgt <16 x i16> %arg0, %arg1
@@ -715,15 +516,10 @@ define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind
}
define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpgt_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpgt_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpgt_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp sgt <8 x i32> %arg0, %arg1
@@ -733,30 +529,20 @@ define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind
}
define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_cmpgt_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cmpgt_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cmpgt_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%cmp = icmp sgt <4 x i64> %a0, %a1
%res = sext <4 x i1> %cmp to <4 x i64>
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepi8_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmovsxbw %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepi8_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmovsxbw %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepi8_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%ext = sext <16 x i8> %arg0 to <16 x i16>
%res = bitcast <16 x i16> %ext to <4 x i64>
@@ -764,15 +550,10 @@ define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepi8_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmovsxbd %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepi8_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmovsxbd %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepi8_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ext = sext <8 x i8> %shuf to <8 x i32>
@@ -781,15 +562,10 @@ define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepi8_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpmovsxbq %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepi8_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpmovsxbq %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepi8_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ext = sext <4 x i8> %shuf to <4 x i64>
@@ -797,15 +573,10 @@ define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepi16_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmovsxwd %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepi16_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmovsxwd %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepi16_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%ext = sext <8 x i16> %arg0 to <8 x i32>
%res = bitcast <8 x i32> %ext to <4 x i64>
@@ -813,15 +584,10 @@ define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepi16_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpmovsxwq %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepi16_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpmovsxwq %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepi16_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ext = sext <4 x i16> %shuf to <4 x i64>
@@ -829,30 +595,20 @@ define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepi32_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpmovsxdq %xmm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepi32_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpmovsxdq %xmm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepi32_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%ext = sext <4 x i32> %arg0 to <4 x i64>
ret <4 x i64> %ext
}
define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepu8_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepu8_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepu8_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%ext = zext <16 x i8> %arg0 to <16 x i16>
%res = bitcast <16 x i16> %ext to <4 x i64>
@@ -860,15 +616,10 @@ define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepu8_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepu8_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepu8_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%ext = zext <8 x i8> %shuf to <8 x i32>
@@ -877,15 +628,10 @@ define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepu8_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepu8_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepu8_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ext = zext <4 x i8> %shuf to <4 x i64>
@@ -893,15 +639,10 @@ define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepu16_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepu16_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepu16_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%ext = zext <8 x i16> %arg0 to <8 x i32>
%res = bitcast <8 x i32> %ext to <4 x i64>
@@ -909,15 +650,10 @@ define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepu16_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepu16_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepu16_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ext = zext <4 x i16> %shuf to <4 x i64>
@@ -925,46 +661,30 @@ define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
}
define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_cvtepu32_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_cvtepu32_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_cvtepu32_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%ext = zext <4 x i32> %arg0 to <4 x i64>
ret <4 x i64> %ext
}
define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
-; X32-LABEL: test_mm256_extracti128_si256:
-; X32: # BB#0:
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_extracti128_si256:
-; X64: # BB#0:
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_extracti128_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
ret <2 x i64> %res
}
define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_hadd_epi16:
-; X32: # BB#0:
-; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_hadd_epi16:
-; X64: # BB#0:
-; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_hadd_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -974,15 +694,10 @@ define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_hadd_epi32:
-; X32: # BB#0:
-; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_hadd_epi32:
-; X64: # BB#0:
-; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_hadd_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -992,15 +707,10 @@ define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_hadds_epi16:
-; X32: # BB#0:
-; X32-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_hadds_epi16:
-; X64: # BB#0:
-; X64-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_hadds_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -1010,15 +720,10 @@ define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_hsub_epi16:
-; X32: # BB#0:
-; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_hsub_epi16:
-; X64: # BB#0:
-; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_hsub_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -1028,15 +733,10 @@ define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_hsub_epi32:
-; X32: # BB#0:
-; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_hsub_epi32:
-; X64: # BB#0:
-; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_hsub_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -1046,15 +746,10 @@ define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_hsubs_epi16:
-; X32: # BB#0:
-; X32-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_hsubs_epi16:
-; X64: # BB#0:
-; X64-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_hsubs_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -1064,22 +759,22 @@ define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i32gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovdqa %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i32gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i32gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i32 *%a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
@@ -1090,16 +785,16 @@ define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
-; X32-LABEL: test_mm_mask_i32gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i32gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i32gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast i32 *%a1 to i8*
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -1110,22 +805,22 @@ define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64
}
define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_i32gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
-; X32-NEXT: vmovdqa %ymm1, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i32gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i32gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i32 *%a0 to i8*
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
@@ -1136,16 +831,16 @@ define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
-; X32-LABEL: test_mm256_mask_i32gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i32gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i32gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast i32 *%a1 to i8*
%arg2 = bitcast <4 x i64> %a2 to <8 x i32>
@@ -1156,22 +851,22 @@ define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x
}
define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i32gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovdqa %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i32gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i32gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64 *%a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
@@ -1180,16 +875,16 @@ define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
-; X32-LABEL: test_mm_mask_i32gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i32gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i32gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast i64 *%a1 to i8*
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
@@ -1197,22 +892,22 @@ define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64
}
define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_i32gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
-; X32-NEXT: vmovdqa %ymm1, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i32gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i32gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64 *%a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
@@ -1221,16 +916,16 @@ define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
-; X32-LABEL: test_mm256_mask_i32gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i32gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i32gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast i64 *%a1 to i8*
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
@@ -1238,22 +933,22 @@ define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x
}
define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i32gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovapd %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i32gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovapd %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i32gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast double *%a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
@@ -1265,16 +960,16 @@ define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
-; X32-LABEL: test_mm_mask_i32gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i32gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i32gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast double *%a1 to i8*
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
@@ -1282,22 +977,22 @@ define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2
}
define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_i32gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
-; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
-; X32-NEXT: vmovapd %ymm1, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i32gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
+; X86-NEXT: vmovapd %ymm1, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i32gather_pd:
-; X64: # BB#0:
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT: vmovapd %ymm1, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast double *%a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
@@ -1307,16 +1002,16 @@ define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
-; X32-LABEL: test_mm256_mask_i32gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i32gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i32gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast double *%a1 to i8*
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
@@ -1324,22 +1019,22 @@ define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1,
}
define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i32gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovaps %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i32gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i32gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast float *%a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
@@ -1351,16 +1046,16 @@ define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
-; X32-LABEL: test_mm_mask_i32gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i32gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i32gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast float *%a1 to i8*
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
%call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
@@ -1368,22 +1063,22 @@ define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x
}
define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_i32gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; X32-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
-; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
-; X32-NEXT: vmovaps %ymm1, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i32gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
+; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
+; X86-NEXT: vmovaps %ymm1, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i32gather_ps:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmpeqps %ymm1, %ymm1, %ymm2
; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovaps %ymm1, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast float *%a0 to i8*
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
@@ -1393,16 +1088,16 @@ define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
-; X32-LABEL: test_mm256_mask_i32gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i32gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i32gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast float *%a1 to i8*
%arg2 = bitcast <4 x i64> %a2 to <8 x i32>
%call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
@@ -1410,22 +1105,22 @@ define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4
}
define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i64gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovdqa %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i64gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i64gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i32 *%a0 to i8*
%mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
%call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
@@ -1435,16 +1130,16 @@ define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
-; X32-LABEL: test_mm_mask_i64gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i64gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i64gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast i32 *%a1 to i8*
%arg3 = bitcast <2 x i64> %a3 to <4 x i32>
@@ -1454,24 +1149,24 @@ define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64
}
define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_i64gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
-; X32-NEXT: vmovdqa %xmm1, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i64gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i64gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i32 *%a0 to i8*
%mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
%call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
@@ -1481,18 +1176,18 @@ define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
-; X32-LABEL: test_mm256_mask_i64gather_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i64gather_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast i32 *%a1 to i8*
%arg3 = bitcast <2 x i64> %a3 to <4 x i32>
@@ -1502,22 +1197,22 @@ define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x
}
define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i64gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovdqa %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i64gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i64gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64 *%a0 to i8*
%call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
ret <2 x i64> %call
@@ -1525,38 +1220,38 @@ define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
-; X32-LABEL: test_mm_mask_i64gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i64gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast i64 *%a1 to i8*
%call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
ret <2 x i64> %call
}
define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_i64gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
-; X32-NEXT: vmovdqa %ymm1, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i64gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i64gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64 *%a0 to i8*
%call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
ret <4 x i64> %call
@@ -1564,38 +1259,38 @@ define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
-; X32-LABEL: test_mm256_mask_i64gather_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i64gather_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast i64 *%a1 to i8*
%call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
ret <4 x i64> %call
}
define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i64gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovapd %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i64gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovapd %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i64gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast double *%a0 to i8*
%cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
%sext = sext <2 x i1> %cmp to <2 x i64>
@@ -1606,38 +1301,38 @@ define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
-; X32-LABEL: test_mm_mask_i64gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i64gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i64gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast double *%a1 to i8*
%call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
ret <2 x double> %call
}
define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_i64gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X32-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
-; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
-; X32-NEXT: vmovapd %ymm1, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i64gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X86-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
+; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
+; X86-NEXT: vmovapd %ymm1, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i64gather_pd:
-; X64: # BB#0:
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovapd %ymm1, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast double *%a0 to i8*
%mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
%call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
@@ -1646,38 +1341,38 @@ define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
-; X32-LABEL: test_mm256_mask_i64gather_pd:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i64gather_pd:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast i64 *%a1 to i8*
%call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
ret <4 x double> %call
}
define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_i64gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
-; X32-NEXT: vmovaps %xmm1, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_i64gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_i64gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast float *%a0 to i8*
%cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
%sext = sext <4 x i1> %cmp to <4 x i32>
@@ -1688,40 +1383,40 @@ define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
-; X32-LABEL: test_mm_mask_i64gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_mask_i64gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_mask_i64gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast float *%a1 to i8*
%call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
ret <4 x float> %call
}
define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_i64gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
-; X32-NEXT: vmovaps %xmm1, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_i64gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_i64gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast float *%a0 to i8*
%cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
%sext = sext <4 x i1> %cmp to <4 x i32>
@@ -1732,65 +1427,49 @@ define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
-; X32-LABEL: test_mm256_mask_i64gather_ps:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_mask_i64gather_ps:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg1 = bitcast float *%a1 to i8*
%call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
ret <4 x float> %call
}
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
-; X32-LABEL: test0_mm256_inserti128_si256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X32-NEXT: retl
-;
-; X64-LABEL: test0_mm256_inserti128_si256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; X64-NEXT: retq
+; CHECK-LABEL: test0_mm256_inserti128_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT: ret{{[l|q]}}
%ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
ret <4 x i64> %res
}
define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
-; X32-LABEL: test1_mm256_inserti128_si256:
-; X32: # BB#0:
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test1_mm256_inserti128_si256:
-; X64: # BB#0:
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test1_mm256_inserti128_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_madd_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_madd_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_madd_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -1800,15 +1479,10 @@ define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_maddubs_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_maddubs_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_maddubs_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -1818,16 +1492,16 @@ define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
-; X32-LABEL: test_mm_maskload_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_maskload_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_maskload_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i32* %a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
@@ -1837,16 +1511,16 @@ define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_maskload_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_maskload_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_maskload_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i32* %a0 to i8*
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
@@ -1856,16 +1530,16 @@ define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
-; X32-LABEL: test_mm_maskload_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm_maskload_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_maskload_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64* %a0 to i8*
%res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
ret <2 x i64> %res
@@ -1873,16 +1547,16 @@ define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_maskload_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_maskload_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_maskload_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64* %a0 to i8*
%res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
ret <4 x i64> %res
@@ -1890,16 +1564,16 @@ define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maskstore_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
-; X32-NEXT: retl
+; X86-LABEL: test_mm_maskstore_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax)
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_maskstore_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast float* %a0 to i8*
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -1909,18 +1583,18 @@ define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) n
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
-; X32-LABEL: test_mm256_maskstore_epi32:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_maskstore_epi32:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_maskstore_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast float* %a0 to i8*
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%arg2 = bitcast <4 x i64> %a2 to <8 x i32>
@@ -1930,16 +1604,16 @@ define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
-; X32-LABEL: test_mm_maskstore_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
-; X32-NEXT: retl
+; X86-LABEL: test_mm_maskstore_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax)
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm_maskstore_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64* %a0 to i8*
call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
ret void
@@ -1947,18 +1621,18 @@ define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nou
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
-; X32-LABEL: test_mm256_maskstore_epi64:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_maskstore_epi64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_maskstore_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast i64* %a0 to i8*
call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
ret void
@@ -1966,15 +1640,10 @@ define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2)
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_max_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_max_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_max_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp sgt <32 x i8> %arg0, %arg1
@@ -1984,15 +1653,10 @@ define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_max_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_max_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_max_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp sgt <16 x i16> %arg0, %arg1
@@ -2002,15 +1666,10 @@ define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_max_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_max_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_max_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp sgt <8 x i32> %arg0, %arg1
@@ -2020,15 +1679,10 @@ define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_max_epu8:
-; X32: # BB#0:
-; X32-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_max_epu8:
-; X64: # BB#0:
-; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_max_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp ugt <32 x i8> %arg0, %arg1
@@ -2038,15 +1692,10 @@ define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_max_epu16:
-; X32: # BB#0:
-; X32-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_max_epu16:
-; X64: # BB#0:
-; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_max_epu16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp ugt <16 x i16> %arg0, %arg1
@@ -2056,15 +1705,10 @@ define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_max_epu32:
-; X32: # BB#0:
-; X32-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_max_epu32:
-; X64: # BB#0:
-; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_max_epu32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp ugt <8 x i32> %arg0, %arg1
@@ -2074,15 +1718,10 @@ define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_min_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_min_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_min_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp slt <32 x i8> %arg0, %arg1
@@ -2092,15 +1731,10 @@ define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_min_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_min_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_min_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp slt <16 x i16> %arg0, %arg1
@@ -2110,15 +1744,10 @@ define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_min_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_min_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_min_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp slt <8 x i32> %arg0, %arg1
@@ -2128,15 +1757,10 @@ define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_min_epu8:
-; X32: # BB#0:
-; X32-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_min_epu8:
-; X64: # BB#0:
-; X64-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_min_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%cmp = icmp ult <32 x i8> %arg0, %arg1
@@ -2146,15 +1770,10 @@ define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_min_epu16:
-; X32: # BB#0:
-; X32-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_min_epu16:
-; X64: # BB#0:
-; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_min_epu16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%cmp = icmp ult <16 x i16> %arg0, %arg1
@@ -2164,15 +1783,10 @@ define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_min_epu32:
-; X32: # BB#0:
-; X32-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_min_epu32:
-; X64: # BB#0:
-; X64-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_min_epu32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%cmp = icmp ult <8 x i32> %arg0, %arg1
@@ -2182,17 +1796,11 @@ define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
}
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
-; X32-LABEL: test_mm256_movemask_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpmovmskb %ymm0, %eax
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_movemask_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpmovmskb %ymm0, %eax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_movemask_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovmskb %ymm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
ret i32 %res
@@ -2200,15 +1808,10 @@ define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mpsadbw_epu8:
-; X32: # BB#0:
-; X32-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mpsadbw_epu8:
-; X64: # BB#0:
-; X64-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mpsadbw_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
@@ -2218,15 +1821,10 @@ define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mul_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mul_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mul_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2235,15 +1833,10 @@ define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mul_epu32:
-; X32: # BB#0:
-; X32-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mul_epu32:
-; X64: # BB#0:
-; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mul_epu32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2252,15 +1845,10 @@ define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mulhi_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mulhi_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mulhi_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -2270,15 +1858,10 @@ define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mulhi_epu16:
-; X32: # BB#0:
-; X32-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mulhi_epu16:
-; X64: # BB#0:
-; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mulhi_epu16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -2288,15 +1871,10 @@ define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mulhrs_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mulhrs_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mulhrs_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -2306,15 +1884,10 @@ define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mullo_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mullo_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mullo_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = mul <16 x i16> %arg0, %arg1
@@ -2323,15 +1896,10 @@ define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_mullo_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_mullo_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_mullo_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = mul <8 x i32> %arg0, %arg1
@@ -2340,29 +1908,19 @@ define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_or_si256:
-; X32: # BB#0:
-; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_or_si256:
-; X64: # BB#0:
-; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_or_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = or <4 x i64> %a0, %a1
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_packs_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_packs_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_packs_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -2372,15 +1930,10 @@ define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_packs_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_packs_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_packs_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2390,15 +1943,10 @@ define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_packus_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_packus_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_packus_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -2408,15 +1956,10 @@ define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_packus_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_packus_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_packus_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2426,58 +1969,38 @@ define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_permute2x128_si256:
-; X32: # BB#0:
-; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_permute2x128_si256:
-; X64: # BB#0:
-; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; X64-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 49)
+; CHECK-LABEL: test_mm256_permute2x128_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_permute4x64_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_permute4x64_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,0]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_permute4x64_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
ret <4 x i64> %res
}
define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
-; X32-LABEL: test_mm256_permute4x64_pd:
-; X32: # BB#0:
-; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_permute4x64_pd:
-; X64: # BB#0:
-; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_permute4x64_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
ret <4 x double> %res
}
define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_permutevar8x32_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_permutevar8x32_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2487,15 +2010,10 @@ define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1)
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_permutevar8x32_ps:
-; X32: # BB#0:
-; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_permutevar8x32_ps:
-; X64: # BB#0:
-; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_permutevar8x32_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
ret <8 x float> %res
@@ -2503,15 +2021,10 @@ define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1)
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_sad_epu8:
-; X32: # BB#0:
-; X32-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sad_epu8:
-; X64: # BB#0:
-; X64-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sad_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -2520,15 +2033,10 @@ define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_shuffle_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_shuffle_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_shuffle_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
%res = bitcast <8 x i32> %shuf to <4 x i64>
@@ -2536,15 +2044,10 @@ define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
}
define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_shuffle_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_shuffle_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_shuffle_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -2554,15 +2057,10 @@ define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_shufflehi_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_shufflehi_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_shufflehi_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
%res = bitcast <16 x i16> %shuf to <4 x i64>
@@ -2570,15 +2068,10 @@ define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
}
define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_shufflelo_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_shufflelo_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_shufflelo_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
%res = bitcast <16 x i16> %shuf to <4 x i64>
@@ -2586,15 +2079,10 @@ define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
}
define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_sign_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpsignb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sign_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpsignb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sign_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -2604,15 +2092,10 @@ define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_sign_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsignw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sign_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsignw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sign_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -2622,15 +2105,10 @@ define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_sign_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsignd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sign_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsignd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sign_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2640,15 +2118,10 @@ define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_sll_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sll_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsllw %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sll_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
@@ -2658,15 +2131,10 @@ define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_sll_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpslld %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sll_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpslld %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sll_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
@@ -2676,30 +2144,20 @@ define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_sll_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsllq %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sll_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsllq %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sll_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_slli_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsllw $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_slli_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsllw $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_slli_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
%bc = bitcast <16 x i16> %res to <4 x i64>
@@ -2708,15 +2166,10 @@ define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_slli_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpslld $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_slli_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpslld $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_slli_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpslld $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
%bc = bitcast <8 x i32> %res to <4 x i64>
@@ -2725,30 +2178,20 @@ define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_slli_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsllq $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_slli_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsllq $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_slli_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_slli_si256:
-; X32: # BB#0:
-; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_slli_si256:
-; X64: # BB#0:
-; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_slli_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
%res = bitcast <32 x i8> %shuf to <4 x i64>
@@ -2756,15 +2199,10 @@ define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
}
define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_sllv_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_sllv_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_sllv_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
@@ -2774,15 +2212,10 @@ define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_sllv_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sllv_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sllv_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2792,45 +2225,30 @@ define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_sllv_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_sllv_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_sllv_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_sllv_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sllv_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sllv_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_sra_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsraw %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sra_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsraw %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sra_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
@@ -2840,15 +2258,10 @@ define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_sra_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsrad %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sra_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsrad %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sra_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
@@ -2858,15 +2271,10 @@ define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_srai_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsraw $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srai_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsraw $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srai_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
%bc = bitcast <16 x i16> %res to <4 x i64>
@@ -2875,15 +2283,10 @@ define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_srai_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsrad $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srai_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsrad $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srai_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrad $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
%bc = bitcast <8 x i32> %res to <4 x i64>
@@ -2892,15 +2295,10 @@ define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_srav_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_srav_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_srav_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
@@ -2910,15 +2308,10 @@ define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_srav_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srav_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srav_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -2928,15 +2321,10 @@ define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_srl_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srl_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srl_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
@@ -2946,15 +2334,10 @@ define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_srl_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsrld %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srl_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsrld %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srl_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
@@ -2964,30 +2347,20 @@ define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_srl_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srl_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srl_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_srli_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srli_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srli_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
%bc = bitcast <16 x i16> %res to <4 x i64>
@@ -2996,15 +2369,10 @@ define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_srli_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsrld $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srli_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsrld $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srli_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
%bc = bitcast <8 x i32> %res to <4 x i64>
@@ -3013,30 +2381,20 @@ define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_srli_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsrlq $3, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srli_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsrlq $3, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srli_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
-; X32-LABEL: test_mm256_srli_si256:
-; X32: # BB#0:
-; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srli_si256:
-; X64: # BB#0:
-; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srli_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
%res = bitcast <32 x i8> %shuf to <4 x i64>
@@ -3044,15 +2402,10 @@ define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
}
define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_srlv_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_srlv_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_srlv_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
%res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
@@ -3062,15 +2415,10 @@ define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_srlv_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srlv_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srlv_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
@@ -3080,46 +2428,36 @@ define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_srlv_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm_srlv_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm_srlv_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_srlv_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_srlv_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_srlv_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
-; X32-LABEL: test_mm256_stream_load_si256:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovntdqa (%eax), %ymm0
-; X32-NEXT: retl
+; X86-LABEL: test_mm256_stream_load_si256:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovntdqa (%eax), %ymm0
+; X86-NEXT: ret{{[l|q]}}
;
; X64-LABEL: test_mm256_stream_load_si256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovntdqa (%rdi), %ymm0
-; X64-NEXT: retq
+; X64-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> *%a0 to i8*
%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
ret <4 x i64> %res
@@ -3127,15 +2465,10 @@ define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_sub_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sub_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sub_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = sub <32 x i8> %arg0, %arg1
@@ -3144,15 +2477,10 @@ define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_sub_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sub_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sub_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = sub <16 x i16> %arg0, %arg1
@@ -3161,15 +2489,10 @@ define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_sub_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sub_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sub_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = sub <8 x i32> %arg0, %arg1
@@ -3178,29 +2501,19 @@ define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
}
define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_sub_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_sub_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_sub_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = sub <4 x i64> %a0, %a1
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_subs_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_subs_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_subs_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -3210,15 +2523,10 @@ define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_subs_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_subs_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_subs_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -3228,15 +2536,10 @@ define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_subs_epu8:
-; X32: # BB#0:
-; X32-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_subs_epu8:
-; X64: # BB#0:
-; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_subs_epu8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
@@ -3246,15 +2549,10 @@ define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
-; X32-LABEL: test_mm256_subs_epu16:
-; X32: # BB#0:
-; X32-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_subs_epu16:
-; X64: # BB#0:
-; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_subs_epu16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
@@ -3264,15 +2562,10 @@ define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpackhi_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpackhi_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpackhi_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
@@ -3281,15 +2574,10 @@ define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwin
}
define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpackhi_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpackhi_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpackhi_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -3298,15 +2586,10 @@ define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwi
}
define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpackhi_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpackhi_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpackhi_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -3315,29 +2598,19 @@ define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwi
}
define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpackhi_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpackhi_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpackhi_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpacklo_epi8:
-; X32: # BB#0:
-; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpacklo_epi8:
-; X64: # BB#0:
-; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpacklo_epi8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
@@ -3346,15 +2619,10 @@ define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwin
}
define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpacklo_epi16:
-; X32: # BB#0:
-; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpacklo_epi16:
-; X64: # BB#0:
-; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpacklo_epi16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
@@ -3363,15 +2631,10 @@ define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwi
}
define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpacklo_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpacklo_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpacklo_epi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: ret{{[l|q]}}
%arg0 = bitcast <4 x i64> %a0 to <8 x i32>
%arg1 = bitcast <4 x i64> %a1 to <8 x i32>
%res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -3380,29 +2643,19 @@ define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwi
}
define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_unpacklo_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_unpacklo_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_unpacklo_epi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: ret{{[l|q]}}
%res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; X32-LABEL: test_mm256_xor_si256:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; X32-NEXT: retl
-;
-; X64-LABEL: test_mm256_xor_si256:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm256_xor_si256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%res = xor <4 x i64> %a0, %a1
ret <4 x i64> %res
}
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
index c4db9579afef..a761ec955fbc 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -1,11 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=X86 --check-prefix=X86-AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=X64 --check-prefix=X64-AVX512
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -14,9 +17,9 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; CHECK-NEXT: retl
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -25,9 +28,9 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; CHECK-NEXT: retl
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -35,11 +38,16 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind
define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
-; CHECK-LABEL: test_x86_avx2_movntdqa:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vmovntdqa (%eax), %ymm0
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx2_movntdqa:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovntdqa (%eax), %ymm0
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx2_movntdqa:
+; X64: ## %bb.0:
+; X64-NEXT: vmovntdqa (%rdi), %ymm0
+; X64-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -48,9 +56,9 @@ declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_x86_avx2_mpsadbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -59,9 +67,9 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind re
define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_psll_dq_bs:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -70,9 +78,9 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_psrl_dq_bs:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -81,9 +89,9 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_psll_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -92,9 +100,9 @@ declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_psrl_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 8) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -103,10 +111,10 @@ declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_vextracti128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7)
ret <2 x i64> %res
}
@@ -115,9 +123,9 @@ declare <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64>, i8) nounwind readnone
define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx2_vinserti128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7)
ret <4 x i64> %res
}
@@ -126,9 +134,9 @@ declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind
define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx2_vbroadcast_sd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0)
ret <4 x double> %res
}
@@ -137,9 +145,9 @@ declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind
define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0)
ret <4 x float> %res
}
@@ -148,9 +156,9 @@ declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readon
define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx2_vbroadcast_ss_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0)
ret <8 x float> %res
}
@@ -159,9 +167,9 @@ declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind re
define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0)
ret <16 x i8> %res
}
@@ -170,9 +178,9 @@ declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0)
ret <32 x i8> %res
}
@@ -181,9 +189,9 @@ declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0)
ret <8 x i16> %res
}
@@ -192,9 +200,9 @@ declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0)
ret <16 x i16> %res
}
@@ -203,9 +211,9 @@ declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0)
ret <4 x i32> %res
}
@@ -214,9 +222,9 @@ declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0)
ret <8 x i32> %res
}
@@ -225,9 +233,9 @@ declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0)
ret <2 x i64> %res
}
@@ -236,9 +244,9 @@ declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_avx2_pbroadcastq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0)
ret <4 x i64> %res
}
@@ -247,9 +255,9 @@ declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovsxbd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -258,9 +266,9 @@ declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovsxbq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -269,9 +277,9 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovsxbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -280,9 +288,9 @@ declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovsxdq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -291,9 +299,9 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovsxwd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -302,9 +310,9 @@ declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovsxwq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -313,9 +321,9 @@ declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovzxbd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -324,9 +332,9 @@ declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovzxbq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -335,9 +343,9 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovzxbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -346,9 +354,9 @@ declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovzxdq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -357,9 +365,9 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovzxwd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -368,9 +376,9 @@ declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovzxwq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -379,14 +387,22 @@ declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
; add operation forces the execution domain.
-; CHECK-LABEL: test_x86_avx_storeu_dq_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vmovdqu %ymm0, (%eax)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retl
+; X86-LABEL: test_x86_avx_storeu_dq_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; X86-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X86-NEXT: vmovdqu %ymm0, (%eax)
+; X86-NEXT: vzeroupper
+; X86-NEXT: ret{{[l|q]}}
+;
+; X64-LABEL: test_x86_avx_storeu_dq_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; X64-NEXT: vmovdqu %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: ret{{[l|q]}}
%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
ret void
@@ -395,9 +411,9 @@ declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
define <32 x i8> @mm256_max_epi8(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: mm256_max_epi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %res
}
@@ -405,9 +421,9 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @mm256_max_epi16(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: mm256_max_epi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %res
}
@@ -415,9 +431,9 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @mm256_max_epi32(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: mm256_max_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %res
}
@@ -425,9 +441,9 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
define <32 x i8> @mm256_max_epu8(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: mm256_max_epu8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %res
}
@@ -435,9 +451,9 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @mm256_max_epu16(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: mm256_max_epu16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %res
}
@@ -445,9 +461,9 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @mm256_max_epu32(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: mm256_max_epu32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %res
}
@@ -455,9 +471,9 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <32 x i8> @mm256_min_epi8(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: mm256_min_epi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %res
}
@@ -465,9 +481,9 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @mm256_min_epi16(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: mm256_min_epi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %res
}
@@ -475,9 +491,9 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @mm256_min_epi32(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: mm256_min_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %res
}
@@ -485,9 +501,9 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
define <32 x i8> @mm256_min_epu8(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: mm256_min_epu8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %res
}
@@ -495,9 +511,9 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @mm256_min_epu16(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: mm256_min_epu16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %res
}
@@ -505,11 +521,72 @@ declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: mm256_min_epu32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: retl
+; CHECK-NEXT: ret{{[l|q]}}
%res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: mm256_avg_epu8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: mm256_avg_epu16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
+; CHECK-LABEL: test_x86_avx2_pabs_b:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsb %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
+
+define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx2_pabs_d:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsd %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
+; CHECK-LABEL: test_x86_avx2_pabs_w:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsw %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: test_x86_avx2_vperm2i128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 52e37dbf2696..20ebda5beb09 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1,65 +1,166 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X86 --check-prefix=X86-AVX
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X86 --check-prefix=X86-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 --check-prefix=X64 --check-prefix=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL --check-prefix=X64 --check-prefix=X64-AVX512VL
define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_packssdw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x6b,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_packssdw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+define <16 x i16> @test_x86_avx2_packssdw_fold() {
+; X86-AVX-LABEL: test_x86_avx2_packssdw_fold:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_packssdw_fold:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: vmovaps LCPI1_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI1_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_packssdw_fold:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI1_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_packssdw_fold:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,32767,65535,0,0,0,0,32769,32768,0,65280]
+; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI1_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
+ ret <16 x i16> %res
+}
+
+
define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_packsswb:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x63,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_packsswb:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+define <32 x i8> @test_x86_avx2_packsswb_fold() {
+; X86-AVX-LABEL: test_x86_avx2_packsswb_fold:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: vmovaps LCPI3_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI3_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_packsswb_fold:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_packsswb_fold:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0,0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
+ ret <32 x i8> %res
+}
+
+
define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_packuswb:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x67,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_packuswb:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+define <32 x i8> @test_x86_avx2_packuswb_fold() {
+; X86-AVX-LABEL: test_x86_avx2_packuswb_fold:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: vmovaps LCPI5_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI5_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_packuswb_fold:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_packuswb_fold:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI5_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678, i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <16 x i16> zeroinitializer)
+ ret <32 x i8> %res
+}
+
+
define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_padds_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_padds_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -68,14 +169,14 @@ declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_padds_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_padds_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -84,14 +185,14 @@ declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_paddus_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdc,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_paddus_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -100,62 +201,30 @@ declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnon
define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_paddus_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xdd,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_paddus_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
-define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_pavg_b:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe0,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pavg_b:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
- %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
- ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_pavg_w:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe3,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pavg_w:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-
define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmadd_wd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf5,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmadd_wd:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -164,14 +233,14 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmaxs_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xee,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmaxs_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xee,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -180,14 +249,14 @@ declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pmaxu_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xde,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmaxu_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -196,14 +265,14 @@ declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmins_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xea,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmins_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xea,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -212,14 +281,14 @@ declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pminu_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xda,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pminu_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpminub %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -228,10 +297,10 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
; CHECK-LABEL: test_x86_avx2_pmovmskb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
ret i32 %res
}
@@ -240,14 +309,14 @@ declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmulh_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe5,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmulh_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -256,14 +325,14 @@ declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmulhu_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe4,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmulhu_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -272,14 +341,14 @@ declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind read
define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_pmulu_dq:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf4,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmulu_dq:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -288,14 +357,14 @@ declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnon
define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_psad_bw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf6,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psad_bw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf6,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -304,14 +373,14 @@ declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psll_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf2,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psll_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -320,14 +389,14 @@ declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
; AVX2-LABEL: test_x86_avx2_psll_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf3,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psll_q:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -336,14 +405,14 @@ declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_psll_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xf1,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psll_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -352,14 +421,14 @@ declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) {
; AVX2-LABEL: test_x86_avx2_pslli_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpslld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pslli_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpslld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xf0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -368,14 +437,14 @@ declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) {
; AVX2-LABEL: test_x86_avx2_pslli_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pslli_q:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xf0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -384,14 +453,14 @@ declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) {
; AVX2-LABEL: test_x86_avx2_pslli_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pslli_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xf0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -400,14 +469,14 @@ declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psra_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe2,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psra_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -416,14 +485,14 @@ declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_psra_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe1,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psra_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -432,14 +501,14 @@ declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) {
; AVX2-LABEL: test_x86_avx2_psrai_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrad $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrai_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrad $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xe0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -448,14 +517,14 @@ declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) {
; AVX2-LABEL: test_x86_avx2_psrai_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsraw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrai_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsraw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xe0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -464,14 +533,14 @@ declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrl_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd2,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrl_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -480,14 +549,14 @@ declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
; AVX2-LABEL: test_x86_avx2_psrl_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd3,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrl_q:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -496,14 +565,14 @@ declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_psrl_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd1,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrl_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -512,14 +581,14 @@ declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnon
define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
; AVX2-LABEL: test_x86_avx2_psrli_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrld $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x72,0xd0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrli_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrld $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -528,14 +597,14 @@ declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) {
; AVX2-LABEL: test_x86_avx2_psrli_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrli_q:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x73,0xd0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -544,14 +613,14 @@ declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
; AVX2-LABEL: test_x86_avx2_psrli_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrli_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x07]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -560,14 +629,14 @@ declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_psubs_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -576,14 +645,14 @@ declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_psubs_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -592,14 +661,14 @@ declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_psubus_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd8,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psubus_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -608,73 +677,24 @@ declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnon
define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_psubus_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xd9,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psubus_w:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
-; AVX2-LABEL: test_x86_avx2_pabs_b:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpabsb %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pabs_b:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpabsb %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
- %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
- ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
-
-
-define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
-; AVX2-LABEL: test_x86_avx2_pabs_d:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpabsd %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x1e,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pabs_d:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpabsd %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
-; AVX2-LABEL: test_x86_avx2_pabs_w:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpabsw %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pabs_w:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpabsw %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
- ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
-
-
define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_phadd_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x02,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -683,9 +703,9 @@ declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_phadd_sw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x03,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -694,9 +714,9 @@ declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind read
define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_phadd_w:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x01,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -705,9 +725,9 @@ declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readn
define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_phsub_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vphsubd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x06,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -716,9 +736,9 @@ declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_phsub_sw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x07,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -727,9 +747,9 @@ declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind read
define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_phsub_w:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vphsubw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x05,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -738,14 +758,14 @@ declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pmadd_ub_sw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -753,19 +773,31 @@ declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind rea
; Make sure we don't commute this operation.
define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT: vmovdqa (%eax), %ymm1 ## encoding: [0xc5,0xfd,0x6f,0x08]
-; AVX2-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x04,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmovdqu (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x08]
-; AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX-NEXT: vmovdqa (%eax), %ymm1 ## encoding: [0xc5,0xfd,0x6f,0x08]
+; X86-AVX-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x04,0xc0]
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-AVX512VL-NEXT: vmovdqa (%eax), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x08]
+; X86-AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0]
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovdqa (%rdi), %ymm1 ## encoding: [0xc5,0xfd,0x6f,0x0f]
+; X64-AVX-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x04,0xc0]
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_pmadd_ub_sw_load_op0:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x0f]
+; X64-AVX512VL-NEXT: vpmaddubsw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x04,0xc0]
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%a0 = load <32 x i8>, <32 x i8>* %ptr
%res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
@@ -773,14 +805,14 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(<32 x i8>* %ptr, <32 x i8>
define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmul_hr_sw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmul_hr_sw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -789,14 +821,14 @@ declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind re
define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pshuf_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pshuf_b:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -805,9 +837,9 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_x86_avx2_psign_b:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsignb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x08,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -816,9 +848,9 @@ declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_psign_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsignd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0a,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -827,9 +859,9 @@ declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_psign_w:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -838,9 +870,9 @@ declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readn
define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_x86_avx2_mpsadbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -849,25 +881,58 @@ declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind rea
define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_packusdw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_packusdw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+define <16 x i16> @test_x86_avx2_packusdw_fold() {
+; X86-AVX-LABEL: test_x86_avx2_packusdw_fold:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI55_0, kind: FK_Data_4
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: vmovaps LCPI55_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI55_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X64-AVX-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI55_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI55_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> <i32 255, i32 32767, i32 65535, i32 -1, i32 -32767, i32 -65535, i32 0, i32 -256>)
+ ret <16 x i16> %res
+}
+
+
define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
; CHECK-LABEL: test_x86_avx2_pblendvb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x4c,0xc1,0x20]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -876,10 +941,10 @@ declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounw
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpblendw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0e,0xc1,0x07]
; CHECK-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -888,14 +953,14 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pmaxsb:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmaxsb:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -904,14 +969,14 @@ declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_pmaxsd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmaxsd:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -920,14 +985,14 @@ declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_pmaxud:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmaxud:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -936,14 +1001,14 @@ declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmaxuw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pmaxuw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3e,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -952,14 +1017,14 @@ declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readn
define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pminsb:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pminsb:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
@@ -968,14 +1033,14 @@ declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_pminsd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pminsd:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -984,14 +1049,14 @@ declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_pminud:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pminud:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpminud %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -1000,14 +1065,14 @@ declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pminuw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pminuw:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3a,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
@@ -1023,10 +1088,10 @@ declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendd_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpblendd $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x02,0xc0,0x08]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vblendps $8, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x08]
; CHECK-NEXT: ## xmm0 = xmm1[0,1,2],xmm0[3]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i8 7) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1035,10 +1100,10 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_x86_avx2_pblendd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpblendd $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vblendps $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0x07]
; CHECK-NEXT: ## ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i8 7) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -1050,14 +1115,14 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind
; the instruction.
define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_permd:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x36,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_permd:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL: ## %bb.0:
+; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -1069,38 +1134,31 @@ declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
; the instruction.
define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_permps:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_permps:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %a1) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
-define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_vperm2i128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vperm2f128 $1, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x06,0xc1,0x01]
-; CHECK-NEXT: ## ymm0 = ymm0[2,3,0,1]
-; CHECK-NEXT: retl ## encoding: [0xc3]
- %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
-
-
define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_maskload_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x8c,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskload_q:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovq (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x8c,0x00]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskload_q:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x8c,0x07]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -1108,11 +1166,16 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) {
-; CHECK-LABEL: test_x86_avx2_maskload_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x8c,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskload_q_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovq (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x8c,0x00]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskload_q_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x8c,0x07]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -1120,11 +1183,16 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonl
define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx2_maskload_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x8c,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskload_d:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovd (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x8c,0x00]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskload_d:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x8c,0x07]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1132,11 +1200,16 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) {
-; CHECK-LABEL: test_x86_avx2_maskload_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x8c,0x00]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskload_d_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovd (%eax), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x8c,0x00]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskload_d_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x8c,0x07]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -1144,11 +1217,16 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonl
define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
-; CHECK-LABEL: test_x86_avx2_maskstore_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0xf9,0x8e,0x08]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskstore_q:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovq %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0xf9,0x8e,0x08]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskstore_q:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) ## encoding: [0xc4,0xe2,0xf9,0x8e,0x0f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
ret void
}
@@ -1156,12 +1234,18 @@ declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; CHECK-LABEL: test_x86_avx2_maskstore_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskstore_q_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
+; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskstore_q_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x0f]
+; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
ret void
}
@@ -1169,11 +1253,16 @@ declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
-; CHECK-LABEL: test_x86_avx2_maskstore_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x8e,0x08]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskstore_d:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovd %xmm1, %xmm0, (%eax) ## encoding: [0xc4,0xe2,0x79,0x8e,0x08]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskstore_d:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) ## encoding: [0xc4,0xe2,0x79,0x8e,0x0f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
ret void
}
@@ -1181,12 +1270,18 @@ declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
-; CHECK-LABEL: test_x86_avx2_maskstore_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_maskstore_d_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
+; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_maskstore_d_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x0f]
+; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
ret void
}
@@ -1195,14 +1290,14 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psllv_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psllv_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1211,14 +1306,14 @@ declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psllv_d_256:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psllv_d_256:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -1227,14 +1322,14 @@ declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind read
define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) {
; AVX2-LABEL: test_x86_avx2_psllv_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psllv_q:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -1243,14 +1338,14 @@ declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-LABEL: test_x86_avx2_psllv_q_256:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psllv_q_256:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -1259,14 +1354,14 @@ declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind read
define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrlv_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrlv_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -1275,14 +1370,14 @@ declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrlv_d_256:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrlv_d_256:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -1291,14 +1386,14 @@ declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind read
define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
; AVX2-LABEL: test_x86_avx2_psrlv_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrlv_q:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
ret <2 x i64> %res
}
@@ -1307,14 +1402,14 @@ declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-LABEL: test_x86_avx2_psrlv_q_256:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrlv_q_256:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -1323,36 +1418,54 @@ declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind read
define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrav_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrav_d:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX2-LABEL: test_x86_avx2_psrav_d_const:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
-; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
-; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovdqa LCPI90_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
-; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
-; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: test_x86_avx2_psrav_d_const:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
+; X86-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
+; X86-AVX-NEXT: vpsravd LCPI88_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: vmovdqa LCPI88_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vpsravd LCPI88_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
+; X64-AVX-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
ret <4 x i32> %res
}
@@ -1360,47 +1473,70 @@ declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrav_d_256:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
+; AVX2-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_256:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xc1]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) {
-; AVX2-LABEL: test_x86_avx2_psrav_d_256_const:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
-; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
-; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vmovdqa LCPI92_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
-; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
-; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
+; X86-AVX-NEXT: vpsravd LCPI90_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: vmovdqa LCPI90_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT: vpsravd LCPI90_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X64-AVX-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 4, value: LCPI90_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X64-AVX-NEXT: ## fixup A - offset: 5, value: LCPI90_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X64-AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X64-AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_pd:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_pd:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x92,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
ret <2 x double> %res
@@ -1409,11 +1545,16 @@ declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
<4 x i32>, <2 x double>, i8) nounwind readonly
define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1, <4 x i32> %idx, <4 x double> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x92,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_pd_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x92,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_pd_256:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x92,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
ret <4 x double> %res
@@ -1422,11 +1563,16 @@ declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
<4 x i32>, <4 x double>, i8) nounwind readonly
define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1, <2 x i64> %idx, <2 x double> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_pd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x93,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_pd:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x93,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_pd:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x93,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
ret <2 x double> %res
@@ -1435,11 +1581,16 @@ declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
<2 x i64>, <2 x double>, i8) nounwind readonly
define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1, <4 x i64> %idx, <4 x double> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_pd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x93,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_pd_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x93,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_pd_256:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x93,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
ret <4 x double> %res
@@ -1448,11 +1599,16 @@ declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
<4 x i64>, <4 x double>, i8) nounwind readonly
define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1, <4 x i32> %idx, <4 x float> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x92,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_ps:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x92,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_ps:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x92,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -1461,11 +1617,16 @@ declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
<4 x i32>, <4 x float>, i8) nounwind readonly
define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1, <8 x i32> %idx, <8 x float> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x92,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_ps_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x92,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_ps_256:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x92,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
ret <8 x float> %res
@@ -1474,11 +1635,16 @@ declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
<8 x i32>, <8 x float>, i8) nounwind readonly
define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1, <2 x i64> %idx, <4 x float> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_ps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x93,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_ps:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x93,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_ps:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x93,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -1487,12 +1653,18 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
<2 x i64>, <4 x float>, i8) nounwind readonly
define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_ps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_ps_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
+; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_ps_256:
+; X64: ## %bb.0:
+; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x4f]
+; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
ret <4 x float> %res
@@ -1501,11 +1673,16 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
<4 x i64>, <4 x float>, i8) nounwind readonly
define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1, <4 x i32> %idx, <2 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x90,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_q:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x90,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_q:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x90,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
ret <2 x i64> %res
@@ -1514,11 +1691,16 @@ declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
<4 x i32>, <2 x i64>, i8) nounwind readonly
define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1, <4 x i32> %idx, <4 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x90,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_q_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x90,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_q_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x90,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
ret <4 x i64> %res
@@ -1527,11 +1709,16 @@ declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
<4 x i32>, <4 x i64>, i8) nounwind readonly
define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1, <2 x i64> %idx, <2 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_q:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_q:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_q:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0xe9,0x91,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
ret <2 x i64> %res
@@ -1540,11 +1727,16 @@ declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
<2 x i64>, <2 x i64>, i8) nounwind readonly
define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1, <4 x i64> %idx, <4 x i64> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x91,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_q_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x91,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_q_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0xed,0x91,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
ret <4 x i64> %res
@@ -1553,11 +1745,16 @@ declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
<4 x i64>, <4 x i64>, i8) nounwind readonly
define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1, <4 x i32> %idx, <4 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x90,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_d:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x90,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_d:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x90,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -1566,11 +1763,16 @@ declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
<4 x i32>, <4 x i32>, i8) nounwind readonly
define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1, <8 x i32> %idx, <8 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_d_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x90,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_d_d_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x90,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_d_d_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 ## encoding: [0xc4,0xe2,0x6d,0x90,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
ret <8 x i32> %res
@@ -1579,11 +1781,16 @@ declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
<8 x i32>, <8 x i32>, i8) nounwind readonly
define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1, <2 x i64> %idx, <4 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_d:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x91,0x04,0x48]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_d:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x91,0x04,0x48]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_d:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x69,0x91,0x04,0x4f]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -1592,12 +1799,18 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
<2 x i64>, <4 x i32>, i8) nounwind readonly
define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) {
-; CHECK-LABEL: test_x86_avx2_gather_q_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; CHECK-NEXT: retl ## encoding: [0xc3]
+; X86-LABEL: test_x86_avx2_gather_q_d_256:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
+; X86-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx2_gather_q_d_256:
+; X64: ## %bb.0:
+; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x4f]
+; X64-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
ret <4 x i32> %res
@@ -1608,23 +1821,37 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
; PR13298
define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a, <8 x i32> %idx, <8 x float> %mask, float* nocapture %out) {
;; gather with mask
-; AVX2-LABEL: test_gather_mask:
-; AVX2: ## BB#0:
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; AVX2-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0xc5,0xfc,0x28,0xda]
-; AVX2-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
-; AVX2-NEXT: vmovups %ymm2, (%eax) ## encoding: [0xc5,0xfc,0x11,0x10]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_gather_mask:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; AVX512VL-NEXT: vgatherdps %ymm3, (%eax,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x88]
-; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; AVX512VL-NEXT: vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10]
-; AVX512VL-NEXT: retl ## encoding: [0xc3]
+; X86-AVX-LABEL: test_gather_mask:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-AVX-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0xc5,0xfc,0x28,0xda]
+; X86-AVX-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
+; X86-AVX-NEXT: vmovups %ymm2, (%eax) ## encoding: [0xc5,0xfc,0x11,0x10]
+; X86-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X86-AVX512VL-LABEL: test_gather_mask:
+; X86-AVX512VL: ## %bb.0:
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
+; X86-AVX512VL-NEXT: vgatherdps %ymm3, (%ecx,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x89]
+; X86-AVX512VL-NEXT: vmovups %ymm2, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x10]
+; X86-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX-LABEL: test_gather_mask:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm2, %ymm3 ## encoding: [0xc5,0xfc,0x28,0xda]
+; X64-AVX-NEXT: vgatherdps %ymm3, (%rdi,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x8f]
+; X64-AVX-NEXT: vmovups %ymm2, (%rsi) ## encoding: [0xc5,0xfc,0x11,0x16]
+; X64-AVX-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_gather_mask:
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
+; X64-AVX512VL-NEXT: vgatherdps %ymm3, (%rdi,%ymm1,4), %ymm0 ## encoding: [0xc4,0xe2,0x65,0x92,0x04,0x8f]
+; X64-AVX512VL-NEXT: vmovups %ymm2, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x16]
+; X64-AVX512VL-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
%a_i8 = bitcast float* %a to i8*
%res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll
index 68d486699cbc..8f2207fafef9 100644
--- a/test/CodeGen/X86/avx2-logic.ll
+++ b/test/CodeGen/X86/avx2-logic.ll
@@ -4,14 +4,14 @@
define <4 x i64> @vpandn(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpandn:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X32-NEXT: vpsubq %ymm1, %ymm0, %ymm1
; X32-NEXT: vpandn %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpandn:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; X64-NEXT: vpsubq %ymm1, %ymm0, %ymm1
; X64-NEXT: vpandn %ymm0, %ymm1, %ymm0
@@ -26,14 +26,14 @@ entry:
define <4 x i64> @vpand(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpand:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X32-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpand:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -47,14 +47,14 @@ entry:
define <4 x i64> @vpor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpor:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X32-NEXT: vpor %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpor:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -68,14 +68,14 @@ entry:
define <4 x i64> @vpxor(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; X32-LABEL: vpxor:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X32-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X32-NEXT: vpxor %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpxor:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0
@@ -89,14 +89,14 @@ entry:
define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) {
; X32-LABEL: vpblendvb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $7, %ymm0, %ymm0
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vpblendvb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $7, %ymm0, %ymm0
; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
@@ -107,12 +107,12 @@ define <32 x i8> @vpblendvb(<32 x i1> %cond, <32 x i8> %x, <32 x i8> %y) {
define <8 x i32> @allOnes() nounwind {
; X32-LABEL: allOnes:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: allOnes:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
@@ -120,12 +120,12 @@ define <8 x i32> @allOnes() nounwind {
define <16 x i16> @allOnes2() nounwind {
; X32-LABEL: allOnes2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: allOnes2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
ret <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
diff --git a/test/CodeGen/X86/avx2-masked-gather.ll b/test/CodeGen/X86/avx2-masked-gather.ll
new file mode 100644
index 000000000000..eb482c24cc9c
--- /dev/null
+++ b/test/CodeGen/X86/avx2-masked-gather.ll
@@ -0,0 +1,807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=skylake -mtriple=i386-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X86 %s
+; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mcpu=skx -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2,-avx512f | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mcpu=skylake -mtriple=x86_64-unknown-linux-gnu -mattr=-avx2 | FileCheck --check-prefix=NOGATHER %s
+
+declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i32> %passthro)
+
+define <2 x i32> @masked_gather_v2i32(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
+; X86-LABEL: masked_gather_v2i32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2i32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovdqa (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2i32:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB0_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB0_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB0_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: movl (%rax), %eax
+; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
+; NOGATHER-NEXT: .LBB0_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <2 x i32*>, <2 x i32*>* %ptr
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
+ ret <2 x i32> %res
+}
+
+define <4 x i32> @masked_gather_v2i32_concat(<2 x i32*>* %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
+; X86-LABEL: masked_gather_v2i32_concat:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpgatherdd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2i32_concat:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovdqa (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vpgatherqd %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2i32_concat:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB1_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB1_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB1_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: movl (%rax), %eax
+; NOGATHER-NEXT: vpinsrq $1, %rax, %xmm2, %xmm2
+; NOGATHER-NEXT: .LBB1_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <2 x i32*>, <2 x i32*>* %ptr
+ %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ld, i32 0, <2 x i1> %masks, <2 x i32> %passthro)
+ %res2 = shufflevector <2 x i32> %res, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %res2
+}
+
+declare <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ptrs, i32 %align, <2 x i1> %masks, <2 x float> %passthro)
+
+define <2 x float> @masked_gather_v2float(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
+; X86-LABEL: masked_gather_v2float:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2float:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovaps (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2float:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB2_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB2_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB2_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: .LBB2_4: # %else2
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <2 x float*>, <2 x float*>* %ptr
+ %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
+ ret <2 x float> %res
+}
+
+define <4 x float> @masked_gather_v2float_concat(<2 x float*>* %ptr, <2 x i1> %masks, <2 x float> %passthro) {
+; X86-LABEL: masked_gather_v2float_concat:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vgatherdps %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovaps %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2float_concat:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovaps (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2float_concat:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB3_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB3_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB3_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: .LBB3_4: # %else2
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <2 x float*>, <2 x float*>* %ptr
+ %res = call <2 x float> @llvm.masked.gather.v2float(<2 x float*> %ld, i32 0, <2 x i1> %masks, <2 x float> %passthro)
+ %res2 = shufflevector <2 x float> %res, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %res2
+}
+
+
+declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthro)
+
+define <4 x i32> @masked_gather_v4i32(<4 x i32*> %ptrs, <4 x i1> %masks, <4 x i32> %passthro) {
+; X86-LABEL: masked_gather_v4i32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vpslld $31, %xmm1, %xmm1
+; X86-NEXT: vpgatherdd %xmm1, (,%xmm0), %xmm2
+; X86-NEXT: vmovdqa %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v4i32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpslld $31, %xmm1, %xmm1
+; X64-NEXT: vpgatherqd %xmm1, (,%ymm0), %xmm2
+; X64-NEXT: vmovdqa %xmm2, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4i32:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm3
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm0, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB4_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: .LBB4_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: .LBB4_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB4_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: .LBB4_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; NOGATHER-NEXT: vzeroupper
+; NOGATHER-NEXT: retq
+entry:
+ %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 0, <4 x i1> %masks, <4 x i32> %passthro)
+ ret <4 x i32> %res
+}
+
+declare <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 %align, <4 x i1> %masks, <4 x float> %passthro)
+
+define <4 x float> @masked_gather_v4float(<4 x float*> %ptrs, <4 x i1> %masks, <4 x float> %passthro) {
+; X86-LABEL: masked_gather_v4float:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vpslld $31, %xmm1, %xmm1
+; X86-NEXT: vgatherdps %xmm1, (,%xmm0), %xmm2
+; X86-NEXT: vmovaps %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v4float:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpslld $31, %xmm1, %xmm1
+; X64-NEXT: vgatherqps %xmm1, (,%ymm0), %xmm2
+; X64-NEXT: vmovaps %xmm2, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4float:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vpextrb $0, %xmm1, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm3
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm0, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB5_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
+; NOGATHER-NEXT: .LBB5_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],mem[0],xmm3[3]
+; NOGATHER-NEXT: .LBB5_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm1, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB5_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm0, %xmm0
+; NOGATHER-NEXT: vpextrq $1, %xmm0, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; NOGATHER-NEXT: .LBB5_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm1, %xmm0
+; NOGATHER-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0
+; NOGATHER-NEXT: vzeroupper
+; NOGATHER-NEXT: retq
+entry:
+ %res = call <4 x float> @llvm.masked.gather.v4float(<4 x float*> %ptrs, i32 0, <4 x i1> %masks, <4 x float> %passthro)
+ ret <4 x float> %res
+}
+
+declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 %align, <8 x i1> %masks, <8 x i32> %passthro)
+
+define <8 x i32> @masked_gather_v8i32(<8 x i32*>* %ptr, <8 x i1> %masks, <8 x i32> %passthro) {
+; X86-LABEL: masked_gather_v8i32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpslld $31, %ymm0, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovdqa (%eax), %ymm2
+; X86-NEXT: vpgatherdd %ymm0, (,%ymm2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v8i32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: vpslld $31, %ymm0, %ymm0
+; X64-NEXT: vpsrad $31, %ymm0, %ymm0
+; X64-NEXT: vmovdqa (%rdi), %ymm2
+; X64-NEXT: vmovdqa 32(%rdi), %ymm3
+; X64-NEXT: vextracti128 $1, %ymm1, %xmm4
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
+; X64-NEXT: vpgatherqd %xmm5, (,%ymm3), %xmm4
+; X64-NEXT: vpgatherqd %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v8i32:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm4
+; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %ymm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB6_2: # %else
+; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm5
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_4: # %else2
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm5
+; NOGATHER-NEXT: vmovq %xmm5, %rax
+; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm2, %xmm5
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_6: # %else5
+; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm4
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm2, %xmm4
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB6_8: # %else8
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_10
+; NOGATHER-NEXT: # %bb.9: # %cond.load10
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrd $0, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_10: # %else11
+; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_12
+; NOGATHER-NEXT: # %bb.11: # %cond.load13
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrd $1, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_12: # %else14
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_14
+; NOGATHER-NEXT: # %bb.13: # %cond.load16
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrd $2, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_14: # %else17
+; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB6_16
+; NOGATHER-NEXT: # %bb.15: # %cond.load19
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB6_16: # %else20
+; NOGATHER-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; NOGATHER-NEXT: vpslld $31, %xmm3, %xmm3
+; NOGATHER-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <8 x i32*>, <8 x i32*>* %ptr
+ %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ld, i32 0, <8 x i1> %masks, <8 x i32> %passthro)
+ ret <8 x i32> %res
+}
+
+declare <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ptrs, i32 %align, <8 x i1> %masks, <8 x float> %passthro)
+
+define <8 x float> @masked_gather_v8float(<8 x float*>* %ptr, <8 x i1> %masks, <8 x float> %passthro) {
+; X86-LABEL: masked_gather_v8float:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X86-NEXT: vpslld $31, %ymm0, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovaps (%eax), %ymm2
+; X86-NEXT: vgatherdps %ymm0, (,%ymm2), %ymm1
+; X86-NEXT: vmovaps %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v8float:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: vpslld $31, %ymm0, %ymm0
+; X64-NEXT: vpsrad $31, %ymm0, %ymm0
+; X64-NEXT: vmovaps (%rdi), %ymm2
+; X64-NEXT: vmovaps 32(%rdi), %ymm3
+; X64-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm5
+; X64-NEXT: vgatherqps %xmm5, (,%ymm3), %xmm4
+; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v8float:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm4
+; NOGATHER-NEXT: vmovdqa 32(%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %ymm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: .LBB7_2: # %else
+; NOGATHER-NEXT: vpextrb $2, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0],mem[0],xmm2[2,3]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_4: # %else2
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm5
+; NOGATHER-NEXT: vmovq %xmm5, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm5 = xmm2[0,1],mem[0],xmm2[3]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_6: # %else5
+; NOGATHER-NEXT: vpextrb $6, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm4, %xmm4
+; NOGATHER-NEXT: vpextrq $1, %xmm4, %rax
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm2[0,1,2],mem[0]
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB7_8: # %else8
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_10
+; NOGATHER-NEXT: # %bb.9: # %cond.load10
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm5
+; NOGATHER-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_10: # %else11
+; NOGATHER-NEXT: vpextrb $10, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_12
+; NOGATHER-NEXT: # %bb.11: # %cond.load13
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],mem[0],xmm4[2,3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_12: # %else14
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_14
+; NOGATHER-NEXT: # %bb.13: # %cond.load16
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],mem[0],xmm4[3]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_14: # %else17
+; NOGATHER-NEXT: vpextrb $14, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB7_16
+; NOGATHER-NEXT: # %bb.15: # %cond.load19
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],mem[0]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB7_16: # %else20
+; NOGATHER-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; NOGATHER-NEXT: vpslld $31, %xmm3, %xmm3
+; NOGATHER-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <8 x float*>, <8 x float*>* %ptr
+ %res = call <8 x float> @llvm.masked.gather.v8float(<8 x float*> %ld, i32 0, <8 x i1> %masks, <8 x float> %passthro)
+ ret <8 x float> %res
+}
+
+declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i64> %passthro)
+
+define <4 x i64> @masked_gather_v4i64(<4 x i64*>* %ptr, <4 x i1> %masks, <4 x i64> %passthro) {
+; X86-LABEL: masked_gather_v4i64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpmovsxdq %xmm0, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovdqa (%eax), %xmm2
+; X86-NEXT: vpgatherdq %ymm0, (,%xmm2), %ymm1
+; X86-NEXT: vmovdqa %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v4i64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vpmovsxdq %xmm0, %ymm0
+; X64-NEXT: vmovdqa (%rdi), %ymm2
+; X64-NEXT: vpgatherqq %ymm0, (,%ymm2), %ymm1
+; X64-NEXT: vmovdqa %ymm1, %ymm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4i64:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %ymm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB8_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm4
+; NOGATHER-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; NOGATHER-NEXT: .LBB8_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vpinsrq $0, (%rax), %xmm4, %xmm4
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB8_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB8_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm3, %xmm3
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB8_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm3
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <4 x i64*>, <4 x i64*>* %ptr
+ %res = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ld, i32 0, <4 x i1> %masks, <4 x i64> %passthro)
+ ret <4 x i64> %res
+}
+
+declare <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ptrs, i32 %align, <4 x i1> %masks, <4 x double> %passthro)
+
+define <4 x double> @masked_gather_v4double(<4 x double*>* %ptr, <4 x i1> %masks, <4 x double> %passthro) {
+; X86-LABEL: masked_gather_v4double:
+; X86: # %bb.0: # %entry
+; X86-NEXT: vpslld $31, %xmm0, %xmm0
+; X86-NEXT: vpmovsxdq %xmm0, %ymm0
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vmovapd (%eax), %xmm2
+; X86-NEXT: vgatherdpd %ymm0, (,%xmm2), %ymm1
+; X86-NEXT: vmovapd %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v4double:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-NEXT: vpmovsxdq %xmm0, %ymm0
+; X64-NEXT: vmovapd (%rdi), %ymm2
+; X64-NEXT: vgatherqpd %ymm0, (,%ymm2), %ymm1
+; X64-NEXT: vmovapd %ymm1, %ymm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v4double:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %ymm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %ymm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB9_2: # %else
+; NOGATHER-NEXT: vpextrb $4, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm4 = xmm2[0],mem[0]
+; NOGATHER-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
+; NOGATHER-NEXT: .LBB9_4: # %else2
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_6
+; NOGATHER-NEXT: # %bb.5: # %cond.load4
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm4
+; NOGATHER-NEXT: vmovq %xmm4, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm4
+; NOGATHER-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB9_6: # %else5
+; NOGATHER-NEXT: vpextrb $12, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB9_8
+; NOGATHER-NEXT: # %bb.7: # %cond.load7
+; NOGATHER-NEXT: vextractf128 $1, %ymm3, %xmm3
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vextractf128 $1, %ymm2, %xmm3
+; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; NOGATHER-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; NOGATHER-NEXT: .LBB9_8: # %else8
+; NOGATHER-NEXT: vpslld $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpsrad $31, %xmm0, %xmm0
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm3
+; NOGATHER-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; NOGATHER-NEXT: vpmovsxdq %xmm0, %xmm0
+; NOGATHER-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; NOGATHER-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <4 x double*>, <4 x double*>* %ptr
+ %res = call <4 x double> @llvm.masked.gather.v4double(<4 x double*> %ld, i32 0, <4 x i1> %masks, <4 x double> %passthro)
+ ret <4 x double> %res
+}
+
+declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 %align, <2 x i1> %masks, <2 x i64> %passthro)
+
+define <2 x i64> @masked_gather_v2i64(<2 x i64*>* %ptr, <2 x i1> %masks, <2 x i64> %passthro) {
+; X86-LABEL: masked_gather_v2i64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmovsxdq (%eax), %xmm2
+; X86-NEXT: vpsllq $63, %xmm0, %xmm0
+; X86-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2i64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpsllq $63, %xmm0, %xmm0
+; X64-NEXT: vmovdqa (%rdi), %xmm2
+; X64-NEXT: vpgatherqq %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2i64:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB10_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB10_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB10_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
+; NOGATHER-NEXT: .LBB10_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <2 x i64*>, <2 x i64*>* %ptr
+ %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ld, i32 0, <2 x i1> %masks, <2 x i64> %passthro)
+ ret <2 x i64> %res
+}
+
+declare <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ptrs, i32 %align, <2 x i1> %masks, <2 x double> %passthro)
+
+define <2 x double> @masked_gather_v2double(<2 x double*>* %ptr, <2 x i1> %masks, <2 x double> %passthro) {
+; X86-LABEL: masked_gather_v2double:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpmovsxdq (%eax), %xmm2
+; X86-NEXT: vpsllq $63, %xmm0, %xmm0
+; X86-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
+; X86-NEXT: vmovapd %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: masked_gather_v2double:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpsllq $63, %xmm0, %xmm0
+; X64-NEXT: vmovapd (%rdi), %xmm2
+; X64-NEXT: vgatherqpd %xmm0, (,%xmm2), %xmm1
+; X64-NEXT: vmovapd %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; NOGATHER-LABEL: masked_gather_v2double:
+; NOGATHER: # %bb.0: # %entry
+; NOGATHER-NEXT: vmovdqa (%rdi), %xmm3
+; NOGATHER-NEXT: vpextrb $0, %xmm0, %eax
+; NOGATHER-NEXT: # implicit-def: %xmm2
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB11_2
+; NOGATHER-NEXT: # %bb.1: # %cond.load
+; NOGATHER-NEXT: vmovq %xmm3, %rax
+; NOGATHER-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; NOGATHER-NEXT: .LBB11_2: # %else
+; NOGATHER-NEXT: vpextrb $8, %xmm0, %eax
+; NOGATHER-NEXT: testb $1, %al
+; NOGATHER-NEXT: je .LBB11_4
+; NOGATHER-NEXT: # %bb.3: # %cond.load1
+; NOGATHER-NEXT: vpextrq $1, %xmm3, %rax
+; NOGATHER-NEXT: vmovhpd {{.*#+}} xmm2 = xmm2[0],mem[0]
+; NOGATHER-NEXT: .LBB11_4: # %else2
+; NOGATHER-NEXT: vpsllq $63, %xmm0, %xmm0
+; NOGATHER-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; NOGATHER-NEXT: retq
+entry:
+ %ld = load <2 x double*>, <2 x double*>* %ptr
+ %res = call <2 x double> @llvm.masked.gather.v2double(<2 x double*> %ld, i32 0, <2 x i1> %masks, <2 x double> %passthro)
+ ret <2 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx2-nontemporal.ll b/test/CodeGen/X86/avx2-nontemporal.ll
index 55c966f6f884..dac8b0e704ef 100644
--- a/test/CodeGen/X86/avx2-nontemporal.ll
+++ b/test/CodeGen/X86/avx2-nontemporal.ll
@@ -2,9 +2,9 @@
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=X64
-define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H) nounwind {
+define i32 @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32> %F, <16 x i16> %G, <32 x i8> %H, i32* %loadptr) nounwind {
; X32-LABEL: f:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-32, %esp
@@ -12,59 +12,85 @@ define void @f(<8 x float> %A, i8* %B, <4 x double> %C, <4 x i64> %E, <8 x i32>
; X32-NEXT: vmovdqa 104(%ebp), %ymm3
; X32-NEXT: vmovdqa 72(%ebp), %ymm4
; X32-NEXT: vmovdqa 40(%ebp), %ymm5
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: vaddps .LCPI0_0, %ymm0, %ymm0
-; X32-NEXT: vmovntps %ymm0, (%eax)
-; X32-NEXT: vpaddq .LCPI0_1, %ymm2, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
-; X32-NEXT: vaddpd .LCPI0_2, %ymm1, %ymm0
-; X32-NEXT: vmovntpd %ymm0, (%eax)
-; X32-NEXT: vpaddd .LCPI0_3, %ymm5, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
-; X32-NEXT: vpaddw .LCPI0_4, %ymm4, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
-; X32-NEXT: vpaddb .LCPI0_5, %ymm3, %ymm0
-; X32-NEXT: vmovntdq %ymm0, (%eax)
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movl 136(%ebp), %edx
+; X32-NEXT: movl (%edx), %eax
+; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT: vmovntps %ymm0, (%ecx)
+; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm2, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
+; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm1, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntpd %ymm0, (%ecx)
+; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm5, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
+; X32-NEXT: vpaddw {{\.LCPI.*}}, %ymm4, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
+; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm3, %ymm0
+; X32-NEXT: addl (%edx), %eax
+; X32-NEXT: vmovntdq %ymm0, (%ecx)
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: f:
-; X64: # BB#0:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovntps %ymm0, (%rdi)
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vaddpd {{.*}}(%rip), %ymm1, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntpd %ymm0, (%rdi)
; X64-NEXT: vpaddd {{.*}}(%rip), %ymm3, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vpaddw {{.*}}(%rip), %ymm4, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vpaddb {{.*}}(%rip), %ymm5, %ymm0
+; X64-NEXT: addl (%rsi), %eax
; X64-NEXT: vmovntdq %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <8 x float>*
%A2 = fadd <8 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
store <8 x float> %A2, <8 x float>* %cast, align 32, !nontemporal !0
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, <i64 1, i64 2, i64 3, i64 4>
store <4 x i64> %E2, <4 x i64>* %cast1, align 32, !nontemporal !0
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <4 x double>*
%C2 = fadd <4 x double> %C, <double 1.0, double 2.0, double 3.0, double 4.0>
store <4 x double> %C2, <4 x double>* %cast2, align 32, !nontemporal !0
+ %v3 = load i32, i32* %loadptr, align 1
%cast3 = bitcast i8* %B to <8 x i32>*
%F2 = add <8 x i32> %F, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
store <8 x i32> %F2, <8 x i32>* %cast3, align 32, !nontemporal !0
+ %v4 = load i32, i32* %loadptr, align 1
%cast4 = bitcast i8* %B to <16 x i16>*
%G2 = add <16 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
store <16 x i16> %G2, <16 x i16>* %cast4, align 32, !nontemporal !0
+ %v5 = load i32, i32* %loadptr, align 1
%cast5 = bitcast i8* %B to <32 x i8>*
%H2 = add <32 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
store <32 x i8> %H2, <32 x i8>* %cast5, align 32, !nontemporal !0
- ret void
+ %v6 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ %sum4 = add i32 %sum3, %v4
+ %sum5 = add i32 %sum4, %v5
+ %sum6 = add i32 %sum5, %v6
+ ret i32 %sum5
}
!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx2-phaddsub.ll b/test/CodeGen/X86/avx2-phaddsub.ll
index 232a3326fa13..67ea37575abc 100644
--- a/test/CodeGen/X86/avx2-phaddsub.ll
+++ b/test/CodeGen/X86/avx2-phaddsub.ll
@@ -4,12 +4,12 @@
define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
; X32-LABEL: phaddw1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phaddw1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
@@ -20,12 +20,12 @@ define <16 x i16> @phaddw1(<16 x i16> %x, <16 x i16> %y) {
define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) {
; X32-LABEL: phaddw2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phaddw2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31>
@@ -36,12 +36,12 @@ define <16 x i16> @phaddw2(<16 x i16> %x, <16 x i16> %y) {
define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: phaddd1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phaddd1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -52,12 +52,12 @@ define <8 x i32> @phaddd1(<8 x i32> %x, <8 x i32> %y) {
define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: phaddd2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phaddd2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
@@ -68,12 +68,12 @@ define <8 x i32> @phaddd2(<8 x i32> %x, <8 x i32> %y) {
define <8 x i32> @phaddd3(<8 x i32> %x) {
; X32-LABEL: phaddd3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phaddd3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
@@ -84,12 +84,12 @@ define <8 x i32> @phaddd3(<8 x i32> %x) {
define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) {
; X32-LABEL: phsubw1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phsubw1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <16 x i16> %x, <16 x i16> %y, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30>
@@ -100,12 +100,12 @@ define <16 x i16> @phsubw1(<16 x i16> %x, <16 x i16> %y) {
define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: phsubd1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phsubd1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -116,12 +116,12 @@ define <8 x i32> @phsubd1(<8 x i32> %x, <8 x i32> %y) {
define <8 x i32> @phsubd2(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: phsubd2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: phsubd2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%a = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 undef, i32 8, i32 undef, i32 4, i32 6, i32 12, i32 14>
diff --git a/test/CodeGen/X86/avx2-pmovxrm.ll b/test/CodeGen/X86/avx2-pmovxrm.ll
index 7ba7ae527383..67f33b2a14c7 100644
--- a/test/CodeGen/X86/avx2-pmovxrm.ll
+++ b/test/CodeGen/X86/avx2-pmovxrm.ll
@@ -6,13 +6,13 @@
define <16 x i16> @test_llvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovsxbw:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbw (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovsxbw:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxbw (%rdi), %ymm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -22,13 +22,13 @@ define <16 x i16> @test_llvm_x86_avx2_pmovsxbw(<16 x i8>* %a) {
define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovsxbd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovsxbd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxbd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -39,13 +39,13 @@ define <8 x i32> @test_llvm_x86_avx2_pmovsxbd(<16 x i8>* %a) {
define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovsxbq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbq (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovsxbq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxbq (%rdi), %ymm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -56,13 +56,13 @@ define <4 x i64> @test_llvm_x86_avx2_pmovsxbq(<16 x i8>* %a) {
define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovsxwd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxwd (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovsxwd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxwd (%rdi), %ymm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -72,13 +72,13 @@ define <8 x i32> @test_llvm_x86_avx2_pmovsxwd(<8 x i16>* %a) {
define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovsxwq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxwq (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovsxwq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxwq (%rdi), %ymm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -89,13 +89,13 @@ define <4 x i64> @test_llvm_x86_avx2_pmovsxwq(<8 x i16>* %a) {
define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovsxdq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxdq (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovsxdq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxdq (%rdi), %ymm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a, align 1
@@ -105,13 +105,13 @@ define <4 x i64> @test_llvm_x86_avx2_pmovsxdq(<4 x i32>* %a) {
define <16 x i16> @test_llvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovzxbw:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovzxbw:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -121,13 +121,13 @@ define <16 x i16> @test_llvm_x86_avx2_pmovzxbw(<16 x i8>* %a) {
define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovzxbd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovzxbd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -138,13 +138,13 @@ define <8 x i32> @test_llvm_x86_avx2_pmovzxbd(<16 x i8>* %a) {
define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovzxbq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovzxbq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -155,13 +155,13 @@ define <4 x i64> @test_llvm_x86_avx2_pmovzxbq(<16 x i8>* %a) {
define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovzxwd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovzxwd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -171,13 +171,13 @@ define <8 x i32> @test_llvm_x86_avx2_pmovzxwd(<8 x i16>* %a) {
define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovzxwq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovzxwq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -188,13 +188,13 @@ define <4 x i64> @test_llvm_x86_avx2_pmovzxwq(<8 x i16>* %a) {
define <4 x i64> @test_llvm_x86_avx2_pmovzxdq(<4 x i32>* %a) {
; X32-LABEL: test_llvm_x86_avx2_pmovzxdq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx2_pmovzxdq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a, align 1
diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll
index a3862d7e27c6..df6b1918c6ab 100644
--- a/test/CodeGen/X86/avx2-schedule.ll
+++ b/test/CodeGen/X86/avx2-schedule.ll
@@ -1,22 +1,696 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+define <8 x i32> @test_broadcasti128(<8 x i32> %a0, <4 x i32> *%a1) {
+; GENERIC-LABEL: test_broadcasti128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_broadcasti128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
+; HASWELL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcasti128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [6:0.50]
+; BROADWELL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcasti128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcasti128:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [7:0.50]
+; SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_broadcasti128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32> *%a1, align 16
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %3 = add <8 x i32> %2, %a0
+ ret <8 x i32> %3
+}
+
+define <4 x double> @test_broadcastsd_ymm(<2 x double> %a0) {
+; GENERIC-LABEL: test_broadcastsd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_broadcastsd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastsd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastsd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastsd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_broadcastsd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vbroadcastsd %xmm0, %ymm0 # sched: [100:0.25]
+; ZNVER1-NEXT: vaddpd %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
+ %2 = fadd <4 x double> %1, %1
+ ret <4 x double> %2
+}
+
+define <4 x float> @test_broadcastss(<4 x float> %a0) {
+; GENERIC-LABEL: test_broadcastss:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_broadcastss:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastss:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_broadcastss:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vbroadcastss %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = fadd <4 x float> %1, %1
+ ret <4 x float> %2
+}
+
+define <8 x float> @test_broadcastss_ymm(<4 x float> %a0) {
+; GENERIC-LABEL: test_broadcastss_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_broadcastss_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_broadcastss_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_broadcastss_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_broadcastss_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_broadcastss_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vbroadcastss %xmm0, %ymm0 # sched: [100:0.25]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
+ %2 = fadd <8 x float> %1, %1
+ ret <8 x float> %2
+}
+
+define <4 x i32> @test_extracti128(<8 x i32> %a0, <8 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_extracti128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_extracti128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_extracti128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_extracti128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_extracti128:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.33]
+; SKX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_extracti128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm2 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vextracti128 $1, %ymm0, %xmm0 # sched: [2:0.25]
+; ZNVER1-NEXT: vextracti128 $1, %ymm2, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = add <8 x i32> %a0, %a1
+ %2 = sub <8 x i32> %a0, %a1
+ %3 = shufflevector <8 x i32> %1, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %4 = shufflevector <8 x i32> %2, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ store <4 x i32> %3, <4 x i32> *%a2
+ ret <4 x i32> %4
+}
+
+define <2 x double> @test_gatherdpd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3) {
+; GENERIC-LABEL: test_gatherdpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherdpd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherdpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherdpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherdpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherdpd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %a1, <4 x i32> %a2, <2 x double> %a3, i8 2)
+ ret <2 x double> %1
+}
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_gatherdpd_ymm(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3) {
+; GENERIC-LABEL: test_gatherdpd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherdpd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [27:4.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherdpd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [26:5.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherdpd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherdpd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [25:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherdpd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherdpd %ymm2, (%rdi,%xmm1,8), %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %a1, <4 x i32> %a2, <4 x double> %a3, i8 8)
+ ret <4 x double> %1
+}
+declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
+
+define <4 x float> @test_gatherdps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3) {
+; GENERIC-LABEL: test_gatherdps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherdps:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.67]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherdps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherdps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherdps:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherdps:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %a1, <4 x i32> %a2, <4 x float> %a3, i8 2)
+ ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
+
+define <8 x float> @test_gatherdps_ymm(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3) {
+; GENERIC-LABEL: test_gatherdps_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherdps_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [27:6.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherdps_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [26:4.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherdps_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherdps_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [25:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherdps_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %a1, <8 x i32> %a2, <8 x float> %a3, i8 4)
+ ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
+
+define <2 x double> @test_gatherqpd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3) {
+; GENERIC-LABEL: test_gatherqpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherqpd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [23:3.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherqpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:3.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherqpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherqpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherqpd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %a1, <2 x i64> %a2, <2 x double> %a3, i8 2)
+ ret <2 x double> %1
+}
+declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
+
+define <4 x double> @test_gatherqpd_ymm(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3) {
+; GENERIC-LABEL: test_gatherqpd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherqpd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [24:5.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherqpd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [23:3.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherqpd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherqpd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [25:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherqpd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherqpd %ymm2, (%rdi,%ymm1,8), %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %a1, <4 x i64> %a2, <4 x double> %a3, i8 8)
+ ret <4 x double> %1
+}
+declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
+
+define <4 x float> @test_gatherqps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3) {
+; GENERIC-LABEL: test_gatherqps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherqps:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:3.67]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherqps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [27:5.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherqps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherqps:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherqps:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %a1, <2 x i64> %a2, <4 x float> %a3, i8 2)
+ ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
+
+define <4 x float> @test_gatherqps_ymm(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3) {
+; GENERIC-LABEL: test_gatherqps_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_gatherqps_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [28:3.67]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_gatherqps_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [24:5.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_gatherqps_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_gatherqps_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [25:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_gatherqps_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vgatherqps %xmm2, (%rdi,%ymm1,4), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %a1, <4 x i64> %a2, <4 x float> %a3, i8 4)
+ ret <4 x float> %1
+}
+declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
+
+define <8 x i32> @test_inserti128(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_inserti128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_inserti128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; HASWELL-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; HASWELL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inserti128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_inserti128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKYLAKE-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_inserti128:
+; SKX: # %bb.0:
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_inserti128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 # sched: [2:0.25]
+; ZNVER1-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; ZNVER1-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = shufflevector <8 x i32> %a0, <8 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %3 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = shufflevector <8 x i32> %a0, <8 x i32> %4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %6 = add <8 x i32> %2, %5
+ ret <8 x i32> %6
+}
+
+define <4 x i64> @test_movntdqa(i8* %a0) {
+; GENERIC-LABEL: test_movntdqa:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movntdqa:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntdqa:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntdqa:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntdqa:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_movntdqa:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0)
+ ret <4 x i64> %1
+}
+declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
+
+define <16 x i16> @test_mpsadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_mpsadbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_mpsadbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:2.00]
+; HASWELL-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [14:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mpsadbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [7:2.00]
+; BROADWELL-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mpsadbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; SKYLAKE-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mpsadbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; SKX-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_mpsadbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7)
+ %2 = bitcast <16 x i16> %1 to <32 x i8>
+ %3 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %4 = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %2, <32 x i8> %3, i8 7)
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
+
define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
+; GENERIC-LABEL: test_pabsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpabsb %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpabsb (%rdi), %ymm1 # sched: [7:1.00]
+; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pabsb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpabsb (%rdi), %ymm1 # sched: [7:0.50]
+; BROADWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pabsb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0)
%2 = load <32 x i8>, <32 x i8> *%a1, align 32
%3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2)
@@ -26,19 +700,47 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
+; GENERIC-LABEL: test_pabsd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpabsd %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpabsd (%rdi), %ymm1 # sched: [7:1.00]
+; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pabsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpabsd (%rdi), %ymm1 # sched: [7:0.50]
+; BROADWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pabsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0)
%2 = load <8 x i32>, <8 x i32> *%a1, align 32
%3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2)
@@ -48,19 +750,47 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
+; GENERIC-LABEL: test_pabsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpabsw %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpabsw (%rdi), %ymm1 # sched: [7:1.00]
+; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pabsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50]
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpabsw (%rdi), %ymm1 # sched: [7:0.50]
+; BROADWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pabsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0)
%2 = load <16 x i16>, <16 x i16> *%a1, align 32
%3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2)
@@ -69,18 +799,218 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+define <16 x i16> @test_packssdw(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_packssdw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packssdw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packssdw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packssdw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packssdw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_packssdw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = bitcast <16 x i16> %1 to <8 x i32>
+ %3 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %4 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %2, <8 x i32> %3)
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @test_packsswb(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_packsswb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packsswb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packsswb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packsswb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packsswb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_packsswb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ %3 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %4 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %2, <16 x i16> %3)
+ ret <32 x i8> %4
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @test_packusdw(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_packusdw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packusdw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packusdw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packusdw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packusdw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_packusdw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = bitcast <16 x i16> %1 to <8 x i32>
+ %3 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %4 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %2, <8 x i32> %3)
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @test_packuswb(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_packuswb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packuswb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packuswb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packuswb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packuswb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_packuswb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ %3 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %4 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %2, <16 x i16> %3)
+ ret <32 x i8> %4
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_paddb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_paddb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_paddb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <32 x i8> %a0, %a1
%2 = load <32 x i8>, <32 x i8> *%a2, align 32
%3 = add <32 x i8> %1, %2
@@ -88,17 +1018,41 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
}
define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_paddd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_paddd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_paddd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <8 x i32> %a0, %a1
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = add <8 x i32> %1, %2
@@ -106,55 +1060,345 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
}
define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_paddq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_paddq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_paddq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = add <4 x i64> %1, %2
ret <4 x i64> %3
}
+define <32 x i8> @test_paddsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_paddsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_paddsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @test_paddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_paddsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_paddsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @test_paddusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_paddusb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddusb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddusb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddusb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddusb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_paddusb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @test_paddusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_paddusw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddusw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddusw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddusw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddusw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_paddusw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_paddw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_paddw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_paddw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <16 x i16> %a0, %a1
%2 = load <16 x i16>, <16 x i16> *%a2, align 32
%3 = add <16 x i16> %1, %2
ret <16 x i16> %3
}
+define <32 x i8> @test_palignr(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_palignr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
+; GENERIC-NEXT: vpalignr {{.*#+}} ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_palignr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
+; HASWELL-NEXT: vpalignr {{.*#+}} ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_palignr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
+; BROADWELL-NEXT: vpalignr {{.*#+}} ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_palignr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
+; SKYLAKE-NEXT: vpalignr {{.*#+}} ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_palignr:
+; SKX: # %bb.0:
+; SKX-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:1.00]
+; SKX-NEXT: vpalignr {{.*#+}} ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_palignr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [1:0.25]
+; ZNVER1-NEXT: vpalignr {{.*#+}} ymm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],mem[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = shufflevector <32 x i8> %2, <32 x i8> %1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+ ret <32 x i8> %3
+}
+
define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_pand:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pand:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pand:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pand:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pand:
+; SKX: # %bb.0:
+; SKX-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pand:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = and <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = and <4 x i64> %1, %2
@@ -163,19 +1407,47 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
}
define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_pandn:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pandn:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pandn:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [7:0.50]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pandn:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pandn:
+; SKX: # %bb.0:
+; SKX-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pandn:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
%2 = and <4 x i64> %a1, %1
%3 = load <4 x i64>, <4 x i64> *%a2, align 32
@@ -185,18 +1457,3490 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
ret <4 x i64> %6
}
+define <32 x i8> @test_pavgb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pavgb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pavgb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pavgb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pavgb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pavgb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pavgb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpavgb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <32 x i8> %a0 to <32 x i16>
+ %2 = zext <32 x i8> %a1 to <32 x i16>
+ %3 = add <32 x i16> %1, %2
+ %4 = add <32 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %5 = lshr <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = trunc <32 x i16> %5 to <32 x i8>
+ %7 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %8 = zext <32 x i8> %6 to <32 x i16>
+ %9 = zext <32 x i8> %7 to <32 x i16>
+ %10 = add <32 x i16> %8, %9
+ %11 = add <32 x i16> %10, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %12 = lshr <32 x i16> %11, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %13 = trunc <32 x i16> %12 to <32 x i8>
+ ret <32 x i8> %13
+}
+
+define <16 x i16> @test_pavgw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pavgw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pavgw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pavgw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pavgw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pavgw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pavgw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpavgw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <16 x i16> %a0 to <16 x i32>
+ %2 = zext <16 x i16> %a1 to <16 x i32>
+ %3 = add <16 x i32> %1, %2
+ %4 = add <16 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = lshr <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = trunc <16 x i32> %5 to <16 x i16>
+ %7 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %8 = zext <16 x i16> %6 to <16 x i32>
+ %9 = zext <16 x i16> %7 to <16 x i32>
+ %10 = add <16 x i32> %8, %9
+ %11 = add <16 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %12 = lshr <16 x i32> %11, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %13 = trunc <16 x i32> %12 to <16 x i16>
+ ret <16 x i16> %13
+}
+
+define <4 x i32> @test_pblendd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_pblendd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] sched: [1:0.50]
+; GENERIC-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [5:0.50]
+; GENERIC-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pblendd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
+; HASWELL-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
+; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pblendd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
+; BROADWELL-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [6:0.50]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pblendd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
+; SKYLAKE-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pblendd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] sched: [1:0.33]
+; SKX-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [7:0.50]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pblendd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] sched: [1:0.50]
+; ZNVER1-NEXT: vpblendd {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2],xmm1[3] sched: [8:1.00]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 3>
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+ %4 = add <4 x i32> %a0, %3
+ ret <4 x i32> %4
+}
+
+define <8 x i32> @test_pblendd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pblendd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
+; GENERIC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [5:0.50]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pblendd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
+; HASWELL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pblendd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
+; BROADWELL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [7:0.50]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pblendd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
+; SKYLAKE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pblendd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.33]
+; SKX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [8:0.50]
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pblendd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] sched: [1:0.50]
+; ZNVER1-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],mem[1,2],ymm1[3,4,5,6,7] sched: [9:1.50]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 15>
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = add <8 x i32> %a0, %3
+ ret <8 x i32> %4
+}
+
+define <32 x i8> @test_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2, <32 x i8> *%a3, <32 x i8> %a4) {
+; GENERIC-LABEL: test_pblendvb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
+; GENERIC-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pblendvb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pblendvb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pblendvb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
+; SKYLAKE-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pblendvb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [2:0.67]
+; SKX-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pblendvb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpblendvb %ymm3, (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2)
+ %2 = load <32 x i8>, <32 x i8> *%a3, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %1, <32 x i8> %2, <32 x i8> %a4)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @test_pblendw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pblendw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:0.50]
+; GENERIC-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7],mem[8],ymm0[9],mem[10],ymm0[11],mem[12],ymm0[13],mem[14],ymm0[15] sched: [5:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pblendw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
+; HASWELL-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7],mem[8],ymm0[9],mem[10],ymm0[11],mem[12],ymm0[13],mem[14],ymm0[15] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pblendw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
+; BROADWELL-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7],mem[8],ymm0[9],mem[10],ymm0[11],mem[12],ymm0[13],mem[14],ymm0[15] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pblendw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
+; SKYLAKE-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7],mem[8],ymm0[9],mem[10],ymm0[11],mem[12],ymm0[13],mem[14],ymm0[15] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pblendw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [1:1.00]
+; SKX-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7],mem[8],ymm0[9],mem[10],ymm0[11],mem[12],ymm0[13],mem[14],ymm0[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pblendw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7,8,9],ymm1[10,11,12],ymm0[13,14,15] sched: [2:0.33]
+; ZNVER1-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7],mem[8],ymm0[9],mem[10],ymm0[11],mem[12],ymm0[13],mem[14],ymm0[15] sched: [9:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 5, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 28, i32 13, i32 14, i32 15>
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+ ret <16 x i16> %3
+}
+
+define <16 x i8> @test_pbroadcastb(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pbroadcastb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [7:1.00]
+; SKX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %4 = add <16 x i8> %1, %3
+ ret <16 x i8> %4
+}
+
+define <32 x i8> @test_pbroadcastb_ymm(<32 x i8> %a0, <32 x i8> *%a1) {
+; GENERIC-LABEL: test_pbroadcastb_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastb_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastb_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastb_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastb_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastb_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [8:2.00]
+; ZNVER1-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [2:0.25]
+; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> zeroinitializer
+ %2 = load <32 x i8>, <32 x i8> *%a1, align 32
+ %3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> zeroinitializer
+ %4 = add <32 x i8> %1, %3
+ ret <32 x i8> %4
+}
+
+define <4 x i32> @test_pbroadcastd(<4 x i32> %a0, <4 x i32> *%a1) {
+; GENERIC-LABEL: test_pbroadcastd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
+; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [5:0.50]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [6:0.50]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = load <4 x i32>, <4 x i32> *%a1, align 16
+ %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = add <4 x i32> %1, %3
+ ret <4 x i32> %4
+}
+
+define <8 x i32> @test_pbroadcastd_ymm(<8 x i32> %a0, <8 x i32> *%a1) {
+; GENERIC-LABEL: test_pbroadcastd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [2:0.25]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> zeroinitializer
+ %2 = load <8 x i32>, <8 x i32> *%a1, align 32
+ %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer
+ %4 = add <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+
+define <2 x i64> @test_pbroadcastq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_pbroadcastq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
+; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [5:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [6:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpaddq (%rdi){1to2}, %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = load <2 x i64>, <2 x i64> *%a1, align 16
+ %3 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %4 = add <2 x i64> %1, %3
+ ret <2 x i64> %4
+}
+
+define <4 x i64> @test_pbroadcastq_ymm(<4 x i64> %a0, <4 x i64> *%a1) {
+; GENERIC-LABEL: test_pbroadcastq_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastq_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastq_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastq_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastq_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpaddq (%rdi){1to4}, %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastq_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [2:0.25]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
+ %2 = load <4 x i64>, <4 x i64> *%a1, align 32
+ %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> zeroinitializer
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <8 x i16> @test_pbroadcastw(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_pbroadcastw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [7:1.00]
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [8:1.00]
+; ZNVER1-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = load <8 x i16>, <8 x i16> *%a1, align 16
+ %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %4 = add <8 x i16> %1, %3
+ ret <8 x i16> %4
+}
+
+define <16 x i16> @test_pbroadcastw_ymm(<16 x i16> %a0, <16 x i16> *%a1) {
+; GENERIC-LABEL: test_pbroadcastw_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pbroadcastw_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pbroadcastw_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pbroadcastw_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pbroadcastw_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pbroadcastw_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [8:2.00]
+; ZNVER1-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [2:0.25]
+; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> zeroinitializer
+ %2 = load <16 x i16>, <16 x i16> *%a1, align 32
+ %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> zeroinitializer
+ %4 = add <16 x i16> %1, %3
+ ret <16 x i16> %4
+}
+
+define <32 x i8> @test_pcmpeqb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pcmpeqb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2b %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpeqb (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2b %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp eq <32 x i8> %a0, %a1
+ %2 = sext <32 x i1> %1 to <32 x i8>
+ %3 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %4 = icmp eq <32 x i8> %2, %3
+ %5 = sext <32 x i1> %4 to <32 x i8>
+ ret <32 x i8> %5
+}
+
+define <8 x i32> @test_pcmpeqd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pcmpeqd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp eq <8 x i32> %a0, %a1
+ %2 = sext <8 x i1> %1 to <8 x i32>
+ %3 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %4 = icmp eq <8 x i32> %2, %3
+ %5 = sext <8 x i1> %4 to <8 x i32>
+ ret <8 x i32> %5
+}
+
+define <4 x i64> @test_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_pcmpeqq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2q %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2q %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp eq <4 x i64> %a0, %a1
+ %2 = sext <4 x i1> %1 to <4 x i64>
+ %3 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %4 = icmp eq <4 x i64> %2, %3
+ %5 = sext <4 x i1> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+
+define <16 x i16> @test_pcmpeqw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pcmpeqw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2w %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2w %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp eq <16 x i16> %a0, %a1
+ %2 = sext <16 x i1> %1 to <16 x i16>
+ %3 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %4 = icmp eq <16 x i16> %2, %3
+ %5 = sext <16 x i1> %4 to <16 x i16>
+ ret <16 x i16> %5
+}
+
+define <32 x i8> @test_pcmpgtb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pcmpgtb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2b %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpgtb (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2b %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp sgt <32 x i8> %a0, %a1
+ %2 = sext <32 x i1> %1 to <32 x i8>
+ %3 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %4 = icmp sgt <32 x i8> %2, %3
+ %5 = sext <32 x i1> %4 to <32 x i8>
+ ret <32 x i8> %5
+}
+
+define <8 x i32> @test_pcmpgtd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pcmpgtd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp sgt <8 x i32> %a0, %a1
+ %2 = sext <8 x i1> %1 to <8 x i32>
+ %3 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %4 = icmp sgt <8 x i32> %2, %3
+ %5 = sext <8 x i1> %4 to <8 x i32>
+ ret <8 x i32> %5
+}
+
+define <4 x i64> @test_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_pcmpgtq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2q %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2q %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp sgt <4 x i64> %a0, %a1
+ %2 = sext <4 x i1> %1 to <4 x i64>
+ %3 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %4 = icmp sgt <4 x i64> %2, %3
+ %5 = sext <4 x i1> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+
+define <16 x i16> @test_pcmpgtw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pcmpgtw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2w %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0 # sched: [10:1.00]
+; SKX-NEXT: vpmovm2w %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = icmp sgt <16 x i16> %a0, %a1
+ %2 = sext <16 x i1> %1 to <16 x i16>
+ %3 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %4 = icmp sgt <16 x i16> %2, %3
+ %5 = sext <16 x i1> %4 to <16 x i16>
+ ret <16 x i16> %5
+}
+
+define <4 x i64> @test_perm2i128(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_perm2i128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_perm2i128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; HASWELL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; HASWELL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_perm2i128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; BROADWELL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:1.00]
+; BROADWELL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_perm2i128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKYLAKE-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_perm2i128:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_perm2i128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],ymm1[0,1] sched: [2:0.25]
+; ZNVER1-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [9:0.50]
+; ZNVER1-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %2 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <8 x i32> @test_permd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_permd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_permd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; HASWELL-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_permd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpermd %ymm1, %ymm0, %ymm1 # sched: [2:0.25]
+; ZNVER1-NEXT: vpermd (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; ZNVER1-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> %a0)
+ %4 = add <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+
+define <4 x double> @test_permpd(<4 x double> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_permpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_permpd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; HASWELL-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
+; HASWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; BROADWELL-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:1.00]
+; BROADWELL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; SKYLAKE-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
+; SKYLAKE-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
+; SKX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_permpd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpermpd {{.*#+}} ymm1 = mem[0,2,2,3] sched: [107:0.50]
+; ZNVER1-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [100:0.25]
+; ZNVER1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+ %2 = load <4 x double>, <4 x double> *%a1, align 32
+ %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 2, i32 3>
+ %4 = fadd <4 x double> %1, %3
+ ret <4 x double> %4
+}
+
+define <8 x float> @test_permps(<8 x i32> %a0, <8 x float> %a1, <8 x float> *%a2) {
+; GENERIC-LABEL: test_permps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_permps:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; HASWELL-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_permps:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpermps %ymm1, %ymm0, %ymm1 # sched: [100:0.25]
+; ZNVER1-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [107:0.50]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a1, <8 x i32> %a0)
+ %2 = load <8 x float>, <8 x float> *%a2, align 32
+ %3 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %2, <8 x i32> %a0)
+ %4 = fadd <8 x float> %1, %3
+ ret <8 x float> %4
+}
+declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
+
+define <4 x i64> @test_permq(<4 x i64> %a0, <4 x i64> *%a1) {
+; GENERIC-LABEL: test_permq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_permq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; HASWELL-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_permq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; BROADWELL-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_permq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; SKYLAKE-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_permq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [10:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_permq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpermq {{.*#+}} ymm1 = mem[0,2,2,3] sched: [9:0.50]
+; ZNVER1-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3] sched: [2:0.25]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+ %2 = load <4 x i64>, <4 x i64> *%a1, align 32
+ %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 2, i32 3>
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <4 x i32> @test_pgatherdd(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; GENERIC-LABEL: test_pgatherdd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherdd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherdd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherdd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherdd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherdd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0, i8* %a1, <4 x i32> %a2, <4 x i32> %a3, i8 2)
+ ret <4 x i32> %1
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <8 x i32> @test_pgatherdd_ymm(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3) {
+; GENERIC-LABEL: test_pgatherdd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherdd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [27:6.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherdd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherdd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherdd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherdd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0, i8* %a1, <8 x i32> %a2, <8 x i32> %a3, i8 2)
+ ret <8 x i32> %1
+}
+declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_pgatherdq(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3) {
+; GENERIC-LABEL: test_pgatherdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [26:2.67]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %a1, <4 x i32> %a2, <2 x i64> %a3, i8 2)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_pgatherdq_ymm(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3) {
+; GENERIC-LABEL: test_pgatherdq_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherdq_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [27:4.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherdq_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherdq_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherdq_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [25:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherdq_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %a1, <4 x i32> %a2, <4 x i64> %a3, i8 2)
+ ret <4 x i64> %1
+}
+declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
+
+define <4 x i32> @test_pgatherqd(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3) {
+; GENERIC-LABEL: test_pgatherqd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherqd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [25:5.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherqd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherqd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherqd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherqd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0, i8* %a1, <2 x i64> %a2, <4 x i32> %a3, i8 2)
+ ret <4 x i32> %1
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <4 x i32> @test_pgatherqd_ymm(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3) {
+; GENERIC-LABEL: test_pgatherqd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherqd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [28:5.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherqd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherqd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherqd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [25:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherqd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0, i8* %a1, <4 x i64> %a2, <4 x i32> %a3, i8 2)
+ ret <4 x i32> %1
+}
+declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
+
+define <2 x i64> @test_pgatherqq(<2 x i64> %a0, i8 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
+; GENERIC-LABEL: test_pgatherqq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherqq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [23:3.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherqq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherqq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherqq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [22:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherqq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %a1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
+
+define <4 x i64> @test_pgatherqq_ymm(<4 x i64> %a0, i8 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
+; GENERIC-LABEL: test_pgatherqq_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pgatherqq_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [24:5.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pgatherqq_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pgatherqq_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pgatherqq_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [25:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pgatherqq_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %a1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
+ ret <4 x i64> %1
+}
+declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <8 x i32> @test_phaddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_phaddd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phaddd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddd:
+; SKX: # %bb.0:
+; SKX-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKX-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_phaddd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphaddd %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vphaddd (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_phaddsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_phaddsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phaddsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKX-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_phaddsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphaddsw %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vphaddsw (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @test_phaddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_phaddw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phaddw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKX-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_phaddw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphaddw %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vphaddw (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @test_phsubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_phsubd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phsubd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubd:
+; SKX: # %bb.0:
+; SKX-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKX-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_phsubd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphsubd %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vphsubd (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_phsubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_phsubsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phsubsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKX-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_phsubsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphsubsw %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vphsubsw (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @test_phsubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_phsubw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phsubw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; SKX-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_phsubw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphsubw %ymm1, %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vphsubw (%rdi), %ymm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @test_pmaddubsw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pmaddubsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaddubsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaddubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaddubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaddubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaddubsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = bitcast <16 x i16> %1 to <32 x i8>
+ %3 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %4 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %2, <32 x i8> %3)
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pmaddwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmaddwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaddwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaddwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaddwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaddwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaddwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmaddwd (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = bitcast <8 x i32> %1 to <16 x i16>
+ %3 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %4 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %2, <16 x i16> %3)
+ ret <8 x i32> %4
+}
+declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i32> @test_pmaskmovd(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
+; GENERIC-LABEL: test_pmaskmovd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaskmovd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; HASWELL-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaskmovd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
+; BROADWELL-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaskmovd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaskmovd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKX-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaskmovd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [100:?]
+; ZNVER1-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1)
+ call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
+ ret <4 x i32> %1
+}
+declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
+declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
+
+define <8 x i32> @test_pmaskmovd_ymm(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
+; GENERIC-LABEL: test_pmaskmovd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaskmovd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
+; HASWELL-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaskmovd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
+; BROADWELL-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaskmovd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaskmovd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKX-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaskmovd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [100:?]
+; ZNVER1-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: vmovdqa %ymm2, %ymm0 # sched: [2:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1)
+ call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
+ ret <8 x i32> %1
+}
+declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
+declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
+
+define <2 x i64> @test_pmaskmovq(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; GENERIC-LABEL: test_pmaskmovq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaskmovq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:2.00]
+; HASWELL-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaskmovq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:2.00]
+; BROADWELL-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaskmovq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaskmovq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [7:0.50]
+; SKX-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaskmovq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [8:1.00]
+; ZNVER1-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1)
+ call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
+ ret <2 x i64> %1
+}
+declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
+declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
+
+define <4 x i64> @test_pmaskmovq_ymm(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; GENERIC-LABEL: test_pmaskmovq_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaskmovq_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:2.00]
+; HASWELL-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaskmovq_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:2.00]
+; BROADWELL-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
+; BROADWELL-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaskmovq_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaskmovq_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [8:0.50]
+; SKX-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaskmovq_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [9:1.50]
+; ZNVER1-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: vmovdqa %ymm2, %ymm0 # sched: [2:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1)
+ call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
+ ret <4 x i64> %1
+}
+declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
+declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
+
+define <32 x i8> @test_pmaxsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pmaxsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pmaxsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pmaxsd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxsd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_pmaxsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmaxsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @test_pmaxub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pmaxub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxub:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxub:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaxub:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pmaxud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pmaxud:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxud:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxud:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxud:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxud:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaxud:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_pmaxuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmaxuw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxuw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmaxuw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpmaxuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @test_pminsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pminsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pminsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pminsd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pminsd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminsd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pminsd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminsd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_pminsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pminsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pminsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpminsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @test_pminub(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pminub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminub:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminub:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pminub:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpminub %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminub (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pminud(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pminud:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminud:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminud:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminud:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminud:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pminud:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpminud %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminud (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_pminuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pminuw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminuw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pminuw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpminuw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpminuw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define i32 @test_pmovmskb(<32 x i8> %a0) {
+; GENERIC-LABEL: test_pmovmskb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovmskb %ymm0, %eax # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovmskb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovmskb %ymm0, %eax # sched: [3:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovmskb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovmskb %ymm0, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovmskb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovmskb %ymm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovmskb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovmskb %ymm0, %eax # sched: [2:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovmskb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovmskb %ymm0, %eax # sched: [2:1.00]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0)
+ ret i32 %1
+}
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pmovsxbd(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovsxbd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovsxbd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxbd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxbd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxbd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxbd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovsxbd (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxbd %xmm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = sext <8 x i8> %1 to <8 x i32>
+ %3 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %5 = sext <8 x i8> %4 to <8 x i32>
+ %6 = add <8 x i32> %2, %5
+ ret <8 x i32> %6
+}
+
+define <4 x i64> @test_pmovsxbq(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovsxbq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovsxbq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxbq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxbq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxbq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxbq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovsxbq (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxbq %xmm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = sext <4 x i8> %1 to <4 x i64>
+ %3 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %5 = sext <4 x i8> %4 to <4 x i64>
+ %6 = add <4 x i64> %2, %5
+ ret <4 x i64> %6
+}
+
+define <16 x i16> @test_pmovsxbw(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovsxbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovsxbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [9:1.00]
+; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovsxbw (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = sext <16 x i8> %a0 to <16 x i16>
+ %2 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %3 = sext <16 x i8> %2 to <16 x i16>
+ %4 = add <16 x i16> %1, %3
+ ret <16 x i16> %4
+}
+
+define <4 x i64> @test_pmovsxdq(<4 x i32> %a0, <4 x i32> *%a1) {
+; GENERIC-LABEL: test_pmovsxdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovsxdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [9:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovsxdq (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = sext <4 x i32> %a0 to <4 x i64>
+ %2 = load <4 x i32>, <4 x i32> *%a1, align 16
+ %3 = sext <4 x i32> %2 to <4 x i64>
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <8 x i32> @test_pmovsxwd(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_pmovsxwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovsxwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [9:1.00]
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovsxwd (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = sext <8 x i16> %a0 to <8 x i32>
+ %2 = load <8 x i16>, <8 x i16> *%a1, align 16
+ %3 = sext <8 x i16> %2 to <8 x i32>
+ %4 = add <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+
+define <4 x i64> @test_pmovsxwq(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_pmovsxwq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovsxwq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxwq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxwq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxwq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovsxwq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovsxwq (%rdi), %ymm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vpmovsxwq %xmm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = sext <4 x i16> %1 to <4 x i64>
+ %3 = load <8 x i16>, <8 x i16> *%a1, align 16
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %5 = sext <4 x i16> %4 to <4 x i64>
+ %6 = add <4 x i64> %2, %5
+ ret <4 x i64> %6
+}
+
+define <8 x i32> @test_pmovzxbd(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovzxbd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovzxbd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxbd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; BROADWELL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [9:1.00]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxbd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxbd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxbd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = zext <8 x i8> %1 to <8 x i32>
+ %3 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %5 = zext <8 x i8> %4 to <8 x i32>
+ %6 = add <8 x i32> %2, %5
+ ret <8 x i32> %6
+}
+
+define <4 x i64> @test_pmovzxbq(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovzxbq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovzxbq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxbq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
+; BROADWELL-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxbq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxbq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxbq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = zext <4 x i8> %1 to <4 x i64>
+ %3 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %5 = zext <4 x i8> %4 to <4 x i64>
+ %6 = add <4 x i64> %2, %5
+ ret <4 x i64> %6
+}
+
+define <16 x i16> @test_pmovzxbw(<16 x i8> %a0, <16 x i8> *%a1) {
+; GENERIC-LABEL: test_pmovzxbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00]
+; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovzxbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
+; HASWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; BROADWELL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [9:1.00]
+; BROADWELL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
+; SKX-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:0.50]
+; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <16 x i8> %a0 to <16 x i16>
+ %2 = load <16 x i8>, <16 x i8> *%a1, align 16
+ %3 = zext <16 x i8> %2 to <16 x i16>
+ %4 = add <16 x i16> %1, %3
+ ret <16 x i16> %4
+}
+
+define <4 x i64> @test_pmovzxdq(<4 x i32> %a0, <4 x i32> *%a1) {
+; GENERIC-LABEL: test_pmovzxdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovzxdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; BROADWELL-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovzxdq {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <4 x i32> %a0 to <4 x i64>
+ %2 = load <4 x i32>, <4 x i32> *%a1, align 16
+ %3 = zext <4 x i32> %2 to <4 x i64>
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <8 x i32> @test_pmovzxwd(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_pmovzxwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovzxwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; BROADWELL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <8 x i16> %a0 to <8 x i32>
+ %2 = load <8 x i16>, <8 x i16> *%a1, align 16
+ %3 = zext <8 x i16> %2 to <8 x i32>
+ %4 = add <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+
+define <4 x i64> @test_pmovzxwq(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_pmovzxwq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovzxwq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
+; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxwq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
+; BROADWELL-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxwq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
+; SKYLAKE-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxwq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmovzxwq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovzxwq {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25]
+; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = zext <4 x i16> %1 to <4 x i64>
+ %3 = load <8 x i16>, <8 x i16> *%a1, align 16
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %5 = zext <4 x i16> %4 to <4 x i64>
+ %6 = add <4 x i64> %2, %5
+ ret <4 x i64> %6
+}
+
+define <4 x i64> @test_pmuldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pmuldq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmuldq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmuldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmuldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmuldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmuldq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = bitcast <4 x i64> %1 to <8 x i32>
+ %3 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %4 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %2, <8 x i32> %3)
+ ret <4 x i64> %4
+}
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_pmulhrsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmulhrsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmulhrsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhrsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhrsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhrsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmulhrsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmulhrsw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @test_pmulhuw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmulhuw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmulhuw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmulhuw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmulhuw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @test_pmulhw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmulhw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmulhw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmulhw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmulhw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
+
define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pmulld:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pmulld:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
-; HASWELL-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [17:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [10:2.00]
+; BROADWELL-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [16:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [15:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulld:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [8:0.67]
+; SKX-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [15:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pmulld:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
-; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [5:2.00]
+; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [12:2.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = mul <8 x i32> %a0, %a1
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = mul <8 x i32> %1, %2
@@ -204,37 +4948,133 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
}
define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_pmullw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pmullw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmullw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmullw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmullw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pmullw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = mul <16 x i16> %a0, %a1
%2 = load <16 x i16>, <16 x i16> *%a2, align 32
%3 = mul <16 x i16> %1, %2
ret <16 x i16> %3
}
+define <4 x i64> @test_pmuludq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_pmuludq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmuludq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmuludq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmuludq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmuludq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pmuludq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = bitcast <4 x i64> %1 to <8 x i32>
+ %3 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %4 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %2, <8 x i32> %3)
+ ret <4 x i64> %4
+}
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_por:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_por:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_por:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_por:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_por:
+; SKX: # %bb.0:
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_por:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = or <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = or <4 x i64> %1, %2
@@ -242,18 +5082,1303 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
ret <4 x i64> %4
}
+define <4 x i64> @test_psadbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_psadbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psadbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [12:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psadbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psadbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psadbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psadbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vpsadbw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = bitcast <4 x i64> %1 to <32 x i8>
+ %3 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %4 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %2, <32 x i8> %3)
+ ret <4 x i64> %4
+}
+declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <32 x i8> @test_pshufb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_pshufb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pshufb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; SKX-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pshufb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpshufb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_pshufd(<8 x i32> %a0, <8 x i32> *%a1) {
+; GENERIC-LABEL: test_pshufd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [5:1.00]
+; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pshufd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; HASWELL-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
+; HASWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; BROADWELL-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [7:1.00]
+; BROADWELL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SKYLAKE-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:1.00]
+; SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pshufd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpshufd {{.*#+}} ymm1 = mem[1,0,3,2,5,4,7,6] sched: [8:0.50]
+; ZNVER1-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] sched: [1:0.25]
+; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ %2 = load <8 x i32>, <8 x i32> *%a1, align 32
+ %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ %4 = add <8 x i32> %1, %3
+ ret <8 x i32> %4
+}
+
+define <16 x i16> @test_pshufhw(<16 x i16> %a0, <16 x i16> *%a1) {
+; GENERIC-LABEL: test_pshufhw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [5:1.00]
+; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pshufhw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
+; HASWELL-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
+; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufhw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
+; BROADWELL-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [7:1.00]
+; BROADWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufhw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
+; SKYLAKE-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
+; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufhw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:1.00]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pshufhw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpshufhw {{.*#+}} ymm1 = mem[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14] sched: [8:0.50]
+; ZNVER1-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] sched: [1:0.25]
+; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
+ %2 = load <16 x i16>, <16 x i16> *%a1, align 32
+ %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14>
+ %4 = or <16 x i16> %1, %3
+ ret <16 x i16> %4
+}
+
+define <16 x i16> @test_pshuflw(<16 x i16> %a0, <16 x i16> *%a1) {
+; GENERIC-LABEL: test_pshuflw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pshuflw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
+; HASWELL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
+; HASWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshuflw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
+; BROADWELL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [7:1.00]
+; BROADWELL-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshuflw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
+; SKYLAKE-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
+; SKYLAKE-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshuflw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pshuflw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpshuflw {{.*#+}} ymm1 = mem[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] sched: [8:0.50]
+; ZNVER1-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] sched: [1:0.25]
+; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %2 = load <16 x i16>, <16 x i16> *%a1, align 32
+ %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %4 = or <16 x i16> %1, %3
+ ret <16 x i16> %4
+}
+
+define <32 x i8> @test_psignb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_psignb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psignb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psignb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsignb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i32> @test_psignd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_psignd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psignd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psignd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsignd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_psignw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_psignw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psignw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psignw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsignw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsignw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i32> @test_pslld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_pslld:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pslld:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pslld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pslld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pslld:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pslld:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpslld %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpslld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpslld $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %1, <4 x i32> %2)
+ %4 = shl <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %4
+}
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <32 x i8> @test_pslldq(<32 x i8> %a0) {
+; GENERIC-LABEL: test_pslldq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pslldq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pslldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pslldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pslldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_pslldq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28] sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+ ret <32 x i8> %1
+}
+
+define <4 x i64> @test_psllq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_psllq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psllq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllq %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsllq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsllq $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
+ %2 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %3 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %1, <2 x i64> %2)
+ %4 = shl <4 x i64> %3, <i64 2, i64 2, i64 2, i64 2>
+ ret <4 x i64> %4
+}
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @test_psllvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_psllvd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllvd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllvd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllvd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllvd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psllvd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsllvd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @test_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_psllvd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllvd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllvd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllvd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllvd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psllvd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsllvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @test_psllvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_psllvq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllvq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllvq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllvq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllvq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psllvq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsllvq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
+ %2 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %3 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %1, <2 x i64> %2)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_psllvq_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllvq_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllvq_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllvq_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllvq_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psllvq_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsllvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ %2 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %3 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %1, <4 x i64> %2)
+ ret <4 x i64> %3
+}
+declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <16 x i16> @test_psllw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_psllw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psllw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsllw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsllw $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %1, <8 x i16> %2)
+ %4 = shl <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i32> @test_psrad(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_psrad:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrad:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrad:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrad:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrad:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrad:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrad %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsrad (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsrad $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> %2)
+ %4 = ashr <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %4
+}
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_psravd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_psravd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psravd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psravd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psravd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psravd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psravd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsravd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsravd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @test_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_psravd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psravd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psravd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psravd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psravd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psravd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsravd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsravd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i16> @test_psraw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_psraw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psraw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psraw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psraw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psraw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psraw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsraw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsraw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsraw $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> %2)
+ %4 = ashr <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <8 x i32> @test_psrld(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_psrld:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrld:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrld:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrld:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrld %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsrld (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsrld $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %1, <4 x i32> %2)
+ %4 = lshr <8 x i32> %3, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i32> %4
+}
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <32 x i8> @test_psrldq(<32 x i8> %a0) {
+; GENERIC-LABEL: test_psrldq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrldq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrldq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+ ret <32 x i8> %1
+}
+
+define <4 x i64> @test_psrlq(<4 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_psrlq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrlq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsrlq (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsrlq $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
+ %2 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %3 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %1, <2 x i64> %2)
+ %4 = lshr <4 x i64> %3, <i64 2, i64 2, i64 2, i64 2>
+ ret <4 x i64> %4
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @test_psrlvd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_psrlvd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlvd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; HASWELL-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlvd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlvd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlvd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrlvd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
+ %2 = load <4 x i32>, <4 x i32> *%a2, align 16
+ %3 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %1, <4 x i32> %2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @test_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_psrlvd_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlvd_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; HASWELL-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [10:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlvd_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlvd_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlvd_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrlvd_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %1, <8 x i32> %2)
+ ret <8 x i32> %3
+}
+declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @test_psrlvq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_psrlvq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlvq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlvq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlvq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlvq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrlvq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
+ %2 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %3 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %1, <2 x i64> %2)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_psrlvq_ymm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlvq_ymm:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlvq_ymm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlvq_ymm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlvq_ymm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrlvq_ymm:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+ %2 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %3 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %1, <4 x i64> %2)
+ ret <4 x i64> %3
+}
+declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <16 x i16> @test_psrlw(<16 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_psrlw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; HASWELL-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
+; HASWELL-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psrlw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 # sched: [2:1.00]
+; ZNVER1-NEXT: vpsrlw (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
+; ZNVER1-NEXT: vpsrlw $2, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
+ %2 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %3 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %1, <8 x i16> %2)
+ %4 = lshr <16 x i16> %3, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <16 x i16> %4
+}
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
+
define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_psubb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_psubb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_psubb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <32 x i8> %a0, %a1
%2 = load <32 x i8>, <32 x i8> *%a2, align 32
%3 = sub <32 x i8> %1, %2
@@ -261,17 +6386,41 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
}
define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_psubd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_psubd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_psubd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <8 x i32> %a0, %a1
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = sub <8 x i32> %1, %2
@@ -279,55 +6428,679 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
}
define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_psubq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_psubq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_psubq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = sub <4 x i64> %1, %2
ret <4 x i64> %3
}
+define <32 x i8> @test_psubsb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_psubsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psubsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @test_psubsw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_psubsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psubsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <32 x i8> @test_psubusb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_psubusb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubusb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubusb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubusb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubusb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psubusb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %1, <32 x i8> %2)
+ ret <32 x i8> %3
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <16 x i16> @test_psubusw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_psubusw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubusw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubusw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubusw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubusw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_psubusw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %1, <16 x i16> %2)
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_psubw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_psubw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_psubw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <16 x i16> %a0, %a1
%2 = load <16 x i16>, <16 x i16> *%a2, align 32
%3 = sub <16 x i16> %1, %2
ret <16 x i16> %3
}
+define <32 x i8> @test_punpckhbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_punpckhbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
+; GENERIC-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:1.00]
+; SKX-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpckhbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15],ymm0[24],mem[24],ymm0[25],mem[25],ymm0[26],mem[26],ymm0[27],mem[27],ymm0[28],mem[28],ymm0[29],mem[29],ymm0[30],mem[30],ymm0[31],mem[31] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+ ret <32 x i8> %3
+}
+
+define <8 x i32> @test_punpckhdq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_punpckhdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; GENERIC-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [5:1.00]
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; HASWELL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; BROADWELL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKYLAKE-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; SKX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpckhdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:0.50]
+; ZNVER1-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %4
+}
+
+define <4 x i64> @test_punpckhqdq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_punpckhqdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; GENERIC-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhqdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; HASWELL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhqdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; BROADWELL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhqdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhqdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpckhqdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %2 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <16 x i16> @test_punpckhwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_punpckhwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
+; GENERIC-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:1.00]
+; SKX-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpckhwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[12],mem[12],ymm0[13],mem[13],ymm0[14],mem[14],ymm0[15],mem[15] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i16> %3
+}
+
+define <32 x i8> @test_punpcklbw(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
+; GENERIC-LABEL: test_punpcklbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
+; GENERIC-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpcklbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:1.00]
+; SKX-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpcklbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[4],mem[4],ymm0[5],mem[5],ymm0[6],mem[6],ymm0[7],mem[7],ymm0[16],mem[16],ymm0[17],mem[17],ymm0[18],mem[18],ymm0[19],mem[19],ymm0[20],mem[20],ymm0[21],mem[21],ymm0[22],mem[22],ymm0[23],mem[23] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ %2 = load <32 x i8>, <32 x i8> *%a2, align 32
+ %3 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+ ret <32 x i8> %3
+}
+
+define <8 x i32> @test_punpckldq(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
+; GENERIC-LABEL: test_punpckldq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; GENERIC-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [5:1.00]
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckldq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; HASWELL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; HASWELL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; HASWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; BROADWELL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKYLAKE-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.50]
+; SKX-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpckldq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:0.50]
+; ZNVER1-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %2 = load <8 x i32>, <8 x i32> *%a2, align 32
+ %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %4
+}
+
+define <4 x i64> @test_punpcklqdq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_punpcklqdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; GENERIC-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpcklqdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; HASWELL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklqdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; BROADWELL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklqdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklqdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpcklqdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:0.50]
+; ZNVER1-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %2 = load <4 x i64>, <4 x i64> *%a2, align 32
+ %3 = shufflevector <4 x i64> %a0, <4 x i64> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %4 = add <4 x i64> %1, %3
+ ret <4 x i64> %4
+}
+
+define <16 x i16> @test_punpcklwd(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
+; GENERIC-LABEL: test_punpcklwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
+; GENERIC-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpcklwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:1.00]
+; SKX-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_punpcklwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] sched: [1:0.25]
+; ZNVER1-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ %2 = load <16 x i16>, <16 x i16> *%a2, align 32
+ %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+ ret <16 x i16> %3
+}
+
define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
+; GENERIC-LABEL: test_pxor:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
; HASWELL-LABEL: test_pxor:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; HASWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pxor:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [7:0.50]
+; BROADWELL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pxor:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKYLAKE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pxor:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pxor:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = xor <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = xor <4 x i64> %1, %2
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll
index 47bbba2c7e08..022c9f458db1 100644
--- a/test/CodeGen/X86/avx2-shift.ll
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -4,12 +4,12 @@
define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
; X32-LABEL: variable_shl0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl0:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%k = shl <4 x i32> %x, %y
@@ -18,12 +18,12 @@ define <4 x i32> @variable_shl0(<4 x i32> %x, <4 x i32> %y) {
define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: variable_shl1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%k = shl <8 x i32> %x, %y
@@ -32,12 +32,12 @@ define <8 x i32> @variable_shl1(<8 x i32> %x, <8 x i32> %y) {
define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
; X32-LABEL: variable_shl2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%k = shl <2 x i64> %x, %y
@@ -46,12 +46,12 @@ define <2 x i64> @variable_shl2(<2 x i64> %x, <2 x i64> %y) {
define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
; X32-LABEL: variable_shl3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%k = shl <4 x i64> %x, %y
@@ -60,12 +60,12 @@ define <4 x i64> @variable_shl3(<4 x i64> %x, <4 x i64> %y) {
define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
; X32-LABEL: variable_srl0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl0:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%k = lshr <4 x i32> %x, %y
@@ -74,12 +74,12 @@ define <4 x i32> @variable_srl0(<4 x i32> %x, <4 x i32> %y) {
define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: variable_srl1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%k = lshr <8 x i32> %x, %y
@@ -88,12 +88,12 @@ define <8 x i32> @variable_srl1(<8 x i32> %x, <8 x i32> %y) {
define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
; X32-LABEL: variable_srl2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%k = lshr <2 x i64> %x, %y
@@ -102,12 +102,12 @@ define <2 x i64> @variable_srl2(<2 x i64> %x, <2 x i64> %y) {
define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
; X32-LABEL: variable_srl3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%k = lshr <4 x i64> %x, %y
@@ -116,12 +116,12 @@ define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) {
define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
; X32-LABEL: variable_sra0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_sra0:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%k = ashr <4 x i32> %x, %y
@@ -130,12 +130,12 @@ define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) {
define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
; X32-LABEL: variable_sra1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_sra1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%k = ashr <8 x i32> %x, %y
@@ -146,12 +146,12 @@ define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) {
define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
; X32-LABEL: vshift00:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpslld $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift00:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpslld $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = shl <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -160,12 +160,12 @@ define <8 x i32> @vshift00(<8 x i32> %a) nounwind readnone {
define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
; X32-LABEL: vshift01:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift01:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = shl <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -174,12 +174,12 @@ define <16 x i16> @vshift01(<16 x i16> %a) nounwind readnone {
define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
; X32-LABEL: vshift02:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllq $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift02:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllq $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = shl <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
@@ -190,12 +190,12 @@ define <4 x i64> @vshift02(<4 x i64> %a) nounwind readnone {
define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
; X32-LABEL: vshift03:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrld $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift03:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrld $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = lshr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -204,12 +204,12 @@ define <8 x i32> @vshift03(<8 x i32> %a) nounwind readnone {
define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
; X32-LABEL: vshift04:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlw $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift04:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlw $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = lshr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -218,12 +218,12 @@ define <16 x i16> @vshift04(<16 x i16> %a) nounwind readnone {
define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
; X32-LABEL: vshift05:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlq $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift05:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlq $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = lshr <4 x i64> %a, <i64 2, i64 2, i64 2, i64 2>
@@ -234,12 +234,12 @@ define <4 x i64> @vshift05(<4 x i64> %a) nounwind readnone {
define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
; X32-LABEL: vshift06:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrad $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift06:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrad $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = ashr <8 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -248,12 +248,12 @@ define <8 x i32> @vshift06(<8 x i32> %a) nounwind readnone {
define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
; X32-LABEL: vshift07:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsraw $2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: vshift07:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsraw $2, %ymm0, %ymm0
; X64-NEXT: retq
%s = ashr <16 x i16> %a, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
@@ -262,13 +262,13 @@ define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone {
define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
; X32-LABEL: variable_sra0_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsravd (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_sra0_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsravd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%y1 = load <4 x i32>, <4 x i32>* %y
@@ -278,13 +278,13 @@ define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) {
define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
; X32-LABEL: variable_sra1_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsravd (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_sra1_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsravd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%y1 = load <8 x i32>, <8 x i32>* %y
@@ -294,13 +294,13 @@ define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) {
define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
; X32-LABEL: variable_shl0_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsllvd (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl0_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%y1 = load <4 x i32>, <4 x i32>* %y
@@ -310,13 +310,13 @@ define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) {
define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
; X32-LABEL: variable_shl1_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsllvd (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl1_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%y1 = load <8 x i32>, <8 x i32>* %y
@@ -326,13 +326,13 @@ define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) {
define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
; X32-LABEL: variable_shl2_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsllvq (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl2_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%y1 = load <2 x i64>, <2 x i64>* %y
@@ -342,13 +342,13 @@ define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) {
define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
; X32-LABEL: variable_shl3_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsllvq (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_shl3_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllvq (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%y1 = load <4 x i64>, <4 x i64>* %y
@@ -358,13 +358,13 @@ define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) {
define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
; X32-LABEL: variable_srl0_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsrlvd (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl0_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%y1 = load <4 x i32>, <4 x i32>* %y
@@ -374,13 +374,13 @@ define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) {
define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
; X32-LABEL: variable_srl1_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsrlvd (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl1_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvd (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%y1 = load <8 x i32>, <8 x i32>* %y
@@ -390,13 +390,13 @@ define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) {
define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
; X32-LABEL: variable_srl2_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsrlvq (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl2_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%y1 = load <2 x i64>, <2 x i64>* %y
@@ -406,13 +406,13 @@ define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) {
define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
; X32-LABEL: variable_srl3_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpsrlvq (%eax), %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: variable_srl3_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlvq (%rdi), %ymm0, %ymm0
; X64-NEXT: retq
%y1 = load <4 x i64>, <4 x i64>* %y
@@ -422,13 +422,13 @@ define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) {
define <32 x i8> @shl9(<32 x i8> %A) nounwind {
; X32-LABEL: shl9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $3, %ymm0, %ymm0
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: shl9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $3, %ymm0, %ymm0
; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -438,13 +438,13 @@ define <32 x i8> @shl9(<32 x i8> %A) nounwind {
define <32 x i8> @shr9(<32 x i8> %A) nounwind {
; X32-LABEL: shr9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: shr9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -454,14 +454,14 @@ define <32 x i8> @shr9(<32 x i8> %A) nounwind {
define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind {
; X32-LABEL: sra_v32i8_7:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: sra_v32i8_7:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
@@ -470,7 +470,7 @@ define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind {
define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
; X32-LABEL: sra_v32i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -479,7 +479,7 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sra_v32i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlw $3, %ymm0, %ymm0
; X64-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -492,13 +492,13 @@ define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind {
define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
; X32-LABEL: sext_v16i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $8, %ymm0, %ymm0
; X32-NEXT: vpsraw $8, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: sext_v16i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $8, %ymm0, %ymm0
; X64-NEXT: vpsraw $8, %ymm0, %ymm0
; X64-NEXT: retq
@@ -509,13 +509,13 @@ define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind {
define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
; X32-LABEL: sext_v8i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpslld $16, %ymm0, %ymm0
; X32-NEXT: vpsrad $16, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: sext_v8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpslld $16, %ymm0, %ymm0
; X64-NEXT: vpsrad $16, %ymm0, %ymm0
; X64-NEXT: retq
@@ -526,24 +526,24 @@ define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8 x i16> %rhs) {
; X32-LABEL: variable_shl16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: variable_shl16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = shl <8 x i16> %lhs, %rhs
@@ -552,24 +552,22 @@ define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8 x i16> %rhs) {
define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8 x i16> %rhs) {
; X32-LABEL: variable_ashr16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: variable_ashr16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = ashr <8 x i16> %lhs, %rhs
@@ -578,24 +576,24 @@ define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8 x i16> %rhs) {
define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8 x i16> %rhs) {
; X32-LABEL: variable_lshr16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: variable_lshr16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = lshr <8 x i16> %lhs, %rhs
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 318c9cfd8a3f..e5506257e4ce 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -6,13 +6,13 @@
define <16 x i8> @BB16(i8* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: BB16:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastb (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: BB16:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastb (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -38,13 +38,13 @@ entry:
define <32 x i8> @BB32(i8* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: BB32:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastb (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: BB32:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastb (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -87,13 +87,13 @@ entry:
define <8 x i16> @W16(i16* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: W16:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastw (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: W16:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastw (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -111,13 +111,13 @@ entry:
define <16 x i16> @WW16(i16* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: WW16:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastw (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: WW16:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastw (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -143,13 +143,13 @@ entry:
define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: D32:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: D32:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -163,13 +163,13 @@ entry:
define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: DD32:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss (%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: DD32:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -187,7 +187,7 @@ entry:
define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: Q64:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %eax
@@ -198,7 +198,7 @@ define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: Q64:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastq (%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -210,7 +210,7 @@ entry:
define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: QQ64:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %eax
@@ -222,7 +222,7 @@ define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: QQ64:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd (%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -235,68 +235,33 @@ entry:
}
define <8 x i16> @broadcast_mem_v4i16_v8i16(<4 x i16>* %ptr) {
-; X32-AVX2-LABEL: broadcast_mem_v4i16_v8i16:
-; X32-AVX2: ## BB#0:
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X32-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: broadcast_mem_v4i16_v8i16:
-; X64-AVX2: ## BB#0:
-; X64-AVX2-NEXT: vpbroadcastq (%rdi), %xmm0
-; X64-AVX2-NEXT: retq
-;
-; X32-AVX512VL-LABEL: broadcast_mem_v4i16_v8i16:
-; X32-AVX512VL: ## BB#0:
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X32-AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13]
-; X32-AVX512VL-NEXT: retl
+; X32-LABEL: broadcast_mem_v4i16_v8i16:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT: retl
;
-; X64-AVX512VL-LABEL: broadcast_mem_v4i16_v8i16:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13]
-; X64-AVX512VL-NEXT: retq
+; X64-LABEL: broadcast_mem_v4i16_v8i16:
+; X64: ## %bb.0:
+; X64-NEXT: vpbroadcastq (%rdi), %xmm0
+; X64-NEXT: retq
%load = load <4 x i16>, <4 x i16>* %ptr
%shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x i16> %shuf
}
define <16 x i16> @broadcast_mem_v4i16_v16i16(<4 x i16>* %ptr) {
-; X32-AVX2-LABEL: broadcast_mem_v4i16_v16i16:
-; X32-AVX2: ## BB#0:
-; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7,6,7],zero,zero
-; X32-AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; X32-AVX2-NEXT: retl
-;
-; X64-AVX2-LABEL: broadcast_mem_v4i16_v16i16:
-; X64-AVX2: ## BB#0:
-; X64-AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; X64-AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7,6,7],zero,zero
-; X64-AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
-; X64-AVX2-NEXT: retq
-;
-; X32-AVX512VL-LABEL: broadcast_mem_v4i16_v16i16:
-; X32-AVX512VL: ## BB#0:
-; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X32-AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X32-AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0
-; X32-AVX512VL-NEXT: retl
+; X32-LABEL: broadcast_mem_v4i16_v16i16:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: retl
;
-; X64-AVX512VL-LABEL: broadcast_mem_v4i16_v16i16:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0
-; X64-AVX512VL-NEXT: retq
+; X64-LABEL: broadcast_mem_v4i16_v16i16:
+; X64: ## %bb.0:
+; X64-NEXT: vbroadcastsd (%rdi), %ymm0
+; X64-NEXT: retq
%load = load <4 x i16>, <4 x i16>* %ptr
%shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x i16> %shuf
@@ -306,13 +271,13 @@ define <16 x i16> @broadcast_mem_v4i16_v16i16(<4 x i16>* %ptr) {
define <16 x i8> @load_splat_16i8_16i8_1111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_16i8_16i8_1111111111111111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastb 1(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_16i8_16i8_1111111111111111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastb 1(%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -323,13 +288,13 @@ entry:
define <32 x i8> @load_splat_32i8_16i8_11111111111111111111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastb 1(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastb 1(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -340,13 +305,13 @@ entry:
define <32 x i8> @load_splat_32i8_32i8_11111111111111111111111111111111(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastb 1(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastb 1(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -357,13 +322,13 @@ entry:
define <8 x i16> @load_splat_8i16_8i16_11111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i16_8i16_11111111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastw 2(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i16_8i16_11111111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastw 2(%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -374,13 +339,13 @@ entry:
define <16 x i16> @load_splat_16i16_8i16_1111111111111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_16i16_8i16_1111111111111111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastw 2(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_16i16_8i16_1111111111111111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastw 2(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -391,13 +356,13 @@ entry:
define <16 x i16> @load_splat_16i16_16i16_1111111111111111(<16 x i16>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_16i16_16i16_1111111111111111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpbroadcastw 2(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_16i16_16i16_1111111111111111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastw 2(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -408,13 +373,13 @@ entry:
define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i32_4i32_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 4(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i32_4i32_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -425,13 +390,13 @@ entry:
define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_4i32_33333333:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_4i32_33333333:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -442,13 +407,13 @@ entry:
define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8i32_8i32_55555555:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8i32_8i32_55555555:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -459,13 +424,13 @@ entry:
define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f32_4f32_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 4(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f32_4f32_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 4(%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -476,13 +441,13 @@ entry:
define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_4f32_33333333:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 12(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_4f32_33333333:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 12(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -493,13 +458,13 @@ entry:
define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_8f32_8f32_55555555:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss 20(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_8f32_8f32_55555555:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss 20(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -510,13 +475,13 @@ entry:
define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2i64_2i64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2i64_2i64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vpbroadcastq 8(%rdi), %xmm0
; X64-NEXT: retq
entry:
@@ -527,13 +492,13 @@ entry:
define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_2i64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_2i64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -544,13 +509,13 @@ entry:
define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4i64_4i64_2222:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4i64_4i64_2222:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -561,13 +526,13 @@ entry:
define <2 x double> @load_splat_2f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_2f64_2f64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: load_splat_2f64_2f64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
@@ -578,13 +543,13 @@ entry:
define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_2f64_1111:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 8(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_2f64_1111:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 8(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -595,13 +560,13 @@ entry:
define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: load_splat_4f64_4f64_2222:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastsd 16(%eax), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: load_splat_4f64_4f64_2222:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastsd 16(%rdi), %ymm0
; X64-NEXT: retq
entry:
@@ -614,13 +579,13 @@ entry:
; this used to crash
define <2 x double> @I(double* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: I:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: I:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
entry:
@@ -632,24 +597,24 @@ entry:
define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V111:
-; X32-AVX2: ## BB#0: ## %entry
+; X32-AVX2: ## %bb.0: ## %entry
; X32-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X32-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V111:
-; X64-AVX2: ## BB#0: ## %entry
+; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X32-AVX512VL-LABEL: V111:
-; X32-AVX512VL: ## BB#0: ## %entry
+; X32-AVX512VL: ## %bb.0: ## %entry
; X32-AVX512VL-NEXT: vpaddd LCPI29_0{1to8}, %ymm0, %ymm0
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: V111:
-; X64-AVX512VL: ## BB#0: ## %entry
+; X64-AVX512VL: ## %bb.0: ## %entry
; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512VL-NEXT: retq
entry:
@@ -659,24 +624,24 @@ entry:
define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp {
; X32-AVX2-LABEL: V113:
-; X32-AVX2: ## BB#0: ## %entry
+; X32-AVX2: ## %bb.0: ## %entry
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: V113:
-; X64-AVX2: ## BB#0: ## %entry
+; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X32-AVX512VL-LABEL: V113:
-; X32-AVX512VL: ## BB#0: ## %entry
+; X32-AVX512VL: ## %bb.0: ## %entry
; X32-AVX512VL-NEXT: vaddps LCPI30_0{1to8}, %ymm0, %ymm0
; X32-AVX512VL-NEXT: retl
;
; X64-AVX512VL-LABEL: V113:
-; X64-AVX512VL: ## BB#0: ## %entry
+; X64-AVX512VL: ## %bb.0: ## %entry
; X64-AVX512VL-NEXT: vaddps {{.*}}(%rip){1to8}, %ymm0, %ymm0
; X64-AVX512VL-NEXT: retq
entry:
@@ -686,12 +651,12 @@ entry:
define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X32-NEXT: retl
;
; X64-LABEL: _e2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0078125,-0.0078125,-0.0078125,-0.0078125]
; X64-NEXT: retq
%vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0
@@ -703,12 +668,12 @@ define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
; X32-LABEL: _e4:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
; X32-NEXT: retl
;
; X64-LABEL: _e4:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [52,52,52,52,52,52,52,52]
; X64-NEXT: retq
%vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0
@@ -724,11 +689,11 @@ define <8 x i8> @_e4(i8* %ptr) nounwind uwtable readnone ssp {
define void @crash() nounwind alwaysinline {
; X32-LABEL: crash:
-; X32: ## BB#0: ## %WGLoopsEntry
+; X32: ## %bb.0: ## %WGLoopsEntry
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: testb %al, %al
; X32-NEXT: je LBB33_1
-; X32-NEXT: ## BB#2: ## %ret
+; X32-NEXT: ## %bb.2: ## %ret
; X32-NEXT: retl
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: LBB33_1: ## %footer349VF
@@ -736,11 +701,11 @@ define void @crash() nounwind alwaysinline {
; X32-NEXT: jmp LBB33_1
;
; X64-LABEL: crash:
-; X64: ## BB#0: ## %WGLoopsEntry
+; X64: ## %bb.0: ## %WGLoopsEntry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testb %al, %al
; X64-NEXT: je LBB33_1
-; X64-NEXT: ## BB#2: ## %ret
+; X64-NEXT: ## %bb.2: ## %ret
; X64-NEXT: retq
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: LBB33_1: ## %footer349VF
@@ -774,18 +739,18 @@ ret:
define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
; X32-LABEL: _inreg0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-AVX2-LABEL: _inreg0:
-; X64-AVX2: ## BB#0:
+; X64-AVX2: ## %bb.0:
; X64-AVX2-NEXT: vmovd %edi, %xmm0
; X64-AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: _inreg0:
-; X64-AVX512VL: ## BB#0:
+; X64-AVX512VL: ## %bb.0:
; X64-AVX512VL-NEXT: vpbroadcastd %edi, %ymm0
; X64-AVX512VL-NEXT: retq
%in = insertelement <8 x i32> undef, i32 %scalar, i32 0
@@ -795,12 +760,12 @@ define <8 x i32> @_inreg0(i32 %scalar) nounwind uwtable readnone ssp {
define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
; X32-LABEL: _inreg1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%in = insertelement <8 x float> undef, float %scalar, i32 0
@@ -810,12 +775,12 @@ define <8 x float> @_inreg1(float %scalar) nounwind uwtable readnone ssp {
define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
; X32-LABEL: _inreg2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%in = insertelement <4 x float> undef, float %scalar, i32 0
@@ -825,12 +790,12 @@ define <4 x float> @_inreg2(float %scalar) nounwind uwtable readnone ssp {
define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
; X32-LABEL: _inreg3:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%in = insertelement <4 x double> undef, double %scalar, i32 0
@@ -840,12 +805,12 @@ define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
define <8 x float> @_inreg8xfloat(<8 x float> %a) {
; X32-LABEL: _inreg8xfloat:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg8xfloat:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
@@ -854,12 +819,12 @@ define <8 x float> @_inreg8xfloat(<8 x float> %a) {
define <4 x float> @_inreg4xfloat(<4 x float> %a) {
; X32-LABEL: _inreg4xfloat:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg4xfloat:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
@@ -868,12 +833,12 @@ define <4 x float> @_inreg4xfloat(<4 x float> %a) {
define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
; X32-LABEL: _inreg16xi16:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg16xi16:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %ymm0
; X64-NEXT: retq
%b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -882,12 +847,12 @@ define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
; X32-LABEL: _inreg8xi16:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg8xi16:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %xmm0
; X64-NEXT: retq
%b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -896,12 +861,12 @@ define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
; X32-LABEL: _inreg4xi64:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg4xi64:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -910,12 +875,12 @@ define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
; X32-LABEL: _inreg2xi64:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpbroadcastq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg2xi64:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
%b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -924,12 +889,12 @@ define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
define <4 x double> @_inreg4xdouble(<4 x double> %a) {
; X32-LABEL: _inreg4xdouble:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg4xdouble:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
@@ -938,12 +903,12 @@ define <4 x double> @_inreg4xdouble(<4 x double> %a) {
define <2 x double> @_inreg2xdouble(<2 x double> %a) {
; X32-LABEL: _inreg2xdouble:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: _inreg2xdouble:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
@@ -952,12 +917,12 @@ define <2 x double> @_inreg2xdouble(<2 x double> %a) {
define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
; X32-LABEL: _inreg8xi32:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg8xi32:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -966,12 +931,12 @@ define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
; X32-LABEL: _inreg4xi32:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg4xi32:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -980,12 +945,12 @@ define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
; X32-LABEL: _inreg32xi8:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg32xi8:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %ymm0
; X64-NEXT: retq
%b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -994,12 +959,12 @@ define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
; X32-LABEL: _inreg16xi8:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: _inreg16xi8:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %xmm0
; X64-NEXT: retq
%b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -1012,12 +977,12 @@ define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
define <8 x float> @splat_concat1(float %f) {
; X32-LABEL: splat_concat1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%1 = insertelement <4 x float> undef, float %f, i32 0
@@ -1030,12 +995,12 @@ define <8 x float> @splat_concat1(float %f) {
define <8 x float> @splat_concat2(float %f) {
; X32-LABEL: splat_concat2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%1 = insertelement <4 x float> undef, float %f, i32 0
@@ -1052,12 +1017,12 @@ define <8 x float> @splat_concat2(float %f) {
define <4 x double> @splat_concat3(double %d) {
; X32-LABEL: splat_concat3:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%1 = insertelement <2 x double> undef, double %d, i32 0
@@ -1068,12 +1033,12 @@ define <4 x double> @splat_concat3(double %d) {
define <4 x double> @splat_concat4(double %d) {
; X32-LABEL: splat_concat4:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: splat_concat4:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%1 = insertelement <2 x double> undef, double %d, i32 0
@@ -1094,9 +1059,8 @@ define <4 x double> @splat_concat4(double %d) {
define void @isel_crash_16b(i8* %cV_R.addr) {
; X32-LABEL: isel_crash_16b:
-; X32: ## BB#0: ## %eintry
+; X32: ## %bb.0: ## %eintry
; X32-NEXT: subl $60, %esp
-; X32-NEXT: Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 64
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
@@ -1108,7 +1072,7 @@ define void @isel_crash_16b(i8* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_16b:
-; X64: ## BB#0: ## %eintry
+; X64: ## %bb.0: ## %eintry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movb (%rdi), %al
@@ -1134,19 +1098,16 @@ eintry:
define void @isel_crash_32b(i8* %cV_R.addr) {
; X32-LABEL: isel_crash_32b:
-; X32: ## BB#0: ## %eintry
+; X32: ## %bb.0: ## %eintry
; X32-NEXT: pushl %ebp
-; X32-NEXT: Lcfi1:
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: Lcfi2:
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: Lcfi3:
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-32, %esp
; X32-NEXT: subl $128, %esp
; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: vmovaps %ymm0, (%esp)
; X32-NEXT: vpbroadcastb (%eax), %ymm1
; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
@@ -1157,18 +1118,15 @@ define void @isel_crash_32b(i8* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_32b:
-; X64: ## BB#0: ## %eintry
+; X64: ## %bb.0: ## %eintry
; X64-NEXT: pushq %rbp
-; X64-NEXT: Lcfi0:
; X64-NEXT: .cfi_def_cfa_offset 16
-; X64-NEXT: Lcfi1:
; X64-NEXT: .cfi_offset %rbp, -16
; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: Lcfi2:
; X64-NEXT: .cfi_def_cfa_register %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $128, %rsp
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
; X64-NEXT: movb (%rdi), %al
; X64-NEXT: vmovd %eax, %xmm1
@@ -1196,9 +1154,8 @@ eintry:
define void @isel_crash_8w(i16* %cV_R.addr) {
; X32-LABEL: isel_crash_8w:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: subl $60, %esp
-; X32-NEXT: Lcfi4:
; X32-NEXT: .cfi_def_cfa_offset 64
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
@@ -1210,10 +1167,10 @@ define void @isel_crash_8w(i16* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_8w:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movw (%rdi), %ax
+; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %xmm1
; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -1236,19 +1193,16 @@ entry:
define void @isel_crash_16w(i16* %cV_R.addr) {
; X32-LABEL: isel_crash_16w:
-; X32: ## BB#0: ## %eintry
+; X32: ## %bb.0: ## %eintry
; X32-NEXT: pushl %ebp
-; X32-NEXT: Lcfi5:
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: Lcfi6:
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: Lcfi7:
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-32, %esp
; X32-NEXT: subl $128, %esp
; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: vmovaps %ymm0, (%esp)
; X32-NEXT: vpbroadcastw (%eax), %ymm1
; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
@@ -1259,20 +1213,17 @@ define void @isel_crash_16w(i16* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-LABEL: isel_crash_16w:
-; X64: ## BB#0: ## %eintry
+; X64: ## %bb.0: ## %eintry
; X64-NEXT: pushq %rbp
-; X64-NEXT: Lcfi3:
; X64-NEXT: .cfi_def_cfa_offset 16
-; X64-NEXT: Lcfi4:
; X64-NEXT: .cfi_offset %rbp, -16
; X64-NEXT: movq %rsp, %rbp
-; X64-NEXT: Lcfi5:
; X64-NEXT: .cfi_def_cfa_register %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $128, %rsp
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: vmovaps %ymm0, (%rsp)
-; X64-NEXT: movw (%rdi), %ax
+; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: vmovd %eax, %xmm1
; X64-NEXT: vpbroadcastw %xmm1, %ymm1
; X64-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
@@ -1298,9 +1249,8 @@ eintry:
define void @isel_crash_4d(i32* %cV_R.addr) {
; X32-LABEL: isel_crash_4d:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: subl $60, %esp
-; X32-NEXT: Lcfi8:
; X32-NEXT: .cfi_def_cfa_offset 64
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
@@ -1312,7 +1262,7 @@ define void @isel_crash_4d(i32* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_4d:
-; X64-AVX2: ## BB#0: ## %entry
+; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movl (%rdi), %eax
@@ -1323,7 +1273,7 @@ define void @isel_crash_4d(i32* %cV_R.addr) {
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_4d:
-; X64-AVX512VL: ## BB#0: ## %entry
+; X64-AVX512VL: ## %bb.0: ## %entry
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movl (%rdi), %eax
@@ -1348,19 +1298,16 @@ entry:
define void @isel_crash_8d(i32* %cV_R.addr) {
; X32-LABEL: isel_crash_8d:
-; X32: ## BB#0: ## %eintry
+; X32: ## %bb.0: ## %eintry
; X32-NEXT: pushl %ebp
-; X32-NEXT: Lcfi9:
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: Lcfi10:
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: Lcfi11:
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-32, %esp
; X32-NEXT: subl $128, %esp
; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: vmovaps %ymm0, (%esp)
; X32-NEXT: vbroadcastss (%eax), %ymm1
; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
@@ -1371,18 +1318,15 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_8d:
-; X64-AVX2: ## BB#0: ## %eintry
+; X64-AVX2: ## %bb.0: ## %eintry
; X64-AVX2-NEXT: pushq %rbp
-; X64-AVX2-NEXT: Lcfi6:
; X64-AVX2-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT: Lcfi7:
; X64-AVX2-NEXT: .cfi_offset %rbp, -16
; X64-AVX2-NEXT: movq %rsp, %rbp
-; X64-AVX2-NEXT: Lcfi8:
; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX2-NEXT: andq $-32, %rsp
; X64-AVX2-NEXT: subq $128, %rsp
-; X64-AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX2-NEXT: movl (%rdi), %eax
; X64-AVX2-NEXT: vmovd %eax, %xmm1
@@ -1395,18 +1339,15 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_8d:
-; X64-AVX512VL: ## BB#0: ## %eintry
+; X64-AVX512VL: ## %bb.0: ## %eintry
; X64-AVX512VL-NEXT: pushq %rbp
-; X64-AVX512VL-NEXT: Lcfi6:
; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX512VL-NEXT: Lcfi7:
; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16
; X64-AVX512VL-NEXT: movq %rsp, %rbp
-; X64-AVX512VL-NEXT: Lcfi8:
; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX512VL-NEXT: andq $-32, %rsp
; X64-AVX512VL-NEXT: subq $128, %rsp
-; X64-AVX512VL-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX512VL-NEXT: movl (%rdi), %eax
; X64-AVX512VL-NEXT: vpbroadcastd %eax, %ymm1
@@ -1433,9 +1374,8 @@ eintry:
define void @isel_crash_2q(i64* %cV_R.addr) {
; X32-LABEL: isel_crash_2q:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: subl $60, %esp
-; X32-NEXT: Lcfi12:
; X32-NEXT: .cfi_def_cfa_offset 64
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
@@ -1452,7 +1392,7 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_2q:
-; X64-AVX2: ## BB#0: ## %entry
+; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX2-NEXT: movq (%rdi), %rax
@@ -1463,7 +1403,7 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_2q:
-; X64-AVX512VL: ## BB#0: ## %entry
+; X64-AVX512VL: ## %bb.0: ## %entry
; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX512VL-NEXT: movq (%rdi), %rax
@@ -1487,19 +1427,16 @@ entry:
define void @isel_crash_4q(i64* %cV_R.addr) {
; X32-LABEL: isel_crash_4q:
-; X32: ## BB#0: ## %eintry
+; X32: ## %bb.0: ## %eintry
; X32-NEXT: pushl %ebp
-; X32-NEXT: Lcfi13:
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: Lcfi14:
; X32-NEXT: .cfi_offset %ebp, -8
; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: Lcfi15:
; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: andl $-32, %esp
; X32-NEXT: subl $128, %esp
; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: vmovaps %ymm0, (%esp)
; X32-NEXT: movl (%eax), %ecx
; X32-NEXT: movl 4(%eax), %eax
@@ -1516,18 +1453,15 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
; X32-NEXT: retl
;
; X64-AVX2-LABEL: isel_crash_4q:
-; X64-AVX2: ## BB#0: ## %eintry
+; X64-AVX2: ## %bb.0: ## %eintry
; X64-AVX2-NEXT: pushq %rbp
-; X64-AVX2-NEXT: Lcfi9:
; X64-AVX2-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT: Lcfi10:
; X64-AVX2-NEXT: .cfi_offset %rbp, -16
; X64-AVX2-NEXT: movq %rsp, %rbp
-; X64-AVX2-NEXT: Lcfi11:
; X64-AVX2-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX2-NEXT: andq $-32, %rsp
; X64-AVX2-NEXT: subq $128, %rsp
-; X64-AVX2-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX2-NEXT: movq (%rdi), %rax
; X64-AVX2-NEXT: vmovq %rax, %xmm1
@@ -1540,18 +1474,15 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
; X64-AVX2-NEXT: retq
;
; X64-AVX512VL-LABEL: isel_crash_4q:
-; X64-AVX512VL: ## BB#0: ## %eintry
+; X64-AVX512VL: ## %bb.0: ## %eintry
; X64-AVX512VL-NEXT: pushq %rbp
-; X64-AVX512VL-NEXT: Lcfi9:
; X64-AVX512VL-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX512VL-NEXT: Lcfi10:
; X64-AVX512VL-NEXT: .cfi_offset %rbp, -16
; X64-AVX512VL-NEXT: movq %rsp, %rbp
-; X64-AVX512VL-NEXT: Lcfi11:
; X64-AVX512VL-NEXT: .cfi_def_cfa_register %rbp
; X64-AVX512VL-NEXT: andq $-32, %rsp
; X64-AVX512VL-NEXT: subq $128, %rsp
-; X64-AVX512VL-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
; X64-AVX512VL-NEXT: movq (%rdi), %rax
; X64-AVX512VL-NEXT: vpbroadcastq %rax, %ymm1
diff --git a/test/CodeGen/X86/avx2-vbroadcasti128.ll b/test/CodeGen/X86/avx2-vbroadcasti128.ll
index fb149f704a11..254cdfdd8cb1 100644
--- a/test/CodeGen/X86/avx2-vbroadcasti128.ll
+++ b/test/CodeGen/X86/avx2-vbroadcasti128.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-LABEL: test_broadcast_2f64_4f64:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: vaddpd LCPI0_0, %ymm0, %ymm0
+; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -23,14 +23,14 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-LABEL: test_broadcast_2i64_4i64:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0
+; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2i64_4i64:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -42,14 +42,14 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: vaddps LCPI2_0, %ymm0, %ymm0
+; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -61,14 +61,14 @@ define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-LABEL: test_broadcast_4i32_8i32:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: vpaddd LCPI3_0, %ymm0, %ymm0
+; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4i32_8i32:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -80,14 +80,14 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: vpaddw LCPI4_0, %ymm0, %ymm0
+; X32-NEXT: vpaddw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_8i16_16i16:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -99,14 +99,14 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-NEXT: vpaddb LCPI5_0, %ymm0, %ymm0
+; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_16i8_32i8:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -118,17 +118,17 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-LABEL: test_broadcast_2f64_4f64_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovapd (%ecx), %xmm1
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
-; X32-NEXT: vaddpd LCPI6_0, %ymm0, %ymm0
+; X32-NEXT: vaddpd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovapd %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2f64_4f64_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovapd (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
@@ -143,17 +143,17 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovdqa (%ecx), %xmm1
; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
-; X32-NEXT: vpaddq LCPI7_0, %ymm0, %ymm0
+; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovdqa %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm1
; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
@@ -168,17 +168,17 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm1
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
-; X32-NEXT: vaddps LCPI8_0, %ymm0, %ymm0
+; X32-NEXT: vaddps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovaps %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm1
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
@@ -193,17 +193,17 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>
define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-LABEL: test_broadcast_4i32_8i32_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovdqa (%ecx), %xmm1
; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
-; X32-NEXT: vpaddd LCPI9_0, %ymm0, %ymm0
+; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovdqa %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4i32_8i32_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm1
; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
@@ -218,17 +218,17 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1)
define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovdqa (%ecx), %xmm1
; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
-; X32-NEXT: vpaddw LCPI10_0, %ymm0, %ymm0
+; X32-NEXT: vpaddw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovdqa %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm1
; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
@@ -243,17 +243,17 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovdqa (%ecx), %xmm1
; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
-; X32-NEXT: vpaddb LCPI11_0, %ymm0, %ymm0
+; X32-NEXT: vpaddb {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: vmovdqa %xmm1, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovdqa (%rdi), %xmm1
; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm0
; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
@@ -268,19 +268,19 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X32-LABEL: PR29088:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
-; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vmovaps %ymm1, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: PR29088:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovaps %ymm1, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index c77714b9e181..289a3af3f088 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -6,11 +6,11 @@
define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -19,12 +19,12 @@ entry:
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -34,12 +34,12 @@ entry:
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sllw_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsllw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllw_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsllw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -49,11 +49,11 @@ entry:
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
; X32-LABEL: test_slld_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_slld_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -62,12 +62,12 @@ entry:
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
; X32-LABEL: test_slld_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_slld_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -77,14 +77,14 @@ entry:
define <8 x i32> @test_vpslld_var(i32 %shift) {
; X32-LABEL: test_vpslld_var:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X32-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_vpslld_var:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
@@ -96,12 +96,12 @@ define <8 x i32> @test_vpslld_var(i32 %shift) {
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
; X32-LABEL: test_slld_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpslld $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_slld_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpslld $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -111,11 +111,11 @@ entry:
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
@@ -124,12 +124,12 @@ entry:
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -139,12 +139,12 @@ entry:
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
; X32-LABEL: test_sllq_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsllq $63, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sllq_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsllq $63, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -156,11 +156,11 @@ entry:
define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -169,12 +169,12 @@ entry:
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsraw $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsraw $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -184,12 +184,12 @@ entry:
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
; X32-LABEL: test_sraw_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsraw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_sraw_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsraw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -199,11 +199,11 @@ entry:
define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
; X32-LABEL: test_srad_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srad_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -212,12 +212,12 @@ entry:
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
; X32-LABEL: test_srad_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrad $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srad_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrad $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -227,12 +227,12 @@ entry:
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
; X32-LABEL: test_srad_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrad $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srad_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrad $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -244,11 +244,11 @@ entry:
define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -257,12 +257,12 @@ entry:
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlw $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlw $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -272,12 +272,12 @@ entry:
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
; X32-LABEL: test_srlw_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlw $15, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlw_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlw $15, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -287,11 +287,11 @@ entry:
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
; X32-LABEL: test_srld_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srld_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -300,12 +300,12 @@ entry:
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
; X32-LABEL: test_srld_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrld $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srld_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrld $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -315,12 +315,12 @@ entry:
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
; X32-LABEL: test_srld_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrld $31, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srld_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrld $31, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -330,11 +330,11 @@ entry:
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
@@ -343,12 +343,12 @@ entry:
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlq $1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlq $1, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -358,12 +358,12 @@ entry:
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
; X32-LABEL: test_srlq_3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpsrlq $63, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_srlq_3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpsrlq $63, %ymm0, %ymm0
; X64-NEXT: retq
entry:
@@ -373,7 +373,7 @@ entry:
define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-LABEL: srl_trunc_and_v4i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
@@ -383,7 +383,7 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: srl_trunc_and_v4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8]
@@ -403,24 +403,24 @@ define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: shl_8i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: shl_8i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%shl = shl <8 x i16> %r, %a
@@ -429,8 +429,8 @@ define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: shl_16i16:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
@@ -443,8 +443,8 @@ define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shl_16i16:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
@@ -461,7 +461,7 @@ define <16 x i16> @shl_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: shl_32i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpsllw $4, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
@@ -476,7 +476,7 @@ define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shl_32i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsllw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -495,24 +495,22 @@ define <32 x i8> @shl_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: ashr_8i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: ashr_8i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%ashr = ashr <8 x i16> %r, %a
@@ -521,8 +519,8 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: ashr_16i16:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsravd %ymm3, %ymm4, %ymm3
@@ -535,8 +533,8 @@ define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ashr_16i16:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsravd %ymm3, %ymm4, %ymm3
@@ -553,7 +551,7 @@ define <16 x i16> @ashr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: ashr_32i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -581,7 +579,7 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ashr_32i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -613,24 +611,24 @@ define <32 x i8> @ashr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
; X32-LABEL: lshr_8i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: lshr_8i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%lshr = lshr <8 x i16> %r, %a
@@ -639,8 +637,8 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-LABEL: lshr_16i16:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
@@ -653,8 +651,8 @@ define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: lshr_16i16:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X64-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X64-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X64-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
@@ -671,7 +669,7 @@ define <16 x i16> @lshr_16i16(<16 x i16> %r, <16 x i16> %a) nounwind {
define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-LABEL: lshr_32i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-NEXT: vpsrlw $4, %ymm0, %ymm2
; X32-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
@@ -687,7 +685,7 @@ define <32 x i8> @lshr_32i8(<32 x i8> %r, <32 x i8> %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: lshr_32i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsllw $5, %ymm1, %ymm1
; X64-NEXT: vpsrlw $4, %ymm0, %ymm2
; X64-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
diff --git a/test/CodeGen/X86/avx2-vperm.ll b/test/CodeGen/X86/avx2-vperm.ll
index d57daafab243..32ab55dc12a2 100755
--- a/test/CodeGen/X86/avx2-vperm.ll
+++ b/test/CodeGen/X86/avx2-vperm.ll
@@ -4,15 +4,15 @@
define <8 x i32> @perm_cl_int_8x32(<8 x i32> %A) nounwind readnone {
; X32-LABEL: perm_cl_int_8x32:
-; X32: # BB#0: # %entry
-; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
-; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X32: # %bb.0: # %entry
+; X32-NEXT: vmovaps {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
+; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: perm_cl_int_8x32:
-; X64: # BB#0: # %entry
-; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
-; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovaps {{.*#+}} ymm1 = [0,7,2,1,2,7,6,0]
+; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
entry:
%B = shufflevector <8 x i32> %A, <8 x i32> undef, <8 x i32> <i32 0, i32 7, i32 2, i32 1, i32 2, i32 7, i32 6, i32 0>
@@ -22,13 +22,13 @@ entry:
define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone {
; X32-LABEL: perm_cl_fp_8x32:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: perm_cl_fp_8x32:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
@@ -39,13 +39,13 @@ entry:
define <4 x i64> @perm_cl_int_4x64(<4 x i64> %A) nounwind readnone {
; X32-LABEL: perm_cl_int_4x64:
-; X32: # BB#0: # %entry
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; X32-NEXT: retl
;
; X64-LABEL: perm_cl_int_4x64:
-; X64: # BB#0: # %entry
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,1]
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; X64-NEXT: retq
entry:
%B = shufflevector <4 x i64> %A, <4 x i64> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
@@ -54,12 +54,12 @@ entry:
define <4 x double> @perm_cl_fp_4x64(<4 x double> %A) nounwind readnone {
; X32-LABEL: perm_cl_fp_4x64:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; X32-NEXT: retl
;
; X64-LABEL: perm_cl_fp_4x64:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/avx512-adc-sbb.ll b/test/CodeGen/X86/avx512-adc-sbb.ll
index c994fdef6919..bb21dea68dfa 100644
--- a/test/CodeGen/X86/avx512-adc-sbb.ll
+++ b/test/CodeGen/X86/avx512-adc-sbb.ll
@@ -6,7 +6,7 @@
define i8 @PR32316(i8 %t1, i32 %t5, i8 %t8) {
; CHECK-LABEL: PR32316:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %dil, %dil
; CHECK-NEXT: sete %al
diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll
index f6ab0044ee80..de2ca2212d9b 100644
--- a/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -5,14 +5,14 @@
define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
; KNL-LABEL: any_extend_load_v8i64:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; KNL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; KNL-NEXT: vpmovqb %zmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: any_extend_load_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: vpmovqb %zmm0, (%rdi)
@@ -29,7 +29,7 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
; KNL-LABEL: any_extend_load_v8i32:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -37,7 +37,7 @@ define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
; KNL-NEXT: retq
;
; SKX-LABEL: any_extend_load_v8i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; SKX-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: vpmovdb %ymm0, (%rdi)
@@ -54,15 +54,15 @@ define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
define void @any_extend_load_v8i16(<8 x i8> * %ptr) {
; KNL-LABEL: any_extend_load_v8i16:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: any_extend_load_v8i16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SKX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; SKX-NEXT: vpmovwb %xmm0, (%rdi)
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 7c0f145bb717..766238f32801 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -7,7 +7,7 @@
define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: addpd512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -17,7 +17,7 @@ entry:
define <8 x double> @addpd512fold(<8 x double> %y) {
; CHECK-LABEL: addpd512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -27,7 +27,7 @@ entry:
define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: addps512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -37,7 +37,7 @@ entry:
define <16 x float> @addps512fold(<16 x float> %y) {
; CHECK-LABEL: addps512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -47,7 +47,7 @@ entry:
define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: subpd512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsubpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -57,7 +57,7 @@ entry:
define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
; CHECK-LABEL: subpd512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsubpd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -68,7 +68,7 @@ entry:
define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: subps512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsubps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -78,7 +78,7 @@ entry:
define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
; CHECK-LABEL: subps512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsubps (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -89,7 +89,7 @@ entry:
define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512F-LABEL: imulq512:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm2
; AVX512F-NEXT: vpmuludq %zmm0, %zmm2, %zmm2
; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
@@ -101,7 +101,7 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: imulq512:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm2
; AVX512VL-NEXT: vpmuludq %zmm0, %zmm2, %zmm2
; AVX512VL-NEXT: vpsrlq $32, %zmm0, %zmm3
@@ -113,7 +113,7 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: imulq512:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm2
; AVX512BW-NEXT: vpmuludq %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3
@@ -125,12 +125,12 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: imulq512:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq512:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullq %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%z = mul <8 x i64>%x, %y
@@ -139,7 +139,7 @@ define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512F-LABEL: imulq256:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm2
; AVX512F-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm3
@@ -151,7 +151,7 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: imulq256:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm2
; AVX512VL-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm3
@@ -163,7 +163,7 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: imulq256:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm2
; AVX512BW-NEXT: vpmuludq %ymm0, %ymm2, %ymm2
; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
@@ -175,15 +175,15 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: imulq256:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq256:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0
; SKX-NEXT: retq
%z = mul <4 x i64>%x, %y
@@ -192,7 +192,7 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512F-LABEL: imulq128:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlq $32, %xmm1, %xmm2
; AVX512F-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -204,7 +204,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: imulq128:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $32, %xmm1, %xmm2
; AVX512VL-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -216,7 +216,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: imulq128:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm2
; AVX512BW-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm3
@@ -228,16 +228,16 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: imulq128:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vpmullq %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: imulq128:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0
; SKX-NEXT: retq
%z = mul <2 x i64>%x, %y
@@ -246,7 +246,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: mulpd512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -256,7 +256,7 @@ entry:
define <8 x double> @mulpd512fold(<8 x double> %y) {
; CHECK-LABEL: mulpd512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -266,7 +266,7 @@ entry:
define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: mulps512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -276,7 +276,7 @@ entry:
define <16 x float> @mulps512fold(<16 x float> %y) {
; CHECK-LABEL: mulps512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -286,7 +286,7 @@ entry:
define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
; CHECK-LABEL: divpd512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -296,7 +296,7 @@ entry:
define <8 x double> @divpd512fold(<8 x double> %y) {
; CHECK-LABEL: divpd512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -306,7 +306,7 @@ entry:
define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
; CHECK-LABEL: divps512:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -316,7 +316,7 @@ entry:
define <16 x float> @divps512fold(<16 x float> %y) {
; CHECK-LABEL: divps512fold:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
entry:
@@ -326,7 +326,7 @@ entry:
define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpaddq_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = add <8 x i64> %i, %j
@@ -335,7 +335,7 @@ define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
; CHECK-LABEL: vpaddq_fold_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%tmp = load <8 x i64>, <8 x i64>* %j, align 4
@@ -345,7 +345,7 @@ define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
; CHECK-LABEL: vpaddq_broadcast_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = add <8 x i64> %i, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
@@ -354,7 +354,7 @@ define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
; CHECK-LABEL: vpaddq_broadcast2_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%tmp = load i64, i64* %j
@@ -372,7 +372,7 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpaddd_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = add <16 x i32> %i, %j
@@ -381,7 +381,7 @@ define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
; CHECK-LABEL: vpaddd_fold_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%tmp = load <16 x i32>, <16 x i32>* %j, align 4
@@ -391,7 +391,7 @@ define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
; CHECK-LABEL: vpaddd_broadcast_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = add <16 x i32> %i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -400,8 +400,8 @@ define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -413,8 +413,8 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma
define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -426,8 +426,8 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m
define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_fold_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -440,8 +440,8 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_mask_broadcast_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -453,8 +453,8 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_fold_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -467,8 +467,8 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16
define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd_maskz_broadcast_test:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -480,7 +480,7 @@ define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1)
define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpsubq_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = sub <8 x i64> %i, %j
@@ -489,7 +489,7 @@ define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpsubd_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = sub <16 x i32> %i, %j
@@ -498,7 +498,7 @@ define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
; CHECK-LABEL: vpmulld_test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%x = mul <16 x i32> %i, %j
@@ -508,7 +508,7 @@ define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
declare float @sqrtf(float) readnone
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtA:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -519,7 +519,7 @@ entry:
declare double @sqrt(double) readnone
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
; CHECK-LABEL: sqrtB:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -530,7 +530,7 @@ entry:
declare float @llvm.sqrt.f32(float)
define float @sqrtC(float %a) nounwind {
; CHECK-LABEL: sqrtC:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%b = call float @llvm.sqrt.f32(float %a)
@@ -540,7 +540,7 @@ define float @sqrtC(float %a) nounwind {
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
define <16 x float> @sqrtD(<16 x float> %a) nounwind {
; CHECK-LABEL: sqrtD:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: retq
%b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
@@ -550,7 +550,7 @@ define <16 x float> @sqrtD(<16 x float> %a) nounwind {
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
define <8 x double> @sqrtE(<8 x double> %a) nounwind {
; CHECK-LABEL: sqrtE:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: retq
%b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
@@ -559,7 +559,7 @@ define <8 x double> @sqrtE(<8 x double> %a) nounwind {
define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
; CHECK-LABEL: fadd_broadcast:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; CHECK-NEXT: retq
%b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
@@ -568,7 +568,7 @@ define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
; CHECK-LABEL: addq_broadcast:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
@@ -577,27 +577,27 @@ define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
; AVX512F-LABEL: orq_broadcast:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: orq_broadcast:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: orq_broadcast:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: orq_broadcast:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: orq_broadcast:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: retq
%b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
@@ -606,27 +606,27 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
; AVX512F-LABEL: andd512fold:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpandq (%rdi), %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: andd512fold:
-; AVX512VL: # BB#0: # %entry
-; AVX512VL-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpandq (%rdi), %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: andd512fold:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpandq (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: andd512fold:
-; AVX512DQ: # BB#0: # %entry
+; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vandps (%rdi), %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: andd512fold:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0
; SKX-NEXT: retq
entry:
@@ -637,27 +637,27 @@ entry:
define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
; AVX512F-LABEL: andqbrst:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: andqbrst:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: andqbrst:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: andqbrst:
-; AVX512DQ: # BB#0: # %entry
+; AVX512DQ: # %bb.0: # %entry
; AVX512DQ-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: andqbrst:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0
; SKX-NEXT: retq
entry:
@@ -670,8 +670,8 @@ entry:
define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vaddps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -685,8 +685,8 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vmulps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; CHECK-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -700,8 +700,8 @@ define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vminps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; CHECK-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -716,39 +716,39 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vminpd:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512F-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_mask_vminpd:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; AVX512VL-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vminpd:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512BW-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_mask_vminpd:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512DQ-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: test_mask_vminpd:
-; SKX: # BB#0:
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -763,8 +763,8 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i,
define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vmaxps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; CHECK-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -779,39 +779,39 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
; AVX512F-LABEL: test_mask_vmaxpd:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; AVX512F-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512F-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512F-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_mask_vmaxpd:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; AVX512VL-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_mask_vmaxpd:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; AVX512BW-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512BW-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512BW-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_mask_vmaxpd:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; AVX512DQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512DQ-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; AVX512DQ-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: test_mask_vmaxpd:
-; SKX: # BB#0:
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1
; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -826,8 +826,8 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i,
define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vsubps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; CHECK-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -841,8 +841,8 @@ define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
; CHECK-LABEL: test_mask_vdivps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1
; CHECK-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -856,8 +856,8 @@ define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
; CHECK-LABEL: test_mask_vaddpd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpcmpneqq %zmm4, %zmm3, %k1
; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -871,8 +871,8 @@ define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
; CHECK-LABEL: test_maskz_vaddpd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1
; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -885,8 +885,8 @@ define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
; CHECK-LABEL: test_mask_fold_vaddpd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1
; CHECK-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -901,8 +901,8 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
; CHECK-LABEL: test_maskz_fold_vaddpd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -916,7 +916,7 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind {
; CHECK-LABEL: test_broadcast_vaddpd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%tmp = load double, double* %j
@@ -929,8 +929,8 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind
define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i,
; CHECK-LABEL: test_mask_broadcast_vaddpd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1
; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, %zmm0
@@ -948,8 +948,8 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
; CHECK-LABEL: test_maskz_broadcast_vaddpd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1
; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -966,27 +966,27 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
define <16 x float> @test_fxor(<16 x float> %a) {
; AVX512F-LABEL: test_fxor:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_fxor:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_fxor:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_fxor:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: test_fxor:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; SKX-NEXT: retq
@@ -996,30 +996,30 @@ define <16 x float> @test_fxor(<16 x float> %a) {
define <8 x float> @test_fxor_8f32(<8 x float> %a) {
; AVX512F-LABEL: test_fxor_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
; AVX512F-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_fxor_8f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxord {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: test_fxor_8f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
; AVX512BW-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_fxor_8f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0,-0,-0,-0,-0,-0,-0,-0]
; AVX512DQ-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: test_fxor_8f32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: retq
%res = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
@@ -1028,27 +1028,27 @@ define <8 x float> @test_fxor_8f32(<8 x float> %a) {
define <8 x double> @fabs_v8f64(<8 x double> %p)
; AVX512F-LABEL: fabs_v8f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fabs_v8f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: fabs_v8f64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: fabs_v8f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: fabs_v8f64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: retq
{
@@ -1059,27 +1059,27 @@ declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
define <16 x float> @fabs_v16f32(<16 x float> %p)
; AVX512F-LABEL: fabs_v16f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fabs_v16f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: fabs_v16f32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: fabs_v16f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; SKX-LABEL: fabs_v16f32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; SKX-NEXT: retq
{
diff --git a/test/CodeGen/X86/avx512-bugfix-23634.ll b/test/CodeGen/X86/avx512-bugfix-23634.ll
index e66eefdb8e9f..97356854da62 100644
--- a/test/CodeGen/X86/avx512-bugfix-23634.ll
+++ b/test/CodeGen/X86/avx512-bugfix-23634.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define void @f_fu(float* %ret, float* %aa, float %b) {
; CHECK-LABEL: f_fu:
-; CHECK: ## BB#0: ## %allocas
+; CHECK: ## %bb.0: ## %allocas
; CHECK-NEXT: vcvttss2si %xmm0, %eax
; CHECK-NEXT: vpbroadcastd %eax, %zmm0
; CHECK-NEXT: vcvttps2dq (%rsi), %zmm1
diff --git a/test/CodeGen/X86/avx512-bugfix-25270.ll b/test/CodeGen/X86/avx512-bugfix-25270.ll
index 47384fa98843..49c98bb5457b 100644
--- a/test/CodeGen/X86/avx512-bugfix-25270.ll
+++ b/test/CodeGen/X86/avx512-bugfix-25270.ll
@@ -5,7 +5,7 @@ declare void @Print__512(<16 x i32>) #0
define void @bar__512(<16 x i32>* %var) #0 {
; CHECK-LABEL: bar__512:
-; CHECK: ## BB#0: ## %allocas
+; CHECK: ## %bb.0: ## %allocas
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $112, %rsp
; CHECK-NEXT: movq %rdi, %rbx
diff --git a/test/CodeGen/X86/avx512-bugfix-26264.ll b/test/CodeGen/X86/avx512-bugfix-26264.ll
index b29b6ee0658d..4d54fb715230 100644
--- a/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -3,7 +3,7 @@
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512BW-LABEL: test_load_32f64:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -21,7 +21,7 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64> %src0) {
; AVX512BW-LABEL: test_load_32i64:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index 618507a08bbb..c7664b61a336 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -13,10 +13,10 @@ define <16 x i32> @test2(<16 x i32> %x) {
define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 138b8750633c..6e6d61f37d2e 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -5,12 +5,12 @@
define <16 x i1> @test1() {
; ALL_X64-LABEL: test1:
-; ALL_X64: ## BB#0:
+; ALL_X64: ## %bb.0:
; ALL_X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test1:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; KNL_X32-NEXT: retl
ret <16 x i1> zeroinitializer
@@ -18,7 +18,7 @@ define <16 x i1> @test1() {
define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL-LABEL: test2:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -30,7 +30,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test2:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k0
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
@@ -40,7 +40,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test2:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_X32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_X32-NEXT: vpmovsxbd %xmm0, %zmm0
@@ -56,19 +56,20 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) {
define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; KNL-LABEL: test3:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test3:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k0
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
@@ -78,15 +79,16 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test3:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL_X32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
+; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_X32-NEXT: retl
%c = and <8 x i1>%a, %b
ret <8 x i1> %c
@@ -94,12 +96,12 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) {
define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
; KNL-LABEL: test4:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandps %xmm1, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test4:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
@@ -108,7 +110,7 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test4:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: vandps %xmm1, %xmm0, %xmm0
; KNL_X32-NEXT: retl
%c = and <4 x i1>%a, %b
@@ -119,13 +121,12 @@ declare <8 x i1> @func8xi1(<8 x i1> %a)
define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; KNL-LABEL: test5:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
-; KNL-NEXT: Lcfi0:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: callq _func8xi1
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vpslld $31, %ymm0, %ymm0
@@ -134,9 +135,8 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test5:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
-; SKX-NEXT: Lcfi0:
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
@@ -149,13 +149,12 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test5:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: subl $12, %esp
-; KNL_X32-NEXT: Lcfi0:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_X32-NEXT: calll _func8xi1
; KNL_X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL_X32-NEXT: vpslld $31, %ymm0, %ymm0
@@ -172,9 +171,8 @@ declare <16 x i1> @func16xi1(<16 x i1> %a)
define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-LABEL: test6:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
-; KNL-NEXT: Lcfi1:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -187,9 +185,8 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test6:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
-; SKX-NEXT: Lcfi1:
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
@@ -202,9 +199,8 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test6:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: subl $12, %esp
-; KNL_X32-NEXT: Lcfi1:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -225,9 +221,8 @@ declare <4 x i1> @func4xi1(<4 x i1> %a)
define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
; KNL-LABEL: test7:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
-; KNL-NEXT: Lcfi2:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; KNL-NEXT: callq _func4xi1
@@ -237,9 +232,8 @@ define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test7:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
-; SKX-NEXT: Lcfi2:
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
@@ -250,9 +244,8 @@ define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test7:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: subl $12, %esp
-; KNL_X32-NEXT: Lcfi2:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; KNL_X32-NEXT: calll _func4xi1
@@ -268,28 +261,27 @@ define <4 x i32> @test7(<4 x i32>%a, <4 x i32>%b) {
define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; KNL-LABEL: test7a:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rax
-; KNL-NEXT: Lcfi3:
; KNL-NEXT: .cfi_def_cfa_offset 16
; KNL-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: callq _func8xi1
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: movb $85, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: popq %rax
; KNL-NEXT: retq
;
; SKX-LABEL: test7a:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rax
-; SKX-NEXT: Lcfi3:
; SKX-NEXT: .cfi_def_cfa_offset 16
; SKX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
@@ -305,21 +297,21 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
; SKX-NEXT: retq
;
; KNL_X32-LABEL: test7a:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: subl $12, %esp
-; KNL_X32-NEXT: Lcfi3:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
-; KNL_X32-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_X32-NEXT: calll _func8xi1
; KNL_X32-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL_X32-NEXT: movb $85, %al
; KNL_X32-NEXT: kmovw %eax, %k1
; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1}
-; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0
+; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL_X32-NEXT: vpmovdw %zmm0, %ymm0
+; KNL_X32-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_X32-NEXT: addl $12, %esp
; KNL_X32-NEXT: retl
%cmpRes = icmp sgt <8 x i32>%a, %b
@@ -330,19 +322,19 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) {
; ALL_X64-LABEL: test8:
-; ALL_X64: ## BB#0:
+; ALL_X64: ## %bb.0:
; ALL_X64-NEXT: testb $1, %dil
; ALL_X64-NEXT: jne LBB8_2
-; ALL_X64-NEXT: ## BB#1:
+; ALL_X64-NEXT: ## %bb.1:
; ALL_X64-NEXT: vmovaps %xmm1, %xmm0
; ALL_X64-NEXT: LBB8_2:
; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test8:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp)
; KNL_X32-NEXT: jne LBB8_2
-; KNL_X32-NEXT: ## BB#1:
+; KNL_X32-NEXT: ## %bb.1:
; KNL_X32-NEXT: vmovaps %xmm1, %xmm0
; KNL_X32-NEXT: LBB8_2:
; KNL_X32-NEXT: retl
@@ -352,13 +344,13 @@ define <16 x i8> @test8(<16 x i8> %a1, <16 x i8> %a2, i1 %cond) {
define i1 @test9(double %a, double %b) {
; ALL_X64-LABEL: test9:
-; ALL_X64: ## BB#0:
+; ALL_X64: ## %bb.0:
; ALL_X64-NEXT: vucomisd %xmm0, %xmm1
; ALL_X64-NEXT: setb %al
; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test9:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL_X32-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0
; KNL_X32-NEXT: setb %al
@@ -369,14 +361,14 @@ define i1 @test9(double %a, double %b) {
define i32 @test10(i32 %a, i32 %b, i1 %cond) {
; ALL_X64-LABEL: test10:
-; ALL_X64: ## BB#0:
+; ALL_X64: ## %bb.0:
; ALL_X64-NEXT: testb $1, %dl
; ALL_X64-NEXT: cmovel %esi, %edi
; ALL_X64-NEXT: movl %edi, %eax
; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test10:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp)
; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; KNL_X32-NEXT: leal {{[0-9]+}}(%esp), %ecx
@@ -389,13 +381,13 @@ define i32 @test10(i32 %a, i32 %b, i1 %cond) {
define i1 @test11(i32 %a, i32 %b) {
; ALL_X64-LABEL: test11:
-; ALL_X64: ## BB#0:
+; ALL_X64: ## %bb.0:
; ALL_X64-NEXT: cmpl %esi, %edi
; ALL_X64-NEXT: setg %al
; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test11:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_X32-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; KNL_X32-NEXT: setg %al
@@ -406,21 +398,15 @@ define i1 @test11(i32 %a, i32 %b) {
define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
; ALL_X64-LABEL: test12:
-; ALL_X64: ## BB#0:
+; ALL_X64: ## %bb.0:
; ALL_X64-NEXT: pushq %rbp
-; ALL_X64-NEXT: Lcfi4:
; ALL_X64-NEXT: .cfi_def_cfa_offset 16
; ALL_X64-NEXT: pushq %r14
-; ALL_X64-NEXT: Lcfi5:
; ALL_X64-NEXT: .cfi_def_cfa_offset 24
; ALL_X64-NEXT: pushq %rbx
-; ALL_X64-NEXT: Lcfi6:
; ALL_X64-NEXT: .cfi_def_cfa_offset 32
-; ALL_X64-NEXT: Lcfi7:
; ALL_X64-NEXT: .cfi_offset %rbx, -32
-; ALL_X64-NEXT: Lcfi8:
; ALL_X64-NEXT: .cfi_offset %r14, -24
-; ALL_X64-NEXT: Lcfi9:
; ALL_X64-NEXT: .cfi_offset %rbp, -16
; ALL_X64-NEXT: movl %esi, %r14d
; ALL_X64-NEXT: movl %edi, %ebp
@@ -440,24 +426,17 @@ define i32 @test12(i32 %a1, i32 %a2, i32 %b1) {
; ALL_X64-NEXT: retq
;
; KNL_X32-LABEL: test12:
-; KNL_X32: ## BB#0:
+; KNL_X32: ## %bb.0:
; KNL_X32-NEXT: pushl %ebx
-; KNL_X32-NEXT: Lcfi4:
; KNL_X32-NEXT: .cfi_def_cfa_offset 8
; KNL_X32-NEXT: pushl %edi
-; KNL_X32-NEXT: Lcfi5:
; KNL_X32-NEXT: .cfi_def_cfa_offset 12
; KNL_X32-NEXT: pushl %esi
-; KNL_X32-NEXT: Lcfi6:
; KNL_X32-NEXT: .cfi_def_cfa_offset 16
; KNL_X32-NEXT: subl $16, %esp
-; KNL_X32-NEXT: Lcfi7:
; KNL_X32-NEXT: .cfi_def_cfa_offset 32
-; KNL_X32-NEXT: Lcfi8:
; KNL_X32-NEXT: .cfi_offset %esi, -16
-; KNL_X32-NEXT: Lcfi9:
; KNL_X32-NEXT: .cfi_offset %edi, -12
-; KNL_X32-NEXT: Lcfi10:
; KNL_X32-NEXT: .cfi_offset %ebx, -8
; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %edi
diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index e29cf09718ad..29b9afecbe5a 100644
--- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: nounwind readnone uwtable
define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, <16 x float> %x) local_unnamed_addr #0 {
; CHECK-LABEL: cmp_kor_seq_16:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0
; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
@@ -19,7 +19,7 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
; CHECK-NEXT: korw %k2, %k1, %k1
; CHECK-NEXT: korw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
entry:
%0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i16 -1, i32 4)
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index b5a13404a230..f5b787de0648 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=SKX
define double @test1(double %a, double %b) nounwind {
; ALL-LABEL: test1:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vucomisd %xmm1, %xmm0
; ALL-NEXT: jne LBB0_1
; ALL-NEXT: jnp LBB0_2
@@ -28,10 +28,10 @@ l2:
define float @test2(float %a, float %b) nounwind {
; ALL-LABEL: test2:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vucomiss %xmm0, %xmm1
; ALL-NEXT: jbe LBB1_2
-; ALL-NEXT: ## BB#1: ## %l1
+; ALL-NEXT: ## %bb.1: ## %l1
; ALL-NEXT: vsubss %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
; ALL-NEXT: LBB1_2: ## %l2
@@ -51,14 +51,14 @@ l2:
define i32 @test3(float %a, float %b) {
; KNL-LABEL: test3:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vcmpeqss %xmm1, %xmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test3:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: movzbl %al, %eax
@@ -71,12 +71,12 @@ define i32 @test3(float %a, float %b) {
define float @test5(float %p) #0 {
; ALL-LABEL: test5:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vucomiss %xmm1, %xmm0
; ALL-NEXT: jne LBB3_1
; ALL-NEXT: jp LBB3_1
-; ALL-NEXT: ## BB#2: ## %return
+; ALL-NEXT: ## %bb.2: ## %return
; ALL-NEXT: retq
; ALL-NEXT: LBB3_1: ## %if.end
; ALL-NEXT: seta %al
@@ -100,7 +100,7 @@ return: ; preds = %if.end, %entry
define i32 @test6(i32 %a, i32 %b) {
; ALL-LABEL: test6:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: xorl %eax, %eax
; ALL-NEXT: cmpl %esi, %edi
; ALL-NEXT: sete %al
@@ -112,7 +112,7 @@ define i32 @test6(i32 %a, i32 %b) {
define i32 @test7(double %x, double %y) #2 {
; ALL-LABEL: test7:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: xorl %eax, %eax
; ALL-NEXT: vucomisd %xmm1, %xmm0
; ALL-NEXT: setne %al
@@ -125,7 +125,7 @@ entry:
define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
; ALL-LABEL: test8:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: notl %edi
; ALL-NEXT: xorl $-2147483648, %esi ## imm = 0x80000000
; ALL-NEXT: testl %edx, %edx
@@ -145,10 +145,10 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
define i32 @test9(i64 %a) {
; ALL-LABEL: test9:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: testb $1, %dil
; ALL-NEXT: jne LBB7_2
-; ALL-NEXT: ## BB#1: ## %A
+; ALL-NEXT: ## %bb.1: ## %A
; ALL-NEXT: movl $6, %eax
; ALL-NEXT: retq
; ALL-NEXT: LBB7_2: ## %B
@@ -165,7 +165,7 @@ B:
define i32 @test10(i64 %b, i64 %c, i1 %d) {
; ALL-LABEL: test10:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: movl %edx, %eax
; ALL-NEXT: andb $1, %al
; ALL-NEXT: cmpq %rsi, %rdi
@@ -174,7 +174,7 @@ define i32 @test10(i64 %b, i64 %c, i1 %d) {
; ALL-NEXT: andb $1, %cl
; ALL-NEXT: cmpb %cl, %al
; ALL-NEXT: je LBB8_1
-; ALL-NEXT: ## BB#2: ## %if.end.i
+; ALL-NEXT: ## %bb.2: ## %if.end.i
; ALL-NEXT: movl $6, %eax
; ALL-NEXT: retq
; ALL-NEXT: LBB8_1: ## %if.then.i
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index e10a781fabc2..e88ec9d7b159 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
@@ -10,7 +10,7 @@
define <16 x float> @sitof32(<16 x i32> %a) nounwind {
; ALL-LABEL: sitof32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = sitofp <16 x i32> %a to <16 x float>
@@ -19,78 +19,104 @@ define <16 x float> @sitof32(<16 x i32> %a) nounwind {
define <8 x double> @sltof864(<8 x i64> %a) {
; NODQ-LABEL: sltof864:
-; NODQ: # BB#0:
+; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: sltof864:
-; DQ: # BB#0:
+; DQ: # %bb.0:
; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
-define <4 x double> @sltof464(<4 x i64> %a) {
-; NODQ-LABEL: sltof464:
-; NODQ: # BB#0:
+define <4 x double> @slto4f64(<4 x i64> %a) {
+; NODQ-LABEL: slto4f64:
+; NODQ: # %bb.0:
; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
-; VLDQ-LABEL: sltof464:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: slto4f64:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
; VLDQ-NEXT: retq
;
-; AVX512DQ-LABEL: sltof464:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-LABEL: slto4f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x double>
ret <4 x double> %b
}
+define <2 x double> @slto2f64(<2 x i64> %a) {
+; NODQ-LABEL: slto2f64:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; NODQ-NEXT: retq
+;
+; VLDQ-LABEL: slto2f64:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: retq
+;
+; AVX512DQ-LABEL: slto2f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: retq
+ %b = sitofp <2 x i64> %a to <2 x double>
+ ret <2 x double> %b
+}
+
define <2 x float> @sltof2f32(<2 x i64> %a) {
; NODQ-LABEL: sltof2f32:
-; NODQ: # BB#0:
+; NODQ: # %bb.0:
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; NODQ-NEXT: vmovq %xmm0, %rax
@@ -101,95 +127,60 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
; NODQ-NEXT: retq
;
; VLDQ-LABEL: sltof2f32:
-; VLDQ: # BB#0:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
; AVX512DQ-LABEL: sltof2f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = sitofp <2 x i64> %a to <2 x float>
ret <2 x float>%b
}
-define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
-; KNL-LABEL: sltof4f32_mem:
-; KNL: # BB#0:
-; KNL-NEXT: vmovdqu (%rdi), %ymm0
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
+; NODQ-LABEL: slto4f32_mem:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vmovdqu (%rdi), %ymm0
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: vzeroupper
+; NODQ-NEXT: retq
;
-; VLDQ-LABEL: sltof4f32_mem:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: slto4f32_mem:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sltof4f32_mem:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
-; AVX512DQ-LABEL: sltof4f32_mem:
-; AVX512DQ: # BB#0:
+; AVX512DQ-LABEL: slto4f32_mem:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: sltof4f32_mem:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
}
-define <4 x i64> @f64tosl(<4 x double> %a) {
-; NODQ-LABEL: f64tosl:
-; NODQ: # BB#0:
+define <4 x i64> @f64to4sl(<4 x double> %a) {
+; NODQ-LABEL: f64to4sl:
+; NODQ: # %bb.0:
; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
@@ -206,24 +197,24 @@ define <4 x i64> @f64tosl(<4 x double> %a) {
; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
-; VLDQ-LABEL: f64tosl:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: f64to4sl:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
; VLDQ-NEXT: retq
;
-; AVX512DQ-LABEL: f64tosl:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-LABEL: f64to4sl:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i64>
ret <4 x i64> %b
}
-define <4 x i64> @f32tosl(<4 x float> %a) {
-; NODQ-LABEL: f32tosl:
-; NODQ: # BB#0:
+define <4 x i64> @f32to4sl(<4 x float> %a) {
+; NODQ-LABEL: f32to4sl:
+; NODQ: # %bb.0:
; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; NODQ-NEXT: vcvttss2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
@@ -240,339 +231,389 @@ define <4 x i64> @f32tosl(<4 x float> %a) {
; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
-; VLDQ-LABEL: f32tosl:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: f32to4sl:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; VLDQ-NEXT: retq
;
-; AVX512DQ-LABEL: f32tosl:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ-LABEL: f32to4sl:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
%b = fptosi <4 x float> %a to <4 x i64>
ret <4 x i64> %b
}
-define <4 x float> @sltof432(<4 x i64> %a) {
-; KNL-LABEL: sltof432:
-; KNL: # BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+define <4 x float> @slto4f32(<4 x i64> %a) {
+; NODQ-LABEL: slto4f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: vzeroupper
+; NODQ-NEXT: retq
;
-; VLDQ-LABEL: sltof432:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: slto4f32:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sltof432:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
-; AVX512DQ-LABEL: sltof432:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-LABEL: slto4f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: sltof432:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
-define <4 x float> @ultof432(<4 x i64> %a) {
-; KNL-LABEL: ultof432:
-; KNL: # BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+define <4 x float> @ulto4f32(<4 x i64> %a) {
+; NODQ-LABEL: ulto4f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: vzeroupper
+; NODQ-NEXT: retq
;
-; VLDQ-LABEL: ultof432:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: ulto4f32:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: ultof432:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
-; AVX512DQ-LABEL: ultof432:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-LABEL: ulto4f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: ultof432:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
-define <8 x double> @ultof64(<8 x i64> %a) {
-; NODQ-LABEL: ultof64:
-; NODQ: # BB#0:
+define <8 x double> @ulto8f64(<8 x i64> %a) {
+; NODQ-LABEL: ulto8f64:
+; NODQ: # %bb.0:
; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; NODQ-NEXT: vpextrq $1, %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; NODQ-NEXT: vmovq %xmm1, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; NODQ-NEXT: vpextrq $1, %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm2, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; NODQ-NEXT: vpextrq $1, %xmm0, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm3
; NODQ-NEXT: vmovq %xmm0, %rax
; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm0
-; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
-; DQ-LABEL: ultof64:
-; DQ: # BB#0:
+; DQ-LABEL: ulto8f64:
+; DQ: # %bb.0:
; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; DQ-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x double>
ret <8 x double> %b
}
-define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
-; ALL-LABEL: fptosi00:
-; ALL: # BB#0:
+define <16 x double> @ulto16f64(<16 x i64> %a) {
+; NODQ-LABEL: ulto16f64:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT: retq
+;
+; DQ-LABEL: ulto16f64:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
+; DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1
+; DQ-NEXT: retq
+ %b = uitofp <16 x i64> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
+; ALL-LABEL: f64to16si:
+; ALL: # %bb.0:
; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
; ALL-NEXT: retq
%b = fptosi <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
-define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
-; ALL-LABEL: fptoui00:
-; ALL: # BB#0:
+define <16 x i8> @f32to16sc(<16 x float> %f) {
+; ALL-LABEL: f32to16sc:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
+ %res = fptosi <16 x float> %f to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <16 x i16> @f32to16ss(<16 x float> %f) {
+; ALL-LABEL: f32to16ss:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vpmovdw %zmm0, %ymm0
+; ALL-NEXT: retq
+ %res = fptosi <16 x float> %f to <16 x i16>
+ ret <16 x i16> %res
+}
+
+define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
+; ALL-LABEL: f32to16ui:
+; ALL: # %bb.0:
; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
; ALL-NEXT: retq
%b = fptoui <16 x float> %a to <16 x i32>
ret <16 x i32> %b
}
-define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
-; NOVL-LABEL: fptoui_256:
-; NOVL: # BB#0:
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+define <16 x i8> @f32to16uc(<16 x float> %f) {
+; ALL-LABEL: f32to16uc:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
+ %res = fptoui <16 x float> %f to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <16 x i16> @f32to16us(<16 x float> %f) {
+; ALL-LABEL: f32to16us:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
+; ALL-NEXT: vpmovdw %zmm0, %ymm0
+; ALL-NEXT: retq
+ %res = fptoui <16 x float> %f to <16 x i16>
+ ret <16 x i16> %res
+}
+
+define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
+; NOVL-LABEL: f32to8ui:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NOVL-NEXT: retq
;
-; VL-LABEL: fptoui_256:
-; VL: # BB#0:
+; VL-LABEL: f32to8ui:
+; VL: # %bb.0:
; VL-NEXT: vcvttps2udq %ymm0, %ymm0
; VL-NEXT: retq
%b = fptoui <8 x float> %a to <8 x i32>
ret <8 x i32> %b
}
-define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
-; KNL-LABEL: fptoui_128:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
+; NOVL-LABEL: f32to4ui:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
-; VL-LABEL: fptoui_128:
-; VL: # BB#0:
+; VL-LABEL: f32to4ui:
+; VL: # %bb.0:
; VL-NEXT: vcvttps2udq %xmm0, %xmm0
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_128:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: fptoui_128:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
-define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
-; ALL-LABEL: fptoui01:
-; ALL: # BB#0:
+define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
+; ALL-LABEL: f64to8ui:
+; ALL: # %bb.0:
; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptoui <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
-define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
-; KNL-LABEL: fptoui_256d:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT: retq
+define <8 x i16> @f64to8us(<8 x double> %f) {
+; NOVL-LABEL: f64to8us:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vpmovdw %zmm0, %ymm0
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
-; VL-LABEL: fptoui_256d:
-; VL: # BB#0:
-; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; VL-LABEL: f64to8us:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT: vpmovdw %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
+ %res = fptoui <8 x double> %f to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i8> @f64to8uc(<8 x double> %f) {
+; NOVL-LABEL: f64to8uc:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vpmovdw %zmm0, %ymm0
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
-; AVX512DQ-LABEL: fptoui_256d:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; VL-LABEL: f64to8uc:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; VL-NEXT: vpmovdw %ymm0, %xmm0
+; VL-NEXT: vzeroupper
+; VL-NEXT: retq
+ %res = fptoui <8 x double> %f to <8 x i8>
+ ret <8 x i8> %res
+}
+
+define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
+; NOVL-LABEL: f64to4ui:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
-; AVX512BW-LABEL: fptoui_256d:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; VL-LABEL: f64to4ui:
+; VL: # %bb.0:
+; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT: vzeroupper
+; VL-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
-define <8 x double> @sitof64(<8 x i32> %a) {
-; ALL-LABEL: sitof64:
-; ALL: # BB#0:
+define <8 x double> @sito8f64(<8 x i32> %a) {
+; ALL-LABEL: sito8f64:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
%b = sitofp <8 x i32> %a to <8 x double>
ret <8 x double> %b
}
-define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: sitof64_mask:
-; KNL: # BB#0:
+define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; KNL-LABEL: i32to8f64_mask:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
-; VLBW-LABEL: sitof64_mask:
-; VLBW: # BB#0:
+; VLBW-LABEL: i32to8f64_mask:
+; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; VLBW-NEXT: retq
;
-; VLNOBW-LABEL: sitof64_mask:
-; VLNOBW: # BB#0:
+; VLNOBW-LABEL: i32to8f64_mask:
+; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; VLNOBW-NEXT: retq
;
-; AVX512DQ-LABEL: sitof64_mask:
-; AVX512DQ: # BB#0:
+; AVX512DQ-LABEL: i32to8f64_mask:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: sitof64_mask:
-; AVX512BW: # BB#0:
+; AVX512BW-LABEL: i32to8f64_mask:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
@@ -581,33 +622,33 @@ define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
-define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: sitof64_maskz:
-; KNL: # BB#0:
+define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; KNL-LABEL: sito8f64_maskz:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
-; VLBW-LABEL: sitof64_maskz:
-; VLBW: # BB#0:
+; VLBW-LABEL: sito8f64_maskz:
+; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; VLBW-NEXT: retq
;
-; VLNOBW-LABEL: sitof64_maskz:
-; VLNOBW: # BB#0:
+; VLNOBW-LABEL: sito8f64_maskz:
+; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; VLNOBW-NEXT: retq
;
-; AVX512DQ-LABEL: sitof64_maskz:
-; AVX512DQ: # BB#0:
+; AVX512DQ-LABEL: sito8f64_maskz:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: sitof64_maskz:
-; AVX512BW: # BB#0:
+; AVX512BW-LABEL: sito8f64_maskz:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
@@ -617,105 +658,71 @@ define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
ret <8 x double> %3
}
-define <8 x i32> @fptosi01(<8 x double> %a) {
-; ALL-LABEL: fptosi01:
-; ALL: # BB#0:
+define <8 x i32> @f64to8si(<8 x double> %a) {
+; ALL-LABEL: f64to8si:
+; ALL: # %bb.0:
; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptosi <8 x double> %a to <8 x i32>
ret <8 x i32> %b
}
-define <4 x i32> @fptosi03(<4 x double> %a) {
-; KNL-LABEL: fptosi03:
-; KNL: # BB#0:
-; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; KNL-NEXT: retq
-;
-; AVX512-LABEL: fptosi03:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+define <4 x i32> @f64to4si(<4 x double> %a) {
+; ALL-LABEL: f64to4si:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
-define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
-; NODQ-LABEL: fptrunc00:
-; NODQ: # BB#0:
-; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0
-; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1
-; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; DQ-LABEL: fptrunc00:
-; DQ: # BB#0:
-; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0
-; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1
-; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
-; DQ-NEXT: retq
+define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
+; ALL-LABEL: f64to16f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
+; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
-define <4 x float> @fptrunc01(<4 x double> %b) {
-; KNL-LABEL: fptrunc01:
-; KNL: # BB#0:
-; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT: retq
-;
-; AVX512-LABEL: fptrunc01:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+define <4 x float> @f64to4f32(<4 x double> %b) {
+; ALL-LABEL: f64to4f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
-define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
-; KNL-LABEL: fptrunc02:
-; KNL: # BB#0:
-; KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0
-; KNL-NEXT: retq
+define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
+; NOVL-LABEL: f64to4f32_mask:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
+; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1
+; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
-; VL-LABEL: fptrunc02:
-; VL: # BB#0:
+; VL-LABEL: f64to4f32_mask:
+; VL: # %bb.0:
; VL-NEXT: vpslld $31, %xmm1, %xmm1
; VL-NEXT: vptestmd %xmm1, %xmm1, %k1
; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; VL-NEXT: vzeroupper
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptrunc02:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: fptrunc02:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
}
-define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind {
-; ALL-LABEL: fptrunc03:
-; ALL: # BB#0:
+define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: f64tof32_inreg:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
; ALL-NEXT: retq
%ext = extractelement <2 x double> %a0, i32 0
@@ -724,37 +731,37 @@ define <4 x float> @fptrunc03(<2 x double> %a0, <4 x float> %a1) nounwind {
ret <4 x float> %res
}
-define <8 x double> @fpext00(<8 x float> %b) nounwind {
-; ALL-LABEL: fpext00:
-; ALL: # BB#0:
+define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
+; ALL-LABEL: f32to8f64:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
; ALL-NEXT: retq
%a = fpext <8 x float> %b to <8 x double>
ret <8 x double> %a
}
-define <4 x double> @fpext01(<4 x float> %b, <4 x double>%b1, <4 x double>%a1) {
-; NOVL-LABEL: fpext01:
-; NOVL: # BB#0:
+define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
+; NOVL-LABEL: f32to4f64_mask:
+; NOVL: # %bb.0:
; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0
; NOVL-NEXT: vcmpltpd %ymm2, %ymm1, %ymm1
; NOVL-NEXT: vandpd %ymm0, %ymm1, %ymm0
; NOVL-NEXT: retq
;
-; VL-LABEL: fpext01:
-; VL: # BB#0:
+; VL-LABEL: f32to4f64_mask:
+; VL: # %bb.0:
; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1
; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
; VL-NEXT: retq
%a = fpext <4 x float> %b to <4 x double>
- %mask = fcmp ogt <4 x double>%a1, %b1
- %c = select <4 x i1>%mask, <4 x double>%a, <4 x double>zeroinitializer
+ %mask = fcmp ogt <4 x double> %a1, %b1
+ %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
ret <4 x double> %c
}
-define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind {
-; ALL-LABEL: fpext02:
-; ALL: # BB#0:
+define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; ALL-LABEL: f32tof64_inreg:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
; ALL-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
@@ -763,9 +770,9 @@ define <2 x double> @fpext02(<2 x double> %a0, <4 x float> %a1) nounwind {
ret <2 x double> %res
}
-define double @funcA(i64* nocapture %e) {
-; ALL-LABEL: funcA:
-; ALL: # BB#0: # %entry
+define double @sltof64_load(i64* nocapture %e) {
+; ALL-LABEL: sltof64_load:
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -774,9 +781,9 @@ entry:
ret double %conv
}
-define double @funcB(i32* %e) {
-; ALL-LABEL: funcB:
-; ALL: # BB#0: # %entry
+define double @sitof64_load(i32* %e) {
+; ALL-LABEL: sitof64_load:
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -785,9 +792,9 @@ entry:
ret double %conv
}
-define float @funcC(i32* %e) {
-; ALL-LABEL: funcC:
-; ALL: # BB#0: # %entry
+define float @sitof32_load(i32* %e) {
+; ALL-LABEL: sitof32_load:
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -796,9 +803,9 @@ entry:
ret float %conv
}
-define float @i64tof32(i64* %e) {
-; ALL-LABEL: i64tof32:
-; ALL: # BB#0: # %entry
+define float @sltof32_load(i64* %e) {
+; ALL-LABEL: sltof32_load:
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
; ALL-NEXT: retq
entry:
@@ -807,9 +814,9 @@ entry:
ret float %conv
}
-define void @fpext() {
-; ALL-LABEL: fpext:
-; ALL: # BB#0: # %entry
+define void @f32tof64_loadstore() {
+; ALL-LABEL: f32tof64_loadstore:
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
@@ -823,9 +830,9 @@ entry:
ret void
}
-define void @fpround_scalar() nounwind uwtable {
-; ALL-LABEL: fpround_scalar:
-; ALL: # BB#0: # %entry
+define void @f64tof32_loadstore() nounwind uwtable {
+; ALL-LABEL: f64tof32_loadstore:
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
@@ -841,7 +848,7 @@ entry:
define double @long_to_double(i64 %x) {
; ALL-LABEL: long_to_double:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovq %rdi, %xmm0
; ALL-NEXT: retq
%res = bitcast i64 %x to double
@@ -850,7 +857,7 @@ define double @long_to_double(i64 %x) {
define i64 @double_to_long(double %x) {
; ALL-LABEL: double_to_long:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovq %xmm0, %rax
; ALL-NEXT: retq
%res = bitcast double %x to i64
@@ -859,7 +866,7 @@ define i64 @double_to_long(double %x) {
define float @int_to_float(i32 %x) {
; ALL-LABEL: int_to_float:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovd %edi, %xmm0
; ALL-NEXT: retq
%res = bitcast i32 %x to float
@@ -868,59 +875,371 @@ define float @int_to_float(i32 %x) {
define i32 @float_to_int(float %x) {
; ALL-LABEL: float_to_int:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: retq
%res = bitcast float %x to i32
ret i32 %res
}
-define <16 x double> @uitof64(<16 x i32> %a) nounwind {
-; NODQ-LABEL: uitof64:
-; NODQ: # BB#0:
-; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2
-; NODQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1
-; NODQ-NEXT: vmovaps %zmm2, %zmm0
+define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
+; ALL-LABEL: uito16f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1
+; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: retq
+ %b = uitofp <16 x i32> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <8 x float> @slto8f32(<8 x i64> %a) {
+; NODQ-LABEL: slto8f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; NODQ-NEXT: retq
;
-; DQ-LABEL: uitof64:
-; DQ: # BB#0:
-; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2
-; DQ-NEXT: vextracti32x8 $1, %zmm0, %ymm0
-; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1
-; DQ-NEXT: vmovaps %zmm2, %zmm0
+; DQ-LABEL: slto8f32:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; DQ-NEXT: retq
- %b = uitofp <16 x i32> %a to <16 x double>
+ %b = sitofp <8 x i64> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+define <16 x float> @slto16f32(<16 x i64> %a) {
+; NODQ-LABEL: slto16f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
+;
+; DQ-LABEL: slto16f32:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
+; DQ-NEXT: vcvtqq2ps %zmm1, %ymm1
+; DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; DQ-NEXT: retq
+ %b = sitofp <16 x i64> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @slto8f64(<8 x i64> %a) {
+; NODQ-LABEL: slto8f64:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
+;
+; DQ-LABEL: slto8f64:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; DQ-NEXT: retq
+ %b = sitofp <8 x i64> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @slto16f64(<16 x i64> %a) {
+; NODQ-LABEL: slto16f64:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT: retq
+;
+; DQ-LABEL: slto16f64:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
+; DQ-NEXT: vcvtqq2pd %zmm1, %zmm1
+; DQ-NEXT: retq
+ %b = sitofp <16 x i64> %a to <16 x double>
ret <16 x double> %b
}
-define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: uitof64_mask:
-; KNL: # BB#0:
+
+define <8 x float> @ulto8f32(<8 x i64> %a) {
+; NODQ-LABEL: ulto8f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
+;
+; DQ-LABEL: ulto8f32:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
+; DQ-NEXT: retq
+ %b = uitofp <8 x i64> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+define <16 x float> @ulto16f32(<16 x i64> %a) {
+; NODQ-LABEL: ulto16f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; NODQ-NEXT: retq
+;
+; DQ-LABEL: ulto16f32:
+; DQ: # %bb.0:
+; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
+; DQ-NEXT: vcvtuqq2ps %zmm1, %ymm1
+; DQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; DQ-NEXT: retq
+ %b = uitofp <16 x i64> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; KNL-LABEL: uito8f64_mask:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
-; VLBW-LABEL: uitof64_mask:
-; VLBW: # BB#0:
+; VLBW-LABEL: uito8f64_mask:
+; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; VLBW-NEXT: retq
;
-; VLNOBW-LABEL: uitof64_mask:
-; VLNOBW: # BB#0:
+; VLNOBW-LABEL: uito8f64_mask:
+; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; VLNOBW-NEXT: retq
;
-; AVX512DQ-LABEL: uitof64_mask:
-; AVX512DQ: # BB#0:
+; AVX512DQ-LABEL: uito8f64_mask:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: uitof64_mask:
-; AVX512BW: # BB#0:
+; AVX512BW-LABEL: uito8f64_mask:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
@@ -929,33 +1248,33 @@ define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind
%3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
ret <8 x double> %3
}
-define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: uitof64_maskz:
-; KNL: # BB#0:
+define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; KNL-LABEL: uito8f64_maskz:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
-; VLBW-LABEL: uitof64_maskz:
-; VLBW: # BB#0:
+; VLBW-LABEL: uito8f64_maskz:
+; VLBW: # %bb.0:
; VLBW-NEXT: kmovd %edi, %k1
; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; VLBW-NEXT: retq
;
-; VLNOBW-LABEL: uitof64_maskz:
-; VLNOBW: # BB#0:
+; VLNOBW-LABEL: uito8f64_maskz:
+; VLNOBW: # %bb.0:
; VLNOBW-NEXT: kmovw %edi, %k1
; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; VLNOBW-NEXT: retq
;
-; AVX512DQ-LABEL: uitof64_maskz:
-; AVX512DQ: # BB#0:
+; AVX512DQ-LABEL: uito8f64_maskz:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k1
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: uitof64_maskz:
-; AVX512BW: # BB#0:
+; AVX512BW-LABEL: uito8f64_maskz:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
@@ -965,127 +1284,121 @@ define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
ret <8 x double> %3
}
-define <4 x double> @uitof64_256(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uitof64_256:
-; NOVL: # BB#0:
-; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
+; NOVL-LABEL: uito4f64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NOVL-NEXT: retq
;
-; VL-LABEL: uitof64_256:
-; VL: # BB#0:
+; VL-LABEL: uito4f64:
+; VL: # %bb.0:
; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; VL-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x double>
ret <4 x double> %b
}
-define <16 x float> @uitof32(<16 x i32> %a) nounwind {
-; ALL-LABEL: uitof32:
-; ALL: # BB#0:
+define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
+; ALL-LABEL: uito16f32:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x float>
ret <16 x float> %b
}
-define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
-; NOVL-LABEL: uitof32_256:
-; NOVL: # BB#0:
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+define <8 x double> @uito8f64(<8 x i32> %a) {
+; ALL-LABEL: uito8f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
+ %b = uitofp <8 x i32> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
+; NOVL-LABEL: uito8f32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NOVL-NEXT: retq
;
-; VL-LABEL: uitof32_256:
-; VL: # BB#0:
+; VL-LABEL: uito8f32:
+; VL: # %bb.0:
; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; VL-NEXT: retq
%b = uitofp <8 x i32> %a to <8 x float>
ret <8 x float> %b
}
-define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
-; KNL-LABEL: uitof32_128:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
+; NOVL-LABEL: uito4f32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
-; VL-LABEL: uitof32_128:
-; VL: # BB#0:
+; VL-LABEL: uito4f32:
+; VL: # %bb.0:
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitof32_128:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: uitof32_128:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
-define i32 @fptosi02(float %a) nounwind {
-; ALL-LABEL: fptosi02:
-; ALL: # BB#0:
+define i32 @fptosi(float %a) nounwind {
+; ALL-LABEL: fptosi:
+; ALL: # %bb.0:
; ALL-NEXT: vcvttss2si %xmm0, %eax
; ALL-NEXT: retq
%b = fptosi float %a to i32
ret i32 %b
}
-define i32 @fptoui02(float %a) nounwind {
-; ALL-LABEL: fptoui02:
-; ALL: # BB#0:
+define i32 @fptoui(float %a) nounwind {
+; ALL-LABEL: fptoui:
+; ALL: # %bb.0:
; ALL-NEXT: vcvttss2usi %xmm0, %eax
; ALL-NEXT: retq
%b = fptoui float %a to i32
ret i32 %b
}
-define float @uitofp02(i32 %a) nounwind {
-; ALL-LABEL: uitofp02:
-; ALL: # BB#0:
+define float @uitof32(i32 %a) nounwind {
+; ALL-LABEL: uitof32:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
; ALL-NEXT: retq
%b = uitofp i32 %a to float
ret float %b
}
-define double @uitofp03(i32 %a) nounwind {
-; ALL-LABEL: uitofp03:
-; ALL: # BB#0:
+define double @uitof64(i32 %a) nounwind {
+; ALL-LABEL: uitof64:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
; ALL-NEXT: retq
%b = uitofp i32 %a to double
ret double %b
}
-define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
-; NODQ-LABEL: sitofp_16i1_float:
-; NODQ: # BB#0:
-; NODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+define <16 x float> @sbto16f32(<16 x i32> %a) {
+; NODQ-LABEL: sbto16f32:
+; NODQ: # %bb.0:
+; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
; NODQ-NEXT: retq
;
-; DQ-LABEL: sitofp_16i1_float:
-; DQ: # BB#0:
-; DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; DQ-LABEL: sbto16f32:
+; DQ: # %bb.0:
+; DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; DQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; DQ-NEXT: vpmovm2d %k0, %zmm0
; DQ-NEXT: vcvtdq2ps %zmm0, %zmm0
@@ -1095,9 +1408,9 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) {
ret <16 x float> %1
}
-define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
-; ALL-LABEL: sitofp_16i8_float:
-; ALL: # BB#0:
+define <16 x float> @scto16f32(<16 x i8> %a) {
+; ALL-LABEL: scto16f32:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1105,9 +1418,9 @@ define <16 x float> @sitofp_16i8_float(<16 x i8> %a) {
ret <16 x float> %1
}
-define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
-; ALL-LABEL: sitofp_16i16_float:
-; ALL: # BB#0:
+define <16 x float> @ssto16f32(<16 x i16> %a) {
+; ALL-LABEL: ssto16f32:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1115,9 +1428,9 @@ define <16 x float> @sitofp_16i16_float(<16 x i16> %a) {
ret <16 x float> %1
}
-define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
-; ALL-LABEL: sitofp_8i16_double:
-; ALL: # BB#0:
+define <8 x double> @ssto16f64(<8 x i16> %a) {
+; ALL-LABEL: ssto16f64:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
; ALL-NEXT: retq
@@ -1125,9 +1438,9 @@ define <8 x double> @sitofp_8i16_double(<8 x i16> %a) {
ret <8 x double> %1
}
-define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
-; ALL-LABEL: sitofp_8i8_double:
-; ALL: # BB#0:
+define <8 x double> @scto8f64(<8 x i8> %a) {
+; ALL-LABEL: scto8f64:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: vpslld $24, %ymm0, %ymm0
; ALL-NEXT: vpsrad $24, %ymm0, %ymm0
@@ -1137,23 +1450,33 @@ define <8 x double> @sitofp_8i8_double(<8 x i8> %a) {
ret <8 x double> %1
}
-define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
-; NOVLDQ-LABEL: sitofp_16i1_double:
-; NOVLDQ: # BB#0:
-; NOVLDQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+define <16 x double> @scto16f64(<16 x i8> %a) {
+; ALL-LABEL: scto16f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxbd %xmm0, %zmm1
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT: retq
+ %b = sitofp <16 x i8> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x double> @sbto16f64(<16 x double> %a) {
+; NOVLDQ-LABEL: sbto16f64:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
-; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
-; NOVLDQ-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NOVLDQ-NEXT: vpmovqd %zmm1, %ymm1
+; NOVLDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; NOVLDQ-NEXT: retq
;
-; VLDQ-LABEL: sitofp_16i1_double:
-; VLDQ: # BB#0:
-; VLDQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; VLDQ-LABEL: sbto16f64:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
; VLDQ-NEXT: vpmovm2d %k1, %ymm0
@@ -1162,9 +1485,9 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_16i1_double:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; VLNODQ-LABEL: sbto16f64:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2
; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -1174,9 +1497,9 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
; VLNODQ-NEXT: retq
;
-; AVX512DQ-LABEL: sitofp_16i1_double:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; AVX512DQ-LABEL: sbto16f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
@@ -1189,36 +1512,35 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
ret <16 x double> %1
}
-define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
-; NOVLDQ-LABEL: sitofp_8i1_double:
-; NOVLDQ: # BB#0:
-; NOVLDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+define <8 x double> @sbto8f64(<8 x double> %a) {
+; NOVLDQ-LABEL: sbto8f64:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVLDQ-NEXT: retq
;
-; VLDQ-LABEL: sitofp_8i1_double:
-; VLDQ: # BB#0:
-; VLDQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
+; VLDQ-LABEL: sbto8f64:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_8i1_double:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VLNODQ-LABEL: sbto8f64:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
; VLNODQ-NEXT: retq
;
-; AVX512DQ-LABEL: sitofp_8i1_double:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vxorpd %zmm1, %zmm1, %zmm1
+; AVX512DQ-LABEL: sbto8f64:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2pd %ymm0, %zmm0
@@ -1228,38 +1550,37 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
ret <8 x double> %1
}
-define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
-; NOVLDQ-LABEL: sitofp_8i1_float:
-; NOVLDQ: # BB#0:
-; NOVLDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NOVLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+define <8 x float> @sbto8f32(<8 x float> %a) {
+; NOVLDQ-LABEL: sbto8f32:
+; NOVLDQ: # %bb.0:
+; NOVLDQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0
+; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; NOVLDQ-NEXT: retq
;
-; VLDQ-LABEL: sitofp_8i1_float:
-; VLDQ: # BB#0:
-; VLDQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; VLDQ-LABEL: sbto8f32:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %ymm0
; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_8i1_float:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VLNODQ-LABEL: sbto8f32:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0
; VLNODQ-NEXT: retq
;
-; AVX512DQ-LABEL: sitofp_8i1_float:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512DQ-LABEL: sbto8f32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -1269,24 +1590,24 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
ret <8 x float> %1
}
-define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
-; NOVL-LABEL: sitofp_4i1_float:
-; NOVL: # BB#0:
+define <4 x float> @sbto4f32(<4 x float> %a) {
+; NOVL-LABEL: sbto4f32:
+; NOVL: # %bb.0:
; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVL-NEXT: retq
;
-; VLDQ-LABEL: sitofp_4i1_float:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: sbto4f32:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_4i1_float:
-; VLNODQ: # BB#0:
+; VLNODQ-LABEL: sbto4f32:
+; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -1298,26 +1619,26 @@ define <4 x float> @sitofp_4i1_float(<4 x float> %a) {
ret <4 x float> %1
}
-define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
-; NOVL-LABEL: sitofp_4i1_double:
-; NOVL: # BB#0:
-; NOVL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+define <4 x double> @sbto4f64(<4 x double> %a) {
+; NOVL-LABEL: sbto4f64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; NOVL-NEXT: vpmovqd %zmm0, %ymm0
; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; NOVL-NEXT: retq
;
-; VLDQ-LABEL: sitofp_4i1_double:
-; VLDQ: # BB#0:
-; VLDQ-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; VLDQ-LABEL: sbto4f64:
+; VLDQ: # %bb.0:
+; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_4i1_double:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VLNODQ-LABEL: sbto4f64:
+; VLNODQ: # %bb.0:
+; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -1328,24 +1649,24 @@ define <4 x double> @sitofp_4i1_double(<4 x double> %a) {
ret <4 x double> %1
}
-define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
-; NOVL-LABEL: sitofp_2i1_float:
-; NOVL: # BB#0:
+define <2 x float> @sbto2f32(<2 x float> %a) {
+; NOVL-LABEL: sbto2f32:
+; NOVL: # %bb.0:
; NOVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vcvtdq2ps %xmm0, %xmm0
; NOVL-NEXT: retq
;
-; VLDQ-LABEL: sitofp_2i1_float:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: sbto2f32:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2d %k0, %xmm0
; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_2i1_float:
-; VLNODQ: # BB#0:
+; VLNODQ-LABEL: sbto2f32:
+; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -1357,43 +1678,43 @@ define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
ret <2 x float> %1
}
-define <2 x double> @sitofp_2i1_double(<2 x double> %a) {
-; NOVL-LABEL: sitofp_2i1_double:
-; NOVL: # BB#0:
+define <2 x double> @sbto2f64(<2 x double> %a) {
+; NOVL-LABEL: sbto2f64:
+; NOVL: # %bb.0:
; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
; NOVL-NEXT: retq
;
-; VLDQ-LABEL: sitofp_2i1_double:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: sbto2f64:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
; VLDQ-NEXT: vpmovm2q %k0, %xmm0
; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: sitofp_2i1_double:
-; VLNODQ: # BB#0:
+; VLNODQ-LABEL: sbto2f64:
+; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1
+; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1
; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0
+; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VLNODQ-NEXT: retq
%cmpres = fcmp ogt <2 x double> %a, zeroinitializer
%1 = sitofp <2 x i1> %cmpres to <2 x double>
ret <2 x double> %1
}
-define <16 x float> @uitofp_16i8(<16 x i8>%a) {
-; ALL-LABEL: uitofp_16i8:
-; ALL: # BB#0:
+define <16 x float> @ucto16f32(<16 x i8> %a) {
+; ALL-LABEL: ucto16f32:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
@@ -1401,152 +1722,255 @@ define <16 x float> @uitofp_16i8(<16 x i8>%a) {
ret <16 x float>%b
}
-define <16 x float> @uitofp_16i16(<16 x i16>%a) {
-; ALL-LABEL: uitofp_16i16:
-; ALL: # BB#0:
+define <8 x double> @ucto8f64(<8 x i8> %a) {
+; ALL-LABEL: ucto8f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
+ %b = uitofp <8 x i8> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x float> @swto16f32(<16 x i16> %a) {
+; ALL-LABEL: swto16f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
+ %b = sitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @swto8f64(<8 x i16> %a) {
+; ALL-LABEL: swto8f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
+ %b = sitofp <8 x i16> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @swto16f64(<16 x i16> %a) {
+; ALL-LABEL: swto16f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxwd %ymm0, %zmm1
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT: retq
+ %b = sitofp <16 x i16> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x double> @ucto16f64(<16 x i8> %a) {
+; ALL-LABEL: ucto16f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT: retq
+ %b = uitofp <16 x i8> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x float> @uwto16f32(<16 x i16> %a) {
+; ALL-LABEL: uwto16f32:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%b = uitofp <16 x i16> %a to <16 x float>
- ret <16 x float>%b
+ ret <16 x float> %b
}
-define <16 x float> @uitofp_16i1_float(<16 x i32> %a) {
-; ALL-LABEL: uitofp_16i1_float:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+define <8 x double> @uwto8f64(<8 x i16> %a) {
+; ALL-LABEL: uwto8f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
+; ALL-NEXT: retq
+ %b = uitofp <8 x i16> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @uwto16f64(<16 x i16> %a) {
+; ALL-LABEL: uwto16f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
+; ALL-NEXT: retq
+ %b = uitofp <16 x i16> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x float> @sito16f32(<16 x i32> %a) {
+; ALL-LABEL: sito16f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
+ %b = sitofp <16 x i32> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <16 x double> @sito16f64(<16 x i32> %a) {
+; ALL-LABEL: sito16f64:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1
+; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: retq
+ %b = sitofp <16 x i32> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x float> @usto16f32(<16 x i16> %a) {
+; ALL-LABEL: usto16f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
+; ALL-NEXT: retq
+ %b = uitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <16 x float> @ubto16f32(<16 x i32> %a) {
+; ALL-LABEL: ubto16f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; ALL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; ALL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
; ALL-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = uitofp <16 x i1> %mask to <16 x float>
ret <16 x float> %1
}
-define <16 x double> @uitofp_16i1_double(<16 x i32> %a) {
-; NOVL-LABEL: uitofp_16i1_double:
-; NOVL: # BB#0:
-; NOVL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+define <16 x double> @ubto16f64(<16 x i32> %a) {
+; NOVL-LABEL: ubto16f64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVL-NEXT: movq {{.*}}(%rip), %rax
-; NOVL-NEXT: vpbroadcastq %rax, %zmm0 {%k1} {z}
-; NOVL-NEXT: vpmovqd %zmm0, %ymm0
-; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: movl {{.*}}(%rip), %eax
+; NOVL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; NOVL-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVL-NEXT: kshiftrw $8, %k1, %k1
-; NOVL-NEXT: vpbroadcastq %rax, %zmm1 {%k1} {z}
-; NOVL-NEXT: vpmovqd %zmm1, %ymm1
-; NOVL-NEXT: vcvtudq2pd %ymm1, %zmm1
+; NOVL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; NOVL-NEXT: vcvtdq2pd %ymm1, %zmm1
; NOVL-NEXT: retq
;
-; VL-LABEL: uitofp_16i1_double:
-; VL: # BB#0:
-; VL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VL-LABEL: ubto16f64:
+; VL: # %bb.0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; VL-NEXT: movl {{.*}}(%rip), %eax
; VL-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z}
-; VL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; VL-NEXT: vcvtdq2pd %ymm0, %zmm0
; VL-NEXT: kshiftrw $8, %k1, %k1
; VL-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z}
-; VL-NEXT: vcvtudq2pd %ymm1, %zmm1
+; VL-NEXT: vcvtdq2pd %ymm1, %zmm1
; VL-NEXT: retq
%mask = icmp slt <16 x i32> %a, zeroinitializer
%1 = uitofp <16 x i1> %mask to <16 x double>
ret <16 x double> %1
}
-define <8 x float> @uitofp_8i1_float(<8 x i32> %a) {
-; NOVL-LABEL: uitofp_8i1_float:
-; NOVL: # BB#0:
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+define <8 x float> @ubto8f32(<8 x i32> %a) {
+; NOVL-LABEL: ubto8f32:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; NOVL-NEXT: vpmovqd %zmm0, %ymm0
-; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVL-NEXT: vcvtdq2ps %ymm0, %ymm0
; NOVL-NEXT: retq
;
-; VL-LABEL: uitofp_8i1_float:
-; VL: # BB#0:
-; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VL-LABEL: ubto8f32:
+; VL: # %bb.0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
+; VL-NEXT: vcvtdq2ps %ymm0, %ymm0
; VL-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
%1 = uitofp <8 x i1> %mask to <8 x float>
ret <8 x float> %1
}
-define <8 x double> @uitofp_8i1_double(<8 x i32> %a) {
-; NOVL-LABEL: uitofp_8i1_double:
-; NOVL: # BB#0:
-; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NOVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+define <8 x double> @ubto8f64(<8 x i32> %a) {
+; NOVL-LABEL: ubto8f64:
+; NOVL: # %bb.0:
+; NOVL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NOVL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; NOVL-NEXT: vpmovqd %zmm0, %ymm0
-; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; NOVL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NOVL-NEXT: vcvtdq2pd %ymm0, %zmm0
; NOVL-NEXT: retq
;
-; VL-LABEL: uitofp_8i1_double:
-; VL: # BB#0:
-; VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; VL-LABEL: ubto8f64:
+; VL: # %bb.0:
+; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; VL-NEXT: vcvtudq2pd %ymm0, %zmm0
+; VL-NEXT: vcvtdq2pd %ymm0, %zmm0
; VL-NEXT: retq
%mask = icmp slt <8 x i32> %a, zeroinitializer
%1 = uitofp <8 x i1> %mask to <8 x double>
ret <8 x double> %1
}
-define <4 x float> @uitofp_4i1_float(<4 x i32> %a) {
-; NOVL-LABEL: uitofp_4i1_float:
-; NOVL: # BB#0:
+define <4 x float> @ubto4f32(<4 x i32> %a) {
+; NOVL-LABEL: ubto4f32:
+; NOVL: # %bb.0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0
; NOVL-NEXT: retq
;
-; VL-LABEL: uitofp_4i1_float:
-; VL: # BB#0:
+; VL-LABEL: ubto4f32:
+; VL: # %bb.0:
; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: vcvtdq2ps %xmm0, %xmm0
; VL-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x float>
ret <4 x float> %1
}
-define <4 x double> @uitofp_4i1_double(<4 x i32> %a) {
-; NOVL-LABEL: uitofp_4i1_double:
-; NOVL: # BB#0:
+define <4 x double> @ubto4f64(<4 x i32> %a) {
+; NOVL-LABEL: ubto4f64:
+; NOVL: # %bb.0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpsrld $31, %xmm0, %xmm0
; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; NOVL-NEXT: retq
;
-; VL-LABEL: uitofp_4i1_double:
-; VL: # BB#0:
+; VL-LABEL: ubto4f64:
+; VL: # %bb.0:
; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
+; VL-NEXT: vcvtdq2pd %xmm0, %ymm0
; VL-NEXT: retq
%mask = icmp slt <4 x i32> %a, zeroinitializer
%1 = uitofp <4 x i1> %mask to <4 x double>
ret <4 x double> %1
}
-define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
-; NOVL-LABEL: uitofp_2i1_float:
-; NOVL: # BB#0:
+define <2 x float> @ubto2f32(<2 x i32> %a) {
+; NOVL-LABEL: ubto2f32:
+; NOVL: # %bb.0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -1559,48 +1983,48 @@ define <2 x float> @uitofp_2i1_float(<2 x i32> %a) {
; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; NOVL-NEXT: retq
;
-; VL-LABEL: uitofp_2i1_float:
-; VL: # BB#0:
+; VL-LABEL: ubto2f32:
+; VL: # %bb.0:
; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VL-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
-; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
+; VL-NEXT: vcvtdq2ps %xmm0, %xmm0
; VL-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
}
-define <2 x double> @uitofp_2i1_double(<2 x i32> %a) {
-; NOVL-LABEL: uitofp_2i1_double:
-; NOVL: # BB#0:
+define <2 x double> @ubto2f64(<2 x i32> %a) {
+; NOVL-LABEL: ubto2f64:
+; NOVL: # %bb.0:
; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NOVL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; NOVL-NEXT: retq
;
-; VLDQ-LABEL: uitofp_2i1_double:
-; VLDQ: # BB#0:
+; VLDQ-LABEL: ubto2f64:
+; VLDQ: # %bb.0:
; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
-; VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0
+; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: uitofp_2i1_double:
-; VLNODQ: # BB#0:
+; VLNODQ-LABEL: ubto2f64:
+; VLNODQ: # %bb.0:
; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1
; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1
+; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1
; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; VLNODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VLNODQ-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm0
+; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VLNODQ-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x double>
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index 2145f5fb09a8..97beff63811a 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=SKX
define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x8mem_to_8x16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
@@ -12,7 +12,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -25,7 +25,7 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x8mem_to_8x16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbw (%rdi), %xmm1
; KNL-NEXT: vpsllw $15, %xmm0, %xmm0
; KNL-NEXT: vpsraw $15, %xmm0, %xmm0
@@ -33,7 +33,7 @@ define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x8mem_to_8x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z}
@@ -47,16 +47,16 @@ define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re
define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x8mem_to_16x16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8mem_to_16x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
@@ -69,16 +69,16 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_16x8mem_to_16x16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbw (%rdi), %ymm1
+; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm0, %ymm0
; KNL-NEXT: vpsraw $15, %ymm0, %ymm0
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16x8mem_to_16x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z}
@@ -91,7 +91,7 @@ define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi
define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
; ALL-LABEL: zext_16x8_to_16x16:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; ALL-NEXT: retq
%x = zext <16 x i8> %a to <16 x i16>
@@ -100,7 +100,7 @@ define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x8_to_16x16_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -109,7 +109,7 @@ define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8_to_16x16_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -121,7 +121,7 @@ define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
; ALL-LABEL: sext_16x8_to_16x16:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbw %xmm0, %ymm0
; ALL-NEXT: retq
%x = sext <16 x i8> %a to <16 x i16>
@@ -130,7 +130,7 @@ define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_16x8_to_16x16_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -139,7 +139,7 @@ define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16x8_to_16x16_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z}
@@ -151,7 +151,7 @@ define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwi
define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_32x8mem_to_32x16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -167,7 +167,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8mem_to_32x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
@@ -180,7 +180,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_32x8mem_to_32x16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbw 16(%rdi), %ymm1
; KNL-NEXT: vpmovsxbw (%rdi), %ymm2
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -196,7 +196,7 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8mem_to_32x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z}
@@ -209,7 +209,7 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi
define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-LABEL: zext_32x8_to_32x16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -217,7 +217,7 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; SKX-NEXT: retq
%x = zext <32 x i8> %a to <32 x i16>
@@ -226,7 +226,7 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_32x8_to_32x16_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -242,7 +242,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32x8_to_32x16_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
@@ -254,7 +254,7 @@ define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-LABEL: sext_32x8_to_32x16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbw %xmm0, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbw %xmm0, %ymm1
@@ -262,7 +262,7 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbw %ymm0, %zmm0
; SKX-NEXT: retq
%x = sext <32 x i8> %a to <32 x i16>
@@ -271,7 +271,7 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_32x8_to_32x16_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
; KNL-NEXT: vpmovsxbw %xmm2, %ymm2
; KNL-NEXT: vpmovsxbw %xmm0, %ymm0
@@ -287,7 +287,7 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: sext_32x8_to_32x16_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
@@ -299,7 +299,7 @@ define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwi
define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_4x8mem_to_4x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -307,7 +307,7 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4x8mem_to_4x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -320,7 +320,7 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_4x8mem_to_4x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd (%rdi), %xmm1
@@ -328,7 +328,7 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: sext_4x8mem_to_4x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z}
@@ -341,18 +341,18 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re
define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x8mem_to_8x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -365,18 +365,18 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x8mem_to_8x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vpmovsxbd (%rdi), %ymm1
-; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x8mem_to_8x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z}
@@ -389,7 +389,7 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re
define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x8mem_to_16x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -397,7 +397,7 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8mem_to_16x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
@@ -410,7 +410,7 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_16x8mem_to_16x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -418,7 +418,7 @@ define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16x8mem_to_16x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z}
@@ -431,7 +431,7 @@ define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi
define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x8_to_16x32_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -439,7 +439,7 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x8_to_16x32_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -451,7 +451,7 @@ define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_16x8_to_16x32_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -459,7 +459,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16x8_to_16x32_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z}
@@ -471,7 +471,7 @@ define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounw
define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
; ALL-LABEL: zext_16x8_to_16x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; ALL-NEXT: retq
%x = zext <16 x i8> %i to <16 x i32>
@@ -480,7 +480,7 @@ define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
; ALL-LABEL: sext_16x8_to_16x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
; ALL-NEXT: retq
%x = sext <16 x i8> %i to <16 x i32>
@@ -489,7 +489,7 @@ define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_2x8mem_to_2x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vpsraq $63, %zmm0, %zmm0
; KNL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
@@ -497,7 +497,7 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: zext_2x8mem_to_2x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
@@ -509,7 +509,7 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind re
}
define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_2x8mem_to_2x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vpsraq $63, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbq (%rdi), %xmm1
@@ -517,7 +517,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwin
; KNL-NEXT: retq
;
; SKX-LABEL: sext_2x8mem_to_2x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z}
@@ -529,7 +529,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwin
}
define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone {
; ALL-LABEL: sext_2x8mem_to_2x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbq (%rdi), %xmm0
; ALL-NEXT: retq
%a = load <2 x i8>,<2 x i8> *%i,align 1
@@ -539,7 +539,7 @@ define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone {
define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_4x8mem_to_4x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -548,7 +548,7 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4x8mem_to_4x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
@@ -561,7 +561,7 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re
define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_4x8mem_to_4x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -570,7 +570,7 @@ define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwin
; KNL-NEXT: retq
;
; SKX-LABEL: sext_4x8mem_to_4x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z}
@@ -583,7 +583,7 @@ define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwin
define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone {
; ALL-LABEL: sext_4x8mem_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbq (%rdi), %ymm0
; ALL-NEXT: retq
%a = load <4 x i8>,<4 x i8> *%i,align 1
@@ -593,7 +593,7 @@ define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone {
define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x8mem_to_8x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -601,7 +601,7 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x8mem_to_8x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
@@ -614,7 +614,7 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re
define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x8mem_to_8x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -622,7 +622,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwin
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x8mem_to_8x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z}
@@ -635,7 +635,7 @@ define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwin
define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
; ALL-LABEL: sext_8x8mem_to_8x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxbq (%rdi), %zmm0
; ALL-NEXT: retq
%a = load <8 x i8>,<8 x i8> *%i,align 1
@@ -645,7 +645,7 @@ define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_4x16mem_to_4x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -653,7 +653,7 @@ define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4x16mem_to_4x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -666,7 +666,7 @@ define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind
define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_4x16mem_to_4x32mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovsxwd (%rdi), %xmm1
@@ -674,7 +674,7 @@ define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_4x16mem_to_4x32mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z}
@@ -687,7 +687,7 @@ define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounw
define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone {
; ALL-LABEL: sext_4x16mem_to_4x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd (%rdi), %xmm0
; ALL-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
@@ -698,18 +698,18 @@ define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone {
define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x16mem_to_8x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16mem_to_8x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -722,18 +722,18 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind
define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x16mem_to_8x32mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
; KNL-NEXT: vpmovsxwd (%rdi), %ymm1
-; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x16mem_to_8x32mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z}
@@ -746,7 +746,7 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw
define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone {
; ALL-LABEL: sext_8x16mem_to_8x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd (%rdi), %ymm0
; ALL-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
@@ -756,18 +756,18 @@ define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone {
define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x16_to_8x32mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; KNL-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16_to_8x32mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -779,7 +779,7 @@ define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind
define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
; ALL-LABEL: zext_8x16_to_8x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: retq
%x = zext <8 x i16> %a to <8 x i32>
@@ -788,7 +788,7 @@ define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x16mem_to_16x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -796,7 +796,7 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x16mem_to_16x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
@@ -809,7 +809,7 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun
define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_16x16mem_to_16x32mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -817,7 +817,7 @@ define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask)
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16x16mem_to_16x32mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z}
@@ -830,7 +830,7 @@ define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask)
define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
; ALL-LABEL: sext_16x16mem_to_16x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwd (%rdi), %zmm0
; ALL-NEXT: retq
%a = load <16 x i16>,<16 x i16> *%i,align 1
@@ -839,7 +839,7 @@ define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
}
define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_16x16_to_16x32mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -847,7 +847,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16x16_to_16x32mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -859,7 +859,7 @@ define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) noun
define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
; ALL-LABEL: zext_16x16_to_16x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; ALL-NEXT: retq
%x = zext <16 x i16> %a to <16 x i32>
@@ -868,7 +868,7 @@ define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_2x16mem_to_2x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vpsraq $63, %zmm0, %zmm0
; KNL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
@@ -876,7 +876,7 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_2x16mem_to_2x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
@@ -889,7 +889,7 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind
define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_2x16mem_to_2x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vpsraq $63, %zmm0, %zmm0
; KNL-NEXT: vpmovsxwq (%rdi), %xmm1
@@ -897,7 +897,7 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_2x16mem_to_2x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z}
@@ -910,7 +910,7 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw
define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone {
; ALL-LABEL: sext_2x16mem_to_2x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwq (%rdi), %xmm0
; ALL-NEXT: retq
%a = load <2 x i16>,<2 x i16> *%i,align 1
@@ -920,7 +920,7 @@ define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone {
define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_4x16mem_to_4x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -929,7 +929,7 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4x16mem_to_4x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -942,7 +942,7 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind
define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_4x16mem_to_4x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -951,7 +951,7 @@ define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_4x16mem_to_4x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z}
@@ -964,7 +964,7 @@ define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounw
define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone {
; ALL-LABEL: sext_4x16mem_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwq (%rdi), %ymm0
; ALL-NEXT: retq
%a = load <4 x i16>,<4 x i16> *%i,align 1
@@ -974,7 +974,7 @@ define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone {
define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x16mem_to_8x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -982,7 +982,7 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16mem_to_8x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
@@ -995,7 +995,7 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind
define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x16mem_to_8x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -1003,7 +1003,7 @@ define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x16mem_to_8x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z}
@@ -1016,7 +1016,7 @@ define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounw
define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
; ALL-LABEL: sext_8x16mem_to_8x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxwq (%rdi), %zmm0
; ALL-NEXT: retq
%a = load <8 x i16>,<8 x i16> *%i,align 1
@@ -1026,7 +1026,7 @@ define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x16_to_8x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1034,7 +1034,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x16_to_8x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -1046,7 +1046,7 @@ define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind
define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
; ALL-LABEL: zext_8x16_to_8x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; ALL-NEXT: retq
%ret = zext <8 x i16> %a to <8 x i64>
@@ -1055,7 +1055,7 @@ define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_2x32mem_to_2x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vpsraq $63, %zmm0, %zmm0
; KNL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
@@ -1063,7 +1063,7 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_2x32mem_to_2x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
@@ -1076,7 +1076,7 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind
define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_2x32mem_to_2x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsllq $63, %xmm0, %xmm0
; KNL-NEXT: vpsraq $63, %zmm0, %zmm0
; KNL-NEXT: vpmovsxdq (%rdi), %xmm1
@@ -1084,7 +1084,7 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_2x32mem_to_2x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z}
@@ -1097,7 +1097,7 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw
define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone {
; ALL-LABEL: sext_2x32mem_to_2x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxdq (%rdi), %xmm0
; ALL-NEXT: retq
%a = load <2 x i32>,<2 x i32> *%i,align 1
@@ -1107,7 +1107,7 @@ define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone {
define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_4x32mem_to_4x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1116,7 +1116,7 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4x32mem_to_4x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
@@ -1129,7 +1129,7 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind
define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_4x32mem_to_4x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -1138,7 +1138,7 @@ define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_4x32mem_to_4x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z}
@@ -1151,7 +1151,7 @@ define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounw
define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone {
; ALL-LABEL: sext_4x32mem_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxdq (%rdi), %ymm0
; ALL-NEXT: retq
%a = load <4 x i32>,<4 x i32> *%i,align 1
@@ -1161,7 +1161,7 @@ define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone {
define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone {
; ALL-LABEL: sext_4x32_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxdq %xmm0, %ymm0
; ALL-NEXT: retq
%x = sext <4 x i32> %a to <4 x i64>
@@ -1170,7 +1170,7 @@ define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone {
define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_4x32_to_4x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1179,7 +1179,7 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4x32_to_4x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1191,7 +1191,7 @@ define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind
define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x32mem_to_8x64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -1199,7 +1199,7 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x32mem_to_8x64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
@@ -1212,7 +1212,7 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind
define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: sext_8x32mem_to_8x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -1220,7 +1220,7 @@ define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounw
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8x32mem_to_8x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z}
@@ -1233,7 +1233,7 @@ define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounw
define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
; ALL-LABEL: sext_8x32mem_to_8x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxdq (%rdi), %zmm0
; ALL-NEXT: retq
%a = load <8 x i32>,<8 x i32> *%i,align 1
@@ -1243,7 +1243,7 @@ define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
; ALL-LABEL: sext_8x32_to_8x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovsxdq %ymm0, %zmm0
; ALL-NEXT: retq
%x = sext <8 x i32> %a to <8 x i64>
@@ -1252,7 +1252,7 @@ define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone {
; KNL-LABEL: zext_8x32_to_8x64mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1260,7 +1260,7 @@ define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8x32_to_8x64mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
@@ -1271,7 +1271,7 @@ define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind
}
define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
; ALL-LABEL: fptrunc_test:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
; ALL-NEXT: retq
%b = fptrunc <8 x double> %a to <8 x float>
@@ -1280,7 +1280,7 @@ define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
; ALL-LABEL: fpext_test:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
; ALL-NEXT: retq
%b = fpext <8 x float> %a to <8 x double>
@@ -1289,13 +1289,13 @@ define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
; KNL-LABEL: zext_16i1_to_16xi32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16i1_to_16xi32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -1306,13 +1306,13 @@ define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
; KNL-LABEL: zext_8i1_to_8xi64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: zext_8i1_to_8xi64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -1323,20 +1323,20 @@ define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
; KNL-LABEL: trunc_16i8_to_16i1:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: # kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_16i8_to_16i1:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
%mask_b = trunc <16 x i8>%a to <16 x i1>
%mask = bitcast <16 x i1> %mask_b to i16
@@ -1345,19 +1345,19 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
; KNL-LABEL: trunc_16i32_to_16i1:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: # kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_16i32_to_16i1:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %zmm0, %zmm0
; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%mask_b = trunc <16 x i32>%a to <16 x i1>
@@ -1367,14 +1367,14 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: trunc_4i32_to_4i1:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vpsrad $31, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_4i32_to_4i1:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
@@ -1391,20 +1391,20 @@ define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
; KNL-LABEL: trunc_8i16_to_8i1:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: # kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_8i16_to_8i1:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: # kill: def %al killed %al killed %eax
; SKX-NEXT: retq
%mask_b = trunc <8 x i16>%a to <8 x i1>
%mask = bitcast <8 x i1> %mask_b to i8
@@ -1413,14 +1413,14 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-LABEL: sext_8i1_8i32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8i1_8i32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpled %ymm0, %ymm1, %k0
; SKX-NEXT: vpmovm2d %k0, %ymm0
; SKX-NEXT: retq
@@ -1433,7 +1433,7 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
define i16 @trunc_i32_to_i1(i32 %a) {
; KNL-LABEL: trunc_i32_to_i1:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: movw $-4, %ax
; KNL-NEXT: kmovw %eax, %k0
; KNL-NEXT: kshiftrw $1, %k0, %k0
@@ -1442,11 +1442,11 @@ define i16 @trunc_i32_to_i1(i32 %a) {
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: # kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_i32_to_i1:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movw $-4, %ax
; SKX-NEXT: kmovd %eax, %k0
; SKX-NEXT: kshiftrw $1, %k0, %k0
@@ -1455,7 +1455,7 @@ define i16 @trunc_i32_to_i1(i32 %a) {
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
%a_i = trunc i32 %a to i1
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
@@ -1465,14 +1465,14 @@ define i16 @trunc_i32_to_i1(i32 %a) {
define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-LABEL: sext_8i1_8i16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8i1_8i16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
; SKX-NEXT: vzeroupper
@@ -1484,13 +1484,13 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
; KNL-LABEL: sext_16i1_16i32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: sext_16i1_16i32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0
; SKX-NEXT: vpmovm2d %k0, %zmm0
; SKX-NEXT: retq
@@ -1501,13 +1501,13 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
; KNL-LABEL: sext_8i1_8i64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
; KNL-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: sext_8i1_8i64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: retq
@@ -1518,13 +1518,13 @@ define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
; KNL-LABEL: extload_v8i64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbq (%rdi), %zmm0
; KNL-NEXT: vmovdqa64 %zmm0, (%rsi)
; KNL-NEXT: retq
;
; SKX-LABEL: extload_v8i64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbq (%rdi), %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsi)
; SKX-NEXT: vzeroupper
@@ -1537,7 +1537,7 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-LABEL: test21:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
@@ -1557,7 +1557,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
; KNL-NEXT: retq
;
; SKX-LABEL: test21:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %zmm2, %zmm2
; SKX-NEXT: vpmovb2m %zmm2, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
@@ -1570,7 +1570,7 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone {
; ALL-LABEL: shuffle_zext_16x8_to_16x16:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; ALL-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
@@ -1580,7 +1580,7 @@ define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone {
define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask) nounwind readnone {
; KNL-LABEL: shuffle_zext_16x8_to_16x16_mask:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; KNL-NEXT: vpsllw $15, %ymm1, %ymm1
@@ -1589,7 +1589,7 @@ define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_zext_16x8_to_16x16_mask:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm1
; SKX-NEXT: vpmovb2m %xmm1, %k1
; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -1602,7 +1602,7 @@ define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask
define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) {
; ALL-LABEL: zext_32x8_to_16x16:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; ALL-NEXT: retq
%1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 8, i32 32, i32 9, i32 32, i32 10, i32 32, i32 11, i32 32, i32 12, i32 32, i32 13, i32 32, i32 14, i32 32, i32 15, i32 32>
@@ -1612,7 +1612,7 @@ define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) {
define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) {
; ALL-LABEL: zext_32x8_to_8x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; ALL-NEXT: retq
%1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
@@ -1622,7 +1622,7 @@ define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) {
define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) {
; ALL-LABEL: zext_32x8_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; ALL-NEXT: retq
%1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
@@ -1632,7 +1632,7 @@ define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) {
define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) {
; ALL-LABEL: zext_16x16_to_8x32:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; ALL-NEXT: retq
%1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
@@ -1642,7 +1642,7 @@ define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) {
define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) {
; ALL-LABEL: zext_16x16_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; ALL-NEXT: retq
%1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
@@ -1652,7 +1652,7 @@ define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) {
define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
; ALL-LABEL: zext_8x32_to_4x64:
-; ALL: ## BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; ALL-NEXT: retq
%1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
@@ -1662,7 +1662,7 @@ define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
; KNL-LABEL: zext_64xi1_to_64xi8:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -1671,7 +1671,7 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: zext_64xi1_to_64xi8:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -1682,7 +1682,7 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-LABEL: zext_32xi1_to_32xi16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
@@ -1690,7 +1690,7 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32xi1_to_32xi16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -1701,13 +1701,13 @@ define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
; KNL-LABEL: zext_16xi1_to_16xi16:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16xi1_to_16xi16:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -1719,7 +1719,7 @@ define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-LABEL: zext_32xi1_to_32xi8:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
@@ -1731,7 +1731,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32xi1_to_32xi8:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -1742,7 +1742,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
; KNL-LABEL: zext_4xi1_to_4x32:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -1751,7 +1751,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4xi1_to_4x32:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1
; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -1765,7 +1765,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
; KNL-LABEL: zext_2xi1_to_2xi64:
-; KNL: ## BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -1774,7 +1774,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: zext_2xi1_to_2xi64:
-; SKX: ## BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1
; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
new file mode 100644
index 000000000000..34ea468aebee
--- /dev/null
+++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll
@@ -0,0 +1,1458 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ
+
+define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v8i1_broadcast_4_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <8 x i1>, <8 x i1>* %a0
+ %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v8i1_broadcast_7_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v8i1_broadcast_7_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $6, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <8 x i1>, <8 x i1>* %a0
+ %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v16i1_broadcast_8_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v16i1_broadcast_8_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512-NEXT: vpmovd2m %xmm2, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
+ %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+ store <4 x float> %d2, <4 x float>* %a3
+ ret void
+}
+define void @load_v16i1_broadcast_15_v2i1(<16 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v16i1_broadcast_15_v4i1(<16 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vpmovd2m %xmm2, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
+ %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+ store <4 x float> %d2, <4 x float>* %a3
+ ret void
+}
+define void @load_v32i1_broadcast_16_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v32i1_broadcast_16_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512-NEXT: vpmovd2m %xmm2, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
+ %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+ store <4 x float> %d2, <4 x float>* %a3
+ ret void
+}
+define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm2
+; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
+; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
+ %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+ store <8 x float> %d2, <8 x float>* %a3
+ ret void
+}
+define void @load_v32i1_broadcast_31_v2i1(<32 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $30, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v32i1_broadcast_31_v4i1(<32 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $28, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vpmovd2m %xmm2, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
+ %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+ store <4 x float> %d2, <4 x float>* %a3
+ ret void
+}
+define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $24, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
+; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
+ %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+ store <8 x float> %d2, <8 x float>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_32_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_32_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512-NEXT: vpmovd2m %xmm2, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
+ %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+ store <4 x float> %d2, <4 x float>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm2
+; AVX512-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm2, %zmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
+; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
+ %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+ store <8 x float> %d2, <8 x float>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_32_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v16i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %zmm2
+; AVX512-NEXT: vpbroadcastd %xmm2, %zmm2
+; AVX512-NEXT: vpmovd2m %zmm2, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm2, %zmm2
+; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
+ %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
+ store <16 x float> %d2, <16 x float>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_63_v2i1(<64 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v2i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $62, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm2, %k1
+; AVX512-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmq %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovapd %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovapd %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
+ %d2 = select <2 x i1> %d1, <2 x double> %a1, <2 x double> %a2
+ store <2 x double> %d2, <2 x double>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_63_v4i1(<64 x i1>* %a0,<4 x float> %a1,<4 x float> %a2,<4 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v4i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $60, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vpmovd2m %xmm2, %k1
+; AVX512-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm2, %xmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX512NOTDQ-NEXT: vptestmd %xmm2, %xmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %xmm0, %xmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
+ %d2 = select <4 x i1> %d1, <4 x float> %a1, <4 x float> %a2
+ store <4 x float> %d2, <4 x float>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x float> %a2,<8 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v8i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $56, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermq %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vpmovq2m %zmm2, %k1
+; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermq %zmm2, %zmm3, %zmm2
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm2, %zmm2
+; AVX512NOTDQ-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
+ %d2 = select <8 x i1> %d1, <8 x float> %a1, <8 x float> %a2
+ store <8 x float> %d2, <8 x float>* %a3
+ ret void
+}
+define void @load_v64i1_broadcast_63_v16i1(<64 x i1>* %a0,<16 x float> %a1,<16 x float> %a2,<16 x float>* %a3) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v16i1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $48, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %zmm2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vpmovd2m %zmm2, %k1
+; AVX512-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512NOTDQ-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512NOTDQ-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512NOTDQ-NEXT: vptestmd %zmm2, %zmm2, %k1
+; AVX512NOTDQ-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; AVX512NOTDQ-NEXT: vmovaps %zmm1, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
+ %d2 = select <16 x i1> %d1, <16 x float> %a1, <16 x float> %a2
+ store <16 x float> %d2, <16 x float>* %a3
+ ret void
+}
+define void @load_v2i1_broadcast_1_v1i1_store(<2 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v2i1_broadcast_1_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $1, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v2i1_broadcast_1_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <2 x i1>, <2 x i1>* %a0
+ %d1 = shufflevector <2 x i1> %d0,<2 x i1> undef,<1 x i32><i32 1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v3i1_broadcast_1_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v3i1_broadcast_1_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $1, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v3i1_broadcast_1_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $1, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <3 x i1>, <3 x i1>* %a0
+ %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v3i1_broadcast_2_v1i1_store(<3 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v3i1_broadcast_2_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $2, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v3i1_broadcast_2_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <3 x i1>, <3 x i1>* %a0
+ %d1 = shufflevector <3 x i1> %d0,<3 x i1> undef,<1 x i32><i32 2>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v4i1_broadcast_2_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v4i1_broadcast_2_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $2, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v4i1_broadcast_2_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $2, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <4 x i1>, <4 x i1>* %a0
+ %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 2>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v4i1_broadcast_3_v1i1_store(<4 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v4i1_broadcast_3_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $3, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v4i1_broadcast_3_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $3, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <4 x i1>, <4 x i1>* %a0
+ %d1 = shufflevector <4 x i1> %d0,<4 x i1> undef,<1 x i32><i32 3>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v8i1_broadcast_4_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_4_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <8 x i1>, <8 x i1>* %a0
+ %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 4>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v8i1_broadcast_4_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_4_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $4, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v8i1_broadcast_4_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $4, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <8 x i1>, <8 x i1>* %a0
+ %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 4,i32 4>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v8i1_broadcast_7_v1i1_store(<8 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_7_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $7, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $7, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <8 x i1>, <8 x i1>* %a0
+ %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<1 x i32><i32 7>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v8i1_broadcast_7_v2i1_store(<8 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v8i1_broadcast_7_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovb (%rdi), %k0
+; AVX512-NEXT: kshiftrw $6, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v8i1_broadcast_7_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: movzbl (%rdi), %eax
+; AVX512NOTDQ-NEXT: kmovd %eax, %k0
+; AVX512NOTDQ-NEXT: kshiftrw $6, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <8 x i1>, <8 x i1>* %a0
+ %d1 = shufflevector <8 x i1> %d0,<8 x i1> undef,<2 x i32><i32 7,i32 7>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v16i1_broadcast_8_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 8>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v16i1_broadcast_8_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 8,i32 8>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v16i1_broadcast_8_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_8_v4i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $8, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_8_v4i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $8, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 8,i32 8,i32 8,i32 8>
+ store <4 x i1> %d1, <4 x i1>* %a1
+ ret void
+}
+define void @load_v16i1_broadcast_15_v1i1_store(<16 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $15, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<1 x i32><i32 15>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v16i1_broadcast_15_v2i1_store(<16 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $14, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<2 x i32><i32 15,i32 15>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v16i1_broadcast_15_v4i1_store(<16 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v16i1_broadcast_15_v4i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovw (%rdi), %k0
+; AVX512-NEXT: kshiftrw $12, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v16i1_broadcast_15_v4i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovw (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrw $12, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <16 x i1>, <16 x i1>* %a0
+ %d1 = shufflevector <16 x i1> %d0,<16 x i1> undef,<4 x i32><i32 15,i32 15,i32 15,i32 15>
+ store <4 x i1> %d1, <4 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_16_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 16>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_16_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 16,i32 16>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_16_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v4i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v4i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 16,i32 16,i32 16,i32 16>
+ store <4 x i1> %d1, <4 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_16_v8i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $16, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm0
+; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_16_v8i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $16, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16,i32 16>
+ store <8 x i1> %d1, <8 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_31_v1i1_store(<32 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $31, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $31, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<1 x i32><i32 31>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_31_v2i1_store(<32 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $30, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $30, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<2 x i32><i32 31,i32 31>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_31_v4i1_store(<32 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v4i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $28, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v4i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $28, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<4 x i32><i32 31,i32 31,i32 31,i32 31>
+ store <4 x i1> %d1, <4 x i1>* %a1
+ ret void
+}
+define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v32i1_broadcast_31_v8i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovd (%rdi), %k0
+; AVX512-NEXT: kshiftrd $24, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v32i1_broadcast_31_v8i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovd (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <32 x i1>, <32 x i1>* %a0
+ %d1 = shufflevector <32 x i1> %d0,<32 x i1> undef,<8 x i32><i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31,i32 31>
+ store <8 x i1> %d1, <8 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_32_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 32>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_32_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 32,i32 32>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_32_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v4i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v4i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 32,i32 32,i32 32,i32 32>
+ store <4 x i1> %d1, <4 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v8i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm0
+; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v8i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
+ store <8 x i1> %d1, <8 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_32_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_32_v16i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $32, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %zmm0
+; AVX512-NEXT: vpbroadcastd %xmm0, %zmm0
+; AVX512-NEXT: vpmovd2m %zmm0, %k0
+; AVX512-NEXT: kmovw %k0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_32_v16i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $32, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd %xmm0, %zmm0
+; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32,i32 32>
+ store <16 x i1> %d1, <16 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_63_v1i1_store(<64 x i1>* %a0,<1 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v1i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $63, %k0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v1i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $63, %k0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<1 x i32><i32 63>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_63_v2i1_store(<64 x i1>* %a0,<2 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v2i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $62, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512-NEXT: vpmovq2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v2i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $62, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX512NOTDQ-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<2 x i32><i32 63,i32 63>
+ store <2 x i1> %d1, <2 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_63_v4i1_store(<64 x i1>* %a0,<4 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v4i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $60, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vpmovd2m %xmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v4i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $60, %k0, %k1
+; AVX512NOTDQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512NOTDQ-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512NOTDQ-NEXT: vptestmd %xmm0, %xmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<4 x i32><i32 63,i32 63,i32 63,i32 63>
+ store <4 x i1> %d1, <4 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v8i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $56, %k0, %k0
+; AVX512-NEXT: vpmovm2q %k0, %zmm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpmovq2m %zmm0, %k0
+; AVX512-NEXT: kmovb %k0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v8i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512NOTDQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512NOTDQ-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512NOTDQ-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: kmovd %k0, %eax
+; AVX512NOTDQ-NEXT: movb %al, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<8 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
+ store <8 x i1> %d1, <8 x i1>* %a1
+ ret void
+}
+define void @load_v64i1_broadcast_63_v16i1_store(<64 x i1>* %a0,<16 x i1>* %a1) {
+; AVX512-LABEL: load_v64i1_broadcast_63_v16i1_store:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kmovq (%rdi), %k0
+; AVX512-NEXT: kshiftrq $48, %k0, %k0
+; AVX512-NEXT: vpmovm2d %k0, %zmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpmovd2m %zmm0, %k0
+; AVX512-NEXT: kmovw %k0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512NOTDQ-LABEL: load_v64i1_broadcast_63_v16i1_store:
+; AVX512NOTDQ: # %bb.0:
+; AVX512NOTDQ-NEXT: kmovq (%rdi), %k0
+; AVX512NOTDQ-NEXT: kshiftrq $48, %k0, %k1
+; AVX512NOTDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512NOTDQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512NOTDQ-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512NOTDQ-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512NOTDQ-NEXT: kmovw %k0, (%rsi)
+; AVX512NOTDQ-NEXT: vzeroupper
+; AVX512NOTDQ-NEXT: retq
+ %d0 = load <64 x i1>, <64 x i1>* %a0
+ %d1 = shufflevector <64 x i1> %d0,<64 x i1> undef,<16 x i32><i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63,i32 63>
+ store <16 x i1> %d1, <16 x i1>* %a1
+ ret void
+}
+
diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll
index 85db44ddd232..d0b6369556e0 100644
--- a/test/CodeGen/X86/avx512-extract-subvector.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector.ll
@@ -4,8 +4,8 @@
define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind {
; SKX-LABEL: extract_subvector128_v32i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; SKX: ## %bb.0:
+; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -14,8 +14,8 @@ define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind {
define <8 x i16> @extract_subvector128_v32i16_first_element(<32 x i16> %x) nounwind {
; SKX-LABEL: extract_subvector128_v32i16_first_element:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX: ## %bb.0:
+; SKX-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -24,8 +24,8 @@ define <8 x i16> @extract_subvector128_v32i16_first_element(<32 x i16> %x) nounw
define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind {
; SKX-LABEL: extract_subvector128_v64i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; SKX: ## %bb.0:
+; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
@@ -34,8 +34,8 @@ define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind {
define <16 x i8> @extract_subvector128_v64i8_first_element(<64 x i8> %x) nounwind {
; SKX-LABEL: extract_subvector128_v64i8_first_element:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX: ## %bb.0:
+; SKX-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -45,8 +45,8 @@ define <16 x i8> @extract_subvector128_v64i8_first_element(<64 x i8> %x) nounwin
define <16 x i16> @extract_subvector256_v32i16(<32 x i16> %x) nounwind {
; SKX-LABEL: extract_subvector256_v32i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; SKX: ## %bb.0:
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT: retq
%r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
ret <16 x i16> %r1
@@ -54,8 +54,8 @@ define <16 x i16> @extract_subvector256_v32i16(<32 x i16> %x) nounwind {
define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind {
; SKX-LABEL: extract_subvector256_v64i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; SKX: ## %bb.0:
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT: retq
%r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <32 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
ret <32 x i8> %r1
@@ -63,7 +63,7 @@ define <32 x i8> @extract_subvector256_v64i8(<64 x i8> %x) nounwind {
define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8f64_store:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -76,7 +76,7 @@ entry:
define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8f32_store:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -89,8 +89,8 @@ entry:
define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4i64_store:
-; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX: ## %bb.0: ## %entry
+; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
@@ -102,8 +102,8 @@ entry:
define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8i32_store:
-; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX: ## %bb.0: ## %entry
+; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
@@ -115,8 +115,8 @@ entry:
define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v16i16_store:
-; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX: ## %bb.0: ## %entry
+; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
@@ -128,8 +128,8 @@ entry:
define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v32i8_store:
-; SKX: ## BB#0: ## %entry
-; SKX-NEXT: vextracti128 $1, %ymm0, (%rdi)
+; SKX: ## %bb.0: ## %entry
+; SKX-NEXT: vextractf128 $1, %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
entry:
@@ -141,7 +141,7 @@ entry:
define void @extract_subvector256_v4f64_store_lo(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4f64_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -154,7 +154,7 @@ entry:
define void @extract_subvector256_v4f64_store_lo_align_16(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4f64_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -167,7 +167,7 @@ entry:
define void @extract_subvector256_v4f32_store_lo(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4f32_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -180,7 +180,7 @@ entry:
define void @extract_subvector256_v4f32_store_lo_align_16(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -193,7 +193,7 @@ entry:
define void @extract_subvector256_v2i64_store_lo(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v2i64_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -206,7 +206,7 @@ entry:
define void @extract_subvector256_v2i64_store_lo_align_16(i64* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -219,7 +219,7 @@ entry:
define void @extract_subvector256_v4i32_store_lo(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4i32_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -232,7 +232,7 @@ entry:
define void @extract_subvector256_v4i32_store_lo_align_16(i32* nocapture %addr, <8 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -245,7 +245,7 @@ entry:
define void @extract_subvector256_v8i16_store_lo(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8i16_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -258,7 +258,7 @@ entry:
define void @extract_subvector256_v8i16_store_lo_align_16(i16* nocapture %addr, <16 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v8i16_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -271,7 +271,7 @@ entry:
define void @extract_subvector256_v16i8_store_lo(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v16i8_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -284,7 +284,7 @@ entry:
define void @extract_subvector256_v16i8_store_lo_align_16(i8* nocapture %addr, <32 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -297,7 +297,7 @@ entry:
define void @extract_subvector512_v2f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v2f64_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -310,7 +310,7 @@ entry:
define void @extract_subvector512_v2f64_store_lo_align_16(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -323,7 +323,7 @@ entry:
define void @extract_subvector512_v4f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4f32_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -336,7 +336,7 @@ entry:
define void @extract_subvector512_v4f32_store_lo_align_16(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -349,7 +349,7 @@ entry:
define void @extract_subvector512_v2i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v2i64_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -362,7 +362,7 @@ entry:
define void @extract_subvector512_v2i64_store_lo_align_16(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -375,7 +375,7 @@ entry:
define void @extract_subvector512_v4i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4i32_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -388,7 +388,7 @@ entry:
define void @extract_subvector512_v4i32_store_lo_align_16(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -401,7 +401,7 @@ entry:
define void @extract_subvector512_v8i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8i16_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -414,7 +414,7 @@ entry:
define void @extract_subvector512_v16i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v16i8_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -427,7 +427,7 @@ entry:
define void @extract_subvector512_v16i8_store_lo_align_16(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -440,7 +440,7 @@ entry:
define void @extract_subvector512_v4f64_store_lo(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4f64_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -453,7 +453,7 @@ entry:
define void @extract_subvector512_v4f64_store_lo_align_16(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -466,7 +466,7 @@ entry:
define void @extract_subvector512_v4f64_store_lo_align_32(double* nocapture %addr, <8 x double> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -479,7 +479,7 @@ entry:
define void @extract_subvector512_v8f32_store_lo(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8f32_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -492,7 +492,7 @@ entry:
define void @extract_subvector512_v8f32_store_lo_align_16(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -505,7 +505,7 @@ entry:
define void @extract_subvector512_v8f32_store_lo_align_32(float* nocapture %addr, <16 x float> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -518,7 +518,7 @@ entry:
define void @extract_subvector512_v4i64_store_lo(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4i64_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -531,7 +531,7 @@ entry:
define void @extract_subvector512_v4i64_store_lo_align_16(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -544,7 +544,7 @@ entry:
define void @extract_subvector512_v4i64_store_lo_align_32(i64* nocapture %addr, <8 x i64> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -557,7 +557,7 @@ entry:
define void @extract_subvector512_v8i32_store_lo(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8i32_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -570,7 +570,7 @@ entry:
define void @extract_subvector512_v8i32_store_lo_align_16(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -583,7 +583,7 @@ entry:
define void @extract_subvector512_v8i32_store_lo_align_32(i32* nocapture %addr, <16 x i32> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -596,7 +596,7 @@ entry:
define void @extract_subvector512_v16i16_store_lo(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v16i16_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -609,7 +609,7 @@ entry:
define void @extract_subvector512_v16i16_store_lo_align_16(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -622,7 +622,7 @@ entry:
define void @extract_subvector512_v16i16_store_lo_align_32(i16* nocapture %addr, <32 x i16> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -635,7 +635,7 @@ entry:
define void @extract_subvector512_v32i8_store_lo(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v32i8_store_lo:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -648,7 +648,7 @@ entry:
define void @extract_subvector512_v32i8_store_lo_align_16(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_16:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovups %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -661,7 +661,7 @@ entry:
define void @extract_subvector512_v32i8_store_lo_align_32(i8* nocapture %addr, <64 x i8> %a) nounwind uwtable ssp {
; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovaps %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -674,7 +674,7 @@ entry:
define <4 x double> @test_mm512_mask_extractf64x4_pd(<4 x double> %__W, i8 %__U, <8 x double> %__A) {
; SKX-LABEL: test_mm512_mask_extractf64x4_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf64x4 $1, %zmm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -688,7 +688,7 @@ entry:
define <4 x double> @test_mm512_maskz_extractf64x4_pd(i8 %__U, <8 x double> %__A) {
; SKX-LABEL: test_mm512_maskz_extractf64x4_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -702,7 +702,7 @@ entry:
define <4 x float> @test_mm512_mask_extractf32x4_ps(<4 x float> %__W, i8 %__U, <8 x double> %__A) {
; SKX-LABEL: test_mm512_mask_extractf32x4_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %zmm1, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
@@ -718,7 +718,7 @@ entry:
define <4 x float> @test_mm512_maskz_extractf32x4_ps(i8 %__U, <8 x double> %__A) {
; SKX-LABEL: test_mm512_maskz_extractf32x4_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z}
; SKX-NEXT: vzeroupper
@@ -734,7 +734,7 @@ entry:
define <2 x double> @test_mm256_mask_extractf64x2_pd(<2 x double> %__W, i8 %__U, <4 x double> %__A) {
; SKX-LABEL: test_mm256_mask_extractf64x2_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf64x2 $1, %ymm1, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
@@ -749,7 +749,7 @@ entry:
define <2 x double> @test_mm256_maskz_extractf64x2_pd(i8 %__U, <4 x double> %__A) {
; SKX-LABEL: test_mm256_maskz_extractf64x2_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z}
; SKX-NEXT: vzeroupper
@@ -764,7 +764,7 @@ entry:
define <2 x i64> @test_mm256_mask_extracti64x2_epi64(<2 x i64> %__W, i8 %__U, <4 x i64> %__A) {
; SKX-LABEL: test_mm256_mask_extracti64x2_epi64:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextracti64x2 $1, %ymm1, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
@@ -779,7 +779,7 @@ entry:
define <2 x i64> @test_mm256_maskz_extracti64x2_epi64(i8 %__U, <4 x i64> %__A) {
; SKX-LABEL: test_mm256_maskz_extracti64x2_epi64:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextracti64x2 $1, %ymm0, %xmm0 {%k1} {z}
; SKX-NEXT: vzeroupper
@@ -794,7 +794,7 @@ entry:
define <4 x float> @test_mm256_mask_extractf32x4_ps(<4 x float> %__W, i8 %__U, <8 x float> %__A) {
; SKX-LABEL: test_mm256_mask_extractf32x4_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %ymm1, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
@@ -809,7 +809,7 @@ entry:
define <4 x float> @test_mm256_maskz_extractf32x4_ps(i8 %__U, <8 x float> %__A) {
; SKX-LABEL: test_mm256_maskz_extractf32x4_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
; SKX-NEXT: vzeroupper
@@ -824,7 +824,7 @@ entry:
define <2 x i64> @test_mm256_mask_extracti32x4_epi32(<2 x i64> %__W, i8 %__U, <4 x i64> %__A) {
; SKX-LABEL: test_mm256_mask_extracti32x4_epi32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextracti32x4 $1, %ymm1, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
@@ -842,7 +842,7 @@ entry:
define <2 x i64> @test_mm256_maskz_extracti32x4_epi32(i8 %__U, <4 x i64> %__A) {
; SKX-LABEL: test_mm256_maskz_extracti32x4_epi32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextracti32x4 $1, %ymm0, %xmm0 {%k1} {z}
; SKX-NEXT: vzeroupper
@@ -859,7 +859,7 @@ entry:
define <8 x float> @test_mm512_mask_extractf32x8_ps(<8 x float> %__W, i8 %__U, <16 x float> %__A) {
; SKX-LABEL: test_mm512_mask_extractf32x8_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -872,7 +872,7 @@ entry:
define <8 x float> @test_mm512_maskz_extractf32x8_ps(i8 %__U, <16 x float> %__A) {
; SKX-LABEL: test_mm512_maskz_extractf32x8_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -885,7 +885,7 @@ entry:
define <2 x double> @test_mm512_mask_extractf64x2_pd(<2 x double> %__W, i8 %__U, <8 x double> %__A) {
; SKX-LABEL: test_mm512_mask_extractf64x2_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf64x2 $3, %zmm1, %xmm0 {%k1}
; SKX-NEXT: vzeroupper
@@ -900,7 +900,7 @@ entry:
define <2 x double> @test_mm512_maskz_extractf64x2_pd(i8 %__U, <8 x double> %__A) {
; SKX-LABEL: test_mm512_maskz_extractf64x2_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0 {%k1} {z}
; SKX-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/avx512-fma-commute.ll b/test/CodeGen/X86/avx512-fma-commute.ll
new file mode 100644
index 000000000000..194255179270
--- /dev/null
+++ b/test/CodeGen/X86/avx512-fma-commute.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load0(<4 x float>* %x0ptr, <4 x float> %x1, <4 x float> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_load0:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x0 = load <4 x float>, <4 x float>* %x0ptr
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss_load1(<4 x float> %x0, <4 x float>* %x1ptr, <4 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_load1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <4 x float>, <4 x float>* %x1ptr
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load0(<2 x double>* %x0ptr, <2 x double> %x1, <2 x double> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd_load0:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmadd231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x0 = load <2 x double>, <2 x double>* %x0ptr
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd_load1(<2 x double> %x0, <2 x double>* %x1ptr, <2 x double> %x2){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd_load1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmadd231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <2 x double>, <2 x double>* %x1ptr
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load0(<4 x float>* %x0ptr, <4 x float> %x1, <4 x float> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss_load0:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmsub231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x0 = load <4 x float>, <4 x float>* %x0ptr
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss_load1(<4 x float> %x0, <4 x float>* %x1ptr, <4 x float> %x2){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss_load1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmsub231ss (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <4 x float>, <4 x float>* %x1ptr
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load0(<2 x double>* %x0ptr, <2 x double> %x1, <2 x double> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd_load0:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x0 = load <2 x double>, <2 x double>* %x0ptr
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd_load1(<2 x double> %x0, <2 x double>* %x1ptr, <2 x double> %x2){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd_load1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfmsub231sd (%rdi), %xmm0, %xmm1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <2 x double>, <2 x double>* %x1ptr
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+ ret <2 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
index 27350f5d4c3f..f24856e54da6 100644
--- a/test/CodeGen/X86/avx512-fma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -6,7 +6,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x doub
define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfnmadd_ps_z:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
@@ -16,7 +16,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x fl
define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_vfnmadd_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmadd132ps %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -26,7 +26,7 @@ define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <1
define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfnmadd_pd_z:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
@@ -36,7 +36,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x dou
define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -46,7 +46,7 @@ define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8
define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfnmsubps_z:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
@@ -56,7 +56,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x fl
define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_vfnmsub_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132ps %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -66,7 +66,7 @@ define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <1
define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfnmsubpd_z:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
@@ -76,7 +76,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x dou
define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -86,7 +86,7 @@ define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8
define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_x86_vfmaddsubps_z:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
@@ -95,7 +95,7 @@ define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1,
define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
; CHECK-LABEL: test_mask_fmaddsub_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmaddsub132ps %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -107,7 +107,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x
define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_x86_vfmaddsubpd_z:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
@@ -117,7 +117,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x d
define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmaddsub_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmaddsub132pd %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -127,7 +127,7 @@ define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1,
define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub132pd %zmm1, %zmm2, %zmm3 {%k1}
@@ -144,7 +144,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
@@ -161,7 +161,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm1, %zmm3
; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm0, %zmm3 {%k1} {z}
@@ -176,7 +176,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0,
define <16 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmaddsub132ps %zmm1, %zmm2, %zmm3 {%k1}
@@ -193,7 +193,7 @@ declare <16 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmaddsub231ps %zmm1, %zmm0, %zmm3 {%k1}
@@ -210,7 +210,7 @@ declare <16 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vfmaddsub213ps %zmm2, %zmm0, %zmm3 {%k1} {z}
@@ -227,7 +227,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
@@ -244,7 +244,7 @@ declare <16 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsubadd231ps %zmm1, %zmm0, %zmm3 {%k1}
@@ -259,7 +259,7 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_512(<16 x float> %x0,
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -269,7 +269,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -279,7 +279,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -289,7 +289,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132ps {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -299,7 +299,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132ps %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -309,7 +309,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0,
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
@@ -318,7 +318,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
@@ -327,7 +327,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
@@ -336,7 +336,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
@@ -345,7 +345,7 @@ define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16
define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
@@ -356,7 +356,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmsub.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
@@ -373,7 +373,7 @@ declare <16 x float> @llvm.x86.avx512.mask3.vfmsub.ps.512(<16 x float>, <16 x fl
define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
@@ -388,7 +388,7 @@ define <16 x float>@test_int_x86_avx512_mask3_vfmsub_ps_512(<16 x float> %x0, <1
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -398,7 +398,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132pd {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -408,7 +408,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132pd {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -418,7 +418,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132pd {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -428,7 +428,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x
define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -438,7 +438,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0,
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
@@ -447,7 +447,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
@@ -456,7 +456,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
@@ -465,7 +465,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
@@ -474,7 +474,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
@@ -483,7 +483,7 @@ define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0,
define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfmadd132pd %zmm1, %zmm2, %zmm3 {%k1}
@@ -500,7 +500,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfmadd.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
@@ -517,7 +517,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vfmadd.pd.512(<8 x double>, <8 x dou
define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm1, %zmm3
; CHECK-NEXT: vfmadd213pd %zmm2, %zmm0, %zmm3 {%k1} {z}
@@ -532,7 +532,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8
define <16 x float>@test_int_x86_avx512_mask_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfmadd132ps %zmm1, %zmm2, %zmm3 {%k1}
@@ -549,7 +549,7 @@ declare <16 x float> @llvm.x86.avx512.mask3.vfmadd.ps.512(<16 x float>, <16 x fl
define <16 x float>@test_int_x86_avx512_mask3_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfmadd231ps %zmm1, %zmm0, %zmm3 {%k1}
@@ -566,7 +566,7 @@ declare <16 x float> @llvm.x86.avx512.maskz.vfmadd.ps.512(<16 x float>, <16 x fl
define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vfmadd213ps %zmm2, %zmm0, %zmm3 {%k1} {z}
@@ -582,7 +582,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vfmadd_ps_512(<16 x float> %x0, <1
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -592,7 +592,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132pd {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -602,7 +602,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132pd {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -612,7 +612,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132pd {rz-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -622,7 +622,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfnmsub132pd %zmm1, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -632,7 +632,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0,
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
@@ -641,7 +641,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
@@ -650,7 +650,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
@@ -659,7 +659,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
@@ -668,7 +668,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8
define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
@@ -677,7 +677,7 @@ define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0
define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfnmsub132pd %zmm1, %zmm2, %zmm3 {%k1}
@@ -694,7 +694,7 @@ declare <8 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.512(<8 x double>, <8 x do
define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
@@ -709,7 +709,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <
define <16 x float>@test_int_x86_avx512_mask_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmsub132ps %zmm1, %zmm2, %zmm3 {%k1}
@@ -726,7 +726,7 @@ declare <16 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.512(<16 x float>, <16 x f
define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm2, %zmm3
; CHECK-NEXT: vfnmsub231ps %zmm1, %zmm0, %zmm3 {%k1}
@@ -741,7 +741,7 @@ define <16 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_512(<16 x float> %x0, <
define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfnmadd132pd %zmm1, %zmm2, %zmm3 {%k1}
@@ -756,7 +756,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8
define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfnmadd132ps %zmm1, %zmm2, %zmm3 {%k1}
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
index 9622b81fd760..29ab76d4d372 100644
--- a/test/CodeGen/X86/avx512-fma.ll
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -4,7 +4,7 @@
define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; ALL-LABEL: test_x86_fmadd_ps_z:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -14,7 +14,7 @@ define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16
define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; ALL-LABEL: test_x86_fmsub_ps_z:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -24,7 +24,7 @@ define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16
define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; ALL-LABEL: test_x86_fnmadd_ps_z:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -34,7 +34,7 @@ define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <1
define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; ALL-LABEL: test_x86_fnmsub_ps_z:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -48,7 +48,7 @@ define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <1
define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; ALL-LABEL: test_x86_fmadd_pd_z:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
%x = fmul <8 x double> %a0, %a1
@@ -58,7 +58,7 @@ define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8
define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; ALL-LABEL: test_x86_fmsub_pd_z:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmsub213pd %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
%x = fmul <8 x double> %a0, %a1
@@ -68,7 +68,7 @@ define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8
define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
; ALL-LABEL: test_x86_fmsub_213:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
; ALL-NEXT: retq
%x = fmul double %a0, %a1
@@ -78,7 +78,7 @@ define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
; ALL-LABEL: test_x86_fmsub_213_m:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0
; ALL-NEXT: retq
%a2 = load double , double *%a2_ptr
@@ -89,7 +89,7 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
; ALL-LABEL: test_x86_fmsub_231_m:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0
; ALL-NEXT: retq
%a2 = load double , double *%a2_ptr
@@ -100,7 +100,7 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
; ALL-LABEL: test231_br:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmadd132ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
; ALL-NEXT: retq
%b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
@@ -110,7 +110,7 @@ define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
; ALL-LABEL: test213_br:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vfmadd213ps {{.*}}(%rip){1to16}, %zmm1, %zmm0
; ALL-NEXT: retq
%b1 = fmul <16 x float> %a1, %a2
@@ -121,7 +121,7 @@ define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
;mask (a*c+b , a)
define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
; KNL-LABEL: test_x86_fmadd132_ps:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -129,7 +129,7 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <1
; KNL-NEXT: retq
;
; SKX-LABEL: test_x86_fmadd132_ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
@@ -144,7 +144,7 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <1
;mask (a*c+b , b)
define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
; KNL-LABEL: test_x86_fmadd231_ps:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -153,7 +153,7 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1
; KNL-NEXT: retq
;
; SKX-LABEL: test_x86_fmadd231_ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
@@ -169,7 +169,7 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1
;mask (b*a+c , b)
define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
; KNL-LABEL: test_x86_fmadd213_ps:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -178,7 +178,7 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <1
; KNL-NEXT: retq
;
; SKX-LABEL: test_x86_fmadd213_ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
diff --git a/test/CodeGen/X86/avx512-fsel.ll b/test/CodeGen/X86/avx512-fsel.ll
index 7777ba795416..0da690669c34 100644
--- a/test/CodeGen/X86/avx512-fsel.ll
+++ b/test/CodeGen/X86/avx512-fsel.ll
@@ -6,15 +6,15 @@ target triple = "x86_64-apple-macosx10.11.0"
define i32 @test(float %a, float %b) {
; CHECK-LABEL: test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: movb %al, %cl
-; CHECK-NEXT: xorb $-1, %cl
-; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: setnp %cl
+; CHECK-NEXT: andb %cl, %al
+; CHECK-NEXT: xorb $-1, %al
+; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: jmp LBB0_2
; CHECK-NEXT: LBB0_1: ## %L_0
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index c03623a2f035..9502ec95d092 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
@@ -13,7 +13,7 @@ declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>,
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
@@ -29,7 +29,7 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
@@ -45,7 +45,7 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
@@ -61,7 +61,7 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
@@ -89,7 +89,7 @@ declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i3
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
@@ -105,7 +105,7 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
@@ -121,7 +121,7 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_qq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
@@ -137,7 +137,7 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_mask_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: kmovq %k1, %k2
; CHECK-NEXT: vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
@@ -153,7 +153,7 @@ define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base,
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
@@ -166,7 +166,7 @@ define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %m
define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovapd %zmm1, (%rdx)
@@ -179,7 +179,7 @@ define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %m
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
@@ -190,7 +190,7 @@ define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %s
define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm1, %ymm0
@@ -201,7 +201,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
@@ -214,7 +214,7 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8
define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
@@ -227,7 +227,7 @@ define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8
define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
@@ -240,7 +240,7 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1
define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %ymm1
; CHECK-NEXT: vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
@@ -253,8 +253,8 @@ define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %
define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
; CHECK-LABEL: gather_qps:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
@@ -272,7 +272,7 @@ declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
define void @prefetch(<8 x i64> %ind, i8* %base) {
; CHECK-LABEL: prefetch:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vgatherpf0qps (%rdi,%zmm0,4) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
@@ -296,7 +296,7 @@ declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64
define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -314,7 +314,7 @@ declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8,
define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
@@ -329,11 +329,11 @@ declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64
define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -347,11 +347,11 @@ declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8,
define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -365,7 +365,7 @@ declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>,
define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -383,7 +383,7 @@ declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8,
define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -401,7 +401,7 @@ declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>,
define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -420,7 +420,7 @@ declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8,
define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm2
; CHECK-NEXT: kmovq %k1, %k2
@@ -439,7 +439,7 @@ declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32
define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -457,7 +457,7 @@ declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8,
define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm0
@@ -472,11 +472,11 @@ declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32
define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -490,7 +490,7 @@ declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8,
define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm0
@@ -505,7 +505,7 @@ declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>,
define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -523,7 +523,7 @@ declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8,
define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -541,11 +541,11 @@ declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>,
define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -559,7 +559,7 @@ declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8,
define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm2
; CHECK-NEXT: kmovq %k1, %k2
@@ -577,7 +577,7 @@ declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i
define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
@@ -592,7 +592,7 @@ declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)
define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -607,7 +607,7 @@ declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i
define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -623,7 +623,7 @@ declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)
define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -639,7 +639,7 @@ declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i3
define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -654,7 +654,7 @@ declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)
define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
@@ -669,7 +669,7 @@ declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i3
define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -685,7 +685,7 @@ declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)
define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -701,7 +701,7 @@ declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i
define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
@@ -716,7 +716,7 @@ declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
@@ -731,7 +731,7 @@ declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i
define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -747,7 +747,7 @@ declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)
define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxnorw %k0, %k0, %k2
; CHECK-NEXT: vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
@@ -763,7 +763,7 @@ declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i3
define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -778,7 +778,7 @@ declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)
define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -793,7 +793,7 @@ declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i3
define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -809,7 +809,7 @@ declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)
define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxnorw %k0, %k0, %k1
@@ -823,7 +823,7 @@ define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <
define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
@@ -845,9 +845,9 @@ define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %base) {
; CHECK-LABEL: gather_mask_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: vxorps %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT: kxorw %k0, %k0, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
diff --git a/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/test/CodeGen/X86/avx512-gfni-intrinsics.ll
new file mode 100644
index 000000000000..a1a6aaf53b40
--- /dev/null
+++ b/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl,+gfni,+avx512bw --show-mc-encoding | FileCheck %s
+
+declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
+define <16 x i8> @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: test_vgf2p8affineinvqb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x03]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x03]
+; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
+; CHECK-NEXT: vpxor %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3)
+ %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> zeroinitializer
+ %4 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %passthru
+ %5 = xor <16 x i8> %3, %4
+ %6 = xor <16 x i8> %5, %2
+ ret <16 x i8> %6
+}
+
+declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8)
+define <32 x i8> @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: test_vgf2p8affineinvqb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x03]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x03]
+; CHECK-NEXT: vpxor %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
+; CHECK-NEXT: vpxor %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i32 %mask to <32 x i1>
+ %2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3)
+ %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> zeroinitializer
+ %4 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %passthru
+ %5 = xor <32 x i8> %3, %4
+ %6 = xor <32 x i8> %5, %2
+ ret <32 x i8> %6
+}
+
+declare <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8>, <64 x i8>, i8)
+define <64 x i8> @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: test_vgf2p8affineinvqb_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x03]
+; CHECK-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x03]
+; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
+; CHECK-NEXT: vpxorq %zmm0, %zmm4, %zmm0 ## encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i64 %mask to <64 x i1>
+ %2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3)
+ %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> zeroinitializer
+ %4 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> %passthru
+ %5 = xor <64 x i8> %3, %4
+ %6 = xor <64 x i8> %5, %2
+ ret <64 x i8> %6
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+define <16 x i8> @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: test_vgf2p8affineqb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
+; CHECK-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x03]
+; CHECK-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x03]
+; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
+; CHECK-NEXT: vpxor %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3)
+ %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> zeroinitializer
+ %4 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %passthru
+ %5 = xor <16 x i8> %3, %4
+ %6 = xor <16 x i8> %5, %2
+ ret <16 x i8> %6
+}
+
+declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8)
+define <32 x i8> @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: test_vgf2p8affineqb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
+; CHECK-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x03]
+; CHECK-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x03]
+; CHECK-NEXT: vpxor %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
+; CHECK-NEXT: vpxor %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i32 %mask to <32 x i1>
+ %2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3)
+ %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> zeroinitializer
+ %4 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %passthru
+ %5 = xor <32 x i8> %3, %4
+ %6 = xor <32 x i8> %5, %2
+ ret <32 x i8> %6
+}
+
+declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8)
+define <64 x i8> @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: test_vgf2p8affineqb_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
+; CHECK-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x03]
+; CHECK-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x03]
+; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
+; CHECK-NEXT: vpxorq %zmm0, %zmm4, %zmm0 ## encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i64 %mask to <64 x i1>
+ %2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3)
+ %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> zeroinitializer
+ %4 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> %passthru
+ %5 = xor <64 x i8> %3, %4
+ %6 = xor <64 x i8> %5, %2
+ ret <64 x i8> %6
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
+define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: test_vgf2p8mulb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xd9]
+; CHECK-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xe1]
+; CHECK-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xcf,0xd1]
+; CHECK-NEXT: vpxor %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
+; CHECK-NEXT: vpxor %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i16 %mask to <16 x i1>
+ %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2)
+ %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> zeroinitializer
+ %4 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %passthru
+ %5 = xor <16 x i8> %3, %4
+ %6 = xor <16 x i8> %5, %2
+ ret <16 x i8> %6
+}
+
+declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>)
+define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: test_vgf2p8mulb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xd9]
+; CHECK-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xe1]
+; CHECK-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1]
+; CHECK-NEXT: vpxor %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
+; CHECK-NEXT: vpxor %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i32 %mask to <32 x i1>
+ %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2)
+ %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> zeroinitializer
+ %4 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %passthru
+ %5 = xor <32 x i8> %3, %4
+ %6 = xor <32 x i8> %5, %2
+ ret <32 x i8> %6
+}
+
+declare <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8>, <64 x i8>)
+define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: test_vgf2p8mulb_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; CHECK-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xd9]
+; CHECK-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xe1]
+; CHECK-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1]
+; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
+; CHECK-NEXT: vpxorq %zmm0, %zmm4, %zmm0 ## encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %1 = bitcast i64 %mask to <64 x i1>
+ %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2)
+ %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> zeroinitializer
+ %4 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> %passthru
+ %5 = xor <64 x i8> %3, %4
+ %6 = xor <64 x i8> %5, %2
+ ret <64 x i8> %6
+}
+
diff --git a/test/CodeGen/X86/avx512-hadd-hsub.ll b/test/CodeGen/X86/avx512-hadd-hsub.ll
new file mode 100644
index 000000000000..d5bd7622a18e
--- /dev/null
+++ b/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -0,0 +1,303 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL
+;RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=SKX
+
+define i32 @hadd_16(<16 x i32> %x225) {
+; KNL-LABEL: hadd_16:
+; KNL: # %bb.0:
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: hadd_16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x227 = add <16 x i32> %x225, %x226
+ %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x229 = add <16 x i32> %x227, %x228
+ %x230 = extractelement <16 x i32> %x229, i32 0
+ ret i32 %x230
+}
+
+define i32 @hsub_16(<16 x i32> %x225) {
+; KNL-LABEL: hsub_16:
+; KNL: # %bb.0:
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; KNL-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; KNL-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vmovd %xmm0, %eax
+; KNL-NEXT: retq
+;
+; SKX-LABEL: hsub_16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovd %xmm0, %eax
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x i32> %x225, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x227 = add <16 x i32> %x225, %x226
+ %x228 = shufflevector <16 x i32> %x227, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x229 = sub <16 x i32> %x227, %x228
+ %x230 = extractelement <16 x i32> %x229, i32 0
+ ret i32 %x230
+}
+
+define float @fhadd_16(<16 x float> %x225) {
+; KNL-LABEL: fhadd_16:
+; KNL: # %bb.0:
+; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fhadd_16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; SKX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x227 = fadd <16 x float> %x225, %x226
+ %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x229 = fadd <16 x float> %x227, %x228
+ %x230 = extractelement <16 x float> %x229, i32 0
+ ret float %x230
+}
+
+define float @fhsub_16(<16 x float> %x225) {
+; KNL-LABEL: fhsub_16:
+; KNL: # %bb.0:
+; KNL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; KNL-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; KNL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; KNL-NEXT: vsubps %zmm1, %zmm0, %zmm0
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fhsub_16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SKX-NEXT: vsubps %zmm1, %zmm0, %zmm0
+; SKX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x float> %x225, <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x227 = fadd <16 x float> %x225, %x226
+ %x228 = shufflevector <16 x float> %x227, <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x229 = fsub <16 x float> %x227, %x228
+ %x230 = extractelement <16 x float> %x229, i32 0
+ ret float %x230
+}
+
+define <16 x i32> @hadd_16_3(<16 x i32> %x225, <16 x i32> %x227) {
+; KNL-LABEL: hadd_16_3:
+; KNL: # %bb.0:
+; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: hadd_16_3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
+, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x228 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 1, i32 3, i32 17, i32 19
+, i32 5 , i32 7, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
+ i32 undef, i32 undef>
+ %x229 = add <16 x i32> %x226, %x228
+ ret <16 x i32> %x229
+}
+
+define <16 x float> @fhadd_16_3(<16 x float> %x225, <16 x float> %x227) {
+; KNL-LABEL: fhadd_16_3:
+; KNL: # %bb.0:
+; KNL-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; KNL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; KNL-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fhadd_16_3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; SKX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; SKX-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
+, i32 4, i32 6, i32 20, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x228 = shufflevector <16 x float> %x225, <16 x float> %x227, <16 x i32> <i32 1, i32 3, i32 17, i32 19
+, i32 5 , i32 7, i32 21, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x229 = fadd <16 x float> %x226, %x228
+ ret <16 x float> %x229
+}
+
+define <8 x double> @fhadd_16_4(<8 x double> %x225, <8 x double> %x227) {
+; KNL-LABEL: fhadd_16_4:
+; KNL: # %bb.0:
+; KNL-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; KNL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fhadd_16_4:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: retq
+ %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
+ %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 undef ,i32 undef, i32 undef, i32 undef>
+ %x229 = fadd <8 x double> %x226, %x228
+ ret <8 x double> %x229
+}
+
+define <4 x double> @fadd_noundef_low(<8 x double> %x225, <8 x double> %x227) {
+; KNL-LABEL: fadd_noundef_low:
+; KNL: # %bb.0:
+; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fadd_noundef_low:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; SKX-NEXT: retq
+ %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
+ %x229 = fadd <8 x double> %x226, %x228
+ %x230 = shufflevector <8 x double> %x229, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %x230
+}
+
+define <4 x double> @fadd_noundef_high(<8 x double> %x225, <8 x double> %x227) {
+; KNL-LABEL: fadd_noundef_high:
+; KNL: # %bb.0:
+; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fadd_noundef_high:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; SKX-NEXT: retq
+ %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
+ %x229 = fadd <8 x double> %x226, %x228
+ %x230 = shufflevector <8 x double> %x229, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x double> %x230
+}
+
+
+define <8 x i32> @hadd_16_3_sv(<16 x i32> %x225, <16 x i32> %x227) {
+; KNL-LABEL: hadd_16_3_sv:
+; KNL: # %bb.0:
+; KNL-NEXT: vshufps {{.*#+}} zmm2 = zmm0[0,2],zmm1[0,2],zmm0[4,6],zmm1[4,6],zmm0[8,10],zmm1[8,10],zmm0[12,14],zmm1[12,14]
+; KNL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,3],zmm1[1,3],zmm0[5,7],zmm1[5,7],zmm0[9,11],zmm1[9,11],zmm0[13,15],zmm1[13,15]
+; KNL-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: hadd_16_3_sv:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufps {{.*#+}} zmm2 = zmm0[0,2],zmm1[0,2],zmm0[4,6],zmm1[4,6],zmm0[8,10],zmm1[8,10],zmm0[12,14],zmm1[12,14]
+; SKX-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,3],zmm1[1,3],zmm0[5,7],zmm1[5,7],zmm0[9,11],zmm1[9,11],zmm0[13,15],zmm1[13,15]
+; SKX-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; SKX-NEXT: retq
+ %x226 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 0, i32 2, i32 16, i32 18
+, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30>
+ %x228 = shufflevector <16 x i32> %x225, <16 x i32> %x227, <16 x i32> <i32 1, i32 3, i32 17, i32 19
+, i32 5 , i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15,
+ i32 29, i32 31>
+ %x229 = add <16 x i32> %x226, %x228
+ %x230 = shufflevector <16 x i32> %x229, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4 ,i32 5, i32 6, i32 7>
+ ret <8 x i32> %x230
+}
+
+
+define double @fadd_noundef_eel(<8 x double> %x225, <8 x double> %x227) {
+; KNL-LABEL: fadd_noundef_eel:
+; KNL: # %bb.0:
+; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; KNL-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fadd_noundef_eel:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; SKX-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
+ %x229 = fadd <8 x double> %x226, %x228
+ %x230 = extractelement <8 x double> %x229, i32 0
+ ret double %x230
+}
+
+
+
+define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
+; KNL-LABEL: fsub_noundef_ee:
+; KNL: # %bb.0:
+; KNL-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; KNL-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; KNL-NEXT: vsubpd %zmm0, %zmm2, %zmm0
+; KNL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; KNL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: fsub_noundef_ee:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; SKX-NEXT: vsubpd %zmm0, %zmm2, %zmm0
+; SKX-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; SKX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %x228 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5 ,i32 13, i32 7, i32 15>
+ %x229 = fsub <8 x double> %x226, %x228
+ %x230 = extractelement <8 x double> %x229, i32 5
+ ret double %x230
+}
+
diff --git a/test/CodeGen/X86/avx512-i1test.ll b/test/CodeGen/X86/avx512-i1test.ll
index 321f26674e1e..df81b83d7c29 100755
--- a/test/CodeGen/X86/avx512-i1test.ll
+++ b/test/CodeGen/X86/avx512-i1test.ll
@@ -7,11 +7,11 @@ target triple = "x86_64-unknown-linux-gnu"
define void @func() {
; CHECK-LABEL: func:
-; CHECK: # BB#0: # %L_10
+; CHECK: # %bb.0: # %L_10
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB0_1
-; CHECK-NEXT: # BB#4: # %L_30
+; CHECK-NEXT: # %bb.4: # %L_30
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB0_1: # %bb56
; CHECK-NEXT: xorl %eax, %eax
@@ -65,10 +65,10 @@ L_30: ; preds = %bb51, %L_10
; PR 28175
define i64 @func2(i1 zeroext %i, i32 %j) {
; CHECK-LABEL: func2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: je .LBB1_1
-; CHECK-NEXT: # BB#2: # %if.then
+; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: jmp bar # TAILCALL
; CHECK-NEXT: .LBB1_1: # %return
; CHECK-NEXT: movzbl %dil, %eax
diff --git a/test/CodeGen/X86/avx512-inc-dec.ll b/test/CodeGen/X86/avx512-inc-dec.ll
index 5183c9d0fb8f..4fa4f27beb79 100644
--- a/test/CodeGen/X86/avx512-inc-dec.ll
+++ b/test/CodeGen/X86/avx512-inc-dec.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
;CHECK-LABEL: test
;CHECK-NOT: dec
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index f858e7eb792f..7e0b981b2c6a 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,28 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi | FileCheck --check-prefix=SKX --check-prefix=SKX_VBMI %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
-; KNL-LABEL: test1:
-; KNL: ## BB#0:
-; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test1:
-; SKX: ## BB#0:
-; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; SKX-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; SKX-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; SKX-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
+; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -30,25 +19,14 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
}
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
-; KNL-LABEL: test2:
-; KNL: ## BB#0:
-; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test2:
-; SKX: ## BB#0:
-; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; SKX-NEXT: vinsertf64x2 $0, %xmm2, %zmm0, %zmm2
-; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0
-; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test2:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
+; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -56,233 +34,135 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
}
define <16 x float> @test3(<16 x float> %x) nounwind {
-; KNL-LABEL: test3:
-; KNL: ## BB#0:
-; KNL-NEXT: vextractf32x4 $1, %zmm0, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test3:
-; SKX: ## BB#0:
-; SKX-NEXT: vextractf32x4 $1, %zmm0, %xmm1
-; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
-; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test3:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
}
define <8 x i64> @test4(<8 x i64> %x) nounwind {
-; KNL-LABEL: test4:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test4:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti64x2 $2, %zmm0, %xmm1
-; SKX-NEXT: vmovq %xmm1, %rax
-; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
-; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test4:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; CHECK-NEXT: vmovq %xmm1, %rax
+; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
}
define i32 @test5(<4 x float> %x) nounwind {
-; KNL-LABEL: test5:
-; KNL: ## BB#0:
-; KNL-NEXT: vextractps $3, %xmm0, %eax
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test5:
-; SKX: ## BB#0:
-; SKX-NEXT: vextractps $3, %xmm0, %eax
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test5:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractps $3, %xmm0, %eax
+; CHECK-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
}
define void @test6(<4 x float> %x, float* %out) nounwind {
-; KNL-LABEL: test6:
-; KNL: ## BB#0:
-; KNL-NEXT: vextractps $3, %xmm0, (%rdi)
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test6:
-; SKX: ## BB#0:
-; SKX-NEXT: vextractps $3, %xmm0, (%rdi)
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test6:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
+; CHECK-NEXT: retq
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
}
define float @test7(<16 x float> %x, i32 %ind) nounwind {
-; KNL-LABEL: test7:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test7:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test7:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
define double @test8(<8 x double> %x, i32 %ind) nounwind {
-; KNL-LABEL: test8:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test8:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
define float @test9(<8 x float> %x, i32 %ind) nounwind {
-; KNL-LABEL: test9:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test9:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %ymm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test9:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
-; KNL-LABEL: test10:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movl (%rsp,%rdi,4), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test10:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movl (%rsp,%rdi,4), %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test10:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-LABEL: test11:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; KNL-NEXT: kshiftlw $11, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $4, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: je LBB10_2
-; KNL-NEXT: ## BB#1: ## %A
+; KNL-NEXT: ## %bb.1: ## %A
; KNL-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL-NEXT: retq
; KNL-NEXT: LBB10_2: ## %B
@@ -290,14 +170,13 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; SKX-NEXT: kshiftlw $11, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $4, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: je LBB10_2
-; SKX-NEXT: ## BB#1: ## %A
+; SKX-NEXT: ## %bb.1: ## %A
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
; SKX-NEXT: LBB10_2: ## %B
@@ -315,21 +194,18 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test12:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
-; KNL-NEXT: kshiftlw $15, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test12:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %zmm0, %zmm2, %k0
-; SKX-NEXT: kshiftlb $7, %k0, %k0
-; SKX-NEXT: kshiftrb $7, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: cmoveq %rsi, %rdi
@@ -344,7 +220,7 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
define i16 @test13(i32 %a, i32 %b) {
; KNL-LABEL: test13:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: movw $-4, %cx
@@ -355,11 +231,11 @@ define i16 @test13(i32 %a, i32 %b) {
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test13:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: movw $-4, %cx
@@ -370,7 +246,7 @@ define i16 @test13(i32 %a, i32 %b) {
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
%cmp_res = icmp ult i32 %a, %b
%maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
@@ -380,21 +256,20 @@ define i16 @test13(i32 %a, i32 %b) {
define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
; KNL-LABEL: test14:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
-; KNL-NEXT: kshiftlw $11, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $4, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test14:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %zmm0, %zmm1, %k0
-; SKX-NEXT: kshiftlb $3, %k0, %k0
-; SKX-NEXT: kshiftrb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $4, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: cmoveq %rsi, %rdi
@@ -408,23 +283,14 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
}
define i16 @test15(i1 *%addr) {
-; KNL-LABEL: test15:
-; KNL: ## BB#0:
-; KNL-NEXT: movb (%rdi), %al
-; KNL-NEXT: xorl %ecx, %ecx
-; KNL-NEXT: testb %al, %al
-; KNL-NEXT: movw $-1, %ax
-; KNL-NEXT: cmovew %cx, %ax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test15:
-; SKX: ## BB#0:
-; SKX-NEXT: movb (%rdi), %al
-; SKX-NEXT: xorl %ecx, %ecx
-; SKX-NEXT: testb %al, %al
-; SKX-NEXT: movw $-1, %ax
-; SKX-NEXT: cmovew %cx, %ax
-; SKX-NEXT: retq
+; CHECK-LABEL: test15:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movb (%rdi), %al
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: movw $-1, %ax
+; CHECK-NEXT: cmovew %cx, %ax
+; CHECK-NEXT: retq
%x = load i1 , i1 * %addr, align 1
%x1 = insertelement <16 x i1> undef, i1 %x, i32 10
%x2 = bitcast <16 x i1>%x1 to i16
@@ -433,33 +299,30 @@ define i16 @test15(i1 *%addr) {
define i16 @test16(i1 *%addr, i16 %a) {
; KNL-LABEL: test16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movb (%rdi), %al
-; KNL-NEXT: kmovw %esi, %k1
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %esi, %k0
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftrw $10, %k0, %k2
+; KNL-NEXT: kxorw %k1, %k2, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $5, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
-; SKX: ## BB#0:
-; SKX-NEXT: movb (%rdi), %al
-; SKX-NEXT: kmovd %esi, %k0
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: vpmovm2d %k1, %zmm0
-; SKX-NEXT: vpmovm2d %k0, %zmm1
-; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
-; SKX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k0
+; SKX: ## %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: kmovd %esi, %k1
+; SKX-NEXT: kshiftrw $10, %k1, %k2
+; SKX-NEXT: kxorw %k0, %k2, %k0
+; SKX-NEXT: kshiftlw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $5, %k0, %k0
+; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i16 %a to <16 x i1>
@@ -470,33 +333,30 @@ define i16 @test16(i1 *%addr, i16 %a) {
define i8 @test17(i1 *%addr, i8 %a) {
; KNL-LABEL: test17:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movb (%rdi), %al
-; KNL-NEXT: kmovw %esi, %k1
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %esi, %k0
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftrw $4, %k0, %k2
+; KNL-NEXT: kxorw %k1, %k2, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $11, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
-; SKX: ## BB#0:
-; SKX-NEXT: movb (%rdi), %al
-; SKX-NEXT: kmovd %esi, %k0
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: vpmovm2q %k1, %zmm0
-; SKX-NEXT: vpmovm2q %k0, %zmm1
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
-; SKX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; SKX-NEXT: vpmovq2m %zmm2, %k0
+; SKX: ## %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0
+; SKX-NEXT: kmovd %esi, %k1
+; SKX-NEXT: kshiftrb $4, %k1, %k2
+; SKX-NEXT: kxorb %k0, %k2, %k0
+; SKX-NEXT: kshiftlb $7, %k0, %k0
+; SKX-NEXT: kshiftrb $3, %k0, %k0
+; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
%x = load i1 , i1 * %addr, align 128
%a1 = bitcast i8 %a to <8 x i1>
@@ -506,20 +366,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
}
define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v8i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v8i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0
-; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v8i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <8 x i64> %x, i32 1
%r2 = extractelement <8 x i64> %x, i32 3
store i64 %r2, i64* %dst, align 1
@@ -527,20 +380,13 @@ define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
}
define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v4i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v4i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v4i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
%r2 = extractelement <4 x i64> %x, i32 3
store i64 %r2, i64* %dst, align 1
@@ -548,17 +394,11 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
}
define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v2i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v2i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v2i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: retq
%r1 = extractelement <2 x i64> %x, i32 0
%r2 = extractelement <2 x i64> %x, i32 1
store i64 %r2, i64* %dst, align 1
@@ -566,20 +406,13 @@ define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
}
define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v16i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v16i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v16i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractps $1, %xmm0, %eax
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <16 x i32> %x, i32 1
%r2 = extractelement <16 x i32> %x, i32 5
store i32 %r2, i32* %dst, align 1
@@ -587,20 +420,13 @@ define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
}
define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v8i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v8i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v8i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractps $1, %xmm0, %eax
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vextractps $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
%r2 = extractelement <8 x i32> %x, i32 5
store i32 %r2, i32* %dst, align 1
@@ -608,17 +434,11 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
}
define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v4i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: vpextrd $3, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v4i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vpextrd $3, %xmm0, (%rdi)
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v4i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractps $1, %xmm0, %eax
+; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
+; CHECK-NEXT: retq
%r1 = extractelement <4 x i32> %x, i32 1
%r2 = extractelement <4 x i32> %x, i32 3
store i32 %r2, i32* %dst, align 1
@@ -626,22 +446,14 @@ define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
}
define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v32i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrw $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v32i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v32i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
store i16 %r2, i16* %dst, align 1
@@ -649,22 +461,14 @@ define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
}
define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v16i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrw $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v16i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v16i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
store i16 %r2, i16* %dst, align 1
@@ -672,19 +476,12 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
}
define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v8i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrw $1, %xmm0, %eax
-; KNL-NEXT: vpextrw $3, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v8i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vpextrw $3, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v8i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-NEXT: vpextrw $3, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
%r1 = extractelement <8 x i16> %x, i32 1
%r2 = extractelement <8 x i16> %x, i32 3
store i16 %r2, i16* %dst, align 1
@@ -692,22 +489,14 @@ define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
}
define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v64i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrb $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v64i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v64i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrb $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
store i8 %r2, i8* %dst, align 1
@@ -715,22 +504,14 @@ define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
}
define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v32i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrb $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v32i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v32i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrb $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
store i8 %r2, i8* %dst, align 1
@@ -738,19 +519,12 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
}
define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v16i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrb $1, %xmm0, %eax
-; KNL-NEXT: vpextrb $3, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v16i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vpextrb $3, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v16i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpextrb $1, %xmm0, %eax
+; CHECK-NEXT: vpextrb $3, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
%r1 = extractelement <16 x i8> %x, i32 1
%r2 = extractelement <16 x i8> %x, i32 3
store i8 %r2, i8* %dst, align 1
@@ -758,23 +532,14 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
}
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
-; KNL-LABEL: insert_v8i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; KNL-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v8i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm1
-; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0
-; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; SKX-NEXT: vinserti64x2 $1, %xmm0, %zmm1, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v8i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <8 x i64> %x, i64 %val, i32 1
%r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
@@ -782,23 +547,14 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
}
define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
-; KNL-LABEL: insert_v4i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v4i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v4i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <4 x i64> %x, i64 %val, i32 1
%r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
@@ -806,17 +562,11 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
}
define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
-; KNL-LABEL: insert_v2i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v2i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v2i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <2 x i64> %x, i64 %val, i32 1
%r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
@@ -824,23 +574,14 @@ define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
}
define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
-; KNL-LABEL: insert_v16i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; KNL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; KNL-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v16i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; SKX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v16i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <16 x i32> %x, i32 %val, i32 1
%r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
@@ -848,23 +589,14 @@ define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
}
define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
-; KNL-LABEL: insert_v8i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v8i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v8i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <8 x i32> %x, i32 %val, i32 1
%r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
@@ -872,17 +604,11 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
}
define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
-; KNL-LABEL: insert_v4i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0
-; KNL-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v4i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0
-; SKX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v4i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; CHECK-NEXT: retq
%val = load i32, i32* %ptr
%r1 = insertelement <4 x i32> %x, i32 %val, i32 1
%r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
@@ -891,7 +617,7 @@ define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
; KNL-LABEL: insert_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -900,10 +626,10 @@ define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; SKX-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; SKX-NEXT: retq
@@ -914,23 +640,14 @@ define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
}
define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
-; KNL-LABEL: insert_v16i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v16i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
-; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v16i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <16 x i16> %x, i16 %val, i32 1
%r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
@@ -938,17 +655,11 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
}
define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
-; KNL-LABEL: insert_v8i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0
-; KNL-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v8i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0
-; SKX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v8i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrw $1, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
+; CHECK-NEXT: retq
%val = load i16, i16* %ptr
%r1 = insertelement <8 x i16> %x, i16 %val, i32 1
%r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
@@ -957,7 +668,7 @@ define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
; KNL-LABEL: insert_v64i8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -966,7 +677,7 @@ define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v64i8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
@@ -980,23 +691,14 @@ define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
}
define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
-; KNL-LABEL: insert_v32i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0
-; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v32i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0
-; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v32i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <32 x i8> %x, i8 %val, i32 1
%r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
@@ -1004,17 +706,11 @@ define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
}
define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
-; KNL-LABEL: insert_v16i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: insert_v16i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0
-; SKX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: insert_v16i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
+; CHECK-NEXT: retq
%val = load i8, i8* %ptr
%r1 = insertelement <16 x i8> %x, i8 %val, i32 3
%r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
@@ -1022,253 +718,91 @@ define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
}
define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
-; KNL-LABEL: test_insert_128_v8i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_insert_128_v8i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: test_insert_128_v8i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%r = insertelement <8 x i64> %x, i64 %y, i32 1
ret <8 x i64> %r
}
define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
-; KNL-LABEL: test_insert_128_v16i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_insert_128_v16i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1
-; SKX-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: test_insert_128_v16i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpinsrd $1, %edi, %xmm0, %xmm1
+; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%r = insertelement <16 x i32> %x, i32 %y, i32 1
ret <16 x i32> %r
}
define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
-; KNL-LABEL: test_insert_128_v8f64:
-; KNL: ## BB#0:
-; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_insert_128_v8f64:
-; SKX: ## BB#0:
-; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; SKX-NEXT: vinsertf64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: test_insert_128_v8f64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%r = insertelement <8 x double> %x, double %y, i32 1
ret <8 x double> %r
}
define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
-; KNL-LABEL: test_insert_128_v16f32:
-; KNL: ## BB#0:
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
-; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_insert_128_v16f32:
-; SKX: ## BB#0:
-; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
-; SKX-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT: retq
+; CHECK-LABEL: test_insert_128_v16f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
+; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
%r = insertelement <16 x float> %x, float %y, i32 1
ret <16 x float> %r
}
define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
-; KNL-LABEL: test_insert_128_v16i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_insert_128_v16i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; SKX-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; SKX-NEXT: retq
+; CHECK-LABEL: test_insert_128_v16i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%r = insertelement <16 x i16> %x, i16 %y, i32 10
ret <16 x i16> %r
}
define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
-; KNL-LABEL: test_insert_128_v32i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_insert_128_v32i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; SKX-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
-; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; SKX-NEXT: retq
+; CHECK-LABEL: test_insert_128_v32i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
%r = insertelement <32 x i8> %x, i8 %y, i32 20
ret <32 x i8> %r
}
define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
; KNL-LABEL: test_insertelement_v32i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi0:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi1:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi2:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
-; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: vmovd %edx, %xmm1
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm1, %xmm1
-; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: vmovd %edx, %xmm0
-; KNL-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k1
+; KNL-NEXT: movl {{.*}}(%rip), %ecx
+; KNL-NEXT: vpbroadcastd %ecx, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k1
+; KNL-NEXT: vpbroadcastd %ecx, %zmm1 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -1283,21 +817,22 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_v32i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckwd %k0, %k1, %k0
-; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: vpmovm2w %k0, %zmm1
-; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [0,1,2,3,32,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; SKX-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
-; SKX-NEXT: vpmovw2m %zmm2, %k0
+; SKX-NEXT: kshiftrd $4, %k0, %k1
+; SKX-NEXT: kmovd %eax, %k2
+; SKX-NEXT: kxord %k2, %k1, %k1
+; SKX-NEXT: kshiftld $31, %k1, %k1
+; SKX-NEXT: kshiftrd $27, %k1, %k1
+; SKX-NEXT: kxord %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -1310,7 +845,7 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
; KNL-LABEL: test_iinsertelement_v4i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
@@ -1318,47 +853,48 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpextrb $4, %xmm0, %ecx
-; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: kmovw %ecx, %k0
; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: andl $1, %ecx
; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: kshiftrw $1, %k0, %k2
+; KNL-NEXT: kshiftlw $1, %k2, %k2
+; KNL-NEXT: korw %k1, %k2, %k1
+; KNL-NEXT: kshiftrw $1, %k1, %k2
+; KNL-NEXT: kxorw %k0, %k2, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
+; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftrw $2, %k0, %k1
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kxorw %k2, %k1, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $13, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
+; KNL-NEXT: kshiftrw $3, %k0, %k1
; KNL-NEXT: vpextrb $12, %xmm0, %eax
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kxorw %k2, %k1, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $12, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v4i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm0
-; SKX-NEXT: kmovd %eax, %k0
-; SKX-NEXT: vpmovm2d %k0, %xmm1
-; SKX-NEXT: vpbroadcastq %xmm1, %xmm1
-; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: kshiftrw $2, %k0, %k1
+; SKX-NEXT: kmovd %eax, %k2
+; SKX-NEXT: kxorw %k2, %k1, %k1
+; SKX-NEXT: kshiftlw $15, %k1, %k1
+; SKX-NEXT: kshiftrw $13, %k1, %k1
+; SKX-NEXT: kxorw %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <4 x i32> %x, %y
@@ -1370,7 +906,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
; KNL-LABEL: test_iinsertelement_v2i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
@@ -1378,30 +914,34 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpextrb $0, %xmm0, %ecx
-; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; KNL-NEXT: andl $1, %ecx
+; KNL-NEXT: kmovw %ecx, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k1
+; KNL-NEXT: kshiftlw $1, %k1, %k1
+; KNL-NEXT: korw %k0, %k1, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k1
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kxorw %k2, %k1, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $14, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v2i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: setb %al
; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kshiftlw $1, %k1, %k1
-; SKX-NEXT: kshiftlw $1, %k0, %k0
-; SKX-NEXT: kshiftrw $1, %k0, %k0
-; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: kshiftrw $1, %k0, %k1
+; SKX-NEXT: kmovd %eax, %k2
+; SKX-NEXT: kxorw %k2, %k1, %k1
+; SKX-NEXT: kshiftlw $15, %k1, %k1
+; SKX-NEXT: kshiftrw $14, %k1, %k1
+; SKX-NEXT: kxorw %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
%cmp_res_i1 = icmp ult i32 %a, %b
%cmp_cmp_vec = icmp ult <2 x i64> %x, %y
@@ -1413,7 +953,7 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: test_extractelement_v2i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -1426,10 +966,8 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v2i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $15, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
@@ -1444,7 +982,7 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; KNL-LABEL: extractelement_v2i1_alt:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -1457,10 +995,8 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v2i1_alt:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $15, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
; SKX-NEXT: movb $4, %cl
@@ -1476,7 +1012,7 @@ define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-LABEL: test_extractelement_v4i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -1486,10 +1022,9 @@ define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v4i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftlw $12, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $3, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: retq
@@ -1501,20 +1036,20 @@ define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
; KNL-LABEL: test_extractelement_v32i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpextrb $2, %xmm0, %eax
; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v32i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
-; SKX-NEXT: kshiftld $29, %k0, %k0
-; SKX-NEXT: kshiftrd $31, %k0, %k0
+; SKX-NEXT: kshiftrd $2, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
@@ -1527,7 +1062,7 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2
; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
@@ -1538,10 +1073,11 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v64i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
@@ -1559,7 +1095,7 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2
; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
@@ -1570,10 +1106,11 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v64i1_alt:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: kshiftrq $63, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
@@ -1591,546 +1128,294 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
}
define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v2i64:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: movq -24(%rsp,%rdi,8), %rax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v2i64:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $1, %edi
-; SKX-NEXT: movq -24(%rsp,%rdi,8), %rax
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v2i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movq -24(%rsp,%rdi,8), %rax
+; CHECK-NEXT: retq
%t2 = extractelement <2 x i64> %t1, i32 %index
ret i64 %t2
}
define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v4i64:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi3:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi4:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi5:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $3, %edi
-; KNL-NEXT: movq (%rsp,%rdi,8), %rax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v4i64:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi0:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi1:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi2:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %ymm0, (%rsp)
-; SKX-NEXT: andl $3, %edi
-; SKX-NEXT: movq (%rsp,%rdi,8), %rax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v4i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <4 x i64> %t1, i32 %index
ret i64 %t2
}
define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v8i64:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi6:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi7:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi8:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movq (%rsp,%rdi,8), %rax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v8i64:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi3:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi4:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi5:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movq (%rsp,%rdi,8), %rax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v8i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: movq (%rsp,%rdi,8), %rax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <8 x i64> %t1, i32 %index
ret i64 %t2
}
define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v2f64:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $1, %edi
-; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v2f64:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $1, %edi
-; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v2f64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: retq
%t2 = extractelement <2 x double> %t1, i32 %index
ret double %t2
}
define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v4f64:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi9:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi10:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi11:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $3, %edi
-; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v4f64:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi6:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi7:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi8:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %ymm0, (%rsp)
-; SKX-NEXT: andl $3, %edi
-; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v4f64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <4 x double> %t1, i32 %index
ret double %t2
}
define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v8f64:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi12:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi13:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi14:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v8f64:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi9:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi10:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi11:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v8f64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <8 x double> %t1, i32 %index
ret double %t2
}
define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v4i32:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $3, %edi
-; KNL-NEXT: movl -24(%rsp,%rdi,4), %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v4i32:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $3, %edi
-; SKX-NEXT: movl -24(%rsp,%rdi,4), %eax
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v4i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: movl -24(%rsp,%rdi,4), %eax
+; CHECK-NEXT: retq
%t2 = extractelement <4 x i32> %t1, i32 %index
ret i32 %t2
}
define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v8i32:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi15:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi16:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi17:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movl (%rsp,%rdi,4), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v8i32:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi12:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi13:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi14:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %ymm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movl (%rsp,%rdi,4), %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v8i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <8 x i32> %t1, i32 %index
ret i32 %t2
}
define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16i32:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi18:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi19:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi20:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movl (%rsp,%rdi,4), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v16i32:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi15:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi16:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi17:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movl (%rsp,%rdi,4), %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v16i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <16 x i32> %t1, i32 %index
ret i32 %t2
}
define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v4f32:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $3, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v4f32:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $3, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v4f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $3, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: retq
%t2 = extractelement <4 x float> %t1, i32 %index
ret float %t2
}
define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v8f32:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi21:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi22:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi23:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v8f32:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi18:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi19:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi20:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %ymm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v8f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <8 x float> %t1, i32 %index
ret float %t2
}
define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16f32:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi24:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi25:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi26:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v16f32:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi21:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi22:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi23:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v16f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <16 x float> %t1, i32 %index
ret float %t2
}
define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v8i16:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v8i16:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v8i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; CHECK-NEXT: retq
%t2 = extractelement <8 x i16> %t1, i32 %index
ret i16 %t2
}
define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16i16:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi27:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi28:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi29:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v16i16:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi24:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi25:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi26:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovdqu %ymm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v16i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: movzwl (%rsp,%rdi,2), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <16 x i16> %t1, i32 %index
ret i16 %t2
}
define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi30:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi31:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi32:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi27:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi28:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi29:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $31, %edi
; SKX-NEXT: movzwl (%rsp,%rdi,2), %eax
; SKX-NEXT: movq %rbp, %rsp
@@ -2142,70 +1427,37 @@ define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
}
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16i8:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v16i8:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v16i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: retq
%t2 = extractelement <16 x i8> %t1, i32 %index
ret i8 %t2
}
define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v32i8:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi33:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi34:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi35:
-; KNL-NEXT: .cfi_def_cfa_register %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $31, %edi
-; KNL-NEXT: movq %rsp, %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v32i8:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi30:
-; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi31:
-; SKX-NEXT: .cfi_offset %rbp, -16
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi32:
-; SKX-NEXT: .cfi_def_cfa_register %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovdqu %ymm0, (%rsp)
-; SKX-NEXT: andl $31, %edi
-; SKX-NEXT: movq %rsp, %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v32i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $31, %edi
+; CHECK-NEXT: movq %rsp, %rax
+; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t2 = extractelement <32 x i8> %t1, i32 %index
ret i8 %t2
@@ -2213,18 +1465,15 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-LABEL: test_extractelement_variable_v64i8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi36:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi37:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi38:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $63, %edi
@@ -2232,22 +1481,20 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
; KNL-NEXT: movb (%rdi,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi33:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi34:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi35:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
; SKX-NEXT: movq %rsp, %rax
; SKX-NEXT: movb (%rdi,%rax), %al
@@ -2262,14 +1509,11 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi39:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi40:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi41:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
@@ -2282,22 +1526,20 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
; KNL-NEXT: movb (%rax,%rcx), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi36:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi37:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi38:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
; SKX-NEXT: addb %dil, %dil
-; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: vmovaps %zmm0, (%rsp)
; SKX-NEXT: movzbl %dil, %eax
; SKX-NEXT: andl $63, %eax
; SKX-NEXT: movq %rsp, %rcx
@@ -2314,8 +1556,8 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)
define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v2i1:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -2327,8 +1569,8 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b,
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v2i1:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX: ## %bb.0:
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -2344,8 +1586,8 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b,
define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v4i1:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -2357,8 +1599,8 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v4i1:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX: ## %bb.0:
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -2374,20 +1616,17 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v8i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi42:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi43:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi44:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
+; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa64 %zmm0, (%rsp)
@@ -2396,21 +1635,19 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v8i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi39:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi40:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi41:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: vmovdqa64 %zmm0, (%rsp)
@@ -2429,18 +1666,15 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,
define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v16i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi45:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi46:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi47:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vmovdqa32 %zmm0, (%rsp)
@@ -2449,21 +1683,19 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v16i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi42:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi43:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi44:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2d %k0, %zmm0
; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
@@ -2482,18 +1714,15 @@ define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %
define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v32i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi48:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi49:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi50:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: ## kill: def %edi killed %edi def %rdi
; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -2505,24 +1734,22 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v32i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi45:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi46:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi47:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k0
; SKX-NEXT: vpmovm2w %k0, %zmm0
-; SKX-NEXT: vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
; SKX-NEXT: andl $31, %edi
; SKX-NEXT: movzbl (%rsp,%rdi,2), %eax
; SKX-NEXT: andl $1, %eax
@@ -2536,3 +1763,603 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,
ret i8 %res
}
+define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
+; CHECK-LABEL: insert_double_zero:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i64> %e
+}
+
+define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v32i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-32, %rsp
+; KNL-NEXT: subq $96, %rsp
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: andl $31, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v32i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
+; SKX-NEXT: xorl %eax, %eax
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: setne %al
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: andl $31, %esi
+; SKX-NEXT: movw %ax, (%rsp,%rsi,2)
+; SKX-NEXT: vpsllw $15, (%rsp), %zmm0
+; SKX-NEXT: vpmovw2m %zmm0, %k0
+; SKX-NEXT: kmovd %k0, %eax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <32 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <32 x i1> %t3 to i32
+ ret i32 %t4
+}
+
+define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v64i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-64, %rsp
+; KNL-NEXT: subq $192, %rsp
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
+; KNL-NEXT: andl $63, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT: movl (%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v64i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-64, %rsp
+; SKX-NEXT: subq $128, %rsp
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
+; SKX-NEXT: andl $63, %esi
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: kmovq %k0, %rax
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <64 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <64 x i1> %t3 to i64
+ ret i64 %t4
+}
+
+define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v96i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-128, %rsp
+; KNL-NEXT: subq $384, %rsp ## imm = 0x180
+; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 488(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, 496(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, 504(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, 512(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, 520(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, 528(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, 536(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, 544(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, 552(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, 560(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, 568(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, 576(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, 584(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, 592(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $15, 600(%rbp), %xmm0, %xmm0
+; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 616(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, 624(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, 632(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, 640(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, 648(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, 656(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, 664(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, 672(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, 680(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, 688(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, 696(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, 704(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, 712(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, 720(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $15, 728(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 232(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, 240(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, 248(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, 256(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, 264(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, 272(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, 280(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, 288(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, 296(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, 304(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, 312(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, 320(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, 328(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, 336(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $15, 344(%rbp), %xmm1, %xmm1
+; KNL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 360(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, 368(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, 376(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, 384(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, 392(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, 400(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, 408(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, 416(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, 424(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, 432(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, 440(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, 448(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, 456(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, 464(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, 472(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT: vmovd %edi, %xmm2
+; KNL-NEXT: vpinsrb $1, %esi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r8d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, %r9d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, 16(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, 24(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, 32(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, 40(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, 48(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, 56(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, 64(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, 72(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, 80(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $15, 88(%rbp), %xmm2, %xmm2
+; KNL-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; KNL-NEXT: vpinsrb $1, 104(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $2, 112(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $3, 120(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $4, 128(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $5, 136(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $6, 144(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $7, 152(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $8, 160(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $9, 168(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $10, 176(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $11, 184(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $12, 192(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $13, 200(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
+; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; KNL-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
+; KNL-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm3, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL-NEXT: movl 744(%rbp), %eax
+; KNL-NEXT: andl $127, %eax
+; KNL-NEXT: cmpb $0, 736(%rbp)
+; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; KNL-NEXT: setne (%rax,%rcx)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpslld $31, %zmm4, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movl (%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT: shlq $32, %rdx
+; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v96i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-128, %rsp
+; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 232(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $2, 240(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $3, 248(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $4, 256(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $5, 264(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $6, 272(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $7, 280(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $8, 288(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $9, 296(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $10, 304(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $11, 312(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $12, 320(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $13, 328(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $14, 336(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vpinsrb $15, 344(%rbp), %xmm0, %xmm0
+; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 360(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, 368(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, 376(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, 384(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, 392(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, 400(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, 408(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, 416(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, 424(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, 432(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, 440(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, 448(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, 456(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, 464(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, 472(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT: vmovd %edi, %xmm1
+; SKX-NEXT: vpinsrb $1, %esi, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, %r8d, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, %r9d, %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, 16(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, 24(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, 32(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, 40(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, 48(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, 56(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, 64(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, 72(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, 80(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, 88(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 104(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $2, 112(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $3, 120(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $4, 128(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $5, 136(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $6, 144(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $7, 152(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $8, 160(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $9, 168(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $10, 176(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $11, 184(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $12, 192(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $13, 200(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $14, 208(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $15, 216(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 488(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $2, 496(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $3, 504(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $4, 512(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $5, 520(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $6, 528(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $7, 536(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $8, 544(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $9, 552(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $10, 560(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $11, 568(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $12, 576(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $13, 584(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $14, 592(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vpinsrb $15, 600(%rbp), %xmm1, %xmm1
+; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SKX-NEXT: vpinsrb $1, 616(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $2, 624(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $3, 632(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $4, 640(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $5, 648(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $6, 656(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $7, 664(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $8, 672(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $9, 680(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $10, 688(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $11, 696(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $12, 704(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $13, 712(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $14, 720(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vpinsrb $15, 728(%rbp), %xmm2, %xmm2
+; SKX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: movl 744(%rbp), %eax
+; SKX-NEXT: andl $127, %eax
+; SKX-NEXT: cmpb $0, 736(%rbp)
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
+; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rcx
+; SKX-NEXT: setne (%rax,%rcx)
+; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: kmovq %k1, %rax
+; SKX-NEXT: kmovq %k0, %rdx
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <96 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <96 x i1> %t3 to i96
+ ret i96 %t4
+}
+
+define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
+; KNL-LABEL: test_insertelement_variable_v128i1:
+; KNL: ## %bb.0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: .cfi_offset %rbp, -16
+; KNL-NEXT: movq %rsp, %rbp
+; KNL-NEXT: .cfi_def_cfa_register %rbp
+; KNL-NEXT: andq $-128, %rsp
+; KNL-NEXT: subq $384, %rsp ## imm = 0x180
+; KNL-NEXT: ## kill: def %esi killed %esi def %rsi
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpxor %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vpcmpgtb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: andl $127, %esi
+; KNL-NEXT: testb %dil, %dil
+; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm2, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vmovdqa %ymm0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; KNL-NEXT: setne (%rsi,%rax)
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm1
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm2
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm3
+; KNL-NEXT: vmovdqa {{[0-9]+}}(%rsp), %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm1, %xmm4
+; KNL-NEXT: vpmovsxbd %xmm4, %zmm4
+; KNL-NEXT: vpslld $31, %zmm4, %zmm4
+; KNL-NEXT: vptestmd %zmm4, %zmm4, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, (%rsp)
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
+; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; KNL-NEXT: shlq $32, %rax
+; KNL-NEXT: orq %rcx, %rax
+; KNL-NEXT: movl (%rsp), %ecx
+; KNL-NEXT: movl {{[0-9]+}}(%rsp), %edx
+; KNL-NEXT: shlq $32, %rdx
+; KNL-NEXT: orq %rcx, %rdx
+; KNL-NEXT: movq %rbp, %rsp
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
+; KNL-NEXT: retq
+;
+; SKX-LABEL: test_insertelement_variable_v128i1:
+; SKX: ## %bb.0:
+; SKX-NEXT: pushq %rbp
+; SKX-NEXT: .cfi_def_cfa_offset 16
+; SKX-NEXT: .cfi_offset %rbp, -16
+; SKX-NEXT: movq %rsp, %rbp
+; SKX-NEXT: .cfi_def_cfa_register %rbp
+; SKX-NEXT: andq $-128, %rsp
+; SKX-NEXT: subq $256, %rsp ## imm = 0x100
+; SKX-NEXT: ## kill: def %esi killed %esi def %rsi
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm0, %k1
+; SKX-NEXT: vpcmpnleub %zmm2, %zmm1, %k2
+; SKX-NEXT: andl $127, %esi
+; SKX-NEXT: testb %dil, %dil
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2} {z}
+; SKX-NEXT: vmovdqa32 %zmm1, {{[0-9]+}}(%rsp)
+; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT: movq %rsp, %rax
+; SKX-NEXT: setne (%rsi,%rax)
+; SKX-NEXT: vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: vpsllw $7, (%rsp), %zmm0
+; SKX-NEXT: vpmovb2m %zmm0, %k1
+; SKX-NEXT: kmovq %k1, %rax
+; SKX-NEXT: kmovq %k0, %rdx
+; SKX-NEXT: movq %rbp, %rsp
+; SKX-NEXT: popq %rbp
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+ %t1 = icmp ugt <128 x i8> %a, zeroinitializer
+ %t2 = icmp ugt i8 %b, 0
+ %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
+ %t4 = bitcast <128 x i1> %t3 to i128
+ ret i128 %t4
+}
diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll
index a099b80898ee..e28e384ae996 100644
--- a/test/CodeGen/X86/avx512-insert-extract_i1.ll
+++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll
@@ -1,25 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s
; TODO - fix fail on KNL and move this test to avx512-insert-extract.ll
define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b, i32 %index) {
; SKX-LABEL: test_extractelement_varible_v64i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: pushq %rbp
-; SKX-NEXT: Lcfi0:
; SKX-NEXT: .cfi_def_cfa_offset 16
-; SKX-NEXT: Lcfi1:
; SKX-NEXT: .cfi_offset %rbp, -16
; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: Lcfi2:
; SKX-NEXT: .cfi_def_cfa_register %rbp
; SKX-NEXT: andq $-64, %rsp
; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT: ## kill: def %edi killed %edi def %rdi
; SKX-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
-; SKX-NEXT: vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT: vmovdqa32 %zmm0, (%rsp)
; SKX-NEXT: andl $63, %edi
; SKX-NEXT: movq %rsp, %rax
; SKX-NEXT: movzbl (%rdi,%rax), %eax
diff --git a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 652f85d8833b..50de773af001 100644
--- a/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -4,14 +4,552 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c
+
+define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_kunpackb:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-64, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
+; X32-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; X32-NEXT: vpcmpneqd 8(%ebp), %zmm2, %k1
+; X32-NEXT: kunpckbw %k0, %k1, %k1
+; X32-NEXT: vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_kunpackb:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpcmpneqd %zmm1, %zmm0, %k0
+; X64-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
+; X64-NEXT: kunpckbw %k0, %k1, %k1
+; X64-NEXT: vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__A to <16 x i32>
+ %1 = bitcast <8 x i64> %__B to <16 x i32>
+ %2 = icmp ne <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %4 = bitcast <8 x i64> %__C to <16 x i32>
+ %5 = bitcast <8 x i64> %__D to <16 x i32>
+ %6 = icmp ne <16 x i32> %4, %5
+ %7 = bitcast <16 x i1> %6 to i16
+ %8 = and i16 %7, 255
+ %shl.i = shl i16 %3, 8
+ %or.i = or i16 %8, %shl.i
+ %9 = bitcast <8 x i64> %__E to <16 x i32>
+ %10 = bitcast <8 x i64> %__F to <16 x i32>
+ %11 = icmp ne <16 x i32> %9, %10
+ %12 = bitcast i16 %or.i to <16 x i1>
+ %13 = and <16 x i1> %11, %12
+ %14 = bitcast <16 x i1> %13 to i16
+ ret i16 %14
+}
+
+define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
+; X32-LABEL: test_mm512_shuffle_f32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_f32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x float> %shuffle
+}
+
+
+define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; X32-LABEL: test_mm512_mask_shuffle_f32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_f32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ %0 = bitcast i16 %__U to <16 x i1>
+ %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
+ ret <16 x float> %1
+}
+
+define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; X32-LABEL: test_mm512_maskz_shuffle_f32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ %0 = bitcast i16 %__U to <16 x i1>
+ %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
+ ret <16 x float> %1
+}
+
+define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
+; X32-LABEL: test_mm512_shuffle_f64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_f64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ ret <8 x double> %shuffle
+}
+
+define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; X32-LABEL: test_mm512_mask_shuffle_f64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_f64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
+ ret <8 x double> %1
+}
+
+define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; X32-LABEL: test_mm512_maskz_shuffle_f64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
+ ret <8 x double> %1
+}
+
+define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_shuffle_i32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_i32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_mask_shuffle_i32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_i32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ %0 = bitcast <8 x i64> %shuffle to <16 x i32>
+ %1 = bitcast <8 x i64> %__W to <16 x i32>
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+ %4 = bitcast <16 x i32> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_maskz_shuffle_i32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ %0 = bitcast <8 x i64> %shuffle to <16 x i32>
+ %1 = bitcast i16 %__U to <16 x i1>
+ %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_shuffle_i64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_shuffle_i64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ ret <8 x i64> %shuffle
+}
+
+define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_mask_shuffle_i64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_shuffle_i64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; X32-LABEL: test_mm512_maskz_shuffle_i64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
+ ret <8 x i64> %1
+}
+
+
+define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_testn_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmd %zmm0, %zmm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_testn_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
+ %1 = icmp eq <16 x i32> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_testn_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_testn_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
+ %1 = icmp eq <16 x i32> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_testn_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmq %zmm0, %zmm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_testn_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
+ %1 = bitcast <8 x i1> %0 to i8
+ ret i8 %1
+}
+
+define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_testn_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_testn_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = and <8 x i1> %0, %1
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_test_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_test_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
+ %1 = icmp ne <16 x i32> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_test_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_test_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = and <8 x i1> %0, %1
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastd %eax, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %0 = bitcast <8 x i64> %__O to <16 x i32>
+ %1 = bitcast i16 %__M to <16 x i1>
+ %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd %esi, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %0 = bitcast i16 %__M to <16 x i1>
+ %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
+ %2 = bitcast <16 x i32> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vmovd %edx, %xmm1
+; X32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %0 = bitcast i8 %__M to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vmovd %edx, %xmm0
+; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %rsi, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %0 = bitcast i8 %__M to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
+ ret <8 x i64> %1
+}
+
+
define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm512_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %zmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -22,14 +560,13 @@ define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm512_mask_broadcastd_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
@@ -44,14 +581,13 @@ define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x
define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastd_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
@@ -65,12 +601,12 @@ define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm512_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %zmm0
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
@@ -79,14 +615,14 @@ define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm512_mask_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
@@ -98,14 +634,14 @@ define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i
define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
@@ -117,12 +653,12 @@ define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm512_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %zmm0
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
@@ -131,14 +667,14 @@ define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm512_mask_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
@@ -150,14 +686,14 @@ define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2
define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
@@ -169,12 +705,12 @@ define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm512_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %zmm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
@@ -183,14 +719,13 @@ define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm512_mask_broadcastss_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
@@ -202,14 +737,13 @@ define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <
define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastss_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
@@ -221,12 +755,12 @@ define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
define <8 x double> @test_mm512_movddup_pd(<8 x double> %a0) {
; X32-LABEL: test_mm512_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -235,14 +769,14 @@ define <8 x double> @test_mm512_movddup_pd(<8 x double> %a0) {
define <8 x double> @test_mm512_mask_movddup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X32-LABEL: test_mm512_mask_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -254,14 +788,14 @@ define <8 x double> @test_mm512_mask_movddup_pd(<8 x double> %a0, i8 %a1, <8 x d
define <8 x double> @test_mm512_maskz_movddup_pd(i8 %a0, <8 x double> %a1) {
; X32-LABEL: test_mm512_maskz_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -273,12 +807,12 @@ define <8 x double> @test_mm512_maskz_movddup_pd(i8 %a0, <8 x double> %a1) {
define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; X32-LABEL: test_mm512_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
%res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -287,14 +821,13 @@ define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_mask_movehdup_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
@@ -306,14 +839,13 @@ define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16
define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_maskz_movehdup_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
@@ -325,12 +857,12 @@ define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; X32-LABEL: test_mm512_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
%res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -339,14 +871,13 @@ define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_mask_moveldup_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
@@ -358,14 +889,13 @@ define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16
define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_maskz_moveldup_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
@@ -377,12 +907,12 @@ define <16 x float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) {
define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
; X32-LABEL: test_mm512_permute_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_permute_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
%res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -391,14 +921,14 @@ define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) {
define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X32-LABEL: test_mm512_mask_permute_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -410,14 +940,14 @@ define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x d
define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
; X32-LABEL: test_mm512_maskz_permute_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -429,12 +959,12 @@ define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) {
define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
; X32-LABEL: test_mm512_permute_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_permute_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
%res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
@@ -443,14 +973,13 @@ define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) {
define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_mask_permute_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permute_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
@@ -462,14 +991,13 @@ define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x
define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_maskz_permute_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permute_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12]
; X64-NEXT: retq
@@ -481,13 +1009,13 @@ define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) {
define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
; X32-LABEL: test_mm512_permutex_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X32: # %bb.0:
+; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_permutex_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
+; X64: # %bb.0:
+; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
%res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i64> %res
@@ -495,14 +1023,14 @@ define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) {
define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_mask_permutex_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
@@ -514,14 +1042,14 @@ define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64
define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_permutex_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
@@ -533,12 +1061,12 @@ define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) {
define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
; X32-LABEL: test_mm512_permutex_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_permutex_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
%res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -547,14 +1075,14 @@ define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) {
define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X32-LABEL: test_mm512_mask_permutex_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_permutex_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
@@ -566,14 +1094,14 @@ define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x
define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
; X32-LABEL: test_mm512_maskz_permutex_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_permutex_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
; X64-NEXT: retq
@@ -585,13 +1113,13 @@ define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) {
define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
; X32-LABEL: test_mm512_shuffle_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X32: # %bb.0:
+; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_shuffle_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; X64: # %bb.0:
+; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <16 x i32>
%res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
@@ -601,14 +1129,13 @@ define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) {
define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_mask_shuffle_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
@@ -623,14 +1150,13 @@ define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64
define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_shuffle_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; X64-NEXT: retq
@@ -644,12 +1170,12 @@ define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) {
define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
; X32-LABEL: test_mm512_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
%res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -658,14 +1184,14 @@ define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) {
define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X32-LABEL: test_mm512_mask_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
@@ -677,14 +1203,14 @@ define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x d
define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X32-LABEL: test_mm512_maskz_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
@@ -696,13 +1222,13 @@ define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x
define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpackhi_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X32: # %bb.0:
+; X32-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpackhi_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64: # %bb.0:
+; X64-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <16 x i32>
%arg1 = bitcast <8 x i64> %a1 to <16 x i32>
@@ -713,14 +1239,13 @@ define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
@@ -736,14 +1261,13 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i6
define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
@@ -758,13 +1282,13 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x i64> %a1, <8 x i
define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpackhi_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X32: # %bb.0:
+; X32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpackhi_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64: # %bb.0:
+; X64-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
%res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
ret <8 x i64> %res
@@ -772,14 +1296,14 @@ define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
@@ -791,14 +1315,14 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64
define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
@@ -810,12 +1334,12 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i6
define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) {
; X32-LABEL: test_mm512_unpackhi_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpackhi_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
%res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -824,14 +1348,14 @@ define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1)
define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7]
; X64-NEXT: retq
@@ -843,14 +1367,14 @@ define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x
define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; X64-NEXT: retq
@@ -862,12 +1386,12 @@ define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x
define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_unpackhi_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpackhi_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
%res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -876,14 +1400,13 @@ define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1)
define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15]
; X64-NEXT: retq
@@ -895,14 +1418,13 @@ define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16
define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; X64-NEXT: retq
@@ -914,13 +1436,13 @@ define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16
define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpacklo_epi32:
-; X32: # BB#0:
-; X32-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32: # %bb.0:
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpacklo_epi32:
-; X64: # BB#0:
-; X64-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64: # %bb.0:
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <16 x i32>
%arg1 = bitcast <8 x i64> %a1 to <16 x i32>
@@ -931,14 +1453,13 @@ define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
@@ -954,14 +1475,13 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i6
define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_epi32:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
@@ -976,13 +1496,13 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i
define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpacklo_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X32: # %bb.0:
+; X32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpacklo_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64: # %bb.0:
+; X64-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
%res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
ret <8 x i64> %res
@@ -990,14 +1510,14 @@ define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
@@ -1009,14 +1529,14 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64
define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
@@ -1028,12 +1548,12 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i6
define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) {
; X32-LABEL: test_mm512_unpacklo_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpacklo_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
%res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1042,14 +1562,14 @@ define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1)
define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6]
; X64-NEXT: retq
@@ -1061,14 +1581,14 @@ define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x
define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; X64-NEXT: retq
@@ -1080,12 +1600,12 @@ define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x
define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) {
; X32-LABEL: test_mm512_unpacklo_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpacklo_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
%res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -1094,14 +1614,13 @@ define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1)
define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13]
; X64-NEXT: retq
@@ -1113,14 +1632,13 @@ define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16
define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_ps:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovw %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
@@ -1132,21 +1650,13 @@ define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16
define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm512_zextpd128_pd512:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm2
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_zextpd128_pd512:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm2
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
ret <8 x double> %res
@@ -1154,17 +1664,13 @@ define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind {
define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
; X32-LABEL: test_mm512_zextpd256_pd512:
-; X32: # BB#0:
-; X32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_zextpd256_pd512:
-; X64: # BB#0:
-; X64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %res
@@ -1172,21 +1678,17 @@ define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind {
define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm512_zextps128_ps512:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm2
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %xmm0, %xmm0
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_zextps128_ps512:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm2
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm0
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <16 x float> %res
@@ -1194,17 +1696,13 @@ define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind {
define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm512_zextps256_ps512:
-; X32: # BB#0:
-; X32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_zextps256_ps512:
-; X64: # BB#0:
-; X64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %res
@@ -1212,21 +1710,13 @@ define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind {
define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm512_zextsi128_si512:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
-; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X32-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_zextsi128_si512:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
-; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X64-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
ret <8 x i64> %res
@@ -1234,17 +1724,13 @@ define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind {
define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind {
; X32-LABEL: test_mm512_zextsi256_si512:
-; X32: # BB#0:
-; X32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_zextsi256_si512:
-; X64: # BB#0:
-; X64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %ymm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %res
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 86902ac926a0..f3ca0644e463 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,11 +1,65 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+define i16 @unpckbw_test(i16 %a0, i16 %a1) {
+; CHECK-LABEL: unpckbw_test:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: shll $8, %esi
+; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastd %edi, %zmm1
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+ }
+declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
+
+
+define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm1
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
+
+
declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
@@ -26,7 +80,7 @@ declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8
define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
@@ -47,7 +101,7 @@ declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1}
@@ -67,7 +121,7 @@ declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1}
@@ -87,7 +141,7 @@ declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x floa
define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
@@ -107,7 +161,7 @@ declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x floa
define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
@@ -127,7 +181,7 @@ declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
@@ -147,7 +201,7 @@ declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x d
define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
@@ -167,7 +221,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
@@ -185,7 +239,7 @@ define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8
define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_store1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovups %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovups %zmm0, (%rsi)
@@ -199,7 +253,7 @@ declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_store2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovupd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovupd %zmm0, (%rsi)
@@ -213,7 +267,7 @@ declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
; CHECK-LABEL: test_mask_store_aligned_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovaps %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovaps %zmm0, (%rsi)
@@ -227,7 +281,7 @@ declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
; CHECK-LABEL: test_mask_store_aligned_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovapd %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovapd %zmm0, (%rsi)
@@ -241,7 +295,7 @@ declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu64 %zmm0, (%rsi)
@@ -255,7 +309,7 @@ declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqu32 %zmm0, (%rsi)
@@ -269,7 +323,7 @@ declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi)
@@ -283,7 +337,7 @@ declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)
define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqa32 %zmm0, (%rdi) {%k1}
; CHECK-NEXT: vmovdqa32 %zmm0, (%rsi)
@@ -297,7 +351,7 @@ declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)
define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
@@ -315,7 +369,7 @@ declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
@@ -333,7 +387,7 @@ declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
@@ -351,7 +405,7 @@ declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovupd (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
@@ -371,7 +425,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
@@ -389,7 +443,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
@@ -407,7 +461,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)
define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
; CHECK-LABEL: test_mask_load_aligned_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1}
@@ -425,7 +479,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)
define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1}
@@ -443,7 +497,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8
define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
@@ -463,7 +517,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16
define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
@@ -483,7 +537,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
@@ -501,10 +555,10 @@ define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1,
define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
ret i16 %res
@@ -512,11 +566,11 @@ define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret i16 %res
@@ -526,10 +580,10 @@ declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
ret i8 %res
@@ -537,11 +591,11 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret i8 %res
@@ -551,10 +605,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
ret i16 %res
@@ -562,11 +616,11 @@ define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
ret i16 %res
@@ -576,10 +630,10 @@ declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
ret i8 %res
@@ -587,11 +641,11 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret i8 %res
@@ -603,7 +657,7 @@ declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x doub
define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
@@ -619,7 +673,7 @@ declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x flo
define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
@@ -635,7 +689,7 @@ declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x doub
define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -651,7 +705,7 @@ declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x flo
define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -667,7 +721,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -687,7 +741,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckhqdq {{.*#+}} zmm2 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
@@ -703,7 +757,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
@@ -719,7 +773,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -733,7 +787,7 @@ define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x
define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
@@ -742,7 +796,7 @@ define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -753,7 +807,7 @@ define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1,
define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -765,7 +819,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i1
define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
@@ -774,7 +828,7 @@ define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -785,7 +839,7 @@ define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8
define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -797,7 +851,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) n
define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
@@ -806,7 +860,7 @@ define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -817,7 +871,7 @@ define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1,
define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -829,7 +883,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i1
define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
@@ -838,7 +892,7 @@ define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -849,7 +903,7 @@ define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8
define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -861,7 +915,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) n
define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
@@ -870,7 +924,7 @@ define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -881,7 +935,7 @@ define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1,
define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -893,7 +947,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i1
define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
@@ -902,7 +956,7 @@ define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -913,7 +967,7 @@ define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8
define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -927,7 +981,7 @@ declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)
define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
@@ -938,7 +992,7 @@ declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)
define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
@@ -949,7 +1003,7 @@ declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)
define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
; CHECK-LABEL: test_storent_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntps %zmm0, (%rdi)
; CHECK-NEXT: retq
call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
@@ -958,8 +1012,8 @@ define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_xor_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
ret < 16 x i32> %res
@@ -967,7 +1021,7 @@ define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -980,8 +1034,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16
define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_or_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
ret < 16 x i32> %res
@@ -989,7 +1043,7 @@ define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1002,8 +1056,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x
define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_and_epi32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
ret < 16 x i32> %res
@@ -1011,7 +1065,7 @@ define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1024,7 +1078,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16
define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_xor_epi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
@@ -1033,7 +1087,7 @@ define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1046,7 +1100,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i6
define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_or_epi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
@@ -1055,7 +1109,7 @@ define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1068,7 +1122,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64
define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_and_epi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
@@ -1077,7 +1131,7 @@ define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1090,7 +1144,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i6
define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
@@ -1099,7 +1153,7 @@ define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1110,7 +1164,7 @@ define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i
define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1120,7 +1174,7 @@ define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %m
define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -1130,7 +1184,7 @@ define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1142,7 +1196,7 @@ define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <1
define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1153,7 +1207,7 @@ define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i
define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i32, i32* %ptr_b
@@ -1165,7 +1219,7 @@ define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1179,7 +1233,7 @@ define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i3
define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1194,7 +1248,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16
define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_sub_epi32_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
@@ -1203,7 +1257,7 @@ define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1214,7 +1268,7 @@ define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i
define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1224,7 +1278,7 @@ define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %m
define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -1234,7 +1288,7 @@ define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1246,7 +1300,7 @@ define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <1
define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1257,7 +1311,7 @@ define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i
define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i32, i32* %ptr_b
@@ -1269,7 +1323,7 @@ define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1283,7 +1337,7 @@ define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i3
define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1298,7 +1352,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16
define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_add_epi64_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
@@ -1307,7 +1361,7 @@ define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1318,7 +1372,7 @@ define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1328,7 +1382,7 @@ define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask)
define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -1338,7 +1392,7 @@ define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1350,7 +1404,7 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1361,7 +1415,7 @@ define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %
define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi64_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -1373,7 +1427,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1387,7 +1441,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi64_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1402,7 +1456,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i6
define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_sub_epi64_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
@@ -1411,7 +1465,7 @@ define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1422,7 +1476,7 @@ define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64>
define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1432,7 +1486,7 @@ define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask)
define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -1442,7 +1496,7 @@ define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1454,7 +1508,7 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x
define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1465,7 +1519,7 @@ define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %
define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi64_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -1477,7 +1531,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1491,7 +1545,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1506,7 +1560,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i6
define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
@@ -1515,7 +1569,7 @@ define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1526,7 +1580,7 @@ define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1536,7 +1590,7 @@ define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -1546,7 +1600,7 @@ define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_
define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1558,7 +1612,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1569,7 +1623,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %pt
define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i32, i32* %ptr_b
@@ -1581,7 +1635,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1595,7 +1649,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <1
define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1608,11 +1662,81 @@ define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i
declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
+
+ %res3 = fadd <8 x double> %res, %res1
+ %res4 = fadd <8 x double> %res3, %res2
+ ret <8 x double> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
@@ -1633,7 +1757,7 @@ declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float
define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vshufps {{.*#+}} zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
@@ -1649,7 +1773,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16
define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
@@ -1665,7 +1789,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
@@ -1681,7 +1805,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16
define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxud %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxud %zmm1, %zmm0, %zmm2 {%k1}
@@ -1697,7 +1821,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
@@ -1713,7 +1837,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16
define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm2 {%k1}
@@ -1729,7 +1853,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1}
@@ -1745,7 +1869,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16
define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminud %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminud %zmm1, %zmm0, %zmm2 {%k1}
@@ -1761,7 +1885,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1}
@@ -1775,7 +1899,7 @@ define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %
define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; CHECK-LABEL: test_mm_mask_move_ss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -1787,7 +1911,7 @@ entry:
define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; CHECK-LABEL: test_mm_maskz_move_ss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1798,7 +1922,7 @@ entry:
define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; CHECK-LABEL: test_mm_mask_move_sd:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -1809,7 +1933,7 @@ entry:
define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; CHECK-LABEL: test_mm_maskz_move_sd:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1825,7 +1949,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i1
define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -1845,7 +1969,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
@@ -1865,7 +1989,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
@@ -1885,7 +2009,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i
define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
@@ -1905,7 +2029,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -1925,7 +2049,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i1
define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbd %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1}
@@ -1945,7 +2069,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1}
@@ -1965,7 +2089,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1}
@@ -1986,7 +2110,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i
define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwd %ymm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1}
@@ -2007,7 +2131,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwq %xmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1}
@@ -2027,7 +2151,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1}
@@ -2047,7 +2171,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1}
@@ -2067,7 +2191,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad $3, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsrad $3, %zmm0, %zmm1 {%k1}
@@ -2087,7 +2211,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsraq $3, %zmm0, %zmm1 {%k1}
@@ -2107,7 +2231,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld $3, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpslld $3, %zmm0, %zmm1 {%k1}
@@ -2127,7 +2251,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm2
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpsllq $3, %zmm0, %zmm1 {%k1}
@@ -2145,7 +2269,7 @@ define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8
define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
@@ -2154,7 +2278,7 @@ define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2165,7 +2289,7 @@ define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <1
define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2177,7 +2301,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32
define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
@@ -2186,7 +2310,7 @@ define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2197,7 +2321,7 @@ define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2209,7 +2333,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>,
define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
@@ -2218,7 +2342,7 @@ define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2229,7 +2353,7 @@ define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <1
define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2241,7 +2365,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32
define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
@@ -2250,7 +2374,7 @@ define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2261,7 +2385,7 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2273,7 +2397,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>,
define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
@@ -2282,7 +2406,7 @@ define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2293,7 +2417,7 @@ define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <1
define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2305,7 +2429,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32
define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
@@ -2314,7 +2438,7 @@ define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2325,7 +2449,7 @@ define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x
define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2337,7 +2461,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>,
define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
@@ -2346,7 +2470,7 @@ define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2357,7 +2481,7 @@ define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1,
define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2369,7 +2493,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i
define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
@@ -2378,7 +2502,7 @@ define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2389,7 +2513,7 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2402,7 +2526,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>,
define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
@@ -2411,7 +2535,7 @@ define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2422,7 +2546,7 @@ define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1,
define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2434,7 +2558,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i
define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
@@ -2443,7 +2567,7 @@ define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2454,7 +2578,7 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2466,7 +2590,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>,
define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
@@ -2475,7 +2599,7 @@ define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2486,7 +2610,7 @@ define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1,
define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2498,7 +2622,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i
define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
@@ -2507,7 +2631,7 @@ define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2518,7 +2642,7 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8
define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2530,7 +2654,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>,
define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <8 x i64>, <8 x i64>* %ptr
@@ -2542,7 +2666,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>,
define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1}
@@ -2558,7 +2682,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>
define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1}
@@ -2572,7 +2696,7 @@ define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_valign_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: valignq {{.*#+}} zmm0 = zmm1[2,3,4,5,6,7],zmm0[0,1]
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
@@ -2581,7 +2705,7 @@ define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
; CHECK-LABEL: test_mask_valign_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: valignq {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7],zmm0[0,1]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2594,7 +2718,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32,
define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_maskz_valign_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4]
; CHECK-NEXT: retq
@@ -2608,7 +2732,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
@@ -2628,7 +2752,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
@@ -2647,7 +2771,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0,
; Test case to make sure we can print shuffle decode comments for constant pool loads.
define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
@@ -2665,7 +2789,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16
define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
@@ -2674,7 +2798,7 @@ define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2685,7 +2809,7 @@ define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2695,7 +2819,7 @@ define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mas
define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -2705,7 +2829,7 @@ define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2717,7 +2841,7 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2728,7 +2852,7 @@ define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8
define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -2741,7 +2865,7 @@ define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2756,7 +2880,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2772,7 +2896,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x
define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
@@ -2781,7 +2905,7 @@ define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2792,7 +2916,7 @@ define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64
define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2802,7 +2926,7 @@ define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mas
define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -2812,7 +2936,7 @@ define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2824,7 +2948,7 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8
define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2835,7 +2959,7 @@ define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8
define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -2848,7 +2972,7 @@ define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2863,7 +2987,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64>
define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2879,25 +3003,19 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8
define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextractf32x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovd %edi, %xmm2
; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kshiftlw $12, %k0, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: kshiftlw $13, %k0, %k2
-; CHECK-NEXT: kshiftrw $15, %k2, %k2
-; CHECK-NEXT: kshiftlw $15, %k0, %k3
-; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kshiftlw $14, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: kmovw %k3, %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm2
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; CHECK-NEXT: kmovw %k2, %eax
-; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kshiftrw $3, %k0, %k1
; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kshiftrw $2, %k0, %k1
+; CHECK-NEXT: kmovw %k1, %ecx
+; CHECK-NEXT: kshiftrw $1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -2909,24 +3027,18 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <
define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
; CHECK-LABEL: test_mask_vextracti64x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT: vmovd %edi, %xmm2
; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kshiftlw $12, %k0, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: kshiftlw $13, %k0, %k2
-; CHECK-NEXT: kshiftrw $15, %k2, %k2
-; CHECK-NEXT: kshiftlw $15, %k0, %k3
-; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kshiftlw $14, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: kmovw %k3, %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm2
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; CHECK-NEXT: kmovw %k2, %eax
-; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; CHECK-NEXT: kshiftrw $3, %k0, %k1
; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kshiftrw $2, %k0, %k1
+; CHECK-NEXT: kmovw %k1, %ecx
+; CHECK-NEXT: kshiftrw $1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: vpinsrb $4, %edx, %xmm2, %xmm2
+; CHECK-NEXT: vpinsrb $8, %ecx, %xmm2, %xmm2
; CHECK-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpslld $31, %xmm2, %xmm2
; CHECK-NEXT: vpmovsxdq %xmm2, %ymm2
@@ -2940,25 +3052,19 @@ declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i
define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
; CHECK-LABEL: test_maskz_vextracti32x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovd %edi, %xmm1
; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kshiftlw $12, %k0, %k1
-; CHECK-NEXT: kshiftrw $15, %k1, %k1
-; CHECK-NEXT: kshiftlw $13, %k0, %k2
-; CHECK-NEXT: kshiftrw $15, %k2, %k2
-; CHECK-NEXT: kshiftlw $15, %k0, %k3
-; CHECK-NEXT: kshiftrw $15, %k3, %k3
-; CHECK-NEXT: kshiftlw $14, %k0, %k0
-; CHECK-NEXT: kshiftrw $15, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: kmovw %k3, %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm1
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; CHECK-NEXT: kmovw %k2, %eax
-; CHECK-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; CHECK-NEXT: kshiftrw $3, %k0, %k1
; CHECK-NEXT: kmovw %k1, %eax
+; CHECK-NEXT: kshiftrw $2, %k0, %k1
+; CHECK-NEXT: kmovw %k1, %ecx
+; CHECK-NEXT: kshiftrw $1, %k0, %k0
+; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; CHECK-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; CHECK-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm0
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vpsrad $31, %xmm1, %xmm1
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -2971,7 +3077,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x
define <4 x double> @test_vextractf64x4(<8 x double> %a) {
; CHECK-LABEL: test_vextractf64x4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1)
@@ -2984,8 +3090,8 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x fl
define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
@@ -3005,8 +3111,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
@@ -3026,7 +3132,7 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x do
define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
@@ -3046,7 +3152,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i3
define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
@@ -3064,7 +3170,7 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6
define <8 x i64> @test_x86_avx512_movntdqa(i8* %a0) {
; CHECK-LABEL: test_x86_avx512_movntdqa:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovntdqa (%rdi), %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.movntdqa(i8* %a0)
@@ -3075,7 +3181,7 @@ declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*) nounwind readonly
define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
; CHECK-NEXT: vpcmpled %zmm1, %zmm0, %k2
@@ -3120,7 +3226,7 @@ define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k1}
@@ -3168,7 +3274,7 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) no
define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleud %zmm1, %zmm0, %k2
@@ -3213,7 +3319,7 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: vpcmpltud %zmm1, %zmm0, %k2 {%k1}
@@ -3261,7 +3367,7 @@ declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) n
define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k2
@@ -3306,7 +3412,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: vpcmpgtq %zmm0, %zmm1, %k2 {%k1}
@@ -3354,7 +3460,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwi
define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleuq %zmm1, %zmm0, %k2
@@ -3399,7 +3505,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; CHECK-NEXT: vpcmpltuq %zmm1, %zmm0, %k2 {%k1}
@@ -3444,3 +3550,247 @@ define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
}
declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
+
+declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
+ %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res1, %res2
+ %res5 = fadd <16 x float> %res3, %res4
+ ret <16 x float> %res5
+}
+
+define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %x0 = load <4 x float>, <4 x float>* %x0ptr
+ %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
+ %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res1, %res2
+ %res5 = fadd <8 x double> %res3, %res4
+ ret <8 x double> %res5
+}
+
+define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+
+ %x0 = load <4 x double>, <4 x double>* %x0ptr
+ %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
+ ret <8 x double> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
+ %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res1, %res2
+ %res5 = add <16 x i32> %res3, %res4
+ ret <16 x i32> %res5
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+
+ %x0 = load <4 x i32>, <4 x i32>* %x0ptr
+ %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res1, %res2
+ %res5 = add <8 x i64> %res3, %res4
+ ret <8 x i64> %res5
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+
+ %x0 = load <4 x i64>, <4 x i64>* %x0ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsd %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpabsd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsq %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpabsq %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+
+define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
+; CHECK-LABEL: test_vptestmq:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
+ %res2 = add i8 %res1, %res
+ ret i8 %res2
+}
+declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
+
+define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
+; CHECK-LABEL: test_vptestmd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
+ %res2 = add i16 %res1, %res
+ ret i16 %res2
+}
+declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
+
+declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2)
+
+define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k1, %ecx
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT: kmovw %k1, %ecx
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addb %cl, %al
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5a41b906008b..5faa202c30f3 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4,7 +4,7 @@
declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
define i32 @test_kortestz(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kortestz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: xorl %eax, %eax
@@ -18,7 +18,7 @@ define i32 @test_kortestz(i16 %a0, i16 %a1) {
declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
define i32 @test_kortestc(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kortestc:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: xorl %eax, %eax
@@ -32,7 +32,7 @@ define i32 @test_kortestc(i16 %a0, i16 %a1) {
declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
define i16 @test_kand(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kand:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
@@ -40,7 +40,7 @@ define i16 @test_kand(i16 %a0, i16 %a1) {
; CHECK-NEXT: kandw %k0, %k1, %k0
; CHECK-NEXT: kandw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
%t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
@@ -50,7 +50,7 @@ define i16 @test_kand(i16 %a0, i16 %a1) {
declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone
define i16 @test_kandn(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kandn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
@@ -58,7 +58,7 @@ define i16 @test_kandn(i16 %a0, i16 %a1) {
; CHECK-NEXT: kandnw %k2, %k1, %k1
; CHECK-NEXT: kandnw %k0, %k1, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
%t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1)
@@ -68,11 +68,11 @@ define i16 @test_kandn(i16 %a0, i16 %a1) {
declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
define i16 @test_knot(i16 %a0) {
; CHECK-LABEL: test_knot:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: knotw %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
ret i16 %res
@@ -81,7 +81,7 @@ define i16 @test_knot(i16 %a0) {
declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone
define i16 @test_kor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kor:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
@@ -89,44 +89,27 @@ define i16 @test_kor(i16 %a0, i16 %a1) {
; CHECK-NEXT: korw %k0, %k1, %k0
; CHECK-NEXT: korw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8)
%t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1)
ret i16 %t2
}
-declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
-
-define i16 @unpckbw_test(i16 %a0, i16 %a1) {
-; CHECK-LABEL: unpckbw_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: kunpckbw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
- ret i16 %res
-}
-
declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
; TODO: the two kxnor instructions here are a no-op and should be eliminated,
; probably by FoldConstantArithmetic in SelectionDAG.
define i16 @test_kxnor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kxnor:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
; CHECK-NEXT: kmovw %eax, %k2
; CHECK-NEXT: kxorw %k0, %k1, %k0
; CHECK-NEXT: kxorw %k0, %k2, %k0
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: kxnorw %k1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
%t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
@@ -136,7 +119,7 @@ define i16 @test_kxnor(i16 %a0, i16 %a1) {
declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone
define i16 @test_kxor(i16 %a0, i16 %a1) {
; CHECK-LABEL: test_kxor:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: movw $8, %ax
@@ -144,7 +127,7 @@ define i16 @test_kxor(i16 %a0, i16 %a1) {
; CHECK-NEXT: kxorw %k0, %k1, %k0
; CHECK-NEXT: kxorw %k0, %k2, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
%t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)
@@ -153,7 +136,7 @@ define i16 @test_kxor(i16 %a0, i16 %a1) {
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrcp14ps %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
@@ -163,7 +146,7 @@ declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i
define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrcp14pd %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
@@ -171,11 +154,97 @@ define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
}
declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
+
+define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_rndscale_sd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_rndscale_sd_mask:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, <2 x double>* %bptr, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_rndscale_sd_mask_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vrndscalesd $11, (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %b = load <2 x double>, <2 x double>* %bptr
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) {
+; CHECK-LABEL: test_rndscale_sd_maskz:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrndscalesd $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4)
+ ret <2 x double>%res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
+
+define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_rndscale_ss:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_rndscale_ss_load(<4 x float> %a, <4 x float>* %bptr) {
+; CHECK-LABEL: test_rndscale_ss_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrndscaless $11, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %b = load <4 x float>, <4 x float>* %bptr
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_rndscale_ss_mask:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
+define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) {
+; CHECK-LABEL: test_rndscale_ss_maskz:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vrndscaless $11, %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4)
+ ret <4 x float>%res
+}
+
declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
define <8 x double> @test7(<8 x double> %a) {
; CHECK-LABEL: test7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrndscalepd $11, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
@@ -186,7 +255,7 @@ declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <1
define <16 x float> @test8(<16 x float> %a) {
; CHECK-LABEL: test8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrndscaleps $11, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
@@ -195,7 +264,7 @@ define <16 x float> @test8(<16 x float> %a) {
define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrsqrt14ps %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
@@ -205,7 +274,7 @@ declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>,
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_sqrt_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
@@ -215,7 +284,7 @@ declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double
define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsqrtps %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
@@ -223,7 +292,7 @@ define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
}
define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_sqrt_round_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsqrtps {rz-sae}, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 3)
@@ -233,7 +302,7 @@ declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float
define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexppd %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
@@ -241,7 +310,7 @@ define <8 x double> @test_getexp_pd_512(<8 x double> %a0) {
}
define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_getexp_round_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexppd {sae}, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
@@ -251,7 +320,7 @@ declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x doub
define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexpps %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
@@ -260,7 +329,7 @@ define <16 x float> @test_getexp_ps_512(<16 x float> %a0) {
define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_getexp_round_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexpps {sae}, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
@@ -272,7 +341,7 @@ declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
@@ -298,7 +367,7 @@ declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <
define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_sqrt_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
@@ -322,7 +391,7 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsd2si %xmm0, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
@@ -332,7 +401,7 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
@@ -342,7 +411,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn
define i64 @test_x86_avx512_cvttsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttsd2si %xmm0, %rcx
; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
@@ -356,7 +425,7 @@ declare i64 @llvm.x86.avx512.cvttsd2si64(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttsd2usi %xmm0, %ecx
; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
@@ -370,7 +439,7 @@ declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttsd2si %xmm0, %ecx
; CHECK-NEXT: vcvttsd2si {sae}, %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
@@ -386,7 +455,7 @@ declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvttsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttsd2usi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttsd2usi %xmm0, %rcx
; CHECK-NEXT: vcvttsd2usi {sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
@@ -400,7 +469,7 @@ declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse_cvtss2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtss2si %xmm0, %rax
; CHECK-NEXT: retq
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
@@ -411,7 +480,7 @@ declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse_cvtsi642ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
@@ -422,7 +491,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %ecx
; CHECK-NEXT: vcvttss2si %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
@@ -436,7 +505,7 @@ declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvttss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttss2si %xmm0, %rcx
; CHECK-NEXT: vcvttss2si {sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
@@ -450,7 +519,7 @@ declare i64 @llvm.x86.avx512.cvttss2si64(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %ecx
; CHECK-NEXT: vcvttss2usi %xmm0, %eax
; CHECK-NEXT: addl %ecx, %eax
@@ -464,7 +533,7 @@ declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvttss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvttss2usi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttss2usi %xmm0, %rcx
; CHECK-NEXT: vcvttss2usi {sae}, %xmm0, %rax
; CHECK-NEXT: addq %rcx, %rax
@@ -478,7 +547,7 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsd2usi %xmm0, %rax
; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx
; CHECK-NEXT: addq %rax, %rcx
@@ -497,7 +566,7 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsd2si %xmm0, %rax
; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx
; CHECK-NEXT: addq %rax, %rcx
@@ -516,7 +585,7 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtss2usi %xmm0, %rax
; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx
; CHECK-NEXT: addq %rax, %rcx
@@ -535,7 +604,7 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone
define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtss2si %xmm0, %rax
; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx
; CHECK-NEXT: addq %rax, %rcx
@@ -554,7 +623,7 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2usi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
@@ -573,7 +642,7 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsd2si %xmm0, %eax
; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
@@ -592,7 +661,7 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtss2usi %xmm0, %eax
; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
@@ -611,7 +680,7 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx512_cvtss2si32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtss2si %xmm0, %eax
; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
; CHECK-NEXT: addl %eax, %ecx
@@ -630,7 +699,7 @@ declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
@@ -639,7 +708,7 @@ define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
@@ -648,7 +717,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) {
define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtph2ps %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vmovaps %zmm1, %zmm0
@@ -659,7 +728,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1,
define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_sae_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtph2ps {sae}, %ymm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -669,7 +738,7 @@ define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask)
define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_512_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtph2ps %ymm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -681,7 +750,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float
define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, <16 x i16> * %dst) {
; CHECK-LABEL: test_x86_vcvtps2ph_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
@@ -701,7 +770,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x
define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
; CHECK-LABEL: test_x86_vbroadcast_ss_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
@@ -711,7 +780,7 @@ declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly
define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
; CHECK-LABEL: test_x86_vbroadcast_sd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
@@ -721,10 +790,10 @@ declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_cmpps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
ret i16 %res
@@ -733,10 +802,10 @@ declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: test_cmppd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
ret i8 %res
@@ -746,7 +815,7 @@ declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
; fp min - max
define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vmaxpd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -758,7 +827,7 @@ declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>
define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
; CHECK-LABEL: test_vminpd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -768,77 +837,9 @@ define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
<8 x double>, i8, i32)
- declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpabsd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpabsd %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vpabsq %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vpabsq %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
-
-define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
-; CHECK-LABEL: test_vptestmq:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vptestmq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
- %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
- %res2 = add i8 %res1, %res
- ret i8 %res2
-}
-declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
-
-define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
-; CHECK-LABEL: test_vptestmd:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vptestmd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
- %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
- %res2 = add i16 %res1, %res
- ret i16 %res2
-}
-declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
-
define void @test_mask_store_ss(i8* %ptr, <4 x float> %data, i8 %mask) {
; CHECK-LABEL: test_mask_store_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovss %xmm0, (%rdi) {%k1}
; CHECK-NEXT: retq
@@ -854,7 +855,7 @@ declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>
define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -864,7 +865,7 @@ define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -874,7 +875,7 @@ define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -884,7 +885,7 @@ define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vsubps_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -894,7 +895,7 @@ define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -904,7 +905,7 @@ define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -914,7 +915,7 @@ define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -924,7 +925,7 @@ define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test_vmulps_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
@@ -935,7 +936,7 @@ define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) {
;; mask float
define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -946,7 +947,7 @@ define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16
define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -957,7 +958,7 @@ define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16
define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -968,7 +969,7 @@ define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16
define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -980,7 +981,7 @@ define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16
;; With Passthru value
define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -992,7 +993,7 @@ define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float>
define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1004,7 +1005,7 @@ define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float>
define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1016,7 +1017,7 @@ define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float>
define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) {
; CHECK-LABEL: test_vmulps_mask_passthru_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1029,7 +1030,7 @@ define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float>
;; mask double
define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1040,7 +1041,7 @@ define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1051,7 +1052,7 @@ define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1062,7 +1063,7 @@ define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8
define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_vmulpd_mask_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1073,7 +1074,7 @@ define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8
define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mul_epi32_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
@@ -1082,7 +1083,7 @@ define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1095,7 +1096,7 @@ define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %pa
define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1107,7 +1108,7 @@ define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mul_epi32_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -1117,7 +1118,7 @@ define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1131,7 +1132,7 @@ define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64
define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1144,7 +1145,7 @@ define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mas
define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mul_epi32_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -1157,7 +1158,7 @@ define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1174,7 +1175,7 @@ define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %pas
define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epi32_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1192,7 +1193,7 @@ declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>)
define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_mul_epu32_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
@@ -1201,7 +1202,7 @@ define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1214,7 +1215,7 @@ define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %pa
define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1226,7 +1227,7 @@ define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
; CHECK-LABEL: test_mul_epu32_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -1236,7 +1237,7 @@ define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1250,7 +1251,7 @@ define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64
define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1263,7 +1264,7 @@ define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mas
define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mul_epu32_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0
; CHECK-NEXT: retq
%q = load i64, i64* %ptr_b
@@ -1276,7 +1277,7 @@ define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmbk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1293,7 +1294,7 @@ define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %pas
define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mul_epu32_rmbkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1311,7 +1312,7 @@ declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>)
define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1320,7 +1321,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16
}
define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1329,7 +1330,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16
}
define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1339,7 +1340,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16
define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1350,7 +1351,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16
define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1360,7 +1361,7 @@ define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1370,7 +1371,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x
}
define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1380,7 +1381,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x
}
define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1391,7 +1392,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x
define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1403,7 +1404,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x
define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_add_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1415,7 +1416,7 @@ define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
@@ -1423,7 +1424,7 @@ define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x floa
}
define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
@@ -1431,7 +1432,7 @@ define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x floa
}
define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
@@ -1440,7 +1441,7 @@ define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x floa
define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
@@ -1449,7 +1450,7 @@ define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x floa
define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_add_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
@@ -1459,7 +1460,7 @@ declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>
define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1469,7 +1470,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x
}
define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1479,7 +1480,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x
}
define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1490,7 +1491,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x
define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1502,7 +1503,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x
define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1513,7 +1514,7 @@ define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
@@ -1521,7 +1522,7 @@ define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x floa
}
define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
@@ -1529,7 +1530,7 @@ define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x floa
}
define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
@@ -1538,7 +1539,7 @@ define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x floa
define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
@@ -1547,7 +1548,7 @@ define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x floa
define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_sub_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
@@ -1556,7 +1557,7 @@ define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x flo
define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1565,7 +1566,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16
}
define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1574,7 +1575,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16
}
define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1584,7 +1585,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16
define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1595,7 +1596,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16
define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1605,7 +1606,7 @@ define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1615,7 +1616,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x
}
define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1625,7 +1626,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x
}
define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1636,7 +1637,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x
define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1648,7 +1649,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x
define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_div_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1660,7 +1661,7 @@ define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rn_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps {rn-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 0)
@@ -1668,7 +1669,7 @@ define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x floa
}
define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps {rd-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 1)
@@ -1676,7 +1677,7 @@ define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x floa
}
define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_ru_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps {ru-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 2)
@@ -1685,7 +1686,7 @@ define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x floa
define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_rz_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps {rz-sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 3)
@@ -1694,7 +1695,7 @@ define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x floa
define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_div_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
@@ -1704,7 +1705,7 @@ declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>
define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1714,7 +1715,7 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f
define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1724,7 +1725,7 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1735,7 +1736,7 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl
define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_min_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1746,7 +1747,7 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
@@ -1755,7 +1756,7 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float>
define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_min_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
@@ -1765,7 +1766,7 @@ declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>
define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1775,7 +1776,7 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f
define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1785,7 +1786,7 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1796,7 +1797,7 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl
define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) {
; CHECK-LABEL: test_mm512_mask_max_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -1807,7 +1808,7 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16
define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxps {sae}, %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
@@ -1816,7 +1817,7 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float>
define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) {
; CHECK-LABEL: test_mm512_max_round_ps_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
@@ -1828,7 +1829,7 @@ declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>,
define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -1839,7 +1840,7 @@ define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -1850,7 +1851,7 @@ define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -1861,7 +1862,7 @@ define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -1872,7 +1873,7 @@ define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x f
define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -1883,7 +1884,7 @@ define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <
define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1893,7 +1894,7 @@ define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %m
define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_add_ss_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddss {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 0)
@@ -1902,7 +1903,7 @@ define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_ss_current_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -1918,7 +1919,7 @@ define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1
define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss_current_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1935,7 +1936,7 @@ declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x doubl
define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -1946,7 +1947,7 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -1957,7 +1958,7 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_ru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -1968,7 +1969,7 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -1979,7 +1980,7 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -1990,7 +1991,7 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1
define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2000,7 +2001,7 @@ define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8
define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_add_sd_rn:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 0)
@@ -2009,7 +2010,7 @@ define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_add_sd_current_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
@@ -2023,7 +2024,7 @@ define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double*
define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_add_sd_current_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2038,7 +2039,7 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>,
define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2049,7 +2050,7 @@ define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x
define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2059,7 +2060,7 @@ define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %
define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxss {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
@@ -2068,7 +2069,7 @@ define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
@@ -2079,7 +2080,7 @@ define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x floa
define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2089,7 +2090,7 @@ define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask
define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_max_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
@@ -2098,7 +2099,7 @@ define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -2114,7 +2115,7 @@ define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x f
define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_ss_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2130,7 +2131,7 @@ declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x doubl
define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2141,7 +2142,7 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2
define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2151,7 +2152,7 @@ define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i
define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
@@ -2160,7 +2161,7 @@ define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vmovapd %xmm2, %xmm0
@@ -2171,7 +2172,7 @@ define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x d
define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2181,7 +2182,7 @@ define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %m
define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_max_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
@@ -2190,7 +2191,7 @@ define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_max_sd_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
@@ -2204,7 +2205,7 @@ define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2
define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
; CHECK-LABEL: test_maskz_max_sd_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -2217,7 +2218,7 @@ define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8
define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2sdq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double> %a, i64 %b, i32 3) ; <<<2 x double>> [#uses=1]
@@ -2227,7 +2228,7 @@ declare <2 x double> @llvm.x86.avx512.cvtsi2sd64(<2 x double>, i64, i32) nounwin
define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2ssl %edi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 3) ; <<<4 x float>> [#uses=1]
@@ -2237,7 +2238,7 @@ declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind
define <4 x float> @test_x86_avx512_cvtsi2ss64(<4 x float> %a, i64 %b) {
; CHECK-LABEL: test_x86_avx512_cvtsi2ss64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2ssq %rdi, {rz-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float> %a, i64 %b, i32 3) ; <<<4 x float>> [#uses=1]
@@ -2247,7 +2248,7 @@ declare <4 x float> @llvm.x86.avx512.cvtsi2ss64(<4 x float>, i64, i32) nounwind
define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2ssl %edi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2257,7 +2258,7 @@ define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b)
define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu32_ss_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl (%rdi), %eax
; CHECK-NEXT: vcvtusi2ssl %eax, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -2269,7 +2270,7 @@ define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, i32*
define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2279,7 +2280,7 @@ define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b)
define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, i32* %ptr)
; CHECK-LABEL: test_x86_avx512__mm_cvtu32_ss_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2ssl (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2291,7 +2292,7 @@ declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind r
define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvt_roundu64_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2ssq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2301,7 +2302,7 @@ define <4 x float> @_mm_cvt_roundu64_ss (<4 x float> %a, i64 %b)
define <4 x float> @_mm_cvtu64_ss(<4 x float> %a, i64 %b)
; CHECK-LABEL: _mm_cvtu64_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2312,7 +2313,7 @@ declare <4 x float> @llvm.x86.avx512.cvtusi642ss(<4 x float>, i64, i32) nounwind
define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu32_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2323,7 +2324,7 @@ declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind read
define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512_mm_cvtu64_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2sdq %rdi, {rd-sae}, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2333,7 +2334,7 @@ define <2 x double> @test_x86_avx512_mm_cvtu64_sd(<2 x double> %a, i64 %b)
define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b)
; CHECK-LABEL: test_x86_avx512__mm_cvt_roundu64_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtusi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT: retq
{
@@ -2346,7 +2347,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
@@ -2364,7 +2365,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm1, %zmm3
; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3
@@ -2381,7 +2382,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm3
@@ -2398,7 +2399,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
@@ -2415,7 +2416,7 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32
define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2
; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
@@ -2433,7 +2434,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmovapd %zmm1, %zmm2
; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
@@ -2453,7 +2454,7 @@ declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x
define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm1, %zmm3
; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
@@ -2471,7 +2472,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
@@ -2488,7 +2489,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
@@ -2504,7 +2505,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
@@ -2519,7 +2520,7 @@ define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8
declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vscalefps {ru-sae}, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vscalefps {rn-sae}, %zmm1, %zmm0, %zmm0
@@ -2535,7 +2536,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqb %zmm0, %xmm1 {%k1}
@@ -2555,7 +2556,7 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
@@ -2569,7 +2570,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqb %zmm0, %xmm1 {%k1}
@@ -2589,7 +2590,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
@@ -2603,7 +2604,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqb %zmm0, %xmm1 {%k1}
@@ -2623,7 +2624,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusqb %zmm0, (%rdi)
; CHECK-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
@@ -2637,7 +2638,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovqw %zmm0, %xmm1 {%k1}
@@ -2657,7 +2658,7 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
@@ -2671,7 +2672,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsqw %zmm0, %xmm1 {%k1}
@@ -2691,7 +2692,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
@@ -2705,7 +2706,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqw %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusqw %zmm0, %xmm1 {%k1}
@@ -2725,7 +2726,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusqw %zmm0, (%rdi)
; CHECK-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
@@ -2739,7 +2740,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovqd %zmm0, %ymm1 {%k1}
@@ -2759,7 +2760,7 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
@@ -2773,7 +2774,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovsqd %zmm0, %ymm1 {%k1}
@@ -2793,7 +2794,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
@@ -2807,7 +2808,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovusqd %zmm0, %ymm1 {%k1}
@@ -2827,7 +2828,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusqd %zmm0, (%rdi)
; CHECK-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
@@ -2841,7 +2842,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovdb %zmm0, %xmm1 {%k1}
@@ -2861,7 +2862,7 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32>, i16)
define void @test_int_x86_avx512_mask_pmov_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovdb %zmm0, (%rdi)
; CHECK-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
@@ -2875,7 +2876,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovsdb %zmm0, %xmm1 {%k1}
@@ -2895,7 +2896,7 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32>, i16)
define void @test_int_x86_avx512_mask_pmovs_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsdb %zmm0, (%rdi)
; CHECK-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
@@ -2909,7 +2910,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusdb %zmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpmovusdb %zmm0, %xmm1 {%k1}
@@ -2929,7 +2930,7 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32>, i16)
define void @test_int_x86_avx512_mask_pmovus_db_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusdb %zmm0, (%rdi)
; CHECK-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
@@ -2943,7 +2944,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16
define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovdw %zmm0, %ymm1 {%k1}
@@ -2963,7 +2964,7 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32>, i16)
define void @test_int_x86_avx512_mask_pmov_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovdw %zmm0, (%rdi)
; CHECK-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
@@ -2977,7 +2978,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovsdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovsdw %zmm0, %ymm1 {%k1}
@@ -2997,7 +2998,7 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32>, i16)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovsdw %zmm0, (%rdi)
; CHECK-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
@@ -3011,7 +3012,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i
define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpmovusdw %zmm0, %ymm2 {%k1} {z}
; CHECK-NEXT: vpmovusdw %zmm0, %ymm1 {%k1}
@@ -3031,7 +3032,7 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32>, i16)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(i8* %ptr, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vpmovusdw %zmm0, (%rdi)
; CHECK-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
@@ -3045,7 +3046,7 @@ declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>
define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtdq2ps {rn-sae}, %zmm0, %zmm0
@@ -3061,7 +3062,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2dq {rn-sae}, %zmm0, %ymm0
@@ -3077,7 +3078,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2ps {ru-sae}, %zmm0, %ymm0
@@ -3093,7 +3094,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtpd2udq {rn-sae}, %zmm0, %ymm0
@@ -3109,7 +3110,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2dq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2dq {rn-sae}, %zmm0, %zmm0
@@ -3125,7 +3126,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double
define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2pd %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2pd {sae}, %ymm0, %zmm0
@@ -3141,7 +3142,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2udq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2udq {rn-sae}, %zmm0, %zmm0
@@ -3157,7 +3158,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0
@@ -3173,7 +3174,7 @@ declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float
define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtudq2ps %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtudq2ps {rn-sae}, %zmm0, %zmm0
@@ -3189,7 +3190,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0
@@ -3205,7 +3206,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0
@@ -3221,7 +3222,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0
@@ -3237,7 +3238,7 @@ declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4
define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
@@ -3263,7 +3264,7 @@ declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>,
define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_getexp_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
@@ -3289,11 +3290,11 @@ declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32
define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
@@ -3302,20 +3303,20 @@ define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8
define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmplesd %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %esi
+; CHECK-NEXT: kmovw %k0, %edx
; CHECK-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: orb %cl, %dl
-; CHECK-NEXT: orb %sil, %al
-; CHECK-NEXT: orb %dl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: orl %edx, %eax
+; CHECK-NEXT: orl %ecx, %eax
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4)
@@ -3333,11 +3334,11 @@ declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpunordss %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4)
@@ -3347,20 +3348,20 @@ define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %
define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpless %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, %edx
+; CHECK-NEXT: kmovw %k0, %ecx
+; CHECK-NEXT: andl %eax, %ecx
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %esi
+; CHECK-NEXT: kmovw %k0, %edx
; CHECK-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: andb %cl, %dl
-; CHECK-NEXT: andb %sil, %al
-; CHECK-NEXT: andb %dl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: andl %edx, %eax
+; CHECK-NEXT: andl %ecx, %eax
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
%res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
@@ -3373,80 +3374,11 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
ret i8 %res13
}
-declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
- %res2 = fadd <16 x float> %res, %res1
- ret <16 x float> %res2
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
-
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
- %res2 = add <8 x i64> %res, %res1
- ret <8 x i64> %res2
-}
-
declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantpd $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vgetmantpd $11, {sae}, %zmm0, %zmm0
@@ -3462,7 +3394,7 @@ declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16
define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantps $11, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vgetmantps $11, {sae}, %zmm0, %zmm0
@@ -3478,7 +3410,7 @@ declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm2, %xmm4
@@ -3503,7 +3435,7 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
@@ -3527,7 +3459,7 @@ declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
@@ -3536,7 +3468,7 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovapd %zmm2, %zmm0
@@ -3549,7 +3481,7 @@ define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0,
define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -3563,7 +3495,7 @@ declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
@@ -3572,7 +3504,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -3585,7 +3517,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0,
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -3598,7 +3530,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0
; Test case to make sure we can print shuffle decode comments for constant pool loads.
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>)
@@ -3607,7 +3539,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x fl
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: vmovaps %zmm2, %zmm0
@@ -3620,7 +3552,7 @@ define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16
define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
; CHECK-NEXT: retq
@@ -3634,7 +3566,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x flo
define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ss2sd_round:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vcvtss2sd {sae}, %xmm1, %xmm0, %xmm0
@@ -3650,7 +3582,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x doubl
define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_sd2ss_round:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtsd2ss {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vcvtsd2ss {rn-sae}, %xmm1, %xmm0, %xmm0
@@ -3666,7 +3598,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
@@ -3683,7 +3615,7 @@ declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3
@@ -3700,7 +3632,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
@@ -3717,7 +3649,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3
@@ -3732,7 +3664,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpeqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3742,7 +3674,7 @@ define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpeq_uqsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3752,7 +3684,7 @@ define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_eq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpeqsd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3762,7 +3694,7 @@ define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_eq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpeq_uqsd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3772,7 +3704,7 @@ define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpltsd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3782,7 +3714,7 @@ define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt_sae:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpngesd {sae}, %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3792,7 +3724,7 @@ define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1)
define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_comi_sd_lt:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpltsd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3802,7 +3734,7 @@ define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_sd_lt:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpngesd %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3814,7 +3746,7 @@ declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx512_ucomi_ss_lt:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpngess %xmm1, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: retq
@@ -3824,150 +3756,11 @@ define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) {
declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
-declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
- %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res1, %res2
- %res5 = fadd <16 x float> %res3, %res4
- ret <16 x float> %res5
-}
-
-define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %xmm1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: retq
- %x0 = load <4 x float>, <4 x float>* %x0ptr
- %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
- ret <16 x float> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
- %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res1, %res2
- %res5 = fadd <8 x double> %res3, %res4
- ret <8 x double> %res5
-}
-
-define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: retq
-
- %x0 = load <4 x double>, <4 x double>* %x0ptr
- %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
- ret <8 x double> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
- %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
- %res4 = add <16 x i32> %res1, %res2
- %res5 = add <16 x i32> %res3, %res4
- ret <16 x i32> %res5
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: retq
-
- %x0 = load <4 x i32>, <4 x i32>* %x0ptr
- %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
- %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
- %res4 = add <8 x i64> %res1, %res2
- %res5 = add <8 x i64> %res3, %res4
- ret <8 x i64> %res5
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: retq
-
- %x0 = load <4 x i64>, <4 x i64>* %x0ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
- ret <8 x i64> %res
-}
-
declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm3
; CHECK-NEXT: vprorvd %zmm1, %zmm0, %zmm2 {%k1}
@@ -3987,7 +3780,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i
define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm3
; CHECK-NEXT: vprorvq %zmm1, %zmm0, %zmm2 {%k1}
@@ -4007,7 +3800,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
@@ -4027,7 +3820,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8
define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
@@ -4047,7 +3840,7 @@ declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64
define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm3
; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1}
@@ -4067,7 +3860,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8
define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm3
; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1}
@@ -4087,7 +3880,7 @@ declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i3
define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm3
; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1}
@@ -4107,7 +3900,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm3
; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1}
@@ -4127,11 +3920,11 @@ declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x do
define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
@@ -4149,11 +3942,11 @@ declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x d
define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %zmm0, %zmm3
; CHECK-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovapd %zmm0, %zmm5
; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
@@ -4172,7 +3965,7 @@ declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %xmm0, %xmm3
; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1}
@@ -4195,7 +3988,7 @@ declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm3
; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4218,11 +4011,11 @@ declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x f
define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1}
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vmovaps %zmm0, %zmm5
; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
@@ -4241,13 +4034,13 @@ declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovaps %zmm0, %zmm3
; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3
; CHECK-NEXT: vmovaps %zmm0, %zmm4
; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: vaddps %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vaddps %zmm3, %zmm0, %zmm0
@@ -4264,7 +4057,7 @@ declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4287,7 +4080,7 @@ declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x doubl
define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z}
@@ -4306,88 +4099,11 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
ret <2 x double> %res4
}
-declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2)
-
-define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vptestnmd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq
- %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
- %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1)
- %res2 = add i16 %res, %res1
- ret i16 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2)
-
-define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, %ecx
-; CHECK-NEXT: vptestnmq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
- %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpbroadcastd %edi, %zmm1 {%k1} {z}
-; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
-; CHECK-NEXT: vpbroadcastd %edi, %zmm2
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res2, %res3
- ret <16 x i32> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16)
-
-define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
-; CHECK-NEXT: vpbroadcastq %rdi, %zmm2
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
- %res3 = add <8 x i64> %res, %res1
- %res4 = add <8 x i64> %res2, %res3
- ret <8 x i64> %res4
-}
-declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8)
-
declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4414,7 +4130,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, %xmm3
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4441,7 +4157,7 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovapd %xmm0, %xmm3
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
@@ -4458,7 +4174,7 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4471,7 +4187,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4498,7 +4214,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4523,7 +4239,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
; CHECK-LABEL: fmadd_ss_mask_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1}
@@ -4550,7 +4266,7 @@ define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
; CHECK-LABEL: fmadd_ss_maskz_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z}
@@ -4577,7 +4293,7 @@ define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
; CHECK-LABEL: fmadd_sd_mask_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1}
@@ -4600,7 +4316,7 @@ define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
; CHECK-LABEL: fmadd_sd_maskz_memfold:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: kmovw %edx, %k1
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z}
@@ -4625,7 +4341,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4652,7 +4368,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4679,7 +4395,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd %xmm2, %xmm3
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4706,7 +4422,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm2, %xmm3
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
@@ -4731,7 +4447,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd231ss (%rdi), %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -4744,7 +4460,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ss_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -4757,7 +4473,7 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x f
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,float *%ptr_b ,i8 %x3,i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ss_rm:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 {%k1} {z}
@@ -4770,7 +4486,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x
define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
@@ -4778,7 +4494,7 @@ define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) {
}
define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -4790,7 +4506,7 @@ define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1
}
define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4804,7 +4520,7 @@ declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind r
define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psll_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
@@ -4812,7 +4528,7 @@ define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) {
}
define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psll_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -4824,7 +4540,7 @@ define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1,
}
define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psll_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4838,7 +4554,7 @@ declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind rea
define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
@@ -4846,7 +4562,7 @@ define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) {
}
define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -4858,7 +4574,7 @@ define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %
}
define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpslld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4872,7 +4588,7 @@ declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readno
define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_pslli_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
@@ -4880,7 +4596,7 @@ define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) {
}
define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_pslli_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -4892,7 +4608,7 @@ define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %pas
}
define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_pslli_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4906,7 +4622,7 @@ declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
@@ -4914,7 +4630,7 @@ define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) {
}
define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -4926,7 +4642,7 @@ define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1,
}
define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4940,7 +4656,7 @@ declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind rea
define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
@@ -4948,7 +4664,7 @@ define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) {
}
define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -4960,7 +4676,7 @@ define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1
}
define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -4975,7 +4691,7 @@ declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind r
define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
@@ -4983,7 +4699,7 @@ define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) {
}
define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -4995,7 +4711,7 @@ define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %pas
}
define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsraq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5009,7 +4725,7 @@ declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
@@ -5017,7 +4733,7 @@ define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) {
}
define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -5029,7 +4745,7 @@ define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %
}
define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrad $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5044,7 +4760,7 @@ declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readno
define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1]
@@ -5052,7 +4768,7 @@ define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) {
}
define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5064,7 +4780,7 @@ define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1
}
define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5078,7 +4794,7 @@ declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind r
define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrl_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1]
@@ -5086,7 +4802,7 @@ define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) {
}
define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrl_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5098,7 +4814,7 @@ define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1,
}
define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrl_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5112,7 +4828,7 @@ declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind rea
define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1]
@@ -5120,7 +4836,7 @@ define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) {
}
define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -5132,7 +4848,7 @@ define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %
}
define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrld $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5146,7 +4862,7 @@ declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readno
define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrli_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1]
@@ -5154,7 +4870,7 @@ define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) {
}
define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrli_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -5166,7 +4882,7 @@ define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %pas
}
define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrli_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5179,7 +4895,7 @@ declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
@@ -5188,7 +4904,7 @@ define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5201,7 +4917,7 @@ define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %
define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5215,7 +4931,7 @@ declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind
define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psllv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1)
@@ -5224,7 +4940,7 @@ define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psllv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5237,7 +4953,7 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1,
define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psllv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5251,7 +4967,7 @@ declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind re
define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
@@ -5260,7 +4976,7 @@ define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5273,7 +4989,7 @@ define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %
define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5287,7 +5003,7 @@ declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind
define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1)
@@ -5296,7 +5012,7 @@ define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5309,7 +5025,7 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1,
define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5323,7 +5039,7 @@ declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind re
define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
@@ -5332,7 +5048,7 @@ define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) {
define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5345,7 +5061,7 @@ define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %
define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -5359,7 +5075,7 @@ declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind
define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrlv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1)
@@ -5368,7 +5084,7 @@ define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrlv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -5381,7 +5097,7 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1,
define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-load-store.ll b/test/CodeGen/X86/avx512-load-store.ll
index 4fd985bf24cd..8589215f4a10 100644
--- a/test/CodeGen/X86/avx512-load-store.ll
+++ b/test/CodeGen/X86/avx512-load-store.ll
@@ -4,15 +4,14 @@
define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_mask_move_ss:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_move_ss:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; CHECK32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
@@ -29,15 +28,14 @@ entry:
define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_maskz_move_ss:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_move_ss:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK32-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1}
@@ -54,15 +52,14 @@ entry:
define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_mask_move_sd:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovsd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_move_sd:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
; CHECK32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
@@ -79,15 +76,14 @@ entry:
define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_maskz_move_sd:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_move_sd:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %al
-; CHECK32-NEXT: andb $1, %al
; CHECK32-NEXT: kmovw %eax, %k1
; CHECK32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK32-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
@@ -104,13 +100,13 @@ entry:
define void @test_mm_mask_store_ss(float* %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
; CHECK64-LABEL: test_mm_mask_store_ss:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %esi, %k1
; CHECK64-NEXT: vmovss %xmm0, (%rdi) {%k1}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_store_ss:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
@@ -128,13 +124,13 @@ entry:
define void @test_mm_mask_store_sd(double* %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
; CHECK64-LABEL: test_mm_mask_store_sd:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %esi, %k1
; CHECK64-NEXT: vmovsd %xmm0, (%rdi) {%k1}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_store_sd:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT: kmovw %ecx, %k1
@@ -151,13 +147,13 @@ entry:
define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_mask_load_ss:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_load_ss:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
@@ -177,13 +173,13 @@ entry:
define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_mask_load_sd:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_mask_load_sd:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT: kmovw %ecx, %k1
@@ -202,13 +198,13 @@ entry:
define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_maskz_load_ss:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovss (%rsi), %xmm0 {%k1} {z}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_load_ss:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT: kmovw %ecx, %k1
@@ -226,13 +222,13 @@ entry:
define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_maskz_load_sd:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: kmovw %edi, %k1
; CHECK64-NEXT: vmovsd (%rsi), %xmm0 {%k1} {z}
; CHECK64-NEXT: retq
;
; CHECK32-LABEL: test_mm_maskz_load_sd:
-; CHECK32: # BB#0: # %entry
+; CHECK32: # %bb.0: # %entry
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT: kmovw %ecx, %k1
diff --git a/test/CodeGen/X86/avx512-load-trunc-store-i1.ll b/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
new file mode 100644
index 000000000000..bfcac8935123
--- /dev/null
+++ b/test/CodeGen/X86/avx512-load-trunc-store-i1.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512-ALL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O2| FileCheck %s --check-prefix=AVX512-ONLY
+
+define void @load_v1i2_trunc_v1i1_store(<1 x i2>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i2_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: movb (%rdi), %al
+; AVX512-ALL-NEXT: testb %al, %al
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i2_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: testb %al, %al
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i2>, <1 x i2>* %a0
+ %d1 = trunc <1 x i2> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v1i3_trunc_v1i1_store(<1 x i3>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i3_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: movb (%rdi), %al
+; AVX512-ALL-NEXT: testb %al, %al
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i3_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: testb %al, %al
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i3>, <1 x i3>* %a0
+ %d1 = trunc <1 x i3> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v1i4_trunc_v1i1_store(<1 x i4>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i4_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: movb (%rdi), %al
+; AVX512-ALL-NEXT: testb %al, %al
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i4_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: movb (%rdi), %al
+; AVX512-ONLY-NEXT: testb %al, %al
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i4>, <1 x i4>* %a0
+ %d1 = trunc <1 x i4> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v1i8_trunc_v1i1_store(<1 x i8>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i8_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: cmpb $0, (%rdi)
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i8_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: cmpb $0, (%rdi)
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i8>, <1 x i8>* %a0
+ %d1 = trunc <1 x i8> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v1i16_trunc_v1i1_store(<1 x i16>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i16_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: cmpb $0, (%rdi)
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i16_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: cmpb $0, (%rdi)
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i16>, <1 x i16>* %a0
+ %d1 = trunc <1 x i16> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v1i32_trunc_v1i1_store(<1 x i32>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i32_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: cmpb $0, (%rdi)
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i32_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: cmpb $0, (%rdi)
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i32>, <1 x i32>* %a0
+ %d1 = trunc <1 x i32> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+define void @load_v1i64_trunc_v1i1_store(<1 x i64>* %a0,<1 x i1>* %a1) {
+; AVX512-ALL-LABEL: load_v1i64_trunc_v1i1_store:
+; AVX512-ALL: # %bb.0:
+; AVX512-ALL-NEXT: cmpb $0, (%rdi)
+; AVX512-ALL-NEXT: setne %al
+; AVX512-ALL-NEXT: kmovd %eax, %k0
+; AVX512-ALL-NEXT: kmovb %k0, (%rsi)
+; AVX512-ALL-NEXT: retq
+;
+; AVX512-ONLY-LABEL: load_v1i64_trunc_v1i1_store:
+; AVX512-ONLY: # %bb.0:
+; AVX512-ONLY-NEXT: cmpb $0, (%rdi)
+; AVX512-ONLY-NEXT: setne %al
+; AVX512-ONLY-NEXT: movb %al, (%rsi)
+; AVX512-ONLY-NEXT: retq
+ %d0 = load <1 x i64>, <1 x i64>* %a0
+ %d1 = trunc <1 x i64> %d0 to <1 x i1>
+ store <1 x i1> %d1, <1 x i1>* %a1
+ ret void
+}
+
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
index 6e08753dbbb1..bb1e8550ba23 100644
--- a/test/CodeGen/X86/avx512-logic.ll
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -5,9 +5,9 @@
define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpandd:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
entry:
; Force the execution domain with an add.
@@ -19,9 +19,9 @@ entry:
define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpandnd:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpandnd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
entry:
; Force the execution domain with an add.
@@ -35,9 +35,9 @@ entry:
define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpord:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpord %zmm1, %zmm0, %zmm0
+; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
entry:
; Force the execution domain with an add.
@@ -49,9 +49,9 @@ entry:
define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpxord:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
entry:
; Force the execution domain with an add.
@@ -63,7 +63,7 @@ entry:
define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpandq:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; ALL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
@@ -76,7 +76,7 @@ entry:
define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpandnq:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; ALL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
@@ -90,7 +90,7 @@ entry:
define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vporq:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; ALL-NEXT: vporq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
@@ -103,7 +103,7 @@ entry:
define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: vpxorq:
-; ALL: ## BB#0: ## %entry
+; ALL: ## %bb.0: ## %entry
; ALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; ALL-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
@@ -117,12 +117,12 @@ entry:
define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
; KNL-LABEL: orq_broadcast:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: orq_broadcast:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: retq
%b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
@@ -131,12 +131,12 @@ define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
; KNL-LABEL: andd512fold:
-; KNL: ## BB#0: ## %entry
-; KNL-NEXT: vpandd (%rdi), %zmm0, %zmm0
+; KNL: ## %bb.0: ## %entry
+; KNL-NEXT: vpandq (%rdi), %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: andd512fold:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0
; SKX-NEXT: retq
entry:
@@ -147,12 +147,12 @@ entry:
define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
; KNL-LABEL: andqbrst:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: vpandq (%rdi){1to8}, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: andqbrst:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0
; SKX-NEXT: retq
entry:
@@ -165,13 +165,13 @@ entry:
define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: and_v64i8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandps %ymm2, %ymm0, %ymm0
; KNL-NEXT: vandps %ymm3, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: and_v64i8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = and <64 x i8> %a, %b
@@ -180,13 +180,13 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: andn_v64i8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: andn_v64i8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%b2 = xor <64 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
@@ -199,13 +199,13 @@ define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: or_v64i8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vorps %ymm2, %ymm0, %ymm0
; KNL-NEXT: vorps %ymm3, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: or_v64i8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = or <64 x i8> %a, %b
@@ -214,13 +214,13 @@ define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: xor_v64i8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vxorps %ymm2, %ymm0, %ymm0
; KNL-NEXT: vxorps %ymm3, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: xor_v64i8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = xor <64 x i8> %a, %b
@@ -229,13 +229,13 @@ define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: and_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandps %ymm2, %ymm0, %ymm0
; KNL-NEXT: vandps %ymm3, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: and_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = and <32 x i16> %a, %b
@@ -244,13 +244,13 @@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: andn_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandnps %ymm0, %ymm2, %ymm0
; KNL-NEXT: vandnps %ymm1, %ymm3, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: andn_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%b2 = xor <32 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1,
@@ -261,13 +261,13 @@ define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: or_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vorps %ymm2, %ymm0, %ymm0
; KNL-NEXT: vorps %ymm3, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: or_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = or <32 x i16> %a, %b
@@ -276,13 +276,13 @@ define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: xor_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vxorps %ymm2, %ymm0, %ymm0
; KNL-NEXT: vxorps %ymm3, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: xor_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%res = xor <32 x i16> %a, %b
@@ -291,14 +291,14 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
; KNL-LABEL: masked_and_v16f32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT: vaddps %zmm2, %zmm3, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: masked_and_v16f32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0
@@ -316,14 +316,14 @@ define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x f
define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
; KNL-LABEL: masked_or_v16f32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT: vaddps %zmm2, %zmm3, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: masked_or_v16f32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0
@@ -341,14 +341,14 @@ define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x fl
define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
; KNL-LABEL: masked_xor_v16f32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT: vaddps %zmm2, %zmm3, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: masked_xor_v16f32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0
@@ -366,14 +366,14 @@ define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x f
define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
; KNL-LABEL: masked_and_v8f64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT: vaddpd %zmm2, %zmm3, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: masked_and_v8f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0
@@ -391,14 +391,14 @@ define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou
define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
; KNL-LABEL: masked_or_v8f64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT: vaddpd %zmm2, %zmm3, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: masked_or_v8f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0
@@ -416,14 +416,14 @@ define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x doub
define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
; KNL-LABEL: masked_xor_v8f64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1}
; KNL-NEXT: vaddpd %zmm2, %zmm3, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: masked_xor_v8f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1}
; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0
@@ -441,13 +441,13 @@ define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou
define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
; KNL-LABEL: test_mm512_mask_and_epi32:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_and_epi32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -463,13 +463,13 @@ entry:
define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
; KNL-LABEL: test_mm512_mask_or_epi32:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_or_epi32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -485,13 +485,13 @@ entry:
define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
; KNL-LABEL: test_mm512_mask_xor_epi32:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_xor_epi32:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -507,13 +507,13 @@ entry:
define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_mask_xor_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxorq %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_xor_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -529,13 +529,13 @@ entry:
define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_maskz_xor_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxorq %zmm1, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_xor_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -551,13 +551,13 @@ entry:
define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_mask_xor_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_xor_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -573,13 +573,13 @@ entry:
define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_maskz_xor_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %zmm1, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_xor_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -595,13 +595,13 @@ entry:
define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_mask_or_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_or_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -617,13 +617,13 @@ entry:
define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_maskz_or_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vporq %zmm0, %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_or_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -639,13 +639,13 @@ entry:
define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_mask_or_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_or_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -661,13 +661,13 @@ entry:
define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_maskz_or_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %zmm0, %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_or_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -683,13 +683,13 @@ entry:
define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_mask_and_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %zmm1, %zmm2, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_and_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -705,13 +705,13 @@ entry:
define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_maskz_and_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %zmm0, %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_and_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -727,13 +727,13 @@ entry:
define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_mask_and_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %zmm1, %zmm2, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_and_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -749,13 +749,13 @@ entry:
define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_maskz_and_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %zmm0, %zmm1, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_and_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -771,13 +771,13 @@ entry:
define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_mask_andnot_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnq %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_andnot_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -794,13 +794,13 @@ entry:
define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; KNL-LABEL: test_mm512_maskz_andnot_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnq %zmm1, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_andnot_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -817,13 +817,13 @@ entry:
define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_mask_andnot_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnd %zmm2, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_mask_andnot_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
@@ -840,13 +840,13 @@ entry:
define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; KNL-LABEL: test_mm512_maskz_andnot_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnd %zmm1, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm512_maskz_andnot_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 6f4bf061a215..dfe42d53483f 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,41 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
define i16 @mask16(i16 %x) {
; KNL-LABEL: mask16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: mask16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask16:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask16:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -45,28 +45,28 @@ define i16 @mask16(i16 %x) {
define i32 @mask16_zext(i16 %x) {
; KNL-LABEL: mask16_zext:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: retq
;
; SKX-LABEL: mask16_zext:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: knotw %k0, %k0
; SKX-NEXT: kmovw %k0, %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask16_zext:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovw %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask16_zext:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: knotw %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
@@ -80,35 +80,35 @@ define i32 @mask16_zext(i16 %x) {
define i8 @mask8(i8 %x) {
; KNL-LABEL: mask8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: mask8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask8:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask8:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: knotb %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %al killed %al killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -118,7 +118,7 @@ define i8 @mask8(i8 %x) {
define i32 @mask8_zext(i8 %x) {
; KNL-LABEL: mask8_zext:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -126,14 +126,14 @@ define i32 @mask8_zext(i8 %x) {
; KNL-NEXT: retq
;
; SKX-LABEL: mask8_zext:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask8_zext:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: knotw %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -141,7 +141,7 @@ define i32 @mask8_zext(i8 %x) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask8_zext:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: knotb %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, %eax
@@ -155,7 +155,7 @@ define i32 @mask8_zext(i8 %x) {
define void @mask16_mem(i16* %ptr) {
; CHECK-LABEL: mask16_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw (%rdi), %k0
; CHECK-NEXT: knotw %k0, %k0
; CHECK-NEXT: kmovw %k0, (%rdi)
@@ -170,7 +170,7 @@ define void @mask16_mem(i16* %ptr) {
define void @mask8_mem(i8* %ptr) {
; KNL-LABEL: mask8_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k0
; KNL-NEXT: knotw %k0, %k0
@@ -179,14 +179,14 @@ define void @mask8_mem(i8* %ptr) {
; KNL-NEXT: retq
;
; SKX-LABEL: mask8_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: knotb %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mask8_mem:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k0
@@ -195,7 +195,7 @@ define void @mask8_mem(i8* %ptr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mask8_mem:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: knotb %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
@@ -210,7 +210,7 @@ define void @mask8_mem(i8* %ptr) {
define i16 @mand16(i16 %x, i16 %y) {
; CHECK-LABEL: mand16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: xorl %esi, %eax
; CHECK-NEXT: andl %esi, %edi
@@ -228,47 +228,47 @@ define i16 @mand16(i16 %x, i16 %y) {
define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
; KNL-LABEL: mand16_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k0
; KNL-NEXT: kmovw (%rsi), %k1
; KNL-NEXT: kandw %k1, %k0, %k2
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: korw %k0, %k2, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: mand16_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovw (%rdi), %k0
; SKX-NEXT: kmovw (%rsi), %k1
; SKX-NEXT: kandw %k1, %k0, %k2
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: korw %k0, %k2, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: mand16_mem:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: kmovw (%rsi), %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k2
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k2, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mand16_mem:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw (%rsi), %k1
; AVX512DQ-NEXT: kandw %k1, %k0, %k2
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: korw %k0, %k2, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512DQ-NEXT: retq
%ma = load <16 x i1>, <16 x i1>* %x
%mb = load <16 x i1>, <16 x i1>* %y
@@ -281,40 +281,36 @@ define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
define i8 @shuf_test1(i16 %v) nounwind {
; KNL-LABEL: shuf_test1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kshiftrw $8, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: shuf_test1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kshiftrw $8, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
;
; AVX512BW-LABEL: shuf_test1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
-; AVX512BW-NEXT: ## -- End function
;
; AVX512DQ-LABEL: shuf_test1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %al killed %al killed %eax
; AVX512DQ-NEXT: retq
-; AVX512DQ-NEXT: ## -- End function
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%mask1 = bitcast <8 x i1> %mask to i8
@@ -323,39 +319,36 @@ define i8 @shuf_test1(i16 %v) nounwind {
define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
; KNL-LABEL: zext_test1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; KNL-NEXT: kshiftlw $10, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $5, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; SKX-NEXT: kshiftlw $10, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: zext_test1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andl $1, %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: zext_test1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: andl $1, %eax
; AVX512DQ-NEXT: vzeroupper
@@ -368,45 +361,42 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
; KNL-LABEL: zext_test2:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; KNL-NEXT: kshiftlw $10, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $5, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test2:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; SKX-NEXT: kshiftlw $10, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andl $1, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: zext_test2:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andl $1, %eax
-; AVX512BW-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: zext_test2:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: andl $1, %eax
-; AVX512DQ-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
@@ -417,45 +407,42 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
; KNL-LABEL: zext_test3:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; KNL-NEXT: kshiftlw $10, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $5, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test3:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; SKX-NEXT: kshiftlw $10, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
+; SKX-NEXT: kshiftrw $5, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: andb $1, %al
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: zext_test3:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kshiftlw $10, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: andb $1, %al
-; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: zext_test3:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: andb $1, %al
-; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %al killed %al killed %eax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%cmp_res = icmp ugt <16 x i32> %a, %b
@@ -466,7 +453,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
define i8 @conv1(<8 x i1>* %R) {
; KNL-LABEL: conv1:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kxnorw %k0, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
@@ -475,7 +462,7 @@ define i8 @conv1(<8 x i1>* %R) {
; KNL-NEXT: retq
;
; SKX-LABEL: conv1:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kxnorw %k0, %k0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
@@ -483,7 +470,7 @@ define i8 @conv1(<8 x i1>* %R) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: conv1:
-; AVX512BW: ## BB#0: ## %entry
+; AVX512BW: ## %bb.0: ## %entry
; AVX512BW-NEXT: kxnorw %k0, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movb %al, (%rdi)
@@ -492,7 +479,7 @@ define i8 @conv1(<8 x i1>* %R) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: conv1:
-; AVX512DQ: ## BB#0: ## %entry
+; AVX512DQ: ## %bb.0: ## %entry
; AVX512DQ-NEXT: kxnorw %k0, %k0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
; AVX512DQ-NEXT: movb $-2, -{{[0-9]+}}(%rsp)
@@ -510,16 +497,17 @@ entry:
define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) {
; KNL-LABEL: test4:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
; KNL-NEXT: vpmovqd %zmm1, %ymm1
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test4:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1
; SKX-NEXT: kandnw %k0, %k1, %k0
@@ -528,7 +516,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test4:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
@@ -538,7 +526,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test4:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
@@ -555,14 +543,14 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) {
; KNL-LABEL: test5:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test5:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0
; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1
; SKX-NEXT: kandnw %k1, %k0, %k0
@@ -570,14 +558,14 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test5:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test5:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -602,7 +590,7 @@ false:
}
define void @test7(<8 x i1> %mask) {
; KNL-LABEL: test7:
-; KNL: ## BB#0: ## %allocas
+; KNL: ## %bb.0: ## %allocas
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
@@ -611,10 +599,11 @@ define void @test7(<8 x i1> %mask) {
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test7:
-; SKX: ## BB#0: ## %allocas
+; SKX: ## %bb.0: ## %allocas
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: movb $85, %al
@@ -624,7 +613,7 @@ define void @test7(<8 x i1> %mask) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test7:
-; AVX512BW: ## BB#0: ## %allocas
+; AVX512BW: ## %bb.0: ## %allocas
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: movb $85, %al
@@ -636,7 +625,7 @@ define void @test7(<8 x i1> %mask) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test7:
-; AVX512DQ: ## BB#0: ## %allocas
+; AVX512DQ: ## %bb.0: ## %allocas
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
@@ -660,11 +649,11 @@ false:
}
define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: jg LBB17_1
-; KNL-NEXT: ## BB#2:
+; KNL-NEXT: ## %bb.2:
; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1
; KNL-NEXT: jmp LBB17_3
; KNL-NEXT: LBB17_1:
@@ -672,14 +661,15 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL-NEXT: LBB17_3:
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: jg LBB17_1
-; SKX-NEXT: ## BB#2:
+; SKX-NEXT: ## %bb.2:
; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; SKX-NEXT: vpmovm2b %k0, %xmm0
; SKX-NEXT: vzeroupper
@@ -691,27 +681,27 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test8:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT: jg LBB17_1
-; AVX512BW-NEXT: ## BB#2:
+; AVX512BW-NEXT: ## %bb.2:
; AVX512BW-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; AVX512BW-NEXT: jmp LBB17_3
; AVX512BW-NEXT: LBB17_1:
; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
; AVX512BW-NEXT: LBB17_3:
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test8:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
+; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: jg LBB17_1
-; AVX512DQ-NEXT: ## BB#2:
+; AVX512DQ-NEXT: ## %bb.2:
; AVX512DQ-NEXT: vpcmpltud %zmm2, %zmm1, %k0
; AVX512DQ-NEXT: jmp LBB17_3
; AVX512DQ-NEXT: LBB17_1:
@@ -730,10 +720,10 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
}
define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test9:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jg LBB18_1
-; KNL-NEXT: ## BB#2:
+; KNL-NEXT: ## %bb.2:
; KNL-NEXT: vpmovsxbd %xmm1, %zmm0
; KNL-NEXT: jmp LBB18_3
; KNL-NEXT: LBB18_1:
@@ -743,13 +733,14 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: jg LBB18_1
-; SKX-NEXT: ## BB#2:
+; SKX-NEXT: ## %bb.2:
; SKX-NEXT: vpsllw $7, %xmm1, %xmm0
; SKX-NEXT: jmp LBB18_3
; SKX-NEXT: LBB18_1:
@@ -760,10 +751,10 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test9:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: jg LBB18_1
-; AVX512BW-NEXT: ## BB#2:
+; AVX512BW-NEXT: ## %bb.2:
; AVX512BW-NEXT: vpsllw $7, %xmm1, %xmm0
; AVX512BW-NEXT: jmp LBB18_3
; AVX512BW-NEXT: LBB18_1:
@@ -771,15 +762,15 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
; AVX512BW-NEXT: LBB18_3:
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test9:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: jg LBB18_1
-; AVX512DQ-NEXT: ## BB#2:
+; AVX512DQ-NEXT: ## %bb.2:
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm0
; AVX512DQ-NEXT: jmp LBB18_3
; AVX512DQ-NEXT: LBB18_1:
@@ -802,19 +793,19 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test11:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jg LBB20_2
-; KNL-NEXT: ## BB#1:
+; KNL-NEXT: ## %bb.1:
; KNL-NEXT: vmovaps %xmm1, %xmm0
; KNL-NEXT: LBB20_2:
; KNL-NEXT: retq
;
; SKX-LABEL: test11:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: jg LBB20_1
-; SKX-NEXT: ## BB#2:
+; SKX-NEXT: ## %bb.2:
; SKX-NEXT: vpslld $31, %xmm1, %xmm0
; SKX-NEXT: jmp LBB20_3
; SKX-NEXT: LBB20_1:
@@ -825,19 +816,19 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test11:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: jg LBB20_2
-; AVX512BW-NEXT: ## BB#1:
+; AVX512BW-NEXT: ## %bb.1:
; AVX512BW-NEXT: vmovaps %xmm1, %xmm0
; AVX512BW-NEXT: LBB20_2:
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test11:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: jg LBB20_2
-; AVX512DQ-NEXT: ## BB#1:
+; AVX512DQ-NEXT: ## %bb.1:
; AVX512DQ-NEXT: vmovaps %xmm1, %xmm0
; AVX512DQ-NEXT: LBB20_2:
; AVX512DQ-NEXT: retq
@@ -848,7 +839,7 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
define i32 @test12(i32 %x, i32 %y) {
; CHECK-LABEL: test12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%a = bitcast i16 21845 to <16 x i1>
@@ -859,7 +850,7 @@ define i32 @test12(i32 %x, i32 %y) {
define i32 @test13(i32 %x, i32 %y) {
; CHECK-LABEL: test13:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
%a = bitcast i16 21845 to <16 x i1>
@@ -875,7 +866,7 @@ define i32 @test13(i32 %x, i32 %y) {
define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-LABEL: test15:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: movw $21845, %ax ## imm = 0x5555
; KNL-NEXT: movw $1, %cx
@@ -883,10 +874,11 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test15:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: cmpl %esi, %edi
; SKX-NEXT: movw $21845, %ax ## imm = 0x5555
; SKX-NEXT: movw $1, %cx
@@ -896,19 +888,19 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test15:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: cmpl %esi, %edi
; AVX512BW-NEXT: movw $21845, %ax ## imm = 0x5555
; AVX512BW-NEXT: movw $1, %cx
; AVX512BW-NEXT: cmovgw %ax, %cx
; AVX512BW-NEXT: kmovd %ecx, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test15:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: cmpl %esi, %edi
; AVX512DQ-NEXT: movw $21845, %ax ## imm = 0x5555
; AVX512DQ-NEXT: movw $1, %cx
@@ -928,14 +920,11 @@ define <16 x i1> @test15(i32 %x, i32 %y) {
define <64 x i8> @test16(i64 %x) {
;
; KNL-LABEL: test16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi0:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi1:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi2:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
@@ -960,52 +949,44 @@ define <64 x i8> @test16(i64 %x) {
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: movb $1, %al
; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: vpmovm2b %k1, %zmm0
-; SKX-NEXT: vpsllq $40, %xmm0, %xmm0
-; SKX-NEXT: vpmovm2b %k0, %zmm1
-; SKX-NEXT: movl $32, %eax
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: kshiftrq $5, %k0, %k2
+; SKX-NEXT: kxorq %k1, %k2, %k1
+; SKX-NEXT: kshiftlq $63, %k1, %k1
+; SKX-NEXT: kshiftrq $58, %k1, %k1
+; SKX-NEXT: kxorq %k0, %k1, %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test16:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: movb $1, %al
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpmovm2b %k1, %zmm0
-; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: kshiftrq $5, %k0, %k2
+; AVX512BW-NEXT: kxorq %k1, %k2, %k1
+; AVX512BW-NEXT: kshiftlq $63, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $58, %k1, %k1
+; AVX512BW-NEXT: kxorq %k0, %k1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test16:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: Lcfi0:
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: Lcfi1:
; AVX512DQ-NEXT: .cfi_offset %rbp, -16
; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: Lcfi2:
; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $64, %rsp
@@ -1030,7 +1011,7 @@ define <64 x i8> @test16(i64 %x) {
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
@@ -1044,14 +1025,11 @@ define <64 x i8> @test16(i64 %x) {
define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
;
; KNL-LABEL: test17:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi3:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi4:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi5:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
@@ -1078,54 +1056,46 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; KNL-NEXT: vpsllw $7, %ymm0, %ymm0
; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovq %rdi, %k0
; SKX-NEXT: cmpl %edx, %esi
; SKX-NEXT: setg %al
; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: vpmovm2b %k1, %zmm0
-; SKX-NEXT: vpsllq $40, %xmm0, %xmm0
-; SKX-NEXT: vpmovm2b %k0, %zmm1
-; SKX-NEXT: movl $32, %eax
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; SKX-NEXT: vpmovb2m %zmm0, %k0
+; SKX-NEXT: kshiftrq $5, %k0, %k2
+; SKX-NEXT: kxorq %k1, %k2, %k1
+; SKX-NEXT: kshiftlq $63, %k1, %k1
+; SKX-NEXT: kshiftrq $58, %k1, %k1
+; SKX-NEXT: kxorq %k0, %k1, %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test17:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: cmpl %edx, %esi
; AVX512BW-NEXT: setg %al
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpmovm2b %k1, %zmm0
-; AVX512BW-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT: kshiftrq $5, %k0, %k2
+; AVX512BW-NEXT: kxorq %k1, %k2, %k1
+; AVX512BW-NEXT: kshiftlq $63, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $58, %k1, %k1
+; AVX512BW-NEXT: kxorq %k0, %k1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test17:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: Lcfi3:
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: Lcfi4:
; AVX512DQ-NEXT: .cfi_offset %rbp, -16
; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: Lcfi5:
; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $64, %rsp
@@ -1152,7 +1122,7 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: movq %rbp, %rsp
; AVX512DQ-NEXT: popq %rbp
@@ -1166,106 +1136,84 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-LABEL: test18:
-; KNL: ## BB#0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: kmovw %esi, %k0
-; KNL-NEXT: kshiftlw $7, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
-; KNL-NEXT: kshiftlw $6, %k0, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; KNL-NEXT: vpsllq $63, %zmm2, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
-; KNL-NEXT: kshiftlw $1, %k0, %k0
-; KNL-NEXT: kshiftrw $1, %k0, %k0
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kshiftlw $7, %k1, %k1
-; KNL-NEXT: korw %k1, %k0, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL: ## %bb.0:
+; KNL-NEXT: kmovw %edi, %k0
+; KNL-NEXT: kmovw %esi, %k1
+; KNL-NEXT: kshiftrw $8, %k1, %k2
+; KNL-NEXT: kshiftrw $9, %k1, %k1
+; KNL-NEXT: kshiftrw $6, %k0, %k3
+; KNL-NEXT: kxorw %k1, %k3, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $9, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k0
+; KNL-NEXT: kshiftrw $7, %k0, %k1
+; KNL-NEXT: kxorw %k2, %k1, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: kshiftrw $8, %k1, %k1
+; KNL-NEXT: kxorw %k0, %k1, %k1
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test18:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
-; SKX-NEXT: kshiftlw $7, %k1, %k2
-; SKX-NEXT: kshiftrw $15, %k2, %k2
-; SKX-NEXT: kmovd %k2, %eax
-; SKX-NEXT: kshiftlw $6, %k1, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
-; SKX-NEXT: kmovd %k1, %ecx
-; SKX-NEXT: vpmovm2q %k0, %zmm0
-; SKX-NEXT: kmovd %ecx, %k0
-; SKX-NEXT: vpmovm2q %k0, %zmm1
-; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; SKX-NEXT: vpmovq2m %zmm2, %k0
+; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kshiftrw $9, %k1, %k1
+; SKX-NEXT: kshiftrb $6, %k0, %k3
+; SKX-NEXT: kxorb %k1, %k3, %k1
+; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftrb $1, %k1, %k1
+; SKX-NEXT: kxorb %k0, %k1, %k0
; SKX-NEXT: kshiftlb $1, %k0, %k0
; SKX-NEXT: kshiftrb $1, %k0, %k0
-; SKX-NEXT: kmovd %eax, %k1
-; SKX-NEXT: kshiftlb $7, %k1, %k1
+; SKX-NEXT: kshiftlb $7, %k2, %k1
; SKX-NEXT: korb %k1, %k0, %k0
; SKX-NEXT: vpmovm2w %k0, %xmm0
-; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test18:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: kmovd %esi, %k0
-; AVX512BW-NEXT: kshiftlw $7, %k0, %k2
-; AVX512BW-NEXT: kshiftrw $15, %k2, %k2
-; AVX512BW-NEXT: kmovd %k2, %eax
-; AVX512BW-NEXT: kshiftlw $6, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
-; AVX512BW-NEXT: kmovd %k0, %ecx
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsllq $63, %zmm2, %zmm0
-; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: kshiftlw $7, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: kmovd %edi, %k0
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k0, %k3
+; AVX512BW-NEXT: kxorw %k1, %k3, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT: kxorw %k0, %k1, %k0
+; AVX512BW-NEXT: kshiftrw $7, %k0, %k1
+; AVX512BW-NEXT: kxorw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kxorw %k0, %k1, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test18:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
-; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2
-; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2
-; AVX512DQ-NEXT: kmovw %k2, %eax
-; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: kmovw %ecx, %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0
+; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2
+; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1
+; AVX512DQ-NEXT: kshiftrb $6, %k0, %k3
+; AVX512DQ-NEXT: kxorb %k1, %k3, %k1
+; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1
+; AVX512DQ-NEXT: kshiftrb $1, %k1, %k1
+; AVX512DQ-NEXT: kxorb %k0, %k1, %k0
; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0
; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0
-; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1
+; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1
; AVX512DQ-NEXT: korb %k1, %k0, %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -1278,7 +1226,7 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
}
define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; KNL-LABEL: test21:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; KNL-NEXT: vpsllw $15, %ymm3, %ymm3
; KNL-NEXT: vpsraw $15, %ymm3, %ymm3
@@ -1289,26 +1237,23 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test21:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %ymm1, %ymm1
; SKX-NEXT: vpmovb2m %ymm1, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
;
; AVX512BW-LABEL: test21:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
-; AVX512BW-NEXT: ## -- End function
;
; AVX512DQ-LABEL: test21:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512DQ-NEXT: vpsllw $15, %ymm3, %ymm3
; AVX512DQ-NEXT: vpsraw $15, %ymm3, %ymm3
@@ -1319,31 +1264,31 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: retq
-; AVX512DQ-NEXT: ## -- End function
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; KNL-LABEL: test22:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
; KNL-NEXT: vpslld $31, %ymm0, %ymm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test22:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test22:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -1352,8 +1297,8 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test22:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: ## %bb.0:
+; AVX512DQ-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
@@ -1365,24 +1310,25 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; KNL-LABEL: test23:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test23:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test23:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -1391,8 +1337,8 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test23:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: ## %bb.0:
+; AVX512DQ-NEXT: ## kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, (%rdi)
@@ -1404,7 +1350,7 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; KNL-LABEL: store_v1i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kxnorw %k0, %k0, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
@@ -1413,7 +1359,7 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; KNL-NEXT: retq
;
; SKX-LABEL: store_v1i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
@@ -1421,7 +1367,7 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v1i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kxnorw %k0, %k0, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
@@ -1430,7 +1376,7 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v1i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kxnorw %k0, %k0, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
@@ -1443,17 +1389,18 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; KNL-LABEL: store_v2i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v2i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
@@ -1461,7 +1408,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v2i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpsllq $63, %zmm0, %zmm0
@@ -1472,7 +1419,7 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v2i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
@@ -1487,17 +1434,18 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; KNL-LABEL: store_v4i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpslld $31, %ymm0, %ymm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v4i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
@@ -1505,7 +1453,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v4i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0
@@ -1516,7 +1464,7 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v4i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0
@@ -1531,17 +1479,18 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
; KNL-LABEL: store_v8i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v8i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: knotb %k0, %k0
@@ -1549,7 +1498,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v8i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: knotw %k0, %k0
@@ -1559,7 +1508,7 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v8i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
@@ -1574,16 +1523,17 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
; KNL-LABEL: store_v16i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v16i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: knotw %k0, %k0
@@ -1591,7 +1541,7 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_v16i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: knotw %k0, %k0
@@ -1600,7 +1550,7 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_v16i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1628,13 +1578,10 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
define void @f1(i32 %c) {
; CHECK-LABEL: f1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movzbl {{.*}}(%rip), %edi
-; CHECK-NEXT: movb {{.*}}(%rip), %al
-; CHECK-NEXT: notb %al
-; CHECK-NEXT: andb $1, %al
-; CHECK-NEXT: movb %al, {{.*}}(%rip)
; CHECK-NEXT: xorl $1, %edi
+; CHECK-NEXT: movb %dil, {{.*}}(%rip)
; CHECK-NEXT: jmp _f2 ## TAILCALL
entry:
%.b1 = load i1, i1* @f1.v, align 4
@@ -1649,7 +1596,7 @@ declare void @f2(i32) #1
define void @store_i16_i1(i16 %x, i1 *%y) {
; CHECK-LABEL: store_i16_i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movb %dil, (%rsi)
; CHECK-NEXT: retq
@@ -1660,7 +1607,7 @@ define void @store_i16_i1(i16 %x, i1 *%y) {
define void @store_i8_i1(i8 %x, i1 *%y) {
; CHECK-LABEL: store_i8_i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movb %dil, (%rsi)
; CHECK-NEXT: retq
@@ -1671,27 +1618,27 @@ define void @store_i8_i1(i8 %x, i1 *%y) {
define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
; KNL-LABEL: test_build_vec_v32i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v32i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_build_vec_v32i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_build_vec_v32i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: retq
@@ -1701,27 +1648,23 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
; KNL-LABEL: test_build_vec_v64i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_build_vec_v64i1:
-; SKX: ## BB#0:
-; SKX-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
-; SKX-NEXT: kmovq %rax, %k1
-; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX: ## %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_build_vec_v64i1:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
-; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_build_vec_v64i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: retq
@@ -1731,7 +1674,7 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
define void @ktest_1(<8 x double> %in, double * %base) {
; KNL-LABEL: ktest_1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovupd (%rdi), %zmm1
; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
@@ -1739,22 +1682,24 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: je LBB41_2
-; KNL-NEXT: ## BB#1: ## %L1
+; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovapd %zmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB41_2: ## %L2
; KNL-NEXT: vmovapd %zmm0, 8(%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmovupd (%rdi), %zmm1
; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; SKX-NEXT: ktestb %k0, %k0
; SKX-NEXT: je LBB41_2
-; SKX-NEXT: ## BB#1: ## %L1
+; SKX-NEXT: ## %bb.1: ## %L1
; SKX-NEXT: vmovapd %zmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -1764,7 +1709,7 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vmovupd (%rdi), %zmm1
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
@@ -1772,7 +1717,7 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: testb %al, %al
; AVX512BW-NEXT: je LBB41_2
-; AVX512BW-NEXT: ## BB#1: ## %L1
+; AVX512BW-NEXT: ## %bb.1: ## %L1
; AVX512BW-NEXT: vmovapd %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1782,14 +1727,14 @@ define void @ktest_1(<8 x double> %in, double * %base) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vmovupd (%rdi), %zmm1
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1}
; AVX512DQ-NEXT: ktestb %k0, %k0
; AVX512DQ-NEXT: je LBB41_2
-; AVX512DQ-NEXT: ## BB#1: ## %L1
+; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1827,281 +1772,33 @@ End:
define void @ktest_2(<32 x float> %in, float * %base) {
;
; KNL-LABEL: ktest_2:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi6:
; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: Lcfi7:
; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: Lcfi8:
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: vmovups (%rdi), %zmm2
; KNL-NEXT: vmovups 64(%rdi), %zmm3
-; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $2, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $1, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftrw $15, %k1, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
-; KNL-NEXT: kshiftlw $14, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm2
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $13, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $12, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $11, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $10, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $9, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $8, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $7, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $6, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $5, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $4, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $3, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $2, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $1, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftrw $15, %k2, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
-; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
-; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm4
-; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
+; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k1
+; KNL-NEXT: movl {{.*}}(%rip), %eax
+; KNL-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm2, %xmm2
+; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k2
+; KNL-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z}
+; KNL-NEXT: vpmovdb %zmm3, %xmm3
+; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
+; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
+; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k1
+; KNL-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm5, %xmm5
+; KNL-NEXT: vpor %xmm5, %xmm2, %xmm2
+; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k1
+; KNL-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm4, %xmm4
+; KNL-NEXT: vpor %xmm4, %xmm3, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
; KNL-NEXT: vpslld $31, %zmm3, %zmm3
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
@@ -2112,7 +1809,7 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: cmpl $0, (%rsp)
; KNL-NEXT: je LBB42_2
-; KNL-NEXT: ## BB#1: ## %L1
+; KNL-NEXT: ## %bb.1: ## %L1
; KNL-NEXT: vmovaps %zmm0, (%rdi)
; KNL-NEXT: vmovaps %zmm1, 64(%rdi)
; KNL-NEXT: jmp LBB42_3
@@ -2122,10 +1819,11 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; KNL-NEXT: LBB42_3: ## %End
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_2:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmovups (%rdi), %zmm2
; SKX-NEXT: vmovups 64(%rdi), %zmm3
; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1
@@ -2139,7 +1837,7 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; SKX-NEXT: kord %k1, %k0, %k0
; SKX-NEXT: ktestd %k0, %k0
; SKX-NEXT: je LBB42_2
-; SKX-NEXT: ## BB#1: ## %L1
+; SKX-NEXT: ## %bb.1: ## %L1
; SKX-NEXT: vmovaps %zmm0, (%rdi)
; SKX-NEXT: vmovaps %zmm1, 64(%rdi)
; SKX-NEXT: vzeroupper
@@ -2151,7 +1849,7 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: ktest_2:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vmovups (%rdi), %zmm2
; AVX512BW-NEXT: vmovups 64(%rdi), %zmm3
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm2, %k1
@@ -2165,7 +1863,7 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512BW-NEXT: kord %k1, %k0, %k0
; AVX512BW-NEXT: ktestd %k0, %k0
; AVX512BW-NEXT: je LBB42_2
-; AVX512BW-NEXT: ## BB#1: ## %L1
+; AVX512BW-NEXT: ## %bb.1: ## %L1
; AVX512BW-NEXT: vmovaps %zmm0, (%rdi)
; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512BW-NEXT: vzeroupper
@@ -2177,281 +1875,33 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: ktest_2:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: Lcfi6:
; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: Lcfi7:
; AVX512DQ-NEXT: .cfi_offset %rbp, -16
; AVX512DQ-NEXT: movq %rsp, %rbp
-; AVX512DQ-NEXT: Lcfi8:
; AVX512DQ-NEXT: .cfi_def_cfa_register %rbp
; AVX512DQ-NEXT: andq $-32, %rsp
; AVX512DQ-NEXT: subq $32, %rsp
; AVX512DQ-NEXT: vmovups (%rdi), %zmm2
; AVX512DQ-NEXT: vmovups 64(%rdi), %zmm3
-; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
-; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm3
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $13, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $12, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $11, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $10, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $9, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $8, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $7, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $6, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $5, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $4, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $3, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $2, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $1, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k2
-; AVX512DQ-NEXT: kshiftlw $14, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm2
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $13, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $12, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $11, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $10, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $9, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $8, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $7, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $6, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $5, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $4, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $3, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $2, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $1, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftrw $15, %k2, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
-; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
-; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm4
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: vcmpltps %zmm3, %zmm0, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm3
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
+; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm2 {%k1} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k2
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm3 {%k2} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512DQ-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
+; AVX512DQ-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
+; AVX512DQ-NEXT: vcmpltps %zmm5, %zmm0, %k1
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm5 {%k1} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512DQ-NEXT: vpor %xmm5, %xmm2, %xmm2
+; AVX512DQ-NEXT: vcmpltps %zmm4, %zmm1, %k1
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm4 {%k1} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm4, %xmm4
+; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
@@ -2462,7 +1912,7 @@ define void @ktest_2(<32 x float> %in, float * %base) {
; AVX512DQ-NEXT: kmovw %k0, (%rsp)
; AVX512DQ-NEXT: cmpl $0, (%rsp)
; AVX512DQ-NEXT: je LBB42_2
-; AVX512DQ-NEXT: ## BB#1: ## %L1
+; AVX512DQ-NEXT: ## %bb.1: ## %L1
; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi)
; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi)
; AVX512DQ-NEXT: jmp LBB42_3
@@ -2503,27 +1953,27 @@ End:
define <8 x i64> @load_8i1(<8 x i1>* %a) {
; KNL-LABEL: load_8i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_8i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: vpmovm2q %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_8i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_8i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
; AVX512DQ-NEXT: retq
@@ -2534,25 +1984,25 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) {
define <16 x i32> @load_16i1(<16 x i1>* %a) {
; KNL-LABEL: load_16i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: load_16i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovw (%rdi), %k0
; SKX-NEXT: vpmovm2d %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_16i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_16i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: retq
@@ -2563,33 +2013,34 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) {
define <2 x i16> @load_2i1(<2 x i1>* %a) {
; KNL-LABEL: load_2i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_2i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: vpmovm2q %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_2i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_2i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = load <2 x i1>, <2 x i1>* %a
@@ -2599,35 +2050,34 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) {
define <4 x i16> @load_4i1(<4 x i1>* %a) {
; KNL-LABEL: load_4i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movzbl (%rdi), %eax
; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_4i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovb (%rdi), %k0
; SKX-NEXT: vpmovm2d %k0, %xmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_4i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_4i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%b = load <4 x i1>, <4 x i1>* %a
@@ -2637,7 +2087,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) {
define <32 x i16> @load_32i1(<32 x i1>* %a) {
; KNL-LABEL: load_32i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -2647,19 +2097,19 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) {
; KNL-NEXT: retq
;
; SKX-LABEL: load_32i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd (%rdi), %k0
; SKX-NEXT: vpmovm2w %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_32i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_32i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
@@ -2674,7 +2124,7 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) {
define <64 x i8> @load_64i1(<64 x i1>* %a) {
; KNL-LABEL: load_64i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw (%rdi), %k1
; KNL-NEXT: kmovw 2(%rdi), %k2
; KNL-NEXT: kmovw 4(%rdi), %k3
@@ -2692,19 +2142,19 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) {
; KNL-NEXT: retq
;
; SKX-LABEL: load_64i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovq (%rdi), %k0
; SKX-NEXT: vpmovm2b %k0, %zmm0
; SKX-NEXT: retq
;
; AVX512BW-LABEL: load_64i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: load_64i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: kmovw 4(%rdi), %k2
@@ -2727,23 +2177,24 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) {
define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
; KNL-LABEL: store_8i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_8i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_8i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -2752,7 +2203,7 @@ define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_8i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
@@ -2765,23 +2216,24 @@ define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
; KNL-LABEL: store_8i1_1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm0, %zmm0
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_8i1_1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k0
; SKX-NEXT: kmovb %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_8i1_1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -2790,7 +2242,7 @@ define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_8i1_1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512DQ-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmq %zmm0, %zmm0, %k0
@@ -2804,22 +2256,23 @@ define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
; KNL-LABEL: store_16i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_16i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k0
; SKX-NEXT: kmovw %k0, (%rdi)
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_16i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovw %k0, (%rdi)
@@ -2827,7 +2280,7 @@ define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_16i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2840,7 +2293,7 @@ define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; KNL-LABEL: store_32i1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
@@ -2850,10 +2303,11 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_32i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
@@ -2861,7 +2315,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_32i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, (%rdi)
@@ -2869,7 +2323,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_32i1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
@@ -2887,7 +2341,7 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; KNL-LABEL: store_32i1_1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -2900,10 +2354,11 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_32i1_1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %zmm0, %zmm0
; SKX-NEXT: vpmovw2m %zmm0, %k0
; SKX-NEXT: kmovd %k0, (%rdi)
@@ -2911,7 +2366,7 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_32i1_1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, (%rdi)
@@ -2919,7 +2374,7 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_32i1_1:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -2943,327 +2398,28 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
;
; KNL-LABEL: store_64i1:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: Lcfi9:
-; KNL-NEXT: .cfi_def_cfa_offset 16
-; KNL-NEXT: pushq %r15
-; KNL-NEXT: Lcfi10:
-; KNL-NEXT: .cfi_def_cfa_offset 24
-; KNL-NEXT: pushq %r14
-; KNL-NEXT: Lcfi11:
-; KNL-NEXT: .cfi_def_cfa_offset 32
-; KNL-NEXT: pushq %r13
-; KNL-NEXT: Lcfi12:
-; KNL-NEXT: .cfi_def_cfa_offset 40
-; KNL-NEXT: pushq %r12
-; KNL-NEXT: Lcfi13:
-; KNL-NEXT: .cfi_def_cfa_offset 48
-; KNL-NEXT: pushq %rbx
-; KNL-NEXT: Lcfi14:
-; KNL-NEXT: .cfi_def_cfa_offset 56
-; KNL-NEXT: Lcfi15:
-; KNL-NEXT: .cfi_offset %rbx, -56
-; KNL-NEXT: Lcfi16:
-; KNL-NEXT: .cfi_offset %r12, -48
-; KNL-NEXT: Lcfi17:
-; KNL-NEXT: .cfi_offset %r13, -40
-; KNL-NEXT: Lcfi18:
-; KNL-NEXT: .cfi_offset %r14, -32
-; KNL-NEXT: Lcfi19:
-; KNL-NEXT: .cfi_offset %r15, -24
-; KNL-NEXT: Lcfi20:
-; KNL-NEXT: .cfi_offset %rbp, -16
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
-; KNL-NEXT: vpslld $31, %zmm2, %zmm2
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
; KNL-NEXT: vpslld $31, %zmm3, %zmm3
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %r9d, %xmm3
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: kmovw %k0, 6(%rdi)
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0
-; KNL-NEXT: kmovw %k0, 6(%rdi)
-; KNL-NEXT: kshiftlw $14, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $15, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: kshiftlw $13, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $12, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r11d
-; KNL-NEXT: kshiftlw $11, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: kshiftlw $10, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $9, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $8, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $7, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $6, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $5, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $4, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $3, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $2, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $1, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %r10d, %xmm2
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: kshiftrw $15, %k2, %k0
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
-; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: kmovw %k0, 4(%rdi)
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kmovw %k0, 4(%rdi)
-; KNL-NEXT: kshiftlw $14, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r8d
-; KNL-NEXT: kshiftlw $15, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: kshiftlw $13, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r9d
-; KNL-NEXT: kshiftlw $12, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r11d
-; KNL-NEXT: kshiftlw $11, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r14d
-; KNL-NEXT: kshiftlw $10, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r15d
-; KNL-NEXT: kshiftlw $9, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r12d
-; KNL-NEXT: kshiftlw $8, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %r13d
-; KNL-NEXT: kshiftlw $7, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: kshiftlw $6, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %esi
-; KNL-NEXT: kshiftlw $5, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebp
-; KNL-NEXT: kshiftlw $4, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ebx
-; KNL-NEXT: kshiftlw $3, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $2, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: kshiftlw $1, %k1, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %r10d, %xmm1
-; KNL-NEXT: kmovw %k0, %r10d
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
-; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: kmovw %k1, 2(%rdi)
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r8d
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r10d
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r11d
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r14d
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r15d
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r12d
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %r13d
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %esi
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebp
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ebx
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %r9d, %xmm0
-; KNL-NEXT: kmovw %k1, %r9d
-; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
-; KNL-NEXT: popq %rbx
-; KNL-NEXT: popq %r12
-; KNL-NEXT: popq %r13
-; KNL-NEXT: popq %r14
-; KNL-NEXT: popq %r15
-; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_64i1:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
; SKX-NEXT: vpmovb2m %zmm0, %k0
; SKX-NEXT: kmovq %k0, (%rdi)
@@ -3271,7 +2427,7 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: store_64i1:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, (%rdi)
@@ -3279,323 +2435,23 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: store_64i1:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: pushq %rbp
-; AVX512DQ-NEXT: Lcfi9:
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 16
-; AVX512DQ-NEXT: pushq %r15
-; AVX512DQ-NEXT: Lcfi10:
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 24
-; AVX512DQ-NEXT: pushq %r14
-; AVX512DQ-NEXT: Lcfi11:
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 32
-; AVX512DQ-NEXT: pushq %r13
-; AVX512DQ-NEXT: Lcfi12:
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 40
-; AVX512DQ-NEXT: pushq %r12
-; AVX512DQ-NEXT: Lcfi13:
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 48
-; AVX512DQ-NEXT: pushq %rbx
-; AVX512DQ-NEXT: Lcfi14:
-; AVX512DQ-NEXT: .cfi_def_cfa_offset 56
-; AVX512DQ-NEXT: Lcfi15:
-; AVX512DQ-NEXT: .cfi_offset %rbx, -56
-; AVX512DQ-NEXT: Lcfi16:
-; AVX512DQ-NEXT: .cfi_offset %r12, -48
-; AVX512DQ-NEXT: Lcfi17:
-; AVX512DQ-NEXT: .cfi_offset %r13, -40
-; AVX512DQ-NEXT: Lcfi18:
-; AVX512DQ-NEXT: .cfi_offset %r14, -32
-; AVX512DQ-NEXT: Lcfi19:
-; AVX512DQ-NEXT: .cfi_offset %r15, -24
-; AVX512DQ-NEXT: Lcfi20:
-; AVX512DQ-NEXT: .cfi_offset %rbp, -16
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r8d
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r9d
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r10d
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r11d
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r14d
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r15d
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r12d
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r13d
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ebx
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ebp
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %edx
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %esi
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: vmovd %r9d, %xmm3
-; AVX512DQ-NEXT: kmovw %k1, %r9d
-; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k2
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
-; AVX512DQ-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512DQ-NEXT: kmovw %k0, 6(%rdi)
-; AVX512DQ-NEXT: kshiftlw $14, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r8d
-; AVX512DQ-NEXT: kshiftlw $15, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r10d
-; AVX512DQ-NEXT: kshiftlw $13, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r9d
-; AVX512DQ-NEXT: kshiftlw $12, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r11d
-; AVX512DQ-NEXT: kshiftlw $11, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r14d
-; AVX512DQ-NEXT: kshiftlw $10, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r15d
-; AVX512DQ-NEXT: kshiftlw $9, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r12d
-; AVX512DQ-NEXT: kshiftlw $8, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r13d
-; AVX512DQ-NEXT: kshiftlw $7, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ecx
-; AVX512DQ-NEXT: kshiftlw $6, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %esi
-; AVX512DQ-NEXT: kshiftlw $5, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ebp
-; AVX512DQ-NEXT: kshiftlw $4, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ebx
-; AVX512DQ-NEXT: kshiftlw $3, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: kshiftlw $2, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %edx
-; AVX512DQ-NEXT: kshiftlw $1, %k2, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: vmovd %r10d, %xmm2
-; AVX512DQ-NEXT: kmovw %k0, %r10d
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k2, %k0
-; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
-; AVX512DQ-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT: kmovw %k0, 4(%rdi)
-; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r8d
-; AVX512DQ-NEXT: kshiftlw $15, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r10d
-; AVX512DQ-NEXT: kshiftlw $13, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r9d
-; AVX512DQ-NEXT: kshiftlw $12, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r11d
-; AVX512DQ-NEXT: kshiftlw $11, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r14d
-; AVX512DQ-NEXT: kshiftlw $10, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r15d
-; AVX512DQ-NEXT: kshiftlw $9, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r12d
-; AVX512DQ-NEXT: kshiftlw $8, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %r13d
-; AVX512DQ-NEXT: kshiftlw $7, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ecx
-; AVX512DQ-NEXT: kshiftlw $6, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %esi
-; AVX512DQ-NEXT: kshiftlw $5, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ebp
-; AVX512DQ-NEXT: kshiftlw $4, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %ebx
-; AVX512DQ-NEXT: kshiftlw $3, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: kshiftlw $2, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %edx
-; AVX512DQ-NEXT: kshiftlw $1, %k1, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: vmovd %r10d, %xmm1
-; AVX512DQ-NEXT: kmovw %k0, %r10d
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
-; AVX512DQ-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512DQ-NEXT: kmovw %k1, 2(%rdi)
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r8d
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r9d
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r10d
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r11d
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r14d
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r15d
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r12d
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %r13d
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %edx
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %esi
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ebp
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ebx
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: vmovd %r9d, %xmm0
-; AVX512DQ-NEXT: kmovw %k1, %r9d
-; AVX512DQ-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT: kmovw %k0, 2(%rdi)
; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, (%rdi)
-; AVX512DQ-NEXT: popq %rbx
-; AVX512DQ-NEXT: popq %r12
-; AVX512DQ-NEXT: popq %r13
-; AVX512DQ-NEXT: popq %r14
-; AVX512DQ-NEXT: popq %r15
-; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
store <64 x i1> %v, <64 x i1>* %a
@@ -3604,17 +2460,18 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
; KNL-LABEL: test_bitcast_v8i1_zext:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL: ## %bb.0:
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: addl %eax, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_bitcast_v8i1_zext:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; SKX: ## %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; SKX-NEXT: kmovb %k0, %eax
; SKX-NEXT: addl %eax, %eax
@@ -3622,8 +2479,8 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_bitcast_v8i1_zext:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movzbl %al, %eax
@@ -3632,8 +2489,8 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_bitcast_v8i1_zext:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512DQ: ## %bb.0:
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: kmovb %k0, %eax
; AVX512DQ-NEXT: addl %eax, %eax
@@ -3648,40 +2505,14 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
}
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
-; KNL-LABEL: test_bitcast_v16i1_zext:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: addl %eax, %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_bitcast_v16i1_zext:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; SKX-NEXT: kmovw %k0, %eax
-; SKX-NEXT: addl %eax, %eax
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-;
-; AVX512BW-LABEL: test_bitcast_v16i1_zext:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovw %k0, %eax
-; AVX512BW-NEXT: addl %eax, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: test_bitcast_v16i1_zext:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: addl %eax, %eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; CHECK-LABEL: test_bitcast_v16i1_zext:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %eax, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask1 = bitcast <16 x i1> %v1 to i16
%val = zext i16 %mask1 to i32
@@ -3691,39 +2522,39 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
define i16 @test_v16i1_add(i16 %x, i16 %y) {
; KNL-LABEL: test_v16i1_add:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i1_add:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v16i1_add:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i1_add:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = bitcast i16 %y to <16 x i1>
@@ -3734,39 +2565,39 @@ define i16 @test_v16i1_add(i16 %x, i16 %y) {
define i16 @test_v16i1_sub(i16 %x, i16 %y) {
; KNL-LABEL: test_v16i1_sub:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i1_sub:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v16i1_sub:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i1_sub:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = bitcast i16 %y to <16 x i1>
@@ -3777,39 +2608,39 @@ define i16 @test_v16i1_sub(i16 %x, i16 %y) {
define i16 @test_v16i1_mul(i16 %x, i16 %y) {
; KNL-LABEL: test_v16i1_mul:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i1_mul:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v16i1_mul:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v16i1_mul:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kandw %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %ax killed %ax killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i16 %x to <16 x i1>
%m1 = bitcast i16 %y to <16 x i1>
@@ -3820,39 +2651,39 @@ define i16 @test_v16i1_mul(i16 %x, i16 %y) {
define i8 @test_v8i1_add(i8 %x, i8 %y) {
; KNL-LABEL: test_v8i1_add:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i1_add:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v8i1_add:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i1_add:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorb %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %al killed %al killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = bitcast i8 %y to <8 x i1>
@@ -3863,39 +2694,39 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) {
define i8 @test_v8i1_sub(i8 %x, i8 %y) {
; KNL-LABEL: test_v8i1_sub:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kxorw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i1_sub:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kxorb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v8i1_sub:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kxorw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i1_sub:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kxorb %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %al killed %al killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = bitcast i8 %y to <8 x i1>
@@ -3906,39 +2737,39 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) {
define i8 @test_v8i1_mul(i8 %x, i8 %y) {
; KNL-LABEL: test_v8i1_mul:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: kmovw %edi, %k0
; KNL-NEXT: kmovw %esi, %k1
; KNL-NEXT: kandw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %al killed %al killed %eax
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i1_mul:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kmovd %edi, %k0
; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: kandb %k1, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %al killed %al killed %eax
; SKX-NEXT: retq
;
; AVX512BW-LABEL: test_v8i1_mul:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: ## kill: def %al killed %al killed %eax
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: test_v8i1_mul:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: ## %bb.0:
; AVX512DQ-NEXT: kmovw %edi, %k0
; AVX512DQ-NEXT: kmovw %esi, %k1
; AVX512DQ-NEXT: kandb %k1, %k0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT: ## kill: def %al killed %al killed %eax
; AVX512DQ-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = bitcast i8 %y to <8 x i1>
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 96aefdb10584..b9f483e997c4 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -4,9 +4,8 @@
declare void @f()
define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_4i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
@@ -27,9 +26,8 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_8i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
@@ -51,9 +49,8 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
; CHECK-LABEL: test_16i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
@@ -74,9 +71,8 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
; CHECK-LABEL: test_32i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
@@ -97,9 +93,8 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
; CHECK-LABEL: test_64i1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi4:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
diff --git a/test/CodeGen/X86/avx512-mask-zext-bugfix.ll b/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
index 8f02695af704..f828c4dcef7d 100755
--- a/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
+++ b/test/CodeGen/X86/avx512-mask-zext-bugfix.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin -mcpu=skx -fast-isel-abort=1 | FileCheck %s
; ModuleID = 'mask_set.c'
source_filename = "mask_set.c"
@@ -16,16 +16,53 @@ declare i32 @check_mask16(i16 zeroext %res_mask, i16 zeroext %exp_mask, i8* %fna
; Function Attrs: nounwind uwtable
define void @test_xmm(i32 %shift, i32 %mulp, <2 x i64> %a,i8* %arraydecay,i8* %fname){
; CHECK-LABEL: test_xmm:
-; CHECK: ## BB#0:
-; CHECK: callq _calc_expected_mask_val
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: movl $2, %esi
+; CHECK-NEXT: movl $8, %eax
+; CHECK-NEXT: vpmovw2m %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %edi
+; CHECK-NEXT: movb %dil, %r8b
+; CHECK-NEXT: movzbl %r8b, %edi
+; CHECK-NEXT: movw %di, %r9w
+; CHECK-NEXT: movq %rdx, %rdi
+; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ## 8-byte Spill
; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: movw %dx, %r9w
+; CHECK-NEXT: movw %r9w, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill
+; CHECK-NEXT: callq _calc_expected_mask_val
+; CHECK-NEXT: movw %ax, %r9w
+; CHECK-NEXT: movw {{[0-9]+}}(%rsp), %r10w ## 2-byte Reload
+; CHECK-NEXT: movzwl %r10w, %edi
; CHECK-NEXT: movzwl %r9w, %esi
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovb %k0, %edi
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
; CHECK-NEXT: callq _check_mask16
+; CHECK-NEXT: movl $4, %esi
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 ## 16-byte Reload
+; CHECK-NEXT: vpmovd2m %xmm0, %k0
+; CHECK-NEXT: kmovd %k0, %edi
+; CHECK-NEXT: movb %dil, %r8b
+; CHECK-NEXT: movzbl %r8b, %edi
+; CHECK-NEXT: movw %di, %r9w
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi ## 8-byte Reload
+; CHECK-NEXT: movl %esi, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx ## 4-byte Reload
+; CHECK-NEXT: movl %eax, {{[0-9]+}}(%rsp) ## 4-byte Spill
+; CHECK-NEXT: movw %r9w, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT: callq _calc_expected_mask_val
+; CHECK-NEXT: movw %ax, %r9w
+; CHECK-NEXT: movw {{[0-9]+}}(%rsp), %r10w ## 2-byte Reload
+; CHECK-NEXT: movzwl %r10w, %edi
+; CHECK-NEXT: movzwl %r9w, %esi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx ## 8-byte Reload
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx ## 8-byte Reload
+; CHECK-NEXT: callq _check_mask16
+; CHECK-NEXT: movl %eax, (%rsp) ## 4-byte Spill
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
%d2 = bitcast <2 x i64> %a to <8 x i16>
%m2 = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %d2)
%conv7 = zext i8 %m2 to i16
diff --git a/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index 607c4f4ade6f..e64ac5c58736 100644
--- a/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -4,8 +4,8 @@
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test1:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -16,8 +16,8 @@ define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
; AVX512-LABEL: test2:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -28,8 +28,8 @@ define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
; AVX512-LABEL: test3:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512-NEXT: vzeroupper
@@ -41,8 +41,8 @@ define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
; AVX512-LABEL: test4:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
@@ -53,8 +53,8 @@ define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float
define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
; AVX512-LABEL: test13:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512-NEXT: vmovups %zmm1, (%rdi) {%k1}
; AVX512-NEXT: vzeroupper
@@ -66,7 +66,7 @@ define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val)
define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX512-LABEL: one_mask_bit_set5:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: vzeroupper
@@ -78,7 +78,7 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
;
; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
@@ -98,8 +98,8 @@ declare <16 x i32*> @llvm.masked.load.v16p0i32.p0v16p0i32(<16 x i32*>*, i32, <16
define <16 x i32*> @test23(<16 x i32*> %trigger, <16 x i32*>* %addr) {
; AVX512-LABEL: test23:
-; AVX512: ## BB#0:
-; AVX512-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512-NEXT: vpcmpeqq %zmm2, %zmm1, %k2
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1 {%k2} {z}
@@ -116,7 +116,7 @@ declare <16 x %mystruct*> @llvm.masked.load.v16p0mystruct.p0v16p0mystruct(<16 x
define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX512F-LABEL: test24:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -126,7 +126,7 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
; AVX512F-NEXT: retq
;
; SKX-LABEL: test24:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z}
@@ -139,7 +139,7 @@ define <16 x %mystruct*> @test24(<16 x i1> %mask, <16 x %mystruct*>* %addr) {
define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512F-LABEL: test_store_16i64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -150,7 +150,7 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_store_16i64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovdqu64 %zmm1, (%rdi) {%k1}
@@ -165,7 +165,7 @@ declare void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %p
define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512F-LABEL: test_store_16f64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -176,7 +176,7 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_store_16f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vmovupd %zmm1, (%rdi) {%k1}
@@ -191,7 +191,7 @@ declare void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x doubl
define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; AVX512F-LABEL: test_load_16i64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -201,7 +201,7 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_16i64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
@@ -215,7 +215,7 @@ declare <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32, <16
define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; AVX512F-LABEL: test_load_16f64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
@@ -225,7 +225,7 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_16f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0
; SKX-NEXT: vpmovb2m %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
@@ -239,7 +239,7 @@ declare <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i3
define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 x double> %src0) {
; AVX512F-LABEL: test_load_32f64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512F-NEXT: vpmovsxbd %xmm5, %zmm5
; AVX512F-NEXT: vpslld $31, %zmm5, %zmm5
@@ -257,7 +257,7 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32
; AVX512F-NEXT: retq
;
; SKX-LABEL: test_load_32f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
diff --git a/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/test/CodeGen/X86/avx512-masked_memop-16-8.ll
index aedfbf7dbd65..c8df2bffd9a4 100644
--- a/test/CodeGen/X86/avx512-masked_memop-16-8.ll
+++ b/test/CodeGen/X86/avx512-masked_memop-16-8.ll
@@ -5,7 +5,7 @@
define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
; CHECK-LABEL: test_mask_load_16xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z}
@@ -17,7 +17,7 @@ declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <1
define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
; CHECK-LABEL: test_mask_load_32xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
@@ -29,7 +29,7 @@ declare <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>*, i32, <32 x i1>, <3
define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
; CHECK-LABEL: test_mask_load_64xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k1
; CHECK-NEXT: vpblendmb (%rdi), %zmm1, %zmm0 {%k1}
@@ -41,7 +41,7 @@ declare <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>*, i32, <64 x i1>, <6
define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
; CHECK-LABEL: test_mask_load_8xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z}
@@ -53,7 +53,7 @@ declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8
define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
; CHECK-LABEL: test_mask_load_16xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z}
@@ -65,7 +65,7 @@ declare <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>*, i32, <16 x i1>
define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
; CHECK-LABEL: test_mask_load_32xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vpblendmw (%rdi), %zmm1, %zmm0 {%k1}
@@ -77,7 +77,7 @@ declare <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>*, i32, <32 x i1>
define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
; CHECK-LABEL: test_mask_store_16xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k1
; CHECK-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1}
@@ -89,7 +89,7 @@ declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x
define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
; CHECK-LABEL: test_mask_store_32xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
@@ -102,7 +102,7 @@ declare void @llvm.masked.store.v32i8.p0v32i8(<32 x i8>, <32 x i8>*, i32, <32 x
define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8> %val) {
; CHECK-LABEL: test_mask_store_64xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k1
; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
@@ -115,7 +115,7 @@ declare void @llvm.masked.store.v64i8.p0v64i8(<64 x i8>, <64 x i8>*, i32, <64 x
define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
; CHECK-LABEL: test_mask_store_8xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
@@ -127,7 +127,7 @@ declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i
define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
; CHECK-LABEL: test_mask_store_16xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %xmm0, %k1
; CHECK-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
@@ -140,7 +140,7 @@ declare void @llvm.masked.store.v16i16.p0v16i16(<16 x i16>, <16 x i16>*, i32, <1
define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i16> %val) {
; CHECK-LABEL: test_mask_store_32xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %ymm0, %k1
; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
diff --git a/test/CodeGen/X86/avx512-memfold.ll b/test/CodeGen/X86/avx512-memfold.ll
index 17cb30255f75..02c51316f2e1 100644
--- a/test/CodeGen/X86/avx512-memfold.ll
+++ b/test/CodeGen/X86/avx512-memfold.ll
@@ -3,11 +3,11 @@
define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vcmpunordss (%rdi), %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%b.val = load float, float* %b
%bv0 = insertelement <4 x float> undef, float %b.val, i32 0
@@ -21,7 +21,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_mask_max_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -37,7 +37,7 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>,
define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {
; CHECK-LABEL: test_maskz_add_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -56,7 +56,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>,
define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1
; CHECK-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -66,3 +66,25 @@ define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x do
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %cv, i8 %mask, i32 4)
ret <2 x double> %res
}
+
+; Test what happens to the load when we have multiple uses of the fadd DAG node via separate vselect nodes.
+; TODO: We shouldn't fold the load twice here.
+define <4 x float> @test_mask_add_ss_double_use(<4 x float> %a, float* %b, i8 %mask, <4 x float> %c) {
+; CHECK-LABEL: test_mask_add_ss_double_use:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vaddss %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vaddss %xmm2, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %b.val = load float, float* %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> %c, i8 %mask, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> %a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ %res3 = fmul <4 x float> %res, %res2
+ ret <4 x float> %res3
+}
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index df988185efc5..f1a2ac880ed4 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
define i32 @test1(float %x) {
; CHECK-LABEL: test1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = bitcast float %x to i32
@@ -12,7 +12,7 @@ define i32 @test1(float %x) {
define <4 x i32> @test2(i32 %x) {
; CHECK-LABEL: test2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <4 x i32>undef, i32 %x, i32 0
@@ -21,7 +21,7 @@ define <4 x i32> @test2(i32 %x) {
define <2 x i64> @test3(i64 %x) {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %rdi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <2 x i64>undef, i64 %x, i32 0
@@ -30,7 +30,7 @@ define <2 x i64> @test3(i64 %x) {
define <4 x i32> @test4(i32* %x) {
; CHECK-LABEL: test4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -41,7 +41,7 @@ define <4 x i32> @test4(i32* %x) {
define void @test5(float %x, float* %y) {
; CHECK-LABEL: test5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
store float %x, float* %y, align 4
@@ -50,7 +50,7 @@ define void @test5(float %x, float* %y) {
define void @test6(double %x, double* %y) {
; CHECK-LABEL: test6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
store double %x, double* %y, align 8
@@ -59,7 +59,7 @@ define void @test6(double %x, double* %y) {
define float @test7(i32* %x) {
; CHECK-LABEL: test7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -70,7 +70,7 @@ define float @test7(i32* %x) {
define i32 @test8(<4 x i32> %x) {
; CHECK-LABEL: test8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = extractelement <4 x i32> %x, i32 0
@@ -79,7 +79,7 @@ define i32 @test8(<4 x i32> %x) {
define i64 @test9(<2 x i64> %x) {
; CHECK-LABEL: test9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = extractelement <2 x i64> %x, i32 0
@@ -88,7 +88,7 @@ define i64 @test9(<2 x i64> %x) {
define <4 x i32> @test10(i32* %x) {
; CHECK-LABEL: test10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -99,7 +99,7 @@ define <4 x i32> @test10(i32* %x) {
define <4 x float> @test11(float* %x) {
; CHECK-LABEL: test11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -110,7 +110,7 @@ define <4 x float> @test11(float* %x) {
define <2 x double> @test12(double* %x) {
; CHECK-LABEL: test12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
; CHECK-NEXT: ## xmm0 = mem[0],zero
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -121,7 +121,7 @@ define <2 x double> @test12(double* %x) {
define <2 x i64> @test13(i64 %x) {
; CHECK-LABEL: test13:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %rdi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
@@ -130,7 +130,7 @@ define <2 x i64> @test13(i64 %x) {
define <4 x i32> @test14(i32 %x) {
; CHECK-LABEL: test14:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc7]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
@@ -139,7 +139,7 @@ define <4 x i32> @test14(i32 %x) {
define <4 x i32> @test15(i32* %x) {
; CHECK-LABEL: test15:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -150,7 +150,7 @@ define <4 x i32> @test15(i32* %x) {
define <16 x i32> @test16(i8 * %addr) {
; CHECK-LABEL: test16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
@@ -160,7 +160,7 @@ define <16 x i32> @test16(i8 * %addr) {
define <16 x i32> @test17(i8 * %addr) {
; CHECK-LABEL: test17:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
@@ -170,7 +170,7 @@ define <16 x i32> @test17(i8 * %addr) {
define void @test18(i8 * %addr, <8 x i64> %data) {
; CHECK-LABEL: test18:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
@@ -180,7 +180,7 @@ define void @test18(i8 * %addr, <8 x i64> %data) {
define void @test19(i8 * %addr, <16 x i32> %data) {
; CHECK-LABEL: test19:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
@@ -190,7 +190,7 @@ define void @test19(i8 * %addr, <16 x i32> %data) {
define void @test20(i8 * %addr, <16 x i32> %data) {
; CHECK-LABEL: test20:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i32>*
@@ -200,7 +200,7 @@ define void @test20(i8 * %addr, <16 x i32> %data) {
define <8 x i64> @test21(i8 * %addr) {
; CHECK-LABEL: test21:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
@@ -210,7 +210,7 @@ define <8 x i64> @test21(i8 * %addr) {
define void @test22(i8 * %addr, <8 x i64> %data) {
; CHECK-LABEL: test22:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
@@ -220,7 +220,7 @@ define void @test22(i8 * %addr, <8 x i64> %data) {
define <8 x i64> @test23(i8 * %addr) {
; CHECK-LABEL: test23:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i64>*
@@ -230,7 +230,7 @@ define <8 x i64> @test23(i8 * %addr) {
define void @test24(i8 * %addr, <8 x double> %data) {
; CHECK-LABEL: test24:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
@@ -240,7 +240,7 @@ define void @test24(i8 * %addr, <8 x double> %data) {
define <8 x double> @test25(i8 * %addr) {
; CHECK-LABEL: test25:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
@@ -250,7 +250,7 @@ define <8 x double> @test25(i8 * %addr) {
define void @test26(i8 * %addr, <16 x float> %data) {
; CHECK-LABEL: test26:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
@@ -260,7 +260,7 @@ define void @test26(i8 * %addr, <16 x float> %data) {
define <16 x float> @test27(i8 * %addr) {
; CHECK-LABEL: test27:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
@@ -270,7 +270,7 @@ define <16 x float> @test27(i8 * %addr) {
define void @test28(i8 * %addr, <8 x double> %data) {
; CHECK-LABEL: test28:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
@@ -280,7 +280,7 @@ define void @test28(i8 * %addr, <8 x double> %data) {
define <8 x double> @test29(i8 * %addr) {
; CHECK-LABEL: test29:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x double>*
@@ -290,7 +290,7 @@ define <8 x double> @test29(i8 * %addr) {
define void @test30(i8 * %addr, <16 x float> %data) {
; CHECK-LABEL: test30:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
@@ -300,7 +300,7 @@ define void @test30(i8 * %addr, <16 x float> %data) {
define <16 x float> @test31(i8 * %addr) {
; CHECK-LABEL: test31:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x float>*
@@ -310,8 +310,8 @@ define <16 x float> @test31(i8 * %addr) {
define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; CHECK-LABEL: test32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -324,8 +324,8 @@ define <16 x i32> @test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
; CHECK-LABEL: test33:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x48,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -338,8 +338,8 @@ define <16 x i32> @test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
; CHECK-LABEL: test34:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -352,8 +352,8 @@ define <16 x i32> @test34(i8 * %addr, <16 x i32> %mask1) {
define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
; CHECK-LABEL: test35:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -366,8 +366,8 @@ define <16 x i32> @test35(i8 * %addr, <16 x i32> %mask1) {
define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; CHECK-LABEL: test36:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -380,8 +380,8 @@ define <8 x i64> @test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
; CHECK-LABEL: test37:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x48,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -394,8 +394,8 @@ define <8 x i64> @test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
; CHECK-LABEL: test38:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -408,8 +408,8 @@ define <8 x i64> @test38(i8 * %addr, <8 x i64> %mask1) {
define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
; CHECK-LABEL: test39:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -422,10 +422,9 @@ define <8 x i64> @test39(i8 * %addr, <8 x i64> %mask1) {
define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
; CHECK-LABEL: test40:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
-; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
-; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
+; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x0c]
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
@@ -437,10 +436,9 @@ define <16 x float> @test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
; CHECK-LABEL: test41:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
-; CHECK-NEXT: vcmpordps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x07]
-; CHECK-NEXT: vcmpneqps %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x49,0xc2,0xca,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
+; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0x74,0x48,0xc2,0xca,0x0c]
; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
@@ -452,10 +450,9 @@ define <16 x float> @test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1)
define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
; CHECK-LABEL: test42:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
-; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x0c]
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
@@ -467,10 +464,9 @@ define <16 x float> @test42(i8 * %addr, <16 x float> %mask1) {
define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
; CHECK-LABEL: test43:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
-; CHECK-NEXT: vcmpordps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpneqps %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc2,0xc9,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9]
+; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x48,0xc2,0xc9,0x0c]
; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <16 x float> %mask1, zeroinitializer
@@ -482,10 +478,9 @@ define <16 x float> @test43(i8 * %addr, <16 x float> %mask1) {
define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
; CHECK-LABEL: test44:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
-; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
-; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x57,0xd2]
+; CHECK-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x0c]
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
@@ -497,10 +492,9 @@ define <8 x double> @test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
; CHECK-LABEL: test45:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0xef,0xd2]
-; CHECK-NEXT: vcmpordpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x07]
-; CHECK-NEXT: vcmpneqpd %zmm2, %zmm1, %k1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0xc2,0xca,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe9,0x57,0xd2]
+; CHECK-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 ## encoding: [0x62,0xf1,0xf5,0x48,0xc2,0xca,0x0c]
; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
@@ -512,10 +506,9 @@ define <8 x double> @test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1)
define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
; CHECK-LABEL: test46:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
-; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9]
+; CHECK-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x0c]
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
@@ -527,10 +520,9 @@ define <8 x double> @test46(i8 * %addr, <8 x double> %mask1) {
define <8 x double> @test47(i8 * %addr, <8 x double> %mask1) {
; CHECK-LABEL: test47:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xef,0xc9]
-; CHECK-NEXT: vcmpordpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc2,0xc9,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x57,0xc9]
+; CHECK-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc9,0x0c]
; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x double> %mask1, zeroinitializer
diff --git a/test/CodeGen/X86/avx512-nontemporal.ll b/test/CodeGen/X86/avx512-nontemporal.ll
index adfaef25b7d3..9bc8a8f97526 100644
--- a/test/CodeGen/X86/avx512-nontemporal.ll
+++ b/test/CodeGen/X86/avx512-nontemporal.ll
@@ -1,31 +1,44 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s
-define void @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH) {
+define i32 @f(<16 x float> %A, <16 x float> %AA, i8* %B, <8 x double> %C, <8 x double> %CC, <8 x i64> %E, <8 x i64> %EE, <16 x i32> %F, <16 x i32> %FF, <32 x i16> %G, <32 x i16> %GG, <64 x i8> %H, <64 x i8> %HH, i32 * %loadptr) {
; CHECK: vmovntps %z
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <16 x float>*
%A2 = fadd <16 x float> %A, %AA
store <16 x float> %A2, <16 x float>* %cast, align 64, !nontemporal !0
+ %v1 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast1 = bitcast i8* %B to <8 x i64>*
%E2 = add <8 x i64> %E, %EE
store <8 x i64> %E2, <8 x i64>* %cast1, align 64, !nontemporal !0
+ %v2 = load i32, i32* %loadptr, align 1
; CHECK: vmovntpd %z
%cast2 = bitcast i8* %B to <8 x double>*
%C2 = fadd <8 x double> %C, %CC
store <8 x double> %C2, <8 x double>* %cast2, align 64, !nontemporal !0
+ %v3 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast3 = bitcast i8* %B to <16 x i32>*
%F2 = add <16 x i32> %F, %FF
store <16 x i32> %F2, <16 x i32>* %cast3, align 64, !nontemporal !0
+ %v4 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast4 = bitcast i8* %B to <32 x i16>*
%G2 = add <32 x i16> %G, %GG
store <32 x i16> %G2, <32 x i16>* %cast4, align 64, !nontemporal !0
+ %v5 = load i32, i32* %loadptr, align 1
; CHECK: vmovntdq %z
%cast5 = bitcast i8* %B to <64 x i8>*
%H2 = add <64 x i8> %H, %HH
store <64 x i8> %H2, <64 x i8>* %cast5, align 64, !nontemporal !0
- ret void
+ %v6 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ %sum4 = add i32 %sum3, %v4
+ %sum5 = add i32 %sum4, %v5
+ %sum6 = add i32 %sum5, %v6
+ ret i32 %sum6
}
!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512-pmovxrm.ll b/test/CodeGen/X86/avx512-pmovxrm.ll
index ab3f32091fcb..7725f1602007 100644
--- a/test/CodeGen/X86/avx512-pmovxrm.ll
+++ b/test/CodeGen/X86/avx512-pmovxrm.ll
@@ -4,13 +4,13 @@
define <32 x i16> @test_llvm_x86_avx512_pmovsxbw(<32 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxbw:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbw (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxbw:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxbw (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a, align 1
@@ -20,13 +20,13 @@ define <32 x i16> @test_llvm_x86_avx512_pmovsxbw(<32 x i8>* %a) {
define <16 x i32> @test_llvm_x86_avx512_pmovsxbd(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxbd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbd (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxbd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxbd (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -36,13 +36,13 @@ define <16 x i32> @test_llvm_x86_avx512_pmovsxbd(<16 x i8>* %a) {
define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxbq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxbq (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxbq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxbq (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -53,13 +53,13 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxbq(<16 x i8>* %a) {
define <16 x i32> @test_llvm_x86_avx512_pmovsxwd(<16 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxwd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxwd (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxwd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxwd (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a, align 1
@@ -69,13 +69,13 @@ define <16 x i32> @test_llvm_x86_avx512_pmovsxwd(<16 x i16>* %a) {
define <8 x i64> @test_llvm_x86_avx512_pmovsxwq(<8 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxwq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxwq (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxwq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxwq (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -85,13 +85,13 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxwq(<8 x i16>* %a) {
define <8 x i64> @test_llvm_x86_avx512_pmovsxdq(<8 x i32>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovsxdq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovsxdq (%eax), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovsxdq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovsxdq (%rdi), %zmm0
; X64-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %a, align 1
@@ -101,13 +101,13 @@ define <8 x i64> @test_llvm_x86_avx512_pmovsxdq(<8 x i32>* %a) {
define <32 x i16> @test_llvm_x86_avx512_pmovzxbw(<32 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxbw:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbw {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxbw:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxbw {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
; X64-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %a, align 1
@@ -117,13 +117,13 @@ define <32 x i16> @test_llvm_x86_avx512_pmovzxbw(<32 x i8>* %a) {
define <16 x i32> @test_llvm_x86_avx512_pmovzxbd(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxbd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxbd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -133,13 +133,13 @@ define <16 x i32> @test_llvm_x86_avx512_pmovzxbd(<16 x i8>* %a) {
define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(<16 x i8>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxbq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxbq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -150,13 +150,13 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxbq(<16 x i8>* %a) {
define <16 x i32> @test_llvm_x86_avx512_pmovzxwd(<16 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxwd:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxwd:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; X64-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %a, align 1
@@ -166,13 +166,13 @@ define <16 x i32> @test_llvm_x86_avx512_pmovzxwd(<16 x i16>* %a) {
define <8 x i64> @test_llvm_x86_avx512_pmovzxwq(<8 x i16>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxwq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxwq {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxwq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxwq {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -182,13 +182,13 @@ define <8 x i64> @test_llvm_x86_avx512_pmovzxwq(<8 x i16>* %a) {
define <8 x i64> @test_llvm_x86_avx512_pmovzxdq(<8 x i32>* %a) {
; X32-LABEL: test_llvm_x86_avx512_pmovzxdq:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmovzxdq {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X32-NEXT: retl
;
; X64-LABEL: test_llvm_x86_avx512_pmovzxdq:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %a, align 1
diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll
index 781112866ca5..3bd69ef77fa3 100644
--- a/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -1,72 +1,85 @@
-; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512bw | FileCheck --check-prefix=CHECK --check-prefix=X32 %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512bw | FileCheck --check-prefix=CHECK --check-prefix=CHECK64 --check-prefix=WIN64 %s
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512bw | FileCheck --check-prefix=CHECK --check-prefix=CHECK64 --check-prefix=LINUXOSX64 %s
-
-; X32-LABEL: test_argv64i1:
-; X32: kmovd %edx, %k0
-; X32: kmovd %edi, %k1
-; X32: kmovd %eax, %k1
-; X32: kmovd %ecx, %k2
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: ad{{d|c}}l {{([0-9])*}}(%ebp), %e{{a|c}}x
-; X32: retl
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512bw | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK64 --check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK64 --check-prefix=LINUXOSX64
+; Test regcall when receiving arguments of v64i1 type
+define x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1, <64 x i1> %x2, <64 x i1> %x3, <64 x i1> %x4, <64 x i1> %x5, <64 x i1> %x6, <64 x i1> %x7, <64 x i1> %x8, <64 x i1> %x9, <64 x i1> %x10, <64 x i1> %x11, <64 x i1> %x12) {
+; X32-LABEL: test_argv64i1:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: kmovd %edi, %k1
+; X32-NEXT: kunpckdq %k0, %k1, %k0
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kunpckdq %k1, %k2, %k1
+; X32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: kmovq %k0, (%esp)
+; X32-NEXT: addl (%esp), %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl 8(%ebp), %eax
+; X32-NEXT: adcl 12(%ebp), %ecx
+; X32-NEXT: addl 16(%ebp), %eax
+; X32-NEXT: adcl 20(%ebp), %ecx
+; X32-NEXT: addl 24(%ebp), %eax
+; X32-NEXT: adcl 28(%ebp), %ecx
+; X32-NEXT: addl 32(%ebp), %eax
+; X32-NEXT: adcl 36(%ebp), %ecx
+; X32-NEXT: addl 40(%ebp), %eax
+; X32-NEXT: adcl 44(%ebp), %ecx
+; X32-NEXT: addl 48(%ebp), %eax
+; X32-NEXT: adcl 52(%ebp), %ecx
+; X32-NEXT: addl 56(%ebp), %eax
+; X32-NEXT: adcl 60(%ebp), %ecx
+; X32-NEXT: addl 64(%ebp), %eax
+; X32-NEXT: adcl 68(%ebp), %ecx
+; X32-NEXT: addl 72(%ebp), %eax
+; X32-NEXT: adcl 76(%ebp), %ecx
+; X32-NEXT: addl 80(%ebp), %eax
+; X32-NEXT: adcl 84(%ebp), %ecx
+; X32-NEXT: addl 88(%ebp), %eax
+; X32-NEXT: adcl 92(%ebp), %ecx
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
; WIN64-LABEL: test_argv64i1:
-; WIN64: addq %rcx, %rax
-; WIN64: addq %rdx, %rax
-; WIN64: addq %rdi, %rax
-; WIN64: addq %rsi, %rax
-; WIN64: addq %r8, %rax
-; WIN64: addq %r9, %rax
-; WIN64: addq %r10, %rax
-; WIN64: addq %r11, %rax
-; WIN64: addq %r12, %rax
-; WIN64: addq %r14, %rax
-; WIN64: addq %r15, %rax
-; WIN64: addq {{([0-9])*}}(%rsp), %rax
-; WIN64: retq
-
+; WIN64: # %bb.0:
+; WIN64-NEXT: addq %rcx, %rax
+; WIN64-NEXT: addq %rdx, %rax
+; WIN64-NEXT: addq %rdi, %rax
+; WIN64-NEXT: addq %rsi, %rax
+; WIN64-NEXT: addq %r8, %rax
+; WIN64-NEXT: addq %r9, %rax
+; WIN64-NEXT: addq %r10, %rax
+; WIN64-NEXT: addq %r11, %rax
+; WIN64-NEXT: addq %r12, %rax
+; WIN64-NEXT: addq %r14, %rax
+; WIN64-NEXT: addq %r15, %rax
+; WIN64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT: retq
+;
; LINUXOSX64-LABEL: test_argv64i1:
-; LINUXOSX64: addq %rcx, %rax
-; LINUXOSX64: addq %rdx, %rax
-; LINUXOSX64: addq %rdi, %rax
-; LINUXOSX64: addq %rsi, %rax
-; LINUXOSX64: addq %r8, %rax
-; LINUXOSX64: addq %r9, %rax
-; LINUXOSX64: addq %r12, %rax
-; LINUXOSX64: addq %r13, %rax
-; LINUXOSX64: addq %r14, %rax
-; LINUXOSX64: addq %r15, %rax
-; LINUXOSX64: addq {{([0-9])*}}(%rsp), %rax
-; LINUXOSX64: addq {{([0-9])*}}(%rsp), %rax
-; LINUXOSX64: retq
-
-; Test regcall when receiving arguments of v64i1 type
-define x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1, <64 x i1> %x2,
- <64 x i1> %x3, <64 x i1> %x4, <64 x i1> %x5,
- <64 x i1> %x6, <64 x i1> %x7, <64 x i1> %x8,
- <64 x i1> %x9, <64 x i1> %x10, <64 x i1> %x11,
- <64 x i1> %x12) {
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: addq %rcx, %rax
+; LINUXOSX64-NEXT: addq %rdx, %rax
+; LINUXOSX64-NEXT: addq %rdi, %rax
+; LINUXOSX64-NEXT: addq %rsi, %rax
+; LINUXOSX64-NEXT: addq %r8, %rax
+; LINUXOSX64-NEXT: addq %r9, %rax
+; LINUXOSX64-NEXT: addq %r12, %rax
+; LINUXOSX64-NEXT: addq %r13, %rax
+; LINUXOSX64-NEXT: addq %r14, %rax
+; LINUXOSX64-NEXT: addq %r15, %rax
+; LINUXOSX64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; LINUXOSX64-NEXT: addq {{[0-9]+}}(%rsp), %rax
+; LINUXOSX64-NEXT: retq
%y0 = bitcast <64 x i1> %x0 to i64
%y1 = bitcast <64 x i1> %x1 to i64
%y2 = bitcast <64 x i1> %x2 to i64
@@ -95,67 +108,114 @@ define x86_regcallcc i64 @test_argv64i1(<64 x i1> %x0, <64 x i1> %x1, <64 x i1>
ret i64 %add12
}
-; X32-LABEL: caller_argv64i1:
-; X32: movl $2, %eax
-; X32: movl $1, %ecx
-; X32: movl $2, %edx
-; X32: movl $1, %edi
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: pushl ${{1|2}}
-; X32: call{{.*}} _test_argv64i1
-
-; WIN64-LABEL: caller_argv64i1:
-; WIN64: movabsq $4294967298, %rax
-; WIN64: movq %rax, (%rsp)
-; WIN64: movq %rax, %rcx
-; WIN64: movq %rax, %rdx
-; WIN64: movq %rax, %rdi
-; WIN64: movq %rax, %rsi
-; WIN64: movq %rax, %r8
-; WIN64: movq %rax, %r9
-; WIN64: movq %rax, %r10
-; WIN64: movq %rax, %r11
-; WIN64: movq %rax, %r12
-; WIN64: movq %rax, %r14
-; WIN64: movq %rax, %r15
-; WIN64: callq test_argv64i1
-
-; LINUXOSX64-LABEL: caller_argv64i1:
-; LINUXOSX64: movabsq $4294967298, %rax
-; LINUXOSX64: movq %rax, %rcx
-; LINUXOSX64: movq %rax, %rdx
-; LINUXOSX64: movq %rax, %rdi
-; LINUXOSX64: movq %rax, %rsi
-; LINUXOSX64: movq %rax, %r8
-; LINUXOSX64: movq %rax, %r9
-; LINUXOSX64: movq %rax, %r12
-; LINUXOSX64: movq %rax, %r13
-; LINUXOSX64: movq %rax, %r14
-; LINUXOSX64: movq %rax, %r15
-; LINUXOSX64: call{{.*}} test_argv64i1
-
; Test regcall when passing arguments of v64i1 type
define i64 @caller_argv64i1() #0 {
+; X32-LABEL: caller_argv64i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %edi
+; X32-NEXT: subl $88, %esp
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1]
+; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
+; X32-NEXT: vmovups %zmm0, (%esp)
+; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X32-NEXT: movl $2, {{[0-9]+}}(%esp)
+; X32-NEXT: movl $2, %eax
+; X32-NEXT: movl $1, %ecx
+; X32-NEXT: movl $2, %edx
+; X32-NEXT: movl $1, %edi
+; X32-NEXT: vzeroupper
+; X32-NEXT: calll _test_argv64i1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: addl $88, %esp
+; X32-NEXT: popl %edi
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_argv64i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %r15
+; WIN64-NEXT: .seh_pushreg 15
+; WIN64-NEXT: pushq %r14
+; WIN64-NEXT: .seh_pushreg 14
+; WIN64-NEXT: pushq %r12
+; WIN64-NEXT: .seh_pushreg 12
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $48, %rsp
+; WIN64-NEXT: .seh_stackalloc 48
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 32
+; WIN64-NEXT: vmovaps %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 16
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: movabsq $4294967298, %rax # imm = 0x100000002
+; WIN64-NEXT: movq %rax, (%rsp)
+; WIN64-NEXT: movq %rax, %rcx
+; WIN64-NEXT: movq %rax, %rdx
+; WIN64-NEXT: movq %rax, %rdi
+; WIN64-NEXT: movq %rax, %rsi
+; WIN64-NEXT: movq %rax, %r8
+; WIN64-NEXT: movq %rax, %r9
+; WIN64-NEXT: movq %rax, %r10
+; WIN64-NEXT: movq %rax, %r11
+; WIN64-NEXT: movq %rax, %r12
+; WIN64-NEXT: movq %rax, %r14
+; WIN64-NEXT: movq %rax, %r15
+; WIN64-NEXT: callq test_argv64i1
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $48, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: popq %r12
+; WIN64-NEXT: popq %r14
+; WIN64-NEXT: popq %r15
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_argv64i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %r15
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: pushq %r14
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 24
+; LINUXOSX64-NEXT: pushq %r13
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
+; LINUXOSX64-NEXT: pushq %r12
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 40
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 48
+; LINUXOSX64-NEXT: .cfi_offset %r12, -40
+; LINUXOSX64-NEXT: .cfi_offset %r13, -32
+; LINUXOSX64-NEXT: .cfi_offset %r14, -24
+; LINUXOSX64-NEXT: .cfi_offset %r15, -16
+; LINUXOSX64-NEXT: movabsq $4294967298, %rax # imm = 0x100000002
+; LINUXOSX64-NEXT: movq %rax, %rcx
+; LINUXOSX64-NEXT: movq %rax, %rdx
+; LINUXOSX64-NEXT: movq %rax, %rdi
+; LINUXOSX64-NEXT: movq %rax, %rsi
+; LINUXOSX64-NEXT: movq %rax, %r8
+; LINUXOSX64-NEXT: movq %rax, %r9
+; LINUXOSX64-NEXT: movq %rax, %r12
+; LINUXOSX64-NEXT: movq %rax, %r13
+; LINUXOSX64-NEXT: movq %rax, %r14
+; LINUXOSX64-NEXT: movq %rax, %r15
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
+; LINUXOSX64-NEXT: callq test_argv64i1
+; LINUXOSX64-NEXT: addq $24, %rsp
+; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset -16
+; LINUXOSX64-NEXT: popq %r12
+; LINUXOSX64-NEXT: popq %r13
+; LINUXOSX64-NEXT: popq %r14
+; LINUXOSX64-NEXT: popq %r15
+; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i64 4294967298 to <64 x i1>
%call = call x86_regcallcc i64 @test_argv64i1(<64 x i1> %v0, <64 x i1> %v0, <64 x i1> %v0,
@@ -166,83 +226,294 @@ entry:
ret i64 %call
}
-; X32-LABEL: test_retv64i1:
-; X32: mov{{.*}} $2, %eax
-; X32: mov{{.*}} $1, %ecx
-; X32: ret{{.*}}
-
-; CHECK64-LABEL: test_retv64i1:
-; CHECK64: mov{{.*}} $4294967298, %rax
-; CHECK64: ret{{.*}}
-
; Test regcall when returning v64i1 type
define x86_regcallcc <64 x i1> @test_retv64i1() {
+; X32-LABEL: test_retv64i1:
+; X32: # %bb.0:
+; X32-NEXT: movl $2, %eax
+; X32-NEXT: movl $1, %ecx
+; X32-NEXT: retl
+;
+; CHECK64-LABEL: test_retv64i1:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movabsq $4294967298, %rax # imm = 0x100000002
+; CHECK64-NEXT: retq
%a = bitcast i64 4294967298 to <64 x i1>
ret <64 x i1> %a
}
-; X32-LABEL: caller_retv64i1:
-; X32: call{{.*}} _test_retv64i1
-; X32: kmov{{.*}} %eax, %k0
-; X32: kmov{{.*}} %ecx, %k1
-; X32: kunpckdq %k0, %k1, %k0
-
-; CHECK64-LABEL: caller_retv64i1:
-; CHECK64: call{{.*}} {{_*}}test_retv64i1
-; CHECK64: kmovq %rax, %k0
-; CHECK64: ret{{.*}}
-
; Test regcall when processing result of v64i1 type
define <64 x i1> @caller_retv64i1() #0 {
+; X32-LABEL: caller_retv64i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: calll _test_retv64i1
+; X32-NEXT: kmovd %eax, %k0
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: kunpckdq %k0, %k1, %k0
+; X32-NEXT: vpmovm2b %k0, %zmm0
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_retv64i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: callq test_retv64i1
+; WIN64-NEXT: kmovq %rax, %k0
+; WIN64-NEXT: vpmovm2b %k0, %zmm0
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_retv64i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: callq test_retv64i1
+; LINUXOSX64-NEXT: kmovq %rax, %k0
+; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm0
+; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <64 x i1> @test_retv64i1()
ret <64 x i1> %call
}
-; CHECK-LABEL: test_argv32i1:
-; CHECK: kmovd %edx, %k{{[0-9]+}}
-; CHECK: kmovd %ecx, %k{{[0-9]+}}
-; CHECK: kmovd %eax, %k{{[0-9]+}}
-; CHECK: ret{{l|q}}
-
; Test regcall when receiving arguments of v32i1 type
declare i32 @test_argv32i1helper(<32 x i1> %x0, <32 x i1> %x1, <32 x i1> %x2)
define x86_regcallcc i32 @test_argv32i1(<32 x i1> %x0, <32 x i1> %x1, <32 x i1> %x2) {
+; X32-LABEL: test_argv32i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $72, %esp
+; X32-NEXT: vmovups %xmm7, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm6, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm5, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: vpmovm2b %k2, %zmm0
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; X32-NEXT: # kill: def %ymm1 killed %ymm1 killed %zmm1
+; X32-NEXT: # kill: def %ymm2 killed %ymm2 killed %zmm2
+; X32-NEXT: calll _test_argv32i1helper
+; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm5 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm6 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm7 # 16-byte Reload
+; X32-NEXT: addl $72, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argv32i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %r11
+; WIN64-NEXT: .seh_pushreg 11
+; WIN64-NEXT: pushq %r10
+; WIN64-NEXT: .seh_pushreg 10
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $32, %rsp
+; WIN64-NEXT: .seh_stackalloc 32
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: kmovd %edx, %k0
+; WIN64-NEXT: kmovd %ecx, %k1
+; WIN64-NEXT: kmovd %eax, %k2
+; WIN64-NEXT: vpmovm2b %k2, %zmm0
+; WIN64-NEXT: vpmovm2b %k1, %zmm1
+; WIN64-NEXT: vpmovm2b %k0, %zmm2
+; WIN64-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; WIN64-NEXT: # kill: def %ymm1 killed %ymm1 killed %zmm1
+; WIN64-NEXT: # kill: def %ymm2 killed %ymm2 killed %zmm2
+; WIN64-NEXT: callq test_argv32i1helper
+; WIN64-NEXT: nop
+; WIN64-NEXT: addq $32, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: popq %r10
+; WIN64-NEXT: popq %r11
+; WIN64-NEXT: vzeroupper
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_argv32i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $128, %rsp
+; LINUXOSX64-NEXT: vmovaps %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144
+; LINUXOSX64-NEXT: .cfi_offset %xmm9, -128
+; LINUXOSX64-NEXT: .cfi_offset %xmm10, -112
+; LINUXOSX64-NEXT: .cfi_offset %xmm11, -96
+; LINUXOSX64-NEXT: .cfi_offset %xmm12, -80
+; LINUXOSX64-NEXT: .cfi_offset %xmm13, -64
+; LINUXOSX64-NEXT: .cfi_offset %xmm14, -48
+; LINUXOSX64-NEXT: .cfi_offset %xmm15, -32
+; LINUXOSX64-NEXT: kmovd %edx, %k0
+; LINUXOSX64-NEXT: kmovd %ecx, %k1
+; LINUXOSX64-NEXT: kmovd %eax, %k2
+; LINUXOSX64-NEXT: vpmovm2b %k2, %zmm0
+; LINUXOSX64-NEXT: vpmovm2b %k1, %zmm1
+; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm2
+; LINUXOSX64-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; LINUXOSX64-NEXT: # kill: def %ymm1 killed %ymm1 killed %zmm1
+; LINUXOSX64-NEXT: # kill: def %ymm2 killed %ymm2 killed %zmm2
+; LINUXOSX64-NEXT: callq test_argv32i1helper
+; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
+; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: vzeroupper
+; LINUXOSX64-NEXT: retq
entry:
%res = call i32 @test_argv32i1helper(<32 x i1> %x0, <32 x i1> %x1, <32 x i1> %x2)
ret i32 %res
}
-; CHECK-LABEL: caller_argv32i1:
-; CHECK: mov{{.*}} $1, %eax
-; CHECK: mov{{.*}} $1, %ecx
-; CHECK: mov{{.*}} $1, %edx
-; CHECK: call{{.*}} {{_*}}test_argv32i1
-
; Test regcall when passing arguments of v32i1 type
define i32 @caller_argv32i1() #0 {
+; X32-LABEL: caller_argv32i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: movl $1, %ecx
+; X32-NEXT: movl $1, %edx
+; X32-NEXT: calll _test_argv32i1
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_argv32i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: movl $1, %eax
+; WIN64-NEXT: movl $1, %ecx
+; WIN64-NEXT: movl $1, %edx
+; WIN64-NEXT: callq test_argv32i1
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_argv32i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: movl $1, %eax
+; LINUXOSX64-NEXT: movl $1, %ecx
+; LINUXOSX64-NEXT: movl $1, %edx
+; LINUXOSX64-NEXT: callq test_argv32i1
+; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i32 1 to <32 x i1>
%call = call x86_regcallcc i32 @test_argv32i1(<32 x i1> %v0, <32 x i1> %v0, <32 x i1> %v0)
ret i32 %call
}
-; CHECK-LABEL: test_retv32i1:
-; CHECK: movl $1, %eax
-; CHECK: ret{{l|q}}
-
; Test regcall when returning v32i1 type
define x86_regcallcc <32 x i1> @test_retv32i1() {
+; X32-LABEL: test_retv32i1:
+; X32: # %bb.0:
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: retl
+;
+; CHECK64-LABEL: test_retv32i1:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movl $1, %eax
+; CHECK64-NEXT: retq
%a = bitcast i32 1 to <32 x i1>
ret <32 x i1> %a
}
-; CHECK-LABEL: caller_retv32i1:
-; CHECK: call{{.*}} {{_*}}test_retv32i1
-; CHECK: incl %eax
-
; Test regcall when processing result of v32i1 type
define i32 @caller_retv32i1() #0 {
+; X32-LABEL: caller_retv32i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: calll _test_retv32i1
+; X32-NEXT: incl %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_retv32i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: callq test_retv32i1
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_retv32i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: callq test_retv32i1
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <32 x i1> @test_retv32i1()
%c = bitcast <32 x i1> %call to i32
@@ -250,49 +521,232 @@ entry:
ret i32 %add
}
-; CHECK-LABEL: test_argv16i1:
-; CHECK: kmovd %edx, %k{{[0-9]+}}
-; CHECK: kmovd %ecx, %k{{[0-9]+}}
-; CHECK: kmovd %eax, %k{{[0-9]+}}
-; CHECK: ret{{l|q}}
-
; Test regcall when receiving arguments of v16i1 type
declare i16 @test_argv16i1helper(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2)
define x86_regcallcc i16 @test_argv16i1(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2) {
+; X32-LABEL: test_argv16i1:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $72, %esp
+; X32-NEXT: vmovups %xmm7, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm6, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm5, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: vpmovm2b %k2, %zmm0
+; X32-NEXT: vpmovm2b %k1, %zmm1
+; X32-NEXT: vpmovm2b %k0, %zmm2
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 killed %zmm1
+; X32-NEXT: # kill: def %xmm2 killed %xmm2 killed %zmm2
+; X32-NEXT: vzeroupper
+; X32-NEXT: calll _test_argv16i1helper
+; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm5 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm6 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm7 # 16-byte Reload
+; X32-NEXT: addl $72, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argv16i1:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %r11
+; WIN64-NEXT: .seh_pushreg 11
+; WIN64-NEXT: pushq %r10
+; WIN64-NEXT: .seh_pushreg 10
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $32, %rsp
+; WIN64-NEXT: .seh_stackalloc 32
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: kmovd %edx, %k0
+; WIN64-NEXT: kmovd %ecx, %k1
+; WIN64-NEXT: kmovd %eax, %k2
+; WIN64-NEXT: vpmovm2b %k2, %zmm0
+; WIN64-NEXT: vpmovm2b %k1, %zmm1
+; WIN64-NEXT: vpmovm2b %k0, %zmm2
+; WIN64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; WIN64-NEXT: # kill: def %xmm1 killed %xmm1 killed %zmm1
+; WIN64-NEXT: # kill: def %xmm2 killed %xmm2 killed %zmm2
+; WIN64-NEXT: vzeroupper
+; WIN64-NEXT: callq test_argv16i1helper
+; WIN64-NEXT: nop
+; WIN64-NEXT: addq $32, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: popq %r10
+; WIN64-NEXT: popq %r11
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_argv16i1:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $128, %rsp
+; LINUXOSX64-NEXT: vmovaps %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144
+; LINUXOSX64-NEXT: .cfi_offset %xmm9, -128
+; LINUXOSX64-NEXT: .cfi_offset %xmm10, -112
+; LINUXOSX64-NEXT: .cfi_offset %xmm11, -96
+; LINUXOSX64-NEXT: .cfi_offset %xmm12, -80
+; LINUXOSX64-NEXT: .cfi_offset %xmm13, -64
+; LINUXOSX64-NEXT: .cfi_offset %xmm14, -48
+; LINUXOSX64-NEXT: .cfi_offset %xmm15, -32
+; LINUXOSX64-NEXT: kmovd %edx, %k0
+; LINUXOSX64-NEXT: kmovd %ecx, %k1
+; LINUXOSX64-NEXT: kmovd %eax, %k2
+; LINUXOSX64-NEXT: vpmovm2b %k2, %zmm0
+; LINUXOSX64-NEXT: vpmovm2b %k1, %zmm1
+; LINUXOSX64-NEXT: vpmovm2b %k0, %zmm2
+; LINUXOSX64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; LINUXOSX64-NEXT: # kill: def %xmm1 killed %xmm1 killed %zmm1
+; LINUXOSX64-NEXT: # kill: def %xmm2 killed %xmm2 killed %zmm2
+; LINUXOSX64-NEXT: vzeroupper
+; LINUXOSX64-NEXT: callq test_argv16i1helper
+; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
+; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%res = call i16 @test_argv16i1helper(<16 x i1> %x0, <16 x i1> %x1, <16 x i1> %x2)
ret i16 %res
}
-; CHECK-LABEL: caller_argv16i1:
-; CHECK: movl $1, %eax
-; CHECK: movl $1, %ecx
-; CHECK: movl $1, %edx
-; CHECK: call{{l|q}} {{_*}}test_argv16i1
-
; Test regcall when passing arguments of v16i1 type
define i16 @caller_argv16i1() #0 {
+; X32-LABEL: caller_argv16i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: movl $1, %ecx
+; X32-NEXT: movl $1, %edx
+; X32-NEXT: calll _test_argv16i1
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_argv16i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: movl $1, %eax
+; WIN64-NEXT: movl $1, %ecx
+; WIN64-NEXT: movl $1, %edx
+; WIN64-NEXT: callq test_argv16i1
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_argv16i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: movl $1, %eax
+; LINUXOSX64-NEXT: movl $1, %ecx
+; LINUXOSX64-NEXT: movl $1, %edx
+; LINUXOSX64-NEXT: callq test_argv16i1
+; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i16 1 to <16 x i1>
%call = call x86_regcallcc i16 @test_argv16i1(<16 x i1> %v0, <16 x i1> %v0, <16 x i1> %v0)
ret i16 %call
}
-; CHECK-LABEL: test_retv16i1:
-; CHECK: movw $1, %ax
-; CHECK: ret{{l|q}}
-
; Test regcall when returning v16i1 type
define x86_regcallcc <16 x i1> @test_retv16i1() {
+; X32-LABEL: test_retv16i1:
+; X32: # %bb.0:
+; X32-NEXT: movw $1, %ax
+; X32-NEXT: retl
+;
+; CHECK64-LABEL: test_retv16i1:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movw $1, %ax
+; CHECK64-NEXT: retq
%a = bitcast i16 1 to <16 x i1>
ret <16 x i1> %a
}
-; CHECK-LABEL: caller_retv16i1:
-; CHECK: call{{l|q}} {{_*}}test_retv16i1
-; CHECK: incl %eax
-
; Test regcall when processing result of v16i1 type
define i16 @caller_retv16i1() #0 {
+; X32-LABEL: caller_retv16i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: calll _test_retv16i1
+; X32-NEXT: # kill: def %ax killed %ax def %eax
+; X32-NEXT: incl %eax
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_retv16i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: callq test_retv16i1
+; WIN64-NEXT: # kill: def %ax killed %ax def %eax
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: # kill: def %ax killed %ax killed %eax
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_retv16i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: callq test_retv16i1
+; LINUXOSX64-NEXT: # kill: def %ax killed %ax def %eax
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: # kill: def %ax killed %ax killed %eax
+; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <16 x i1> @test_retv16i1()
%c = bitcast <16 x i1> %call to i16
@@ -300,50 +754,238 @@ entry:
ret i16 %add
}
-; CHECK-LABEL: test_argv8i1:
-; CHECK: kmovd %edx, %k{{[0-9]+}}
-; CHECK: kmovd %ecx, %k{{[0-9]+}}
-; CHECK: kmovd %eax, %k{{[0-9]+}}
-; CHECK: ret{{l|q}}
-
; Test regcall when receiving arguments of v8i1 type
declare i8 @test_argv8i1helper(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
define x86_regcallcc i8 @test_argv8i1(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2) {
+; X32-LABEL: test_argv8i1:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $72, %esp
+; X32-NEXT: vmovups %xmm7, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm6, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm5, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT: kmovd %edx, %k0
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: vpmovm2w %k2, %zmm0
+; X32-NEXT: vpmovm2w %k1, %zmm1
+; X32-NEXT: vpmovm2w %k0, %zmm2
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; X32-NEXT: # kill: def %xmm1 killed %xmm1 killed %zmm1
+; X32-NEXT: # kill: def %xmm2 killed %xmm2 killed %zmm2
+; X32-NEXT: vzeroupper
+; X32-NEXT: calll _test_argv8i1helper
+; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm5 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm6 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm7 # 16-byte Reload
+; X32-NEXT: addl $72, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argv8i1:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %r11
+; WIN64-NEXT: .seh_pushreg 11
+; WIN64-NEXT: pushq %r10
+; WIN64-NEXT: .seh_pushreg 10
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $32, %rsp
+; WIN64-NEXT: .seh_stackalloc 32
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: kmovd %edx, %k0
+; WIN64-NEXT: kmovd %ecx, %k1
+; WIN64-NEXT: kmovd %eax, %k2
+; WIN64-NEXT: vpmovm2w %k2, %zmm0
+; WIN64-NEXT: vpmovm2w %k1, %zmm1
+; WIN64-NEXT: vpmovm2w %k0, %zmm2
+; WIN64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; WIN64-NEXT: # kill: def %xmm1 killed %xmm1 killed %zmm1
+; WIN64-NEXT: # kill: def %xmm2 killed %xmm2 killed %zmm2
+; WIN64-NEXT: vzeroupper
+; WIN64-NEXT: callq test_argv8i1helper
+; WIN64-NEXT: nop
+; WIN64-NEXT: addq $32, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: popq %r10
+; WIN64-NEXT: popq %r11
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_argv8i1:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $128, %rsp
+; LINUXOSX64-NEXT: vmovaps %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 144
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: .cfi_offset %xmm8, -144
+; LINUXOSX64-NEXT: .cfi_offset %xmm9, -128
+; LINUXOSX64-NEXT: .cfi_offset %xmm10, -112
+; LINUXOSX64-NEXT: .cfi_offset %xmm11, -96
+; LINUXOSX64-NEXT: .cfi_offset %xmm12, -80
+; LINUXOSX64-NEXT: .cfi_offset %xmm13, -64
+; LINUXOSX64-NEXT: .cfi_offset %xmm14, -48
+; LINUXOSX64-NEXT: .cfi_offset %xmm15, -32
+; LINUXOSX64-NEXT: kmovd %edx, %k0
+; LINUXOSX64-NEXT: kmovd %ecx, %k1
+; LINUXOSX64-NEXT: kmovd %eax, %k2
+; LINUXOSX64-NEXT: vpmovm2w %k2, %zmm0
+; LINUXOSX64-NEXT: vpmovm2w %k1, %zmm1
+; LINUXOSX64-NEXT: vpmovm2w %k0, %zmm2
+; LINUXOSX64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; LINUXOSX64-NEXT: # kill: def %xmm1 killed %xmm1 killed %zmm1
+; LINUXOSX64-NEXT: # kill: def %xmm2 killed %xmm2 killed %zmm2
+; LINUXOSX64-NEXT: vzeroupper
+; LINUXOSX64-NEXT: callq test_argv8i1helper
+; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
+; LINUXOSX64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
+; LINUXOSX64-NEXT: addq $128, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%res = call i8 @test_argv8i1helper(<8 x i1> %x0, <8 x i1> %x1, <8 x i1> %x2)
ret i8 %res
}
-; CHECK-LABEL: caller_argv8i1:
-; CHECK: movl $1, %eax
-; CHECK: movl $1, %ecx
-; CHECK: movl $1, %edx
-; CHECK: call{{l|q}} {{_*}}test_argv8i1
-
; Test regcall when passing arguments of v8i1 type
define i8 @caller_argv8i1() #0 {
+; X32-LABEL: caller_argv8i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl $1, %eax
+; X32-NEXT: movl $1, %ecx
+; X32-NEXT: movl $1, %edx
+; X32-NEXT: calll _test_argv8i1
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_argv8i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: movl $1, %eax
+; WIN64-NEXT: movl $1, %ecx
+; WIN64-NEXT: movl $1, %edx
+; WIN64-NEXT: callq test_argv8i1
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_argv8i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: movl $1, %eax
+; LINUXOSX64-NEXT: movl $1, %ecx
+; LINUXOSX64-NEXT: movl $1, %edx
+; LINUXOSX64-NEXT: callq test_argv8i1
+; LINUXOSX64-NEXT: popq %rcx
+; LINUXOSX64-NEXT: retq
entry:
%v0 = bitcast i8 1 to <8 x i1>
%call = call x86_regcallcc i8 @test_argv8i1(<8 x i1> %v0, <8 x i1> %v0, <8 x i1> %v0)
ret i8 %call
}
-; CHECK-LABEL: test_retv8i1:
-; CHECK: movb $1, %al
-; CHECK: ret{{q|l}}
-
; Test regcall when returning v8i1 type
define x86_regcallcc <8 x i1> @test_retv8i1() {
+; X32-LABEL: test_retv8i1:
+; X32: # %bb.0:
+; X32-NEXT: movb $1, %al
+; X32-NEXT: retl
+;
+; CHECK64-LABEL: test_retv8i1:
+; CHECK64: # %bb.0:
+; CHECK64-NEXT: movb $1, %al
+; CHECK64-NEXT: retq
%a = bitcast i8 1 to <8 x i1>
ret <8 x i1> %a
}
-; CHECK-LABEL: caller_retv8i1:
-; CHECK: call{{l|q}} {{_*}}test_retv8i1
-; CHECK: kmovd %eax, %k{{[0-9]+}}
-; CHECK: ret{{l|q}}
-
; Test regcall when processing result of v8i1 type
define <8 x i1> @caller_retv8i1() #0 {
+; X32-LABEL: caller_retv8i1:
+; X32: # %bb.0: # %entry
+; X32-NEXT: calll _test_retv8i1
+; X32-NEXT: # kill: def %al killed %al def %eax
+; X32-NEXT: kmovd %eax, %k0
+; X32-NEXT: vpmovm2w %k0, %zmm0
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; WIN64-LABEL: caller_retv8i1:
+; WIN64: # %bb.0: # %entry
+; WIN64-NEXT: pushq %rsi
+; WIN64-NEXT: .seh_pushreg 6
+; WIN64-NEXT: pushq %rdi
+; WIN64-NEXT: .seh_pushreg 7
+; WIN64-NEXT: subq $40, %rsp
+; WIN64-NEXT: .seh_stackalloc 40
+; WIN64-NEXT: vmovaps %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 7, 16
+; WIN64-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 6, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: callq test_retv8i1
+; WIN64-NEXT: # kill: def %al killed %al def %eax
+; WIN64-NEXT: kmovd %eax, %k0
+; WIN64-NEXT: vpmovm2w %k0, %zmm0
+; WIN64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; WIN64-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
+; WIN64-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; WIN64-NEXT: addq $40, %rsp
+; WIN64-NEXT: popq %rdi
+; WIN64-NEXT: popq %rsi
+; WIN64-NEXT: vzeroupper
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: caller_retv8i1:
+; LINUXOSX64: # %bb.0: # %entry
+; LINUXOSX64-NEXT: pushq %rax
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: callq test_retv8i1
+; LINUXOSX64-NEXT: # kill: def %al killed %al def %eax
+; LINUXOSX64-NEXT: kmovd %eax, %k0
+; LINUXOSX64-NEXT: vpmovm2w %k0, %zmm0
+; LINUXOSX64-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; LINUXOSX64-NEXT: popq %rax
+; LINUXOSX64-NEXT: vzeroupper
+; LINUXOSX64-NEXT: retq
entry:
%call = call x86_regcallcc <8 x i1> @test_retv8i1()
ret <8 x i1> %call
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index f43d5b3e11dd..9096720f172d 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -1,307 +1,617 @@
-; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=X32 %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=ALL --check-prefix=WIN64 %s
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=LINUXOSX64 %s
-
-; ALL-LABEL: test_argReti1:
-; ALL: incb %al
-; ALL: ret{{.*}}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=LINUXOSX64
; Test regcall when receiving/returning i1
define x86_regcallcc i1 @test_argReti1(i1 %a) {
+; X32-LABEL: test_argReti1:
+; X32: # %bb.0:
+; X32-NEXT: incb %al
+; X32-NEXT: # kill: def %al killed %al killed %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64: # %bb.0:
+; WIN64-NEXT: incb %al
+; WIN64-NEXT: # kill: def %al killed %al killed %eax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argReti1:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: incb %al
+; LINUXOSX64-NEXT: # kill: def %al killed %al killed %eax
+; LINUXOSX64-NEXT: retq
%add = add i1 %a, 1
ret i1 %add
}
-; ALL-LABEL: test_CallargReti1:
-; ALL: movzbl %al, %eax
-; ALL: call{{.*}}test_argReti1
-; ALL: incb %al
-; ALL: ret{{.*}}
-
; Test regcall when passing/retrieving i1
define x86_regcallcc i1 @test_CallargReti1(i1 %a) {
+; X32-LABEL: test_CallargReti1:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: incb %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: calll _test_argReti1
+; X32-NEXT: incb %al
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: incb %al
+; WIN64-NEXT: movzbl %al, %eax
+; WIN64-NEXT: callq test_argReti1
+; WIN64-NEXT: incb %al
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargReti1:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: incb %al
+; LINUXOSX64-NEXT: movzbl %al, %eax
+; LINUXOSX64-NEXT: callq test_argReti1
+; LINUXOSX64-NEXT: incb %al
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = add i1 %a, 1
%c = call x86_regcallcc i1 @test_argReti1(i1 %b)
%d = add i1 %c, 1
ret i1 %d
}
-; X32-LABEL: test_argReti8:
-; X32: incb %al
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argReti8:
-; WIN64: incb %al
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning i8
define x86_regcallcc i8 @test_argReti8(i8 %a) {
+; X32-LABEL: test_argReti8:
+; X32: # %bb.0:
+; X32-NEXT: incb %al
+; X32-NEXT: # kill: def %al killed %al killed %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argReti8:
+; WIN64: # %bb.0:
+; WIN64-NEXT: incb %al
+; WIN64-NEXT: # kill: def %al killed %al killed %eax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argReti8:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: incb %al
+; LINUXOSX64-NEXT: # kill: def %al killed %al killed %eax
+; LINUXOSX64-NEXT: retq
%add = add i8 %a, 1
ret i8 %add
}
-; X32-LABEL: test_CallargReti8:
-; X32: incb %al
-; X32: call{{.*}} {{.*}}test_argReti8
-; X32: incb %al
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargReti8:
-; WIN64: incb %al
-; WIN64: call{{.*}} {{.*}}test_argReti8
-; WIN64: incb %al
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving i8
define x86_regcallcc i8 @test_CallargReti8(i8 %a) {
+; X32-LABEL: test_CallargReti8:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: incb %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: calll _test_argReti8
+; X32-NEXT: incb %al
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargReti8:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: incb %al
+; WIN64-NEXT: movzbl %al, %eax
+; WIN64-NEXT: callq test_argReti8
+; WIN64-NEXT: incb %al
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargReti8:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: incb %al
+; LINUXOSX64-NEXT: movzbl %al, %eax
+; LINUXOSX64-NEXT: callq test_argReti8
+; LINUXOSX64-NEXT: incb %al
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = add i8 %a, 1
%c = call x86_regcallcc i8 @test_argReti8(i8 %b)
%d = add i8 %c, 1
ret i8 %d
}
-; X32-LABEL: test_argReti16:
-; X32: incl %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argReti16:
-; WIN64: incl %eax
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning i16
define x86_regcallcc i16 @test_argReti16(i16 %a) {
+; X32-LABEL: test_argReti16:
+; X32: # %bb.0:
+; X32-NEXT: incl %eax
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argReti16:
+; WIN64: # %bb.0:
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: # kill: def %ax killed %ax killed %eax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argReti16:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: # kill: def %ax killed %ax killed %eax
+; LINUXOSX64-NEXT: retq
%add = add i16 %a, 1
ret i16 %add
}
-; X32-LABEL: test_CallargReti16:
-; X32: incl %eax
-; X32: call{{.*}} {{.*}}test_argReti16
-; X32: incl %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargReti16:
-; WIN64: incl %eax
-; WIN64: call{{.*}} {{.*}}test_argReti16
-; WIN64: incl %eax
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving i16
define x86_regcallcc i16 @test_CallargReti16(i16 %a) {
+; X32-LABEL: test_CallargReti16:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: incl %eax
+; X32-NEXT: calll _test_argReti16
+; X32-NEXT: # kill: def %ax killed %ax def %eax
+; X32-NEXT: incl %eax
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargReti16:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: callq test_argReti16
+; WIN64-NEXT: # kill: def %ax killed %ax def %eax
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: # kill: def %ax killed %ax killed %eax
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargReti16:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: callq test_argReti16
+; LINUXOSX64-NEXT: # kill: def %ax killed %ax def %eax
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: # kill: def %ax killed %ax killed %eax
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = add i16 %a, 1
%c = call x86_regcallcc i16 @test_argReti16(i16 %b)
%d = add i16 %c, 1
ret i16 %d
}
-; X32-LABEL: test_argReti32:
-; X32: incl %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argReti32:
-; WIN64: incl %eax
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning i32
define x86_regcallcc i32 @test_argReti32(i32 %a) {
+; X32-LABEL: test_argReti32:
+; X32: # %bb.0:
+; X32-NEXT: incl %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argReti32:
+; WIN64: # %bb.0:
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argReti32:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: retq
%add = add i32 %a, 1
ret i32 %add
}
-; X32-LABEL: test_CallargReti32:
-; X32: incl %eax
-; X32: call{{.*}} {{.*}}test_argReti32
-; X32: incl %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargReti32:
-; WIN64: incl %eax
-; WIN64: call{{.*}} {{.*}}test_argReti32
-; WIN64: incl %eax
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving i32
define x86_regcallcc i32 @test_CallargReti32(i32 %a) {
+; X32-LABEL: test_CallargReti32:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: incl %eax
+; X32-NEXT: calll _test_argReti32
+; X32-NEXT: incl %eax
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargReti32:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: callq test_argReti32
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargReti32:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: callq test_argReti32
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = add i32 %a, 1
%c = call x86_regcallcc i32 @test_argReti32(i32 %b)
%d = add i32 %c, 1
ret i32 %d
}
-; X32-LABEL: test_argReti64:
-; X32: addl $3, %eax
-; X32: adcl $1, %ecx
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argReti64:
-; WIN64: movabsq $4294967299, %r{{.*}}
-; WIN64: addq %r{{.*}}, %rax
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning i64
define x86_regcallcc i64 @test_argReti64(i64 %a) {
+; X32-LABEL: test_argReti64:
+; X32: # %bb.0:
+; X32-NEXT: addl $3, %eax
+; X32-NEXT: adcl $1, %ecx
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argReti64:
+; WIN64: # %bb.0:
+; WIN64-NEXT: movabsq $4294967299, %rcx # imm = 0x100000003
+; WIN64-NEXT: addq %rcx, %rax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argReti64:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: movabsq $4294967299, %rcx # imm = 0x100000003
+; LINUXOSX64-NEXT: addq %rcx, %rax
+; LINUXOSX64-NEXT: retq
%add = add i64 %a, 4294967299
ret i64 %add
}
-; X32-LABEL: test_CallargReti64:
-; X32: add{{.*}} $1, %eax
-; X32: adcl $0, {{%e(cx|dx|si|di|bx|bp)}}
-; X32: call{{.*}} {{.*}}test_argReti64
-; X32: add{{.*}} $1, %eax
-; X32: adcl $0, {{%e(cx|dx|si|di|bx|bp)}}
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargReti64:
-; WIN64: incq %rax
-; WIN64: call{{.*}} {{.*}}test_argReti64
-; WIN64: incq %rax
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving i64
define x86_regcallcc i64 @test_CallargReti64(i64 %a) {
+; X32-LABEL: test_CallargReti64:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: addl $1, %eax
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: calll _test_argReti64
+; X32-NEXT: addl $1, %eax
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargReti64:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: incq %rax
+; WIN64-NEXT: callq test_argReti64
+; WIN64-NEXT: incq %rax
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargReti64:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: incq %rax
+; LINUXOSX64-NEXT: callq test_argReti64
+; LINUXOSX64-NEXT: incq %rax
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = add i64 %a, 1
%c = call x86_regcallcc i64 @test_argReti64(i64 %b)
%d = add i64 %c, 1
ret i64 %d
}
-; X32-LABEL: test_argRetFloat:
-; X32: vadd{{.*}} {{.*}}, %xmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argRetFloat:
-; WIN64: vadd{{.*}} {{.*}}, %xmm0
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning float
define x86_regcallcc float @test_argRetFloat(float %a) {
+; X32-LABEL: test_argRetFloat:
+; X32: # %bb.0:
+; X32-NEXT: vaddss __real@3f800000, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRetFloat:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vaddss __real@{{.*}}(%rip), %xmm0, %xmm0
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRetFloat:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; LINUXOSX64-NEXT: retq
%add = fadd float 1.0, %a
ret float %add
}
-; X32-LABEL: test_CallargRetFloat:
-; X32: vadd{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0
-; X32: call{{.*}} {{.*}}test_argRetFloat
-; X32: vadd{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargRetFloat:
-; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0
-; WIN64: call{{.*}} {{.*}}test_argRetFloat
-; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving float
define x86_regcallcc float @test_CallargRetFloat(float %a) {
+; X32-LABEL: test_CallargRetFloat:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; X32-NEXT: vaddss %xmm4, %xmm0, %xmm0
+; X32-NEXT: calll _test_argRetFloat
+; X32-NEXT: vaddss %xmm4, %xmm0, %xmm0
+; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRetFloat:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $16, %rsp
+; WIN64-NEXT: .seh_stackalloc 16
+; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 8, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; WIN64-NEXT: vaddss %xmm8, %xmm0, %xmm0
+; WIN64-NEXT: callq test_argRetFloat
+; WIN64-NEXT: vaddss %xmm8, %xmm0, %xmm0
+; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; WIN64-NEXT: addq $16, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRetFloat:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $16, %rsp
+; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32
+; LINUXOSX64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; LINUXOSX64-NEXT: vaddss %xmm8, %xmm0, %xmm0
+; LINUXOSX64-NEXT: callq test_argRetFloat
+; LINUXOSX64-NEXT: vaddss %xmm8, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = fadd float 1.0, %a
%c = call x86_regcallcc float @test_argRetFloat(float %b)
%d = fadd float 1.0, %c
ret float %d
}
-; X32-LABEL: test_argRetDouble:
-; X32: vadd{{.*}} {{.*}}, %xmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argRetDouble:
-; WIN64: vadd{{.*}} {{.*}}, %xmm0
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning double
define x86_regcallcc double @test_argRetDouble(double %a) {
+; X32-LABEL: test_argRetDouble:
+; X32: # %bb.0:
+; X32-NEXT: vaddsd __real@3ff0000000000000, %xmm0, %xmm0
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRetDouble:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vaddsd __real@{{.*}}(%rip), %xmm0, %xmm0
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRetDouble:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vaddsd {{.*}}(%rip), %xmm0, %xmm0
+; LINUXOSX64-NEXT: retq
%add = fadd double %a, 1.0
ret double %add
}
-; X32-LABEL: test_CallargRetDouble:
-; X32: vadd{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0
-; X32: call{{.*}} {{.*}}test_argRetDouble
-; X32: vadd{{.*}} {{%xmm([0-7])}}, %xmm0, %xmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargRetDouble:
-; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0
-; WIN64: call{{.*}} {{.*}}test_argRetDouble
-; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, %xmm0, %xmm0
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving double
define x86_regcallcc double @test_CallargRetDouble(double %a) {
+; X32-LABEL: test_CallargRetDouble:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; X32-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; X32-NEXT: calll _test_argRetDouble
+; X32-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRetDouble:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $16, %rsp
+; WIN64-NEXT: .seh_stackalloc 16
+; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 8, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
+; WIN64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
+; WIN64-NEXT: callq test_argRetDouble
+; WIN64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
+; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; WIN64-NEXT: addq $16, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRetDouble:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $16, %rsp
+; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32
+; LINUXOSX64-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
+; LINUXOSX64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
+; LINUXOSX64-NEXT: callq test_argRetDouble
+; LINUXOSX64-NEXT: vaddsd %xmm8, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = fadd double 1.0, %a
%c = call x86_regcallcc double @test_argRetDouble(double %b)
%d = fadd double 1.0, %c
ret double %d
}
-; X32: test_argRetf80
-; X32-NOT: fldt
-; X32: fadd %st(0), %st(0)
-; X32: retl
-
-; WIN64: test_argRetf80
-; WIN64-NOT: fldt
-; WIN64: fadd %st(0), %st(0)
-; WIN64: retq
-
; Test regcall when receiving/returning long double
define x86_regcallcc x86_fp80 @test_argRetf80(x86_fp80 %a0) nounwind {
+; X32-LABEL: test_argRetf80:
+; X32: # %bb.0:
+; X32-NEXT: fadd %st(0), %st(0)
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRetf80:
+; WIN64: # %bb.0:
+; WIN64-NEXT: fadd %st(0), %st(0)
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRetf80:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: fadd %st(0), %st(0)
+; LINUXOSX64-NEXT: retq
%r0 = fadd x86_fp80 %a0, %a0
ret x86_fp80 %r0
}
-; X32: test_CallargRetf80
-; X32-NOT: fldt
-; X32: fadd %st({{[0-7]}}), %st({{[0-7]}})
-; X32: call{{.*}} {{.*}}test_argRetf80
-; X32: fadd{{.*}} %st({{[0-7]}})
-; X32: retl
-
-; WIN64: test_CallargRetf80
-; WIN64-NOT: fldt
-; WIN64: fadd %st({{[0-7]}}), %st({{[0-7]}})
-; WIN64: call{{.*}} {{.*}}test_argRetf80
-; WIN64: fadd{{.*}} %st({{[0-7]}})
-; WIN64: retq
-
; Test regcall when passing/retrieving long double
define x86_regcallcc x86_fp80 @test_CallargRetf80(x86_fp80 %a) {
+; X32-LABEL: test_CallargRetf80:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: fadd %st(0), %st(0)
+; X32-NEXT: calll _test_argRetf80
+; X32-NEXT: fadd %st(0), %st(0)
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRetf80:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: fadd %st(0), %st(0)
+; WIN64-NEXT: callq test_argRetf80
+; WIN64-NEXT: fadd %st(0), %st(0)
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRetf80:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: fadd %st(0), %st(0)
+; LINUXOSX64-NEXT: callq test_argRetf80
+; LINUXOSX64-NEXT: fadd %st(0), %st(0)
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = fadd x86_fp80 %a, %a
%c = call x86_regcallcc x86_fp80 @test_argRetf80(x86_fp80 %b)
%d = fadd x86_fp80 %c, %c
ret x86_fp80 %d
}
-; X32-LABEL: test_argRetPointer:
-; X32: incl %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argRetPointer:
-; WIN64: incl %eax
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning pointer
define x86_regcallcc [4 x i32]* @test_argRetPointer([4 x i32]* %a) {
+; X32-LABEL: test_argRetPointer:
+; X32: # %bb.0:
+; X32-NEXT: incl %eax
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRetPointer:
+; WIN64: # %bb.0:
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRetPointer:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: retq
%b = ptrtoint [4 x i32]* %a to i32
%c = add i32 %b, 1
%d = inttoptr i32 %c to [4 x i32]*
ret [4 x i32]* %d
}
-; X32-LABEL: test_CallargRetPointer:
-; X32: incl %eax
-; X32: call{{.*}} {{.*}}test_argRetPointer
-; X32: incl %eax
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargRetPointer:
-; WIN64: incl %eax
-; WIN64: call{{.*}} {{.*}}test_argRetPointer
-; WIN64: incl %eax
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving pointer
define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
+; X32-LABEL: test_CallargRetPointer:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: incl %eax
+; X32-NEXT: calll _test_argRetPointer
+; X32-NEXT: incl %eax
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRetPointer:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: callq test_argRetPointer
+; WIN64-NEXT: incl %eax
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRetPointer:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: callq test_argRetPointer
+; LINUXOSX64-NEXT: incl %eax
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = ptrtoint [4 x i32]* %a to i32
%c = add i32 %b, 1
%d = inttoptr i32 %c to [4 x i32]*
@@ -312,144 +622,276 @@ define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) {
ret [4 x i32]* %h
}
-; X32-LABEL: test_argRet128Vector:
-; X32: vpblend{{.*}} %xmm0, %xmm1, %xmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argRet128Vector:
-; WIN64: vpblend{{.*}} %xmm0, %xmm1, %xmm0
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning 128 bit vector
define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %b) {
+; X32-LABEL: test_argRet128Vector:
+; X32: # %bb.0:
+; X32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRet128Vector:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRet128Vector:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; LINUXOSX64-NEXT: retq
%d = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %d
}
-; X32-LABEL: test_CallargRet128Vector:
-; X32: vmov{{.*}} %xmm0, {{%xmm([0-7])}}
-; X32: call{{.*}} {{.*}}test_argRet128Vector
-; X32: vmovdqa{{.*}} {{%xmm([0-7])}}, %xmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargRet128Vector:
-; WIN64: vmov{{.*}} %xmm0, {{%xmm([0-9]+)}}
-; WIN64: call{{.*}} {{.*}}test_argRet128Vector
-; WIN64: vmovdqa{{.*}} {{%xmm([0-9]+)}}, %xmm0
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving 128 bit vector
define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) {
+; X32-LABEL: test_CallargRet128Vector:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $24, %esp
+; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill
+; X32-NEXT: vmovdqa %xmm0, %xmm4
+; X32-NEXT: vmovdqa %xmm4, %xmm1
+; X32-NEXT: calll _test_argRet128Vector
+; X32-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
+; X32-NEXT: vmovups (%esp), %xmm4 # 16-byte Reload
+; X32-NEXT: addl $24, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRet128Vector:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $16, %rsp
+; WIN64-NEXT: .seh_stackalloc 16
+; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; WIN64-NEXT: .seh_savexmm 8, 0
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: vmovdqa %xmm0, %xmm8
+; WIN64-NEXT: vmovdqa %xmm8, %xmm1
+; WIN64-NEXT: callq test_argRet128Vector
+; WIN64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1}
+; WIN64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; WIN64-NEXT: addq $16, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRet128Vector:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $16, %rsp
+; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32
+; LINUXOSX64-NEXT: vmovdqa %xmm0, %xmm8
+; LINUXOSX64-NEXT: vmovdqa %xmm8, %xmm1
+; LINUXOSX64-NEXT: callq test_argRet128Vector
+; LINUXOSX64-NEXT: vmovdqa32 %xmm8, %xmm0 {%k1}
+; LINUXOSX64-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
+; LINUXOSX64-NEXT: addq $16, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a)
%c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %c
}
-; X32-LABEL: test_argRet256Vector:
-; X32: vpblend{{.*}} %ymm0, %ymm1, %ymm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argRet256Vector:
-; WIN64: vpblend{{.*}} %ymm0, %ymm1, %ymm0
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning 256 bit vector
define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %b) {
+; X32-LABEL: test_argRet256Vector:
+; X32: # %bb.0:
+; X32-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRet256Vector:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRet256Vector:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; LINUXOSX64-NEXT: retq
%d = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
ret <8 x i32> %d
}
-; X32-LABEL: test_CallargRet256Vector:
-; X32: vmov{{.*}} %ymm0, %ymm1
-; X32: call{{.*}} {{.*}}test_argRet256Vector
-; X32: vmovdqa{{.*}} %ymm1, %ymm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargRet256Vector:
-; WIN64: vmov{{.*}} %ymm0, %ymm1
-; WIN64: call{{.*}} {{.*}}test_argRet256Vector
-; WIN64: vmovdqa{{.*}} %ymm1, %ymm0
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving 256 bit vector
define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) {
+; X32-LABEL: test_CallargRet256Vector:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $56, %esp
+; X32-NEXT: vmovdqu %ymm0, (%esp) # 32-byte Spill
+; X32-NEXT: vmovdqa %ymm0, %ymm1
+; X32-NEXT: calll _test_argRet256Vector
+; X32-NEXT: vmovdqu (%esp), %ymm1 # 32-byte Reload
+; X32-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; X32-NEXT: addl $56, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRet256Vector:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $48, %rsp
+; WIN64-NEXT: .seh_stackalloc 48
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; WIN64-NEXT: vmovdqa %ymm0, %ymm1
+; WIN64-NEXT: callq test_argRet256Vector
+; WIN64-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; WIN64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; WIN64-NEXT: addq $48, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRet256Vector:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $48, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 64
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; LINUXOSX64-NEXT: vmovdqa %ymm0, %ymm1
+; LINUXOSX64-NEXT: callq test_argRet256Vector
+; LINUXOSX64-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; LINUXOSX64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
+; LINUXOSX64-NEXT: addq $48, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a)
%c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b
ret <8 x i32> %c
}
-; X32-LABEL: test_argRet512Vector:
-; X32: vpblend{{.*}} %zmm0, %zmm1, %zmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_argRet512Vector:
-; WIN64: vpblend{{.*}} %zmm0, %zmm1, %zmm0
-; WIN64: ret{{.*}}
-
; Test regcall when receiving/returning 512 bit vector
define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %b) {
+; X32-LABEL: test_argRet512Vector:
+; X32: # %bb.0:
+; X32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRet512Vector:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRet512Vector:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; LINUXOSX64-NEXT: retq
%d = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
ret <16 x i32> %d
}
-; X32-LABEL: test_CallargRet512Vector:
-; X32: vmov{{.*}} %zmm0, %zmm1
-; X32: call{{.*}} {{.*}}test_argRet512Vector
-; X32: movdqa{{.*}} %zmm1, %zmm0
-; X32: ret{{.*}}
-
-; WIN64-LABEL: test_CallargRet512Vector:
-; WIN64: vmov{{.*}} %zmm0, %zmm1
-; WIN64: call{{.*}} {{.*}}test_argRet512Vector
-; WIN64: vmovdqa{{.*}} %zmm1, %zmm0
-; WIN64: ret{{.*}}
-
; Test regcall when passing/retrieving 512 bit vector
define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a) {
+; X32-LABEL: test_CallargRet512Vector:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esp
+; X32-NEXT: subl $120, %esp
+; X32-NEXT: vmovdqu64 %zmm0, (%esp) # 64-byte Spill
+; X32-NEXT: vmovdqa64 %zmm0, %zmm1
+; X32-NEXT: calll _test_argRet512Vector
+; X32-NEXT: vmovdqu64 (%esp), %zmm1 # 64-byte Reload
+; X32-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; X32-NEXT: addl $120, %esp
+; X32-NEXT: popl %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_CallargRet512Vector:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rsp
+; WIN64-NEXT: .seh_pushreg 4
+; WIN64-NEXT: subq $112, %rsp
+; WIN64-NEXT: .seh_stackalloc 112
+; WIN64-NEXT: .seh_endprologue
+; WIN64-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; WIN64-NEXT: vmovdqa64 %zmm0, %zmm1
+; WIN64-NEXT: callq test_argRet512Vector
+; WIN64-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; WIN64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; WIN64-NEXT: addq $112, %rsp
+; WIN64-NEXT: popq %rsp
+; WIN64-NEXT: retq
+; WIN64-NEXT: .seh_handlerdata
+; WIN64-NEXT: .text
+; WIN64-NEXT: .seh_endproc
+;
+; LINUXOSX64-LABEL: test_CallargRet512Vector:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 16
+; LINUXOSX64-NEXT: subq $112, %rsp
+; LINUXOSX64-NEXT: .cfi_def_cfa_offset 128
+; LINUXOSX64-NEXT: .cfi_offset %rsp, -16
+; LINUXOSX64-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; LINUXOSX64-NEXT: vmovdqa64 %zmm0, %zmm1
+; LINUXOSX64-NEXT: callq test_argRet512Vector
+; LINUXOSX64-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; LINUXOSX64-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; LINUXOSX64-NEXT: addq $112, %rsp
+; LINUXOSX64-NEXT: popq %rsp
+; LINUXOSX64-NEXT: retq
%b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a)
%c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b
ret <16 x i32> %c
}
-; WIN64-LABEL: testf32_inp
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; WIN64: retq
-
-; X32-LABEL: testf32_inp
-; X32: vmovups {{%xmm([0-7])}}, {{.*(%esp).*}} {{#+}} 16-byte Spill
-; X32: vmovups {{%xmm([0-7])}}, {{.*(%esp).*}} {{#+}} 16-byte Spill
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}}
-; X32: vmovups {{.*(%esp).*}}, {{%xmm([0-7])}} {{#+}} 16-byte Reload
-; X32: vmovups {{.*(%esp).*}}, {{%xmm([0-7])}} {{#+}} 16-byte Reload
-; X32: retl
-
-; LINUXOSX64-LABEL: testf32_inp
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}
-; LINUXOSX64: retq
-
-; Test regcall when running multiple input parameters - callee saved XMMs
+; Test regcall when running multiple input parameters - callee saved xmms
define x86_regcallcc <32 x float> @testf32_inp(<32 x float> %a, <32 x float> %b, <32 x float> %c) nounwind {
+; X32-LABEL: testf32_inp:
+; X32: # %bb.0:
+; X32-NEXT: subl $44, %esp
+; X32-NEXT: vmovups %xmm7, {{[0-9]+}}(%esp) # 16-byte Spill
+; X32-NEXT: vmovups %xmm6, (%esp) # 16-byte Spill
+; X32-NEXT: vaddps %zmm2, %zmm0, %zmm6
+; X32-NEXT: vaddps %zmm3, %zmm1, %zmm7
+; X32-NEXT: vmulps %zmm2, %zmm0, %zmm0
+; X32-NEXT: vsubps %zmm0, %zmm6, %zmm0
+; X32-NEXT: vmulps %zmm3, %zmm1, %zmm1
+; X32-NEXT: vsubps %zmm1, %zmm7, %zmm1
+; X32-NEXT: vaddps %zmm4, %zmm0, %zmm0
+; X32-NEXT: vaddps %zmm5, %zmm1, %zmm1
+; X32-NEXT: vmovups (%esp), %xmm6 # 16-byte Reload
+; X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm7 # 16-byte Reload
+; X32-NEXT: addl $44, %esp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vaddps %zmm2, %zmm0, %zmm6
+; WIN64-NEXT: vaddps %zmm3, %zmm1, %zmm7
+; WIN64-NEXT: vmulps %zmm2, %zmm0, %zmm0
+; WIN64-NEXT: vsubps %zmm0, %zmm6, %zmm0
+; WIN64-NEXT: vmulps %zmm3, %zmm1, %zmm1
+; WIN64-NEXT: vsubps %zmm1, %zmm7, %zmm1
+; WIN64-NEXT: vaddps %zmm4, %zmm0, %zmm0
+; WIN64-NEXT: vaddps %zmm5, %zmm1, %zmm1
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: testf32_inp:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vaddps %zmm2, %zmm0, %zmm6
+; LINUXOSX64-NEXT: vaddps %zmm3, %zmm1, %zmm7
+; LINUXOSX64-NEXT: vmulps %zmm2, %zmm0, %zmm0
+; LINUXOSX64-NEXT: vsubps %zmm0, %zmm6, %zmm0
+; LINUXOSX64-NEXT: vmulps %zmm3, %zmm1, %zmm1
+; LINUXOSX64-NEXT: vsubps %zmm1, %zmm7, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm4, %zmm0, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm5, %zmm1, %zmm1
+; LINUXOSX64-NEXT: retq
%x1 = fadd <32 x float> %a, %b
%x2 = fmul <32 x float> %a, %b
%x3 = fsub <32 x float> %x1, %x2
@@ -457,32 +899,136 @@ define x86_regcallcc <32 x float> @testf32_inp(<32 x float> %a, <32 x float> %b,
ret <32 x float> %x4
}
-; X32-LABEL: testi32_inp
-; X32: pushl {{%e(bx|bp)}}
-; X32: pushl {{%e(bx|bp)}}
-; X32: popl {{%e(bx|bp)}}
-; X32: popl {{%e(bx|bp)}}
-; X32: retl
-
-; WIN64-LABEL: testi32_inp
-; WIN64: pushq {{%r(bp|bx|1[0-5])}}
-; WIN64: pushq {{%r(bp|bx|1[0-5])}}
-; WIN64: pushq {{%r(bp|bx|1[0-5])}}
-; WIN64: popq {{%r(bp|bx|1[0-5])}}
-; WIN64: popq {{%r(bp|bx|1[0-5])}}
-; WIN64: popq {{%r(bp|bx|1[0-5])}}
-; WIN64: retq
-
-; LINUXOSX64-LABEL: testi32_inp
-; LINUXOSX64: pushq {{%r(bp|bx|1[2-5])}}
-; LINUXOSX64: pushq {{%r(bp|bx|1[2-5])}}
-; LINUXOSX64: popq {{%r(bp|bx|1[2-5])}}
-; LINUXOSX64: popq {{%r(bp|bx|1[2-5])}}
-; LINUXOSX64: retq
-
; Test regcall when running multiple input parameters - callee saved GPRs
-define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6,
- i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind {
+define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind {
+; X32-LABEL: testi32_inp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: pushl %ebx
+; X32-NEXT: subl $20, %esp
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: subl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: imull %ebp, %edx
+; X32-NEXT: subl %esi, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %ebp
+; X32-NEXT: subl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: imull %ebp, %eax
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: imull %eax, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: imull %ebp, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: addl $20, %esp
+; X32-NEXT: popl %ebx
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: testi32_inp:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %r13
+; WIN64-NEXT: pushq %rbp
+; WIN64-NEXT: pushq %rbx
+; WIN64-NEXT: movl %eax, %r13d
+; WIN64-NEXT: subl %ecx, %eax
+; WIN64-NEXT: movl %edx, %ebp
+; WIN64-NEXT: subl %edi, %ebp
+; WIN64-NEXT: movl %r9d, %ebx
+; WIN64-NEXT: subl %r10d, %ebx
+; WIN64-NEXT: imull %ebx, %eax
+; WIN64-NEXT: movl %r11d, %ebx
+; WIN64-NEXT: subl %r12d, %ebx
+; WIN64-NEXT: imull %ebp, %ebx
+; WIN64-NEXT: movl %esi, %ebp
+; WIN64-NEXT: subl %r8d, %ebp
+; WIN64-NEXT: addl %ebx, %eax
+; WIN64-NEXT: movl %r14d, %ebx
+; WIN64-NEXT: subl %r15d, %ebx
+; WIN64-NEXT: imull %ebp, %ebx
+; WIN64-NEXT: addl %ebx, %eax
+; WIN64-NEXT: addl %ecx, %r13d
+; WIN64-NEXT: addl %edi, %edx
+; WIN64-NEXT: addl %r8d, %esi
+; WIN64-NEXT: addl %r10d, %r9d
+; WIN64-NEXT: imull %r13d, %r9d
+; WIN64-NEXT: addl %r12d, %r11d
+; WIN64-NEXT: imull %edx, %r11d
+; WIN64-NEXT: addl %r9d, %r11d
+; WIN64-NEXT: addl %r15d, %r14d
+; WIN64-NEXT: imull %esi, %r14d
+; WIN64-NEXT: addl %r11d, %r14d
+; WIN64-NEXT: addl %r14d, %eax
+; WIN64-NEXT: popq %rbx
+; WIN64-NEXT: popq %rbp
+; WIN64-NEXT: popq %r13
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: testi32_inp:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rbp
+; LINUXOSX64-NEXT: pushq %rbx
+; LINUXOSX64-NEXT: movl %eax, %r10d
+; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r11d
+; LINUXOSX64-NEXT: subl %ecx, %eax
+; LINUXOSX64-NEXT: movl %edx, %ebx
+; LINUXOSX64-NEXT: subl %edi, %ebx
+; LINUXOSX64-NEXT: movl %r9d, %ebp
+; LINUXOSX64-NEXT: subl %r12d, %ebp
+; LINUXOSX64-NEXT: imull %ebp, %eax
+; LINUXOSX64-NEXT: movl %r13d, %ebp
+; LINUXOSX64-NEXT: subl %r14d, %ebp
+; LINUXOSX64-NEXT: imull %ebx, %ebp
+; LINUXOSX64-NEXT: movl %esi, %ebx
+; LINUXOSX64-NEXT: subl %r8d, %ebx
+; LINUXOSX64-NEXT: addl %ebp, %eax
+; LINUXOSX64-NEXT: movl %r15d, %ebp
+; LINUXOSX64-NEXT: subl %r11d, %ebp
+; LINUXOSX64-NEXT: imull %ebx, %ebp
+; LINUXOSX64-NEXT: addl %ebp, %eax
+; LINUXOSX64-NEXT: addl %ecx, %r10d
+; LINUXOSX64-NEXT: addl %edi, %edx
+; LINUXOSX64-NEXT: addl %r8d, %esi
+; LINUXOSX64-NEXT: addl %r12d, %r9d
+; LINUXOSX64-NEXT: imull %r10d, %r9d
+; LINUXOSX64-NEXT: addl %r14d, %r13d
+; LINUXOSX64-NEXT: imull %edx, %r13d
+; LINUXOSX64-NEXT: addl %r9d, %r13d
+; LINUXOSX64-NEXT: addl %r11d, %r15d
+; LINUXOSX64-NEXT: imull %esi, %r15d
+; LINUXOSX64-NEXT: addl %r13d, %r15d
+; LINUXOSX64-NEXT: addl %r15d, %eax
+; LINUXOSX64-NEXT: popq %rbx
+; LINUXOSX64-NEXT: popq %rbp
+; LINUXOSX64-NEXT: retq
%x1 = sub i32 %a1, %a2
%x2 = sub i32 %a3, %a4
%x3 = sub i32 %a5, %a6
@@ -509,48 +1055,85 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
ret i32 %r1
}
-; X32-LABEL: testf32_stack
-; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-1])}}
-; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-1])}}
-; X32: retl
-
-; LINUXOSX64-LABEL: testf32_stack
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}
-; LINUXOSX64: vaddps {{([0-9])+}}(%rbp), {{%zmm([0-9]+)}}, {{%zmm([0-1])}}
-; LINUXOSX64: vaddps {{([0-9])+}}(%rbp), {{%zmm([0-9]+)}}, {{%zmm([0-1])}}
-; LINUXOSX64: retq
-
; Test that parameters, overflowing register capacity, are passed through the stack
-define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32 x float> %b0, <32 x float> %c0,
- <32 x float> %a1, <32 x float> %b1, <32 x float> %c1,
- <32 x float> %a2, <32 x float> %b2, <32 x float> %c2) nounwind {
+define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32 x float> %b0, <32 x float> %c0, <32 x float> %a1, <32 x float> %b1, <32 x float> %c1, <32 x float> %a2, <32 x float> %b2, <32 x float> %c2) nounwind {
+; X32-LABEL: testf32_stack:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-64, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: vaddps %zmm3, %zmm1, %zmm1
+; X32-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; X32-NEXT: vaddps %zmm0, %zmm4, %zmm0
+; X32-NEXT: vaddps %zmm1, %zmm5, %zmm1
+; X32-NEXT: vaddps %zmm1, %zmm7, %zmm1
+; X32-NEXT: vaddps %zmm0, %zmm6, %zmm0
+; X32-NEXT: vaddps 8(%ebp), %zmm0, %zmm0
+; X32-NEXT: vaddps 72(%ebp), %zmm1, %zmm1
+; X32-NEXT: vaddps 200(%ebp), %zmm1, %zmm1
+; X32-NEXT: vaddps 136(%ebp), %zmm0, %zmm0
+; X32-NEXT: vaddps 264(%ebp), %zmm0, %zmm0
+; X32-NEXT: vaddps 328(%ebp), %zmm1, %zmm1
+; X32-NEXT: vaddps 456(%ebp), %zmm1, %zmm1
+; X32-NEXT: vaddps 392(%ebp), %zmm0, %zmm0
+; X32-NEXT: vaddps 520(%ebp), %zmm0, %zmm0
+; X32-NEXT: vaddps 584(%ebp), %zmm1, %zmm1
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: testf32_stack:
+; WIN64: # %bb.0:
+; WIN64-NEXT: pushq %rbp
+; WIN64-NEXT: subq $48, %rsp
+; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
+; WIN64-NEXT: andq $-64, %rsp
+; WIN64-NEXT: vaddps %zmm3, %zmm1, %zmm1
+; WIN64-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; WIN64-NEXT: vaddps %zmm0, %zmm4, %zmm0
+; WIN64-NEXT: vaddps %zmm1, %zmm5, %zmm1
+; WIN64-NEXT: vaddps %zmm1, %zmm7, %zmm1
+; WIN64-NEXT: vaddps %zmm0, %zmm6, %zmm0
+; WIN64-NEXT: vaddps %zmm0, %zmm8, %zmm0
+; WIN64-NEXT: vaddps %zmm1, %zmm9, %zmm1
+; WIN64-NEXT: vaddps %zmm1, %zmm11, %zmm1
+; WIN64-NEXT: vaddps %zmm0, %zmm10, %zmm0
+; WIN64-NEXT: vaddps %zmm0, %zmm12, %zmm0
+; WIN64-NEXT: vaddps %zmm1, %zmm13, %zmm1
+; WIN64-NEXT: vaddps %zmm1, %zmm15, %zmm1
+; WIN64-NEXT: vaddps %zmm0, %zmm14, %zmm0
+; WIN64-NEXT: vaddps 16(%rbp), %zmm0, %zmm0
+; WIN64-NEXT: vaddps 80(%rbp), %zmm1, %zmm1
+; WIN64-NEXT: movq %rbp, %rsp
+; WIN64-NEXT: popq %rbp
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: testf32_stack:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: pushq %rbp
+; LINUXOSX64-NEXT: movq %rsp, %rbp
+; LINUXOSX64-NEXT: andq $-64, %rsp
+; LINUXOSX64-NEXT: subq $64, %rsp
+; LINUXOSX64-NEXT: vaddps %zmm3, %zmm1, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm0, %zmm4, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm1, %zmm5, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm1, %zmm7, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm0, %zmm6, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm0, %zmm8, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm1, %zmm9, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm1, %zmm11, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm0, %zmm10, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm0, %zmm12, %zmm0
+; LINUXOSX64-NEXT: vaddps %zmm1, %zmm13, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm1, %zmm15, %zmm1
+; LINUXOSX64-NEXT: vaddps %zmm0, %zmm14, %zmm0
+; LINUXOSX64-NEXT: vaddps 16(%rbp), %zmm0, %zmm0
+; LINUXOSX64-NEXT: vaddps 80(%rbp), %zmm1, %zmm1
+; LINUXOSX64-NEXT: movq %rbp, %rsp
+; LINUXOSX64-NEXT: popq %rbp
+; LINUXOSX64-NEXT: retq
%x1 = fadd <32 x float> %a0, %b0
%x2 = fadd <32 x float> %c0, %x1
%x3 = fadd <32 x float> %a1, %x2
@@ -562,25 +1145,69 @@ define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32 x float>
ret <32 x float> %x8
}
-; X32-LABEL: vmovd %edx, {{%xmm([0-9])}}
-; X32: vcvtsi2sdl %eax, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; X32: vcvtsi2sdl %ecx, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; X32: vcvtsi2sdl %esi, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; X32: vaddsd %xmm1, %xmm0, %xmm0
-; X32: vcvttsd2si %xmm0, %eax
-; X32: retl
-
-; LINUXOSX64-LABEL: test_argRetMixTypes
-; LINUXOSX64: vcvtss2sd %xmm1, %xmm1, %xmm1
-; LINUXOSX64: vcvtsi2sdl %eax, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; LINUXOSX64: vcvtsi2sdl %ecx, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; LINUXOSX64: vcvtsi2sdq %rdx, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; LINUXOSX64: vcvtsi2sdl %edi, {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; LINUXOSX64: vcvtsi2sdl (%rsi), {{%xmm([0-9])}}, {{%xmm([0-9])}}
-; LINUXOSX64: vcvttsd2si {{%xmm([0-9])}}, %eax
-
; Test regcall when passing/retrieving mixed types
define x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signext, i32, i64, i16 signext, i32*) #0 {
+; X32-LABEL: test_argRetMixTypes:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: vmovd %edx, %xmm2
+; X32-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2
+; X32-NEXT: movl 8(%ebp), %edx
+; X32-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; X32-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; X32-NEXT: vcvtsi2sdl %eax, %xmm3, %xmm1
+; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X32-NEXT: vcvtsi2sdl %ecx, %xmm3, %xmm1
+; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp)
+; X32-NEXT: fildll {{[0-9]+}}(%esp)
+; X32-NEXT: fstpl (%esp)
+; X32-NEXT: vaddsd (%esp), %xmm0, %xmm0
+; X32-NEXT: vcvtsi2sdl %esi, %xmm3, %xmm1
+; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X32-NEXT: vcvtsi2sdl (%edx), %xmm3, %xmm1
+; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X32-NEXT: vcvttsd2si %xmm0, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argRetMixTypes:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; WIN64-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; WIN64-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1
+; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; WIN64-NEXT: vcvtsi2sdl %ecx, %xmm2, %xmm1
+; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; WIN64-NEXT: vcvtsi2sdq %rdx, %xmm2, %xmm1
+; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; WIN64-NEXT: vcvtsi2sdl %edi, %xmm2, %xmm1
+; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; WIN64-NEXT: vcvtsi2sdl (%rsi), %xmm2, %xmm1
+; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; WIN64-NEXT: vcvttsd2si %xmm0, %eax
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argRetMixTypes:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; LINUXOSX64-NEXT: vcvtsi2sdl %eax, %xmm2, %xmm1
+; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vcvtsi2sdl %ecx, %xmm2, %xmm1
+; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vcvtsi2sdq %rdx, %xmm2, %xmm1
+; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vcvtsi2sdl %edi, %xmm2, %xmm1
+; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vcvtsi2sdl (%rsi), %xmm2, %xmm1
+; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; LINUXOSX64-NEXT: vcvttsd2si %xmm0, %eax
+; LINUXOSX64-NEXT: retq
%8 = fpext float %1 to double
%9 = fadd double %8, %0
%10 = sitofp i8 %2 to double
@@ -600,23 +1227,31 @@ define x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signext, i32, i6
%struct.complex = type { float, double, i32, i8, i64}
-
-; X32-LABEL: test_argMultiRet
-; X32: vaddsd {{.*}}, %xmm1, %xmm1
-; X32: movl $4, %eax
-; X32: movb $7, %cl
-; X32: movl $999, %edx
-; X32: xorl %edi, %edi
-; X32: retl
-
-; LINUXOSX64-LABEL: test_argMultiRet
-; LINUXOSX64: vaddsd {{.*}}, %xmm1, %xmm1
-; LINUXOSX64: movl $4, %eax
-; LINUXOSX64: movb $7, %cl
-; LINUXOSX64: movl $999, %edx
-; LINUXOSX64: retq
-
define x86_regcallcc %struct.complex @test_argMultiRet(float, double, i32, i8, i64) local_unnamed_addr #0 {
+; X32-LABEL: test_argMultiRet:
+; X32: # %bb.0:
+; X32-NEXT: vaddsd __real@4014000000000000, %xmm1, %xmm1
+; X32-NEXT: movl $4, %eax
+; X32-NEXT: movb $7, %cl
+; X32-NEXT: movl $999, %edx # imm = 0x3E7
+; X32-NEXT: xorl %edi, %edi
+; X32-NEXT: retl
+;
+; WIN64-LABEL: test_argMultiRet:
+; WIN64: # %bb.0:
+; WIN64-NEXT: vaddsd __real@{{.*}}(%rip), %xmm1, %xmm1
+; WIN64-NEXT: movl $4, %eax
+; WIN64-NEXT: movb $7, %cl
+; WIN64-NEXT: movl $999, %edx # imm = 0x3E7
+; WIN64-NEXT: retq
+;
+; LINUXOSX64-LABEL: test_argMultiRet:
+; LINUXOSX64: # %bb.0:
+; LINUXOSX64-NEXT: vaddsd {{.*}}(%rip), %xmm1, %xmm1
+; LINUXOSX64-NEXT: movl $4, %eax
+; LINUXOSX64-NEXT: movb $7, %cl
+; LINUXOSX64-NEXT: movl $999, %edx # imm = 0x3E7
+; LINUXOSX64-NEXT: retq
%6 = fadd double %1, 5.000000e+00
%7 = insertvalue %struct.complex undef, float %0, 0
%8 = insertvalue %struct.complex %7, double %6, 1
@@ -625,4 +1260,3 @@ define x86_regcallcc %struct.complex @test_argMultiRet(float, double, i32, i8, i
%11 = insertvalue %struct.complex %10, i64 999, 4
ret %struct.complex %11
}
-
diff --git a/test/CodeGen/X86/avx512-rotate.ll b/test/CodeGen/X86/avx512-rotate.ll
index 98fa67ad793d..203092e88d31 100644
--- a/test/CodeGen/X86/avx512-rotate.ll
+++ b/test/CodeGen/X86/avx512-rotate.ll
@@ -7,11 +7,14 @@ declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16
declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
; Tests showing replacement of variable rotates with immediate splat versions.
define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_rol_v16i32:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprold $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z}
@@ -21,7 +24,7 @@ define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_rol_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprold $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprold $5, %zmm0, %zmm2 {%k1} {z}
@@ -39,7 +42,7 @@ define <16 x i32> @test_splat_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2
define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_rol_v8i64:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprolq $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z}
@@ -49,7 +52,7 @@ define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_rol_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprolq $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprolq $5, %zmm0, %zmm2 {%k1} {z}
@@ -67,7 +70,7 @@ define <8 x i64>@test_splat_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_ror_v16i32:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprord $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z}
@@ -77,7 +80,7 @@ define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_ror_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprord $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprord $5, %zmm0, %zmm2 {%k1} {z}
@@ -95,7 +98,7 @@ define <16 x i32> @test_splat_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2
define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_ror_v8i64:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprorq $5, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z}
@@ -105,7 +108,7 @@ define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_ror_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprorq $5, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprorq $5, %zmm0, %zmm2 {%k1} {z}
@@ -125,7 +128,7 @@ define <8 x i64>@test_splat_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_bounds_rol_v16i32:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprold $1, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z}
@@ -135,7 +138,7 @@ define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1,
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_bounds_rol_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprold $1, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprold $31, %zmm0, %zmm2 {%k1} {z}
@@ -153,7 +156,7 @@ define <16 x i32> @test_splat_bounds_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1,
define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_bounds_rol_v8i64:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprolq $62, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z}
@@ -163,7 +166,7 @@ define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_bounds_rol_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprolq $62, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprolq $1, %zmm0, %zmm2 {%k1} {z}
@@ -181,7 +184,7 @@ define <8 x i64>@test_splat_bounds_rol_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x
define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
; KNL-LABEL: test_splat_bounds_ror_v16i32:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprord $1, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z}
@@ -191,7 +194,7 @@ define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1,
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_bounds_ror_v16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprord $1, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprord $31, %zmm0, %zmm2 {%k1} {z}
@@ -209,7 +212,7 @@ define <16 x i32> @test_splat_bounds_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1,
define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
; KNL-LABEL: test_splat_bounds_ror_v8i64:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vprorq $62, %zmm0, %zmm1 {%k1}
; KNL-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z}
@@ -219,7 +222,7 @@ define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x
; KNL-NEXT: retq
;
; SKX-LABEL: test_splat_bounds_ror_v8i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vprorq $62, %zmm0, %zmm1 {%k1}
; SKX-NEXT: vprorq $1, %zmm0, %zmm2 {%k1} {z}
@@ -236,21 +239,47 @@ define <8 x i64>@test_splat_bounds_ror_v8i64(<8 x i64> %x0, <8 x i64> %x1, i8 %x
}
; Constant folding
+; We also test with a target shuffle so that this can't be constant folded upon creation; it must
+; wait until the target shuffle has been constant folded in combineX86ShufflesRecursively.
define <8 x i64> @test_fold_rol_v8i64() {
; CHECK-LABEL: test_fold_rol_v8i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,2,4,9223372036854775808,2,4611686018427387904,9223372036854775808,9223372036854775808]
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> <i64 0, i64 1, i64 2, i64 63, i64 65, i64 65534, i64 65535, i64 -1>, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
+define <16 x i32> @test_fold_rol_v16i32(<16 x i32> %x0, <16 x i32> %x1) {
+; CHECK-LABEL: test_fold_rol_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %res0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res1
+}
+
define <8 x i64> @test_fold_ror_v8i64() {
; CHECK-LABEL: test_fold_ror_v8i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [1,9223372036854775808,4611686018427387904,2,9223372036854775808,4,2,2]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vprorvq {{.*}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> <i64 0, i64 1, i64 2, i64 63, i64 65, i64 65534, i64 65535, i64 -1>, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
+ %res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %res0, <8 x i64> <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res1
+}
+
+define <16 x i32> @test_fold_ror_v16i32(<16 x i32> %x0, <16 x i32> %x1) {
+; CHECK-LABEL: test_fold_ror_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vprorvd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %res0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res1
}
diff --git a/test/CodeGen/X86/avx512-round.ll b/test/CodeGen/X86/avx512-round.ll
deleted file mode 100644
index b23af2b09a78..000000000000
--- a/test/CodeGen/X86/avx512-round.ll
+++ /dev/null
@@ -1,106 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding| FileCheck %s
-
-define <16 x float> @floor_v16f32(<16 x float> %a) {
-; CHECK-LABEL: floor_v16f32
-; CHECK: vrndscaleps $9, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x09]
- %res = call <16 x float> @llvm.floor.v16f32(<16 x float> %a)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)
-
-define <8 x double> @floor_v8f64(<8 x double> %a) {
-; CHECK-LABEL: floor_v8f64
-; CHECK: vrndscalepd $9, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x09]
- %res = call <8 x double> @llvm.floor.v8f64(<8 x double> %a)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)
-
-define <16 x float> @ceil_v16f32(<16 x float> %a) {
-; CHECK-LABEL: ceil_v16f32
-; CHECK: vrndscaleps $10, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0a]
- %res = call <16 x float> @llvm.ceil.v16f32(<16 x float> %a)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
-
-define <8 x double> @ceil_v8f64(<8 x double> %a) {
-; CHECK-LABEL: ceil_v8f64
-; CHECK: vrndscalepd $10, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0a]
- %res = call <8 x double> @llvm.ceil.v8f64(<8 x double> %a)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
-
-define <16 x float> @trunc_v16f32(<16 x float> %a) {
-; CHECK-LABEL: trunc_v16f32
-; CHECK: vrndscaleps $11, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0b]
- %res = call <16 x float> @llvm.trunc.v16f32(<16 x float> %a)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
-
-define <8 x double> @trunc_v8f64(<8 x double> %a) {
-; CHECK-LABEL: trunc_v8f64
-; CHECK: vrndscalepd $11, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0b]
- %res = call <8 x double> @llvm.trunc.v8f64(<8 x double> %a)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
-
-define <16 x float> @rint_v16f32(<16 x float> %a) {
-; CHECK-LABEL: rint_v16f32
-; CHECK: vrndscaleps $4, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x04]
- %res = call <16 x float> @llvm.rint.v16f32(<16 x float> %a)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)
-
-define <8 x double> @rint_v8f64(<8 x double> %a) {
-; CHECK-LABEL: rint_v8f64
-; CHECK: vrndscalepd $4, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x04]
- %res = call <8 x double> @llvm.rint.v8f64(<8 x double> %a)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)
-
-define <16 x float> @nearbyint_v16f32(<16 x float> %a) {
-; CHECK-LABEL: nearbyint_v16f32
-; CHECK: vrndscaleps $12, {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0c]
- %res = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %a)
- ret <16 x float> %res
-}
-declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
-
-define <8 x double> @nearbyint_v8f64(<8 x double> %a) {
-; CHECK-LABEL: nearbyint_v8f64
-; CHECK: vrndscalepd $12, {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0c]
- %res = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %a)
- ret <8 x double> %res
-}
-declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
-
-define double @nearbyint_f64(double %a) {
-; CHECK-LABEL: nearbyint_f64
-; CHECK: vrndscalesd $12, {{.*}}encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc0,0x0c]
- %res = call double @llvm.nearbyint.f64(double %a)
- ret double %res
-}
-declare double @llvm.nearbyint.f64(double %p)
-
-define float @floor_f32(float %a) {
-; CHECK-LABEL: floor_f32
-; CHECK: vrndscaless $9, {{.*}}encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc0,0x09]
- %res = call float @llvm.floor.f32(float %a)
- ret float %res
-}
-declare float @llvm.floor.f32(float %p)
-
-define float @floor_f32m(float* %aptr) {
-; CHECK-LABEL: floor_f32m
-; CHECK: vrndscaless $9, (%rdi), {{.*}}encoding: [0x62,0xf3,0x7d,0x08,0x0a,0x07,0x09]
- %a = load float, float* %aptr, align 4
- %res = call float @llvm.floor.f32(float %a)
- ret float %res
-}
-
diff --git a/test/CodeGen/X86/avx512-scalarIntrinsics.ll b/test/CodeGen/X86/avx512-scalarIntrinsics.ll
index c26e1fb070fc..0286aabd61a9 100644
--- a/test/CodeGen/X86/avx512-scalarIntrinsics.ll
+++ b/test/CodeGen/X86/avx512-scalarIntrinsics.ll
@@ -1,66 +1,151 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
- ; CHECK-LABEL: test_rsqrt14_ss:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
- ; CHECK-NEXT: retq
+; CHECK-LABEL: test_rsqrt14_ss:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrsqrt14ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
ret <4 x float> %res
}
+
+define <4 x float> @test_rsqrt14_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
+; CHECK-LABEL: test_rsqrt14_ss_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrsqrt14ss (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a1 = load <4 x float>, <4 x float>* %a1ptr
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1) ;
+ ret <4 x float> %res
+}
declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
- ; CHECK-LABEL: test_rcp14_ss:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
- ; CHECK-NEXT: retq
+; CHECK-LABEL: test_rcp14_ss:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
ret <4 x float> %res
}
+
+define <4 x float> @test_rcp14_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
+; CHECK-LABEL: test_rcp14_ss_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrcp14ss (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a1 = load <4 x float>, <4 x float>* %a1ptr
+ %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1) ;
+ ret <4 x float> %res
+}
declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <2 x double> @test_rsqrt14_sd(<2 x double> %a0) {
- ; CHECK-LABEL: test_rsqrt14_sd:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vrsqrt14sd %xmm0, %xmm0, %xmm0
- ; CHECK-NEXT: retq
+; CHECK-LABEL: test_rsqrt14_sd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrsqrt14sd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
ret <2 x double> %res
}
+
+define <2 x double> @test_rsqrt14_sd_load(<2 x double> %a0, <2 x double>* %a1ptr) {
+; CHECK-LABEL: test_rsqrt14_sd_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrsqrt14sd (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a1 = load <2 x double>, <2 x double>* %a1ptr
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1) ;
+ ret <2 x double> %res
+}
declare <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
define <2 x double> @test_rcp14_sd(<2 x double> %a0) {
- ; CHECK-LABEL: test_rcp14_sd:
- ; CHECK: ## BB#0:
- ; CHECK-NEXT: vrcp14sd %xmm0, %xmm0, %xmm0
- ; CHECK-NEXT: retq
+; CHECK-LABEL: test_rcp14_sd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrcp14sd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
ret <2 x double> %res
}
+
+define <2 x double> @test_rcp14_sd_load(<2 x double> %a0, <2 x double>* %a1ptr) {
+; CHECK-LABEL: test_rcp14_sd_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrcp14sd (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %a1 = load <2 x double>, <2 x double>* %a1ptr
+ %res = call <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1) ;
+ ret <2 x double> %res
+}
declare <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
define <4 x float>@test_int_x86_avx512_mask_scalef_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
- ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss:
- ; CHECK: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
- ; CHECK: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
+; SKX-LABEL: test_int_x86_avx512_mask_scalef_ss:
+; SKX: ## %bb.0:
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
+; SKX-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_int_x86_avx512_mask_scalef_ss:
+; KNL: ## %bb.0:
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vscalefss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vscalefss {rn-sae}, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; KNL-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4)
%res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 8)
%res2 = fadd <4 x float> %res, %res1
ret <4 x float> %res2
}
+define <4 x float>@test_int_x86_avx512_mask_scalef_ss_load(<4 x float> %x0, <4 x float>* %x1ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vscalefss (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <4 x float>, <4 x float>* %x1ptr
+ %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
- ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd:
- ; CHECK: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
- ; CHECK: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
+; SKX-LABEL: test_int_x86_avx512_mask_scalef_sd:
+; SKX: ## %bb.0:
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
+; SKX-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_int_x86_avx512_mask_scalef_sd:
+; KNL: ## %bb.0:
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vscalefsd %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT: vscalefsd {rn-sae}, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; KNL-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 8)
%res2 = fadd <2 x double> %res, %res1
ret <2 x double> %res2
}
+
+define <2 x double>@test_int_x86_avx512_mask_scalef_sd_load(<2 x double> %x0, <2 x double>* %x1ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vscalefsd (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %x1 = load <2 x double>, <2 x double>* %x1ptr
+ %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> undef, i8 -1, i32 4)
+ ret <2 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512-scalar_mask.ll b/test/CodeGen/X86/avx512-scalar_mask.ll
index f6ee8ff4c0f6..e0a91575636b 100644
--- a/test/CodeGen/X86/avx512-scalar_mask.ll
+++ b/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -6,7 +6,7 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <
define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
; CHECK-LABEL: test_var_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -16,7 +16,7 @@ define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %
define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
; CHECK-LABEL: test_var_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -27,7 +27,7 @@ define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float>
; FIXME: we should just return %xmm0 here.
define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
@@ -39,7 +39,7 @@ define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
; FIXME: we should zero the lower element of xmm0 and return it.
define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
@@ -51,7 +51,7 @@ define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa
; FIXME: we should just return %xmm0 here.
define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
@@ -63,7 +63,7 @@ define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float
; FIXME: we should zero the lower element of xmm0 and return it.
define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
@@ -74,7 +74,7 @@ define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x floa
define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_allone_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
@@ -83,7 +83,7 @@ define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x
define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_allone_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
@@ -92,7 +92,7 @@ define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4
define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_3_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
@@ -101,7 +101,7 @@ define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x floa
define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
; CHECK-LABEL: test_const_3_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll
new file mode 100755
index 000000000000..1b450b98a6d5
--- /dev/null
+++ b/test/CodeGen/X86/avx512-schedule.ll
@@ -0,0 +1,8864 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
+
+; This test is a collection of AVX512 instructions used to check their scheduling annotations.
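+; Each "sched: [N:M]" annotation emitted by -print-schedule gives the modeled latency N and
+; reciprocal throughput M of that instruction for the CPU selected by the RUN line.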
+
+define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+; GENERIC-LABEL: addpd512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vaddpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: addpd512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vaddpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %add.i = fadd <8 x double> %x, %y
+ ret <8 x double> %add.i
+}
+
+define <8 x double> @addpd512fold(<8 x double> %y) {
+; GENERIC-LABEL: addpd512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: addpd512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
+ ret <8 x double> %add.i
+}
+
+define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+; GENERIC-LABEL: addps512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vaddps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: addps512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vaddps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %add.i = fadd <16 x float> %x, %y
+ ret <16 x float> %add.i
+}
+
+define <16 x float> @addps512fold(<16 x float> %y) {
+; GENERIC-LABEL: addps512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: addps512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
+ ret <16 x float> %add.i
+}
+
+define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+; GENERIC-LABEL: subpd512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vsubpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: subpd512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vsubpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %sub.i = fsub <8 x double> %x, %y
+ ret <8 x double> %sub.i
+}
+
+define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+; GENERIC-LABEL: subpd512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vsubpd (%rdi), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: subpd512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vsubpd (%rdi), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %tmp2 = load <8 x double>, <8 x double>* %x, align 8
+ %sub.i = fsub <8 x double> %y, %tmp2
+ ret <8 x double> %sub.i
+}
+
+define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+; GENERIC-LABEL: subps512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vsubps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: subps512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vsubps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %sub.i = fsub <16 x float> %x, %y
+ ret <16 x float> %sub.i
+}
+
+define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+; GENERIC-LABEL: subps512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vsubps (%rdi), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: subps512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vsubps (%rdi), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %tmp2 = load <16 x float>, <16 x float>* %x, align 4
+ %sub.i = fsub <16 x float> %y, %tmp2
+ ret <16 x float> %sub.i
+}
+
+define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
+; GENERIC-LABEL: imulq512:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: imulq512:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [12:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %z = mul <8 x i64>%x, %y
+ ret <8 x i64>%z
+}
+
+define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
+; GENERIC-LABEL: imulq256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: imulq256:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [12:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %z = mul <4 x i64>%x, %y
+ ret <4 x i64>%z
+}
+
+define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
+; GENERIC-LABEL: imulq128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: imulq128:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %z = mul <2 x i64>%x, %y
+ ret <2 x i64>%z
+}
+
+define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+; GENERIC-LABEL: mulpd512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vmulpd %zmm0, %zmm1, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mulpd512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmulpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %mul.i = fmul <8 x double> %x, %y
+ ret <8 x double> %mul.i
+}
+
+define <8 x double> @mulpd512fold(<8 x double> %y) {
+; GENERIC-LABEL: mulpd512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mulpd512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+ ret <8 x double> %mul.i
+}
+
+define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+; GENERIC-LABEL: mulps512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mulps512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %mul.i = fmul <16 x float> %x, %y
+ ret <16 x float> %mul.i
+}
+
+define <16 x float> @mulps512fold(<16 x float> %y) {
+; GENERIC-LABEL: mulps512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mulps512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
+ ret <16 x float> %mul.i
+}
+
+define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+; GENERIC-LABEL: divpd512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vdivpd %zmm0, %zmm1, %zmm0 # sched: [24:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: divpd512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vdivpd %zmm0, %zmm1, %zmm0 # sched: [23:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %div.i = fdiv <8 x double> %x, %y
+ ret <8 x double> %div.i
+}
+
+define <8 x double> @divpd512fold(<8 x double> %y) {
+; GENERIC-LABEL: divpd512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [28:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: divpd512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [30:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+ ret <8 x double> %div.i
+}
+
+define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+; GENERIC-LABEL: divps512:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [24:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: divps512:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [23:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %div.i = fdiv <16 x float> %x, %y
+ ret <16 x float> %div.i
+}
+
+define <16 x float> @divps512fold(<16 x float> %y) {
+; GENERIC-LABEL: divps512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 # sched: [28:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: divps512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 # sched: [24:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
+ ret <16 x float> %div.i
+}
+
+define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; GENERIC-LABEL: vpaddq_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddq_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = add <8 x i64> %i, %j
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+; GENERIC-LABEL: vpaddq_fold_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddq_fold_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tmp = load <8 x i64>, <8 x i64>* %j, align 4
+ %x = add <8 x i64> %i, %tmp
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+; GENERIC-LABEL: vpaddq_broadcast_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddq_broadcast_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = add <8 x i64> %i, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
+; GENERIC-LABEL: vpaddq_broadcast2_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddq_broadcast2_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tmp = load i64, i64* %j
+ %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
+ %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
+ %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2
+ %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3
+ %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4
+ %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5
+ %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6
+ %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7
+ %x = add <8 x i64> %i, %j.7
+ ret <8 x i64> %x
+}
+
+define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; GENERIC-LABEL: vpaddd_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = add <16 x i32> %i, %j
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+; GENERIC-LABEL: vpaddd_fold_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_fold_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tmp = load <16 x i32>, <16 x i32>* %j, align 4
+ %x = add <16 x i32> %i, %tmp
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+; GENERIC-LABEL: vpaddd_broadcast_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_broadcast_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = add <16 x i32> %i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: vpaddd_mask_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_mask_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: vpaddd_maskz_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_maskz_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: vpaddd_mask_fold_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_mask_fold_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %j = load <16 x i32>, <16 x i32>* %j.ptr
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: vpaddd_mask_broadcast_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_mask_broadcast_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: vpaddd_maskz_fold_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_maskz_fold_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %j = load <16 x i32>, <16 x i32>* %j.ptr
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: vpaddd_maskz_broadcast_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpaddd_maskz_broadcast_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; GENERIC-LABEL: vpsubq_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpsubq_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sub <8 x i64> %i, %j
+ ret <8 x i64> %x
+}
+
+define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; GENERIC-LABEL: vpsubd_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpsubd_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sub <16 x i32> %i, %j
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+; GENERIC-LABEL: vpmulld_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpmulld_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [8:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = mul <16 x i32> %i, %j
+ ret <16 x i32> %x
+}
+
+declare float @sqrtf(float) readnone
+define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: sqrtA:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sqrtA:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %conv1 = tail call float @sqrtf(float %a) nounwind readnone
+ ret float %conv1
+}
+
+declare double @sqrt(double) readnone
+define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: sqrtB:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sqrtB:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %call = tail call double @sqrt(double %a) nounwind readnone
+ ret double %call
+}
+
+declare float @llvm.sqrt.f32(float)
+define float @sqrtC(float %a) nounwind {
+; GENERIC-LABEL: sqrtC:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sqrtC:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = call float @llvm.sqrt.f32(float %a)
+ ret float %b
+}
+
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
+define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+; GENERIC-LABEL: sqrtD:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsqrtps %zmm0, %zmm0 # sched: [14:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sqrtD:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtps %zmm0, %zmm0 # sched: [19:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
+ ret <16 x float> %b
+}
+
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+; GENERIC-LABEL: sqrtE:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vsqrtpd %zmm0, %zmm0 # sched: [14:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sqrtE:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtpd %zmm0, %zmm0 # sched: [31:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
+ ret <8 x double> %b
+}
+
+define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+; GENERIC-LABEL: fadd_broadcast:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fadd_broadcast:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ ret <16 x float> %b
+}
+
+define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+; GENERIC-LABEL: addq_broadcast:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: addq_broadcast:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ ret <8 x i64> %b
+}
+
+define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+; GENERIC-LABEL: orq_broadcast:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: orq_broadcast:
+; SKX: # %bb.0:
+; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ ret <8 x i64> %b
+}
+
+define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+; GENERIC-LABEL: andd512fold:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vandps (%rdi), %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: andd512fold:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %a = load <16 x i32>, <16 x i32>* %x, align 4
+ %b = and <16 x i32> %y, %a
+ ret <16 x i32> %b
+}
+
+define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+; GENERIC-LABEL: andqbrst:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: andqbrst:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %a = load i64, i64* %ap, align 8
+ %b = insertelement <8 x i64> undef, i64 %a, i32 0
+ %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+ %d = and <8 x i64> %p1, %c
+ ret <8 x i64>%d
+}
+
+define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
+; GENERIC-LABEL: test_mask_vaddps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vaddps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ <16 x float> %j, <16 x i32> %mask1)
+ nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = fadd <16 x float> %i, %j
+ %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+ ret <16 x float> %r
+}
+
+define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vmulps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vmulps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = fmul <16 x float> %i, %j
+ %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+ ret <16 x float> %r
+}
+
+define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vminps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vminps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %cmp_res = fcmp olt <16 x float> %i, %j
+ %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
+ %r = select <16 x i1> %mask, <16 x float> %min, <16 x float> %dst
+ ret <16 x float> %r
+}
+
+define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vminpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vminpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %cmp_res = fcmp olt <8 x double> %i, %j
+ %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
+ %r = select <8 x i1> %mask, <8 x double> %min, <8 x double> %dst
+ ret <8 x double> %r
+}
+
+define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vmaxps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vmaxps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %cmp_res = fcmp ogt <16 x float> %i, %j
+ %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
+ %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst
+ ret <16 x float> %r
+}
+
+define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vmaxpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vmaxpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %cmp_res = fcmp ogt <8 x double> %i, %j
+ %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j
+ %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst
+ ret <8 x double> %r
+}
+
+define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vsubps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vsubps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = fsub <16 x float> %i, %j
+ %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+ ret <16 x float> %r
+}
+
+define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vdivps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [24:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vdivps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = fdiv <16 x float> %i, %j
+ %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+ ret <16 x float> %r
+}
+
+define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_mask_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %x = fadd <8 x double> %i, %j
+ %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
+ ret <8 x double> %r
+}
+
+define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone {
+; GENERIC-LABEL: test_maskz_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_maskz_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %x = fadd <8 x double> %i, %j
+ %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+ ret <8 x double> %r
+}
+
+define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind {
+; GENERIC-LABEL: test_mask_fold_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_fold_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %tmp = load <8 x double>, <8 x double>* %j, align 8
+ %x = fadd <8 x double> %i, %tmp
+ %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
+ ret <8 x double> %r
+}
+
+define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind {
+; GENERIC-LABEL: test_maskz_fold_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_maskz_fold_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %tmp = load <8 x double>, <8 x double>* %j, align 8
+ %x = fadd <8 x double> %i, %tmp
+ %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+ ret <8 x double> %r
+}
+
+define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind {
+; GENERIC-LABEL: test_broadcast_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_broadcast_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tmp = load double, double* %j
+ %b = insertelement <8 x double> undef, double %tmp, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef,
+ <8 x i32> zeroinitializer
+ %x = fadd <8 x double> %c, %i
+ ret <8 x double> %x
+}
+
+define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind {
+; GENERIC-LABEL: test_mask_broadcast_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mask_broadcast_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %tmp = load double, double* %j
+ %b = insertelement <8 x double> undef, double %tmp, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef,
+ <8 x i32> zeroinitializer
+ %x = fadd <8 x double> %c, %i
+ %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i
+ ret <8 x double> %r
+}
+
+define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
+; GENERIC-LABEL: test_maskz_broadcast_vaddpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_maskz_broadcast_vaddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ <8 x i64> %mask1) nounwind {
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %tmp = load double, double* %j
+ %b = insertelement <8 x double> undef, double %tmp, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef,
+ <8 x i32> zeroinitializer
+ %x = fadd <8 x double> %c, %i
+ %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+ ret <8 x double> %r
+}
+
+define <16 x float> @test_fxor(<16 x float> %a) {
+; GENERIC-LABEL: test_fxor:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_fxor:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+
+ %res = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+ ret <16 x float>%res
+}
+
+define <8 x float> @test_fxor_8f32(<8 x float> %a) {
+; GENERIC-LABEL: test_fxor_8f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_fxor_8f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+ ret <8 x float>%res
+}
+
+define <8 x double> @fabs_v8f64(<8 x double> %p)
+; GENERIC-LABEL: fabs_v8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fabs_v8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+{
+ %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
+ ret <8 x double> %t
+}
+declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
+
+define <16 x float> @fabs_v16f32(<16 x float> %p)
+; GENERIC-LABEL: fabs_v16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fabs_v16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+{
+ %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
+ ret <16 x float> %t
+}
+declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
+
+define double @test1(double %a, double %b) nounwind {
+; GENERIC-LABEL: test1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: jne .LBB64_1 # sched: [1:1.00]
+; GENERIC-NEXT: jnp .LBB64_2 # sched: [1:1.00]
+; GENERIC-NEXT: .LBB64_1: # %l1
+; GENERIC-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB64_2: # %l2
+; GENERIC-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test1:
+; SKX: # %bb.0:
+; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: jne .LBB64_1 # sched: [1:0.50]
+; SKX-NEXT: jnp .LBB64_2 # sched: [1:0.50]
+; SKX-NEXT: .LBB64_1: # %l1
+; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB64_2: # %l2
+; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tobool = fcmp une double %a, %b
+ br i1 %tobool, label %l1, label %l2
+
+l1:
+ %c = fsub double %a, %b
+ ret double %c
+l2:
+ %c1 = fadd double %a, %b
+ ret double %c1
+}
+
+define float @test2(float %a, float %b) nounwind {
+; GENERIC-LABEL: test2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00]
+; GENERIC-NEXT: jbe .LBB65_2 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.1: # %l1
+; GENERIC-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB65_2: # %l2
+; GENERIC-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2:
+; SKX: # %bb.0:
+; SKX-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00]
+; SKX-NEXT: jbe .LBB65_2 # sched: [1:0.50]
+; SKX-NEXT: # %bb.1: # %l1
+; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB65_2: # %l2
+; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tobool = fcmp olt float %a, %b
+ br i1 %tobool, label %l1, label %l2
+
+l1:
+ %c = fsub float %a, %b
+ ret float %c
+l2:
+ %c1 = fadd float %a, %b
+ ret float %c1
+}
+
+define i32 @test3(float %a, float %b) {
+; GENERIC-LABEL: test3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test3:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+
+ %cmp10.i = fcmp oeq float %a, %b
+ %conv11.i = zext i1 %cmp10.i to i32
+ ret i32 %conv11.i
+}
+
+define float @test5(float %p) #0 {
+; GENERIC-LABEL: test5:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: jne .LBB67_1 # sched: [1:1.00]
+; GENERIC-NEXT: jp .LBB67_1 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.2: # %return
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB67_1: # %if.end
+; GENERIC-NEXT: seta %al # sched: [2:1.00]
+; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test5:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: jne .LBB67_1 # sched: [1:0.50]
+; SKX-NEXT: jp .LBB67_1 # sched: [1:0.50]
+; SKX-NEXT: # %bb.2: # %return
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB67_1: # %if.end
+; SKX-NEXT: seta %al # sched: [2:1.00]
+; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25]
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %cmp = fcmp oeq float %p, 0.000000e+00
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %cmp1 = fcmp ogt float %p, 0.000000e+00
+ %cond = select i1 %cmp1, float 1.000000e+00, float -1.000000e+00
+ br label %return
+
+return: ; preds = %if.end, %entry
+ %retval.0 = phi float [ %cond, %if.end ], [ %p, %entry ]
+ ret float %retval.0
+}
+
+define i32 @test6(i32 %a, i32 %b) {
+; GENERIC-LABEL: test6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: sete %al # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test6:
+; SKX: # %bb.0:
+; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: sete %al # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmp = icmp eq i32 %a, %b
+ %res = zext i1 %cmp to i32
+ ret i32 %res
+}
+
+define i32 @test7(double %x, double %y) #2 {
+; GENERIC-LABEL: test7:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: setne %al # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test7:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: setne %al # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = fcmp one double %x, %y
+ %or = zext i1 %0 to i32
+ ret i32 %or
+}
+
+define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
+; GENERIC-LABEL: test8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorl $-2147483648, %esi # imm = 0x80000000
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testl %edx, %edx # sched: [1:0.33]
+; GENERIC-NEXT: movl $1, %eax # sched: [1:0.33]
+; GENERIC-NEXT: cmovel %eax, %edx # sched: [2:0.67]
+; GENERIC-NEXT: notl %edi # sched: [1:0.33]
+; GENERIC-NEXT: orl %edi, %esi # sched: [1:0.33]
+; GENERIC-NEXT: cmovnel %edx, %eax # sched: [2:0.67]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test8:
+; SKX: # %bb.0:
+; SKX-NEXT: notl %edi # sched: [1:0.25]
+; SKX-NEXT: xorl $-2147483648, %esi # imm = 0x80000000
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testl %edx, %edx # sched: [1:0.25]
+; SKX-NEXT: movl $1, %eax # sched: [1:0.25]
+; SKX-NEXT: cmovel %eax, %edx # sched: [1:0.50]
+; SKX-NEXT: orl %edi, %esi # sched: [1:0.25]
+; SKX-NEXT: cmovnel %edx, %eax # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %tmp1 = icmp eq i32 %a1, -1
+ %tmp2 = icmp eq i32 %a2, -2147483648
+ %tmp3 = and i1 %tmp1, %tmp2
+ %tmp4 = icmp eq i32 %a3, 0
+ %tmp5 = or i1 %tmp3, %tmp4
+ %res = select i1 %tmp5, i32 1, i32 %a3
+ ret i32 %res
+}
+
+define i32 @test9(i64 %a) {
+; GENERIC-LABEL: test9:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: testb $1, %dil # sched: [1:0.33]
+; GENERIC-NEXT: jne .LBB71_2 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.1: # %A
+; GENERIC-NEXT: movl $6, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB71_2: # %B
+; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test9:
+; SKX: # %bb.0:
+; SKX-NEXT: testb $1, %dil # sched: [1:0.25]
+; SKX-NEXT: jne .LBB71_2 # sched: [1:0.50]
+; SKX-NEXT: # %bb.1: # %A
+; SKX-NEXT: movl $6, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB71_2: # %B
+; SKX-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = and i64 %a, 1
+ %cmp10.i = icmp eq i64 %b, 0
+ br i1 %cmp10.i, label %A, label %B
+A:
+ ret i32 6
+B:
+ ret i32 7
+}
+
+define i32 @test10(i64 %b, i64 %c, i1 %d) {
+; GENERIC-LABEL: test10:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl %edx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: andb $1, %al # sched: [1:0.33]
+; GENERIC-NEXT: cmpq %rsi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: sete %cl # sched: [1:0.50]
+; GENERIC-NEXT: orb %dl, %cl # sched: [1:0.33]
+; GENERIC-NEXT: andb $1, %cl # sched: [1:0.33]
+; GENERIC-NEXT: cmpb %cl, %al # sched: [1:0.33]
+; GENERIC-NEXT: je .LBB72_1 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.2: # %if.end.i
+; GENERIC-NEXT: movl $6, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB72_1: # %if.then.i
+; GENERIC-NEXT: movl $5, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test10:
+; SKX: # %bb.0:
+; SKX-NEXT: movl %edx, %eax # sched: [1:0.25]
+; SKX-NEXT: andb $1, %al # sched: [1:0.25]
+; SKX-NEXT: cmpq %rsi, %rdi # sched: [1:0.25]
+; SKX-NEXT: sete %cl # sched: [1:0.50]
+; SKX-NEXT: orb %dl, %cl # sched: [1:0.25]
+; SKX-NEXT: andb $1, %cl # sched: [1:0.25]
+; SKX-NEXT: cmpb %cl, %al # sched: [1:0.25]
+; SKX-NEXT: je .LBB72_1 # sched: [1:0.50]
+; SKX-NEXT: # %bb.2: # %if.end.i
+; SKX-NEXT: movl $6, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB72_1: # %if.then.i
+; SKX-NEXT: movl $5, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+
+ %cmp8.i = icmp eq i64 %b, %c
+ %or1 = or i1 %d, %cmp8.i
+ %xor1 = xor i1 %d, %or1
+ br i1 %xor1, label %if.end.i, label %if.then.i
+
+if.then.i:
+ ret i32 5
+
+if.end.i:
+ ret i32 6
+}
+
+define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+; GENERIC-LABEL: sitof32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sitof32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i32> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @sltof864(<8 x i64> %a) {
+; GENERIC-LABEL: sltof864:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sltof864:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <8 x i64> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <4 x double> @slto4f64(<4 x i64> %a) {
+; GENERIC-LABEL: slto4f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2pd %ymm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto4f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <4 x i64> %a to <4 x double>
+ ret <4 x double> %b
+}
+
+define <2 x double> @slto2f64(<2 x i64> %a) {
+; GENERIC-LABEL: slto2f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto2f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <2 x i64> %a to <2 x double>
+ ret <2 x double> %b
+}
+
+define <2 x float> @sltof2f32(<2 x i64> %a) {
+; GENERIC-LABEL: sltof2f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2ps %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sltof2f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2ps %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <2 x i64> %a to <2 x float>
+ ret <2 x float>%b
+}
+
+define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
+; GENERIC-LABEL: slto4f32_mem:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2psy (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto4f32_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = load <4 x i64>, <4 x i64>* %a, align 8
+ %b = sitofp <4 x i64> %a1 to <4 x float>
+ ret <4 x float>%b
+}
+
+define <4 x i64> @f64to4sl(<4 x double> %a) {
+; GENERIC-LABEL: f64to4sl:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2qq %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to4sl:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptosi <4 x double> %a to <4 x i64>
+ ret <4 x i64> %b
+}
+
+define <4 x i64> @f32to4sl(<4 x float> %a) {
+; GENERIC-LABEL: f32to4sl:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2qq %xmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to4sl:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2qq %xmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptosi <4 x float> %a to <4 x i64>
+ ret <4 x i64> %b
+}
+
+define <4 x float> @slto4f32(<4 x i64> %a) {
+; GENERIC-LABEL: slto4f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2ps %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto4f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <4 x i64> %a to <4 x float>
+ ret <4 x float> %b
+}
+
+define <4 x float> @ulto4f32(<4 x i64> %a) {
+; GENERIC-LABEL: ulto4f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtuqq2ps %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ulto4f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <4 x i64> %a to <4 x float>
+ ret <4 x float> %b
+}
+
+define <8 x double> @ulto8f64(<8 x i64> %a) {
+; GENERIC-LABEL: ulto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ulto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <8 x i64> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @ulto16f64(<16 x i64> %a) {
+; GENERIC-LABEL: ulto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtuqq2pd %zmm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ulto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vcvtuqq2pd %zmm1, %zmm1 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i64> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
+; GENERIC-LABEL: f64to16si:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to16si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptosi <16 x float> %a to <16 x i32>
+ ret <16 x i32> %b
+}
+
+define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
+; GENERIC-LABEL: f32to16ui:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to16ui:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptoui <16 x float> %a to <16 x i32>
+ ret <16 x i32> %b
+}
+
+define <16 x i8> @f32to16uc(<16 x float> %f) {
+; GENERIC-LABEL: f32to16uc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovdb %zmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to16uc:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmovdb %zmm0, %xmm0 # sched: [4:2.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = fptoui <16 x float> %f to <16 x i8>
+ ret <16 x i8> %res
+}
+
+define <16 x i16> @f32to16us(<16 x float> %f) {
+; GENERIC-LABEL: f32to16us:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovdw %zmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to16us:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmovdw %zmm0, %ymm0 # sched: [4:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = fptoui <16 x float> %f to <16 x i16>
+ ret <16 x i16> %res
+}
+
+define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
+; GENERIC-LABEL: f32to8ui:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2udq %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to8ui:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2udq %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptoui <8 x float> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
+; GENERIC-LABEL: f32to4ui:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttps2udq %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to4ui:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2udq %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptoui <4 x float> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
+define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
+; GENERIC-LABEL: f64to8ui:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2udq %zmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to8ui:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2udq %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptoui <8 x double> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <8 x i16> @f64to8us(<8 x double> %f) {
+; GENERIC-LABEL: f64to8us:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to8us:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = fptoui <8 x double> %f to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i8> @f64to8uc(<8 x double> %f) {
+; GENERIC-LABEL: f64to8uc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to8uc:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = fptoui <8 x double> %f to <8 x i8>
+ ret <8 x i8> %res
+}
+
+define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
+; GENERIC-LABEL: f64to4ui:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2udq %ymm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to4ui:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptoui <4 x double> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
+define <8 x double> @sito8f64(<8 x i32> %a) {
+; GENERIC-LABEL: sito8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sito8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <8 x i32> %a to <8 x double>
+ ret <8 x double> %b
+}
+define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; GENERIC-LABEL: i32to8f64_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: i32to8f64_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+; VLNOBW-LABEL: i32to8f64_mask:
+; VLNOBW: # %bb.0:
+; VLNOBW-NEXT: kmovw %edi, %k1
+; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
+; VLNOBW-NEXT: ret{{[l|q]}}
+ %1 = bitcast i8 %c to <8 x i1>
+ %2 = sitofp <8 x i32> %b to <8 x double>
+ %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
+ ret <8 x double> %3
+}
+define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; GENERIC-LABEL: sito8f64_maskz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sito8f64_maskz:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+; VLNOBW-LABEL: sito8f64_maskz:
+; VLNOBW: # %bb.0:
+; VLNOBW-NEXT: kmovw %edi, %k1
+; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; VLNOBW-NEXT: ret{{[l|q]}}
+ %1 = bitcast i8 %b to <8 x i1>
+ %2 = sitofp <8 x i32> %a to <8 x double>
+ %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
+define <8 x i32> @f64to8si(<8 x double> %a) {
+; GENERIC-LABEL: f64to8si:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to8si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptosi <8 x double> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+define <4 x i32> @f64to4si(<4 x double> %a) {
+; GENERIC-LABEL: f64to4si:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to4si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptosi <4 x double> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
+define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
+; GENERIC-LABEL: f64to16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vcvtpd2ps %zmm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1 # sched: [7:1.00]
+; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fptrunc <16 x double> %b to <16 x float>
+ ret <16 x float> %a
+}
+
+define <4 x float> @f64to4f32(<4 x double> %b) {
+; GENERIC-LABEL: f64to4f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to4f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fptrunc <4 x double> %b to <4 x float>
+ ret <4 x float> %a
+}
+
+define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
+; GENERIC-LABEL: f64to4f32_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64to4f32_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [7:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fptrunc <4 x double> %b to <4 x float>
+ %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
+ ret <4 x float> %c
+}
+
+define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; GENERIC-LABEL: f64tof32_inreg:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64tof32_inreg:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ext = extractelement <2 x double> %a0, i32 0
+ %cvt = fptrunc double %ext to float
+ %res = insertelement <4 x float> %a1, float %cvt, i32 0
+ ret <4 x float> %res
+}
+
+define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
+; GENERIC-LABEL: f32to8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fpext <8 x float> %b to <8 x double>
+ ret <8 x double> %a
+}
+
+define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
+; GENERIC-LABEL: f32to4f64_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcmpltpd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32to4f64_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fpext <4 x float> %b to <4 x double>
+ %mask = fcmp ogt <4 x double> %a1, %b1
+ %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
+ ret <4 x double> %c
+}
+
+define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; GENERIC-LABEL: f32tof64_inreg:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32tof64_inreg:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ext = extractelement <4 x float> %a1, i32 0
+ %cvt = fpext float %ext to double
+ %res = insertelement <2 x double> %a0, double %cvt, i32 0
+ ret <2 x double> %res
+}
+
+define double @sltof64_load(i64* nocapture %e) {
+; GENERIC-LABEL: sltof64_load:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sltof64_load:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %tmp1 = load i64, i64* %e, align 8
+ %conv = sitofp i64 %tmp1 to double
+ ret double %conv
+}
+
+define double @sitof64_load(i32* %e) {
+; GENERIC-LABEL: sitof64_load:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sitof64_load:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %tmp1 = load i32, i32* %e, align 4
+ %conv = sitofp i32 %tmp1 to double
+ ret double %conv
+}
+
+define float @sitof32_load(i32* %e) {
+; GENERIC-LABEL: sitof32_load:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sitof32_load:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %tmp1 = load i32, i32* %e, align 4
+ %conv = sitofp i32 %tmp1 to float
+ ret float %conv
+}
+
+define float @sltof32_load(i64* %e) {
+; GENERIC-LABEL: sltof32_load:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sltof32_load:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %tmp1 = load i64, i64* %e, align 8
+ %conv = sitofp i64 %tmp1 to float
+ ret float %conv
+}
+
+define void @f32tof64_loadstore() {
+; GENERIC-LABEL: f32tof64_loadstore:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f32tof64_loadstore:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %f = alloca float, align 4
+ %d = alloca double, align 8
+ %tmp = load float, float* %f, align 4
+ %conv = fpext float %tmp to double
+ store double %conv, double* %d, align 8
+ ret void
+}
+
+define void @f64tof32_loadstore() nounwind uwtable {
+; GENERIC-LABEL: f64tof32_loadstore:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; GENERIC-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: f64tof32_loadstore:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %f = alloca float, align 4
+ %d = alloca double, align 8
+ %tmp = load double, double* %d, align 8
+ %conv = fptrunc double %tmp to float
+ store float %conv, float* %f, align 4
+ ret void
+}
+
+define double @long_to_double(i64 %x) {
+; GENERIC-LABEL: long_to_double:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: long_to_double:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = bitcast i64 %x to double
+ ret double %res
+}
+
+define i64 @double_to_long(double %x) {
+; GENERIC-LABEL: double_to_long:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovq %xmm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: double_to_long:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq %xmm0, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = bitcast double %x to i64
+ ret i64 %res
+}
+
+define float @int_to_float(i32 %x) {
+; GENERIC-LABEL: int_to_float:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: int_to_float:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = bitcast i32 %x to float
+ ret float %res
+}
+
+define i32 @float_to_int(float %x) {
+; GENERIC-LABEL: float_to_int:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: float_to_int:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %xmm0, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = bitcast float %x to i32
+ ret i32 %res
+}
+
+define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
+; GENERIC-LABEL: uito16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [4:1.00]
+; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [7:1.00]
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i32> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <8 x float> @slto8f32(<8 x i64> %a) {
+; GENERIC-LABEL: slto8f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto8f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <8 x i64> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+define <16 x float> @slto16f32(<16 x i64> %a) {
+; GENERIC-LABEL: slto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtqq2ps %zmm1, %ymm1 # sched: [4:1.00]
+; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: vcvtqq2ps %zmm1, %ymm1 # sched: [7:1.00]
+; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i64> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @slto8f64(<8 x i64> %a) {
+; GENERIC-LABEL: slto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <8 x i64> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @slto16f64(<16 x i64> %a) {
+; GENERIC-LABEL: slto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtqq2pd %zmm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: slto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vcvtqq2pd %zmm1, %zmm1 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i64> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <8 x float> @ulto8f32(<8 x i64> %a) {
+; GENERIC-LABEL: ulto8f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ulto8f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <8 x i64> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+define <16 x float> @ulto16f32(<16 x i64> %a) {
+; GENERIC-LABEL: ulto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: vcvtuqq2ps %zmm1, %ymm1 # sched: [4:1.00]
+; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ulto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: vcvtuqq2ps %zmm1, %ymm1 # sched: [7:1.00]
+; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i64> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; GENERIC-LABEL: uito8f64_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito8f64_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+; VLNOBW-LABEL: uito8f64_mask:
+; VLNOBW: # %bb.0:
+; VLNOBW-NEXT: kmovw %edi, %k1
+; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
+; VLNOBW-NEXT: ret{{[l|q]}}
+ %1 = bitcast i8 %c to <8 x i1>
+ %2 = uitofp <8 x i32> %b to <8 x double>
+ %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
+ ret <8 x double> %3
+}
+define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; GENERIC-LABEL: uito8f64_maskz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito8f64_maskz:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = bitcast i8 %b to <8 x i1>
+ %2 = uitofp <8 x i32> %a to <8 x double>
+ %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %3
+}
+
+define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
+; GENERIC-LABEL: uito4f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito4f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <4 x i32> %a to <4 x double>
+ ret <4 x double> %b
+}
+
+define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
+; GENERIC-LABEL: uito16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i32> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @uito8f64(<8 x i32> %a) {
+; GENERIC-LABEL: uito8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <8 x i32> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
+; GENERIC-LABEL: uito8f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito8f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <8 x i32> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
+; GENERIC-LABEL: uito4f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uito4f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
+
+define i32 @fptosi(float %a) nounwind {
+; GENERIC-LABEL: fptosi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttss2si %xmm0, %eax # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fptosi:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttss2si %xmm0, %eax # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptosi float %a to i32
+ ret i32 %b
+}
+
+define i32 @fptoui(float %a) nounwind {
+; GENERIC-LABEL: fptoui:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvttss2usi %xmm0, %eax # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fptoui:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttss2usi %xmm0, %eax # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptoui float %a to i32
+ ret i32 %b
+}
+
+define float @uitof32(i32 %a) nounwind {
+; GENERIC-LABEL: uitof32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uitof32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp i32 %a to float
+ ret float %b
+}
+
+define double @uitof64(i32 %a) nounwind {
+; GENERIC-LABEL: uitof64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uitof64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp i32 %a to double
+ ret double %b
+}
+
+define <16 x float> @sbto16f32(<16 x i32> %a) {
+; GENERIC-LABEL: sbto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <16 x i32> %a, zeroinitializer
+ %1 = sitofp <16 x i1> %mask to <16 x float>
+ ret <16 x float> %1
+}
+
+define <16 x float> @scto16f32(<16 x i8> %a) {
+; GENERIC-LABEL: scto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: scto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = sitofp <16 x i8> %a to <16 x float>
+ ret <16 x float> %1
+}
+
+define <16 x float> @ssto16f32(<16 x i16> %a) {
+; GENERIC-LABEL: ssto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ssto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = sitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %1
+}
+
+define <8 x double> @ssto16f64(<8 x i16> %a) {
+; GENERIC-LABEL: ssto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ssto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = sitofp <8 x i16> %a to <8 x double>
+ ret <8 x double> %1
+}
+
+define <8 x double> @scto8f64(<8 x i8> %a) {
+; GENERIC-LABEL: scto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: scto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = sitofp <8 x i8> %a to <8 x double>
+ ret <8 x double> %1
+}
+
+define <16 x double> @scto16f64(<16 x i8> %a) {
+; GENERIC-LABEL: scto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: scto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i8> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x double> @sbto16f64(<16 x double> %a) {
+; GENERIC-LABEL: sbto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k1, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00]
+; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm1 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
+ %1 = sitofp <16 x i1> %cmpres to <16 x double>
+ ret <16 x double> %1
+}
+
+define <8 x double> @sbto8f64(<8 x double> %a) {
+; GENERIC-LABEL: sbto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
+ %1 = sitofp <8 x i1> %cmpres to <8 x double>
+ ret <8 x double> %1
+}
+
+define <8 x float> @sbto8f32(<8 x float> %a) {
+; GENERIC-LABEL: sbto8f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto8f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
+ %1 = sitofp <8 x i1> %cmpres to <8 x float>
+ ret <8 x float> %1
+}
+
+define <4 x float> @sbto4f32(<4 x float> %a) {
+; GENERIC-LABEL: sbto4f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto4f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
+ %1 = sitofp <4 x i1> %cmpres to <4 x float>
+ ret <4 x float> %1
+}
+
+define <4 x double> @sbto4f64(<4 x double> %a) {
+; GENERIC-LABEL: sbto4f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltpd %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto4f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
+ %1 = sitofp <4 x i1> %cmpres to <4 x double>
+ ret <4 x double> %1
+}
+
+define <2 x float> @sbto2f32(<2 x float> %a) {
+; GENERIC-LABEL: sbto2f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto2f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
+ %1 = sitofp <2 x i1> %cmpres to <2 x float>
+ ret <2 x float> %1
+}
+
+define <2 x double> @sbto2f64(<2 x double> %a) {
+; GENERIC-LABEL: sbto2f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sbto2f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
+ %1 = sitofp <2 x i1> %cmpres to <2 x double>
+ ret <2 x double> %1
+}
+
+define <16 x float> @ucto16f32(<16 x i8> %a) {
+; GENERIC-LABEL: ucto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ucto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @ucto8f64(<8 x i8> %a) {
+; GENERIC-LABEL: ucto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ucto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <8 x i8> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x float> @swto16f32(<16 x i16> %a) {
+; GENERIC-LABEL: swto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: swto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @swto8f64(<8 x i16> %a) {
+; GENERIC-LABEL: swto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: swto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <8 x i16> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @swto16f64(<16 x i16> %a) {
+; GENERIC-LABEL: swto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: swto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i16> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x double> @ucto16f64(<16 x i8> %a) {
+; GENERIC-LABEL: ucto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ucto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i8> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x float> @uwto16f32(<16 x i16> %a) {
+; GENERIC-LABEL: uwto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uwto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <8 x double> @uwto8f64(<8 x i16> %a) {
+; GENERIC-LABEL: uwto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uwto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <8 x i16> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x double> @uwto16f64(<16 x i16> %a) {
+; GENERIC-LABEL: uwto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: uwto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i16> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x float> @sito16f32(<16 x i32> %a) {
+; GENERIC-LABEL: sito16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sito16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i32> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <16 x double> @sito16f64(<16 x i32> %a) {
+; GENERIC-LABEL: sito16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [4:1.00]
+; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sito16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [7:1.00]
+; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = sitofp <16 x i32> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+define <16 x float> @usto16f32(<16 x i16> %a) {
+; GENERIC-LABEL: usto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: usto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = uitofp <16 x i16> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+define <16 x float> @ubto16f32(<16 x i32> %a) {
+; GENERIC-LABEL: ubto16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <16 x i32> %a, zeroinitializer
+ %1 = uitofp <16 x i1> %mask to <16 x float>
+ ret <16 x float> %1
+}
+
+define <16 x double> @ubto16f64(<16 x i32> %a) {
+; GENERIC-LABEL: ubto16f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50]
+; GENERIC-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: kshiftrw $8, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto16f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50]
+; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: kshiftrw $8, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <16 x i32> %a, zeroinitializer
+ %1 = uitofp <16 x i1> %mask to <16 x double>
+ ret <16 x double> %1
+}
+
+define <8 x float> @ubto8f32(<8 x i32> %a) {
+; GENERIC-LABEL: ubto8f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto8f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <8 x i32> %a, zeroinitializer
+ %1 = uitofp <8 x i1> %mask to <8 x float>
+ ret <8 x float> %1
+}
+
+define <8 x double> @ubto8f64(<8 x i32> %a) {
+; GENERIC-LABEL: ubto8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <8 x i32> %a, zeroinitializer
+ %1 = uitofp <8 x i1> %mask to <8 x double>
+ ret <8 x double> %1
+}
+
+define <4 x float> @ubto4f32(<4 x i32> %a) {
+; GENERIC-LABEL: ubto4f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto4f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <4 x i32> %a, zeroinitializer
+ %1 = uitofp <4 x i1> %mask to <4 x float>
+ ret <4 x float> %1
+}
+
+define <4 x double> @ubto4f64(<4 x i32> %a) {
+; GENERIC-LABEL: ubto4f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto4f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp slt <4 x i32> %a, zeroinitializer
+ %1 = uitofp <4 x i1> %mask to <4 x double>
+ ret <4 x double> %1
+}
+
+define <2 x float> @ubto2f32(<2 x i32> %a) {
+; GENERIC-LABEL: ubto2f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
+; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto2f32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33]
+; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ult <2 x i32> %a, zeroinitializer
+ %1 = uitofp <2 x i1> %mask to <2 x float>
+ ret <2 x float> %1
+}
+
+define <2 x double> @ubto2f64(<2 x i32> %a) {
+; GENERIC-LABEL: ubto2f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
+; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ubto2f64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33]
+; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ult <2 x i32> %a, zeroinitializer
+ %1 = uitofp <2 x i1> %mask to <2 x double>
+ ret <2 x double> %1
+}
+
+define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x8mem_to_8x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x8mem_to_8x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = zext <8 x i8> %a to <8 x i16>
+ %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+ ret <8 x i16> %ret
+}
+
+define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_8x8mem_to_8x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x8mem_to_8x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = sext <8 x i8> %a to <8 x i16>
+ %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+ ret <8 x i16> %ret
+}
+
+
+define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_16x8mem_to_16x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x8mem_to_16x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i8>,<16 x i8> *%i,align 1
+ %x = zext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+ ret <16 x i16> %ret
+}
+
+define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_16x8mem_to_16x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x8mem_to_16x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i8>,<16 x i8> *%i,align 1
+ %x = sext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+ ret <16 x i16> %ret
+}
+
+define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
+; GENERIC-LABEL: zext_16x8_to_16x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x8_to_16x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <16 x i8> %a to <16 x i16>
+ ret <16 x i16> %x
+}
+
+define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_16x8_to_16x16_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x8_to_16x16_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+ ret <16 x i16> %ret
+}
+
+define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
+; GENERIC-LABEL: sext_16x8_to_16x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x8_to_16x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <16 x i8> %a to <16 x i16>
+ ret <16 x i16> %x
+}
+
+define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_16x8_to_16x16_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x8_to_16x16_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <16 x i8> %a to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+ ret <16 x i16> %ret
+}
+
+define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_32x8mem_to_32x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32x8mem_to_32x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <32 x i8>,<32 x i8> *%i,align 1
+ %x = zext <32 x i8> %a to <32 x i16>
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_32x8mem_to_32x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_32x8mem_to_32x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <32 x i8>,<32 x i8> *%i,align 1
+ %x = sext <32 x i8> %a to <32 x i16>
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
+; GENERIC-LABEL: zext_32x8_to_32x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32x8_to_32x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <32 x i8> %a to <32 x i16>
+ ret <32 x i16> %x
+}
+
+define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_32x8_to_32x16_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32x8_to_32x16_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <32 x i8> %a to <32 x i16>
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
+; GENERIC-LABEL: sext_32x8_to_32x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_32x8_to_32x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <32 x i8> %a to <32 x i16>
+ ret <32 x i16> %x
+}
+
+define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_32x8_to_32x16_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_32x8_to_32x16_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <32 x i8> %a to <32 x i16>
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_4x8mem_to_4x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4x8mem_to_4x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i8>,<4 x i8> *%i,align 1
+ %x = zext <4 x i8> %a to <4 x i32>
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_4x8mem_to_4x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x8mem_to_4x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i8>,<4 x i8> *%i,align 1
+ %x = sext <4 x i8> %a to <4 x i32>
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x8mem_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x8mem_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = zext <8 x i8> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ ret <8 x i32> %ret
+}
+
+define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_8x8mem_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x8mem_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = sext <8 x i8> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ ret <8 x i32> %ret
+}
+
+define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_16x8mem_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x8mem_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i8>,<16 x i8> *%i,align 1
+ %x = zext <16 x i8> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_16x8mem_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x8mem_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i8>,<16 x i8> *%i,align 1
+ %x = sext <16 x i8> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_16x8_to_16x32_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x8_to_16x32_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <16 x i8> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_16x8_to_16x32_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x8_to_16x32_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <16 x i8> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+; GENERIC-LABEL: zext_16x8_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x8_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <16 x i8> %i to <16 x i32>
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+; GENERIC-LABEL: sext_16x8_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x8_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <16 x i8> %i to <16 x i32>
+ ret <16 x i32> %x
+}
+
+define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_2x8mem_to_2x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_2x8mem_to_2x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i8>,<2 x i8> *%i,align 1
+ %x = zext <2 x i8> %a to <2 x i64>
+ %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+ ret <2 x i64> %ret
+}
+define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_2x8mem_to_2x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_2x8mem_to_2x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i8>,<2 x i8> *%i,align 1
+ %x = sext <2 x i8> %a to <2 x i64>
+ %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+ ret <2 x i64> %ret
+}
+define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_2x8mem_to_2x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_2x8mem_to_2x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i8>,<2 x i8> *%i,align 1
+ %x = sext <2 x i8> %a to <2 x i64>
+ ret <2 x i64> %x
+}
+
+define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_4x8mem_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4x8mem_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i8>,<4 x i8> *%i,align 1
+ %x = zext <4 x i8> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_4x8mem_to_4x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x8mem_to_4x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i8>,<4 x i8> *%i,align 1
+ %x = sext <4 x i8> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_4x8mem_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x8mem_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i8>,<4 x i8> *%i,align 1
+ %x = sext <4 x i8> %a to <4 x i64>
+ ret <4 x i64> %x
+}
+
+define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x8mem_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x8mem_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = zext <8 x i8> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_8x8mem_to_8x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x8mem_to_8x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = sext <8 x i8> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_8x8mem_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x8mem_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i8>,<8 x i8> *%i,align 1
+ %x = sext <8 x i8> %a to <8 x i64>
+ ret <8 x i64> %x
+}
+
+define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_4x16mem_to_4x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4x16mem_to_4x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i16>,<4 x i16> *%i,align 1
+ %x = zext <4 x i16> %a to <4 x i32>
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_4x16mem_to_4x32mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x16mem_to_4x32mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i16>,<4 x i16> *%i,align 1
+ %x = sext <4 x i16> %a to <4 x i32>
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+}
+
+define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_4x16mem_to_4x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x16mem_to_4x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i16>,<4 x i16> *%i,align 1
+ %x = sext <4 x i16> %a to <4 x i32>
+ ret <4 x i32> %x
+}
+
+
+define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x16mem_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x16mem_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i16>,<8 x i16> *%i,align 1
+ %x = zext <8 x i16> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ ret <8 x i32> %ret
+}
+
+define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_8x16mem_to_8x32mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x16mem_to_8x32mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i16>,<8 x i16> *%i,align 1
+ %x = sext <8 x i16> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ ret <8 x i32> %ret
+}
+
+define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_8x16mem_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x16mem_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i16>,<8 x i16> *%i,align 1
+ %x = sext <8 x i16> %a to <8 x i32>
+ ret <8 x i32> %x
+}
+
+define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x16_to_8x32mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x16_to_8x32mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <8 x i16> %a to <8 x i32>
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer
+ ret <8 x i32> %ret
+}
+
+define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
+; GENERIC-LABEL: zext_8x16_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x16_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <8 x i16> %a to <8 x i32>
+ ret <8 x i32> %x
+}
+
+define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_16x16mem_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x16mem_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i16>,<16 x i16> *%i,align 1
+ %x = zext <16 x i16> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_16x16mem_to_16x32mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x16mem_to_16x32mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i16>,<16 x i16> *%i,align 1
+ %x = sext <16 x i16> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_16x16mem_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16x16mem_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <16 x i16>,<16 x i16> *%i,align 1
+ %x = sext <16 x i16> %a to <16 x i32>
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_16x16_to_16x32mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x16_to_16x32mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <16 x i16> %a to <16 x i32>
+ %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %ret
+}
+
+define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
+; GENERIC-LABEL: zext_16x16_to_16x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x16_to_16x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <16 x i16> %a to <16 x i32>
+ ret <16 x i32> %x
+}
+
+define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_2x16mem_to_2x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_2x16mem_to_2x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i16>,<2 x i16> *%i,align 1
+ %x = zext <2 x i16> %a to <2 x i64>
+ %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_2x16mem_to_2x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_2x16mem_to_2x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i16>,<2 x i16> *%i,align 1
+ %x = sext <2 x i16> %a to <2 x i64>
+ %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_2x16mem_to_2x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_2x16mem_to_2x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i16>,<2 x i16> *%i,align 1
+ %x = sext <2 x i16> %a to <2 x i64>
+ ret <2 x i64> %x
+}
+
+define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_4x16mem_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4x16mem_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i16>,<4 x i16> *%i,align 1
+ %x = zext <4 x i16> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_4x16mem_to_4x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x16mem_to_4x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i16>,<4 x i16> *%i,align 1
+ %x = sext <4 x i16> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_4x16mem_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x16mem_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i16>,<4 x i16> *%i,align 1
+ %x = sext <4 x i16> %a to <4 x i64>
+ ret <4 x i64> %x
+}
+
+define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x16mem_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x16mem_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i16>,<8 x i16> *%i,align 1
+ %x = zext <8 x i16> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_8x16mem_to_8x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x16mem_to_8x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i16>,<8 x i16> *%i,align 1
+ %x = sext <8 x i16> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_8x16mem_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x16mem_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i16>,<8 x i16> *%i,align 1
+ %x = sext <8 x i16> %a to <8 x i64>
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x16_to_8x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x16_to_8x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <8 x i16> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
+; GENERIC-LABEL: zext_8x16_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x16_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ret = zext <8 x i16> %a to <8 x i64>
+ ret <8 x i64> %ret
+}
+
+define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_2x32mem_to_2x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_2x32mem_to_2x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i32>,<2 x i32> *%i,align 1
+ %x = zext <2 x i32> %a to <2 x i64>
+ %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_2x32mem_to_2x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_2x32mem_to_2x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i32>,<2 x i32> *%i,align 1
+ %x = sext <2 x i32> %a to <2 x i64>
+ %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+ ret <2 x i64> %ret
+}
+
+define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_2x32mem_to_2x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_2x32mem_to_2x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <2 x i32>,<2 x i32> *%i,align 1
+ %x = sext <2 x i32> %a to <2 x i64>
+ ret <2 x i64> %x
+}
+
+define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_4x32mem_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4x32mem_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i32>,<4 x i32> *%i,align 1
+ %x = zext <4 x i32> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_4x32mem_to_4x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x32mem_to_4x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i32>,<4 x i32> *%i,align 1
+ %x = sext <4 x i32> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_4x32mem_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x32mem_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <4 x i32>,<4 x i32> *%i,align 1
+ %x = sext <4 x i32> %a to <4 x i64>
+ ret <4 x i64> %x
+}
+
+define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone {
+; GENERIC-LABEL: sext_4x32_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_4x32_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <4 x i32> %a to <4 x i64>
+ ret <4 x i64> %x
+}
+
+define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_4x32_to_4x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4x32_to_4x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <4 x i32> %a to <4 x i64>
+ %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+ ret <4 x i64> %ret
+}
+
+define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x32mem_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x32mem_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i32>,<8 x i32> *%i,align 1
+ %x = zext <8 x i32> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: sext_8x32mem_to_8x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x32mem_to_8x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i32>,<8 x i32> *%i,align 1
+ %x = sext <8 x i32> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
+; GENERIC-LABEL: sext_8x32mem_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x32mem_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load <8 x i32>,<8 x i32> *%i,align 1
+ %x = sext <8 x i32> %a to <8 x i64>
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
+; GENERIC-LABEL: sext_8x32_to_8x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8x32_to_8x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = sext <8 x i32> %a to <8 x i64>
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: zext_8x32_to_8x64mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x32_to_8x64mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = zext <8 x i32> %a to <8 x i64>
+ %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+ ret <8 x i64> %ret
+}
+
+define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
+; GENERIC-LABEL: fptrunc_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fptrunc_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fptrunc <8 x double> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
+; GENERIC-LABEL: fpext_test:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: fpext_test:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = fpext <8 x float> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
+; GENERIC-LABEL: zext_16i1_to_16xi32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16i1_to_16xi32:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i16 %b to <16 x i1>
+ %c = zext <16 x i1> %a to <16 x i32>
+ ret <16 x i32> %c
+}
+
+define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
+; GENERIC-LABEL: zext_8i1_to_8xi64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8i1_to_8xi64:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i8 %b to <8 x i1>
+ %c = zext <8 x i1> %a to <8 x i64>
+ ret <8 x i64> %c
+}
+
+define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
+; GENERIC-LABEL: trunc_16i8_to_16i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: trunc_16i8_to_16i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask_b = trunc <16 x i8>%a to <16 x i1>
+ %mask = bitcast <16 x i1> %mask_b to i16
+ ret i16 %mask
+}
+
+define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
+; GENERIC-LABEL: trunc_16i32_to_16i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: trunc_16i32_to_16i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask_b = trunc <16 x i32>%a to <16 x i1>
+ %mask = bitcast <16 x i1> %mask_b to i16
+ ret i16 %mask
+}
+
+define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
+; GENERIC-LABEL: trunc_4i32_to_4i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: trunc_4i32_to_4i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask_a = trunc <4 x i32>%a to <4 x i1>
+ %mask_b = trunc <4 x i32>%b to <4 x i1>
+ %a_and_b = and <4 x i1>%mask_a, %mask_b
+ %res = sext <4 x i1>%a_and_b to <4 x i32>
+ ret <4 x i32>%res
+}
+
+define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
+; GENERIC-LABEL: trunc_8i16_to_8i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: trunc_8i16_to_8i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask_b = trunc <8 x i16>%a to <8 x i1>
+ %mask = bitcast <8 x i1> %mask_b to i8
+ ret i8 %mask
+}
+
+define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; GENERIC-LABEL: sext_8i1_8i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8i1_8i32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %ymm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = icmp slt <8 x i32> %a1, %a2
+ %x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+ %y = sext <8 x i1> %x1 to <8 x i32>
+ ret <8 x i32> %y
+}
+
+define i16 @trunc_i32_to_i1(i32 %a) {
+; GENERIC-LABEL: trunc_i32_to_i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movw $-4, %ax # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %eax, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kshiftrw $1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftlw $1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33]
+; GENERIC-NEXT: kmovw %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: trunc_i32_to_i1:
+; SKX: # %bb.0:
+; SKX-NEXT: movw $-4, %ax # sched: [1:0.25]
+; SKX-NEXT: kmovd %eax, %k0 # sched: [1:1.00]
+; SKX-NEXT: kshiftrw $1, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kshiftlw $1, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: andl $1, %edi # sched: [1:0.25]
+; SKX-NEXT: kmovw %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a_i = trunc i32 %a to i1
+ %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
+ %res = bitcast <16 x i1> %maskv to i16
+ ret i16 %res
+}
+
+define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; GENERIC-LABEL: sext_8i1_8i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8i1_8i16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = icmp slt <8 x i32> %a1, %a2
+ %y = sext <8 x i1> %x to <8 x i16>
+ ret <8 x i16> %y
+}
+
+define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
+; GENERIC-LABEL: sext_16i1_16i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_16i1_16i32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = icmp slt <16 x i32> %a1, %a2
+ %y = sext <16 x i1> %x to <16 x i32>
+ ret <16 x i32> %y
+}
+
+define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; GENERIC-LABEL: sext_8i1_8i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: sext_8i1_8i64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = icmp slt <8 x i32> %a1, %a2
+ %y = sext <8 x i1> %x to <8 x i64>
+ ret <8 x i64> %y
+}
+
+define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
+; GENERIC-LABEL: extload_v8i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: extload_v8i64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %sign_load = load <8 x i8>, <8 x i8>* %a
+ %c = sext <8 x i8> %sign_load to <8 x i64>
+ store <8 x i64> %c, <8 x i64>* %res
+ ret void
+}
+
+define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: test21:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: kshiftrq $32, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test21:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: kshiftrq $32, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
+ ret <64 x i16> %ret
+}
+
+define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone {
+; GENERIC-LABEL: shuffle_zext_16x8_to_16x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: shuffle_zext_16x8_to_16x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: shuffle_zext_16x8_to_16x16_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: shuffle_zext_16x8_to_16x16_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
+ %bc = bitcast <32 x i8> %x to <16 x i16>
+ %ret = select <16 x i1> %mask, <16 x i16> %bc, <16 x i16> zeroinitializer
+ ret <16 x i16> %ret
+}
+
+define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) {
+; GENERIC-LABEL: zext_32x8_to_16x16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32x8_to_16x16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 8, i32 32, i32 9, i32 32, i32 10, i32 32, i32 11, i32 32, i32 12, i32 32, i32 13, i32 32, i32 14, i32 32, i32 15, i32 32>
+ %2 = bitcast <32 x i8> %1 to <16 x i16>
+ ret <16 x i16> %2
+}
+
+define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) {
+; GENERIC-LABEL: zext_32x8_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32x8_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
+ %2 = bitcast <32 x i8> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) {
+; GENERIC-LABEL: zext_32x8_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32x8_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) {
+; GENERIC-LABEL: zext_16x16_to_8x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x16_to_8x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
+ %2 = bitcast <16 x i16> %1 to <8 x i32>
+ ret <8 x i32> %2
+}
+
+define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) {
+; GENERIC-LABEL: zext_16x16_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16x16_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
+ %2 = bitcast <16 x i16> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
+; GENERIC-LABEL: zext_8x32_to_4x64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_8x32_to_4x64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+ %2 = bitcast <8 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
+; GENERIC-LABEL: zext_64xi1_to_64xi8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_64xi1_to_64xi8:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp eq <64 x i8> %x, %y
+ %1 = zext <64 x i1> %mask to <64 x i8>
+ ret <64 x i8> %1
+}
+
+define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
+; GENERIC-LABEL: zext_32xi1_to_32xi16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32xi1_to_32xi16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp eq <32 x i16> %x, %y
+ %1 = zext <32 x i1> %mask to <32 x i16>
+ ret <32 x i16> %1
+}
+
+define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
+; GENERIC-LABEL: zext_16xi1_to_16xi16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_16xi1_to_16xi16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp eq <16 x i16> %x, %y
+ %1 = zext <16 x i1> %mask to <16 x i16>
+ ret <16 x i16> %1
+}
+
+define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
+; GENERIC-LABEL: zext_32xi1_to_32xi8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_32xi1_to_32xi8:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp eq <32 x i16> %x, %y
+ %1 = zext <32 x i1> %mask to <32 x i8>
+ ret <32 x i8> %1
+}
+
+define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
+; GENERIC-LABEL: zext_4xi1_to_4x32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
+; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_4xi1_to_4x32:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50]
+; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp eq <4 x i8> %x, %y
+ %1 = zext <4 x i1> %mask to <4 x i32>
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
+; GENERIC-LABEL: zext_2xi1_to_2xi64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
+; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_2xi1_to_2xi64:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50]
+; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp eq <2 x i8> %x, %y
+ %1 = zext <2 x i1> %mask to <2 x i64>
+ ret <2 x i64> %1
+}
+
+define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; GENERIC-LABEL: test_x86_fmadd_ps_z:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vaddps %zmm2, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmadd_ps_z:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddps %zmm2, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul <16 x float> %a0, %a1
+ %res = fadd <16 x float> %x, %a2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; GENERIC-LABEL: test_x86_fmsub_ps_z:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmsub_ps_z:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul <16 x float> %a0, %a1
+ %res = fsub <16 x float> %x, %a2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; GENERIC-LABEL: test_x86_fnmadd_ps_z:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vsubps %zmm0, %zmm2, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fnmadd_ps_z:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubps %zmm0, %zmm2, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul <16 x float> %a0, %a1
+ %res = fsub <16 x float> %a2, %x
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; GENERIC-LABEL: test_x86_fnmsub_ps_z:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fnmsub_ps_z:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul <16 x float> %a0, %a1
+ %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00>, %x
+ %res = fsub <16 x float> %y, %a2
+ ret <16 x float> %res
+}
+
+define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; GENERIC-LABEL: test_x86_fmadd_pd_z:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vaddpd %zmm2, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmadd_pd_z:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul <8 x double> %a0, %a1
+ %res = fadd <8 x double> %x, %a2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; GENERIC-LABEL: test_x86_fmsub_pd_z:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vsubpd %zmm2, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmsub_pd_z:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul <8 x double> %a0, %a1
+ %res = fsub <8 x double> %x, %a2
+ ret <8 x double> %res
+}
+
+define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
+; GENERIC-LABEL: test_x86_fmsub_213:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmsub_213:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
+; GENERIC-LABEL: test_x86_fmsub_213_m:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmsub_213_m:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a2 = load double , double *%a2_ptr
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
+; GENERIC-LABEL: test_x86_fmsub_231_m:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmsub_231_m:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a2 = load double , double *%a2_ptr
+ %x = fmul double %a0, %a2
+ %res = fsub double %x, %a1
+ ret double %res
+}
+
+define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; GENERIC-LABEL: test231_br:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [9:1.00]
+; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test231_br:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ %b2 = fadd <16 x float> %b1, %a2
+ ret <16 x float> %b2
+}
+
+define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; GENERIC-LABEL: test213_br:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test213_br:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b1 = fmul <16 x float> %a1, %a2
+ %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ ret <16 x float> %b2
+}
+
+; mask (a*c+b, a): select the fma result or operand a under %mask
+define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; GENERIC-LABEL: test_x86_fmadd132_ps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm2 # sched: [9:1.00]
+; GENERIC-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmadd132_ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmulps (%rdi), %zmm0, %zmm2 # sched: [11:0.50]
+; SKX-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+ %x = fmul <16 x float> %a0, %a2
+ %y = fadd <16 x float> %x, %a1
+ %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0
+ ret <16 x float> %res
+}
+
+; mask (a*c+b, b): select the fma result or operand b under %mask
+define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; GENERIC-LABEL: test_x86_fmadd231_ps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [9:1.00]
+; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmadd231_ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [11:0.50]
+; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [4:0.33]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+ %x = fmul <16 x float> %a0, %a2
+ %y = fadd <16 x float> %x, %a1
+ %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+ ret <16 x float> %res
+}
+
+; mask (b*a+c, b): select the fma result or operand b under %mask
+define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; GENERIC-LABEL: test_x86_fmadd213_ps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_x86_fmadd213_ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [11:0.50]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+  %a2 = load <16 x float>, <16 x float>* %a2_ptrt, align 1
+ %x = fmul <16 x float> %a1, %a0
+ %y = fadd <16 x float> %x, %a2
+ %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+ ret <16 x float> %res
+}
+
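+; Bitwise logic ops on 512-bit integer vectors; the leading broadcast add forces the integer execution domain.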
+define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpandd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpandd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <16 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
+ i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %x = and <16 x i32> %a2, %b
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpandnd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpandnd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <16 x i32> %a, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+ i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %b2 = xor <16 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1,
+ i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %x = and <16 x i32> %a2, %b2
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpord:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpord:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4,
+ i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ %x = or <16 x i32> %a2, %b
+ ret <16 x i32> %x
+}
+
+define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpxord:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpxord:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5,
+ i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %x = xor <16 x i32> %a2, %b
+ ret <16 x i32> %x
+}
+
+define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpandq:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpandq:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i64> %a, <i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6>
+ %x = and <8 x i64> %a2, %b
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpandnq:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpandnq:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+ %b2 = xor <8 x i64> %b, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %x = and <8 x i64> %a2, %b2
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vporq:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vporq:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i64> %a, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+ %x = or <8 x i64> %a2, %b
+ ret <8 x i64> %x
+}
+
+define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; GENERIC-LABEL: vpxorq:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpxorq:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ ; Force the execution domain with an add.
+ %a2 = add <8 x i64> %a, <i64 9, i64 9, i64 9, i64 9, i64 9, i64 9, i64 9, i64 9>
+ %x = xor <8 x i64> %a2, %b
+ ret <8 x i64> %x
+}
+
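+; Logic ops on v64i8 and v32i16 lower to the 512-bit FP-domain vandps/vandnps/vorps/vxorps forms.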
+define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; GENERIC-LABEL: and_v64i8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: and_v64i8:
+; SKX: # %bb.0:
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = and <64 x i8> %a, %b
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; GENERIC-LABEL: andn_v64i8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: andn_v64i8:
+; SKX: # %bb.0:
+; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b2 = xor <64 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = and <64 x i8> %a, %b2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; GENERIC-LABEL: or_v64i8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: or_v64i8:
+; SKX: # %bb.0:
+; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = or <64 x i8> %a, %b
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; GENERIC-LABEL: xor_v64i8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: xor_v64i8:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = xor <64 x i8> %a, %b
+ ret <64 x i8> %res
+}
+
+define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; GENERIC-LABEL: and_v32i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: and_v32i16:
+; SKX: # %bb.0:
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = and <32 x i16> %a, %b
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; GENERIC-LABEL: andn_v32i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: andn_v32i16:
+; SKX: # %bb.0:
+; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b2 = xor <32 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1,
+ i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = and <32 x i16> %a, %b2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; GENERIC-LABEL: or_v32i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: or_v32i16:
+; SKX: # %bb.0:
+; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = or <32 x i16> %a, %b
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; GENERIC-LABEL: xor_v32i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: xor_v32i16:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = xor <32 x i16> %a, %b
+ ret <32 x i16> %res
+}
+
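+; Write-masked FP-domain logic ops; a trailing vaddps/vaddpd consumes the masked result.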
+define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; GENERIC-LABEL: masked_and_v16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: masked_and_v16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = bitcast <16 x float> %a to <16 x i32>
+ %b1 = bitcast <16 x float> %b to <16 x i32>
+ %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %op = and <16 x i32> %a1, %b1
+ %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+ %cast = bitcast <16 x i32> %select to <16 x float>
+ %add = fadd <16 x float> %c, %cast
+ ret <16 x float> %add
+}
+
+define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; GENERIC-LABEL: masked_or_v16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: masked_or_v16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = bitcast <16 x float> %a to <16 x i32>
+ %b1 = bitcast <16 x float> %b to <16 x i32>
+ %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %op = and <16 x i32> %a1, %b1
+ %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+ %cast = bitcast <16 x i32> %select to <16 x float>
+ %add = fadd <16 x float> %c, %cast
+ ret <16 x float> %add
+}
+
+define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; GENERIC-LABEL: masked_xor_v16f32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: masked_xor_v16f32:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = bitcast <16 x float> %a to <16 x i32>
+ %b1 = bitcast <16 x float> %b to <16 x i32>
+ %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+ %mask1 = bitcast i16 %mask to <16 x i1>
+ %op = and <16 x i32> %a1, %b1
+ %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+ %cast = bitcast <16 x i32> %select to <16 x float>
+ %add = fadd <16 x float> %c, %cast
+ ret <16 x float> %add
+}
+
+define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; GENERIC-LABEL: masked_and_v8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: masked_and_v8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = bitcast <8 x double> %a to <8 x i64>
+ %b1 = bitcast <8 x double> %b to <8 x i64>
+ %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+ %mask1 = bitcast i8 %mask to <8 x i1>
+ %op = and <8 x i64> %a1, %b1
+ %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+ %cast = bitcast <8 x i64> %select to <8 x double>
+ %add = fadd <8 x double> %c, %cast
+ ret <8 x double> %add
+}
+
+define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; GENERIC-LABEL: masked_or_v8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: masked_or_v8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = bitcast <8 x double> %a to <8 x i64>
+ %b1 = bitcast <8 x double> %b to <8 x i64>
+ %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+ %mask1 = bitcast i8 %mask to <8 x i1>
+ %op = and <8 x i64> %a1, %b1
+ %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+ %cast = bitcast <8 x i64> %select to <8 x double>
+ %add = fadd <8 x double> %c, %cast
+ ret <8 x double> %add
+}
+
+define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; GENERIC-LABEL: masked_xor_v8f64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: masked_xor_v8f64:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33]
+; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a1 = bitcast <8 x double> %a to <8 x i64>
+ %b1 = bitcast <8 x double> %b to <8 x i64>
+ %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+ %mask1 = bitcast i8 %mask to <8 x i1>
+ %op = and <8 x i64> %a1, %b1
+ %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+ %cast = bitcast <8 x i64> %select to <8 x double>
+ %add = fadd <8 x double> %c, %cast
+ ret <8 x double> %add
+}
+
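+; Intrinsic-style masked logic ops: a plain logic op followed by a masked select on the result.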
+define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
+; GENERIC-LABEL: test_mm512_mask_and_epi32:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_and_epi32:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %and1.i.i = and <8 x i64> %__a, %__b
+ %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
+ %1 = bitcast <8 x i64> %__src to <16 x i32>
+ %2 = bitcast i16 %__k to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+ %4 = bitcast <16 x i32> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
+; GENERIC-LABEL: test_mm512_mask_or_epi32:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_or_epi32:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %or1.i.i = or <8 x i64> %__a, %__b
+ %0 = bitcast <8 x i64> %or1.i.i to <16 x i32>
+ %1 = bitcast <8 x i64> %__src to <16 x i32>
+ %2 = bitcast i16 %__k to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+ %4 = bitcast <16 x i32> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
+; GENERIC-LABEL: test_mm512_mask_xor_epi32:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_xor_epi32:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %xor1.i.i = xor <8 x i64> %__a, %__b
+ %0 = bitcast <8 x i64> %xor1.i.i to <16 x i32>
+ %1 = bitcast <8 x i64> %__src to <16 x i32>
+ %2 = bitcast i16 %__k to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+ %4 = bitcast <16 x i32> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_mask_xor_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_xor_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %xor.i.i = xor <8 x i64> %0, %1
+ %2 = bitcast <8 x i64> %xor.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+ ret <8 x double> %4
+}
+
+define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_xor_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_xor_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %xor.i.i = xor <8 x i64> %0, %1
+ %2 = bitcast <8 x i64> %xor.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %4
+}
+
+define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_mask_xor_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_xor_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %xor.i.i = xor <16 x i32> %0, %1
+ %2 = bitcast <16 x i32> %xor.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+ ret <16 x float> %4
+}
+
+define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_xor_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_xor_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %xor.i.i = xor <16 x i32> %0, %1
+ %2 = bitcast <16 x i32> %xor.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+ ret <16 x float> %4
+}
+
+define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_mask_or_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_or_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %or.i.i = or <8 x i64> %1, %0
+ %2 = bitcast <8 x i64> %or.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+ ret <8 x double> %4
+}
+
+define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_or_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_or_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %or.i.i = or <8 x i64> %1, %0
+ %2 = bitcast <8 x i64> %or.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %4
+}
+
+define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_mask_or_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_or_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %or.i.i = or <16 x i32> %1, %0
+ %2 = bitcast <16 x i32> %or.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+ ret <16 x float> %4
+}
+
+define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_or_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_or_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %or.i.i = or <16 x i32> %1, %0
+ %2 = bitcast <16 x i32> %or.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+ ret <16 x float> %4
+}
+
+define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_mask_and_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_and_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %and.i.i = and <8 x i64> %1, %0
+ %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+ ret <8 x double> %4
+}
+
+define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_and_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_and_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %and.i.i = and <8 x i64> %1, %0
+ %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %4
+}
+
+define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_mask_and_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_and_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %and.i.i = and <16 x i32> %1, %0
+ %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+ ret <16 x float> %4
+}
+
+define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_and_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_and_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %and.i.i = and <16 x i32> %1, %0
+ %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+ ret <16 x float> %4
+}
+
+define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_mask_andnot_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_andnot_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %neg.i.i = xor <8 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %and.i.i = and <8 x i64> %1, %neg.i.i
+ %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+ ret <8 x double> %4
+}
+
+define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_andnot_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_andnot_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <8 x double> %__A to <8 x i64>
+ %neg.i.i = xor <8 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %1 = bitcast <8 x double> %__B to <8 x i64>
+ %and.i.i = and <8 x i64> %1, %neg.i.i
+ %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+ %3 = bitcast i8 %__U to <8 x i1>
+ %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+ ret <8 x double> %4
+}
+
+define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_mask_andnot_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_mask_andnot_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %neg.i.i = xor <16 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %and.i.i = and <16 x i32> %1, %neg.i.i
+ %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+ ret <16 x float> %4
+}
+
+define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; GENERIC-LABEL: test_mm512_maskz_andnot_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kmovd %edi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_maskz_andnot_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00]
+; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = bitcast <16 x float> %__A to <16 x i32>
+ %neg.i.i = xor <16 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %1 = bitcast <16 x float> %__B to <16 x i32>
+ %and.i.i = and <16 x i32> %1, %neg.i.i
+ %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+ %3 = bitcast i16 %__U to <16 x i1>
+ %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+ ret <16 x float> %4
+}
+
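+; Scalar and vector move tests: register moves, scalar loads/stores, and aligned/unaligned/masked 512-bit loads and stores.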
+define i32 @mov_test1(float %x) {
+; GENERIC-LABEL: mov_test1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %xmm0, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = bitcast float %x to i32
+ ret i32 %res
+}
+
+define <4 x i32> @mov_test2(i32 %x) {
+; GENERIC-LABEL: mov_test2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = insertelement <4 x i32>undef, i32 %x, i32 0
+ ret <4 x i32>%res
+}
+
+define <2 x i64> @mov_test3(i64 %x) {
+; GENERIC-LABEL: mov_test3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = insertelement <2 x i64>undef, i64 %x, i32 0
+ ret <2 x i64>%res
+}
+
+define <4 x i32> @mov_test4(i32* %x) {
+; GENERIC-LABEL: mov_test4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %y = load i32, i32* %x
+ %res = insertelement <4 x i32>undef, i32 %y, i32 0
+ ret <4 x i32>%res
+}
+
+define void @mov_test5(float %x, float* %y) {
+; GENERIC-LABEL: mov_test5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovss %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test5:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store float %x, float* %y, align 4
+ ret void
+}
+
+define void @mov_test6(double %x, double* %y) {
+; GENERIC-LABEL: mov_test6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovsd %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovsd %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store double %x, double* %y, align 8
+ ret void
+}
+
+define float @mov_test7(i32* %x) {
+; GENERIC-LABEL: mov_test7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test7:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %y = load i32, i32* %x
+ %res = bitcast i32 %y to float
+ ret float %res
+}
+
+define i32 @mov_test8(<4 x i32> %x) {
+; GENERIC-LABEL: mov_test8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test8:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = extractelement <4 x i32> %x, i32 0
+ ret i32 %res
+}
+
+define i64 @mov_test9(<2 x i64> %x) {
+; GENERIC-LABEL: mov_test9:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test9:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = extractelement <2 x i64> %x, i32 0
+ ret i64 %res
+}
+
+define <4 x i32> @mov_test10(i32* %x) {
+; GENERIC-LABEL: mov_test10:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test10:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %y = load i32, i32* %x, align 4
+ %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+ ret <4 x i32>%res
+}
+
+define <4 x float> @mov_test11(float* %x) {
+; GENERIC-LABEL: mov_test11:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test11:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %y = load float, float* %x, align 4
+ %res = insertelement <4 x float>zeroinitializer, float %y, i32 0
+ ret <4 x float>%res
+}
+
+define <2 x double> @mov_test12(double* %x) {
+; GENERIC-LABEL: mov_test12:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test12:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %y = load double, double* %x, align 8
+ %res = insertelement <2 x double>zeroinitializer, double %y, i32 0
+ ret <2 x double>%res
+}
+
+define <2 x i64> @mov_test13(i64 %x) {
+; GENERIC-LABEL: mov_test13:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test13:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
+ ret <2 x i64>%res
+}
+
+define <4 x i32> @mov_test14(i32 %x) {
+; GENERIC-LABEL: mov_test14:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test14:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
+ ret <4 x i32>%res
+}
+
+define <4 x i32> @mov_test15(i32* %x) {
+; GENERIC-LABEL: mov_test15:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test15:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %y = load i32, i32* %x, align 4
+ %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+ ret <4 x i32>%res
+}
+
+define <16 x i32> @mov_test16(i8 * %addr) {
+; GENERIC-LABEL: mov_test16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test16:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %res = load <16 x i32>, <16 x i32>* %vaddr, align 1
+ ret <16 x i32>%res
+}
+
+define <16 x i32> @mov_test17(i8 * %addr) {
+; GENERIC-LABEL: mov_test17:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test17:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %res = load <16 x i32>, <16 x i32>* %vaddr, align 64
+ ret <16 x i32>%res
+}
+
+define void @mov_test18(i8 * %addr, <8 x i64> %data) {
+; GENERIC-LABEL: mov_test18:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test18:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ store <8 x i64>%data, <8 x i64>* %vaddr, align 64
+ ret void
+}
+
+define void @mov_test19(i8 * %addr, <16 x i32> %data) {
+; GENERIC-LABEL: mov_test19:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test19:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ store <16 x i32>%data, <16 x i32>* %vaddr, align 1
+ ret void
+}
+
+define void @mov_test20(i8 * %addr, <16 x i32> %data) {
+; GENERIC-LABEL: mov_test20:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test20:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ store <16 x i32>%data, <16 x i32>* %vaddr, align 64
+ ret void
+}
+
+define <8 x i64> @mov_test21(i8 * %addr) {
+; GENERIC-LABEL: mov_test21:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test21:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %res = load <8 x i64>, <8 x i64>* %vaddr, align 64
+ ret <8 x i64>%res
+}
+
+define void @mov_test22(i8 * %addr, <8 x i64> %data) {
+; GENERIC-LABEL: mov_test22:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test22:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ store <8 x i64>%data, <8 x i64>* %vaddr, align 1
+ ret void
+}
+
+define <8 x i64> @mov_test23(i8 * %addr) {
+; GENERIC-LABEL: mov_test23:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test23:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %res = load <8 x i64>, <8 x i64>* %vaddr, align 1
+ ret <8 x i64>%res
+}
+
+define void @mov_test24(i8 * %addr, <8 x double> %data) {
+; GENERIC-LABEL: mov_test24:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test24:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ store <8 x double>%data, <8 x double>* %vaddr, align 64
+ ret void
+}
+
+define <8 x double> @mov_test25(i8 * %addr) {
+; GENERIC-LABEL: mov_test25:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test25:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %res = load <8 x double>, <8 x double>* %vaddr, align 64
+ ret <8 x double>%res
+}
+
+define void @mov_test26(i8 * %addr, <16 x float> %data) {
+; GENERIC-LABEL: mov_test26:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test26:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ store <16 x float>%data, <16 x float>* %vaddr, align 64
+ ret void
+}
+
+define <16 x float> @mov_test27(i8 * %addr) {
+; GENERIC-LABEL: mov_test27:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test27:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %res = load <16 x float>, <16 x float>* %vaddr, align 64
+ ret <16 x float>%res
+}
+
+define void @mov_test28(i8 * %addr, <8 x double> %data) {
+; GENERIC-LABEL: mov_test28:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test28:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ store <8 x double>%data, <8 x double>* %vaddr, align 1
+ ret void
+}
+
+define <8 x double> @mov_test29(i8 * %addr) {
+; GENERIC-LABEL: mov_test29:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test29:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %res = load <8 x double>, <8 x double>* %vaddr, align 1
+ ret <8 x double>%res
+}
+
+define void @mov_test30(i8 * %addr, <16 x float> %data) {
+; GENERIC-LABEL: mov_test30:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test30:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ store <16 x float>%data, <16 x float>* %vaddr, align 1
+ ret void
+}
+
+define <16 x float> @mov_test31(i8 * %addr) {
+; GENERIC-LABEL: mov_test31:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test31:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %res = load <16 x float>, <16 x float>* %vaddr, align 1
+ ret <16 x float>%res
+}
+
+define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; GENERIC-LABEL: mov_test32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>, <16 x i32>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+ ret <16 x i32>%res
+}
+
+define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; GENERIC-LABEL: mov_test33:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test33:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>, <16 x i32>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+ ret <16 x i32>%res
+}
+
+define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
+; GENERIC-LABEL: mov_test34:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test34:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>, <16 x i32>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+ ret <16 x i32>%res
+}
+
+define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
+; GENERIC-LABEL: mov_test35:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test35:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>, <16 x i32>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+ ret <16 x i32>%res
+}
+
+define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; GENERIC-LABEL: mov_test36:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test36:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>, <8 x i64>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+ ret <8 x i64>%res
+}
+
+define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; GENERIC-LABEL: mov_test37:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test37:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>, <8 x i64>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+ ret <8 x i64>%res
+}
+
+define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
+; GENERIC-LABEL: mov_test38:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test38:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>, <8 x i64>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+ ret <8 x i64>%res
+}
+
+define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
+; GENERIC-LABEL: mov_test39:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test39:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ %r = load <8 x i64>, <8 x i64>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+ ret <8 x i64>%res
+}
+
+define <16 x float> @mov_test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; GENERIC-LABEL: mov_test40:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test40:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>, <16 x float>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+ ret <16 x float>%res
+}
+
+define <16 x float> @mov_test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; GENERIC-LABEL: mov_test41:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test41:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>, <16 x float>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+ ret <16 x float>%res
+}
+
+define <16 x float> @mov_test42(i8 * %addr, <16 x float> %mask1) {
+; GENERIC-LABEL: mov_test42:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test42:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>, <16 x float>* %vaddr, align 64
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+ ret <16 x float>%res
+}
+
+define <16 x float> @mov_test43(i8 * %addr, <16 x float> %mask1) {
+; GENERIC-LABEL: mov_test43:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test43:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <16 x float> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x float>*
+ %r = load <16 x float>, <16 x float>* %vaddr, align 1
+ %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+ ret <16 x float>%res
+}
+
+define <8 x double> @mov_test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; GENERIC-LABEL: mov_test44:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test44:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>, <8 x double>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+ ret <8 x double>%res
+}
+
+define <8 x double> @mov_test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; GENERIC-LABEL: mov_test45:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test45:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>, <8 x double>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+ ret <8 x double>%res
+}
+
+define <8 x double> @mov_test46(i8 * %addr, <8 x double> %mask1) {
+; GENERIC-LABEL: mov_test46:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test46:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>, <8 x double>* %vaddr, align 64
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+ ret <8 x double>%res
+}
+
+define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) {
+; GENERIC-LABEL: mov_test47:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mov_test47:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = fcmp one <8 x double> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <8 x double>*
+ %r = load <8 x double>, <8 x double>* %vaddr, align 1
+ %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+ ret <8 x double>%res
+}
+
+define i16 @mask16(i16 %x) {
+; GENERIC-LABEL: mask16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mask16:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <16 x i1> %m1 to i16
+ ret i16 %ret
+}
+
+define i32 @mask16_zext(i16 %x) {
+; GENERIC-LABEL: mask16_zext:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mask16_zext:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %m2 = bitcast <16 x i1> %m1 to i16
+ %ret = zext i16 %m2 to i32
+ ret i32 %ret
+}
+
+define i8 @mask8(i8 %x) {
+; GENERIC-LABEL: mask8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mask8:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ ret i8 %ret
+}
+
+define i32 @mask8_zext(i8 %x) {
+; GENERIC-LABEL: mask8_zext:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mask8_zext:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %m2 = bitcast <8 x i1> %m1 to i8
+ %ret = zext i8 %m2 to i32
+ ret i32 %ret
+}
+
+define void @mask16_mem(i16* %ptr) {
+; GENERIC-LABEL: mask16_mem:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovw (%rdi), %k0
+; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovw %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mask16_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = load i16, i16* %ptr, align 4
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <16 x i1> %m1 to i16
+ store i16 %ret, i16* %ptr, align 4
+ ret void
+}
+
+define void @mask8_mem(i8* %ptr) {
+; GENERIC-LABEL: mask8_mem:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovb (%rdi), %k0
+; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mask8_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = load i8, i8* %ptr, align 4
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+ %ret = bitcast <8 x i1> %m1 to i8
+ store i8 %ret, i8* %ptr, align 4
+ ret void
+}
+
+define i16 @mand16(i16 %x, i16 %y) {
+; GENERIC-LABEL: mand16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: xorl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: andl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: orl %eax, %edi # sched: [1:0.33]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mand16:
+; SKX: # %bb.0:
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: xorl %esi, %eax # sched: [1:0.25]
+; SKX-NEXT: andl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: orl %eax, %edi # sched: [1:0.25]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ma = bitcast i16 %x to <16 x i1>
+ %mb = bitcast i16 %y to <16 x i1>
+ %mc = and <16 x i1> %ma, %mb
+ %md = xor <16 x i1> %ma, %mb
+ %me = or <16 x i1> %mc, %md
+ %ret = bitcast <16 x i1> %me to i16
+ ret i16 %ret
+}
+
+define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
+; GENERIC-LABEL: mand16_mem:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovw (%rdi), %k0
+; GENERIC-NEXT: kmovw (%rsi), %k1
+; GENERIC-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00]
+; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: mand16_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: kmovw (%rsi), %k1 # sched: [7:1.00]
+; SKX-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00]
+; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ma = load <16 x i1>, <16 x i1>* %x
+ %mb = load <16 x i1>, <16 x i1>* %y
+ %mc = and <16 x i1> %ma, %mb
+ %md = xor <16 x i1> %ma, %mb
+ %me = or <16 x i1> %mc, %md
+ %ret = bitcast <16 x i1> %me to i16
+ ret i16 %ret
+}
+
+define i8 @shuf_test1(i16 %v) nounwind {
+; GENERIC-LABEL: shuf_test1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kshiftrw $8, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: shuf_test1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kshiftrw $8, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %v1 = bitcast i16 %v to <16 x i1>
+ %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %mask1 = bitcast <8 x i1> %mask to i8
+ ret i8 %mask1
+}
+
+define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
+; GENERIC-LABEL: zext_test1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_test1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kshiftrw $5, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: andl $1, %eax # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i32
+ ret i32 %res
+}
+
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+; GENERIC-LABEL: zext_test2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_test2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kshiftrw $5, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: andl $1, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i16
+ ret i16 %res
+}
+
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+; GENERIC-LABEL: zext_test3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: kshiftrw $5, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: andb $1, %al # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: zext_test3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kshiftrw $5, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: andb $1, %al # sched: [1:0.25]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i8
+ ret i8 %res
+}
+
+define i8 @conv1(<8 x i1>* %R) {
+; GENERIC-LABEL: conv1:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: movb $-2, %al # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: conv1:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKX-NEXT: movb $-2, %al # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
+
+ %maskPtr = alloca <8 x i1>
+ store <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %maskPtr
+ %mask = load <8 x i1>, <8 x i1>* %maskPtr
+ %mask_convert = bitcast <8 x i1> %mask to i8
+ ret i8 %mask_convert
+}
+
+define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) {
+; GENERIC-LABEL: test4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x_gt_y = icmp sgt <4 x i64> %x, %y
+ %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
+ %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1
+ %resse = sext <4 x i1>%res to <4 x i32>
+ ret <4 x i32> %resse
+}
+
+define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) {
+; GENERIC-LABEL: vcmp_test5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vcmp_test5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x_gt_y = icmp slt <2 x i64> %x, %y
+ %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1
+ %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1
+ %resse = sext <2 x i1>%res to <2 x i64>
+ ret <2 x i64> %resse
+}
+define void @vcmp_test6(<16 x i1> %mask) {
+allocas:
+ %a= and <16 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+ %b = bitcast <16 x i1> %a to i16
+ %c = icmp eq i16 %b, 0
+ br i1 %c, label %true, label %false
+
+true:
+ ret void
+
+false:
+ ret void
+}
+define void @vcmp_test7(<8 x i1> %mask) {
+; GENERIC-LABEL: vcmp_test7:
+; GENERIC: # %bb.0: # %allocas
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: movb $85, %al # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vcmp_test7:
+; SKX: # %bb.0: # %allocas
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: movb $85, %al # sched: [1:0.25]
+; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00]
+; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+allocas:
+ %a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+ %b = bitcast <8 x i1> %a to i8
+ %c = icmp eq i8 %b, 0
+ br i1 %c, label %true, label %false
+
+true:
+ ret void
+
+false:
+ ret void
+}
+define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
+; GENERIC-LABEL: vcmp_test8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: jg .LBB386_1 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.2:
+; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB386_1:
+; GENERIC-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vcmp_test8:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: jg .LBB386_1 # sched: [1:0.50]
+; SKX-NEXT: # %bb.2:
+; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB386_1:
+; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %cond = icmp sgt i32 %a1, %b1
+ %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
+ %cmp2 = icmp ult <16 x i32> %b, zeroinitializer
+ %mix = select i1 %cond, <16 x i1> %cmp1, <16 x i1> %cmp2
+ %res = sext <16 x i1> %mix to <16 x i8>
+ ret <16 x i8> %res
+}
+define <16 x i1> @vpmov_test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
+; GENERIC-LABEL: vpmov_test9:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: jg .LBB387_1 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.2:
+; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: jmp .LBB387_3 # sched: [1:1.00]
+; GENERIC-NEXT: .LBB387_1:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: .LBB387_3:
+; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vpmov_test9:
+; SKX: # %bb.0:
+; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: jg .LBB387_1 # sched: [1:0.50]
+; SKX-NEXT: # %bb.2:
+; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: jmp .LBB387_3 # sched: [1:0.50]
+; SKX-NEXT: .LBB387_1:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: .LBB387_3:
+; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp sgt i32 %a1, %b1
+ %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b
+ ret <16 x i1>%c
+}
+define <8 x i1> @vpmov_test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) {
+ %mask = icmp sgt i32 %a1, %b1
+ %c = select i1 %mask, <8 x i1>%a, <8 x i1>%b
+ ret <8 x i1>%c
+}
+
+define <4 x i1> @vmov_test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
+; GENERIC-LABEL: vmov_test11:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: jg .LBB389_1 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.2:
+; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: jmp .LBB389_3 # sched: [1:1.00]
+; GENERIC-NEXT: .LBB389_1:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: .LBB389_3:
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test11:
+; SKX: # %bb.0:
+; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: jg .LBB389_1 # sched: [1:0.50]
+; SKX-NEXT: # %bb.2:
+; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: jmp .LBB389_3 # sched: [1:0.50]
+; SKX-NEXT: .LBB389_1:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: .LBB389_3:
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp sgt i32 %a1, %b1
+ %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b
+ ret <4 x i1>%c
+}
+
+define i32 @vmov_test12(i32 %x, i32 %y) {
+; GENERIC-LABEL: vmov_test12:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test12:
+; SKX: # %bb.0:
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i16 21845 to <16 x i1>
+ %b = extractelement <16 x i1> %a, i32 0
+ %c = select i1 %b, i32 %x, i32 %y
+ ret i32 %c
+}
+
+define i32 @vmov_test13(i32 %x, i32 %y) {
+; GENERIC-LABEL: vmov_test13:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test13:
+; SKX: # %bb.0:
+; SKX-NEXT: movl %esi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i16 21845 to <16 x i1>
+ %b = extractelement <16 x i1> %a, i32 3
+ %c = select i1 %b, i32 %x, i32 %y
+ ret i32 %c
+}
+define <4 x i1> @vmov_test14() {
+ %a = bitcast i16 21845 to <16 x i1>
+ %b = extractelement <16 x i1> %a, i32 2
+ %c = insertelement <4 x i1> <i1 true, i1 false, i1 false, i1 true>, i1 %b, i32 1
+ ret <4 x i1> %c
+}
+
+define <16 x i1> @vmov_test15(i32 %x, i32 %y) {
+; GENERIC-LABEL: vmov_test15:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: movw $21845, %ax # imm = 0x5555
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: movw $1, %cx # sched: [1:0.33]
+; GENERIC-NEXT: cmovgw %ax, %cx # sched: [2:0.67]
+; GENERIC-NEXT: kmovd %ecx, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test15:
+; SKX: # %bb.0:
+; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: movw $21845, %ax # imm = 0x5555
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: movw $1, %cx # sched: [1:0.25]
+; SKX-NEXT: cmovgw %ax, %cx # sched: [1:0.50]
+; SKX-NEXT: kmovd %ecx, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i16 21845 to <16 x i1>
+ %b = bitcast i16 1 to <16 x i1>
+ %mask = icmp sgt i32 %x, %y
+ %c = select i1 %mask, <16 x i1> %a, <16 x i1> %b
+ ret <16 x i1> %c
+}
+
+define <64 x i8> @vmov_test16(i64 %x) {
+;
+; GENERIC-LABEL: vmov_test16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovq %rdi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: movb $1, %al # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kshiftrq $5, %k0, %k2 # sched: [1:1.00]
+; GENERIC-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftlq $63, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrq $58, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test16:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovq %rdi, %k0 # sched: [1:1.00]
+; SKX-NEXT: movb $1, %al # sched: [1:0.25]
+; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00]
+; SKX-NEXT: kshiftrq $5, %k0, %k2 # sched: [3:1.00]
+; SKX-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; SKX-NEXT: kshiftlq $63, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kshiftrq $58, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i64 %x to <64 x i1>
+ %b = insertelement <64 x i1>%a, i1 true, i32 5
+ %c = sext <64 x i1>%b to <64 x i8>
+ ret <64 x i8>%c
+}
+
+define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) {
+;
+; GENERIC-LABEL: vmov_test17:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovq %rdi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: cmpl %edx, %esi # sched: [1:0.33]
+; GENERIC-NEXT: setg %al # sched: [1:0.50]
+; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kshiftrq $5, %k0, %k2 # sched: [1:1.00]
+; GENERIC-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftlq $63, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrq $58, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test17:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovq %rdi, %k0 # sched: [1:1.00]
+; SKX-NEXT: cmpl %edx, %esi # sched: [1:0.25]
+; SKX-NEXT: setg %al # sched: [1:0.50]
+; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00]
+; SKX-NEXT: kshiftrq $5, %k0, %k2 # sched: [3:1.00]
+; SKX-NEXT: kxorq %k1, %k2, %k1 # sched: [1:1.00]
+; SKX-NEXT: kshiftlq $63, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kshiftrq $58, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kxorq %k0, %k1, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = bitcast i64 %x to <64 x i1>
+ %b = icmp sgt i32 %y, %z
+ %c = insertelement <64 x i1>%a, i1 %b, i32 5
+ %d = sext <64 x i1>%c to <64 x i8>
+ ret <64 x i8>%d
+}
+
+define <8 x i1> @vmov_test18(i8 %a, i16 %y) {
+; GENERIC-LABEL: vmov_test18:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kshiftrw $8, %k1, %k2 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrw $9, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrb $6, %k0, %k3 # sched: [1:1.00]
+; GENERIC-NEXT: kxorb %k1, %k3, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftlb $7, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrb $1, %k1, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kxorb %k0, %k1, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftlb $1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftrb $1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kshiftlb $7, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test18:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kshiftrw $8, %k1, %k2 # sched: [3:1.00]
+; SKX-NEXT: kshiftrw $9, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kshiftrb $6, %k0, %k3 # sched: [3:1.00]
+; SKX-NEXT: kxorb %k1, %k3, %k1 # sched: [1:1.00]
+; SKX-NEXT: kshiftlb $7, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kshiftrb $1, %k1, %k1 # sched: [3:1.00]
+; SKX-NEXT: kxorb %k0, %k1, %k0 # sched: [1:1.00]
+; SKX-NEXT: kshiftlb $1, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kshiftrb $1, %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kshiftlb $7, %k2, %k1 # sched: [3:1.00]
+; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = bitcast i8 %a to <8 x i1>
+ %b1 = bitcast i16 %y to <16 x i1>
+ %el1 = extractelement <16 x i1>%b1, i32 8
+ %el2 = extractelement <16 x i1>%b1, i32 9
+ %c = insertelement <8 x i1>%b, i1 %el1, i32 7
+ %d = insertelement <8 x i1>%c, i1 %el2, i32 6
+ ret <8 x i1>%d
+}
+define <32 x i16> @vmov_test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
+; GENERIC-LABEL: vmov_test21:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test21:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define void @vmov_test22(<4 x i1> %a, <4 x i1>* %addr) {
+; GENERIC-LABEL: vmov_test22:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test22:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store <4 x i1> %a, <4 x i1>* %addr
+ ret void
+}
+
+define void @vmov_test23(<2 x i1> %a, <2 x i1>* %addr) {
+; GENERIC-LABEL: vmov_test23:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: vmov_test23:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store <2 x i1> %a, <2 x i1>* %addr
+ ret void
+}
+
+define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
+; GENERIC-LABEL: store_v1i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rsi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_v1i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00]
+; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = xor <1 x i1> %c, <i1 1>
+ store <1 x i1> %x, <1 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
+; GENERIC-LABEL: store_v2i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_v2i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = xor <2 x i1> %c, <i1 1, i1 1>
+ store <2 x i1> %x, <2 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
+; GENERIC-LABEL: store_v4i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_v4i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
+ store <4 x i1> %x, <4 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
+; GENERIC-LABEL: store_v8i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_v8i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+ store <8 x i1> %x, <8 x i1>* %ptr, align 4
+ ret void
+}
+
+define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
+; GENERIC-LABEL: store_v16i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovw %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_v16i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+ store <16 x i1> %x, <16 x i1>* %ptr, align 4
+ ret void
+}
+
+;void f2(int);
+;void f1(int c)
+;{
+; static int v = 0;
+; if (v == 0)
+; v = 1;
+; else
+; v = 0;
+; f2(v);
+;}
+
+@f1.v = internal unnamed_addr global i1 false, align 4
+
+define void @f1(i32 %c) {
+; GENERIC-LABEL: f1:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50]
+; GENERIC-NEXT: xorl $1, %edi # sched: [1:0.33]
+; GENERIC-NEXT: movb %dil, {{.*}}(%rip) # sched: [5:1.00]
+; GENERIC-NEXT: jmp f2 # TAILCALL
+;
+; SKX-LABEL: f1:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50]
+; SKX-NEXT: xorl $1, %edi # sched: [1:0.25]
+; SKX-NEXT: movb %dil, {{.*}}(%rip) # sched: [1:1.00]
+; SKX-NEXT: jmp f2 # TAILCALL
+entry:
+ %.b1 = load i1, i1* @f1.v, align 4
+ %not..b1 = xor i1 %.b1, true
+ store i1 %not..b1, i1* @f1.v, align 4
+ %0 = zext i1 %not..b1 to i32
+ tail call void @f2(i32 %0) #2
+ ret void
+}
+
+declare void @f2(i32) #1
+
+define void @store_i16_i1(i16 %x, i1 *%y) {
+; GENERIC-LABEL: store_i16_i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33]
+; GENERIC-NEXT: movb %dil, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_i16_i1:
+; SKX: # %bb.0:
+; SKX-NEXT: andl $1, %edi # sched: [1:0.25]
+; SKX-NEXT: movb %dil, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %c = trunc i16 %x to i1
+ store i1 %c, i1* %y
+ ret void
+}
+
+define void @store_i8_i1(i8 %x, i1 *%y) {
+; GENERIC-LABEL: store_i8_i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33]
+; GENERIC-NEXT: movb %dil, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_i8_i1:
+; SKX: # %bb.0:
+; SKX-NEXT: andl $1, %edi # sched: [1:0.25]
+; SKX-NEXT: movb %dil, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %c = trunc i8 %x to i1
+ store i1 %c, i1* %y
+ ret void
+}
+
+define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
+; GENERIC-LABEL: test_build_vec_v32i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl $1497715861, %eax # imm = 0x59455495
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %eax, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_build_vec_v32i1:
+; SKX: # %bb.0:
+; SKX-NEXT: movl $1497715861, %eax # imm = 0x59455495
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
+ ret <32 x i16> %ret
+}
+
+define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
+; GENERIC-LABEL: test_build_vec_v64i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_build_vec_v64i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %ret = select <64 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <64 x i8> %x, <64 x i8> zeroinitializer
+ ret <64 x i8> %ret
+}
+
+define void @ktest_1(<8 x double> %in, double * %base) {
+; GENERIC-LABEL: ktest_1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovupd (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: ktestb %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.1: # %L1
+; GENERIC-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB410_2: # %L2
+; GENERIC-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ktest_1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovupd (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00]
+; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: je .LBB410_2 # sched: [1:0.50]
+; SKX-NEXT: # %bb.1: # %L1
+; SKX-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB410_2: # %L2
+; SKX-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %addr1 = getelementptr double, double * %base, i64 0
+ %addr2 = getelementptr double, double * %base, i64 1
+
+ %vaddr1 = bitcast double* %addr1 to <8 x double>*
+ %vaddr2 = bitcast double* %addr2 to <8 x double>*
+
+ %val1 = load <8 x double>, <8 x double> *%vaddr1, align 1
+ %val2 = load <8 x double>, <8 x double> *%vaddr2, align 1
+
+ %sel1 = fcmp ogt <8 x double>%in, %val1
+ %val3 = select <8 x i1> %sel1, <8 x double> %val2, <8 x double> zeroinitializer
+ %sel2 = fcmp olt <8 x double> %in, %val3
+ %sel3 = and <8 x i1> %sel1, %sel2
+
+ %int_sel3 = bitcast <8 x i1> %sel3 to i8
+ %res = icmp eq i8 %int_sel3, zeroinitializer
+ br i1 %res, label %L2, label %L1
+L1:
+ store <8 x double> %in, <8 x double>* %vaddr1
+ br label %End
+L2:
+ store <8 x double> %in, <8 x double>* %vaddr2
+ br label %End
+End:
+ ret void
+}
+
+define void @ktest_2(<32 x float> %in, float * %base) {
+;
+; GENERIC-LABEL: ktest_2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovups (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vmovups 64(%rdi), %zmm3 # sched: [4:0.50]
+; GENERIC-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00]
+; GENERIC-NEXT: kunpckwd %k1, %k2, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [4:0.50]
+; GENERIC-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [4:0.50]
+; GENERIC-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00]
+; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: ktestd %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: je .LBB411_2 # sched: [1:1.00]
+; GENERIC-NEXT: # %bb.1: # %L1
+; GENERIC-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB411_2: # %L2
+; GENERIC-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: ktest_2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT: vmovups 64(%rdi), %zmm3 # sched: [8:0.50]
+; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00]
+; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [8:0.50]
+; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: kunpckwd %k1, %k2, %k0 # sched: [3:1.00]
+; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00]
+; SKX-NEXT: kunpckwd %k1, %k2, %k1 # sched: [3:1.00]
+; SKX-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: ktestd %k0, %k0 # sched: [3:1.00]
+; SKX-NEXT: je .LBB411_2 # sched: [1:0.50]
+; SKX-NEXT: # %bb.1: # %L1
+; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: .LBB411_2: # %L2
+; SKX-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %addr1 = getelementptr float, float * %base, i64 0
+ %addr2 = getelementptr float, float * %base, i64 1
+
+ %vaddr1 = bitcast float* %addr1 to <32 x float>*
+ %vaddr2 = bitcast float* %addr2 to <32 x float>*
+
+ %val1 = load <32 x float>, <32 x float> *%vaddr1, align 1
+ %val2 = load <32 x float>, <32 x float> *%vaddr2, align 1
+
+ %sel1 = fcmp ogt <32 x float>%in, %val1
+ %val3 = select <32 x i1> %sel1, <32 x float> %val2, <32 x float> zeroinitializer
+ %sel2 = fcmp olt <32 x float> %in, %val3
+ %sel3 = or <32 x i1> %sel1, %sel2
+
+ %int_sel3 = bitcast <32 x i1> %sel3 to i32
+ %res = icmp eq i32 %int_sel3, zeroinitializer
+ br i1 %res, label %L2, label %L1
+L1:
+ store <32 x float> %in, <32 x float>* %vaddr1
+ br label %End
+L2:
+ store <32 x float> %in, <32 x float>* %vaddr2
+ br label %End
+End:
+ ret void
+}
+
+define <8 x i64> @load_8i1(<8 x i1>* %a) {
+; GENERIC-LABEL: load_8i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovb (%rdi), %k0
+; GENERIC-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: load_8i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2q %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = load <8 x i1>, <8 x i1>* %a
+ %c = sext <8 x i1> %b to <8 x i64>
+ ret <8 x i64> %c
+}
+
+define <16 x i32> @load_16i1(<16 x i1>* %a) {
+; GENERIC-LABEL: load_16i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovw (%rdi), %k0
+; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: load_16i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = load <16 x i1>, <16 x i1>* %a
+ %c = sext <16 x i1> %b to <16 x i32>
+ ret <16 x i32> %c
+}
+
+define <2 x i16> @load_2i1(<2 x i1>* %a) {
+; GENERIC-LABEL: load_2i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovb (%rdi), %k0
+; GENERIC-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: load_2i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = load <2 x i1>, <2 x i1>* %a
+ %c = sext <2 x i1> %b to <2 x i16>
+ ret <2 x i16> %c
+}
+
+define <4 x i16> @load_4i1(<4 x i1>* %a) {
+; GENERIC-LABEL: load_4i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovb (%rdi), %k0
+; GENERIC-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: load_4i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = load <4 x i1>, <4 x i1>* %a
+ %c = sext <4 x i1> %b to <4 x i16>
+ ret <4 x i16> %c
+}
+
+define <32 x i16> @load_32i1(<32 x i1>* %a) {
+; GENERIC-LABEL: load_32i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd (%rdi), %k0
+; GENERIC-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: load_32i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2w %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = load <32 x i1>, <32 x i1>* %a
+ %c = sext <32 x i1> %b to <32 x i16>
+ ret <32 x i16> %c
+}
+
+define <64 x i8> @load_64i1(<64 x i1>* %a) {
+; GENERIC-LABEL: load_64i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovq (%rdi), %k0
+; GENERIC-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: load_64i1:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovq (%rdi), %k0 # sched: [7:1.00]
+; SKX-NEXT: vpmovm2b %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = load <64 x i1>, <64 x i1>* %a
+ %c = sext <64 x i1> %b to <64 x i8>
+ ret <64 x i8> %c
+}
+
+define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
+; GENERIC-LABEL: store_8i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_8i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store <8 x i1> %v, <8 x i1>* %a
+ ret void
+}
+
+define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
+; GENERIC-LABEL: store_8i1_1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovb %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_8i1_1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %v1 = trunc <8 x i16> %v to <8 x i1>
+ store <8 x i1> %v1, <8 x i1>* %a
+ ret void
+}
+
+define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
+; GENERIC-LABEL: store_16i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovw %k0, (%rdi)
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_16i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store <16 x i1> %v, <16 x i1>* %a
+ ret void
+}
+
+define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
+; GENERIC-LABEL: store_32i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %k0, (%rdi)
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_32i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store <32 x i1> %v, <32 x i1>* %a
+ ret void
+}
+
+define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
+; GENERIC-LABEL: store_32i1_1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %k0, (%rdi)
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_32i1_1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %v1 = trunc <32 x i16> %v to <32 x i1>
+ store <32 x i1> %v1, <32 x i1>* %a
+ ret void
+}
+
+
+define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
+;
+; GENERIC-LABEL: store_64i1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovq %k0, (%rdi)
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: store_64i1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovq %k0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ store <64 x i1> %v, <64 x i1>* %a
+ ret void
+}
+
+define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
+; GENERIC-LABEL: test_bitcast_v8i1_zext:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: kmovb %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_bitcast_v8i1_zext:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: addl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %v1 = icmp eq <16 x i32> %a, zeroinitializer
+ %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %mask1 = bitcast <8 x i1> %mask to i8
+ %val = zext i8 %mask1 to i32
+ %val1 = add i32 %val, %val
+ ret i32 %val1
+}
+
+define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
+; GENERIC-LABEL: test_bitcast_v16i1_zext:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: kmovw %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33]
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_bitcast_v16i1_zext:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: addl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %v1 = icmp eq <16 x i32> %a, zeroinitializer
+ %mask1 = bitcast <16 x i1> %v1 to i16
+ %val = zext i16 %mask1 to i32
+ %val1 = add i32 %val, %val
+ ret i32 %val1
+}
+
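+; Arithmetic on i1 vectors stays in mask registers: add and sub are addition
+; modulo 2 (XOR), so they lower to kxorw/kxorb, and mul is AND, so it lowers
+; to kandw/kandb.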
+define i16 @test_v16i1_add(i16 %x, i16 %y) {
+; GENERIC-LABEL: test_v16i1_add:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_v16i1_add:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = bitcast i16 %y to <16 x i1>
+ %m2 = add <16 x i1> %m0, %m1
+ %ret = bitcast <16 x i1> %m2 to i16
+ ret i16 %ret
+}
+
+define i16 @test_v16i1_sub(i16 %x, i16 %y) {
+; GENERIC-LABEL: test_v16i1_sub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_v16i1_sub:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = bitcast i16 %y to <16 x i1>
+ %m2 = sub <16 x i1> %m0, %m1
+ %ret = bitcast <16 x i1> %m2 to i16
+ ret i16 %ret
+}
+
+define i16 @test_v16i1_mul(i16 %x, i16 %y) {
+; GENERIC-LABEL: test_v16i1_mul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_v16i1_mul:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i16 %x to <16 x i1>
+ %m1 = bitcast i16 %y to <16 x i1>
+ %m2 = mul <16 x i1> %m0, %m1
+ %ret = bitcast <16 x i1> %m2 to i16
+ ret i16 %ret
+}
+
+define i8 @test_v8i1_add(i8 %x, i8 %y) {
+; GENERIC-LABEL: test_v8i1_add:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_v8i1_add:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = bitcast i8 %y to <8 x i1>
+ %m2 = add <8 x i1> %m0, %m1
+ %ret = bitcast <8 x i1> %m2 to i8
+ ret i8 %ret
+}
+
+define i8 @test_v8i1_sub(i8 %x, i8 %y) {
+; GENERIC-LABEL: test_v8i1_sub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_v8i1_sub:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = bitcast i8 %y to <8 x i1>
+ %m2 = sub <8 x i1> %m0, %m1
+ %ret = bitcast <8 x i1> %m2 to i8
+ ret i8 %ret
+}
+
+define i8 @test_v8i1_mul(i8 %x, i8 %y) {
+; GENERIC-LABEL: test_v8i1_mul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovd %edi, %k0 # sched: [1:0.33]
+; GENERIC-NEXT: kmovd %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00]
+; GENERIC-NEXT: kmovd %k0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %al killed %al killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_v8i1_mul:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; SKX-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %al killed %al killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+ %m0 = bitcast i8 %x to <8 x i1>
+ %m1 = bitcast i8 %y to <8 x i1>
+ %m2 = mul <8 x i1> %m0, %m1
+ %ret = bitcast <8 x i1> %m2 to i8
+ ret i8 %ret
+}
+
+define <16 x i32> @_inreg16xi32(i32 %a) {
+; GENERIC-LABEL: _inreg16xi32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastd %edi, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _inreg16xi32:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = insertelement <16 x i32> undef, i32 %a, i32 0
+ %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32> %c
+}
+
+define <8 x i64> @_inreg8xi64(i64 %a) {
+; GENERIC-LABEL: _inreg8xi64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _inreg8xi64:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = insertelement <8 x i64> undef, i64 %a, i32 0
+ %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %c
+}
+
+define <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
+; GENERIC-LABEL: _ss16xfloat_v4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _ss16xfloat_v4:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %b
+}
+
+define <16 x float> @_inreg16xfloat(float %a) {
+; GENERIC-LABEL: _inreg16xfloat:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _inreg16xfloat:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %c
+}
+
+define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
+; GENERIC-LABEL: _ss16xfloat_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _ss16xfloat_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+ ret <16 x float> %r
+}
+
+define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
+; GENERIC-LABEL: _ss16xfloat_maskz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _ss16xfloat_maskz:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+ ret <16 x float> %r
+}
+
+define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
+; GENERIC-LABEL: _ss16xfloat_load:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _ss16xfloat_load:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load float, float* %a.ptr
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %c
+}
+
+define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
+; GENERIC-LABEL: _ss16xfloat_mask_load:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _ss16xfloat_mask_load:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load float, float* %a.ptr
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+ ret <16 x float> %r
+}
+
+define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
+; GENERIC-LABEL: _ss16xfloat_maskz_load:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _ss16xfloat_maskz_load:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load float, float* %a.ptr
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+ ret <16 x float> %r
+}
+
+define <8 x double> @_inreg8xdouble(double %a) {
+; GENERIC-LABEL: _inreg8xdouble:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _inreg8xdouble:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %c
+}
+
+define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
+; GENERIC-LABEL: _sd8xdouble_mask:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _sd8xdouble_mask:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+ ret <8 x double> %r
+}
+
+define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
+; GENERIC-LABEL: _sd8xdouble_maskz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _sd8xdouble_maskz:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+ ret <8 x double> %r
+}
+
+define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
+; GENERIC-LABEL: _sd8xdouble_load:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _sd8xdouble_load:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load double, double* %a.ptr
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %c
+}
+
+define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
+; GENERIC-LABEL: _sd8xdouble_mask_load:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _sd8xdouble_mask_load:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load double, double* %a.ptr
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+ ret <8 x double> %r
+}
+
+define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
+; GENERIC-LABEL: _sd8xdouble_maskz_load:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _sd8xdouble_maskz_load:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = load double, double* %a.ptr
+ %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+ ret <8 x double> %r
+}
+
+define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+; GENERIC-LABEL: _xmm16xi32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _xmm16xi32:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32> %b
+}
+
+define <16 x float> @_xmm16xfloat(<16 x float> %a) {
+; GENERIC-LABEL: _xmm16xfloat:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _xmm16xfloat:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %b
+}
+
+define <16 x i32> @test_vbroadcast() {
+; GENERIC-LABEL: test_vbroadcast:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00]
+; GENERIC-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: knotw %k0, %k1 # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_vbroadcast:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2d %k0, %zmm0 # sched: [1:0.25]
+; SKX-NEXT: knotw %k0, %k1 # sched: [1:1.00]
+; SKX-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = sext <16 x i1> zeroinitializer to <16 x i32>
+ %1 = fcmp uno <16 x float> undef, zeroinitializer
+ %2 = sext <16 x i1> %1 to <16 x i32>
+ %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2
+ ret <16 x i32> %3
+}
+
+; We implement the set1 intrinsics with vector initializers. Verify that the
+; IR generated will produce broadcasts at the end.
+define <8 x double> @test_set1_pd(double %d) #2 {
+; GENERIC-LABEL: test_set1_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_set1_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %vecinit.i = insertelement <8 x double> undef, double %d, i32 0
+ %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
+ %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
+ %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
+ %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
+ %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
+ %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
+ %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
+ ret <8 x double> %vecinit7.i
+}
+
+define <8 x i64> @test_set1_epi64(i64 %d) #2 {
+; GENERIC-LABEL: test_set1_epi64:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_set1_epi64:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0
+ %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
+ %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
+ %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
+ %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
+ %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
+ %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
+ %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
+ ret <8 x i64> %vecinit7.i
+}
+
+define <16 x float> @test_set1_ps(float %f) #2 {
+; GENERIC-LABEL: test_set1_ps:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_set1_ps:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %vecinit.i = insertelement <16 x float> undef, float %f, i32 0
+ %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
+ %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
+ %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
+ %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
+ %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
+ %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
+ %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
+ %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
+ %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
+ %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
+ %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
+ %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
+ %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
+ %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
+ %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
+ ret <16 x float> %vecinit15.i
+}
+
+define <16 x i32> @test_set1_epi32(i32 %f) #2 {
+; GENERIC-LABEL: test_set1_epi32:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vpbroadcastd %edi, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_set1_epi32:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0
+ %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
+ %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
+ %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
+ %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
+ %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
+ %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
+ %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
+ %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
+ %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
+ %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
+ %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
+ %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
+ %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
+ %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
+ %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
+ ret <16 x i32> %vecinit15.i
+}
+
+; We implement the scalar broadcast intrinsics with vector initializers.
+; Verify that the IR generated will produce the broadcast at the end.
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
+; GENERIC-LABEL: test_mm512_broadcastsd_pd:
+; GENERIC: # %bb.0: # %entry
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_mm512_broadcastsd_pd:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+entry:
+ %0 = extractelement <2 x double> %a, i32 0
+ %vecinit.i = insertelement <8 x double> undef, double %0, i32 0
+ %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
+ %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
+ %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
+ %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
+ %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
+ %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
+ %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
+ ret <8 x double> %vecinit7.i
+}
+
+define <16 x float> @suff_test1(<8 x float>%a) {
+; GENERIC-LABEL: suff_test1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: suff_test1:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float>%res
+}
+
+define <8 x double> @suff_test2(<4 x double>%a) {
+; GENERIC-LABEL: suff_test2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: suff_test2:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double>%res
+}
+
+define <64 x i8> @_invec32xi8(<32 x i8>%a) {
+; GENERIC-LABEL: _invec32xi8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastb %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _invec32xi8:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastb %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i8> %a, <32 x i8> undef, <64 x i32> zeroinitializer
+ ret <64 x i8>%res
+}
+
+define <32 x i16> @_invec16xi16(<16 x i16>%a) {
+; GENERIC-LABEL: _invec16xi16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpbroadcastw %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _invec16xi16:
+; SKX: # %bb.0:
+; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i16> %a, <16 x i16> undef, <32 x i32> zeroinitializer
+ ret <32 x i16>%res
+}
+
+define <16 x i32> @_invec8xi32(<8 x i32>%a) {
+; GENERIC-LABEL: _invec8xi32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _invec8xi32:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32>%res
+}
+
+define <8 x i64> @_invec4xi64(<4 x i64>%a) {
+; GENERIC-LABEL: _invec4xi64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: _invec4xi64:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64>%res
+}
+
+declare void @func_f32(float)
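+; The broadcast source is spilled around the call and reloaded directly by
+; vbroadcastss as a 16-byte folded reload.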
+define <16 x float> @broadcast_ss_spill(float %x) {
+; GENERIC-LABEL: broadcast_ss_spill:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: subq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 32
+; GENERIC-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00]
+; GENERIC-NEXT: callq func_f32
+; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:1.00]
+; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: broadcast_ss_spill:
+; SKX: # %bb.0:
+; SKX-NEXT: subq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 32
+; SKX-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
+; SKX-NEXT: callq func_f32
+; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
+; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fadd float %x, %x
+ call void @func_f32(float %a)
+ %b = insertelement <16 x float> undef, float %a, i32 0
+ %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %c
+}
+
+declare void @func_f64(double)
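+; Same as above, but the double is reloaded by vbroadcastsd as a 16-byte
+; folded reload.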
+define <8 x double> @broadcast_sd_spill(double %x) {
+; GENERIC-LABEL: broadcast_sd_spill:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: subq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: .cfi_def_cfa_offset 32
+; GENERIC-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00]
+; GENERIC-NEXT: callq func_f64
+; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:1.00]
+; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: broadcast_sd_spill:
+; SKX: # %bb.0:
+; SKX-NEXT: subq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: .cfi_def_cfa_offset 32
+; SKX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
+; SKX-NEXT: callq func_f64
+; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50]
+; SKX-NEXT: addq $24, %rsp # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %a = fadd double %x, %x
+ call void @func_f64(double %a)
+ %b = insertelement <8 x double> undef, double %a, i32 0
+ %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+ ret <8 x double> %c
+}
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index e81f983d9fe6..6491863d939a 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -1,17 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=X64
define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
-; CHECK-LABEL: select00:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT: cmpl $255, %edi
-; CHECK-NEXT: je LBB0_2
-; CHECK-NEXT: ## BB#1:
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
-; CHECK-NEXT: LBB0_2:
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
+; X86-LABEL: select00:
+; X86: # %bb.0:
+; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp)
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: je .LBB0_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovdqa64 %zmm0, %zmm1
+; X86-NEXT: .LBB0_2:
+; X86-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: select00:
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: cmpl $255, %edi
+; X64-NEXT: je .LBB0_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: vmovdqa64 %zmm0, %zmm1
+; X64-NEXT: .LBB0_2:
+; X64-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; X64-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
%res = xor <16 x i32> %b, %selres
@@ -19,16 +31,27 @@ define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
}
define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
-; CHECK-LABEL: select01:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT: cmpl $255, %edi
-; CHECK-NEXT: je LBB1_2
-; CHECK-NEXT: ## BB#1:
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1
-; CHECK-NEXT: LBB1_2:
-; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: retq
+; X86-LABEL: select01:
+; X86: # %bb.0:
+; X86-NEXT: cmpl $255, {{[0-9]+}}(%esp)
+; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT: je .LBB1_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: vmovdqa64 %zmm0, %zmm1
+; X86-NEXT: .LBB1_2:
+; X86-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: select01:
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: cmpl $255, %edi
+; X64-NEXT: je .LBB1_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: vmovdqa64 %zmm0, %zmm1
+; X64-NEXT: .LBB1_2:
+; X64-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; X64-NEXT: retq
%cmpres = icmp eq i32 %a, 255
%selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
%res = xor <8 x i64> %b, %selres
@@ -36,44 +59,84 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
}
define float @select02(float %a, float %b, float %c, float %eps) {
-; CHECK-LABEL: select02:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmpless %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: retq
+; X86-LABEL: select02:
+; X86: # %bb.0:
+; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: vucomiss {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmovael %eax, %ecx
+; X86-NEXT: flds (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: select02:
+; X64: # %bb.0:
+; X64-NEXT: vcmpless %xmm0, %xmm3, %k1
+; X64-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
+; X64-NEXT: vmovaps %xmm1, %xmm0
+; X64-NEXT: retq
%cmp = fcmp oge float %a, %eps
%cond = select i1 %cmp, float %c, float %b
ret float %cond
}
define double @select03(double %a, double %b, double %c, double %eps) {
-; CHECK-LABEL: select03:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vcmplesd %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovsd %xmm2, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-NEXT: retq
+; X86-LABEL: select03:
+; X86: # %bb.0:
+; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: vucomisd {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmovael %eax, %ecx
+; X86-NEXT: fldl (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: select03:
+; X64: # %bb.0:
+; X64-NEXT: vcmplesd %xmm0, %xmm3, %k1
+; X64-NEXT: vmovsd %xmm2, %xmm0, %xmm1 {%k1}
+; X64-NEXT: vmovapd %xmm1, %xmm0
+; X64-NEXT: retq
%cmp = fcmp oge double %a, %eps
%cond = select i1 %cmp, double %c, double %b
ret double %cond
}
define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
-; CHECK-LABEL: select04:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps %zmm3, %zmm1
-; CHECK-NEXT: retq
+; X86-LABEL: select04:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: .cfi_def_cfa_register %ebp
+; X86-NEXT: andl $-64, %esp
+; X86-NEXT: subl $64, %esp
+; X86-NEXT: vmovaps 8(%ebp), %zmm1
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; X64-LABEL: select04:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps %zmm3, %zmm1
+; X64-NEXT: retq
%sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
ret <16 x double> %sel
}
define i8 @select05(i8 %a.0, i8 %m) {
-; CHECK-LABEL: select05:
-; CHECK: ## BB#0:
-; CHECK-NEXT: orl %esi, %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: retq
+; X86-LABEL: select05:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: orb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: select05:
+; X64: # %bb.0:
+; X64-NEXT: orl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
@@ -82,16 +145,29 @@ define i8 @select05(i8 %a.0, i8 %m) {
}
define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
-; CHECK-LABEL: select05_mem:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl (%rsi), %eax
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: korw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; X86-LABEL: select05_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl (%ecx), %ecx
+; X86-NEXT: kmovw %ecx, %k0
+; X86-NEXT: movzbl (%eax), %eax
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: korw %k1, %k0, %k0
+; X86-NEXT: kmovw %k0, %eax
+; X86-NEXT: # kill: def %al killed %al killed %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: select05_mem:
+; X64: # %bb.0:
+; X64-NEXT: movzbl (%rsi), %eax
+; X64-NEXT: kmovw %eax, %k0
+; X64-NEXT: movzbl (%rdi), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: korw %k1, %k0, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: # kill: def %al killed %al killed %eax
+; X64-NEXT: retq
%mask = load <8 x i1> , <8 x i1>* %m
%a = load <8 x i1> , <8 x i1>* %a.0
%r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
@@ -100,11 +176,17 @@ define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
}
define i8 @select06(i8 %a.0, i8 %m) {
-; CHECK-LABEL: select06:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andl %esi, %edi
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: retq
+; X86-LABEL: select06:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: andb {{[0-9]+}}(%esp), %al
+; X86-NEXT: retl
+;
+; X64-LABEL: select06:
+; X64: # %bb.0:
+; X64-NEXT: andl %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
@@ -113,16 +195,29 @@ define i8 @select06(i8 %a.0, i8 %m) {
}
define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
-; CHECK-LABEL: select06_mem:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movzbl (%rsi), %eax
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: movzbl (%rdi), %eax
-; CHECK-NEXT: kmovw %eax, %k1
-; CHECK-NEXT: kandw %k1, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; X86-LABEL: select06_mem:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl (%ecx), %ecx
+; X86-NEXT: kmovw %ecx, %k0
+; X86-NEXT: movzbl (%eax), %eax
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: kandw %k1, %k0, %k0
+; X86-NEXT: kmovw %k0, %eax
+; X86-NEXT: # kill: def %al killed %al killed %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: select06_mem:
+; X64: # %bb.0:
+; X64-NEXT: movzbl (%rsi), %eax
+; X64-NEXT: kmovw %eax, %k0
+; X64-NEXT: movzbl (%rdi), %eax
+; X64-NEXT: kmovw %eax, %k1
+; X64-NEXT: kandw %k1, %k0, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: # kill: def %al killed %al killed %eax
+; X64-NEXT: retq
%mask = load <8 x i1> , <8 x i1>* %m
%a = load <8 x i1> , <8 x i1>* %a.0
%r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
@@ -130,17 +225,32 @@ define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
ret i8 %res;
}
define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
-; CHECK-LABEL: select07:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edx, %k0
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: kmovw %esi, %k2
-; CHECK-NEXT: kandnw %k2, %k0, %k2
-; CHECK-NEXT: kandw %k0, %k1, %k0
-; CHECK-NEXT: korw %k2, %k0, %k0
-; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; X86-LABEL: select07:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k0
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw %eax, %k2
+; X86-NEXT: kandnw %k2, %k0, %k2
+; X86-NEXT: kandw %k0, %k1, %k0
+; X86-NEXT: korw %k2, %k0, %k0
+; X86-NEXT: kmovw %k0, %eax
+; X86-NEXT: # kill: def %al killed %al killed %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: select07:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edx, %k0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: kmovw %esi, %k2
+; X64-NEXT: kandnw %k2, %k0, %k2
+; X64-NEXT: kandw %k0, %k1, %k0
+; X64-NEXT: korw %k2, %k0, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: # kill: def %al killed %al killed %eax
+; X64-NEXT: retq
%mask = bitcast i8 %m to <8 x i1>
%a = bitcast i8 %a.0 to <8 x i1>
%b = bitcast i8 %b.0 to <8 x i1>
@@ -150,49 +260,84 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
}
define i64 @pr30249() {
-; CHECK-LABEL: pr30249:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl $2, %eax
-; CHECK-NEXT: retq
+; X86-LABEL: pr30249:
+; X86: # %bb.0:
+; X86-NEXT: movl $2, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: pr30249:
+; X64: # %bb.0:
+; X64-NEXT: movl $2, %eax
+; X64-NEXT: retq
%v = select i1 undef , i64 1, i64 2
ret i64 %v
}
define double @pr30561_f64(double %b, double %a, i1 %c) {
-; CHECK-LABEL: pr30561_f64:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andb $1, %dil
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; X86-LABEL: pr30561_f64:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmovnel %eax, %ecx
+; X86-NEXT: fldl (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: pr30561_f64:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: retq
%cond = select i1 %c, double %a, double %b
ret double %cond
}
define float @pr30561_f32(float %b, float %a, i1 %c) {
-; CHECK-LABEL: pr30561_f32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: andb $1, %dil
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
-; CHECK-NEXT: retq
+; X86-LABEL: pr30561_f32:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmovnel %eax, %ecx
+; X86-NEXT: flds (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: pr30561_f32:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: retq
%cond = select i1 %c, float %a, float %b
ret float %cond
}
define <16 x i16> @pr31515(<16 x i1> %a, <16 x i1> %b, <16 x i16> %c) nounwind {
-; CHECK-LABEL: pr31515:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
-; CHECK-NEXT: vpslld $31, %zmm1, %zmm1
-; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0
-; CHECK-NEXT: vpslld $31, %zmm0, %zmm0
-; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k1
-; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
-; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmovdw %zmm0, %ymm0
-; CHECK-NEXT: vpandn %ymm2, %ymm0, %ymm0
-; CHECK-NEXT: retq
+; X86-LABEL: pr31515:
+; X86: # %bb.0:
+; X86-NEXT: vpmovsxbd %xmm1, %zmm1
+; X86-NEXT: vpslld $31, %zmm1, %zmm1
+; X86-NEXT: vpmovsxbd %xmm0, %zmm0
+; X86-NEXT: vpslld $31, %zmm0, %zmm0
+; X86-NEXT: vptestmd %zmm0, %zmm0, %k1
+; X86-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; X86-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT: vpmovdw %zmm0, %ymm0
+; X86-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: pr31515:
+; X64: # %bb.0:
+; X64-NEXT: vpmovsxbd %xmm1, %zmm1
+; X64-NEXT: vpslld $31, %zmm1, %zmm1
+; X64-NEXT: vpmovsxbd %xmm0, %zmm0
+; X64-NEXT: vpslld $31, %zmm0, %zmm0
+; X64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; X64-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1}
+; X64-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; X64-NEXT: vpmovdw %zmm0, %ymm0
+; X64-NEXT: vpandn %ymm2, %ymm0, %ymm0
+; X64-NEXT: retq
%mask = and <16 x i1> %a, %b
%res = select <16 x i1> %mask, <16 x i16> zeroinitializer, <16 x i16> %c
ret <16 x i16> %res
diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll
index ce2b010ec0f2..eb424a8d935a 100644
--- a/test/CodeGen/X86/avx512-shift.ll
+++ b/test/CodeGen/X86/avx512-shift.ll
@@ -4,7 +4,7 @@
define <16 x i32> @shift_16_i32(<16 x i32> %a) {
; CHECK-LABEL: shift_16_i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $1, %zmm0, %zmm0
; CHECK-NEXT: vpslld $12, %zmm0, %zmm0
; CHECK-NEXT: vpsrad $12, %zmm0, %zmm0
@@ -17,7 +17,7 @@ define <16 x i32> @shift_16_i32(<16 x i32> %a) {
define <8 x i64> @shift_8_i64(<8 x i64> %a) {
; CHECK-LABEL: shift_8_i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $1, %zmm0, %zmm0
; CHECK-NEXT: vpsllq $12, %zmm0, %zmm0
; CHECK-NEXT: vpsraq $12, %zmm0, %zmm0
@@ -30,15 +30,15 @@ define <8 x i64> @shift_8_i64(<8 x i64> %a) {
define <4 x i64> @shift_4_i64(<4 x i64> %a) {
; KNL-LABEL: shift_4_i64:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpsrlq $1, %ymm0, %ymm0
; KNL-NEXT: vpsllq $12, %ymm0, %ymm0
; KNL-NEXT: vpsraq $12, %zmm0, %zmm0
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shift_4_i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsrlq $1, %ymm0, %ymm0
; SKX-NEXT: vpsllq $12, %ymm0, %ymm0
; SKX-NEXT: vpsraq $12, %ymm0, %ymm0
@@ -51,7 +51,7 @@ define <4 x i64> @shift_4_i64(<4 x i64> %a) {
define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
; CHECK-LABEL: variable_shl4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%k = shl <8 x i64> %x, %y
@@ -60,7 +60,7 @@ define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: variable_shl5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%k = shl <16 x i32> %x, %y
@@ -69,7 +69,7 @@ define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: variable_srl0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%k = lshr <16 x i32> %x, %y
@@ -78,7 +78,7 @@ define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
; CHECK-LABEL: variable_srl2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%k = lshr <8 x i64> %x, %y
@@ -87,7 +87,7 @@ define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: variable_sra1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%k = ashr <16 x i32> %x, %y
@@ -96,7 +96,7 @@ define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
; CHECK-LABEL: variable_sra2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%k = ashr <8 x i64> %x, %y
@@ -105,15 +105,15 @@ define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
define <4 x i64> @variable_sra3(<4 x i64> %x, <4 x i64> %y) {
; KNL-LABEL: variable_sra3:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: variable_sra3:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsravq %ymm1, %ymm0, %ymm0
; SKX-NEXT: retq
%k = ashr <4 x i64> %x, %y
@@ -122,16 +122,16 @@ define <4 x i64> @variable_sra3(<4 x i64> %x, <4 x i64> %y) {
define <8 x i16> @variable_sra4(<8 x i16> %x, <8 x i16> %y) {
; KNL-LABEL: variable_sra4:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; KNL-NEXT: vpmovsxwd %xmm0, %ymm0
; KNL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: variable_sra4:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsravw %xmm1, %xmm0, %xmm0
; SKX-NEXT: retq
%k = ashr <8 x i16> %x, %y
@@ -140,7 +140,7 @@ define <8 x i16> @variable_sra4(<8 x i16> %x, <8 x i16> %y) {
define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
; CHECK-LABEL: variable_sra01_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsravd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%y1 = load <16 x i32>, <16 x i32>* %y
@@ -150,7 +150,7 @@ define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
; CHECK-LABEL: variable_shl1_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllvd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%y1 = load <16 x i32>, <16 x i32>* %y
@@ -160,7 +160,7 @@ define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
; CHECK-LABEL: variable_srl0_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvd (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%y1 = load <16 x i32>, <16 x i32>* %y
@@ -170,7 +170,7 @@ define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) {
; CHECK-LABEL: variable_srl3_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlvq (%rdi), %zmm0, %zmm0
; CHECK-NEXT: retq
%y1 = load <8 x i64>, <8 x i64>* %y
diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll
new file mode 100755
index 000000000000..618909c151fa
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll
@@ -0,0 +1,17005 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
+
+; This test is a collection of AVX-512 shuffle instructions used to check their scheduling.
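+; The "sched: [N:M]" annotations emitted by -print-schedule give the modeled
+; latency (N) and reciprocal throughput (M) of each instruction for the
+; selected CPU model.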
+
+define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
+; GENERIC-LABEL: test_16xi16_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; SKX-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
+; GENERIC-LABEL: test_16xi16_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; SKX-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
+; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; SKX-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
+; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; SKX-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
+; GENERIC-LABEL: test_32xi16_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
+; GENERIC-LABEL: test_32xi16_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
+; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
+; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; SKX-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
+; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
+; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; SKX-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
+; GENERIC-LABEL: test_8xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
+; GENERIC-LABEL: test_8xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
+; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
+; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
+; GENERIC-LABEL: test_16xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
+; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
+; GENERIC-LABEL: test_4xi64_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
+; GENERIC-LABEL: test_4xi64_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
+; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
+; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
+; GENERIC-LABEL: test_8xi64_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
+; GENERIC-LABEL: test_8xi64_perm_imm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_imm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
+; GENERIC-LABEL: test_8xi64_perm_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
+; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
+; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
+; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
+; GENERIC-LABEL: test_8xfloat_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
+; GENERIC-LABEL: test_8xfloat_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
+; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
+; GENERIC-LABEL: test_16xfloat_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
+; GENERIC-LABEL: test_16xfloat_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
+; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
+; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
+; GENERIC-LABEL: test_4xdouble_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
+; GENERIC-LABEL: test_4xdouble_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
+; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
+; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
+; GENERIC-LABEL: test_8xdouble_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
+; GENERIC-LABEL: test_8xdouble_perm_imm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_imm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
+; GENERIC-LABEL: test_8xdouble_perm_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
+; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
+; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
+; GENERIC-LABEL: test_16xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
+; GENERIC-LABEL: test_16xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
+; GENERIC-LABEL: test_16xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
+; GENERIC-LABEL: test_16xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
+; GENERIC-LABEL: test_32xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
+; GENERIC-LABEL: test_32xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
+; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
+; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
+; GENERIC-LABEL: test_64xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
+; GENERIC-LABEL: test_64xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
+; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
+; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
+; GENERIC-LABEL: test_8xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
+; GENERIC-LABEL: test_8xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
+; GENERIC-LABEL: test_8xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
+; GENERIC-LABEL: test_8xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
+; GENERIC-LABEL: test_8xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
+; GENERIC-LABEL: test_8xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
+; GENERIC-LABEL: test_16xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
+; GENERIC-LABEL: test_16xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
+; GENERIC-LABEL: test_16xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
+; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
+; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
+; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
+; GENERIC-LABEL: test_32xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
+; GENERIC-LABEL: test_32xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
+; GENERIC-LABEL: test_32xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
+; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
+; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
+; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
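+; Shuffles of <4 x i32> vectors within a single 128-bit register: the unmasked
+; form lowers to vpermilps, while the merge-masked ({%k1}) and zero-masked
+; ({%k1} {z}) forms use vpshufd with a mask register built by comparing %mask
+; against zero.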
+define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
+; GENERIC-LABEL: test_4xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
+; GENERIC-LABEL: test_4xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
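+; Memory-operand variants: the <4 x i32> source is loaded from %vp and the load
+; is folded into the shuffle (mem[...] operand) in both masked and unmasked forms.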
+define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
+; GENERIC-LABEL: test_4xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
+; GENERIC-LABEL: test_4xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
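+; The same permutation pattern repeated for <8 x i32> vectors in 256-bit (ymm)
+; registers.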
+define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) {
+; GENERIC-LABEL: test2_8xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) {
+; GENERIC-LABEL: test2_8xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
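+; <8 x i32> memory-operand variants with the load folded into the shuffle.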
+define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
+; GENERIC-LABEL: test2_8xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
+; GENERIC-LABEL: test2_8xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
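+; <16 x i32> (zmm) variants; the masked forms select between the shuffle result
+; and either %vec2 (merge-masking) or zero (zero-masking).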
+define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
+; GENERIC-LABEL: test2_16xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
+; GENERIC-LABEL: test2_16xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
+; GENERIC-LABEL: test2_16xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; GENERIC-LABEL: test2_16xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; GENERIC-LABEL: test2_8xfloat_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; GENERIC-LABEL: test2_8xfloat_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; GENERIC-LABEL: test_16xfloat_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; GENERIC-LABEL: test_4xdouble_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; GENERIC-LABEL: test_4xdouble_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; GENERIC-LABEL: test_8xdouble_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; GENERIC-LABEL: test_8xdouble_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
+; GENERIC-LABEL: test_8xi32_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
+; GENERIC-LABEL: test_8xi32_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
+; GENERIC-LABEL: test_16xi32_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
+; GENERIC-LABEL: test_16xi32_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
+; GENERIC-LABEL: test_4xi64_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
+; GENERIC-LABEL: test_4xi64_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
+; GENERIC-LABEL: test_8xi64_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
+; GENERIC-LABEL: test_8xi64_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
+; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; GENERIC-LABEL: test_4xfloat_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; GENERIC-LABEL: test_4xfloat_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; GENERIC-LABEL: test_8xfloat_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; GENERIC-LABEL: test_8xfloat_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; GENERIC-LABEL: test_16xfloat_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; GENERIC-LABEL: test_16xfloat_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; GENERIC-LABEL: test_2xdouble_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; GENERIC-LABEL: test_2xdouble_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; GENERIC-LABEL: test_4xdouble_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; GENERIC-LABEL: test_4xdouble_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; GENERIC-LABEL: test_8xdouble_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; GENERIC-LABEL: test_8xdouble_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; GENERIC-LABEL: test_4xfloat_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; GENERIC-LABEL: test_4xfloat_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; GENERIC-LABEL: test_8xfloat_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; GENERIC-LABEL: test_8xfloat_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; GENERIC-LABEL: test_16xfloat_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; GENERIC-LABEL: test_16xfloat_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; GENERIC-LABEL: test_2xdouble_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; GENERIC-LABEL: test_2xdouble_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
+; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; GENERIC-LABEL: test_4xdouble_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; GENERIC-LABEL: test_4xdouble_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
+; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; GENERIC-LABEL: test_8xdouble_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; GENERIC-LABEL: test_8xdouble_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: vmovapd %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
+; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll b/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll
new file mode 100644
index 000000000000..1d477940c6e7
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll
@@ -0,0 +1,1238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
+define <4 x double> @test_double_to_4(double %s) {
+; CHECK-LABEL: test_double_to_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mask0(double %s, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mask0(double %s, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mask1(double %s, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mask1(double %s, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mask2(double %s, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mask2(double %s, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mask3(double %s, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mask3(double %s, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <8 x double> @test_double_to_8(double %s) {
+; CHECK-LABEL: test_double_to_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mask0(double %s, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mask0(double %s, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mask1(double %s, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mask1(double %s, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mask2(double %s, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mask2(double %s, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mask3(double %s, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mask3(double %s, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <4 x float> @test_float_to_4(float %s) {
+; CHECK-LABEL: test_float_to_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mask0(float %s, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mask0(float %s, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mask1(float %s, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mask1(float %s, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mask2(float %s, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mask2(float %s, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mask3(float %s, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mask3(float %s, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <8 x float> @test_float_to_8(float %s) {
+; CHECK-LABEL: test_float_to_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mask0(float %s, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mask0(float %s, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mask1(float %s, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mask1(float %s, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mask2(float %s, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mask2(float %s, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mask3(float %s, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mask3(float %s, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_float_to_16(float %s) {
+; CHECK-LABEL: test_float_to_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mask0(float %s, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mask0(float %s, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mask1(float %s, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mask1(float %s, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mask2(float %s, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mask2(float %s, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mask3(float %s, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mask3(float %s, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <4 x double> @test_double_to_4_mem(double* %p) {
+; CHECK-LABEL: test_double_to_4_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mem_mask0(double* %p, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mem_mask0(double* %p, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mem_mask1(double* %p, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mem_mask1(double* %p, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mem_mask2(double* %p, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mem_mask2(double* %p, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_double_to_4_mem_mask3(double* %p, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_double_to_4_mem_mask3(double* %p, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <8 x double> @test_double_to_8_mem(double* %p) {
+; CHECK-LABEL: test_double_to_8_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mem_mask0(double* %p, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mem_mask0(double* %p, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mem_mask1(double* %p, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mem_mask1(double* %p, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mem_mask2(double* %p, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mem_mask2(double* %p, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_double_to_8_mem_mask3(double* %p, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_double_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_double_to_8_mem_mask3(double* %p, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_double_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load double, double* %p
+ %vec = insertelement <2 x double> undef, double %s, i32 0
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <4 x float> @test_float_to_4_mem(float* %p) {
+; CHECK-LABEL: test_float_to_4_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mem_mask0(float* %p, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mem_mask0(float* %p, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mem_mask1(float* %p, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mem_mask1(float* %p, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mem_mask2(float* %p, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mem_mask2(float* %p, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_float_to_4_mem_mask3(float* %p, <4 x float> %default, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %default
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_float_to_4_mem_mask3(float* %p, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <8 x float> @test_float_to_8_mem(float* %p) {
+; CHECK-LABEL: test_float_to_8_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mem_mask0(float* %p, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mem_mask0(float* %p, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mem_mask1(float* %p, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mem_mask1(float* %p, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mem_mask2(float* %p, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mem_mask2(float* %p, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_float_to_8_mem_mask3(float* %p, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_float_to_8_mem_mask3(float* %p, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_float_to_16_mem(float* %p) {
+; CHECK-LABEL: test_float_to_16_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mem_mask0(float* %p, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mem_mask0(float* %p, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mem_mask1(float* %p, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mem_mask1(float* %p, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mem_mask2(float* %p, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mem_mask2(float* %p, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_float_to_16_mem_mask3(float* %p, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_float_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_float_to_16_mem_mask3(float* %p, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_float_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load float, float* %p
+ %vec = insertelement <2 x float> undef, float %s, i32 0
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll b/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
new file mode 100644
index 000000000000..b31302d51ffe
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
@@ -0,0 +1,2807 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
+
+define <16 x i8> @test_i8_to_16(i8 %s) {
+; CHECK-LABEL: test_i8_to_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %res = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mask0(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mask0(i8 %s, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mask1(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mask1(i8 %s, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mask2(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mask2(i8 %s, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mask3(i8 %s, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mask3(i8 %s, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <32 x i8> @test_i8_to_32(i8 %s) {
+; CHECK-LABEL: test_i8_to_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %res = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mask0(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mask0(i8 %s, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mask1(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mask1(i8 %s, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mask2(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mask2(i8 %s, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mask3(i8 %s, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mask3(i8 %s, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <64 x i8> @test_i8_to_64(i8 %s) {
+; CHECK-LABEL: test_i8_to_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %res = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mask0(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mask0(i8 %s, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mask1(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mask1(i8 %s, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mask2(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mask2(i8 %s, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mask3(i8 %s, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mask3(i8 %s, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <8 x i16> @test_i16_to_8(i16 %s) {
+; CHECK-LABEL: test_i16_to_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %res = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mask0(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mask0(i16 %s, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mask1(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mask1(i16 %s, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mask2(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mask2(i16 %s, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mask3(i16 %s, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mask3(i16 %s, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <16 x i16> @test_i16_to_16(i16 %s) {
+; CHECK-LABEL: test_i16_to_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %res = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mask0(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mask0(i16 %s, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mask1(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mask1(i16 %s, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mask2(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mask2(i16 %s, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mask3(i16 %s, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mask3(i16 %s, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
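+; Broadcast an i16 from a GPR to <32 x i16>, with merge-masked and zero-masked variants.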
+define <32 x i16> @test_i16_to_32(i16 %s) {
+; CHECK-LABEL: test_i16_to_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %res = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mask0(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mask0(i16 %s, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mask1(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mask1(i16 %s, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mask2(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mask2(i16 %s, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mask3(i16 %s, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mask3(i16 %s, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
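+; Broadcast an i32 from a GPR (vpbroadcastd %edi) to <4 x i32>, with merge-masked and zero-masked variants.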
+define <4 x i32> @test_i32_to_4(i32 %s) {
+; CHECK-LABEL: test_i32_to_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mask0(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mask0(i32 %s, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mask1(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mask1(i32 %s, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mask2(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mask2(i32 %s, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mask3(i32 %s, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mask3(i32 %s, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
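+; Broadcast an i32 from a GPR to <8 x i32>, with merge-masked and zero-masked variants.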
+define <8 x i32> @test_i32_to_8(i32 %s) {
+; CHECK-LABEL: test_i32_to_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mask0(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mask0(i32 %s, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mask1(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mask1(i32 %s, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mask2(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mask2(i32 %s, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mask3(i32 %s, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mask3(i32 %s, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
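+; Broadcast an i32 from a GPR to <16 x i32>, with merge-masked and zero-masked variants.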
+define <16 x i32> @test_i32_to_16(i32 %s) {
+; CHECK-LABEL: test_i32_to_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mask0(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mask0(i32 %s, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mask1(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mask1(i32 %s, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mask2(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mask2(i32 %s, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mask3(i32 %s, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mask3(i32 %s, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
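+; Broadcast an i64 from a GPR (vpbroadcastq %rdi) to <2 x i64>, with merge-masked and zero-masked variants.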
+define <2 x i64> @test_i64_to_2(i64 %s) {
+; CHECK-LABEL: test_i64_to_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_i64_to_2_mask0(i64 %s, <2 x i64> %default, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_2_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %default
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_i64_to_2_mask0(i64 %s, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_2_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_i64_to_2_mask1(i64 %s, <2 x i64> %default, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_2_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %default
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_i64_to_2_mask1(i64 %s, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_2_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
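+; Broadcast an i64 from a GPR to <4 x i64>, with merge-masked and zero-masked variants.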
+define <4 x i64> @test_i64_to_4(i64 %s) {
+; CHECK-LABEL: test_i64_to_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mask0(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mask0(i64 %s, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mask1(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mask1(i64 %s, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mask2(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mask2(i64 %s, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mask3(i64 %s, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mask3(i64 %s, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
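+; Broadcast an i64 from a GPR to <8 x i64>, with merge-masked and zero-masked variants.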
+define <8 x i64> @test_i64_to_8(i64 %s) {
+; CHECK-LABEL: test_i64_to_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mask0(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mask0(i64 %s, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mask1(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mask1(i64 %s, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mask2(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mask2(i64 %s, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mask3(i64 %s, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mask3(i64 %s, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
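+; Broadcast an i8 loaded from memory (vpbroadcastb (%rdi)) to <16 x i8>, with merge-masked and zero-masked variants.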
+define <16 x i8> @test_i8_to_16_mem(i8* %p) {
+; CHECK-LABEL: test_i8_to_16_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %res = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mem_mask0(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mem_mask0(i8* %p, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mem_mask1(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mem_mask1(i8* %p, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mem_mask2(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mem_mask2(i8* %p, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_i8_to_16_mem_mask3(i8* %p, <16 x i8> %default, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %default
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_i8_to_16_mem_mask3(i8* %p, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
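+; Broadcast an i8 loaded from memory to <32 x i8>, with merge-masked and zero-masked variants.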
+define <32 x i8> @test_i8_to_32_mem(i8* %p) {
+; CHECK-LABEL: test_i8_to_32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %res = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mem_mask0(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mem_mask0(i8* %p, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mem_mask1(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mem_mask1(i8* %p, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mem_mask2(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mem_mask2(i8* %p, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_i8_to_32_mem_mask3(i8* %p, <32 x i8> %default, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %default
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_i8_to_32_mem_mask3(i8* %p, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
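+; Broadcast an i8 loaded from memory to <64 x i8>, with merge-masked and zero-masked variants.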
+define <64 x i8> @test_i8_to_64_mem(i8* %p) {
+; CHECK-LABEL: test_i8_to_64_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %res = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mem_mask0(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mem_mask0(i8* %p, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mem_mask1(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mem_mask1(i8* %p, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mem_mask2(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mem_mask2(i8* %p, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_i8_to_64_mem_mask3(i8* %p, <64 x i8> %default, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_i8_to_64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %default
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_i8_to_64_mem_mask3(i8* %p, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_i8_to_64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastb (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i8, i8* %p
+ %vec = insertelement <2 x i8> undef, i8 %s, i32 0
+ %shuf = shufflevector <2 x i8> %vec, <2 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <8 x i16> @test_i16_to_8_mem(i16* %p) {
+; CHECK-LABEL: test_i16_to_8_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %res = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mem_mask0(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mem_mask0(i16* %p, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mem_mask1(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mem_mask1(i16* %p, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mem_mask2(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mem_mask2(i16* %p, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_i16_to_8_mem_mask3(i16* %p, <8 x i16> %default, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %default
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_i16_to_8_mem_mask3(i16* %p, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <16 x i16> @test_i16_to_16_mem(i16* %p) {
+; CHECK-LABEL: test_i16_to_16_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %res = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mem_mask0(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mem_mask0(i16* %p, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mem_mask1(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mem_mask1(i16* %p, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mem_mask2(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mem_mask2(i16* %p, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_i16_to_16_mem_mask3(i16* %p, <16 x i16> %default, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %default
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_i16_to_16_mem_mask3(i16* %p, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <32 x i16> @test_i16_to_32_mem(i16* %p) {
+; CHECK-LABEL: test_i16_to_32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %res = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mem_mask0(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mem_mask0(i16* %p, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mem_mask1(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mem_mask1(i16* %p, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mem_mask2(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mem_mask2(i16* %p, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_i16_to_32_mem_mask3(i16* %p, <32 x i16> %default, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_i16_to_32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %default
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_i16_to_32_mem_mask3(i16* %p, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_i16_to_32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastw (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i16, i16* %p
+ %vec = insertelement <2 x i16> undef, i16 %s, i32 0
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <4 x i32> @test_i32_to_4_mem(i32* %p) {
+; CHECK-LABEL: test_i32_to_4_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mem_mask0(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mem_mask0(i32* %p, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mem_mask1(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mem_mask1(i32* %p, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mem_mask2(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mem_mask2(i32* %p, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_i32_to_4_mem_mask3(i32* %p, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_i32_to_4_mem_mask3(i32* %p, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <8 x i32> @test_i32_to_8_mem(i32* %p) {
+; CHECK-LABEL: test_i32_to_8_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mem_mask0(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mem_mask0(i32* %p, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mem_mask1(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mem_mask1(i32* %p, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mem_mask2(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mem_mask2(i32* %p, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_i32_to_8_mem_mask3(i32* %p, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_i32_to_8_mem_mask3(i32* %p, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <16 x i32> @test_i32_to_16_mem(i32* %p) {
+; CHECK-LABEL: test_i32_to_16_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mem_mask0(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mem_mask0(i32* %p, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mem_mask1(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mem_mask1(i32* %p, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mem_mask2(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mem_mask2(i32* %p, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_i32_to_16_mem_mask3(i32* %p, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_i32_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_i32_to_16_mem_mask3(i32* %p, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_i32_to_16_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i32, i32* %p
+ %vec = insertelement <2 x i32> undef, i32 %s, i32 0
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <2 x i64> @test_i64_to_2_mem(i64* %p) {
+; CHECK-LABEL: test_i64_to_2_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_i64_to_2_mem_mask0(i64* %p, <2 x i64> %default, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_2_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %default
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_i64_to_2_mem_mask0(i64* %p, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_i64_to_2_mem_mask1(i64* %p, <2 x i64> %default, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_2_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %default
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_i64_to_2_mem_mask1(i64* %p, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_2_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <4 x i64> @test_i64_to_4_mem(i64* %p) {
+; CHECK-LABEL: test_i64_to_4_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mem_mask0(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mem_mask0(i64* %p, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mem_mask1(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mem_mask1(i64* %p, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mem_mask2(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mem_mask2(i64* %p, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_i64_to_4_mem_mask3(i64* %p, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_i64_to_4_mem_mask3(i64* %p, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_4_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <8 x i64> @test_i64_to_8_mem(i64* %p) {
+; CHECK-LABEL: test_i64_to_8_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mem_mask0(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mem_mask0(i64* %p, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mem_mask1(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mem_mask1(i64* %p, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mem_mask2(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mem_mask2(i64* %p, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_i64_to_8_mem_mask3(i64* %p, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_i64_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_i64_to_8_mem_mask3(i64* %p, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_i64_to_8_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpbroadcastq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %s = load i64, i64* %p
+ %vec = insertelement <2 x i64> undef, i64 %s, i32 0
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll b/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll
new file mode 100644
index 000000000000..c7291b02ae07
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll
@@ -0,0 +1,1163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
+
+define <8 x float> @test_2xfloat_to_8xfloat(<8 x float> %vec) {
+; CHECK-LABEL: test_2xfloat_to_8xfloat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mask0(<8 x float> %vec, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mask0(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mask1(<8 x float> %vec, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mask1(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mask2(<8 x float> %vec, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mask2(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mask3(<8 x float> %vec, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mask3(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_2xfloat_to_16xfloat(<16 x float> %vec) {
+; CHECK-LABEL: test_2xfloat_to_16xfloat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mask0(<16 x float> %vec, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mask0(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mask1(<16 x float> %vec, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mask1(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mask2(<16 x float> %vec, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mask2(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mask3(<16 x float> %vec, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mask3(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <4 x double> @test_2xdouble_to_4xdouble_mem(<2 x double>* %vp) {
+; CHECK-LABEL: test_2xdouble_to_4xdouble_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_2xdouble_to_4xdouble_mem_mask0(<2 x double>* %vp, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_2xdouble_to_4xdouble_mem_mask0(<2 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_2xdouble_to_4xdouble_mem_mask1(<2 x double>* %vp, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_2xdouble_to_4xdouble_mem_mask1(<2 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_2xdouble_to_4xdouble_mem_mask2(<2 x double>* %vp, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_2xdouble_to_4xdouble_mem_mask2(<2 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_2xdouble_to_4xdouble_mem_mask3(<2 x double>* %vp, <4 x double> %default, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %default
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_2xdouble_to_4xdouble_mem_mask3(<2 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <8 x double> @test_2xdouble_to_8xdouble_mem(<2 x double>* %vp) {
+; CHECK-LABEL: test_2xdouble_to_8xdouble_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_2xdouble_to_8xdouble_mem_mask0(<2 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_2xdouble_to_8xdouble_mem_mask0(<2 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_2xdouble_to_8xdouble_mem_mask1(<2 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_2xdouble_to_8xdouble_mem_mask1(<2 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_2xdouble_to_8xdouble_mem_mask2(<2 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_2xdouble_to_8xdouble_mem_mask2(<2 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_2xdouble_to_8xdouble_mem_mask3(<2 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_2xdouble_to_8xdouble_mem_mask3(<2 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_4xdouble_to_8xdouble_mem(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_to_8xdouble_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_4xdouble_to_8xdouble_mem_mask0(<4 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_4xdouble_to_8xdouble_mem_mask0(<4 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_4xdouble_to_8xdouble_mem_mask1(<4 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_4xdouble_to_8xdouble_mem_mask1(<4 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_4xdouble_to_8xdouble_mem_mask2(<4 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_4xdouble_to_8xdouble_mem_mask2(<4 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_4xdouble_to_8xdouble_mem_mask3(<4 x double>* %vp, <8 x double> %default, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %default
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_4xdouble_to_8xdouble_mem_mask3(<4 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x float> @test_2xfloat_to_8xfloat_mem(<2 x float>* %vp) {
+; CHECK-LABEL: test_2xfloat_to_8xfloat_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mem_mask0(<2 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mem_mask0(<2 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mem_mask1(<2 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mem_mask1(<2 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mem_mask2(<2 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mem_mask2(<2 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_2xfloat_to_8xfloat_mem_mask3(<2 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_2xfloat_to_8xfloat_mem_mask3(<2 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_2xfloat_to_16xfloat_mem(<2 x float>* %vp) {
+; CHECK-LABEL: test_2xfloat_to_16xfloat_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %res = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mem_mask0(<2 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mem_mask0(<2 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mem_mask1(<2 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mem_mask1(<2 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mem_mask2(<2 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mem_mask2(<2 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_2xfloat_to_16xfloat_mem_mask3(<2 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_2xfloat_to_16xfloat_mem_mask3(<2 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x float>, <2 x float>* %vp
+ %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <8 x float> @test_4xfloat_to_8xfloat_mem(<4 x float>* %vp) {
+; CHECK-LABEL: test_4xfloat_to_8xfloat_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_4xfloat_to_8xfloat_mem_mask0(<4 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_4xfloat_to_8xfloat_mem_mask0(<4 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_4xfloat_to_8xfloat_mem_mask1(<4 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_4xfloat_to_8xfloat_mem_mask1(<4 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_4xfloat_to_8xfloat_mem_mask2(<4 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_4xfloat_to_8xfloat_mem_mask2(<4 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_4xfloat_to_8xfloat_mem_mask3(<4 x float>* %vp, <8 x float> %default, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %default
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_4xfloat_to_8xfloat_mem_mask3(<4 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_4xfloat_to_16xfloat_mem(<4 x float>* %vp) {
+; CHECK-LABEL: test_4xfloat_to_16xfloat_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_4xfloat_to_16xfloat_mem_mask0(<4 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_4xfloat_to_16xfloat_mem_mask0(<4 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_4xfloat_to_16xfloat_mem_mask1(<4 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_4xfloat_to_16xfloat_mem_mask1(<4 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_4xfloat_to_16xfloat_mem_mask2(<4 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_4xfloat_to_16xfloat_mem_mask2(<4 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_4xfloat_to_16xfloat_mem_mask3(<4 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_4xfloat_to_16xfloat_mem_mask3(<4 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_8xfloat_to_16xfloat_mem(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_to_16xfloat_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_8xfloat_to_16xfloat_mem_mask0(<8 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_8xfloat_to_16xfloat_mem_mask0(<8 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_8xfloat_to_16xfloat_mem_mask1(<8 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_8xfloat_to_16xfloat_mem_mask1(<8 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_8xfloat_to_16xfloat_mem_mask2(<8 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_8xfloat_to_16xfloat_mem_mask2(<8 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_8xfloat_to_16xfloat_mem_mask3(<8 x float>* %vp, <16 x float> %default, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %default
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_8xfloat_to_16xfloat_mem_mask3(<8 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll b/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
new file mode 100644
index 000000000000..a6abe24d2537
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
@@ -0,0 +1,1430 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
+
+; FIXME: Fixing PR34394 should fix the i32x2 memory cases so that they lower to a single vbroadcasti32x2 instruction.
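+; For example, test_2xi32_to_8xi32_mem below currently goes through a
+; vpmovzxdq/vpshufd/vpbroadcastq sequence; once PR34394 is resolved that
+; sequence is expected to collapse to one vbroadcasti32x2 broadcast from memory.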
+
+define <4 x i32> @test_2xi32_to_4xi32(<4 x i32> %vec) {
+; CHECK-LABEL: test_2xi32_to_4xi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask0(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask1(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask2(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mask3(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <8 x i32> @test_2xi32_to_8xi32(<8 x i32> %vec) {
+; CHECK-LABEL: test_2xi32_to_8xi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask0(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask1(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask2(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mask3(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <16 x i32> @test_2xi32_to_16xi32(<16 x i32> %vec) {
+; CHECK-LABEL: test_2xi32_to_16xi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask0(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask1(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask2(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mask3(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
+; CHECK-LABEL: test_2xi32_to_4xi32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq (%rdi), %xmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %default, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %default
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
+; CHECK-LABEL: test_2xi32_to_8xi32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
+; CHECK-LABEL: test_2xi32_to_16xi32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask0(<2 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask1(<2 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask2(<2 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_2xi32_to_16xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm2, %zmm3, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_2xi32_to_16xi32_mem_mask3(<2 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm0, %k1
+; CHECK-NEXT: vpermd %zmm1, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <2 x i32>, <2 x i32>* %vp
+ %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <8 x i32> @test_4xi32_to_8xi32_mem(<4 x i32>* %vp) {
+; CHECK-LABEL: test_4xi32_to_8xi32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask0(<4 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask1(<4 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask2(<4 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %default
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_4xi32_to_8xi32_mem_mask3(<4 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <16 x i32> @test_4xi32_to_16xi32_mem(<4 x i32>* %vp) {
+; CHECK-LABEL: test_4xi32_to_16xi32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask0(<4 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask1(<4 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask2(<4 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_4xi32_to_16xi32_mem_mask3(<4 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <4 x i64> @test_2xi64_to_4xi64_mem(<2 x i64>* %vp) {
+; CHECK-LABEL: test_2xi64_to_4xi64_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask0(<2 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask1(<2 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask2(<2 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %default, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %default
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_2xi64_to_4xi64_mem_mask3(<2 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <8 x i64> @test_2xi64_to_8xi64_mem(<2 x i64>* %vp) {
+; CHECK-LABEL: test_2xi64_to_8xi64_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %res = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask0(<2 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask1(<2 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask2(<2 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_2xi64_to_8xi64_mem_mask3(<2 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %vec = load <2 x i64>, <2 x i64>* %vp
+ %shuf = shufflevector <2 x i64> %vec, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <16 x i32> @test_8xi32_to_16xi32_mem(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_to_16xi32_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask0(<8 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask1(<8 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask2(<8 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %default, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %default
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_8xi32_to_16xi32_mem_mask3(<8 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <8 x i64> @test_4xi64_to_8xi64_mem(<4 x i64>* %vp) {
+; CHECK-LABEL: test_4xi64_to_8xi64_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask0(<4 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask1(<4 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask2(<4 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %default, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %default
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_4xi64_to_8xi64_mem_mask3(<4 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll b/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll
new file mode 100644
index 000000000000..195c8567899d
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll
@@ -0,0 +1,849 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
+define <4 x float> @test_4xfloat_dup_high(<4 x float> %vec) {
+; CHECK-LABEL: test_4xfloat_dup_high:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mask0(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mask1(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mask2(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mask3(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mask4(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_dup_high_mem(<4 x float>* %vp) {
+; CHECK-LABEL: test_4xfloat_dup_high_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_high_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_high_mem_mask4(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <8 x float> @test_8xfloat_dup_high(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_dup_high:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mask0(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mask1(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mask2(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mask3(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mask4(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_dup_high_mem(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_dup_high_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_high_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_high_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_16xfloat_dup_high(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_dup_high:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mask0(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mask1(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mask2(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mask3(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mask4(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_dup_high_mem(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_dup_high_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_high_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_high_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll b/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll
new file mode 100644
index 000000000000..b32cb60c983b
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll
@@ -0,0 +1,1536 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
+define <2 x double> @test_2xdouble_dup_low(<2 x double> %vec) {
+; CHECK-LABEL: test_2xdouble_dup_low:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: retq
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_dup_low_mem(<2 x double>* %vp) {
+; CHECK-LABEL: test_2xdouble_dup_low_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <4 x double> @test_4xdouble_dup_low(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_dup_low:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_dup_low_mem(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_dup_low_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <8 x double> @test_8xdouble_dup_low(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_dup_low:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_dup_low_mem(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_dup_low_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <4 x float> @test_4xfloat_dup_low(<4 x float> %vec) {
+; CHECK-LABEL: test_4xfloat_dup_low:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_dup_low_mem(<4 x float>* %vp) {
+; CHECK-LABEL: test_4xfloat_dup_low_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <8 x float> @test_8xfloat_dup_low(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_dup_low:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_dup_low_mem(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_dup_low_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <16 x float> @test_16xfloat_dup_low(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_dup_low:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_dup_low_mem(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_dup_low_mem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
new file mode 100644
index 000000000000..24b387d96df4
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
@@ -0,0 +1,1866 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
+; FIXME: The non-immediate <16 x float> test cases should be fixed by PR34382
+
+define <4 x float> @test_4xfloat_perm_mask0(<4 x float> %vec) {
+; CHECK-LABEL: test_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_perm_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[2,1,3,1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mask0(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,3,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_perm_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mask1(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_perm_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 1>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mask2(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 1>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_perm_mask3(<4 x float> %vec) {
+; CHECK-LABEL: test_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_perm_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mask3(<4 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_perm_mem_mask0(<4 x float>* %vp) {
+; CHECK-LABEL: test_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,1,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 3>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_perm_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[3,3,1,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,3,1,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_4xfloat_perm_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[1,3,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[1,3,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_4xfloat_perm_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[2,1,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_perm_mem_mask3(<4 x float>* %vp) {
+; CHECK-LABEL: test_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_4xfloat_perm_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = mem[0,1,3,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_4xfloat_perm_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x float>, <4 x float>* %vp
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 6, i32 6>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,6,6,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,6,6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_imm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,7,6,7,6]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 7, i32 6, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mask1(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,7,6,7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 7, i32 6, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,1,6,5,4,4]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 2, i32 1, i32 6, i32 5, i32 4, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,1,6,5,4,4]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 2, i32 1, i32 6, i32 5, i32 4, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_imm_mask3(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_imm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,2,1,0,6,6,5,4]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mask3(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,1,0,6,6,5,4]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,3,7,7,6,5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 6, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask4(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,3,7,7,6,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 6, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_imm_mask5(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3,6,5,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 3, i32 6, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mask5(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3,6,5,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 3, i32 6, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mask6(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,3,2,5,6,7,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 5, i32 6, i32 7, i32 7>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask6(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,5,6,7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 5, i32 6, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask6(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,5,6,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 5, i32 6, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_imm_mask7(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,1,7,4,6,5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 4, i32 6, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mask7(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,1,7,4,6,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 4, i32 6, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,2,4,6,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 4, i32 6, i32 7, i32 6>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[3,0,0,2,4,6,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 4, i32 6, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,0,2,4,6,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 4, i32 6, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_imm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[2,0,2,2,6,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 2, i32 2, i32 6, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[2,0,2,2,6,4,6,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 2, i32 2, i32 6, i32 4, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3,4,4,7,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 3, i32 4, i32 4, i32 7, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3,4,4,7,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 3, i32 4, i32 4, i32 7, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_perm_imm_mem_mask3(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,0,3,3,4,4,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_imm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[0,0,3,3,4,4,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[0,0,3,3,4,4,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[0,1,0,1,4,6,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 6, i32 5, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,0,1,4,6,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 6, i32 5, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_imm_mem_mask5(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[2,0,0,3,6,4,4,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 0, i32 3, i32 6, i32 4, i32 4, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mem_mask5(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[2,0,0,3,6,4,4,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 0, i32 3, i32 6, i32 4, i32 4, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mem_mask6(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,7,4,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 7>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask6(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,3,7,4,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask6(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,3,7,4,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_imm_mem_mask7(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} = mem[0,2,3,1,4,6,7,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 3, i32 1, i32 4, i32 6, i32 7, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_imm_mem_mask7(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,1,4,6,7,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 2, i32 3, i32 1, i32 4, i32 6, i32 7, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,1,3,1,6,4,6,5,8,9,8,11,13,13,13,15]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 1, i32 6, i32 4, i32 6, i32 5, i32 8, i32 9, i32 8, i32 11, i32 13, i32 13, i32 13, i32 15>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,1,6,4,6,5,8,9,8,11,13,13,13,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 1, i32 6, i32 4, i32 6, i32 5, i32 8, i32 9, i32 8, i32 11, i32 13, i32 13, i32 13, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,1,6,4,6,5,8,9,8,11,13,13,13,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 1, i32 6, i32 4, i32 6, i32 5, i32 8, i32 9, i32 8, i32 11, i32 13, i32 13, i32 13, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_imm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 1, i32 6, i32 6, i32 6, i32 5, i32 10, i32 10, i32 10, i32 9, i32 14, i32 14, i32 14, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mask1(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 1, i32 6, i32 6, i32 6, i32 5, i32 10, i32 10, i32 10, i32 9, i32 14, i32 14, i32 14, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,2,0,0,5,4,6,5,11,10,9,9,14,13,14,12]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 0, i32 5, i32 4, i32 6, i32 5, i32 11, i32 10, i32 9, i32 9, i32 14, i32 13, i32 14, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,0,0,5,4,6,5,11,10,9,9,14,13,14,12]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 0, i32 5, i32 4, i32 6, i32 5, i32 11, i32 10, i32 9, i32 9, i32 14, i32 13, i32 14, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_imm_mask3(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_imm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mask3(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,2,3,3,5,5,5,7,11,11,8,11,14,12,14,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 3, i32 3, i32 5, i32 5, i32 5, i32 7, i32 11, i32 11, i32 8, i32 11, i32 14, i32 12, i32 14, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask4(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,3,3,5,5,5,7,11,11,8,11,14,12,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 3, i32 3, i32 5, i32 5, i32 5, i32 7, i32 11, i32 11, i32 8, i32 11, i32 14, i32 12, i32 14, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_imm_mask5(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 1, i32 0, i32 5, i32 6, i32 5, i32 4, i32 9, i32 10, i32 9, i32 8, i32 13, i32 14, i32 13, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mask5(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 1, i32 0, i32 5, i32 6, i32 5, i32 4, i32 9, i32 10, i32 9, i32 8, i32 13, i32 14, i32 13, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mask6(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,3,2,4,4,6,7,9,11,8,11,13,12,13,13]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 2, i32 4, i32 4, i32 6, i32 7, i32 9, i32 11, i32 8, i32 11, i32 13, i32 12, i32 13, i32 13>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask6(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,2,4,4,6,7,9,11,8,11,13,12,13,13]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 2, i32 4, i32 4, i32 6, i32 7, i32 9, i32 11, i32 8, i32 11, i32 13, i32 12, i32 13, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask6(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,2,4,4,6,7,9,11,8,11,13,12,13,13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 2, i32 4, i32 4, i32 6, i32 7, i32 9, i32 11, i32 8, i32 11, i32 13, i32 12, i32 13, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_imm_mask7(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 0, i32 2, i32 7, i32 7, i32 4, i32 6, i32 11, i32 11, i32 8, i32 10, i32 15, i32 15, i32 12, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mask7(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 0, i32 2, i32 7, i32 7, i32 4, i32 6, i32 11, i32 11, i32 8, i32 10, i32 15, i32 15, i32 12, i32 14>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 6, i32 6, i32 6, i32 6, i32 11, i32 10, i32 9, i32 10, i32 12, i32 14, i32 12, i32 12>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 6, i32 6, i32 6, i32 6, i32 11, i32 10, i32 9, i32 10, i32 12, i32 14, i32 12, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 6, i32 6, i32 6, i32 6, i32 11, i32 10, i32 9, i32 10, i32 12, i32 14, i32 12, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_imm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[1,3,2,1,5,7,6,5,9,11,10,9,13,15,14,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 1, i32 5, i32 7, i32 6, i32 5, i32 9, i32 11, i32 10, i32 9, i32 13, i32 15, i32 14, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[1,3,2,1,5,7,6,5,9,11,10,9,13,15,14,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 1, i32 5, i32 7, i32 6, i32 5, i32 9, i32 11, i32 10, i32 9, i32 13, i32 15, i32 14, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[2,0,0,3,5,5,6,5,9,8,8,8,14,12,13,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 3, i32 5, i32 5, i32 6, i32 5, i32 9, i32 8, i32 8, i32 8, i32 14, i32 12, i32 13, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[2,0,0,3,5,5,6,5,9,8,8,8,14,12,13,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 3, i32 5, i32 5, i32 6, i32 5, i32 9, i32 8, i32 8, i32 8, i32 14, i32 12, i32 13, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_imm_mem_mask3(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 1, i32 5, i32 4, i32 7, i32 5, i32 9, i32 8, i32 11, i32 9, i32 13, i32 12, i32 15, i32 13>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_imm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 1, i32 5, i32 4, i32 7, i32 5, i32 9, i32 8, i32 11, i32 9, i32 13, i32 12, i32 15, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 1, i32 5, i32 4, i32 7, i32 5, i32 9, i32 8, i32 11, i32 9, i32 13, i32 12, i32 15, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[3,3,1,1,6,5,5,6,11,11,10,9,15,14,12,12]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 1, i32 1, i32 6, i32 5, i32 5, i32 6, i32 11, i32 11, i32 10, i32 9, i32 15, i32 14, i32 12, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[3,3,1,1,6,5,5,6,11,11,10,9,15,14,12,12]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 1, i32 1, i32 6, i32 5, i32 5, i32 6, i32 11, i32 11, i32 10, i32 9, i32 15, i32 14, i32 12, i32 12>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_imm_mem_mask5(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[2,0,0,1,6,4,4,5,10,8,8,9,14,12,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 1, i32 6, i32 4, i32 4, i32 5, i32 10, i32 8, i32 8, i32 9, i32 14, i32 12, i32 12, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mem_mask5(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[2,0,0,1,6,4,4,5,10,8,8,9,14,12,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 1, i32 6, i32 4, i32 4, i32 5, i32 10, i32 8, i32 8, i32 9, i32 14, i32 12, i32 12, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mem_mask6(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 1, i32 1, i32 2, i32 6, i32 5, i32 5, i32 7, i32 9, i32 11, i32 9, i32 9, i32 12, i32 15, i32 14, i32 15>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask6(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm2[2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 1, i32 1, i32 2, i32 6, i32 5, i32 5, i32 7, i32 9, i32 11, i32 9, i32 9, i32 12, i32 15, i32 14, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask6(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm1[2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 1, i32 1, i32 2, i32 6, i32 5, i32 5, i32 7, i32 9, i32 11, i32 9, i32 9, i32 12, i32 15, i32 14, i32 15>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_imm_mem_mask7(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = mem[1,2,0,1,5,6,4,5,9,10,8,9,13,14,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 1, i32 5, i32 6, i32 4, i32 5, i32 9, i32 10, i32 8, i32 9, i32 13, i32 14, i32 12, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_imm_mem_mask7(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[1,2,0,1,5,6,4,5,9,10,8,9,13,14,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 1, i32 5, i32 6, i32 4, i32 5, i32 9, i32 10, i32 8, i32 9, i32 13, i32 14, i32 12, i32 13>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <2 x double> @test_2xdouble_perm_mask0(<2 x double> %vec) {
+; CHECK-LABEL: test_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: retq
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_perm_mask0(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_perm_mask0(<2 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_perm_mask1(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_perm_mask1(<2 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_perm_mem_mask0(<2 x double>* %vp) {
+; CHECK-LABEL: test_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_2xdouble_perm_mem_mask0(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} = mem[1,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_perm_mem_mask0(<2 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = mem[1,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_2xdouble_perm_mem_mask1(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_2xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} = mem[1,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_2xdouble_perm_mem_mask1(<2 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_2xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 {%k1} {z} = mem[1,0]
+; CHECK-NEXT: retq
+ %vec = load <2 x double>, <2 x double>* %vp
+ %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,0,2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,3,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = mem[0,1,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} = mem[0,1,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} = mem[0,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[0,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} = mem[1,0,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} = mem[1,0,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,2,4,5,7,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 2, i32 4, i32 5, i32 7, i32 6>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,3,2,4,5,7,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 2, i32 4, i32 5, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,3,2,4,5,7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 2, i32 4, i32 5, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,7,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask1(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,3,5,5,6,7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 5, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,3,5,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 5, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_mask3(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 7>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,2,4,4,6,7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask3(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = mem[0,1,2,3,5,4,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,5,4,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,5,4,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = mem[0,1,3,3,4,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[0,1,3,3,4,5,7,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 3, i32 4, i32 5, i32 7, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,4,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 4, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,4,7,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 4, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_mem_mask3(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = mem[1,0,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = mem[1,0,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
new file mode 100644
index 000000000000..df88f0fca456
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -0,0 +1,4808 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
+
+; FIXME: All cases here should be fixed by PR34380
+
+define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,12,13,12,13,8,9,14,15,10,11,12,13,14,15]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5,6],xmm3[7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4],xmm0[5,6],xmm1[7]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4],xmm0[5,6],xmm3[7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,14,15,14,15,8,9,10,11,0,1,0,1]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,12,13,10,11,8,9,8,9,0,1,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4],xmm0[5,6],xmm2[7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,0]
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,0]
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6],xmm2[7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask0(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,0]
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6],xmm1[7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3,4,5],xmm3[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,14,15,8,9,14,15,0,1,2,3,0,1,12,13]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,14,15,2,3,10,11,0,1,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3,4,5],xmm2[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 3, i32 15, i32 12, i32 7, i32 1, i32 5, i32 8, i32 14>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpsrld $16, %xmm1, %xmm2
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,6,7,0,1,10,11,0,1,14,15,2,3]
+; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask3(<16 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,14,15,2,3,12,13,2,3,8,9,6,7,4,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 9, i32 7, i32 9, i32 6, i32 9, i32 4, i32 3, i32 2>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @test_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
+; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask0(<32 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,12,13,10,12,13,1,28,6,24,9,11,12,2,14,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 24, i32 28, i32 29, i32 26, i32 28, i32 29, i32 17, i32 12, i32 22, i32 8, i32 25, i32 27, i32 28, i32 18, i32 30, i32 18>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask1(<32 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [30,5,15,13,9,18,3,31,4,11,23,7,19,23,9,26]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 14, i32 21, i32 31, i32 29, i32 25, i32 2, i32 19, i32 15, i32 20, i32 27, i32 7, i32 23, i32 3, i32 7, i32 25, i32 10>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask2(<32 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [10,19,20,6,17,2,13,1,5,16,4,3,2,28,27,15]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 26, i32 3, i32 4, i32 22, i32 1, i32 18, i32 29, i32 17, i32 21, i32 0, i32 20, i32 19, i32 18, i32 12, i32 11, i32 31>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_to_16xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmw %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <8 x i16> @test_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <22,27,7,10,13,21,5,14,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm0, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,21,27,10,8,19,14,5,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <15,13,18,16,9,11,26,8,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_to_8xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmw %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <17,0,23,10,1,8,7,30,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqw %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask0(<32 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask1(<32 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [22,13,21,1,14,8,5,16,15,17,24,28,15,9,14,25]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16, i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask2(<32 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [12,9,22,15,4,18,7,15,28,5,26,22,6,16,10,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_to_16xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_32xi16_to_16xi16_perm_mem_mask3(<32 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_16xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,20,27,8,31,3,27,12,2,8,14,25,27,4,16]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27, i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask0(<32 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <16,17,5,1,14,14,13,17,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask1(<32 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <7,6,4,6,12,4,27,1,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask2(<32 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <6,18,0,4,10,25,22,10,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 6, i32 18, i32 0, i32 4, i32 10, i32 25, i32 22, i32 10>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_to_8xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_to_8xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <19,1,5,31,9,12,17,9,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
+; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,2]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,2]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
+; CHECK-LABEL: test_8xi32_to_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,1],xmm0[0,0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,1],xmm2[0,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask0(<8 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,1],xmm1[0,0]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 7, i32 5, i32 0, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[1,0,0,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask1(<8 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[1,0,0,3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 0, i32 0, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = xmm2[0,3,3,0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask2(<8 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm1[0,3,3,0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 3, i32 3, i32 4>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_to_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 7>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,5,3,6,15,2,9,14]
+; CHECK-NEXT: vpermi2d %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [9,5,3,6,15,2,9,14]
+; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,5,3,6,15,2,9,14]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2d %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,0,15,3,2,3,6,8]
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,15,3,2,3,6,8]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,15,15,2,6,10,14,7]
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,15,15,2,6,10,14,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vpblendmd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <5,1,3,4,u,u,u,u>
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = <5,1,3,4,u,u,u,u>
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <1,1,13,0,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <1,1,13,0,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = <3,0,0,13,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <3,0,0,13,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm2, %k1
+; CHECK-NEXT: vpblendmd %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <3,0,0,13,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vpcmpeqd %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,6,0,1,2,4,4]
+; CHECK-NEXT: vpermps 32(%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,6,0,1,2,4,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd 32(%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask0(<16 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,6,0,1,2,4,4]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermd 32(%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,6,11,0,1,5,15]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask1(<16 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,3,6,11,0,1,5,15]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 15, i32 11, i32 14, i32 3, i32 8, i32 9, i32 13, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,14,1,5,4,2,8,10]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask2(<16 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,1,5,4,2,8,10]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2d %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 12, i32 6, i32 9, i32 13, i32 12, i32 10, i32 0, i32 2>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_to_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [8,4,1,13,15,4,6,12]
+; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [8,4,1,13,15,4,6,12]
+; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa32 %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mem_mask3(<16 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,4,1,13,15,4,6,12]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask0(<16 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <13,0,0,6,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 0, i32 0, i32 6>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,1,2,3,7,5,6,7]
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7]
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 7, i32 13, i32 11, i32 10>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = <2,15,6,9,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask2(<16 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <2,15,6,9,u,u,u,u>
+; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 2, i32 15, i32 6, i32 9>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
+; CHECK-NEXT: vpextrd $3, %xmm1, %eax
+; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1
+; CHECK-NEXT: vpextrd $2, %xmm0, %eax
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
+; CHECK-NEXT: vmovd %xmm2, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
+; CHECK-NEXT: vpextrd $3, %xmm3, %eax
+; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3
+; CHECK-NEXT: vpextrd $2, %xmm2, %eax
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: vmovd %xmm1, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
+; CHECK-NEXT: vpextrd $3, %xmm2, %eax
+; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2
+; CHECK-NEXT: vpextrd $2, %xmm1, %eax
+; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 6, i32 0, i32 7, i32 2>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
+; CHECK-LABEL: test_4xi64_to_2xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp) {
+; CHECK-LABEL: test_4xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask0(<4 x i64>* %vp, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 1, i32 3>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(<4 x i64>* %vp, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[3,1,2,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,0,3,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %xmm4, %xmm2, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,0,2]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,2,0,2]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,2,0,2]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 7, i32 6, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,1]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,2]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[3,0,0,2]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,0,2]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 0, i32 0, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 7, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,2]
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,1,2,3]
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,3,2]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovdqa64 %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,1,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 7, i32 5, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
+ ret <2 x i64> %res
+}
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %xmm4, %xmm1, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %vec2, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vextracti128 $1, %ymm3, %xmm3
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(<8 x i64>* %vp, <2 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
+ %cmp = icmp eq <2 x i64> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],xmm3[0,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm2[0,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm0[0,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,2],xmm3[0,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm0[0,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,2],xmm2[0,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,1,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 {%k1} = xmm0[3,3,1,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,1,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[2,0],xmm3[0,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0(<8 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,0],xmm2[0,1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 2, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} = xmm2[2,3,3,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1(<8 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm1[2,3,3,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 6, i32 3, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[3,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[3,1],xmm3[2,0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2(<8 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm1[3,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[3,1],xmm2[2,0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm2[1,3],xmm3[0,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[3,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3],xmm2[0,2]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 3>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [2,4,11,4,12,7,9,6]
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,4,11,4,12,7,9,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = <0,4,u,u,6,1,4,4>
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4>
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,6,1,8,4,12,13,0]
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [4,6,1,8,4,12,13,0]
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmps %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,6,1,8,4,12,13,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <12,0,1,2,u,u,u,u>
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <12,0,1,2,u,u,u,u>
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %xmm0, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <12,0,1,2,u,u,u,u>
+; CHECK-NEXT: vpermi2ps %ymm0, %ymm2, %ymm3
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqps %xmm0, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,2]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3]
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0(<16 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,7,11,5,10,0,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 7, i32 6, i32 7, i32 11, i32 5, i32 10, i32 0, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1(<16 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [11,0,9,0,7,14,0,8]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 11, i32 0, i32 9, i32 0, i32 7, i32 14, i32 0, i32 8>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[1,0,0,3]
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [8,5,2,3,2,9,10,1]
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2(<16 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,0,0,3]
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [8,5,2,3,2,9,10,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 1, i32 13, i32 10, i32 11, i32 10, i32 0, i32 0, i32 9>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovaps %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(<16 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,5,3,3,11,4,12,9]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,3,3]
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,2,3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(<16 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
+; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 14, i32 6, i32 7, i32 11>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[2,3],ymm3[4,6],ymm2[6,7]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
+; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 2, i32 14, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,3],ymm2[6,4],ymm3[6,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4]
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 6, i32 12, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm4 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT: vpermi2ps %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3(<16 x float>* %vp, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = <3,3,15,9,u,u,u,u>
+; CHECK-NEXT: vpermi2ps %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqps %xmm1, %xmm0, %k1
+; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 3, i32 3, i32 15, i32 9>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm3[0],xmm0[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm0[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],xmm3[1]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm2[1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm0
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(<4 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 1>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} = xmm3[0],xmm2[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm2[0],xmm1[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,0,7,6]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,7,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,0]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,1,4]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,1,4]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
+; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,1,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,1,0,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,1,0,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,7,2]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,7,2]
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,6,7,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 6, i32 7, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 4, i32 2, i32 4>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[1,2,3,0]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1,2,3,0]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,0]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,0]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,4,1,5]
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [2,4,1,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1
+; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 6, i32 0, i32 5, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 5, i32 5, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1]
+; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,1]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,1]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm0
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
+ ret <2 x double> %res
+}
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3
+; CHECK-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[3],ymm3[2]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vextractf32x4 $2, %zmm2, %xmm3
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm2[1],xmm3[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %zmm1
+; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm1[1],xmm2[0]
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/permute.ll b/test/CodeGen/X86/avx512-shuffles/permute.ll
new file mode 100644
index 000000000000..ff392cca8ddc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/permute.ll
@@ -0,0 +1,3129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
+
+define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
+; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
+; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
+; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
+; CHECK-LABEL: test_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
+; CHECK-LABEL: test_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5]
+; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5]
+; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
+; CHECK-LABEL: test_4xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
+; CHECK-LABEL: test_4xi64_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
+; CHECK-LABEL: test_4xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
+; CHECK-LABEL: test_4xi64_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i64>, <4 x i64>* %vp
+ %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i64>, <8 x i64>* %vp
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
+ ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x float>, <8 x float>* %vp
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+ ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <16 x float>, <16 x float>* %vp
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ ret <4 x double> %res
+}
+define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
+; CHECK-NEXT: retq
+ %vec = load <4 x double>, <4 x double>* %vp
+ %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+ ret <8 x double> %res
+}
+define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %mask) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x double>, <8 x double>* %vp
+ %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
new file mode 100644
index 000000000000..ff840e6411c1
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
@@ -0,0 +1,1400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
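+; Note on the pattern used throughout this file: each unmasked shuffle test is
+; paired with a merge-masked variant (the shuffle result is selected against a
+; passthru vector operand) and a zero-masked variant (selected against
+; zeroinitializer). The write mask is materialized by comparing the %mask
+; operand against zero with a vcmpeq* instruction and is consumed through %k1.
+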
+define <4 x float> @test_4xfloat_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1],xmm1[3,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[2,1],xmm1[3,1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],xmm1[3,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,2],xmm1[3,2]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2],xmm1[3,2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,3],xmm1[2,1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm1[2,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 {%k1} = xmm0[3,3],xmm1[3,3]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],xmm1[3,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
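+; <4 x float> shuffles with the second source vector taken from memory; the
+; masked and zero-masked runs below fold the load into the vshufps memory form.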
+define <4 x float> @test_4xfloat_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],mem[1,2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,0],mem[1,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0],mem[1,2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 0, i32 5, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,3],mem[1,3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],mem[1,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 5, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],mem[2,0]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 4>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],mem[2,0]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 4>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1],mem[3,2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[2,1],mem[3,2]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_shuff_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],mem[3,2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
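+; 256-bit <8 x float> shuffles: unmasked, merge-masked (blend into %vec3) and
+; zero-masked vshufps on ymm registers.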
+define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm2 {%k1} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
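+; <8 x float> shuffles with the second source vector loaded from memory.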
+define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 1, i32 8, i32 8, i32 6, i32 5, i32 12, i32 12>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 2, i32 9, i32 8, i32 6, i32 6, i32 13, i32 12>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 2, i32 9, i32 8, i32 6, i32 6, i32 13, i32 12>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 11, i32 11, i32 7, i32 7, i32 15, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 11, i32 11, i32 7, i32 7, i32 15, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 3, i32 10, i32 9, i32 7, i32 7, i32 14, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
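+; 512-bit <16 x float> shuffles lowered to vshufps on zmm registers.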
+define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
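+; <16 x float> shuffles with the second source vector loaded from memory.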
+define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 16, i32 18, i32 7, i32 4, i32 20, i32 22, i32 11, i32 8, i32 24, i32 26, i32 15, i32 12, i32 28, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 2, i32 19, i32 18, i32 4, i32 6, i32 23, i32 22, i32 8, i32 10, i32 27, i32 26, i32 12, i32 14, i32 31, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 2, i32 19, i32 18, i32 4, i32 6, i32 23, i32 22, i32 8, i32 10, i32 27, i32 26, i32 12, i32 14, i32 31, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 0, i32 18, i32 18, i32 6, i32 4, i32 22, i32 22, i32 10, i32 8, i32 26, i32 26, i32 14, i32 12, i32 30, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 0, i32 18, i32 18, i32 6, i32 4, i32 22, i32 22, i32 10, i32 8, i32 26, i32 26, i32 14, i32 12, i32 30, i32 30>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 1, i32 17, i32 19, i32 6, i32 5, i32 21, i32 23, i32 10, i32 9, i32 25, i32 27, i32 14, i32 13, i32 29, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
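+; Double-precision variants: <2 x double> shuffles lowered to vshufpd.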
+define <2 x double> @test_2xdouble_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; CHECK-LABEL: test_2xdouble_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; CHECK-NEXT: retq
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_shuff_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_shuff_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
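+; <2 x double> shuffles with the second source vector loaded from memory.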
+define <2 x double> @test_2xdouble_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; CHECK-LABEL: test_2xdouble_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],mem[0]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_shuff_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
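+; 256-bit <4 x double> shuffles lowered to vshufpd on ymm registers.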
+define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
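+; <4 x double> shuffles with the second source vector loaded from memory.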
+define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[1],ymm0[2],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 5, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[1],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 5, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[3],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[3],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[2],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
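+; 512-bit <8 x double> shuffles lowered to vshufpd on zmm registers.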
+define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 5, i32 13, i32 6, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 4, i32 12, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 4, i32 12, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 10, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 10, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
new file mode 100644
index 000000000000..1896356dafa4
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
@@ -0,0 +1,2037 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
+; FIXME: 128-bit shuffle cases of 256-bit vectors should be fixed by PR34359
+
+define <8 x float> @test_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
+; CHECK-LABEL: test_8xi32_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
+; CHECK-LABEL: test_8xi32_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
+; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i32>, <8 x i32>* %vec2p
+ %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
+; CHECK-LABEL: test_16xi32_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
+; CHECK-LABEL: test_16xi32_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; CHECK-LABEL: test_16xi32_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; CHECK-LABEL: test_16xi32_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+ %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+ %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+ ret <8 x i64> %res
+}
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1
+; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+ %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+ %cmp = icmp eq <8 x i64> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
+ ret <8 x i64> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/shuffle.ll b/test/CodeGen/X86/avx512-shuffles/shuffle.ll
new file mode 100644
index 000000000000..df46487d9ab0
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/shuffle.ll
@@ -0,0 +1,2984 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
+
+define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
+; CHECK-LABEL: test_16xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
+; CHECK-LABEL: test_16xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
+; CHECK-LABEL: test_16xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
+; CHECK-LABEL: test_16xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ ret <16 x i8> %res
+}
+define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %vp
+ %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+ %cmp = icmp eq <16 x i8> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
+; CHECK-LABEL: test_32xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
+; CHECK-LABEL: test_32xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
+; CHECK-LABEL: test_32xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
+; CHECK-LABEL: test_32xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+ ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
+ ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; CHECK-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %vp
+ %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+ %cmp = icmp eq <32 x i8> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
+ ret <32 x i8> %res
+}
+
+define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
+; CHECK-LABEL: test_64xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT: retq
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
+; CHECK-LABEL: test_64xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT: retq
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
+; CHECK-LABEL: test_64xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
+; CHECK-LABEL: test_64xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+ ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
+ ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1
+; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; CHECK-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %vp
+ %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+ %cmp = icmp eq <64 x i8> %mask, zeroinitializer
+ %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
+ ret <64 x i8> %res
+}
+
+define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
+; CHECK-LABEL: test_8xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
+; CHECK-LABEL: test_8xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
+; CHECK-LABEL: test_8xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
+; CHECK-LABEL: test_8xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
+; CHECK-LABEL: test_8xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
+; CHECK-LABEL: test_8xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
+ ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
+; CHECK-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %vp
+ %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
+ %cmp = icmp eq <8 x i16> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
+; CHECK-LABEL: test_16xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
+; CHECK-LABEL: test_16xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+ ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
+ ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %vp
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT: retq
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
+; CHECK-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1
+; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+ ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
+ ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
+; CHECK-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %vp
+ %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+ %cmp = icmp eq <32 x i16> %mask, zeroinitializer
+ %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
+ ret <32 x i16> %res
+}
+
+define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
+; CHECK-LABEL: test_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
+; CHECK-LABEL: test_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
+; CHECK-LABEL: test_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
+; CHECK-LABEL: test_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
+; CHECK-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %vp
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+ %cmp = icmp eq <4 x i32> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
+; CHECK-LABEL: test_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
+; CHECK-LABEL: test_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
+; CHECK-LABEL: test_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+ ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
+; CHECK-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %vp
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+ %cmp = icmp eq <8 x i32> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
+; CHECK-LABEL: test_16xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+ ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
+ ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; CHECK-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %vp
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+ %cmp = icmp eq <16 x i32> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shuffles/unpack.ll b/test/CodeGen/X86/avx512-shuffles/unpack.ll
new file mode 100644
index 000000000000..5eca7f0cebab
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffles/unpack.ll
@@ -0,0 +1,2797 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
+
+define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; CHECK-LABEL: test_2xdouble_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retq
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %res
+}
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %mask) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x float>, <4 x float>* %vec2p
+ %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %mask) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x float>, <8 x float>* %vec2p
+ %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: retq
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %mask) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT: retq
+ %vec2 = load <16 x float>, <16 x float>* %vec2p
+ %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+ %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; CHECK-LABEL: test_2xdouble_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %mask) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT: retq
+ %vec2 = load <2 x double>, <2 x double>* %vec2p
+ %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+ %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
+ %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: vmovapd %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %mask) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT: retq
+ %vec2 = load <4 x double>, <4 x double>* %vec2p
+ %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %zmm4, %zmm3, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: vmovapd %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %zmm3, %zmm2, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %mask) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %zmm2, %zmm1, %k1
+; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT: retq
+ %vec2 = load <8 x double>, <8 x double>* %vec2p
+ %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
+ %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index b7f80ec97150..1182bbf94ec5 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -3,7 +3,7 @@
define <8 x i1> @test(<2 x i1> %a) {
; CHECK-LABEL: test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
@@ -15,7 +15,7 @@ define <8 x i1> @test(<2 x i1> %a) {
define <8 x i1> @test1(<2 x i1> %a) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
@@ -27,11 +27,11 @@ define <8 x i1> @test1(<2 x i1> %a) {
define <8 x i1> @test2(<2 x i1> %a) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: vpmovm2q %k0, %zmm0
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
@@ -43,11 +43,9 @@ define <8 x i1> @test2(<2 x i1> %a) {
define <8 x i1> @test3(<4 x i1> %a) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -57,15 +55,13 @@ define <8 x i1> @test3(<4 x i1> %a) {
define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k0
-; CHECK-NEXT: korb %k1, %k0, %k0
+; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@@ -75,15 +71,13 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: kshiftlb $2, %k1, %k1
; CHECK-NEXT: kshiftlb $2, %k0, %k0
-; CHECK-NEXT: kshiftrb $2, %k0, %k0
-; CHECK-NEXT: korb %k1, %k0, %k0
+; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: retq
@@ -93,16 +87,13 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
+; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: kshiftlb $2, %k1, %k1
; CHECK-NEXT: kshiftlb $2, %k0, %k0
-; CHECK-NEXT: kshiftrb $2, %k0, %k0
-; CHECK-NEXT: korb %k1, %k0, %k0
-; CHECK-NEXT: kunpckbw %k0, %k0, %k0
+; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: retq
@@ -112,17 +103,13 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
-; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: kshiftrb $4, %k0, %k0
-; CHECK-NEXT: korb %k1, %k0, %k0
-; CHECK-NEXT: kunpckbw %k0, %k0, %k0
-; CHECK-NEXT: kunpckwd %k0, %k0, %k0
+; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: retq
@@ -132,7 +119,7 @@ define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
define <64 x i1> @test8(<8 x i1> %a, <8 x i1>%b) {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1
; CHECK-NEXT: vpmovw2m %xmm1, %k0
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
@@ -147,7 +134,7 @@ define <64 x i1> @test8(<8 x i1> %a, <8 x i1>%b) {
define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %xmm0, %k0
; CHECK-NEXT: kshiftrw $4, %k0, %k0
@@ -159,7 +146,7 @@ define <4 x i1> @test9(<8 x i1> %a, <8 x i1> %b) {
define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) {
; CHECK-LABEL: test10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftrw $2, %k0, %k0
@@ -171,7 +158,7 @@ define <2 x i1> @test10(<4 x i1> %a, <4 x i1> %b) {
define <8 x i1> @test11(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
; CHECK-NEXT: kshiftlb $4, %k0, %k0
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index 1c88ce6eb2f7..d40c899b4957 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -6,7 +6,7 @@
define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 {
; ALL-LABEL: trunc_16x32_to_16x8:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovdb %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -16,7 +16,7 @@ define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 {
define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 {
; ALL-LABEL: trunc_8x64_to_8x16:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -26,7 +26,7 @@ define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 {
define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 {
; ALL-LABEL: trunc_v16i32_to_v16i16:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovdw %zmm0, %ymm0
; ALL-NEXT: retq
%1 = trunc <16 x i32> %x to <16 x i16>
@@ -35,7 +35,7 @@ define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 {
define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
; ALL-LABEL: trunc_qb_512:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -45,7 +45,7 @@ define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
; ALL-LABEL: trunc_qb_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqb %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -56,15 +56,15 @@ define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qb_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -74,8 +74,8 @@ define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
; KNL-LABEL: trunc_qb_256_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovd %xmm0, (%rdi)
@@ -83,7 +83,7 @@ define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqb %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -94,7 +94,7 @@ define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
; ALL-LABEL: trunc_qb_128:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i8>
ret <2 x i8> %x
@@ -102,13 +102,13 @@ define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 {
; KNL-LABEL: trunc_qb_128_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vpextrw $0, %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qb_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqb %xmm0, (%rdi)
; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i8>
@@ -118,7 +118,7 @@ define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 {
define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 {
; ALL-LABEL: trunc_qw_512:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqw %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -128,7 +128,7 @@ define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 {
define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
; ALL-LABEL: trunc_qw_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqw %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -139,15 +139,15 @@ define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qw_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -157,8 +157,8 @@ define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
; KNL-LABEL: trunc_qw_256_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; KNL-NEXT: vmovq %xmm0, (%rdi)
@@ -166,7 +166,7 @@ define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqw %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -177,7 +177,7 @@ define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
; ALL-LABEL: trunc_qw_128:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i16>
ret <2 x i16> %x
@@ -185,14 +185,14 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 {
; KNL-LABEL: trunc_qw_128_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; KNL-NEXT: vmovd %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qw_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqw %xmm0, (%rdi)
; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i16>
@@ -202,7 +202,7 @@ define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 {
define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 {
; ALL-LABEL: trunc_qd_512:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqd %zmm0, %ymm0
; ALL-NEXT: retq
%x = trunc <8 x i64> %i to <8 x i32>
@@ -211,7 +211,7 @@ define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 {
define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
; ALL-LABEL: trunc_qd_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovqd %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -222,15 +222,15 @@ define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
; KNL-LABEL: trunc_qd_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqd %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -240,15 +240,15 @@ define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
; KNL-LABEL: trunc_qd_256_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqd %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -259,7 +259,7 @@ define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
; ALL-LABEL: trunc_qd_128:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i32>
ret <2 x i32> %x
@@ -267,13 +267,13 @@ define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 {
; KNL-LABEL: trunc_qd_128_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT: vmovq %xmm0, (%rdi)
+; KNL: ## %bb.0:
+; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL-NEXT: vmovlps %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_qd_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovqd %xmm0, (%rdi)
; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i32>
@@ -283,7 +283,7 @@ define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 {
define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 {
; ALL-LABEL: trunc_db_512:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovdb %zmm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -293,7 +293,7 @@ define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 {
define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
; ALL-LABEL: trunc_db_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovdb %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -304,15 +304,15 @@ define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
; KNL-LABEL: trunc_db_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovdw %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -322,8 +322,8 @@ define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
; KNL-LABEL: trunc_db_256_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
@@ -331,7 +331,7 @@ define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovdb %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -342,7 +342,7 @@ define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
; ALL-LABEL: trunc_db_128:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: retq
%x = trunc <4 x i32> %i to <4 x i8>
ret <4 x i8> %x
@@ -350,13 +350,13 @@ define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 {
; KNL-LABEL: trunc_db_128_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovd %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_db_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovdb %xmm0, (%rdi)
; SKX-NEXT: retq
%x = trunc <4 x i32> %i to <4 x i8>
@@ -366,7 +366,7 @@ define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 {
define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 {
; ALL-LABEL: trunc_dw_512:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovdw %zmm0, %ymm0
; ALL-NEXT: retq
%x = trunc <16 x i32> %i to <16 x i16>
@@ -375,7 +375,7 @@ define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 {
define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
; ALL-LABEL: trunc_dw_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovdw %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -386,15 +386,15 @@ define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
; KNL-LABEL: trunc_dw_256:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: ## kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovdw %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -404,15 +404,15 @@ define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
; KNL-LABEL: trunc_dw_256_mem:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovdw %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -423,13 +423,13 @@ define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 {
; KNL-LABEL: trunc_dw_128_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_dw_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovdw %xmm0, (%rdi)
; SKX-NEXT: retq
%x = trunc <4 x i32> %i to <4 x i16>
@@ -439,7 +439,7 @@ define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 {
define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
; KNL-LABEL: trunc_wb_512:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -448,7 +448,7 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_512:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovwb %zmm0, %ymm0
; SKX-NEXT: retq
%x = trunc <32 x i16> %i to <32 x i8>
@@ -457,7 +457,7 @@ define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
; KNL-LABEL: trunc_wb_512_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -468,7 +468,7 @@ define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_512_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovwb %zmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -479,14 +479,14 @@ define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
; KNL-LABEL: trunc_wb_256:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovwb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -496,7 +496,7 @@ define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
; KNL-LABEL: trunc_wb_256_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, (%rdi)
@@ -504,7 +504,7 @@ define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovwb %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -515,7 +515,7 @@ define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
; ALL-LABEL: trunc_wb_128:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: retq
%x = trunc <8 x i16> %i to <8 x i8>
ret <8 x i8> %x
@@ -523,13 +523,13 @@ define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
; KNL-LABEL: trunc_wb_128_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: trunc_wb_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovwb %xmm0, (%rdi)
; SKX-NEXT: retq
%x = trunc <8 x i16> %i to <8 x i8>
@@ -540,7 +540,7 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
; KNL-LABEL: usat_trunc_wb_256_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
@@ -549,7 +549,7 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -562,7 +562,7 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
; KNL-LABEL: usat_trunc_wb_256:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vpmovdb %zmm0, %xmm0
@@ -570,7 +570,7 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovuswb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -582,14 +582,14 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
; KNL-LABEL: usat_trunc_wb_128_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_wb_128_mem:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovuswb %xmm0, (%rdi)
; SKX-NEXT: retq
%x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -601,7 +601,7 @@ define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
; ALL-LABEL: usat_trunc_db_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovusdb %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -614,7 +614,7 @@ define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
; ALL-LABEL: usat_trunc_qb_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovusqb %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -627,7 +627,7 @@ define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
; ALL-LABEL: usat_trunc_qd_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovusqd %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -640,7 +640,7 @@ define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
; ALL-LABEL: usat_trunc_qw_512_mem:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovusqw %zmm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
@@ -653,15 +653,15 @@ define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
; KNL-LABEL: usat_trunc_db_1024:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovusdb %zmm0, %xmm0
; KNL-NEXT: vpmovusdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_1024:
-; SKX: ## BB#0:
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; SKX: ## %bb.0:
+; SKX-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1
; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0
; SKX-NEXT: vpmovdw %zmm0, %ymm0
@@ -677,7 +677,7 @@ define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
; KNL-LABEL: usat_trunc_db_1024_mem:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovusdb %zmm0, %xmm0
; KNL-NEXT: vpmovusdb %zmm1, %xmm1
; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
@@ -686,8 +686,8 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_1024_mem:
-; SKX: ## BB#0:
-; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2
+; SKX: ## %bb.0:
+; SKX-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SKX-NEXT: vpminud %zmm2, %zmm1, %zmm1
; SKX-NEXT: vpminud %zmm2, %zmm0, %zmm0
; SKX-NEXT: vpmovdw %zmm0, %ymm0
@@ -705,7 +705,7 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
; ALL-LABEL: usat_trunc_dw_512:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpmovusdw %zmm0, %ymm0
; ALL-NEXT: retq
%x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -716,7 +716,7 @@ define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
; ALL-LABEL: usat_trunc_wb_128:
-; ALL: ## BB#0:
+; ALL: ## %bb.0:
; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
; ALL-NEXT: retq
%x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -726,27 +726,16 @@ define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
}
define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
-; KNL-LABEL: usat_trunc_qw_1024:
-; KNL: ## BB#0:
-; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
-; KNL-NEXT: vpminuq %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpminuq %zmm2, %zmm0, %zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vpmovqd %zmm1, %ymm1
-; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: usat_trunc_qw_1024:
-; SKX: ## BB#0:
-; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
-; SKX-NEXT: vpminuq %zmm2, %zmm1, %zmm1
-; SKX-NEXT: vpminuq %zmm2, %zmm0, %zmm0
-; SKX-NEXT: vpmovqd %zmm0, %ymm0
-; SKX-NEXT: vpmovqd %zmm1, %ymm1
-; SKX-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT: vpmovdw %zmm0, %ymm0
-; SKX-NEXT: retq
+; ALL-LABEL: usat_trunc_qw_1024:
+; ALL: ## %bb.0:
+; ALL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; ALL-NEXT: vpminuq %zmm2, %zmm1, %zmm1
+; ALL-NEXT: vpminuq %zmm2, %zmm0, %zmm0
+; ALL-NEXT: vpmovqd %zmm0, %ymm0
+; ALL-NEXT: vpmovqd %zmm1, %ymm1
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT: vpmovdw %zmm0, %ymm0
+; ALL-NEXT: retq
%x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%x6 = trunc <16 x i64> %x5 to <16 x i16>
@@ -755,8 +744,8 @@ define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
; KNL-LABEL: usat_trunc_db_256:
-; KNL: ## BB#0:
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; KNL: ## %bb.0:
+; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -764,7 +753,7 @@ define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_db_256:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
; SKX-NEXT: vpmovdw %ymm0, %xmm0
; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
diff --git a/test/CodeGen/X86/avx512-unsafe-fp-math.ll b/test/CodeGen/X86/avx512-unsafe-fp-math.ll
index 1956b2f7eca9..00c9e4c957c4 100644
--- a/test/CodeGen/X86/avx512-unsafe-fp-math.ll
+++ b/test/CodeGen/X86/avx512-unsafe-fp-math.ll
@@ -1,15 +1,15 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx512f | FileCheck %s --check-prefix=CHECK_UNSAFE --check-prefix=AVX512F_UNSAFE
; RUN: llc < %s -mtriple=x86_64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
define <16 x float> @test_max_v16f32(<16 x float> * %a_ptr, <16 x float> %b) {
; CHECK_UNSAFE-LABEL: test_max_v16f32:
-; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE: # %bb.0:
; CHECK_UNSAFE-NEXT: vmaxps (%rdi), %zmm0, %zmm0
; CHECK_UNSAFE-NEXT: retq
;
; CHECK-LABEL: test_max_v16f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vmaxps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -21,12 +21,12 @@ define <16 x float> @test_max_v16f32(<16 x float> * %a_ptr, <16 x float> %b) {
define <16 x float> @test_min_v16f32(<16 x float>* %a_ptr, <16 x float> %b) {
; CHECK_UNSAFE-LABEL: test_min_v16f32:
-; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE: # %bb.0:
; CHECK_UNSAFE-NEXT: vminps (%rdi), %zmm0, %zmm0
; CHECK_UNSAFE-NEXT: retq
;
; CHECK-LABEL: test_min_v16f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %zmm1
; CHECK-NEXT: vminps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -38,12 +38,12 @@ define <16 x float> @test_min_v16f32(<16 x float>* %a_ptr, <16 x float> %b) {
define <8 x double> @test_max_v8f64(<8 x double> * %a_ptr, <8 x double> %b) {
; CHECK_UNSAFE-LABEL: test_max_v8f64:
-; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE: # %bb.0:
; CHECK_UNSAFE-NEXT: vmaxpd (%rdi), %zmm0, %zmm0
; CHECK_UNSAFE-NEXT: retq
;
; CHECK-LABEL: test_max_v8f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vmaxpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -55,12 +55,12 @@ define <8 x double> @test_max_v8f64(<8 x double> * %a_ptr, <8 x double> %b) {
define <8 x double> @test_min_v8f64(<8 x double>* %a_ptr, <8 x double> %b) {
; CHECK_UNSAFE-LABEL: test_min_v8f64:
-; CHECK_UNSAFE: # BB#0:
+; CHECK_UNSAFE: # %bb.0:
; CHECK_UNSAFE-NEXT: vminpd (%rdi), %zmm0, %zmm0
; CHECK_UNSAFE-NEXT: retq
;
; CHECK-LABEL: test_min_v8f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %zmm1
; CHECK-NEXT: vminpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -72,12 +72,12 @@ define <8 x double> @test_min_v8f64(<8 x double>* %a_ptr, <8 x double> %b) {
define float @test_min_f32(float %a, float* %ptr) {
; CHECK_UNSAFE-LABEL: test_min_f32:
-; CHECK_UNSAFE: # BB#0: # %entry
+; CHECK_UNSAFE: # %bb.0: # %entry
; CHECK_UNSAFE-NEXT: vminss (%rdi), %xmm0, %xmm0
; CHECK_UNSAFE-NEXT: retq
;
; CHECK-LABEL: test_min_f32:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: vminss %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -90,12 +90,12 @@ entry:
define double @test_max_f64(double %a, double* %ptr) {
; CHECK_UNSAFE-LABEL: test_max_f64:
-; CHECK_UNSAFE: # BB#0: # %entry
+; CHECK_UNSAFE: # %bb.0: # %entry
; CHECK_UNSAFE-NEXT: vmaxsd (%rdi), %xmm0, %xmm0
; CHECK_UNSAFE-NEXT: retq
;
; CHECK-LABEL: test_max_f64:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 350c0d7873ea..09e7e646ca4a 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -4,7 +4,7 @@
define <16 x i32> @_inreg16xi32(i32 %a) {
; ALL-LABEL: _inreg16xi32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpbroadcastd %edi, %zmm0
; ALL-NEXT: retq
%b = insertelement <16 x i32> undef, i32 %a, i32 0
@@ -14,7 +14,7 @@ define <16 x i32> @_inreg16xi32(i32 %a) {
define <8 x i64> @_inreg8xi64(i64 %a) {
; ALL-LABEL: _inreg8xi64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpbroadcastq %rdi, %zmm0
; ALL-NEXT: retq
%b = insertelement <8 x i64> undef, i64 %a, i32 0
@@ -24,7 +24,7 @@ define <8 x i64> @_inreg8xi64(i64 %a) {
define <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
; ALL-LABEL: _ss16xfloat_v4:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer
@@ -33,7 +33,7 @@ define <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
define <16 x float> @_inreg16xfloat(float %a) {
; ALL-LABEL: _inreg16xfloat:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%b = insertelement <16 x float> undef, float %a, i32 0
@@ -43,8 +43,8 @@ define <16 x float> @_inreg16xfloat(float %a) {
define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_mask:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; ALL-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
; ALL-NEXT: vmovaps %zmm1, %zmm0
@@ -58,8 +58,8 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m
define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_maskz:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
@@ -72,7 +72,7 @@ define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
; ALL-LABEL: _ss16xfloat_load:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss (%rdi), %zmm0
; ALL-NEXT: retq
%a = load float, float* %a.ptr
@@ -83,8 +83,8 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) {
define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_mask_load:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
@@ -98,8 +98,8 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16
define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
; ALL-LABEL: _ss16xfloat_maskz_load:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; ALL-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
@@ -113,7 +113,7 @@ define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1)
define <8 x double> @_inreg8xdouble(double %a) {
; ALL-LABEL: _inreg8xdouble:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
%b = insertelement <8 x double> undef, double %a, i32 0
@@ -123,9 +123,9 @@ define <8 x double> @_inreg8xdouble(double %a) {
define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; ALL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; ALL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; ALL-NEXT: vpcmpneqd %zmm3, %zmm2, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
; ALL-NEXT: vmovapd %zmm1, %zmm0
@@ -139,9 +139,9 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m
define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; ALL-NEXT: retq
@@ -154,7 +154,7 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
; ALL-LABEL: _sd8xdouble_load:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0
; ALL-NEXT: retq
%a = load double, double* %a.ptr
@@ -165,9 +165,9 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) {
define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_mask_load:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; ALL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; ALL-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1}
; ALL-NEXT: retq
@@ -181,9 +181,9 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8
define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
; ALL-LABEL: _sd8xdouble_maskz_load:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; ALL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; ALL-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
; ALL-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z}
; ALL-NEXT: retq
@@ -197,7 +197,7 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1)
define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
; ALL-LABEL: _xmm16xi32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -206,7 +206,7 @@ define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
define <16 x float> @_xmm16xfloat(<16 x float> %a) {
; ALL-LABEL: _xmm16xfloat:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
@@ -215,8 +215,8 @@ define <16 x float> @_xmm16xfloat(<16 x float> %a) {
define <16 x i32> @test_vbroadcast() {
; ALL-LABEL: test_vbroadcast:
-; ALL: # BB#0: # %entry
-; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1
; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; ALL-NEXT: knotw %k1, %k1
@@ -234,7 +234,7 @@ entry:
; IR generated will produce broadcasts at the end.
define <8 x double> @test_set1_pd(double %d) #2 {
; ALL-LABEL: test_set1_pd:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
entry:
@@ -251,7 +251,7 @@ entry:
define <8 x i64> @test_set1_epi64(i64 %d) #2 {
; ALL-LABEL: test_set1_epi64:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vpbroadcastq %rdi, %zmm0
; ALL-NEXT: retq
entry:
@@ -268,7 +268,7 @@ entry:
define <16 x float> @test_set1_ps(float %f) #2 {
; ALL-LABEL: test_set1_ps:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
entry:
@@ -293,7 +293,7 @@ entry:
define <16 x i32> @test_set1_epi32(i32 %f) #2 {
; ALL-LABEL: test_set1_epi32:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vpbroadcastd %edi, %zmm0
; ALL-NEXT: retq
entry:
@@ -320,7 +320,7 @@ entry:
; Verify that the IR generated will produce the broadcast at the end.
define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
; ALL-LABEL: test_mm512_broadcastsd_pd:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
entry:
@@ -338,7 +338,7 @@ entry:
define <16 x float> @test1(<8 x float>%a) {
; ALL-LABEL: test1:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> zeroinitializer
@@ -347,7 +347,7 @@ define <16 x float> @test1(<8 x float>%a) {
define <8 x double> @test2(<4 x double>%a) {
; ALL-LABEL: test2:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer
@@ -356,13 +356,13 @@ define <8 x double> @test2(<4 x double>%a) {
define <64 x i8> @_invec32xi8(<32 x i8>%a) {
; AVX512F-LABEL: _invec32xi8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec32xi8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT: retq
%res = shufflevector <32 x i8> %a, <32 x i8> undef, <64 x i32> zeroinitializer
@@ -371,13 +371,13 @@ define <64 x i8> @_invec32xi8(<32 x i8>%a) {
define <32 x i16> @_invec16xi16(<16 x i16>%a) {
; AVX512F-LABEL: _invec16xi16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: _invec16xi16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm0, %zmm0
; AVX512BW-NEXT: retq
%res = shufflevector <16 x i16> %a, <16 x i16> undef, <32 x i32> zeroinitializer
@@ -386,7 +386,7 @@ define <32 x i16> @_invec16xi16(<16 x i16>%a) {
define <16 x i32> @_invec8xi32(<8 x i32>%a) {
; ALL-LABEL: _invec8xi32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer
@@ -395,7 +395,7 @@ define <16 x i32> @_invec8xi32(<8 x i32>%a) {
define <8 x i64> @_invec4xi64(<4 x i64>%a) {
; ALL-LABEL: _invec4xi64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd %xmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer
@@ -405,9 +405,8 @@ define <8 x i64> @_invec4xi64(<4 x i64>%a) {
declare void @func_f32(float)
define <16 x float> @broadcast_ss_spill(float %x) {
; ALL-LABEL: broadcast_ss_spill:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: subq $24, %rsp
-; ALL-NEXT: .Lcfi0:
; ALL-NEXT: .cfi_def_cfa_offset 32
; ALL-NEXT: vaddss %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -425,9 +424,8 @@ define <16 x float> @broadcast_ss_spill(float %x) {
declare void @func_f64(double)
define <8 x double> @broadcast_sd_spill(double %x) {
; ALL-LABEL: broadcast_sd_spill:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: subq $24, %rsp
-; ALL-NEXT: .Lcfi1:
; ALL-NEXT: .cfi_def_cfa_offset 32
; ALL-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
diff --git a/test/CodeGen/X86/avx512-vbroadcasti128.ll b/test/CodeGen/X86/avx512-vbroadcasti128.ll
index ed19324df995..c5ecb1559b4e 100644
--- a/test/CodeGen/X86/avx512-vbroadcasti128.ll
+++ b/test/CodeGen/X86/avx512-vbroadcasti128.ll
@@ -8,23 +8,11 @@
;
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512VL-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512VL-NEXT: retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512BWVL-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512BWVL-NEXT: retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512DQVL-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512DQVL-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2f64_4f64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
@@ -32,23 +20,11 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
}
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512VL-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512VL-NEXT: retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512BWVL-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512BWVL-NEXT: retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512DQVL-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; X64-AVX512DQVL-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
%3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
@@ -57,7 +33,7 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT: retq
@@ -69,7 +45,7 @@ define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT: retq
@@ -81,7 +57,7 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT: retq
@@ -93,7 +69,7 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX512-NEXT: retq
@@ -108,23 +84,11 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
;
define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT: retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT: retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQVL-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
%2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
%3 = fadd <8 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>
@@ -132,23 +96,11 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
}
define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT: retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT: retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQVL-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
%3 = add <8 x i64> %2, <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>
@@ -157,7 +109,7 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT: retq
@@ -169,7 +121,7 @@ define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT: retq
@@ -181,20 +133,20 @@ define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512VL: ## BB#0:
+; X64-AVX512VL: ## %bb.0:
; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512BWVL: ## BB#0:
+; X64-AVX512BWVL: ## %bb.0:
; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT: vpaddw {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512DQVL: ## BB#0:
+; X64-AVX512DQVL: ## %bb.0:
; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
@@ -207,20 +159,20 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512VL: ## BB#0:
+; X64-AVX512VL: ## %bb.0:
; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512BWVL: ## BB#0:
+; X64-AVX512BWVL: ## %bb.0:
; X64-AVX512BWVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512DQVL: ## BB#0:
+; X64-AVX512DQVL: ## %bb.0:
; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
@@ -233,27 +185,27 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
define <8 x i32> @PR29088(<4 x i32>* %p0, <8 x float>* %p1) {
; X64-AVX512VL-LABEL: PR29088:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-AVX512VL: ## %bb.0:
+; X64-AVX512VL-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
-; X64-AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: PR29088:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512BWVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64-AVX512BWVL: ## %bb.0:
+; X64-AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
-; X64-AVX512BWVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BWVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: PR29088:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512DQVL-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-AVX512DQVL: ## %bb.0:
+; X64-AVX512DQVL-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX512DQVL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQVL-NEXT: vmovaps %ymm1, (%rsi)
-; X64-AVX512DQVL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQVL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQVL-NEXT: retq
%ld = load <4 x i32>, <4 x i32>* %p0
store <8 x float> zeroinitializer, <8 x float>* %p1
diff --git a/test/CodeGen/X86/avx512-vbroadcasti256.ll b/test/CodeGen/X86/avx512-vbroadcasti256.ll
index 55fbc34732e1..b7710f3237a7 100644
--- a/test/CodeGen/X86/avx512-vbroadcasti256.ll
+++ b/test/CodeGen/X86/avx512-vbroadcasti256.ll
@@ -5,7 +5,7 @@
define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT: retq
@@ -17,7 +17,7 @@ define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512-NEXT: retq
@@ -28,23 +28,11 @@ define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
}
define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT: retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT: retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQVL-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float> *%p
%2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = fadd <16 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
@@ -52,23 +40,11 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
}
define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512VL: ## BB#0:
-; X64-AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT: retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512BWVL: ## BB#0:
-; X64-AVX512BWVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT: retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512DQVL: ## BB#0:
-; X64-AVX512DQVL-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQVL-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
%1 = load <8 x i32>, <8 x i32> *%p
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%3 = add <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
@@ -77,20 +53,20 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512VL: ## BB#0:
+; X64-AVX512VL: ## %bb.0:
; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512VL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512BWVL: ## BB#0:
+; X64-AVX512BWVL: ## %bb.0:
; X64-AVX512BWVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT: vpaddw {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512DQVL: ## BB#0:
+; X64-AVX512DQVL: ## %bb.0:
; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm1
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512DQVL-NEXT: vpaddw {{.*}}(%rip), %ymm1, %ymm1
@@ -103,20 +79,20 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512VL: ## BB#0:
+; X64-AVX512VL: ## %bb.0:
; X64-AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512VL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
; X64-AVX512VL-NEXT: retq
;
; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512BWVL: ## BB#0:
+; X64-AVX512BWVL: ## %bb.0:
; X64-AVX512BWVL-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BWVL-NEXT: vpaddb {{.*}}(%rip), %zmm0, %zmm0
; X64-AVX512BWVL-NEXT: retq
;
; X64-AVX512DQVL-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512DQVL: ## BB#0:
+; X64-AVX512DQVL: ## %bb.0:
; X64-AVX512DQVL-NEXT: vmovdqa (%rdi), %ymm1
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm0
; X64-AVX512DQVL-NEXT: vpaddb {{.*}}(%rip), %ymm1, %ymm1
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index b3fbceea80a9..fc684e54b063 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1,14 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-LABEL: test1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
@@ -16,11 +15,10 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK-LABEL: test2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
@@ -28,11 +26,10 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -41,11 +38,10 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test4_unsigned:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -53,11 +49,10 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1)
define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK-LABEL: test5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
@@ -65,11 +60,10 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) nounwind {
; CHECK-LABEL: test6_unsigned:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -77,14 +71,14 @@ define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1) noun
define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
; KNL-LABEL: test7:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vxorps %xmm2, %xmm2, %xmm2
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test7:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2
; SKX-NEXT: vcmpltps %xmm2, %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
@@ -97,14 +91,14 @@ define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
; KNL-LABEL: test8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; SKX-NEXT: vcmpltpd %xmm2, %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
@@ -116,21 +110,19 @@ define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
; KNL-LABEL: test9:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -138,21 +130,19 @@ define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
; KNL-LABEL: test10:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -161,10 +151,9 @@ define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
; CHECK-LABEL: test11_unsigned:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -172,25 +161,24 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
; KNL-LABEL: test12:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; KNL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
; KNL-NEXT: kunpckbw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: ## kill: def %ax killed %ax killed %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckbw %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT: ## kill: def %ax killed %ax killed %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -198,143 +186,22 @@ define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-LABEL: test12_v32i32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm1
-; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
+; KNL-NEXT: movl {{.*}}(%rip), %eax
+; KNL-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; KNL-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -342,18 +209,17 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v32i32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckwd %k0, %k1, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
ret i32 %res1
@@ -361,291 +227,35 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-LABEL: test12_v64i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: pushq %rbp
; KNL-NEXT: movq %rsp, %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: vpcmpeqw %ymm5, %ymm1, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm1
-; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpmovdb %zmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rsp)
; KNL-NEXT: vpcmpeqw %ymm7, %ymm3, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $13, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $12, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovdb %zmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -656,18 +266,17 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v64i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k0
; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1
; SKX-NEXT: kunpckdq %k0, %k1, %k0
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
ret i64 %res1
@@ -675,7 +284,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
; CHECK-LABEL: test13:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpeqps %zmm1, %zmm0, %k1
; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -687,7 +296,7 @@ define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
; CHECK-LABEL: test14:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2
; CHECK-NEXT: vpcmpgtd %zmm0, %zmm2, %k1
; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
@@ -702,7 +311,7 @@ define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
; CHECK-LABEL: test15:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2
; CHECK-NEXT: vpcmpgtq %zmm0, %zmm2, %k1
; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
@@ -717,11 +326,10 @@ define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -729,11 +337,10 @@ define <16 x i32> @test16(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1) nounwind
define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
; CHECK-LABEL: test17:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -742,11 +349,10 @@ define <16 x i32> @test17(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
; CHECK-LABEL: test18:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -755,11 +361,10 @@ define <16 x i32> @test18(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nounwind {
; CHECK-LABEL: test19:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -768,12 +373,11 @@ define <16 x i32> @test19(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %y.ptr) nou
define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test20:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -783,12 +387,11 @@ define <16 x i32> @test20(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i3
define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test21:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -798,12 +401,11 @@ define <8 x i64> @test21(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y
define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test22:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtq %zmm2, %zmm1, %k1
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -814,12 +416,11 @@ define <8 x i64> @test22(<8 x i64> %x, <8 x i64>* %y.ptr, <8 x i64> %x1, <8 x i6
define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test23:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -830,11 +431,10 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32>* %y.ptr, <16 x i32> %x1, <16
define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
; CHECK-LABEL: test24:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -845,11 +445,10 @@ define <8 x i64> @test24(<8 x i64> %x, <8 x i64> %x1, i64* %yb.ptr) nounwind {
define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind {
; CHECK-LABEL: test25:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -860,12 +459,11 @@ define <16 x i32> @test25(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1) nounwind
define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32> %y1) nounwind {
; CHECK-LABEL: test26:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpled %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -878,12 +476,11 @@ define <16 x i32> @test26(<16 x i32> %x, i32* %yb.ptr, <16 x i32> %x1, <16 x i32
define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y1) nounwind {
; CHECK-LABEL: test27:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleq %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -896,16 +493,16 @@ define <8 x i64> @test27(<8 x i64> %x, i64* %yb.ptr, <8 x i64> %x1, <8 x i64> %y
define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1) {
; KNL-LABEL: test28:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
; KNL-NEXT: kxnorw %k1, %k0, %k1
-; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
+; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test28:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; SKX-NEXT: vpcmpgtq %zmm3, %zmm2, %k1
; SKX-NEXT: kxnorb %k1, %k0, %k0
@@ -920,16 +517,17 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1
define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> %y1) {
; KNL-LABEL: test29:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
; KNL-NEXT: kxorw %k1, %k0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test29:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; SKX-NEXT: vpcmpgtd %zmm3, %zmm2, %k1
; SKX-NEXT: kxorw %k1, %k0, %k0
@@ -945,18 +543,16 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32>
define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
; KNL-LABEL: test30:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test30:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -965,18 +561,16 @@ define <4 x double> @test30(<4 x double> %x, <4 x double> %y) nounwind {
define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp) nounwind {
; KNL-LABEL: test31:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test31:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -986,18 +580,16 @@ define <2 x double> @test31(<2 x double> %x, <2 x double> %x1, <2 x double>* %yp
define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp) nounwind {
; KNL-LABEL: test32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -1007,11 +599,10 @@ define <4 x double> @test32(<4 x double> %x, <4 x double> %x1, <4 x double>* %yp
define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp) nounwind {
; CHECK-LABEL: test33:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -1020,18 +611,16 @@ define <8 x double> @test33(<8 x double> %x, <8 x double> %x1, <8 x double>* %yp
define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) nounwind {
; KNL-LABEL: test34:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test34:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1040,22 +629,20 @@ define <4 x float> @test34(<4 x float> %x, <4 x float> %x1, <4 x float>* %yp) no
define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) nounwind {
; KNL-LABEL: test35:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vmovups (%rdi), %ymm2
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test35:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
@@ -1065,11 +652,10 @@ define <8 x float> @test35(<8 x float> %x, <8 x float> %x1, <8 x float>* %yp) no
define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp) nounwind {
; CHECK-LABEL: test36:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1078,11 +664,10 @@ define <16 x float> @test36(<16 x float> %x, <16 x float> %x1, <16 x float>* %yp
define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nounwind {
; CHECK-LABEL: test37:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -1095,19 +680,17 @@ define <8 x double> @test37(<8 x double> %x, <8 x double> %x1, double* %ptr) nou
define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nounwind {
; KNL-LABEL: test38:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vbroadcastsd (%rdi), %ymm2
; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test38:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
@@ -1120,19 +703,17 @@ define <4 x double> @test38(<4 x double> %x, <4 x double> %x1, double* %ptr) nou
define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nounwind {
; KNL-LABEL: test39:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test39:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1146,11 +727,10 @@ define <2 x double> @test39(<2 x double> %x, <2 x double> %x1, double* %ptr) nou
define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) nounwind {
; CHECK-LABEL: test40:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
@@ -1163,22 +743,20 @@ define <16 x float> @test40(<16 x float> %x, <16 x float> %x1, float* %ptr) n
define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) nounwind {
; KNL-LABEL: test41:
-; KNL: ## BB#0:
-; KNL-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: ## %bb.0:
+; KNL-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vbroadcastss (%rdi), %ymm2
; KNL-NEXT: vcmpltps %zmm2, %zmm0, %k1
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
-; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; KNL-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test41:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
@@ -1191,19 +769,17 @@ define <8 x float> @test41(<8 x float> %x, <8 x float> %x1, float* %ptr) noun
define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) nounwind {
; KNL-LABEL: test42:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vbroadcastss (%rdi), %xmm2
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test42:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
@@ -1216,23 +792,21 @@ define <4 x float> @test42(<4 x float> %x, <4 x float> %x1, float* %ptr) noun
define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x i1> %mask_in) nounwind {
; KNL-LABEL: test43:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovsxwq %xmm2, %zmm2
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test43:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm2, %xmm2
; SKX-NEXT: vpmovw2m %xmm2, %k1
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -1246,7 +820,7 @@ define <8 x double> @test43(<8 x double> %x, <8 x double> %x1, double* %ptr,<8 x
define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 {
; KNL-LABEL: test44:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -1254,7 +828,7 @@ define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: test44:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -1268,7 +842,7 @@ define <4 x i32> @test44(<4 x i16> %x, <4 x i16> %y) #0 {
define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
; KNL-LABEL: test45:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
@@ -1277,7 +851,7 @@ define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
; KNL-NEXT: retq
;
; SKX-LABEL: test45:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
@@ -1291,14 +865,14 @@ define <2 x i64> @test45(<2 x i16> %x, <2 x i16> %y) #0 {
define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
; KNL-LABEL: test46:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpmovsxdq %xmm0, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test46:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcmpeqps %xmm1, %xmm0, %k1
; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z}
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vec3-crash.ll b/test/CodeGen/X86/avx512-vec3-crash.ll
index 281456c235b5..562ac1fe3692 100644
--- a/test/CodeGen/X86/avx512-vec3-crash.ll
+++ b/test/CodeGen/X86/avx512-vec3-crash.ll
@@ -4,7 +4,7 @@
; This test crashed during type legalization of SETCC result type.
define <3 x i8 > @foo(<3 x i8>%x, <3 x i8>%a, <3 x i8>%b) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm0
; CHECK-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -20,9 +20,9 @@ define <3 x i8 > @foo(<3 x i8>%x, <3 x i8>%a, <3 x i8>%b) {
; CHECK-NEXT: vpextrb $0, %xmm0, %eax
; CHECK-NEXT: vpextrb $4, %xmm0, %edx
; CHECK-NEXT: vpextrb $8, %xmm0, %ecx
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: # kill: def %dl killed %dl killed %edx
+; CHECK-NEXT: # kill: def %cl killed %cl killed %ecx
; CHECK-NEXT: retq
%cmp.i = icmp slt <3 x i8> %x, %a
%res = sext <3 x i1> %cmp.i to <3 x i8>
diff --git a/test/CodeGen/X86/avx512-vpclmulqdq.ll b/test/CodeGen/X86/avx512-vpclmulqdq.ll
new file mode 100644
index 000000000000..00dc6ff3cf0f
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vpclmulqdq.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX512_VPCLMULQDQ
+
+define <8 x i64> @test_x86_pclmulqdq(<8 x i64> %a0, <8 x i64> %a1) {
+; AVX512_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
+; AVX512_VPCLMULQDQ: # %bb.0:
+; AVX512_VPCLMULQDQ-NEXT: vpclmulqdq $1, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x44,0xc1,0x01]
+; AVX512_VPCLMULQDQ-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 1)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512-vpermv3-commute.ll b/test/CodeGen/X86/avx512-vpermv3-commute.ll
index 2827f471762f..9031a296becf 100644
--- a/test/CodeGen/X86/avx512-vpermv3-commute.ll
+++ b/test/CodeGen/X86/avx512-vpermv3-commute.ll
@@ -7,7 +7,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2d (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2p
@@ -19,7 +19,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x
define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -30,7 +30,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -41,7 +41,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -52,7 +52,7 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32
define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -65,7 +65,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -80,7 +80,7 @@ declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x
define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -93,7 +93,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -105,7 +105,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>
define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@@ -116,7 +116,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <
define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -127,7 +127,7 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -137,7 +137,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast(<4 x i32> %x0, <4 x i32> %x1, i32* %x2ptr, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermi2d (%rdi){1to4}, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -152,7 +152,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <
define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -163,7 +163,7 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -175,7 +175,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
@@ -186,7 +186,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -197,7 +197,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i3
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
@@ -208,7 +208,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i3
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -217,7 +217,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_load(<8 x float> %x0, <8 x i32> %x1, <8 x float>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_load:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2ps (%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%x2 = load <8 x float>, <8 x float>* %x2p
@@ -227,7 +227,7 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_load(<8 x float> %
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast(<8 x float> %x0, <8 x i32> %x1, float* %x2ptr) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256_broadcast:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2ps (%rdi){1to8}, %ymm1, %ymm0
; CHECK-NEXT: retq
%x2s = load float, float* %x2ptr
@@ -241,7 +241,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
@@ -252,7 +252,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
@@ -263,7 +263,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
@@ -272,7 +272,7 @@ define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x
define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128_load:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermi2b (%rdi), %xmm1, %xmm0
; CHECK-NEXT: retq
%x2 = load <16 x i8>, <16 x i8>* %x2p
@@ -284,7 +284,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
@@ -295,7 +295,7 @@ declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -305,7 +305,7 @@ define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16
define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128_load:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermi2b (%rdi), %xmm1, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -318,7 +318,7 @@ declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -328,7 +328,7 @@ define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32
define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256_load(<32 x i8> %x0, <32 x i8> %x1, <32 x i8>* %x2p, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256_load:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpermi2b (%rdi), %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512-vpternlog-commute.ll b/test/CodeGen/X86/avx512-vpternlog-commute.ll
index c917e0b17f1c..a67994efa0e8 100644
--- a/test/CodeGen/X86/avx512-vpternlog-commute.ll
+++ b/test/CodeGen/X86/avx512-vpternlog-commute.ll
@@ -8,7 +8,7 @@ declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>,
define <16 x i32> @vpternlog_v16i32_012(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $114, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
@@ -17,7 +17,7 @@ define <16 x i32> @vpternlog_v16i32_012(<16 x i32> %x0, <16 x i32> %x1, <16 x i3
define <16 x i32> @vpternlog_v16i32_102(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $78, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
@@ -26,7 +26,7 @@ define <16 x i32> @vpternlog_v16i32_102(<16 x i32> %x0, <16 x i32> %x1, <16 x i3
define <16 x i32> @vpternlog_v16i32_210(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $78, %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
@@ -36,7 +36,7 @@ define <16 x i32> @vpternlog_v16i32_210(<16 x i32> %x0, <16 x i32> %x1, <16 x i3
define <16 x i32> @vpternlog_v16i32_012_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012_load0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $46, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
@@ -46,7 +46,7 @@ define <16 x i32> @vpternlog_v16i32_012_load0(<16 x i32>* %x0ptr, <16 x i32> %x1
define <16 x i32> @vpternlog_v16i32_012_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012_load1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
@@ -56,7 +56,7 @@ define <16 x i32> @vpternlog_v16i32_012_load1(<16 x i32> %x0, <16 x i32>* %x1ptr
define <16 x i32> @vpternlog_v16i32_012_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_012_load2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
@@ -66,7 +66,7 @@ define <16 x i32> @vpternlog_v16i32_012_load2(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_102_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102_load0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
@@ -76,7 +76,7 @@ define <16 x i32> @vpternlog_v16i32_102_load0(<16 x i32>* %x0ptr, <16 x i32> %x1
define <16 x i32> @vpternlog_v16i32_102_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102_load1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $46, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
@@ -86,7 +86,7 @@ define <16 x i32> @vpternlog_v16i32_102_load1(<16 x i32> %x0, <16 x i32>* %x1ptr
define <16 x i32> @vpternlog_v16i32_102_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_102_load2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $78, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
@@ -96,7 +96,7 @@ define <16 x i32> @vpternlog_v16i32_102_load2(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_210_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210_load0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $78, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
@@ -106,7 +106,7 @@ define <16 x i32> @vpternlog_v16i32_210_load0(<16 x i32>* %x0ptr, <16 x i32> %x1
define <16 x i32> @vpternlog_v16i32_210_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210_load1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $92, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
@@ -116,7 +116,7 @@ define <16 x i32> @vpternlog_v16i32_210_load1(<16 x i32> %x0, <16 x i32>* %x1ptr
define <16 x i32> @vpternlog_v16i32_210_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_210_load2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $58, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
@@ -126,7 +126,7 @@ define <16 x i32> @vpternlog_v16i32_210_load2(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_021_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_021_load0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $58, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x0 = load <16 x i32>, <16 x i32>* %x0ptr
@@ -136,7 +136,7 @@ define <16 x i32> @vpternlog_v16i32_021_load0(<16 x i32>* %x0ptr, <16 x i32> %x1
define <16 x i32> @vpternlog_v16i32_021_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_021_load1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x1 = load <16 x i32>, <16 x i32>* %x1ptr
@@ -146,7 +146,7 @@ define <16 x i32> @vpternlog_v16i32_021_load1(<16 x i32> %x0, <16 x i32>* %x1ptr
define <16 x i32> @vpternlog_v16i32_021_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
; CHECK-LABEL: vpternlog_v16i32_021_load2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0
; CHECK-NEXT: retq
%x2 = load <16 x i32>, <16 x i32>* %x2ptr
@@ -156,7 +156,7 @@ define <16 x i32> @vpternlog_v16i32_021_load2(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_012_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $114, %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -166,7 +166,7 @@ define <16 x i32> @vpternlog_v16i32_012_mask(<16 x i32> %x0, <16 x i32> %x1, <16
define <16 x i32> @vpternlog_v16i32_102_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $114, %zmm2, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -177,7 +177,7 @@ define <16 x i32> @vpternlog_v16i32_102_mask(<16 x i32> %x0, <16 x i32> %x1, <16
define <16 x i32> @vpternlog_v16i32_210_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -188,7 +188,7 @@ define <16 x i32> @vpternlog_v16i32_210_mask(<16 x i32> %x0, <16 x i32> %x1, <16
define <16 x i32> @vpternlog_v16i32_012_mask1(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_mask1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $78, %zmm2, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -201,7 +201,7 @@ define <16 x i32> @vpternlog_v16i32_012_mask1(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_012_mask2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_mask2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $58, %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -214,7 +214,7 @@ define <16 x i32> @vpternlog_v16i32_012_mask2(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_012_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
@@ -227,7 +227,7 @@ define <16 x i32> @vpternlog_v16i32_012_load0_mask(<16 x i32>* %x0ptr, <16 x i32
define <16 x i32> @vpternlog_v16i32_012_load0_mask1(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load0_mask1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -240,7 +240,7 @@ define <16 x i32> @vpternlog_v16i32_012_load0_mask1(<16 x i32>* %x0ptr, <16 x i3
define <16 x i32> @vpternlog_v16i32_012_load0_mask2(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load0_mask2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -254,7 +254,7 @@ define <16 x i32> @vpternlog_v16i32_012_load0_mask2(<16 x i32>* %x0ptr, <16 x i3
define <16 x i32> @vpternlog_v16i32_012_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -265,7 +265,7 @@ define <16 x i32> @vpternlog_v16i32_012_load1_mask(<16 x i32> %x0, <16 x i32>* %
define <16 x i32> @vpternlog_v16i32_012_load1_mask2(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load1_mask2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -279,7 +279,7 @@ define <16 x i32> @vpternlog_v16i32_012_load1_mask2(<16 x i32> %x0, <16 x i32>*
define <16 x i32> @vpternlog_v16i32_012_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -290,7 +290,7 @@ define <16 x i32> @vpternlog_v16i32_012_load2_mask(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_012_load2_mask1(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load2_mask1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $9, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -304,7 +304,7 @@ define <16 x i32> @vpternlog_v16i32_012_load2_mask1(<16 x i32> %x0, <16 x i32> %
define <16 x i32> @vpternlog_v16i32_102_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_load0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -315,7 +315,7 @@ define <16 x i32> @vpternlog_v16i32_102_load0_mask(<16 x i32>* %x0ptr, <16 x i32
define <16 x i32> @vpternlog_v16i32_102_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_load1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
@@ -328,7 +328,7 @@ define <16 x i32> @vpternlog_v16i32_102_load1_mask(<16 x i32> %x0, <16 x i32>* %
define <16 x i32> @vpternlog_v16i32_102_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_load2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -340,7 +340,7 @@ define <16 x i32> @vpternlog_v16i32_102_load2_mask(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_210_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_load0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -352,7 +352,7 @@ define <16 x i32> @vpternlog_v16i32_210_load0_mask(<16 x i32>* %x0ptr, <16 x i32
define <16 x i32> @vpternlog_v16i32_210_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_load1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -364,7 +364,7 @@ define <16 x i32> @vpternlog_v16i32_210_load1_mask(<16 x i32> %x0, <16 x i32>* %
define <16 x i32> @vpternlog_v16i32_210_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_load2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
@@ -377,7 +377,7 @@ define <16 x i32> @vpternlog_v16i32_210_load2_mask(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_021_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_load0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovdqa32 (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
@@ -390,7 +390,7 @@ define <16 x i32> @vpternlog_v16i32_021_load0_mask(<16 x i32>* %x0ptr, <16 x i32
define <16 x i32> @vpternlog_v16i32_021_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_load1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -401,7 +401,7 @@ define <16 x i32> @vpternlog_v16i32_021_load1_mask(<16 x i32> %x0, <16 x i32>* %
define <16 x i32> @vpternlog_v16i32_021_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_load2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -412,7 +412,7 @@ define <16 x i32> @vpternlog_v16i32_021_load2_mask(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_012_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $114, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -422,7 +422,7 @@ define <16 x i32> @vpternlog_v16i32_012_maskz(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_102_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $78, %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -432,7 +432,7 @@ define <16 x i32> @vpternlog_v16i32_102_maskz(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_210_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpternlogd $78, %zmm0, %zmm2, %zmm1 {%k1} {z}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -443,7 +443,7 @@ define <16 x i32> @vpternlog_v16i32_210_maskz(<16 x i32> %x0, <16 x i32> %x1, <1
define <16 x i32> @vpternlog_v16i32_012_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $46, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -454,7 +454,7 @@ define <16 x i32> @vpternlog_v16i32_012_load0_maskz(<16 x i32>* %x0ptr, <16 x i3
define <16 x i32> @vpternlog_v16i32_012_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -465,7 +465,7 @@ define <16 x i32> @vpternlog_v16i32_012_load1_maskz(<16 x i32> %x0, <16 x i32>*
define <16 x i32> @vpternlog_v16i32_012_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_load2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -476,7 +476,7 @@ define <16 x i32> @vpternlog_v16i32_012_load2_maskz(<16 x i32> %x0, <16 x i32> %
define <16 x i32> @vpternlog_v16i32_102_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_load0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -487,7 +487,7 @@ define <16 x i32> @vpternlog_v16i32_102_load0_maskz(<16 x i32>* %x0ptr, <16 x i3
define <16 x i32> @vpternlog_v16i32_102_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_load1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $46, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -498,7 +498,7 @@ define <16 x i32> @vpternlog_v16i32_102_load1_maskz(<16 x i32> %x0, <16 x i32>*
define <16 x i32> @vpternlog_v16i32_102_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_load2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $78, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -509,7 +509,7 @@ define <16 x i32> @vpternlog_v16i32_102_load2_maskz(<16 x i32> %x0, <16 x i32> %
define <16 x i32> @vpternlog_v16i32_210_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_load0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $78, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -520,7 +520,7 @@ define <16 x i32> @vpternlog_v16i32_210_load0_maskz(<16 x i32>* %x0ptr, <16 x i3
define <16 x i32> @vpternlog_v16i32_210_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_load1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $92, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -531,7 +531,7 @@ define <16 x i32> @vpternlog_v16i32_210_load1_maskz(<16 x i32> %x0, <16 x i32>*
define <16 x i32> @vpternlog_v16i32_210_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_load2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $58, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -542,7 +542,7 @@ define <16 x i32> @vpternlog_v16i32_210_load2_maskz(<16 x i32> %x0, <16 x i32> %
define <16 x i32> @vpternlog_v16i32_021_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_load0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $58, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -553,7 +553,7 @@ define <16 x i32> @vpternlog_v16i32_021_load0_maskz(<16 x i32>* %x0ptr, <16 x i3
define <16 x i32> @vpternlog_v16i32_021_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_load1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -564,7 +564,7 @@ define <16 x i32> @vpternlog_v16i32_021_load1_maskz(<16 x i32> %x0, <16 x i32>*
define <16 x i32> @vpternlog_v16i32_021_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_load2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -575,7 +575,7 @@ define <16 x i32> @vpternlog_v16i32_021_load2_maskz(<16 x i32> %x0, <16 x i32> %
define <16 x i32> @vpternlog_v16i32_012_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x0_scalar = load i32, i32* %ptr_x0
@@ -587,7 +587,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0(i32* %ptr_x0, <16 x i32> %x1,
define <16 x i32> @vpternlog_v16i32_012_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x1_scalar = load i32, i32* %ptr_x1
@@ -599,7 +599,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast1(<16 x i32> %x0, i32* %ptr_x1,
define <16 x i32> @vpternlog_v16i32_012_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x2_scalar = load i32, i32* %ptr_x2
@@ -611,7 +611,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast2(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_102_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x0_scalar = load i32, i32* %ptr_x0
@@ -623,7 +623,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast0(i32* %ptr_x0, <16 x i32> %x1,
define <16 x i32> @vpternlog_v16i32_102_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x1_scalar = load i32, i32* %ptr_x1
@@ -635,7 +635,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast1(<16 x i32> %x0, i32* %ptr_x1,
define <16 x i32> @vpternlog_v16i32_102_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x2_scalar = load i32, i32* %ptr_x2
@@ -647,7 +647,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast2(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_210_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x0_scalar = load i32, i32* %ptr_x0
@@ -659,7 +659,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast0(i32* %ptr_x0, <16 x i32> %x1,
define <16 x i32> @vpternlog_v16i32_210_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $92, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x1_scalar = load i32, i32* %ptr_x1
@@ -671,7 +671,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast1(<16 x i32> %x0, i32* %ptr_x1,
define <16 x i32> @vpternlog_v16i32_210_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd $58, (%rdi){1to16}, %zmm1, %zmm0
; CHECK-NEXT: retq
%x2_scalar = load i32, i32* %ptr_x2
@@ -683,7 +683,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast2(<16 x i32> %x0, <16 x i32> %x
define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
@@ -698,7 +698,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -711,7 +711,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask(<16 x i32> %x0, i32* %x1
define <16 x i32> @vpternlog_v16i32_012_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -724,7 +724,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast2_mask(<16 x i32> %x0, <16 x i3
define <16 x i32> @vpternlog_v16i32_102_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -737,7 +737,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast0_mask(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_102_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
@@ -752,7 +752,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast1_mask(<16 x i32> %x0, i32* %x1
define <16 x i32> @vpternlog_v16i32_102_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -766,7 +766,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast2_mask(<16 x i32> %x0, <16 x i3
define <16 x i32> @vpternlog_v16i32_210_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -780,7 +780,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast0_mask(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_210_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -794,7 +794,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast1_mask(<16 x i32> %x0, i32* %x1
define <16 x i32> @vpternlog_v16i32_210_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
@@ -809,7 +809,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast2_mask(<16 x i32> %x0, <16 x i3
define <16 x i32> @vpternlog_v16i32_021_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_broadcast0_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpbroadcastd (%rdi), %zmm2
; CHECK-NEXT: vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
@@ -824,7 +824,7 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast0_mask(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_021_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_broadcast1_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -837,7 +837,7 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast1_mask(<16 x i32> %x0, i32* %x1
define <16 x i32> @vpternlog_v16i32_021_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_broadcast2_mask:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -850,7 +850,7 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast2_mask(<16 x i32> %x0, <16 x i3
define <16 x i32> @vpternlog_v16i32_012_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -863,7 +863,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0_maskz(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_012_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -876,7 +876,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast1_maskz(<16 x i32> %x0, i32* %x
define <16 x i32> @vpternlog_v16i32_012_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -889,7 +889,7 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast2_maskz(<16 x i32> %x0, <16 x i
define <16 x i32> @vpternlog_v16i32_102_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -902,7 +902,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast0_maskz(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_102_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -915,7 +915,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast1_maskz(<16 x i32> %x0, i32* %x
define <16 x i32> @vpternlog_v16i32_102_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_102_broadcast2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -928,7 +928,7 @@ define <16 x i32> @vpternlog_v16i32_102_broadcast2_maskz(<16 x i32> %x0, <16 x i
define <16 x i32> @vpternlog_v16i32_210_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -941,7 +941,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast0_maskz(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_210_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $92, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -954,7 +954,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast1_maskz(<16 x i32> %x0, i32* %x
define <16 x i32> @vpternlog_v16i32_210_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_210_broadcast2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $58, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -967,7 +967,7 @@ define <16 x i32> @vpternlog_v16i32_210_broadcast2_maskz(<16 x i32> %x0, <16 x i
define <16 x i32> @vpternlog_v16i32_021_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_broadcast0_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $58, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -980,7 +980,7 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast0_maskz(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_021_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_broadcast1_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -993,7 +993,7 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast1_maskz(<16 x i32> %x0, i32* %x
define <16 x i32> @vpternlog_v16i32_021_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_021_broadcast2_maskz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -1006,9 +1006,9 @@ define <16 x i32> @vpternlog_v16i32_021_broadcast2_maskz(<16 x i32> %x0, <16 x i
define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask1(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpternlogd $92, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: vpternlogd $92, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
%x0scalar = load i32, i32* %x0ptr
%vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
@@ -1021,9 +1021,9 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask1(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask2(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpternlogd $58, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpternlogd $58, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%x0scalar = load i32, i32* %x0ptr
@@ -1037,9 +1037,9 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask2(i32* %x0ptr, <16 x i32>
define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask2(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_mask2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpternlogd $46, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpternlogd $46, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%x1scalar = load i32, i32* %x1ptr
@@ -1053,9 +1053,9 @@ define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask2(<16 x i32> %x0, i32* %x
define <16 x i32> @vpternlog_v16i32_012_broadcast2_mask1(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
; CHECK-LABEL: vpternlog_v16i32_012_broadcast2_mask1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
-; CHECK-NEXT: vpternlogd $78, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpternlogd $78, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
; CHECK-NEXT: retq
%x2scalar = load i32, i32* %x2ptr
diff --git a/test/CodeGen/X86/avx512-vselect-crash.ll b/test/CodeGen/X86/avx512-vselect-crash.ll
index 9d652d36a524..31ccf867f7aa 100644
--- a/test/CodeGen/X86/avx512-vselect-crash.ll
+++ b/test/CodeGen/X86/avx512-vselect-crash.ll
@@ -1,9 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; CHECK-LABEL: test
-; CHECK: vpxord
-; CHECK: ret
define <16 x i32> @test() {
+; CHECK-LABEL: test:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
entry:
%0 = icmp slt <16 x i32> undef, undef
%1 = select <16 x i1> %0, <16 x i32> undef, <16 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll
index 1940864824ff..0edd01e8aefe 100644
--- a/test/CodeGen/X86/avx512-vselect.ll
+++ b/test/CodeGen/X86/avx512-vselect.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
@@ -23,35 +23,16 @@ entry:
; both formulations of vselect. All of this trickery is because we can't
; directly form an SDAG input to the lowering.
define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
-; CHECK-SKX-LABEL: test2:
-; CHECK-SKX: # BB#0: # %entry
-; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6
-; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
-; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
-; CHECK-SKX-NEXT: korw %k1, %k0, %k0
-; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
-; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
-; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
-; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
-; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
-; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
-; CHECK-SKX-NEXT: retq
-;
-; CHECK-KNL-LABEL: test2:
-; CHECK-KNL: # BB#0: # %entry
-; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6
-; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
-; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
-; CHECK-KNL-NEXT: korw %k1, %k0, %k1
-; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
-; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
-; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
-; CHECK-KNL-NEXT: retq
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6
+; CHECK-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-NEXT: korw %k1, %k0, %k1
+; CHECK-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-NEXT: kshiftrw $8, %k1, %k1
+; CHECK-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-NEXT: retq
entry:
%gt.m = fcmp ogt <16 x float> %x, zeroinitializer
%lt.m = fcmp olt <16 x float> %y, zeroinitializer
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
index 50a9076163e8..1e754be6fe49 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
@@ -4,14 +4,1277 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
+define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
+; X32-LABEL: test_mm512_kunpackd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-64, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
+; X32-NEXT: vmovdqa64 72(%ebp), %zmm4
+; X32-NEXT: vmovdqa64 8(%ebp), %zmm5
+; X32-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
+; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; X32-NEXT: vpcmpneqb %zmm5, %zmm2, %k0
+; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: kunpckdq %k0, %k1, %k1
+; X32-NEXT: vpcmpneqb %zmm3, %zmm4, %k0 {%k1}
+; X32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_kunpackd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpcmpneqb %zmm0, %zmm1, %k0
+; X64-NEXT: vpcmpneqb %zmm3, %zmm2, %k1
+; X64-NEXT: kunpckdq %k0, %k1, %k1
+; X64-NEXT: vpcmpneqb %zmm5, %zmm4, %k0 {%k1}
+; X64-NEXT: kmovq %k0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__B to <64 x i8>
+ %1 = bitcast <8 x i64> %__A to <64 x i8>
+ %2 = icmp ne <64 x i8> %0, %1
+ %3 = bitcast <64 x i1> %2 to i64
+ %4 = bitcast <8 x i64> %__C to <64 x i8>
+ %5 = bitcast <8 x i64> %__D to <64 x i8>
+ %6 = icmp ne <64 x i8> %4, %5
+ %7 = bitcast <64 x i1> %6 to i64
+ %and.i = and i64 %7, 4294967295
+ %shl.i = shl i64 %3, 32
+ %or.i = or i64 %and.i, %shl.i
+ %8 = bitcast <8 x i64> %__E to <64 x i8>
+ %9 = bitcast <8 x i64> %__F to <64 x i8>
+ %10 = icmp ne <64 x i8> %8, %9
+ %11 = bitcast i64 %or.i to <64 x i1>
+ %12 = and <64 x i1> %10, %11
+ %13 = bitcast <64 x i1> %12 to i64
+ ret i64 %13
+}
+
+define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
+; X32-LABEL: test_mm512_kunpackw:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-64, %esp
+; X32-NEXT: subl $64, %esp
+; X32-NEXT: vmovdqa64 136(%ebp), %zmm3
+; X32-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
+; X32-NEXT: vpcmpneqw 8(%ebp), %zmm2, %k1
+; X32-NEXT: kunpckwd %k0, %k1, %k1
+; X32-NEXT: vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_kunpackw:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpcmpneqw %zmm0, %zmm1, %k0
+; X64-NEXT: vpcmpneqw %zmm3, %zmm2, %k1
+; X64-NEXT: kunpckwd %k0, %k1, %k1
+; X64-NEXT: vpcmpneqw %zmm5, %zmm4, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__B to <32 x i16>
+ %1 = bitcast <8 x i64> %__A to <32 x i16>
+ %2 = icmp ne <32 x i16> %0, %1
+ %3 = bitcast <32 x i1> %2 to i32
+ %4 = bitcast <8 x i64> %__C to <32 x i16>
+ %5 = bitcast <8 x i64> %__D to <32 x i16>
+ %6 = icmp ne <32 x i16> %4, %5
+ %7 = bitcast <32 x i1> %6 to i32
+ %and.i = and i32 %7, 65535
+ %shl.i = shl i32 %3, 16
+ %or.i = or i32 %and.i, %shl.i
+ %8 = bitcast <8 x i64> %__E to <32 x i16>
+ %9 = bitcast <8 x i64> %__F to <32 x i16>
+ %10 = icmp ne <32 x i16> %8, %9
+ %11 = bitcast i32 %or.i to <32 x i1>
+ %12 = and <32 x i1> %10, %11
+ %13 = bitcast <32 x i1> %12 to i32
+ ret i32 %13
+}
+
+
+define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %ebx, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $62, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $2, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $61, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $3, %k0, %k1
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $60, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $4, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $4, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $59, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $5, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $5, %al
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $58, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $6, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $6, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $57, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $7, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $7, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $56, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $8, %k0, %k1
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $12, %eax
+; X32-NEXT: andl $15, %eax
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $13, %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k3
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $14, %eax
+; X32-NEXT: andl $3, %eax
+; X32-NEXT: kmovd %eax, %k4
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $15, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: kmovd %eax, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $16, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k6
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $15, %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: shrb $2, %al
+; X32-NEXT: kmovd %eax, %k7
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $52, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $51, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $13, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $50, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $14, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $49, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $15, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $48, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $16, %k0, %k1
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $47, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $17, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $46, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $18, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $45, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $19, %k0, %k1
+; X32-NEXT: shrb $3, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $44, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $20, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $4, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $43, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $21, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $5, %bl
+; X32-NEXT: andb $1, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $42, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $22, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $6, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $41, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $23, %k0, %k1
+; X32-NEXT: shrb $7, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $40, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $24, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $24, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $39, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $25, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $2, %bl
+; X32-NEXT: shrb %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $38, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $26, %k0, %k1
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $37, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $27, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $36, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $28, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $28, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $35, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $29, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $29, %edx
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $34, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $30, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $30, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $33, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $31, %k0, %k1
+; X32-NEXT: shrl $31, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $32, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $32, %k0, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $31, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $33, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $30, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $34, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $29, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $35, %k0, %k1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $28, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $36, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $27, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $37, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $26, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $38, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $25, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $39, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $24, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $40, %k0, %k1
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k7
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $23, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $41, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $22, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $42, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $21, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $43, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k3
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k4
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $20, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $19, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $45, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $18, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $46, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $17, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $47, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $16, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $48, %k0, %k1
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $15, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $49, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $14, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $50, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $13, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $51, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $12, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k4
+; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: kxorq %k1, %k0, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k7
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kshiftlq $63, %k5, %k5
+; X32-NEXT: kshiftrq $11, %k5, %k5
+; X32-NEXT: kxorq %k4, %k5, %k4
+; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k6, %k5, %k5
+; X32-NEXT: kshiftlq $63, %k5, %k5
+; X32-NEXT: kshiftrq $10, %k5, %k5
+; X32-NEXT: kxorq %k4, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k4
+; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k7
+; X32-NEXT: kshiftlq $63, %k6, %k6
+; X32-NEXT: kshiftrq $9, %k6, %k6
+; X32-NEXT: kxorq %k5, %k6, %k5
+; X32-NEXT: kshiftrq $55, %k5, %k6
+; X32-NEXT: kxorq %k0, %k6, %k0
+; X32-NEXT: kshiftlq $63, %k0, %k0
+; X32-NEXT: kshiftrq $8, %k0, %k0
+; X32-NEXT: kxorq %k5, %k0, %k0
+; X32-NEXT: kshiftrq $56, %k0, %k5
+; X32-NEXT: kxorq %k1, %k5, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k6
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $7, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $57, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $6, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $58, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $5, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $59, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $4, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $60, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $3, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $61, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $2, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $62, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $1, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftlq $1, %k0, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k0
+; X32-NEXT: kshiftlq $63, %k2, %k1
+; X32-NEXT: korq %k1, %k0, %k1
+; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovq %rdi, %k1
+; X64-NEXT: vpbroadcastb %esi, %zmm0 {%k1}
+; X64-NEXT: retq
+ entry:
+ %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
+ %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
+ %0 = bitcast <8 x i64> %__O to <64 x i8>
+ %1 = bitcast i64 %__M to <64 x i1>
+ %2 = select <64 x i1> %1, <64 x i8> %vecinit63.i.i, <64 x i8> %0
+ %3 = bitcast <64 x i8> %2 to <8 x i64>
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: .cfi_offset %esi, -12
+; X32-NEXT: .cfi_offset %ebx, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $62, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $2, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $61, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $3, %k0, %k1
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $60, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $4, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $4, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $59, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $5, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $5, %al
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $58, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $6, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $6, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $57, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $7, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $7, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $56, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $8, %k0, %k1
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $12, %eax
+; X32-NEXT: andl $15, %eax
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $13, %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k3
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $14, %eax
+; X32-NEXT: andl $3, %eax
+; X32-NEXT: kmovd %eax, %k4
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $15, %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: kmovd %eax, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $16, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k6
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $15, %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: shrb $2, %al
+; X32-NEXT: kmovd %eax, %k7
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $52, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $51, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $13, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $50, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $14, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $49, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $15, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $48, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $16, %k0, %k1
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $47, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $17, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $46, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $18, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $45, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $19, %k0, %k1
+; X32-NEXT: shrb $3, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $44, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $20, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $4, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $43, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $21, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $5, %bl
+; X32-NEXT: andb $1, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $42, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $22, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $6, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $41, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $23, %k0, %k1
+; X32-NEXT: shrb $7, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $40, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $24, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $24, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $39, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $25, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $2, %bl
+; X32-NEXT: shrb %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $38, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $26, %k0, %k1
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $37, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $27, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $36, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $28, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $28, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $35, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $29, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $29, %edx
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $34, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $30, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $30, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $33, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $31, %k0, %k1
+; X32-NEXT: shrl $31, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $32, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $32, %k0, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $31, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $33, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $30, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $34, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $29, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $35, %k0, %k1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $28, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $36, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $27, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $37, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $26, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $38, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $25, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $39, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $24, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $40, %k0, %k1
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k7
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $23, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $41, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $22, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $42, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $21, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $43, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k3
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k4
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $20, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $19, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $45, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $18, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $46, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $17, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $47, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $16, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $48, %k0, %k1
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $15, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $49, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $14, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $50, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $13, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $51, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $12, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k4
+; X32-NEXT: kshiftrq $52, %k4, %k0
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k1
+; X32-NEXT: kxorq %k1, %k0, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k7
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: kshiftlq $63, %k5, %k5
+; X32-NEXT: kshiftrq $11, %k5, %k5
+; X32-NEXT: kxorq %k4, %k5, %k4
+; X32-NEXT: kshiftrq $53, %k4, %k5
+; X32-NEXT: kxorq %k6, %k5, %k5
+; X32-NEXT: kshiftlq $63, %k5, %k5
+; X32-NEXT: kshiftrq $10, %k5, %k5
+; X32-NEXT: kxorq %k4, %k5, %k5
+; X32-NEXT: kshiftrq $54, %k5, %k4
+; X32-NEXT: kxorq %k7, %k4, %k6
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k7
+; X32-NEXT: kshiftlq $63, %k6, %k6
+; X32-NEXT: kshiftrq $9, %k6, %k6
+; X32-NEXT: kxorq %k5, %k6, %k5
+; X32-NEXT: kshiftrq $55, %k5, %k6
+; X32-NEXT: kxorq %k0, %k6, %k0
+; X32-NEXT: kshiftlq $63, %k0, %k0
+; X32-NEXT: kshiftrq $8, %k0, %k0
+; X32-NEXT: kxorq %k5, %k0, %k0
+; X32-NEXT: kshiftrq $56, %k0, %k5
+; X32-NEXT: kxorq %k1, %k5, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k6
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $7, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $57, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $6, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $58, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $5, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $59, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $4, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $60, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $3, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $61, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $2, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $62, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $1, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftlq $1, %k0, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k0
+; X32-NEXT: kshiftlq $63, %k2, %k1
+; X32-NEXT: korq %k1, %k0, %k1
+; X32-NEXT: vpbroadcastb %eax, %zmm0 {%k1} {z}
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovq %rdi, %k1
+; X64-NEXT: vpbroadcastb %esi, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ entry:
+ %vecinit.i.i = insertelement <64 x i8> undef, i8 %__A, i32 0
+ %vecinit63.i.i = shufflevector <64 x i8> %vecinit.i.i, <64 x i8> undef, <64 x i32> zeroinitializer
+ %0 = bitcast i64 %__M to <64 x i1>
+ %1 = select <64 x i1> %0, <64 x i8> %vecinit63.i.i, <64 x i8> zeroinitializer
+ %2 = bitcast <64 x i8> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A) {
+; X32-LABEL: test_mm512_mask_set1_epi16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_set1_epi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %zmm0 {%k1}
+; X64-NEXT: retq
+ entry:
+ %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
+ %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
+ %0 = bitcast <8 x i64> %__O to <32 x i16>
+ %1 = bitcast i32 %__M to <32 x i1>
+ %2 = select <32 x i1> %1, <32 x i16> %vecinit31.i.i, <32 x i16> %0
+ %3 = bitcast <32 x i16> %2 to <8 x i64>
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A) {
+; X32-LABEL: test_mm512_maskz_set1_epi16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastw %eax, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_set1_epi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+ entry:
+ %vecinit.i.i = insertelement <32 x i16> undef, i16 %__A, i32 0
+ %vecinit31.i.i = shufflevector <32 x i16> %vecinit.i.i, <32 x i16> undef, <32 x i32> zeroinitializer
+ %0 = bitcast i32 %__M to <32 x i1>
+ %1 = select <32 x i1> %0, <32 x i16> %vecinit31.i.i, <32 x i16> zeroinitializer
+ %2 = bitcast <32 x i16> %1 to <8 x i64>
+ ret <8 x i64> %2
+}
+
define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm512_broadcastb_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %zmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -22,14 +1285,14 @@ define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm512_mask_broadcastb_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovq (%eax), %k1
; X32-NEXT: vpbroadcastb %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq (%rdi), %k1
; X64-NEXT: vpbroadcastb %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
@@ -45,14 +1308,14 @@ define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x
define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastb_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovq (%eax), %k1
; X32-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq (%rdi), %k1
; X64-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
@@ -67,12 +1330,12 @@ define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm512_broadcastw_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %zmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -83,14 +1346,13 @@ define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm512_mask_broadcastw_epi16:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastw %xmm1, %zmm0 {%k1}
; X64-NEXT: retq
@@ -105,14 +1367,13 @@ define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x
define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm512_maskz_broadcastw_epi16:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z}
; X64-NEXT: retq
@@ -126,12 +1387,12 @@ define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
; X32-LABEL: test_mm512_bslli_epi128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_bslli_epi128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <64 x i8>
@@ -142,12 +1403,12 @@ define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
; X32-LABEL: test_mm512_bsrli_epi128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_bsrli_epi128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <64 x i8>
@@ -158,12 +1419,12 @@ define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpackhi_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpackhi_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <64 x i8>
@@ -176,14 +1437,14 @@ define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; TODO - improve support for i64 -> mmask64 on 32-bit targets
define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovq (%eax), %k1
; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq (%rdi), %k1
; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
; X64-NEXT: retq
@@ -200,14 +1461,14 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i6
define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovq (%eax), %k1
; X32-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq (%rdi), %k1
; X64-NEXT: vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; X64-NEXT: retq
@@ -223,12 +1484,12 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i
define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpackhi_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpackhi_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <32 x i16>
@@ -240,14 +1501,13 @@ define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpackhi_epi16:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpackhi_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
; X64-NEXT: retq
@@ -263,14 +1523,13 @@ define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i6
define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpackhi_epi16:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; X64-NEXT: retq
@@ -285,12 +1544,12 @@ define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i
define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpacklo_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpacklo_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <64 x i8>
@@ -302,14 +1561,14 @@ define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovq (%eax), %k1
; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq (%rdi), %k1
; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
; X64-NEXT: retq
@@ -326,14 +1585,14 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i6
define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovq (%eax), %k1
; X32-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq (%rdi), %k1
; X64-NEXT: vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; X64-NEXT: retq
@@ -349,12 +1608,12 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i
define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
; X32-LABEL: test_mm512_unpacklo_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_unpacklo_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT: retq
%arg0 = bitcast <8 x i64> %a0 to <32 x i16>
@@ -366,14 +1625,13 @@ define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
; X32-LABEL: test_mm512_mask_unpacklo_epi16:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_mask_unpacklo_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
; X64-NEXT: retq
@@ -389,14 +1647,13 @@ define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i6
define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
; X32-LABEL: test_mm512_maskz_unpacklo_epi16:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X32-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; X64-NEXT: retq
@@ -409,5 +1666,1299 @@ define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i
ret <8 x i64> %res2
}
+define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_test_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: vptestmb %zmm0, %zmm1, %k0
+; X32-NEXT: kmovq %k0, (%esp)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_test_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmb %zmm0, %zmm1, %k0
+; X64-NEXT: kmovq %k0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
+ %1 = icmp ne <64 x i8> %0, zeroinitializer
+ %2 = bitcast <64 x i1> %1 to i64
+ ret i64 %2
+}
+
+define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_test_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_offset %esi, -16
+; X32-NEXT: .cfi_offset %ebx, -12
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $62, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $2, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $61, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $3, %k0, %k1
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $60, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $4, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $4, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $59, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $5, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $5, %al
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $58, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $6, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $6, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $57, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $7, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $7, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $56, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $8, %k0, %k1
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k4
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $13, %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $16, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k6
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $15, %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: shrb $2, %al
+; X32-NEXT: kmovd %eax, %k7
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $52, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $51, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $13, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $50, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $14, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $49, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $15, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $48, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $16, %k0, %k1
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $47, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $17, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $46, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $18, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $45, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $19, %k0, %k1
+; X32-NEXT: shrb $3, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $44, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $20, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $4, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $43, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $21, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $5, %bl
+; X32-NEXT: andb $1, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $42, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $22, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $6, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $41, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $23, %k0, %k1
+; X32-NEXT: shrb $7, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $40, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $24, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $24, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $39, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $25, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $2, %bl
+; X32-NEXT: shrb %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $38, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $26, %k0, %k1
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $37, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $27, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $36, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $28, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $28, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $35, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $29, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $29, %edx
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $34, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $30, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $30, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $33, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $31, %k0, %k1
+; X32-NEXT: shrl $31, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $32, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $32, %k0, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $31, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $33, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $30, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $34, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $29, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $35, %k0, %k1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $28, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $36, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $27, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $37, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $26, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $38, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $25, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $39, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $24, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $40, %k0, %k1
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k7
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $23, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $41, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $22, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $42, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $21, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $43, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $20, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $19, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $45, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $18, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $46, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $17, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $47, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $16, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $48, %k0, %k1
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $15, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $49, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $14, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $50, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $13, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $51, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $12, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $52, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k7
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $11, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $53, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $10, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $54, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $9, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $55, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $8, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $56, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $7, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $57, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $6, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $58, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $5, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $59, %k0, %k1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $4, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $60, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $3, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $61, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $2, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $62, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $1, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftlq $1, %k0, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k0
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: korq %k1, %k0, %k1
+; X32-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovq %k0, (%esp)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: leal -8(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_test_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovq %rdi, %k1
+; X64-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovq %k0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
+ %1 = icmp ne <64 x i8> %0, zeroinitializer
+ %2 = bitcast i64 %__U to <64 x i1>
+ %3 = and <64 x i1> %1, %2
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_test_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmw %zmm0, %zmm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_test_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmw %zmm0, %zmm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
+ %1 = icmp ne <32 x i16> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_test_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_test_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
+ %1 = icmp ne <32 x i16> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_testn_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: vptestnmb %zmm0, %zmm1, %k0
+; X32-NEXT: kmovq %k0, (%esp)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_testn_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmb %zmm0, %zmm1, %k0
+; X64-NEXT: kmovq %k0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
+ %1 = icmp eq <64 x i8> %0, zeroinitializer
+ %2 = bitcast <64 x i1> %1 to i64
+ ret i64 %2
+}
+
+define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_testn_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: .cfi_def_cfa_register %ebp
+; X32-NEXT: pushl %ebx
+; X32-NEXT: pushl %esi
+; X32-NEXT: andl $-8, %esp
+; X32-NEXT: subl $8, %esp
+; X32-NEXT: .cfi_offset %esi, -16
+; X32-NEXT: .cfi_offset %ebx, -12
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: kmovd %ecx, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $62, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $2, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $61, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $3, %k0, %k1
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $60, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $4, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $4, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $59, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $5, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $5, %al
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $58, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $6, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $6, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $57, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $7, %k0, %k1
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrb $7, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $56, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $8, %k0, %k1
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: movb %ch, %al
+; X32-NEXT: andb $15, %al
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $3, %al
+; X32-NEXT: kmovd %eax, %k4
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: shrl $13, %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: kmovd %eax, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $16, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: andb $2, %al
+; X32-NEXT: shrb %al
+; X32-NEXT: kmovd %eax, %k6
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $15, %bl
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: shrb $2, %al
+; X32-NEXT: kmovd %eax, %k7
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $55, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $9, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $54, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $10, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $53, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $11, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $52, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $12, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $51, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $13, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $50, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $14, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $49, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $15, %k0, %k1
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $48, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $16, %k0, %k1
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $47, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $17, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $46, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $18, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $45, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $19, %k0, %k1
+; X32-NEXT: shrb $3, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $44, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $20, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $4, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $43, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $21, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $5, %bl
+; X32-NEXT: andb $1, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $42, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $22, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $6, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $41, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $23, %k0, %k1
+; X32-NEXT: shrb $7, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $40, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $24, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $24, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $39, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $25, %k0, %k1
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: andb $2, %bl
+; X32-NEXT: shrb %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $38, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $26, %k0, %k1
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $37, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $27, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $36, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $28, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $28, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $35, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $29, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $29, %edx
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $34, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $30, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrl $30, %edx
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $33, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $31, %k0, %k1
+; X32-NEXT: shrl $31, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $32, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $32, %k0, %k1
+; X32-NEXT: kmovd %eax, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $31, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $33, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $30, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $34, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $29, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $35, %k0, %k1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $28, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $36, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $4, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $27, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $37, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $5, %cl
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $26, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $38, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $6, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $25, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $39, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $24, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $40, %k0, %k1
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: andb $2, %cl
+; X32-NEXT: shrb %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: movb %ah, %cl
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $13, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $16, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $15, %dl
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: shrb $2, %bl
+; X32-NEXT: kmovd %ebx, %k7
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $23, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $41, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $22, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $42, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $21, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $43, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $20, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $44, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $12, %esi
+; X32-NEXT: andl $15, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $19, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $45, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $18, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $46, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $14, %esi
+; X32-NEXT: andl $3, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $17, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $47, %k0, %k1
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: shrl $15, %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: kmovd %esi, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $16, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $48, %k0, %k1
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $15, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $49, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $14, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $50, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $13, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $51, %k0, %k1
+; X32-NEXT: shrb $3, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $12, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $52, %k0, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $4, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $5, %dl
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: kmovd %edx, %k2
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $6, %dl
+; X32-NEXT: kmovd %edx, %k3
+; X32-NEXT: shrb $7, %cl
+; X32-NEXT: kmovd %ecx, %k4
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $24, %ecx
+; X32-NEXT: kmovd %ecx, %k5
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: andb $2, %dl
+; X32-NEXT: shrb %dl
+; X32-NEXT: kmovd %edx, %k6
+; X32-NEXT: andb $15, %cl
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: shrb $2, %dl
+; X32-NEXT: kmovd %edx, %k7
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $11, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $53, %k0, %k1
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $10, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $54, %k0, %k1
+; X32-NEXT: kxorq %k3, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $9, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $55, %k0, %k1
+; X32-NEXT: kxorq %k4, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $8, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $56, %k0, %k1
+; X32-NEXT: kxorq %k5, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $7, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $57, %k0, %k1
+; X32-NEXT: kxorq %k6, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $6, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $58, %k0, %k1
+; X32-NEXT: kxorq %k7, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $5, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $59, %k0, %k1
+; X32-NEXT: shrb $3, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $4, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $60, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $28, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $3, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $61, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $29, %ecx
+; X32-NEXT: andb $1, %cl
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $2, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftrq $62, %k0, %k1
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: shrl $30, %ecx
+; X32-NEXT: kmovd %ecx, %k2
+; X32-NEXT: kxorq %k2, %k1, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: kshiftrq $1, %k1, %k1
+; X32-NEXT: kxorq %k0, %k1, %k0
+; X32-NEXT: kshiftlq $1, %k0, %k0
+; X32-NEXT: kshiftrq $1, %k0, %k0
+; X32-NEXT: shrl $31, %eax
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: kshiftlq $63, %k1, %k1
+; X32-NEXT: korq %k1, %k0, %k1
+; X32-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovq %k0, (%esp)
+; X32-NEXT: movl (%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: leal -8(%ebp), %esp
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %ebx
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_testn_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovq %rdi, %k1
+; X64-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovq %k0, %rax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <64 x i8>
+ %1 = icmp eq <64 x i8> %0, zeroinitializer
+ %2 = bitcast i64 %__U to <64 x i1>
+ %3 = and <64 x i1> %1, %2
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_testn_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmw %zmm0, %zmm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_testn_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmw %zmm0, %zmm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
+ %1 = icmp eq <32 x i16> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
+; X32-LABEL: test_mm512_mask_testn_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_testn_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and1.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
+ %1 = icmp eq <32 x i16> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 2b89373ceb0e..f19e09758f12 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1,24 +1,127 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+
+declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
+
+define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: movzwl %di, %eax
+; AVX512BW-NEXT: shll $16, %esi
+; AVX512BW-NEXT: orl %esi, %eax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: shll $16, %eax
+; AVX512F-32-NEXT: orl %ecx, %eax
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
+ ret i32 %res
+}
+
+declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
+
+define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
+; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: movl %edi, %eax
+; AVX512BW-NEXT: shlq $32, %rsi
+; AVX512BW-NEXT: orq %rsi, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
+ ret i64 %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1
+; AVX512BW-NEXT: kmovq %rsi, %k1
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
+ %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
+ %res3 = add <64 x i8> %res, %res1
+ %res4 = add <64 x i8> %res2, %res3
+ ret <64 x i8> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
+define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1
+; AVX512BW-NEXT: kmovd %esi, %k1
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
+ %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
-; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%eax)
+; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
@@ -29,19 +132,21 @@ declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)
define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
-; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%eax)
+; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
@@ -52,8 +157,8 @@ declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
@@ -61,10 +166,10 @@ define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
+; AVX512F-32-NEXT: vmovdqu64 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
@@ -81,8 +186,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
@@ -90,10 +195,10 @@ define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
+; AVX512F-32-NEXT: vmovdqu64 (%ecx), %zmm0
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
@@ -110,14 +215,14 @@ declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
define <8 x i64>@test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_psll_dq_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psll_dq_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
@@ -130,12 +235,12 @@ define <8 x i64>@test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) {
define <8 x i64>@test_int_x86_avx512_psll_load_dq_512(<8 x i64>* %p0) {
; AVX512BW-LABEL: test_int_x86_avx512_psll_load_dq_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psll_load_dq_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
; AVX512F-32-NEXT: retl
@@ -148,14 +253,14 @@ declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
define <8 x i64>@test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_psrl_dq_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psrl_dq_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
@@ -168,12 +273,12 @@ define <8 x i64>@test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) {
define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) {
; AVX512BW-LABEL: test_int_x86_avx512_psrl_load_dq_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_psrl_load_dq_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
; AVX512F-32-NEXT: retl
@@ -186,23 +291,23 @@ declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <
define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
@@ -216,23 +321,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16
define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -246,23 +351,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16
define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -274,21 +379,22 @@ define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1,
define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512BW-LABEL: test_pcmpeq_b:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpeq_b:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Lcfi0:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
ret i64 %res
@@ -296,16 +402,16 @@ define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_pcmpeq_b:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpeq_b:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Lcfi1:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
@@ -313,6 +419,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
ret i64 %res
@@ -322,15 +429,17 @@ declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_pcmpeq_w:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpeq_w:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
ret i32 %res
@@ -338,17 +447,19 @@ define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_pcmpeq_w:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpeq_w:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
ret i32 %res
@@ -358,21 +469,22 @@ declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
; AVX512BW-LABEL: test_pcmpgt_b:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpgt_b:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Lcfi2:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
ret i64 %res
@@ -380,16 +492,16 @@ define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_pcmpgt_b:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpgt_b:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Lcfi3:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
@@ -397,6 +509,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
ret i64 %res
@@ -406,15 +519,17 @@ declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_pcmpgt_w:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpgt_w:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
ret i32 %res
@@ -422,17 +537,19 @@ define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_pcmpgt_w:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpgt_w:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
ret i32 %res
@@ -444,7 +561,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64
define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
@@ -452,7 +569,7 @@ define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
@@ -468,7 +585,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64
define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -476,7 +593,7 @@ define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
@@ -492,7 +609,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>,
define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
@@ -500,7 +617,7 @@ define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
@@ -516,7 +633,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>,
define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
@@ -524,7 +641,7 @@ define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
@@ -540,7 +657,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
@@ -548,7 +665,7 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmaxsb %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
@@ -564,7 +681,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
@@ -572,7 +689,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmaxsw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
@@ -588,7 +705,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
@@ -596,7 +713,7 @@ define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmaxub %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
@@ -612,7 +729,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
@@ -620,7 +737,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmaxuw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
@@ -636,7 +753,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1}
@@ -644,7 +761,7 @@ define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpminsb %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1}
@@ -660,7 +777,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1}
@@ -668,7 +785,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpminsw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1}
@@ -684,7 +801,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1}
@@ -692,7 +809,7 @@ define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpminub %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1}
@@ -708,7 +825,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1}
@@ -716,7 +833,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpminuw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1}
@@ -732,23 +849,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i3
define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
@@ -762,23 +879,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i3
define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm2
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
@@ -792,22 +909,22 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16>, <8 x i16>, <32 x
define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm3, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
@@ -822,22 +939,22 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i32, <32 x i16>
define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
@@ -852,23 +969,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x
define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -882,23 +999,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i32, <32 x i16>
define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_wi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -912,23 +1029,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x
define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -942,23 +1059,23 @@ declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i32, <32 x i16>
define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_wi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -972,7 +1089,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x
define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1}
@@ -980,7 +1097,7 @@ define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1}
@@ -997,13 +1114,13 @@ declare <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64)
define <64 x i8>@test_int_x86_avx512_cvtmask2b_512(i64 %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1015,13 +1132,13 @@ declare <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32)
define <32 x i16>@test_int_x86_avx512_cvtmask2w_512(i32 %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: vpmovm2w %k0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1030,12 +1147,12 @@ define <32 x i16>@test_int_x86_avx512_cvtmask2w_512(i32 %x0) {
}
define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -1044,14 +1161,14 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1062,13 +1179,13 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -1078,12 +1195,12 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1094,14 +1211,14 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_
define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1}
@@ -1114,13 +1231,13 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -1132,12 +1249,12 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %pt
define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1150,14 +1267,14 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
@@ -1172,13 +1289,13 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3
define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
@@ -1194,12 +1311,12 @@ declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <3
define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
@@ -1208,14 +1325,14 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1226,13 +1343,13 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -1242,12 +1359,12 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1258,14 +1375,14 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1}
@@ -1278,13 +1395,13 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -1299,12 +1416,12 @@ declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64
define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -1313,14 +1430,14 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1331,13 +1448,13 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -1347,12 +1464,12 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1363,14 +1480,14 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr
define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1}
@@ -1383,13 +1500,13 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt
define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -1401,12 +1518,12 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %p
define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1419,14 +1536,14 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
@@ -1441,13 +1558,13 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <
define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
@@ -1463,12 +1580,12 @@ declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <3
define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
@@ -1477,14 +1594,14 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1495,13 +1612,13 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -1511,12 +1628,12 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b,
define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -1527,14 +1644,14 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_
define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1}
@@ -1547,13 +1664,13 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -1567,7 +1684,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64
define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512BW-LABEL: test_cmp_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0
@@ -1588,12 +1705,12 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512BW-NEXT: kxnorq %k0, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_cmp_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $60, %esp
-; AVX512F-32-NEXT: .Lcfi4:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 64
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -1602,28 +1719,29 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
%res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
@@ -1645,7 +1763,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512BW-LABEL: test_mask_cmp_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
@@ -1668,768 +1786,586 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
; AVX512BW-NEXT: addq %rdi, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_cmp_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: pushl %ebx
-; AVX512F-32-NEXT: .Lcfi5:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: pushl %esi
-; AVX512F-32-NEXT: .Lcfi6:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT: subl $60, %esp
-; AVX512F-32-NEXT: .Lcfi7:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
-; AVX512F-32-NEXT: .Lcfi8:
+; AVX512F-32-NEXT: subl $68, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 80
; AVX512F-32-NEXT: .cfi_offset %esi, -12
-; AVX512F-32-NEXT: .Lcfi9:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: movb %cl, %bl
-; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %cl, %dl
-; AVX512F-32-NEXT: andb $2, %dl
-; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
-; AVX512F-32-NEXT: shrb $2, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: movb %cl, %bl
-; AVX512F-32-NEXT: shrb $4, %bl
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: kmovd %ecx, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: kmovd %ebx, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $7, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %ch, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %ch, %dl
-; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
-; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
-; AVX512F-32-NEXT: shrl $12, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $13, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
-; AVX512F-32-NEXT: shrl $14, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
-; AVX512F-32-NEXT: shrl $15, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $16, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: andb $2, %dl
-; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %bl
-; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %bl, %dl
-; AVX512F-32-NEXT: shrb $2, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: shrb $4, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: shrb $5, %dl
-; AVX512F-32-NEXT: andb $1, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: shrb $6, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
-; AVX512F-32-NEXT: shrb $7, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $24, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: andb $2, %dl
-; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: andb $2, %cl
+; AVX512F-32-NEXT: shrb %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %edx, %ecx
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: movl %ecx, %esi
-; AVX512F-32-NEXT: shrl $29, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %esi, %eax
-; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %esi, %eax
-; AVX512F-32-NEXT: shrl $31, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovd %edx, %k2
+; AVX512F-32-NEXT: movb %bh, %dl
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k3
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $6, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k6
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $7, %cl
+; AVX512F-32-NEXT: kmovd %ebx, %k5
+; AVX512F-32-NEXT: kshiftrq $1, %k5, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $62, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k5, %k1, %k7
+; AVX512F-32-NEXT: kshiftrq $2, %k7, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k5
+; AVX512F-32-NEXT: movb %bh, %cl
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %dl
+; AVX512F-32-NEXT: movl %ebx, %esi
+; AVX512F-32-NEXT: andb $2, %cl
+; AVX512F-32-NEXT: shrb %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $61, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT: kshiftrq $3, %k7, %k2
+; AVX512F-32-NEXT: kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: movl %edx, %ecx
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $60, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $4, %k0, %k7
+; AVX512F-32-NEXT: kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT: kmovd %edx, %k3
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $59, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $5, %k7, %k0
+; AVX512F-32-NEXT: kxorq %k4, %k0, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $13, %ecx
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $58, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT: kshiftrq $6, %k7, %k4
+; AVX512F-32-NEXT: kxorq %k6, %k4, %k6
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %eax, %ebx
+; AVX512F-32-NEXT: andb $2, %bl
+; AVX512F-32-NEXT: shrb %bl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $57, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $7, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT: kmovd %ebx, %k5
+; AVX512F-32-NEXT: movl %edx, %ecx
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $4, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $56, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $8, %k7, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT: kmovd %edx, %k6
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $55, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT: kshiftrq $9, %k7, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $54, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT: kshiftrq $10, %k7, %k2
+; AVX512F-32-NEXT: kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: movl %esi, %edx
+; AVX512F-32-NEXT: shrl $12, %edx
+; AVX512F-32-NEXT: andl $15, %edx
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $53, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k7, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $11, %k3, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $52, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k3, %k0, %k3
+; AVX512F-32-NEXT: kshiftrq $12, %k3, %k0
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $6, %dl
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $14, %ecx
+; AVX512F-32-NEXT: andl $3, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $51, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k3, %k7, %k3
+; AVX512F-32-NEXT: kshiftrq $13, %k3, %k7
+; AVX512F-32-NEXT: kxorq %k4, %k7, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $15, %ecx
+; AVX512F-32-NEXT: andl $1, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $50, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $14, %k3, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $49, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $15, %k3, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $48, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $16, %k3, %k4
+; AVX512F-32-NEXT: kmovd %eax, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: movl %esi, %edx
+; AVX512F-32-NEXT: shrl $24, %edx
+; AVX512F-32-NEXT: # kill: def %al killed %al killed %eax def %eax
; AVX512F-32-NEXT: shrb $7, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %ch, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $47, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $17, %k3, %k4
+; AVX512F-32-NEXT: kxorq %k5, %k4, %k4
+; AVX512F-32-NEXT: kmovd %eax, %k5
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $46, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $18, %k4, %k3
+; AVX512F-32-NEXT: kxorq %k6, %k3, %k6
+; AVX512F-32-NEXT: kmovd %edx, %k3
+; AVX512F-32-NEXT: # kill: def %dl killed %dl killed %edx def %edx
+; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %ch, %dl
-; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $45, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k4, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $19, %k6, %k4
+; AVX512F-32-NEXT: kxorq %k1, %k4, %k1
+; AVX512F-32-NEXT: kmovd %eax, %k4
+; AVX512F-32-NEXT: movl %edx, %ecx
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
-; AVX512F-32-NEXT: shrl $12, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $13, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
-; AVX512F-32-NEXT: shrl $14, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
-; AVX512F-32-NEXT: shrl $15, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %ebx
-; AVX512F-32-NEXT: shrl $16, %ebx
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $44, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k6, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $20, %k1, %k6
+; AVX512F-32-NEXT: kxorq %k2, %k6, %k6
+; AVX512F-32-NEXT: kmovd %edx, %k2
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: andb $15, %al
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $43, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT: kshiftrq $21, %k1, %k6
+; AVX512F-32-NEXT: kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $29, %ecx
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $42, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $22, %k6, %k1
+; AVX512F-32-NEXT: kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $41, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $23, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT: kmovd %edx, %k5
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $2, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $40, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $24, %k7, %k6
+; AVX512F-32-NEXT: kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT: kmovd %eax, %k6
+; AVX512F-32-NEXT: movb %bh, %al
; AVX512F-32-NEXT: andb $15, %al
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $39, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k7, %k3, %k7
+; AVX512F-32-NEXT: kshiftrq $25, %k7, %k3
+; AVX512F-32-NEXT: kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k3
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $38, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT: kshiftrq $26, %k7, %k4
+; AVX512F-32-NEXT: kxorq %k2, %k4, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: movl %esi, %edx
+; AVX512F-32-NEXT: shrl $28, %edx
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $37, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $27, %k2, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $36, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k2, %k0, %k2
+; AVX512F-32-NEXT: kshiftrq $28, %k2, %k0
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %ebx, %edx
+; AVX512F-32-NEXT: shrb $6, %dl
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $30, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $35, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT: kshiftrq $29, %k2, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $31, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $34, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $30, %k1, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $33, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT: kshiftrq $31, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $32, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT: kshiftrq $32, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ebx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $7, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $31, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT: kshiftrq $33, %k1, %k2
+; AVX512F-32-NEXT: kxorq %k5, %k2, %k2
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $30, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $34, %k2, %k1
+; AVX512F-32-NEXT: kxorq %k6, %k1, %k5
+; AVX512F-32-NEXT: kmovd %ecx, %k6
+; AVX512F-32-NEXT: movb %bh, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: andb $2, %cl
+; AVX512F-32-NEXT: shrb %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $29, %k5, %k5
+; AVX512F-32-NEXT: kxorq %k2, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $35, %k5, %k2
+; AVX512F-32-NEXT: kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: movl %eax, %ecx
; AVX512F-32-NEXT: shrb $2, %al
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %al
-; AVX512F-32-NEXT: shrb $4, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %al
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %al
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT: shrb $7, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $24, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $28, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k5, %k3, %k5
+; AVX512F-32-NEXT: kshiftrq $36, %k5, %k3
+; AVX512F-32-NEXT: kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT: kmovd %eax, %k3
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: shrl $16, %eax
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $27, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k5, %k4, %k5
+; AVX512F-32-NEXT: kshiftrq $37, %k5, %k4
+; AVX512F-32-NEXT: kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $13, %ecx
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $26, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k5, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $38, %k0, %k5
+; AVX512F-32-NEXT: kxorq %k7, %k5, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k5
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $25, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $39, %k7, %k0
+; AVX512F-32-NEXT: kxorq %k6, %k0, %k6
+; AVX512F-32-NEXT: kmovd %edx, %k0
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) # 8-byte Spill
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: andb $15, %cl
+; AVX512F-32-NEXT: movl %ecx, %edx
+; AVX512F-32-NEXT: shrb $2, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $24, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $40, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $12, %ecx
+; AVX512F-32-NEXT: andl $15, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $23, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $41, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $14, %ecx
+; AVX512F-32-NEXT: andl $3, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $22, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k6, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $42, %k2, %k6
+; AVX512F-32-NEXT: kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $15, %ecx
+; AVX512F-32-NEXT: andl $1, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $21, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k2, %k3, %k2
+; AVX512F-32-NEXT: kshiftrq $43, %k2, %k3
+; AVX512F-32-NEXT: kxorq %k4, %k3, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k6
+; AVX512F-32-NEXT: shrb $3, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $20, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k2, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $44, %k3, %k2
+; AVX512F-32-NEXT: kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT: kmovd %edx, %k2
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $19, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k3, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $45, %k0, %k3
+; AVX512F-32-NEXT: kxorq %k5, %k3, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k3
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $18, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT: kshiftrq $46, %k0, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k5
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $6, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $17, %k5, %k5
+; AVX512F-32-NEXT: kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT: kshiftrq $47, %k0, %k5
+; AVX512F-32-NEXT: kxorq %k6, %k5, %k5
+; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $16, %k5, %k5
+; AVX512F-32-NEXT: kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT: kshiftrq $48, %k0, %k5
+; AVX512F-32-NEXT: kmovd %eax, %k6
+; AVX512F-32-NEXT: kxorq %k6, %k5, %k6
+; AVX512F-32-NEXT: kmovd %ecx, %k5
+; AVX512F-32-NEXT: movl %ebx, %edx
+; AVX512F-32-NEXT: shrl $24, %edx
+; AVX512F-32-NEXT: # kill: def %al killed %al killed %eax def %eax
+; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $15, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $49, %k6, %k0
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k7 # 8-byte Reload
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $14, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $50, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k7
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: # kill: def %dl killed %dl killed %edx def %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: andb $2, %al
+; AVX512F-32-NEXT: shrb %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $13, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $51, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k2, %k7, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k2
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $12, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $52, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT: kmovd %edx, %k3
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $11, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $53, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k4, %k7, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k4
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $10, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $54, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k5
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $9, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $55, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $8, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k6, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $56, %k0, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $7, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $57, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $6, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $58, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k3, %k1, %k1
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $5, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $59, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k4, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $4, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $60, %k0, %k1
+; AVX512F-32-NEXT: kmovd %eax, %k2
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $30, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $3, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $61, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k5, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $2, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $62, %k0, %k1
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $1, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl %esi, %eax
-; AVX512F-32-NEXT: adcxl %ecx, %edx
-; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: adcl %ebx, %edx
+; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: popl %esi
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
%res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
@@ -2453,7 +2389,7 @@ declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) noun
define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512BW-LABEL: test_ucmp_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: vpcmpltub %zmm1, %zmm0, %k0
@@ -2474,12 +2410,12 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512BW-NEXT: kxnorq %k0, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_ucmp_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $60, %esp
-; AVX512F-32-NEXT: .Lcfi10:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 64
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -2488,28 +2424,29 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
%res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
@@ -2531,7 +2468,7 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
; AVX512BW-LABEL: test_mask_x86_avx512_ucmp_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
@@ -2554,768 +2491,586 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
; AVX512BW-NEXT: addq %rdi, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: pushl %ebx
-; AVX512F-32-NEXT: .Lcfi11:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: pushl %esi
-; AVX512F-32-NEXT: .Lcfi12:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT: subl $60, %esp
-; AVX512F-32-NEXT: .Lcfi13:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
-; AVX512F-32-NEXT: .Lcfi14:
+; AVX512F-32-NEXT: subl $68, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 80
; AVX512F-32-NEXT: .cfi_offset %esi, -12
-; AVX512F-32-NEXT: .Lcfi15:
; AVX512F-32-NEXT: .cfi_offset %ebx, -8
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: movb %cl, %bl
-; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %cl, %dl
-; AVX512F-32-NEXT: andb $2, %dl
-; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
-; AVX512F-32-NEXT: shrb $2, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: movb %cl, %bl
-; AVX512F-32-NEXT: shrb $4, %bl
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: kmovd %ecx, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: kmovd %ebx, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $7, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %ch, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %ch, %dl
-; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
-; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
-; AVX512F-32-NEXT: shrl $12, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $13, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
-; AVX512F-32-NEXT: shrl $14, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
-; AVX512F-32-NEXT: shrl $15, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $16, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: andb $2, %dl
-; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %bl
-; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movb %bl, %dl
-; AVX512F-32-NEXT: shrb $2, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: shrb $4, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: shrb $5, %dl
-; AVX512F-32-NEXT: andb $1, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: shrb $6, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
-; AVX512F-32-NEXT: shrb $7, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $24, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
-; AVX512F-32-NEXT: andb $2, %dl
-; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: andb $2, %cl
+; AVX512F-32-NEXT: shrb %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %edx, %ecx
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: movl %ecx, %esi
-; AVX512F-32-NEXT: shrl $29, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %esi, %eax
-; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %esi, %eax
-; AVX512F-32-NEXT: shrl $31, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovd %edx, %k2
+; AVX512F-32-NEXT: movb %bh, %dl
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k3
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $6, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k6
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $7, %cl
+; AVX512F-32-NEXT: kmovd %ebx, %k5
+; AVX512F-32-NEXT: kshiftrq $1, %k5, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $62, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k5, %k1, %k7
+; AVX512F-32-NEXT: kshiftrq $2, %k7, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k5
+; AVX512F-32-NEXT: movb %bh, %cl
; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %dl
+; AVX512F-32-NEXT: movl %ebx, %esi
+; AVX512F-32-NEXT: andb $2, %cl
+; AVX512F-32-NEXT: shrb %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $61, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT: kshiftrq $3, %k7, %k2
+; AVX512F-32-NEXT: kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: movl %edx, %ecx
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $60, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $4, %k0, %k7
+; AVX512F-32-NEXT: kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT: kmovd %edx, %k3
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $59, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $5, %k7, %k0
+; AVX512F-32-NEXT: kxorq %k4, %k0, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $13, %ecx
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $58, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT: kshiftrq $6, %k7, %k4
+; AVX512F-32-NEXT: kxorq %k6, %k4, %k6
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %eax, %ebx
+; AVX512F-32-NEXT: andb $2, %bl
+; AVX512F-32-NEXT: shrb %bl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $57, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $7, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT: kmovd %ebx, %k5
+; AVX512F-32-NEXT: movl %edx, %ecx
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $4, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %cl, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $56, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $8, %k7, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT: kmovd %edx, %k6
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $55, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT: kshiftrq $9, %k7, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $54, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k7
+; AVX512F-32-NEXT: kshiftrq $10, %k7, %k2
+; AVX512F-32-NEXT: kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: movl %esi, %edx
+; AVX512F-32-NEXT: shrl $12, %edx
+; AVX512F-32-NEXT: andl $15, %edx
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $53, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k7, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $11, %k3, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $52, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k3, %k0, %k3
+; AVX512F-32-NEXT: kshiftrq $12, %k3, %k0
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $6, %dl
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $14, %ecx
+; AVX512F-32-NEXT: andl $3, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $51, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k3, %k7, %k3
+; AVX512F-32-NEXT: kshiftrq $13, %k3, %k7
+; AVX512F-32-NEXT: kxorq %k4, %k7, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $15, %ecx
+; AVX512F-32-NEXT: andl $1, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $50, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $14, %k3, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $49, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $15, %k3, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $48, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $16, %k3, %k4
+; AVX512F-32-NEXT: kmovd %eax, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k4
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: movl %esi, %edx
+; AVX512F-32-NEXT: shrl $24, %edx
+; AVX512F-32-NEXT: # kill: def %al killed %al killed %eax def %eax
; AVX512F-32-NEXT: shrb $7, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %ch, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $47, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k3
+; AVX512F-32-NEXT: kshiftrq $17, %k3, %k4
+; AVX512F-32-NEXT: kxorq %k5, %k4, %k4
+; AVX512F-32-NEXT: kmovd %eax, %k5
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $46, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k3, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $18, %k4, %k3
+; AVX512F-32-NEXT: kxorq %k6, %k3, %k6
+; AVX512F-32-NEXT: kmovd %edx, %k3
+; AVX512F-32-NEXT: # kill: def %dl killed %dl killed %edx def %edx
+; AVX512F-32-NEXT: andb $15, %dl
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %ch, %dl
-; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $45, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k4, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $19, %k6, %k4
+; AVX512F-32-NEXT: kxorq %k1, %k4, %k1
+; AVX512F-32-NEXT: kmovd %eax, %k4
+; AVX512F-32-NEXT: movl %edx, %ecx
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $61440, %eax # imm = 0xF000
-; AVX512F-32-NEXT: shrl $12, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $13, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $49152, %eax # imm = 0xC000
-; AVX512F-32-NEXT: shrl $14, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andl $32768, %eax # imm = 0x8000
-; AVX512F-32-NEXT: shrl $15, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %ebx
-; AVX512F-32-NEXT: shrl $16, %ebx
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $44, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k6, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $20, %k1, %k6
+; AVX512F-32-NEXT: kxorq %k2, %k6, %k6
+; AVX512F-32-NEXT: kmovd %edx, %k2
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: andb $15, %al
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $43, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT: kshiftrq $21, %k1, %k6
+; AVX512F-32-NEXT: kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $29, %ecx
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $42, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $22, %k6, %k1
+; AVX512F-32-NEXT: kxorq %k7, %k1, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: movb %bl, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $41, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $23, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT: kmovd %edx, %k5
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $2, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $40, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $24, %k7, %k6
+; AVX512F-32-NEXT: kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT: kmovd %eax, %k6
+; AVX512F-32-NEXT: movb %bh, %al
; AVX512F-32-NEXT: andb $15, %al
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $39, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k7, %k3, %k7
+; AVX512F-32-NEXT: kshiftrq $25, %k7, %k3
+; AVX512F-32-NEXT: kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k3
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $38, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k7
+; AVX512F-32-NEXT: kshiftrq $26, %k7, %k4
+; AVX512F-32-NEXT: kxorq %k2, %k4, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: movl %esi, %edx
+; AVX512F-32-NEXT: shrl $28, %edx
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $37, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $27, %k2, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $36, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k2, %k0, %k2
+; AVX512F-32-NEXT: kshiftrq $28, %k2, %k0
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %ebx, %edx
+; AVX512F-32-NEXT: shrb $6, %dl
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $30, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $35, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT: kshiftrq $29, %k2, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k1
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: movl %esi, %ecx
+; AVX512F-32-NEXT: shrl $31, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $34, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $30, %k1, %k2
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $33, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT: kshiftrq $31, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $32, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT: kshiftrq $32, %k1, %k2
+; AVX512F-32-NEXT: kmovd %ebx, %k7
+; AVX512F-32-NEXT: kxorq %k7, %k2, %k2
+; AVX512F-32-NEXT: kmovd %edx, %k7
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrb $7, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $31, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k1
+; AVX512F-32-NEXT: kshiftrq $33, %k1, %k2
+; AVX512F-32-NEXT: kxorq %k5, %k2, %k2
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $30, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k1, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $34, %k2, %k1
+; AVX512F-32-NEXT: kxorq %k6, %k1, %k5
+; AVX512F-32-NEXT: kmovd %ecx, %k6
+; AVX512F-32-NEXT: movb %bh, %cl
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: andb $2, %cl
+; AVX512F-32-NEXT: shrb %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $29, %k5, %k5
+; AVX512F-32-NEXT: kxorq %k2, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $35, %k5, %k2
+; AVX512F-32-NEXT: kxorq %k3, %k2, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: movl %eax, %ecx
; AVX512F-32-NEXT: shrb $2, %al
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %al
-; AVX512F-32-NEXT: shrb $4, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %al
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %bl, %al
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT: shrb $7, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $24, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $28, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k5, %k3, %k5
+; AVX512F-32-NEXT: kshiftrq $36, %k5, %k3
+; AVX512F-32-NEXT: kxorq %k4, %k3, %k4
+; AVX512F-32-NEXT: kmovd %eax, %k3
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: shrl $16, %eax
+; AVX512F-32-NEXT: shrb $3, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $27, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k5, %k4, %k5
+; AVX512F-32-NEXT: kshiftrq $37, %k5, %k4
+; AVX512F-32-NEXT: kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $13, %ecx
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $26, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k5, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $38, %k0, %k5
+; AVX512F-32-NEXT: kxorq %k7, %k5, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k5
+; AVX512F-32-NEXT: movl %eax, %edx
; AVX512F-32-NEXT: andb $2, %dl
; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $25, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $39, %k7, %k0
+; AVX512F-32-NEXT: kxorq %k6, %k0, %k6
+; AVX512F-32-NEXT: kmovd %edx, %k0
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp) # 8-byte Spill
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: andb $15, %cl
+; AVX512F-32-NEXT: movl %ecx, %edx
+; AVX512F-32-NEXT: shrb $2, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $24, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k7, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $40, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k7
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $12, %ecx
+; AVX512F-32-NEXT: andl $15, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $23, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $41, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k2, %k7, %k2
+; AVX512F-32-NEXT: kmovd %ecx, %k0
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $14, %ecx
+; AVX512F-32-NEXT: andl $3, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $22, %k2, %k2
+; AVX512F-32-NEXT: kxorq %k6, %k2, %k2
+; AVX512F-32-NEXT: kshiftrq $42, %k2, %k6
+; AVX512F-32-NEXT: kxorq %k3, %k6, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k7
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $15, %ecx
+; AVX512F-32-NEXT: andl $1, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $21, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k2, %k3, %k2
+; AVX512F-32-NEXT: kshiftrq $43, %k2, %k3
+; AVX512F-32-NEXT: kxorq %k4, %k3, %k3
+; AVX512F-32-NEXT: kmovd %ecx, %k6
+; AVX512F-32-NEXT: shrb $3, %dl
+; AVX512F-32-NEXT: kshiftlq $63, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $20, %k3, %k3
+; AVX512F-32-NEXT: kxorq %k2, %k3, %k3
+; AVX512F-32-NEXT: kshiftrq $44, %k3, %k2
+; AVX512F-32-NEXT: kxorq %k0, %k2, %k0
+; AVX512F-32-NEXT: kmovd %edx, %k2
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $4, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $19, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k3, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $45, %k0, %k3
+; AVX512F-32-NEXT: kxorq %k5, %k3, %k4
+; AVX512F-32-NEXT: kmovd %ecx, %k3
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $5, %cl
+; AVX512F-32-NEXT: andb $1, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k4, %k4
+; AVX512F-32-NEXT: kshiftrq $18, %k4, %k4
+; AVX512F-32-NEXT: kxorq %k0, %k4, %k0
+; AVX512F-32-NEXT: kshiftrq $46, %k0, %k4
+; AVX512F-32-NEXT: kxorq %k7, %k4, %k5
+; AVX512F-32-NEXT: kmovd %ecx, %k4
+; AVX512F-32-NEXT: movl %eax, %ecx
+; AVX512F-32-NEXT: shrb $6, %cl
+; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $17, %k5, %k5
+; AVX512F-32-NEXT: kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT: kshiftrq $47, %k0, %k5
+; AVX512F-32-NEXT: kxorq %k6, %k5, %k5
+; AVX512F-32-NEXT: kshiftlq $63, %k5, %k5
+; AVX512F-32-NEXT: kshiftrq $16, %k5, %k5
+; AVX512F-32-NEXT: kxorq %k0, %k5, %k0
+; AVX512F-32-NEXT: kshiftrq $48, %k0, %k5
+; AVX512F-32-NEXT: kmovd %eax, %k6
+; AVX512F-32-NEXT: kxorq %k6, %k5, %k6
+; AVX512F-32-NEXT: kmovd %ecx, %k5
+; AVX512F-32-NEXT: movl %ebx, %edx
+; AVX512F-32-NEXT: shrl $24, %edx
+; AVX512F-32-NEXT: # kill: def %al killed %al killed %eax def %eax
+; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: kshiftlq $63, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $15, %k6, %k6
+; AVX512F-32-NEXT: kxorq %k0, %k6, %k6
+; AVX512F-32-NEXT: kshiftrq $49, %k6, %k0
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k7 # 8-byte Reload
+; AVX512F-32-NEXT: kxorq %k7, %k0, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $14, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $50, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k1, %k7, %k7
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: movb %al, %dl
+; AVX512F-32-NEXT: # kill: def %dl killed %dl killed %edx def %edx
; AVX512F-32-NEXT: andb $15, %dl
-; AVX512F-32-NEXT: movb %dl, %al
+; AVX512F-32-NEXT: andb $2, %al
+; AVX512F-32-NEXT: shrb %al
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $13, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $51, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k2, %k7, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k2
+; AVX512F-32-NEXT: movl %edx, %eax
; AVX512F-32-NEXT: shrb $2, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $12, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $52, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k3, %k7, %k7
+; AVX512F-32-NEXT: kmovd %edx, %k3
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $11, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $53, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k4, %k7, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k4
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $10, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $54, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k5, %k7, %k7
+; AVX512F-32-NEXT: kmovd %eax, %k5
+; AVX512F-32-NEXT: kshiftlq $63, %k7, %k7
+; AVX512F-32-NEXT: kshiftrq $9, %k7, %k7
+; AVX512F-32-NEXT: kxorq %k6, %k7, %k6
+; AVX512F-32-NEXT: kshiftrq $55, %k6, %k7
+; AVX512F-32-NEXT: kxorq %k0, %k7, %k0
+; AVX512F-32-NEXT: kshiftlq $63, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $8, %k0, %k0
+; AVX512F-32-NEXT: kxorq %k6, %k0, %k0
+; AVX512F-32-NEXT: kshiftrq $56, %k0, %k6
+; AVX512F-32-NEXT: kxorq %k1, %k6, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $7, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $57, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $6, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $58, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k3, %k1, %k1
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $5, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $59, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k4, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $4, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $60, %k0, %k1
+; AVX512F-32-NEXT: kmovd %eax, %k2
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: movl %ebx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: movl %ebx, %ecx
+; AVX512F-32-NEXT: shrl $30, %ecx
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $3, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $61, %k0, %k1
+; AVX512F-32-NEXT: kxorq %k5, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $2, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
+; AVX512F-32-NEXT: kshiftrq $62, %k0, %k1
+; AVX512F-32-NEXT: kmovd %ecx, %k2
+; AVX512F-32-NEXT: kxorq %k2, %k1, %k1
+; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
+; AVX512F-32-NEXT: kshiftrq $1, %k1, %k1
+; AVX512F-32-NEXT: kxorq %k0, %k1, %k0
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1}
-; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
-; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
-; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl %esi, %eax
-; AVX512F-32-NEXT: adcxl %ecx, %edx
-; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: adcl %ebx, %edx
+; AVX512F-32-NEXT: addl $68, %esp
; AVX512F-32-NEXT: popl %esi
; AVX512F-32-NEXT: popl %ebx
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
%res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
@@ -3339,7 +3094,7 @@ declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nou
define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512BW-LABEL: test_cmp_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: vpcmpgtw %zmm0, %zmm1, %k0
@@ -3360,10 +3115,11 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512BW-NEXT: kxnord %k0, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_cmp_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: vpcmpgtw %zmm0, %zmm1, %k0
@@ -3384,6 +3140,7 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-32-NEXT: kxnord %k0, %k0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
%res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
@@ -3405,7 +3162,7 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_mask_cmp_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -3428,10 +3185,11 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
; AVX512BW-NEXT: addl %edi, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_cmp_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
@@ -3455,6 +3213,7 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %edx, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
%res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
@@ -3478,7 +3237,7 @@ declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) no
define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512BW-LABEL: test_ucmp_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
@@ -3499,10 +3258,11 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512BW-NEXT: kxnord %k0, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_ucmp_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
@@ -3523,6 +3283,7 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
; AVX512F-32-NEXT: kxnord %k0, %k0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
@@ -3544,7 +3305,7 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_mask_ucmp_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -3567,10 +3328,11 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
; AVX512BW-NEXT: addl %edi, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_ucmp_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: kmovd %ecx, %k1
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
@@ -3594,6 +3356,7 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %edx, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
@@ -3614,3 +3377,234 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
}
declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
+
+
+declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@mm512_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
+; AVX512BW-LABEL: mm512_avg_epu8:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm3
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: mm512_avg_epu8:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm3
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+
+declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@mm512_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: mm512_avg_epu16:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm3
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: mm512_avg_epu16:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm3
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpabsw %zmm0, %zmm2
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpabsw %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_w_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpabsw %zmm0, %zmm2
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpabsw %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vpabsb %zmm0, %zmm2
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vpabsb %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_b_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpabsb %zmm0, %zmm2
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpabsb %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpaddb %zmm2, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+ %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
+declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64)
+
+define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT: kmovq %k1, %rcx
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: subl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT: kmovq %k1, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
+ %res2 = add i64 %res, %res1
+ ret i64 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32)
+
+define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT: kmovd %k1, %ecx
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT: kmovd %k1, %ecx
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2)
+
+define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovq %rdi, %k1
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT: kmovq %k1, %rcx
+; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: subl $20, %esp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
+; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT: kmovq %k1, (%esp)
+; AVX512F-32-NEXT: movl (%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl $20, %esp
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
+ %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
+ %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
+ %res2 = add i64 %res, %res1
+ ret i64 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2)
+
+define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k1 {%k1}
+; AVX512BW-NEXT: kmovd %k1, %ecx
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k1 {%k1}
+; AVX512F-32-NEXT: kmovd %k1, %ecx
+; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
+ %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 4abe3df9fc2a..2fa7c2c5b8a8 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -4,12 +4,12 @@
define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
@@ -18,14 +18,14 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -38,13 +38,13 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -56,12 +56,12 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -72,14 +72,14 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_
define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1}
@@ -94,13 +94,13 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -114,12 +114,12 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %pt
define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -132,14 +132,14 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
@@ -156,13 +156,13 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3
define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
@@ -180,12 +180,12 @@ declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
@@ -194,14 +194,14 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -214,13 +214,13 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -232,12 +232,12 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -248,14 +248,14 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1}
@@ -270,13 +270,13 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -293,12 +293,12 @@ declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>)
define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
@@ -307,14 +307,14 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -327,13 +327,13 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -345,12 +345,12 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -361,14 +361,14 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr
define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1}
@@ -383,13 +383,13 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt
define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -403,12 +403,12 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %p
define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -421,14 +421,14 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
@@ -445,13 +445,13 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <
define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
@@ -469,12 +469,12 @@ declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
@@ -483,14 +483,14 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -503,13 +503,13 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -521,12 +521,12 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b,
define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -537,14 +537,14 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_
define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1}
@@ -559,13 +559,13 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -581,12 +581,12 @@ declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -595,14 +595,14 @@ define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -613,13 +613,13 @@ define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -629,12 +629,12 @@ define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -645,14 +645,14 @@ define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1}
@@ -665,13 +665,13 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -685,12 +685,12 @@ declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -699,14 +699,14 @@ define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -717,13 +717,13 @@ define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -733,12 +733,12 @@ define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -749,14 +749,14 @@ define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1}
@@ -769,13 +769,13 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -789,12 +789,12 @@ declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -803,14 +803,14 @@ define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -821,13 +821,13 @@ define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -837,12 +837,12 @@ define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -853,14 +853,14 @@ define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1}
@@ -873,13 +873,13 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_adds_epu16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -893,12 +893,12 @@ declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <3
define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rr_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -907,14 +907,14 @@ define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -925,13 +925,13 @@ define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3
define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rrkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -941,12 +941,12 @@ define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rm_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -957,14 +957,14 @@ define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1}
@@ -977,13 +977,13 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_subs_epu16_rmkz_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
@@ -999,7 +999,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16
define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
@@ -1008,7 +1008,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
@@ -1025,7 +1025,7 @@ declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i1
define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
@@ -1034,7 +1034,7 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
@@ -1051,7 +1051,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16
define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
@@ -1060,7 +1060,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
@@ -1073,66 +1073,16 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
ret <32 x i16> %res2
}
-declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_b_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
- %res2 = add <64 x i8> %res, %res1
- ret <64 x i8> %res2
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
- %res2 = add <32 x i16> %res, %res1
- ret <32 x i16> %res2
-}
-
declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {
; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_pshuf_b_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
@@ -1141,14 +1091,14 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {
define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1161,13 +1111,13 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> %
define <64 x i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8> %x1, i64 %mask) {
; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -1177,61 +1127,11 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8>
ret <64 x i8> %res2
}
-declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpabsw %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpabsw %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vpabsw %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpabsw %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
- %res2 = add <32 x i16> %res, %res1
- ret <32 x i16> %res2
-}
-
-declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vpabsb %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpabsb %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_b_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpabsb %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpabsb %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
- %res2 = add <64 x i8> %res, %res1
- ret <64 x i8> %res2
-}
-
declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
@@ -1239,7 +1139,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i1
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
@@ -1255,7 +1155,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
@@ -1263,7 +1163,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmulhw %zmm1, %zmm0, %zmm0
@@ -1279,7 +1179,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>,
define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0
@@ -1287,7 +1187,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0
@@ -1303,7 +1203,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z}
@@ -1313,7 +1213,7 @@ define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8>
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm1 {%k1}
; AVX512F-32-NEXT: vpmovwb %zmm0, %ymm2 {%k1} {z}
@@ -1333,14 +1233,14 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32)
define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi)
; AVX512BW-NEXT: vpmovwb %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpmovwb %zmm0, (%eax)
@@ -1355,7 +1255,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovswb %zmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z}
@@ -1365,7 +1265,7 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8>
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm1 {%k1}
; AVX512F-32-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z}
@@ -1385,14 +1285,14 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpmovswb %zmm0, (%rdi)
; AVX512BW-NEXT: vpmovswb %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpmovswb %zmm0, (%eax)
@@ -1407,7 +1307,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32
define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm1 {%k1}
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z}
@@ -1417,7 +1317,7 @@ define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm1 {%k1}
; AVX512F-32-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z}
@@ -1437,14 +1337,14 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi)
; AVX512BW-NEXT: vpmovuswb %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vpmovuswb %zmm0, (%eax)
@@ -1459,7 +1359,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <3
define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
@@ -1467,7 +1367,7 @@ define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
@@ -1483,7 +1383,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <1
define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
@@ -1491,7 +1391,7 @@ define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i1
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
@@ -1507,7 +1407,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32,
define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
@@ -1517,7 +1417,7 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
@@ -1537,14 +1437,14 @@ declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsadbw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psadb_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsadbw %zmm1, %zmm0, %zmm1
; AVX512F-32-NEXT: vpsadbw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddq %zmm0, %zmm1, %zmm0
@@ -1555,69 +1455,18 @@ define <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8>
ret <8 x i64> %res2
}
-declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
-
-define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k0
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: kunpckwd %k1, %k0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
- ret i32 %res
-}
-
-declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
-
-define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k0
-; AVX512BW-NEXT: kmovq %rsi, %k1
-; AVX512BW-NEXT: kunpckdq %k1, %k0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Lcfi0:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k0
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $12, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
- ret i64 %res
-}
-
declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_cvtb2mask_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: subl $12, %esp
-; AVX512F-32-NEXT: .Lcfi1:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
@@ -1633,13 +1482,13 @@ declare i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16>)
define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) {
; AVX512BW-LABEL: test_int_x86_avx512_cvtw2mask_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_cvtw2mask_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovw2m %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: retl
@@ -1651,7 +1500,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x
define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrlv32hi:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
@@ -1661,7 +1510,7 @@ define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrlv32hi:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
@@ -1681,7 +1530,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16>, <32 x i16>, <32
define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
@@ -1691,7 +1540,7 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsravw %zmm1, %zmm0, %zmm2 {%k1}
@@ -1709,14 +1558,14 @@ define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16>
define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
; AVX512F-32-NEXT: vpsravw {{\.LCPI.*}}, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> <i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9, i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51>,
@@ -1729,7 +1578,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16>, <32 x i16>, <32 x
define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_psllv32hi:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
@@ -1739,7 +1588,7 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16>
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psllv32hi:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
@@ -1759,7 +1608,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>,
define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
@@ -1769,7 +1618,7 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
@@ -1785,209 +1634,14 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
ret <32 x i16> %res4
}
-declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64)
-
-define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovq %k0, %rcx
-; AVX512BW-NEXT: vptestmb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: addq %rcx, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $20, %esp
-; AVX512F-32-NEXT: .Lcfi2:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: vptestmb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $20, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
- %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
- %res2 = add i64 %res, %res1
- ret i64 %res2
-}
-
-declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32)
-
-define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovd %k0, %ecx
-; AVX512BW-NEXT: vptestmw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: addl %ecx, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovd %k0, %ecx
-; AVX512F-32-NEXT: vptestmw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: addl %ecx, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
- %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
- %res2 = add i32 %res, %res1
- ret i32 %res2
-}
-
-declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2)
-
-define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rdi, %k1
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovq %k0, %rcx
-; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: addq %rcx, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: subl $20, %esp
-; AVX512F-32-NEXT: .Lcfi3:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 24
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, (%esp)
-; AVX512F-32-NEXT: vptestnmb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: movl (%esp), %eax
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl $20, %esp
-; AVX512F-32-NEXT: retl
- %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
- %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
- %res2 = add i64 %res, %res1
- ret i64 %res2
-}
-
-declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2)
-
-define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT: kmovd %k0, %ecx
-; AVX512BW-NEXT: vptestnmw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: addl %ecx, %eax
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovd %k0, %ecx
-; AVX512F-32-NEXT: vptestnmw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT: kmovd %k0, %eax
-; AVX512F-32-NEXT: addl %ecx, %eax
-; AVX512F-32-NEXT: retl
- %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
- %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
- %res2 = add i32 %res, %res1
- ret i32 %res2
-}
-
-declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
-
-define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovq %rsi, %k1
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: movb {{[0-9]+}}(%esp), %al
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm2
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
- %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
- %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
- %res3 = add <64 x i8> %res, %res1
- %res4 = add <64 x i8> %res2, %res3
- ret <64 x i8> %res4
-}
-
-declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
-
-define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: kmovd %esi, %k1
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1 {%k1} {z}
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm2
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: retl
- %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
- %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
- %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
- %res3 = add <32 x i16> %res, %res1
- %res4 = add <32 x i16> %res2, %res3
- ret <32 x i16> %res4
-}
-
-
define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) {
; AVX512BW-LABEL: test_x86_avx512_psll_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psll_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
@@ -1995,14 +1649,14 @@ define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) {
}
define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psll_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psll_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2014,13 +1668,13 @@ define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1
}
define <32 x i16> @test_x86_avx512_maskz_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psll_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psll_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -2034,12 +1688,12 @@ declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind r
define <32 x i16> @test_x86_avx512_pslli_w_512(<32 x i16> %a0) {
; AVX512BW-LABEL: test_x86_avx512_pslli_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_pslli_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
@@ -2047,14 +1701,14 @@ define <32 x i16> @test_x86_avx512_pslli_w_512(<32 x i16> %a0) {
}
define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_pslli_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_pslli_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw $7, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2066,13 +1720,13 @@ define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> %
}
define <32 x i16> @test_x86_avx512_maskz_pslli_w_512(<32 x i16> %a0, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_pslli_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_pslli_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -2086,12 +1740,12 @@ declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readno
define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) {
; AVX512BW-LABEL: test_x86_avx512_psra_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psra_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
@@ -2099,14 +1753,14 @@ define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) {
}
define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psra_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psra_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2118,13 +1772,13 @@ define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1
}
define <32 x i16> @test_x86_avx512_maskz_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psra_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psra_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -2138,12 +1792,12 @@ declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind r
define <32 x i16> @test_x86_avx512_psrai_w_512(<32 x i16> %a0) {
; AVX512BW-LABEL: test_x86_avx512_psrai_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsraw $7, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psrai_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsraw $7, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
@@ -2151,14 +1805,14 @@ define <32 x i16> @test_x86_avx512_psrai_w_512(<32 x i16> %a0) {
}
define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psrai_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw $7, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psrai_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw $7, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2170,13 +1824,13 @@ define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> %
}
define <32 x i16> @test_x86_avx512_maskz_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psrai_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psrai_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -2190,12 +1844,12 @@ declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readno
define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) {
; AVX512BW-LABEL: test_x86_avx512_psrl_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psrl_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
@@ -2203,14 +1857,14 @@ define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) {
}
define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psrl_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psrl_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -2222,13 +1876,13 @@ define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1
}
define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psrl_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psrl_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
@@ -2242,12 +1896,12 @@ declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind r
define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) {
; AVX512BW-LABEL: test_x86_avx512_psrli_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_psrli_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
@@ -2255,14 +1909,14 @@ define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) {
}
define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_mask_psrli_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_mask_psrli_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $7, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -2274,13 +1928,13 @@ define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> %
}
define <32 x i16> @test_x86_avx512_maskz_psrli_w_512(<32 x i16> %a0, i32 %mask) {
; AVX512BW-LABEL: test_x86_avx512_maskz_psrli_w_512:
-; AVX512BW: ## BB#0:
+; AVX512BW: ## %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_x86_avx512_maskz_psrli_w_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $7, %zmm0, %zmm0 {%k1} {z}
; AVX512F-32-NEXT: retl
diff --git a/test/CodeGen/X86/avx512bw-mask-op.ll b/test/CodeGen/X86/avx512bw-mask-op.ll
index e000ef4068f6..6d5ea0d85998 100644
--- a/test/CodeGen/X86/avx512bw-mask-op.ll
+++ b/test/CodeGen/X86/avx512bw-mask-op.ll
@@ -3,7 +3,7 @@
define i32 @mask32(i32 %x) {
; CHECK-LABEL: mask32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0
; CHECK-NEXT: knotd %k0, %k0
; CHECK-NEXT: kmovd %k0, %eax
@@ -19,7 +19,7 @@ define i32 @mask32(i32 %x) {
define i64 @mask64(i64 %x) {
; CHECK-LABEL: mask64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k0
; CHECK-NEXT: knotq %k0, %k0
; CHECK-NEXT: kmovq %k0, %rax
@@ -39,7 +39,7 @@ define i64 @mask64(i64 %x) {
define void @mask32_mem(i32* %ptr) {
; CHECK-LABEL: mask32_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd (%rdi), %k0
; CHECK-NEXT: knotd %k0, %k0
; CHECK-NEXT: kmovd %k0, (%rdi)
@@ -57,7 +57,7 @@ define void @mask32_mem(i32* %ptr) {
define void @mask64_mem(i64* %ptr) {
; CHECK-LABEL: mask64_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq (%rdi), %k0
; CHECK-NEXT: knotq %k0, %k0
; CHECK-NEXT: kmovq %k0, (%rdi)
@@ -79,7 +79,7 @@ define void @mask64_mem(i64* %ptr) {
define i32 @mand32(i32 %x, i32 %y) {
; CHECK-LABEL: mand32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: andl %esi, %eax
; CHECK-NEXT: xorl %esi, %edi
@@ -97,7 +97,7 @@ define i32 @mand32(i32 %x, i32 %y) {
define i32 @mand32_mem(<32 x i1>* %x, <32 x i1>* %y) {
; CHECK-LABEL: mand32_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd (%rdi), %k0
; CHECK-NEXT: kmovd (%rsi), %k1
; CHECK-NEXT: kandd %k1, %k0, %k2
@@ -116,7 +116,7 @@ define i32 @mand32_mem(<32 x i1>* %x, <32 x i1>* %y) {
define i64 @mand64(i64 %x, i64 %y) {
; CHECK-LABEL: mand64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: andq %rsi, %rax
; CHECK-NEXT: xorq %rsi, %rdi
@@ -134,7 +134,7 @@ define i64 @mand64(i64 %x, i64 %y) {
define i64 @mand64_mem(<64 x i1>* %x, <64 x i1>* %y) {
; CHECK-LABEL: mand64_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq (%rdi), %k0
; CHECK-NEXT: kmovq (%rsi), %k1
; CHECK-NEXT: kandq %k1, %k0, %k2
@@ -153,7 +153,7 @@ define i64 @mand64_mem(<64 x i1>* %x, <64 x i1>* %y) {
define i32 @test_v32i1_add(i32 %x, i32 %y) {
; CHECK-LABEL: test_v32i1_add:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxord %k1, %k0, %k0
@@ -168,7 +168,7 @@ define i32 @test_v32i1_add(i32 %x, i32 %y) {
define i32 @test_v32i1_sub(i32 %x, i32 %y) {
; CHECK-LABEL: test_v32i1_sub:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kxord %k1, %k0, %k0
@@ -183,7 +183,7 @@ define i32 @test_v32i1_sub(i32 %x, i32 %y) {
define i32 @test_v32i1_mul(i32 %x, i32 %y) {
; CHECK-LABEL: test_v32i1_mul:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: kandd %k1, %k0, %k0
@@ -198,7 +198,7 @@ define i32 @test_v32i1_mul(i32 %x, i32 %y) {
define i64 @test_v64i1_add(i64 %x, i64 %y) {
; CHECK-LABEL: test_v64i1_add:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k0
; CHECK-NEXT: kmovq %rsi, %k1
; CHECK-NEXT: kxorq %k1, %k0, %k0
@@ -213,7 +213,7 @@ define i64 @test_v64i1_add(i64 %x, i64 %y) {
define i64 @test_v64i1_sub(i64 %x, i64 %y) {
; CHECK-LABEL: test_v64i1_sub:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k0
; CHECK-NEXT: kmovq %rsi, %k1
; CHECK-NEXT: kxorq %k1, %k0, %k0
@@ -228,7 +228,7 @@ define i64 @test_v64i1_sub(i64 %x, i64 %y) {
define i64 @test_v64i1_mul(i64 %x, i64 %y) {
; CHECK-LABEL: test_v64i1_mul:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k0
; CHECK-NEXT: kmovq %rsi, %k1
; CHECK-NEXT: kandq %k1, %k0, %k0
diff --git a/test/CodeGen/X86/avx512bw-mov.ll b/test/CodeGen/X86/avx512bw-mov.ll
index 11bb431414a0..7158fb262c0d 100644
--- a/test/CodeGen/X86/avx512bw-mov.ll
+++ b/test/CodeGen/X86/avx512bw-mov.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s
define <64 x i8> @test1(i8 * %addr) {
; CHECK-LABEL: test1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %zmm0
; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*
%res = load <64 x i8>, <64 x i8>* %vaddr, align 1
@@ -13,8 +13,8 @@ define <64 x i8> @test1(i8 * %addr) {
define void @test2(i8 * %addr, <64 x i8> %data) {
; CHECK-LABEL: test2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu8 %zmm0, (%rdi)
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups %zmm0, (%rdi)
; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <64 x i8>*
store <64 x i8>%data, <64 x i8>* %vaddr, align 1
@@ -23,8 +23,8 @@ define void @test2(i8 * %addr, <64 x i8> %data) {
define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqb %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -37,8 +37,8 @@ define <64 x i8> @test3(i8 * %addr, <64 x i8> %old, <64 x i8> %mask1) {
define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
; CHECK-LABEL: test4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -51,8 +51,8 @@ define <64 x i8> @test4(i8 * %addr, <64 x i8> %mask1) {
define <32 x i16> @test5(i8 * %addr) {
; CHECK-LABEL: test5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %zmm0
; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*
%res = load <32 x i16>, <32 x i16>* %vaddr, align 1
@@ -61,8 +61,8 @@ define <32 x i16> @test5(i8 * %addr) {
define void @test6(i8 * %addr, <32 x i16> %data) {
; CHECK-LABEL: test6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu16 %zmm0, (%rdi)
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups %zmm0, (%rdi)
; CHECK-NEXT: retq
%vaddr = bitcast i8* %addr to <32 x i16>*
store <32 x i16>%data, <32 x i16>* %vaddr, align 1
@@ -71,8 +71,8 @@ define void @test6(i8 * %addr, <32 x i16> %data) {
define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
; CHECK-LABEL: test7:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqw %zmm2, %zmm1, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -85,8 +85,8 @@ define <32 x i16> @test7(i8 * %addr, <32 x i16> %old, <32 x i16> %mask1) {
define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
; CHECK-LABEL: test8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpcmpneqw %zmm1, %zmm0, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -99,13 +99,12 @@ define <32 x i16> @test8(i8 * %addr, <32 x i16> %mask1) {
define <16 x i8> @test_mask_load_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
; CHECK-LABEL: test_mask_load_16xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k0
-; CHECK-NEXT: kshiftlq $48, %k0, %k0
-; CHECK-NEXT: kshiftrq $48, %k0, %k1
+; CHECK-NEXT: kmovw %k0, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %addr, i32 4, <16 x i1>%mask, <16 x i8> undef)
ret <16 x i8> %res
@@ -114,13 +113,12 @@ declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
define <32 x i8> @test_mask_load_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
; CHECK-LABEL: test_mask_load_32xi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %zmm0, %k0
-; CHECK-NEXT: kshiftlq $32, %k0, %k0
-; CHECK-NEXT: kshiftrq $32, %k0, %k1
+; CHECK-NEXT: kmovd %k0, %k1
; CHECK-NEXT: vmovdqu8 (%rdi), %zmm0 {%k1} {z}
-; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; CHECK-NEXT: retq
%res = call <32 x i8> @llvm.masked.load.v32i8(<32 x i8>* %addr, i32 4, <32 x i1>%mask, <32 x i8> zeroinitializer)
ret <32 x i8> %res
@@ -129,13 +127,13 @@ declare <32 x i8> @llvm.masked.load.v32i8(<32 x i8>*, i32, <32 x i1>, <32 x i8>)
define <8 x i16> @test_mask_load_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
; CHECK-LABEL: test_mask_load_8xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %zmm0, %k0
; CHECK-NEXT: kshiftld $24, %k0, %k0
; CHECK-NEXT: kshiftrd $24, %k0, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 killed %zmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %addr, i32 4, <8 x i1>%mask, <8 x i16> undef)
ret <8 x i16> %res
@@ -144,13 +142,12 @@ declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
define <16 x i16> @test_mask_load_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
; CHECK-LABEL: test_mask_load_16xi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k0
-; CHECK-NEXT: kshiftld $16, %k0, %k0
-; CHECK-NEXT: kshiftrd $16, %k0, %k1
+; CHECK-NEXT: kmovw %k0, %k1
; CHECK-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z}
-; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; CHECK-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.masked.load.v16i16(<16 x i16>* %addr, i32 4, <16 x i1>%mask, <16 x i16> zeroinitializer)
ret <16 x i16> %res
@@ -159,12 +156,11 @@ declare <16 x i16> @llvm.masked.load.v16i16(<16 x i16>*, i32, <16 x i1>, <16 x i
define void @test_mask_store_16xi8(<16 x i1> %mask, <16 x i8>* %addr, <16 x i8> %val) {
; CHECK-LABEL: test_mask_store_16xi8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k0
-; CHECK-NEXT: kshiftlq $48, %k0, %k0
-; CHECK-NEXT: kshiftrq $48, %k0, %k1
+; CHECK-NEXT: kmovw %k0, %k1
; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
; CHECK-NEXT: retq
call void @llvm.masked.store.v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1>%mask)
@@ -174,12 +170,11 @@ declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8> %val) {
; CHECK-LABEL: test_mask_store_32xi8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0
; CHECK-NEXT: vpmovb2m %zmm0, %k0
-; CHECK-NEXT: kshiftlq $32, %k0, %k0
-; CHECK-NEXT: kshiftrq $32, %k0, %k1
+; CHECK-NEXT: kmovd %k0, %k1
; CHECK-NEXT: vmovdqu8 %zmm1, (%rdi) {%k1}
; CHECK-NEXT: retq
call void @llvm.masked.store.v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
@@ -189,8 +184,8 @@ declare void @llvm.masked.store.v32i8(<32 x i8>, <32 x i8>*, i32, <32 x i1>)
define void @test_mask_store_8xi16(<8 x i1> %mask, <8 x i16>* %addr, <8 x i16> %val) {
; CHECK-LABEL: test_mask_store_8xi16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm1 killed %xmm1 def %zmm1
; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0
; CHECK-NEXT: vpmovw2m %zmm0, %k0
; CHECK-NEXT: kshiftld $24, %k0, %k0
@@ -204,12 +199,11 @@ declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i16> %val) {
; CHECK-LABEL: test_mask_store_16xi16:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0
; CHECK-NEXT: vpmovb2m %zmm0, %k0
-; CHECK-NEXT: kshiftld $16, %k0, %k0
-; CHECK-NEXT: kshiftrd $16, %k0, %k1
+; CHECK-NEXT: kmovw %k0, %k1
; CHECK-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1}
; CHECK-NEXT: retq
call void @llvm.masked.store.v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
diff --git a/test/CodeGen/X86/avx512bw-vec-cmp.ll b/test/CodeGen/X86/avx512bw-vec-cmp.ll
index 016837e61307..3d400e1b4723 100644
--- a/test/CodeGen/X86/avx512bw-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bw-vec-cmp.ll
@@ -3,7 +3,7 @@
define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
; CHECK-LABEL: test1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -14,7 +14,7 @@ define <64 x i8> @test1(<64 x i8> %x, <64 x i8> %y) nounwind {
define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
; CHECK-LABEL: test2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -25,7 +25,7 @@ define <64 x i8> @test2(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmw %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -36,7 +36,7 @@ define <32 x i16> @test3(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1) nounwind
define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
; CHECK-LABEL: test4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmb %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -47,7 +47,7 @@ define <64 x i8> @test4(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1) nounwind {
define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwind {
; CHECK-LABEL: test5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -59,7 +59,7 @@ define <32 x i16> @test5(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %yp) nounwin
define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtw (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -71,7 +71,7 @@ define <32 x i16> @test6(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun
define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -83,7 +83,7 @@ define <32 x i16> @test7(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun
define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
@@ -95,7 +95,7 @@ define <32 x i16> @test8(<32 x i16> %x, <32 x i16> %x1, <32 x i16>* %y.ptr) noun
define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16> %y1) nounwind {
; CHECK-LABEL: test9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
@@ -109,7 +109,7 @@ define <32 x i16> @test9(<32 x i16> %x, <32 x i16> %y, <32 x i16> %x1, <32 x i16
define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y1) nounwind {
; CHECK-LABEL: test10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleb %zmm1, %zmm0, %k1
; CHECK-NEXT: vpcmpleb %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmb %zmm0, %zmm2, %zmm0 {%k1}
@@ -123,7 +123,7 @@ define <64 x i8> @test10(<64 x i8> %x, <64 x i8> %y, <64 x i8> %x1, <64 x i8> %y
define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i8> %y1) nounwind {
; CHECK-LABEL: test11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %zmm2, %zmm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
@@ -138,7 +138,7 @@ define <64 x i8> @test11(<64 x i8> %x, <64 x i8>* %y.ptr, <64 x i8> %x1, <64 x i
define <32 x i16> @test12(<32 x i16> %x, <32 x i16>* %y.ptr, <32 x i16> %x1, <32 x i16> %y1) nounwind {
; CHECK-LABEL: test12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew %zmm1, %zmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
diff --git a/test/CodeGen/X86/avx512bw-vec-test-testn.ll b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
new file mode 100644
index 000000000000..6ae2f093a2b3
--- /dev/null
+++ b/test/CodeGen/X86/avx512bw-vec-test-testn.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i32 @TEST_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_test_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <32 x i16>
+ %1 = icmp ne <32 x i16> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i64 @TEST_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_test_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <64 x i8>
+ %1 = icmp ne <64 x i8> %0, zeroinitializer
+ %2 = bitcast <64 x i1> %1 to i64
+ ret i64 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i32 @TEST_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_test_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestmw %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <32 x i16>
+ %1 = icmp ne <32 x i16> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i64 @TEST_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_test_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vptestmb %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <64 x i8>
+ %1 = icmp ne <64 x i8> %0, zeroinitializer
+ %2 = bitcast i64 %__U to <64 x i1>
+ %3 = and <64 x i1> %1, %2
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i32 @TEST_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_testn_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <32 x i16>
+ %1 = icmp eq <32 x i16> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i64 @TEST_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_testn_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <64 x i8>
+ %1 = icmp eq <64 x i8> %0, zeroinitializer
+ %2 = bitcast <64 x i1> %1 to i64
+ ret i64 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i32 @TEST_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_testn_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestnmw %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <32 x i16>
+ %1 = icmp eq <32 x i16> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i64 @TEST_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_testn_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vptestnmb %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <64 x i8>
+ %1 = icmp eq <64 x i8> %0, zeroinitializer
+ %2 = bitcast i64 %__U to <64 x i1>
+ %3 = and <64 x i1> %1, %2
+ %4 = bitcast <64 x i1> %3 to i64
+ ret i64 %4
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
index 98b346a2d733..aac83f47ae34 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
@@ -4,14 +4,591 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c
+define zeroext i16 @test_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_test_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmb %xmm0, %xmm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmb %xmm0, %xmm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp ne <16 x i8> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+define zeroext i16 @test_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_test_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_test_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp ne <16 x i8> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define i32 @test_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_test_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmb %ymm0, %ymm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_test_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmb %ymm0, %ymm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp ne <32 x i8> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+define i32 @test_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_test_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_test_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp ne <32 x i8> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i8 @test_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_test_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmw %xmm0, %xmm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmw %xmm0, %xmm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp ne <8 x i16> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_test_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_test_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp ne <8 x i16> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i16 @test_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_test_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmw %ymm0, %ymm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_test_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmw %ymm0, %ymm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp ne <16 x i16> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+define zeroext i16 @test_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_test_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_test_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp ne <16 x i16> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_testn_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmb %xmm0, %xmm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testn_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmb %xmm0, %xmm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp eq <16 x i8> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+define zeroext i16 @test_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_testn_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_testn_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp eq <16 x i8> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define i32 @test_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_testn_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmb %ymm0, %ymm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testn_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmb %ymm0, %ymm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp eq <32 x i8> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+define i32 @test_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_testn_epi8_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_testn_epi8_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp eq <32 x i8> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+define zeroext i8 @test_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_testn_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmw %xmm0, %xmm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testn_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmw %xmm0, %xmm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp eq <8 x i16> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_testn_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovd %eax, %k1
+; X32-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_testn_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp eq <8 x i16> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i16 @test_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_testn_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmw %ymm0, %ymm1, %k0
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testn_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmw %ymm0, %ymm1, %k0
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp eq <16 x i16> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+define zeroext i16 @test_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_testn_epi16_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovd %k0, %eax
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_testn_epi16_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovd %k0, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp eq <16 x i16> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define <2 x i64> @test_mm_mask_set1_epi8(<2 x i64> %__O, i16 zeroext %__M, i8 signext %__A) local_unnamed_addr #0 {
+; X32-LABEL: test_mm_mask_set1_epi8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastb %eax, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_set1_epi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %esi, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <16 x i8> undef, i8 %__A, i32 0
+ %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = bitcast <2 x i64> %__O to <16 x i8>
+ %1 = bitcast i16 %__M to <16 x i1>
+ %2 = select <16 x i1> %1, <16 x i8> %vecinit15.i.i, <16 x i8> %0
+ %3 = bitcast <16 x i8> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+
+define <2 x i64> @test_mm_maskz_set1_epi8(i16 zeroext %__M, i8 signext %__A) {
+; X32-LABEL: test_mm_maskz_set1_epi8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastb %eax, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_set1_epi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %esi, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <16 x i8> undef, i8 %__A, i32 0
+ %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = bitcast i16 %__M to <16 x i1>
+ %1 = select <16 x i1> %0, <16 x i8> %vecinit15.i.i, <16 x i8> zeroinitializer
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_mask_set1_epi8(<4 x i64> %__O, i32 %__M, i8 signext %__A){
+; X32-LABEL: test_mm256_mask_set1_epi8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastb %eax, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_set1_epi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %esi, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <32 x i8> undef, i8 %__A, i32 0
+ %vecinit31.i.i = shufflevector <32 x i8> %vecinit.i.i, <32 x i8> undef, <32 x i32> zeroinitializer
+ %0 = bitcast <4 x i64> %__O to <32 x i8>
+ %1 = bitcast i32 %__M to <32 x i1>
+ %2 = select <32 x i1> %1, <32 x i8> %vecinit31.i.i, <32 x i8> %0
+ %3 = bitcast <32 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm256_maskz_set1_epi8(i32 %__M, i8 signext %__A) {
+; X32-LABEL: test_mm256_maskz_set1_epi8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastb %eax, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_set1_epi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastb %esi, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <32 x i8> undef, i8 %__A, i32 0
+ %vecinit31.i.i = shufflevector <32 x i8> %vecinit.i.i, <32 x i8> undef, <32 x i32> zeroinitializer
+ %0 = bitcast i32 %__M to <32 x i1>
+ %1 = select <32 x i1> %0, <32 x i8> %vecinit31.i.i, <32 x i8> zeroinitializer
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_mask_set1_epi16(<4 x i64> %__O, i16 zeroext %__M, i16 signext %__A) {
+; X32-LABEL: test_mm256_mask_set1_epi16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastw %eax, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_set1_epi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <16 x i16> undef, i16 %__A, i32 0
+ %vecinit15.i.i = shufflevector <16 x i16> %vecinit.i.i, <16 x i16> undef, <16 x i32> zeroinitializer
+ %0 = bitcast <4 x i64> %__O to <16 x i16>
+ %1 = bitcast i16 %__M to <16 x i1>
+ %2 = select <16 x i1> %1, <16 x i16> %vecinit15.i.i, <16 x i16> %0
+ %3 = bitcast <16 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm256_maskz_set1_epi16(i16 zeroext %__M, i16 signext %__A) {
+; X32-LABEL: test_mm256_maskz_set1_epi16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X32-NEXT: vpbroadcastw %eax, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_set1_epi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <16 x i16> undef, i16 %__A, i32 0
+ %vecinit15.i.i = shufflevector <16 x i16> %vecinit.i.i, <16 x i16> undef, <16 x i32> zeroinitializer
+ %0 = bitcast i16 %__M to <16 x i1>
+ %1 = select <16 x i1> %0, <16 x i16> %vecinit15.i.i, <16 x i16> zeroinitializer
+ %2 = bitcast <16 x i16> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <2 x i64> @test_mm_mask_set1_epi16(<2 x i64> %__O, i8 zeroext %__M, i16 signext %__A) {
+; X32-LABEL: test_mm_mask_set1_epi16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpbroadcastw %eax, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_set1_epi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <8 x i16> undef, i16 %__A, i32 0
+ %vecinit7.i.i = shufflevector <8 x i16> %vecinit.i.i, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = bitcast <2 x i64> %__O to <8 x i16>
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i16> %vecinit7.i.i, <8 x i16> %0
+ %3 = bitcast <8 x i16> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+
+define <2 x i64> @test_mm_maskz_set1_epi16(i8 zeroext %__M, i16 signext %__A) {
+; X32-LABEL: test_mm_maskz_set1_epi16:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: kmovd %ecx, %k1
+; X32-NEXT: vpbroadcastw %eax, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_set1_epi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovd %edi, %k1
+; X64-NEXT: vpbroadcastw %esi, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <8 x i16> undef, i16 %__A, i32 0
+ %vecinit7.i.i = shufflevector <8 x i16> %vecinit.i.i, <8 x i16> undef, <8 x i32> zeroinitializer
+ %0 = bitcast i8 %__M to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x i16> %vecinit7.i.i, <8 x i16> zeroinitializer
+ %2 = bitcast <8 x i16> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+
define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastb_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -22,14 +599,13 @@ define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastb_epi8:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastb %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
@@ -44,14 +620,13 @@ define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64>
define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastb_epi8:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
@@ -65,12 +640,12 @@ define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
define <4 x i64> @test_mm256_broadcastb_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastb_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -81,14 +656,13 @@ define <4 x i64> @test_mm256_broadcastb_epi8(<2 x i64> %a0) {
define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastb_epi8:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastb %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
@@ -103,14 +677,13 @@ define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i
define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastb_epi8:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastb_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
@@ -124,12 +697,12 @@ define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastw_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -140,14 +713,14 @@ define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
define <2 x i64> @test_mm_mask_broadcastw_epi16(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastw_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpbroadcastw %xmm1, %xmm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastw %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
@@ -162,14 +735,14 @@ define <2 x i64> @test_mm_mask_broadcastw_epi16(<2 x i64> %a0, i8 %a1, <2 x i64>
define <2 x i64> @test_mm_maskz_broadcastw_epi16(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastw_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
@@ -183,12 +756,12 @@ define <2 x i64> @test_mm_maskz_broadcastw_epi16(i8 %a0, <2 x i64> %a1) {
define <4 x i64> @test_mm256_broadcastw_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastw_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -199,14 +772,13 @@ define <4 x i64> @test_mm256_broadcastw_epi16(<2 x i64> %a0) {
define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastw_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastw %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
@@ -221,14 +793,13 @@ define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x
define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastw_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT: kmovd %eax, %k1
+; X32: # %bb.0:
+; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastw_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index f4504ed07fc6..4e343eef6fad 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -1,17 +1,101 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl --show-mc-encoding| FileCheck %s
+
+declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastb %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i8> %res, %res1
+ %res4 = add <16 x i8> %res2, %res3
+ ret <16 x i8> %res4
+}
+
+
+declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
+
+
+ declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
+
+ define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastb %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
+ %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
+ %res3 = add <32 x i8> %res, %res1
+ %res4 = add <32 x i8> %res2, %res3
+ ret <32 x i8> %res4
+ }
+
+
+
+declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
+
+ define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastw %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf]
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+ }
declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
-; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
%res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
@@ -25,13 +109,13 @@ declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
-; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
@@ -45,13 +129,13 @@ declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
-; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
@@ -65,13 +149,13 @@ declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
@@ -85,13 +169,13 @@ declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
-; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
%res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
@@ -105,13 +189,13 @@ declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
-; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
@@ -125,7 +209,7 @@ declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16)
define void@test_int_x86_avx512_mask_storeu_b_128(i8* %ptr1, i8* %ptr2, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu8 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x07]
; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06]
@@ -139,10 +223,11 @@ declare void @llvm.x86.avx512.mask.storeu.b.256(i8*, <32 x i8>, i32)
define void@test_int_x86_avx512_mask_storeu_b_256(i8* %ptr1, i8* %ptr2, <32 x i8> %x1, i32 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2)
call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1)
@@ -153,7 +238,7 @@ declare void @llvm.x86.avx512.mask.storeu.w.128(i8*, <8 x i16>, i8)
define void@test_int_x86_avx512_mask_storeu_w_128(i8* %ptr1, i8* %ptr2, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu16 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7f,0x07]
; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06]
@@ -167,10 +252,11 @@ declare void @llvm.x86.avx512.mask.storeu.w.256(i8*, <16 x i16>, i16)
define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2)
call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
@@ -181,7 +267,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06]
@@ -199,7 +285,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06]
@@ -217,7 +303,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06]
@@ -235,7 +321,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x06]
@@ -253,7 +339,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <
define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_palignr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02]
; CHECK-NEXT: ## xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -261,8 +347,8 @@ define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
@@ -276,7 +362,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <
define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_palignr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02]
; CHECK-NEXT: ## ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -284,8 +370,8 @@ define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
@@ -299,7 +385,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i32, <8 x i16>,
define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03]
; CHECK-NEXT: ## xmm2 = xmm0[0,1,2,3,7,4,4,4]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
@@ -307,8 +393,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4]
; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -322,7 +408,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i32, <16 x i16
define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03]
; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
@@ -330,8 +416,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1,
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -345,7 +431,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i32, <8 x i16>,
define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03]
; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0,4,5,6,7]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
@@ -353,8 +439,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7]
; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -368,7 +454,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i32, <16 x i16
define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03]
; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
@@ -376,8 +462,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1,
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -389,9 +475,10 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1,
define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
ret i32 %res
@@ -399,10 +486,11 @@ define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
ret i32 %res
@@ -412,10 +500,11 @@ declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)
define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
ret i16 %res
@@ -423,11 +512,12 @@ define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
ret i16 %res
@@ -437,9 +527,10 @@ declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)
define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
ret i32 %res
@@ -447,10 +538,11 @@ define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
ret i32 %res
@@ -460,10 +552,11 @@ declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)
define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
ret i16 %res
@@ -471,11 +564,12 @@ define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
ret i16 %res
@@ -485,10 +579,10 @@ declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
@@ -496,11 +590,11 @@ define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
@@ -510,10 +604,10 @@ declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
@@ -521,11 +615,11 @@ define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
@@ -535,10 +629,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)
define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
ret i16 %res
@@ -546,11 +640,11 @@ define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
ret i16 %res
@@ -560,10 +654,10 @@ declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)
define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
ret i8 %res
@@ -571,11 +665,11 @@ define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
ret i8 %res
@@ -587,7 +681,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16
define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -605,7 +699,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16
define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -623,7 +717,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32
define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhb_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -641,7 +735,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32
define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklb_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -659,7 +753,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8
define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -677,7 +771,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8
define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -695,7 +789,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>,
define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklw_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -713,7 +807,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>,
define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhw_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -729,7 +823,7 @@ define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x
define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_add_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -738,7 +832,7 @@ define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -749,7 +843,7 @@ define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i
define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -759,7 +853,7 @@ define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %m
define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -769,7 +863,7 @@ define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -781,7 +875,7 @@ define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <
define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -794,7 +888,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_add_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -803,7 +897,7 @@ define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -814,7 +908,7 @@ define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16
define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -824,7 +918,7 @@ define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i1
define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -834,7 +928,7 @@ define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b)
define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -846,7 +940,7 @@ define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -859,7 +953,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16>, <16 x i16>, <16
define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_sub_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -868,7 +962,7 @@ define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -879,7 +973,7 @@ define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i
define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -889,7 +983,7 @@ define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %m
define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -899,7 +993,7 @@ define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -911,7 +1005,7 @@ define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <
define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -924,7 +1018,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_sub_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -933,7 +1027,7 @@ define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -944,7 +1038,7 @@ define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16
define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -954,7 +1048,7 @@ define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i1
define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -964,7 +1058,7 @@ define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b)
define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -976,7 +1070,7 @@ define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -989,7 +1083,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16>, <16 x i16>, <16
define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; CHECK-LABEL: test_mask_add_epi16_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -998,7 +1092,7 @@ define <32 x i16> @test_mask_add_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
@@ -1009,7 +1103,7 @@ define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32
define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1019,7 +1113,7 @@ define <32 x i16> @test_mask_add_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i3
define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi16_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
@@ -1029,7 +1123,7 @@ define <32 x i16> @test_mask_add_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b)
define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
@@ -1041,7 +1135,7 @@ define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <32 x i16> @test_mask_add_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_add_epi16_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1054,7 +1148,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; CHECK-LABEL: test_mask_sub_epi16_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -1063,7 +1157,7 @@ define <32 x i16> @test_mask_sub_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
@@ -1074,7 +1168,7 @@ define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32
define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1084,7 +1178,7 @@ define <32 x i16> @test_mask_sub_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i3
define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi16_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
@@ -1094,7 +1188,7 @@ define <32 x i16> @test_mask_sub_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b)
define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
@@ -1106,7 +1200,7 @@ define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b
define <32 x i16> @test_mask_sub_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_sub_epi16_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1119,7 +1213,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16>, <32 x i16>, <32
define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
; CHECK-LABEL: test_mask_mullo_epi16_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
@@ -1128,7 +1222,7 @@ define <32 x i16> @test_mask_mullo_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
@@ -1139,7 +1233,7 @@ define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1149,7 +1243,7 @@ define <32 x i16> @test_mask_mullo_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b,
define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi16_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd5,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i16>, <32 x i16>* %ptr_b
@@ -1159,7 +1253,7 @@ define <32 x i16> @test_mask_mullo_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_
define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
@@ -1171,7 +1265,7 @@ define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
define <32 x i16> @test_mask_mullo_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd5,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1184,7 +1278,7 @@ declare <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16>, <32 x i16>, <32
define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_mullo_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd5,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -1193,7 +1287,7 @@ define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1204,7 +1298,7 @@ define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1214,7 +1308,7 @@ define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8
define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd5,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -1224,7 +1318,7 @@ define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1236,7 +1330,7 @@ define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1249,7 +1343,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16>, <8 x i16>, <8 x i
define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_mullo_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd5,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -1258,7 +1352,7 @@ define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1269,7 +1363,7 @@ define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <
define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1279,7 +1373,7 @@ define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b,
define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd5,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -1289,7 +1383,7 @@ define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_
define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1301,7 +1395,7 @@ define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr
define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_mullo_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1316,7 +1410,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x
define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1]
; CHECK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1]
@@ -1332,7 +1426,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x
define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1]
@@ -1348,7 +1442,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i
define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1]
@@ -1364,7 +1458,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1]
; CHECK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1]
@@ -1380,7 +1474,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x
define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2,i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1]
; CHECK-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1]
@@ -1396,7 +1490,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x
define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1]
@@ -1412,7 +1506,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i
define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1]
@@ -1428,7 +1522,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1]
; CHECK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1]
@@ -1444,7 +1538,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x
define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1]
; CHECK-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1]
@@ -1460,7 +1554,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x
define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1]
@@ -1476,7 +1570,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i
define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1]
@@ -1492,7 +1586,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1]
; CHECK-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1]
@@ -1508,7 +1602,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x
define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1]
; CHECK-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1]
@@ -1524,7 +1618,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x
define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1]
@@ -1540,7 +1634,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i
define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1]
@@ -1556,7 +1650,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1]
; CHECK-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1]
@@ -1572,13 +1666,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1592,12 +1686,12 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
@@ -1612,13 +1706,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1]
; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xc1]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -1632,13 +1726,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1]
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xc1]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -1652,13 +1746,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16>, <8 x i16>, <8 x i1
define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1]
; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xc1]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -1672,13 +1766,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16>, <8 x i16>, <16 x
define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1]
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xc1]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -1692,13 +1786,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16>, i32, <8 x i16>, i
define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
@@ -1712,12 +1806,12 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16>, i32, <16 x i16>
define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
@@ -1732,13 +1826,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16>, i32, <8 x i16>, i
define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xe0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xe0,0x03]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -1752,13 +1846,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16>, i32, <16 x i16>
define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xe0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xe0,0x03]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -1772,13 +1866,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16>, i32, <8 x i16>, i
define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xf0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xf0,0x03]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -1792,13 +1886,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16>, i32, <16 x i16>
define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xf0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xf0,0x03]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -1812,7 +1906,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x
define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1]
@@ -1828,7 +1922,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x
define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1]
@@ -1844,7 +1938,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -1852,8 +1946,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16>
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -1867,7 +1961,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0]
; CHECK-NEXT: ## ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
@@ -1875,8 +1969,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i1
; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
@@ -1891,13 +1985,13 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8]
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -1911,13 +2005,13 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8>, <16 x i16>, i1
define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8]
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
@@ -1931,13 +2025,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8]
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
@@ -1951,13 +2045,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8]
; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
@@ -1972,7 +2066,7 @@ declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16)
define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
; CHECK-NEXT: vpmovm2b %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x28,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1984,7 +2078,7 @@ declare <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32)
define <32 x i8>@test_int_x86_avx512_cvtmask2b_256(i32 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
; CHECK-NEXT: vpmovm2b %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x28,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1996,7 +2090,7 @@ declare <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8)
define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
; CHECK-NEXT: vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2008,7 +2102,7 @@ declare <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16)
define <16 x i16>@test_int_x86_avx512_cvtmask2w_256(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
; CHECK-NEXT: vpmovm2w %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x28,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2017,7 +2111,7 @@ define <16 x i16>@test_int_x86_avx512_cvtmask2w_256(i16 %x0) {
}
define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_packs_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -2026,7 +2120,7 @@ define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2037,7 +2131,7 @@ define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x
define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2047,7 +2141,7 @@ define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8
define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -2057,7 +2151,7 @@ define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2069,7 +2163,7 @@ define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b,
define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2080,7 +2174,7 @@ define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b
define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2092,7 +2186,7 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2106,7 +2200,7 @@ define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x
define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2121,7 +2215,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x
define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_packs_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -2130,7 +2224,7 @@ define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2141,7 +2235,7 @@ define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16
define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2151,7 +2245,7 @@ define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i1
define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -2161,7 +2255,7 @@ define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b)
define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2173,7 +2267,7 @@ define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b
define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2184,7 +2278,7 @@ define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_
define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2196,7 +2290,7 @@ define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2210,7 +2304,7 @@ define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16
define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2225,7 +2319,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16
define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_packs_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
@@ -2234,7 +2328,7 @@ define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2245,7 +2339,7 @@ define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16
define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2255,7 +2349,7 @@ define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16
define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -2265,7 +2359,7 @@ define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2277,7 +2371,7 @@ define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2290,7 +2384,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x
define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_packs_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
@@ -2299,7 +2393,7 @@ define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2310,7 +2404,7 @@ define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <3
define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2320,7 +2414,7 @@ define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -2330,7 +2424,7 @@ define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2342,7 +2436,7 @@ define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2356,7 +2450,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32
define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_packus_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -2365,7 +2459,7 @@ define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2376,7 +2470,7 @@ define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8
define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2386,7 +2480,7 @@ define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8
define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -2396,7 +2490,7 @@ define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2408,7 +2502,7 @@ define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b
define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2419,7 +2513,7 @@ define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_
define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2431,7 +2525,7 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2445,7 +2539,7 @@ define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8
define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2460,7 +2554,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x
define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_packus_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -2469,7 +2563,7 @@ define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2480,7 +2574,7 @@ define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <1
define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2490,7 +2584,7 @@ define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i
define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -2500,7 +2594,7 @@ define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b
define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2512,7 +2606,7 @@ define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_
define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2523,7 +2617,7 @@ define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr
define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2535,7 +2629,7 @@ define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2549,7 +2643,7 @@ define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <1
define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2564,7 +2658,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16
define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_packus_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
@@ -2573,7 +2667,7 @@ define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2584,7 +2678,7 @@ define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16
define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2594,7 +2688,7 @@ define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i1
define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -2604,7 +2698,7 @@ define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2616,7 +2710,7 @@ define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b
define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2629,7 +2723,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x
define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_packus_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
@@ -2638,7 +2732,7 @@ define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2649,7 +2743,7 @@ define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <
define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2659,7 +2753,7 @@ define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b,
define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -2669,7 +2763,7 @@ define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_
define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2681,7 +2775,7 @@ define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr
define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2694,7 +2788,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32
define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 ## encoding: [0x62,0xf1,0x75,0x28,0x64,0xc0]
@@ -2743,7 +2837,7 @@ define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_cmp_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
@@ -2792,7 +2886,7 @@ declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) noun
define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x3e,0xc1,0x01]
@@ -2841,7 +2935,7 @@ define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %r8d ## encoding: [0xc5,0x7b,0x93,0xc0]
@@ -2890,7 +2984,7 @@ declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nou
define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k1 ## encoding: [0x62,0xf1,0x75,0x28,0x65,0xc8]
; CHECK-NEXT: vpcmplew %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02]
@@ -2913,6 +3007,7 @@ define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-NEXT: kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -2935,7 +3030,7 @@ define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
; CHECK-NEXT: vpcmpgtw %ymm0, %ymm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x65,0xd0]
@@ -2959,6 +3054,7 @@ define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask)
; CHECK-NEXT: kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -2983,7 +3079,7 @@ declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) no
define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01]
; CHECK-NEXT: vpcmpleuw %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02]
@@ -3006,6 +3102,7 @@ define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-NEXT: kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3028,7 +3125,7 @@ define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
; CHECK-NEXT: vpcmpltuw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01]
@@ -3052,6 +3149,7 @@ define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask
; CHECK-NEXT: kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3076,7 +3174,7 @@ declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) n
define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x08,0x64,0xc8]
; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02]
@@ -3121,7 +3219,7 @@ define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
; CHECK-NEXT: vpcmpgtb %xmm0, %xmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x64,0xd0]
@@ -3169,7 +3267,7 @@ declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) noun
define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01]
; CHECK-NEXT: vpcmpleub %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02]
@@ -3214,7 +3312,7 @@ define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
; CHECK-NEXT: vpcmpltub %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01]
@@ -3262,7 +3360,7 @@ declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nou
define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x08,0x65,0xc8]
; CHECK-NEXT: vpcmplew %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x02]
@@ -3307,7 +3405,7 @@ define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x65,0xd0]
@@ -3355,7 +3453,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwi
define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x01]
; CHECK-NEXT: vpcmpleuw %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd1,0x02]
@@ -3400,7 +3498,7 @@ define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
; CHECK-NEXT: vpcmpltuw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x01]
@@ -3445,3 +3543,286 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
}
declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
+
+define <16 x i8>@mm_mask_avg_epu8(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
+; CHECK-LABEL: mm_mask_avg_epu8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
+; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpabsb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1c,0xc8]
+; CHECK-NEXT: vpaddb %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8>@mm256_mask_avg_epu8(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
+; CHECK-LABEL: mm256_mask_avg_epu8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
+; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsb %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpabsb %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1c,0xc8]
+; CHECK-NEXT: vpaddb %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <8 x i16>@mm_mask_avg_epu16(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: mm_mask_avg_epu16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpabsw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1d,0xc8]
+; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16>@mm256_mask_avg_epu16(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: mm256_mask_avg_epu16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsw %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xd0]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpabsw %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1d,0xc8]
+; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8)
+
+define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16)
+
+define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16)
+
+define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
+declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32)
+
+define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
+ %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
+ %res2 = add i32 %res, %res1
+ ret i32 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2)
+
+define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc9]
+; CHECK-NEXT: kmovd %k1, %ecx ## encoding: [0xc5,0xfb,0x93,0xc9]
+; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
+ %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
+ %res2 = add i16 %res, %res1
+ ret i16 %res2
+}
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 9ceb3e5931a6..64ad66e336b5 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -3,7 +3,7 @@
define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_packs_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
@@ -12,7 +12,7 @@ define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -25,7 +25,7 @@ define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x
define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -37,7 +37,7 @@ define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8
define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -47,7 +47,7 @@ define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -61,7 +61,7 @@ define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b,
define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -74,7 +74,7 @@ define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b
define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -86,7 +86,7 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -102,7 +102,7 @@ define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x
define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -119,7 +119,7 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_packs_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
@@ -128,7 +128,7 @@ define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -141,7 +141,7 @@ define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16
define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -153,7 +153,7 @@ define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i1
define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -163,7 +163,7 @@ define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b)
define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -177,7 +177,7 @@ define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b
define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -190,7 +190,7 @@ define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_
define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -202,7 +202,7 @@ define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -218,7 +218,7 @@ define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16
define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -235,7 +235,7 @@ declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_packs_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
@@ -244,7 +244,7 @@ define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -257,7 +257,7 @@ define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16
define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -269,7 +269,7 @@ define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16
define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -279,7 +279,7 @@ define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -293,7 +293,7 @@ define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -308,7 +308,7 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_packs_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
@@ -317,7 +317,7 @@ define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -330,7 +330,7 @@ define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <3
define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -342,7 +342,7 @@ define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packs_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -352,7 +352,7 @@ define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -366,7 +366,7 @@ define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -382,7 +382,7 @@ declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_packus_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
@@ -391,7 +391,7 @@ define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -404,7 +404,7 @@ define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8
define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -416,7 +416,7 @@ define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8
define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -426,7 +426,7 @@ define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -440,7 +440,7 @@ define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b
define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -453,7 +453,7 @@ define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_
define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -465,7 +465,7 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -481,7 +481,7 @@ define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8
define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -498,7 +498,7 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_packus_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
@@ -507,7 +507,7 @@ define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -520,7 +520,7 @@ define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <1
define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -532,7 +532,7 @@ define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i
define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -542,7 +542,7 @@ define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b
define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -556,7 +556,7 @@ define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_
define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -569,7 +569,7 @@ define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr
define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -581,7 +581,7 @@ define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -597,7 +597,7 @@ define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <1
define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -614,7 +614,7 @@ declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_packus_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
@@ -623,7 +623,7 @@ define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -636,7 +636,7 @@ define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16
define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -648,7 +648,7 @@ define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i1
define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -658,7 +658,7 @@ define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -672,7 +672,7 @@ define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b
define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -687,7 +687,7 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_packus_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
@@ -696,7 +696,7 @@ define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -709,7 +709,7 @@ define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <
define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -721,7 +721,7 @@ define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b,
define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_packus_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -731,7 +731,7 @@ define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_
define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -745,7 +745,7 @@ define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr
define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -760,7 +760,7 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -769,7 +769,7 @@ define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -780,7 +780,7 @@ define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -790,7 +790,7 @@ define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %
define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -800,7 +800,7 @@ define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -812,7 +812,7 @@ define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -825,7 +825,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16>, <8 x i16>, <8 x i
define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -834,7 +834,7 @@ define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -845,7 +845,7 @@ define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -855,7 +855,7 @@ define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -865,7 +865,7 @@ define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -877,7 +877,7 @@ define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -890,7 +890,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16
define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epi16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -899,7 +899,7 @@ define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -910,7 +910,7 @@ define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -920,7 +920,7 @@ define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %
define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -930,7 +930,7 @@ define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -942,7 +942,7 @@ define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -955,7 +955,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16>, <8 x i16>, <8 x i
define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epi16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -964,7 +964,7 @@ define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -975,7 +975,7 @@ define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -985,7 +985,7 @@ define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -995,7 +995,7 @@ define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1007,7 +1007,7 @@ define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1020,7 +1020,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16
define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epu16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -1029,7 +1029,7 @@ define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1040,7 +1040,7 @@ define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1050,7 +1050,7 @@ define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %
define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -1060,7 +1060,7 @@ define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1072,7 +1072,7 @@ define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1085,7 +1085,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16>, <8 x i16>, <8 x
define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_adds_epu16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -1094,7 +1094,7 @@ define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1105,7 +1105,7 @@ define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1115,7 +1115,7 @@ define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -1125,7 +1125,7 @@ define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1137,7 +1137,7 @@ define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1150,7 +1150,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16>, <16 x i16>, <1
define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epu16_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 -1)
@@ -1159,7 +1159,7 @@ define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1170,7 +1170,7 @@ define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1180,7 +1180,7 @@ define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %
define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu16_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -1190,7 +1190,7 @@ define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1202,7 +1202,7 @@ define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1215,7 +1215,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16>, <8 x i16>, <8 x
define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_mask_subs_epu16_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 -1)
@@ -1224,7 +1224,7 @@ define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1235,7 +1235,7 @@ define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1245,7 +1245,7 @@ define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu16_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -1255,7 +1255,7 @@ define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1267,7 +1267,7 @@ define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1280,7 +1280,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16>, <16 x i16>, <1
define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epi8_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
@@ -1289,7 +1289,7 @@ define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1300,7 +1300,7 @@ define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1310,7 +1310,7 @@ define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %
define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi8_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -1320,7 +1320,7 @@ define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1332,7 +1332,7 @@ define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1345,7 +1345,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8>, <16 x i8>, <16 x
define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epi8_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
@@ -1354,7 +1354,7 @@ define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1365,7 +1365,7 @@ define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x
define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1375,7 +1375,7 @@ define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %
define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epi8_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
@@ -1385,7 +1385,7 @@ define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1397,7 +1397,7 @@ define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <
define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epi8_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1410,7 +1410,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.padds.b.256(<32 x i8>, <32 x i8>, <32 x
define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epi8_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
@@ -1419,7 +1419,7 @@ define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1430,7 +1430,7 @@ define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1440,7 +1440,7 @@ define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %
define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi8_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -1450,7 +1450,7 @@ define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1462,7 +1462,7 @@ define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1475,7 +1475,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8>, <16 x i8>, <16 x
define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epi8_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
@@ -1484,7 +1484,7 @@ define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) {
define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1495,7 +1495,7 @@ define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x
define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1505,7 +1505,7 @@ define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %
define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epi8_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
@@ -1515,7 +1515,7 @@ define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1527,7 +1527,7 @@ define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <
define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epi8_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1540,7 +1540,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x
define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epu8_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
@@ -1549,7 +1549,7 @@ define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1560,7 +1560,7 @@ define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1570,7 +1570,7 @@ define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %
define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu8_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -1580,7 +1580,7 @@ define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1592,7 +1592,7 @@ define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1605,7 +1605,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8>, <16 x i8>, <16 x
define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_adds_epu8_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
@@ -1614,7 +1614,7 @@ define <32 x i8> @test_mask_adds_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1625,7 +1625,7 @@ define <32 x i8> @test_mask_adds_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x
define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpaddusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1635,7 +1635,7 @@ define <32 x i8> @test_mask_adds_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %
define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_adds_epu8_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
@@ -1645,7 +1645,7 @@ define <32 x i8> @test_mask_adds_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdc,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1657,7 +1657,7 @@ define <32 x i8> @test_mask_adds_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <
define <32 x i8> @test_mask_adds_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_adds_epu8_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpaddusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdc,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1670,7 +1670,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.paddus.b.256(<32 x i8>, <32 x i8>, <32 x
define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epu8_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 -1)
@@ -1679,7 +1679,7 @@ define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1690,7 +1690,7 @@ define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1700,7 +1700,7 @@ define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %
define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu8_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -1710,7 +1710,7 @@ define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1722,7 +1722,7 @@ define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1735,7 +1735,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8>, <16 x i8>, <16 x
define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_mask_subs_epu8_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8> %a, <32 x i8> %b, <32 x i8> zeroinitializer, i32 -1)
@@ -1744,7 +1744,7 @@ define <32 x i8> @test_mask_subs_epu8_rr_256(<32 x i8> %a, <32 x i8> %b) {
define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1755,7 +1755,7 @@ define <32 x i8> @test_mask_subs_epu8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x
define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsubusb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1765,7 +1765,7 @@ define <32 x i8> @test_mask_subs_epu8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %
define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
; CHECK-LABEL: test_mask_subs_epu8_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <32 x i8>, <32 x i8>* %ptr_b
@@ -1775,7 +1775,7 @@ define <32 x i8> @test_mask_subs_epu8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) {
define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd8,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1787,7 +1787,7 @@ define <32 x i8> @test_mask_subs_epu8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <
define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) {
; CHECK-LABEL: test_mask_subs_epu8_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsubusb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1802,7 +1802,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>,
define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
@@ -1819,7 +1819,7 @@ declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>,
define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
@@ -1836,7 +1836,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16
define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
@@ -1853,7 +1853,7 @@ declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i1
define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
@@ -1870,7 +1870,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>,
define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x75,0xda]
@@ -1887,7 +1887,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16
define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x75,0xda]
@@ -1900,139 +1900,11 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16
ret <16 x i16> %res2
}
-declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
-; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
- %res2 = add <16 x i8> %res, %res1
- ret <16 x i8> %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
-; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
- %res2 = add <32 x i8> %res, %res1
- ret <32 x i8> %res2
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
-; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
- %res2 = add <8 x i16> %res, %res1
- ret <8 x i16> %res2
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
-; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
- %res2 = add <16 x i16> %res, %res1
- ret <16 x i16> %res2
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpabsb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1c,0xc8]
-; CHECK-NEXT: vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
-; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
- %res2 = add <16 x i8> %res, %res1
- ret <16 x i8> %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8>, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_pabs_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpabsb %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1c,0xc8]
-; CHECK-NEXT: vpabsb %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
-; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.pabs.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 -1)
- %res2 = add <32 x i8> %res, %res1
- ret <32 x i8> %res2
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpabsw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1d,0xc8]
-; CHECK-NEXT: vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
- %res2 = add <8 x i16> %res, %res1
- ret <8 x i16> %res2
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vpabsw %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1d,0xc8]
-; CHECK-NEXT: vpabsw %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 -1)
- %res2 = add <16 x i16> %res, %res1
- ret <16 x i16> %res2
-}
-
declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1]
@@ -2048,7 +1920,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <1
define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
@@ -2064,7 +1936,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i
define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1]
@@ -2080,7 +1952,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
@@ -2096,7 +1968,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8
define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
@@ -2112,7 +1984,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>,
define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
@@ -2128,7 +2000,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
; CHECK-NEXT: vpmovwb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
@@ -2148,7 +2020,7 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovwb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0x07]
; CHECK-NEXT: vpmovwb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0x07]
@@ -2162,7 +2034,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
; CHECK-NEXT: vpmovswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
@@ -2182,7 +2054,7 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0x07]
; CHECK-NEXT: vpmovswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0x07]
@@ -2196,7 +2068,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
; CHECK-NEXT: vpmovuswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
@@ -2216,7 +2088,7 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0x07]
; CHECK-NEXT: vpmovuswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0x07]
@@ -2230,7 +2102,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovwb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
; CHECK-NEXT: vpmovwb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
@@ -2250,7 +2122,7 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovwb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0x07]
; CHECK-NEXT: vpmovwb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0x07]
@@ -2264,7 +2136,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
; CHECK-NEXT: vpmovswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
@@ -2284,7 +2156,7 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0x07]
; CHECK-NEXT: vpmovswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0x07]
@@ -2298,7 +2170,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16
define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
; CHECK-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
@@ -2318,7 +2190,7 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0x07]
; CHECK-NEXT: vpmovuswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0x07]
@@ -2332,7 +2204,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x
define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1]
@@ -2348,7 +2220,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8
define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
@@ -2364,7 +2236,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8
define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1]
@@ -2380,7 +2252,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <1
define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
@@ -2396,7 +2268,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32,
define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd9,0x02]
; CHECK-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
@@ -2416,7 +2288,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32,
define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd9,0x02]
; CHECK-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
@@ -2436,10 +2308,10 @@ declare i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8>)
define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovb2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
ret i16 %res
@@ -2449,7 +2321,7 @@ declare i32 @llvm.x86.avx512.cvtb2mask.256(<32 x i8>)
define i32@test_int_x86_avx512_cvtb2mask_256(<32 x i8> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovb2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2461,10 +2333,10 @@ declare i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16>)
define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovw2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0)
ret i8 %res
@@ -2474,10 +2346,10 @@ declare i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16>)
define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovw2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x29,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0)
ret i16 %res
@@ -2487,7 +2359,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x10,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
@@ -2507,7 +2379,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrlv8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x10,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
@@ -2527,7 +2399,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.psrav16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x11,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsravw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
@@ -2547,7 +2419,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.psrav8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x11,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsravw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
@@ -2567,7 +2439,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.psllv16.hi(<16 x i16>, <16 x i16>, <16
define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x12,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
@@ -2587,7 +2459,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.psllv8.hi(<8 x i16>, <8 x i16>, <8 x i16
define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x12,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
@@ -2607,7 +2479,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8
define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8]
; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
@@ -2627,7 +2499,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>,
define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
@@ -2643,232 +2515,3 @@ define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x
ret <16 x i16> %res4
}
-declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16)
-
-define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
- %res1 = call i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
- %res2 = add i16 %res, %res1
- ret i16 %res2
-}
-
-declare i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8>, <32 x i8>, i32)
-
-define i32@test_int_x86_avx512_ptestm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_b_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
- %res1 = call i32 @llvm.x86.avx512.ptestm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
- %res2 = add i32 %res, %res1
- ret i32 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8)
-
-define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16)
-
-define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
- %res1 = call i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
- %res2 = add i16 %res, %res1
- ret i16 %res2
-}
-
-declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16)
-
-define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2)
- %res1 = call i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8> %x0, <16 x i8> %x1, i16-1)
- %res2 = add i16 %res, %res1
- ret i16 %res2
-}
-
-declare i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8>, <32 x i8>, i32)
-
-define i32@test_int_x86_avx512_ptestnm_b_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestnmb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32 %x2)
- %res1 = call i32 @llvm.x86.avx512.ptestnm.b.256(<32 x i8> %x0, <32 x i8> %x1, i32-1)
- %res2 = add i32 %res, %res1
- ret i32 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2)
-
-define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2)
-
-define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
-; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: addl %ecx, %eax ## encoding: [0x01,0xc8]
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2)
- %res1 = call i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16> %x0, <16 x i16> %x1, i16-1)
- %res2 = add i16 %res, %res1
- ret i16 %res2
-}
-
-declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32)
-
-define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xd7]
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
-; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
- %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask)
- %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask)
- %res3 = add <32 x i8> %res, %res1
- %res4 = add <32 x i8> %res2, %res3
- ret <32 x i8> %res4
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
-; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask)
- %res3 = add <16 x i8> %res, %res1
- %res4 = add <16 x i8> %res2, %res3
- ret <16 x i8> %res4
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask)
- %res3 = add <16 x i16> %res, %res1
- %res4 = add <16 x i16> %res2, %res3
- ret <16 x i16> %res4
-}
-
-declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
-
-define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
- %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask)
- %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask)
- %res3 = add <8 x i16> %res, %res1
- %res4 = add <8 x i16> %res2, %res3
- ret <8 x i16> %res4
-}
diff --git a/test/CodeGen/X86/avx512bwvl-mov.ll b/test/CodeGen/X86/avx512bwvl-mov.ll
index 3f92641a3e16..1826890d49ca 100644
--- a/test/CodeGen/X86/avx512bwvl-mov.ll
+++ b/test/CodeGen/X86/avx512bwvl-mov.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
define <32 x i8> @test_256_1(i8 * %addr) {
; CHECK-LABEL: test_256_1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*
%res = load <32 x i8>, <32 x i8>* %vaddr, align 1
@@ -13,8 +13,8 @@ define <32 x i8> @test_256_1(i8 * %addr) {
define void @test_256_2(i8 * %addr, <32 x i8> %data) {
; CHECK-LABEL: test_256_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <32 x i8>*
store <32 x i8>%data, <32 x i8>* %vaddr, align 1
@@ -23,8 +23,8 @@ define void @test_256_2(i8 * %addr, <32 x i8> %data) {
define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
; CHECK-LABEL: test_256_3:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqb %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x3f,0xca,0x04]
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -37,8 +37,8 @@ define <32 x i8> @test_256_3(i8 * %addr, <32 x i8> %old, <32 x i8> %mask1) {
define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
; CHECK-LABEL: test_256_4:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x3f,0xc9,0x04]
; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -51,8 +51,8 @@ define <32 x i8> @test_256_4(i8 * %addr, <32 x i8> %mask1) {
define <16 x i16> @test_256_5(i8 * %addr) {
; CHECK-LABEL: test_256_5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*
%res = load <16 x i16>, <16 x i16>* %vaddr, align 1
@@ -61,8 +61,8 @@ define <16 x i16> @test_256_5(i8 * %addr) {
define void @test_256_6(i8 * %addr, <16 x i16> %data) {
; CHECK-LABEL: test_256_6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i16>*
store <16 x i16>%data, <16 x i16>* %vaddr, align 1
@@ -71,8 +71,8 @@ define void @test_256_6(i8 * %addr, <16 x i16> %data) {
define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
; CHECK-LABEL: test_256_7:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqw %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x3f,0xca,0x04]
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -85,8 +85,8 @@ define <16 x i16> @test_256_7(i8 * %addr, <16 x i16> %old, <16 x i16> %mask1) {
define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
; CHECK-LABEL: test_256_8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x04]
; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -99,8 +99,8 @@ define <16 x i16> @test_256_8(i8 * %addr, <16 x i16> %mask1) {
define <16 x i8> @test_128_1(i8 * %addr) {
; CHECK-LABEL: test_128_1:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*
%res = load <16 x i8>, <16 x i8>* %vaddr, align 1
@@ -109,8 +109,8 @@ define <16 x i8> @test_128_1(i8 * %addr) {
define void @test_128_2(i8 * %addr, <16 x i8> %data) {
; CHECK-LABEL: test_128_2:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <16 x i8>*
store <16 x i8>%data, <16 x i8>* %vaddr, align 1
@@ -119,7 +119,7 @@ define void @test_128_2(i8 * %addr, <16 x i8> %data) {
define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
; CHECK-LABEL: test_128_3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqb %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x3f,0xca,0x04]
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x07]
@@ -133,7 +133,7 @@ define <16 x i8> @test_128_3(i8 * %addr, <16 x i8> %old, <16 x i8> %mask1) {
define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
; CHECK-LABEL: test_128_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x04]
; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x07]
@@ -147,8 +147,8 @@ define <16 x i8> @test_128_4(i8 * %addr, <16 x i8> %mask1) {
define <8 x i16> @test_128_5(i8 * %addr) {
; CHECK-LABEL: test_128_5:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*
%res = load <8 x i16>, <8 x i16>* %vaddr, align 1
@@ -157,8 +157,8 @@ define <8 x i16> @test_128_5(i8 * %addr) {
define void @test_128_6(i8 * %addr, <8 x i16> %data) {
; CHECK-LABEL: test_128_6:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vmovdqu %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x07]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i16>*
store <8 x i16>%data, <8 x i16>* %vaddr, align 1
@@ -167,7 +167,7 @@ define void @test_128_6(i8 * %addr, <8 x i16> %data) {
define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
; CHECK-LABEL: test_128_7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqw %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x3f,0xca,0x04]
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x07]
@@ -181,7 +181,7 @@ define <8 x i16> @test_128_7(i8 * %addr, <8 x i16> %old, <8 x i16> %mask1) {
define <8 x i16> @test_128_8(i8 * %addr, <8 x i16> %mask1) {
; CHECK-LABEL: test_128_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x04]
; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x07]
diff --git a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
index 17e581bbb501..bdaa1587e0a9 100644
--- a/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512bwvl-vec-cmp.ll
@@ -3,7 +3,7 @@
define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
; CHECK-LABEL: test256_1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k1
; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -14,7 +14,7 @@ define <32 x i8> @test256_1(<32 x i8> %x, <32 x i8> %y) nounwind {
define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
; CHECK-LABEL: test256_2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -25,7 +25,7 @@ define <32 x i8> @test256_2(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind
define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounwind {
; CHECK-LABEL: test256_3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew %ymm0, %ymm1, %k1
; CHECK-NEXT: vpblendmw %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -36,7 +36,7 @@ define <16 x i16> @test256_3(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1) nounw
define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind {
; CHECK-LABEL: test256_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k1
; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -47,7 +47,7 @@ define <32 x i8> @test256_4(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1) nounwind
define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nounwind {
; CHECK-LABEL: test256_5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw (%rdi), %ymm0, %k1
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -59,7 +59,7 @@ define <16 x i16> @test256_5(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %yp) nou
define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test256_6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtw (%rdi), %ymm0, %k1
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -71,7 +71,7 @@ define <16 x i16> @test256_6(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr)
define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test256_7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew (%rdi), %ymm0, %k1
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -83,7 +83,7 @@ define <16 x i16> @test256_7(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr)
define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test256_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -95,7 +95,7 @@ define <16 x i16> @test256_8(<16 x i16> %x, <16 x i16> %x1, <16 x i16>* %y.ptr)
define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x i16> %y1) nounwind {
; CHECK-LABEL: test256_9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 {%k1}
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
@@ -109,7 +109,7 @@ define <16 x i16> @test256_9(<16 x i16> %x, <16 x i16> %y, <16 x i16> %x1, <16 x
define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8> %y1) nounwind {
; CHECK-LABEL: test256_10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k1
; CHECK-NEXT: vpcmpleb %ymm2, %ymm3, %k1 {%k1}
; CHECK-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
@@ -123,7 +123,7 @@ define <32 x i8> @test256_10(<32 x i8> %x, <32 x i8> %y, <32 x i8> %x1, <32 x i8
define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32 x i8> %y1) nounwind {
; CHECK-LABEL: test256_11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %ymm2, %ymm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %ymm0, %k1 {%k1}
; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
@@ -138,7 +138,7 @@ define <32 x i8> @test256_11(<32 x i8> %x, <32 x i8>* %y.ptr, <32 x i8> %x1, <32
define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1, <16 x i16> %y1) nounwind {
; CHECK-LABEL: test256_12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew %ymm1, %ymm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %ymm0, %k1 {%k1}
; CHECK-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
@@ -153,7 +153,7 @@ define <16 x i16> @test256_12(<16 x i16> %x, <16 x i16>* %y.ptr, <16 x i16> %x1,
define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
; CHECK-LABEL: test128_1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1
; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -164,7 +164,7 @@ define <16 x i8> @test128_1(<16 x i8> %x, <16 x i8> %y) nounwind {
define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
; CHECK-LABEL: test128_2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -175,7 +175,7 @@ define <16 x i8> @test128_2(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind
define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind {
; CHECK-LABEL: test128_3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew %xmm0, %xmm1, %k1
; CHECK-NEXT: vpblendmw %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -186,7 +186,7 @@ define <8 x i16> @test128_3(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1) nounwind
define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind {
; CHECK-LABEL: test128_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpnleub %xmm1, %xmm0, %k1
; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -197,7 +197,7 @@ define <16 x i8> @test128_4(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1) nounwind
define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwind {
; CHECK-LABEL: test128_5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw (%rdi), %xmm0, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -209,7 +209,7 @@ define <8 x i16> @test128_5(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %yp) nounwin
define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test128_6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtw (%rdi), %xmm0, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -221,7 +221,7 @@ define <8 x i16> @test128_6(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun
define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test128_7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew (%rdi), %xmm0, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -233,7 +233,7 @@ define <8 x i16> @test128_7(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun
define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) nounwind {
; CHECK-LABEL: test128_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
@@ -245,7 +245,7 @@ define <8 x i16> @test128_8(<8 x i16> %x, <8 x i16> %x1, <8 x i16>* %y.ptr) noun
define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16> %y1) nounwind {
; CHECK-LABEL: test128_9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -259,7 +259,7 @@ define <8 x i16> @test128_9(<8 x i16> %x, <8 x i16> %y, <8 x i16> %x1, <8 x i16>
define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8> %y1) nounwind {
; CHECK-LABEL: test128_10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpleb %xmm1, %xmm0, %k1
; CHECK-NEXT: vpcmpleb %xmm2, %xmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmb %xmm0, %xmm2, %xmm0 {%k1}
@@ -273,7 +273,7 @@ define <16 x i8> @test128_10(<16 x i8> %x, <16 x i8> %y, <16 x i8> %x1, <16 x i8
define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16 x i8> %y1) nounwind {
; CHECK-LABEL: test128_11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtb %xmm2, %xmm1, %k1
; CHECK-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
@@ -288,7 +288,7 @@ define <16 x i8> @test128_11(<16 x i8> %x, <16 x i8>* %y.ptr, <16 x i8> %x1, <16
define <8 x i16> @test128_12(<8 x i16> %x, <8 x i16>* %y.ptr, <8 x i16> %x1, <8 x i16> %y1) nounwind {
; CHECK-LABEL: test128_12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmplew %xmm1, %xmm2, %k1
; CHECK-NEXT: vpcmpleuw (%rdi), %xmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
diff --git a/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
new file mode 100644
index 000000000000..fba2b5f07939
--- /dev/null
+++ b/test/CodeGen/X86/avx512bwvl-vec-test-testn.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_test_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp ne <16 x i8> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_mask_test_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestmb %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp ne <16 x i8> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_test_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp ne <8 x i16> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_mask_test_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestmw %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp ne <8 x i16> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_testn_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp eq <16 x i8> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_mask_testn_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestnmb %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
+ %1 = icmp eq <16 x i8> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_testn_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp eq <8 x i16> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm_mask_testn_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestnmw %xmm0, %xmm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
+ %1 = icmp eq <8 x i16> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i32 @TEST_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_test_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp ne <32 x i8> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i32 @TEST_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_mask_test_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestmb %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp ne <32 x i8> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_test_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp ne <16 x i16> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_mask_test_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestmw %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp ne <16 x i16> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i32 @TEST_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_testn_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp eq <32 x i8> %0, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i32 @TEST_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_mask_testn_epi8_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestnmb %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
+ %1 = icmp eq <32 x i8> %0, zeroinitializer
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = and <32 x i1> %1, %2
+ %4 = bitcast <32 x i1> %3 to i32
+ ret i32 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_testn_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp eq <16 x i16> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm256_mask_testn_epi16_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vptestnmw %ymm0, %ymm1, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
+ %1 = icmp eq <16 x i16> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..a4f4c837dc01
--- /dev/null
+++ b/test/CodeGen/X86/avx512cd-intrinsics-fast-isel.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s
+
+define <8 x i64> @test_mm512_broadcastmb_epi64(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmb_epi64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <8 x i64> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %vecinit7.i.i
+}
+
+define <8 x i64> @test_mm512_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm512_broadcastmw_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %4 = bitcast <16 x i32> %vecinit15.i.i to <8 x i64>
+ ret <8 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
index e5dbff9ac515..da4ba9e10099 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
@@ -3,7 +3,7 @@
define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
; CHECK-LABEL: test_lzcnt_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntd %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
@@ -14,7 +14,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16
define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
; CHECK-LABEL: test_lzcnt_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntq %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
@@ -26,7 +26,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) no
define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_lzcnt_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -37,7 +37,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -45,3 +45,26 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
%res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
}
+
+define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
+
+define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
+
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index 7e5a3e8fe25d..7f0c761991e4 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,33 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
-define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
- ret <16 x i32> %res
-}
-declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
-
-define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %zmm0
-; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
- ret <8 x i64> %res
-}
-declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
-
declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
define <8 x i64> @test_conflict_q(<8 x i64> %a) {
; CHECK-LABEL: test_conflict_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpconflictq %zmm0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
@@ -38,7 +16,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8)
define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
; CHECK-LABEL: test_maskz_conflict_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %zmm0, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -48,7 +26,7 @@ define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_conflict_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -59,7 +37,7 @@ define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
; CHECK-LABEL: test_lzcnt_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntd %zmm0, %zmm0
; CHECK-NEXT: retq
%1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
@@ -69,7 +47,7 @@ declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) #0
define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
; CHECK-LABEL: test_lzcnt_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntq %zmm0, %zmm0
; CHECK-NEXT: retq
%1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
@@ -79,7 +57,7 @@ declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) #0
define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; CHECK-LABEL: test_mask_lzcnt_d:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -92,7 +70,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_lzcnt_q:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index f8f47c87100a..6070ea294d55 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -5,7 +5,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
@@ -25,7 +25,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntd %ymm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
@@ -41,7 +41,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntq %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
@@ -57,7 +57,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntq %ymm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
@@ -69,3 +69,47 @@ define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64>
ret <4 x i64> %res2
}
+define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
+
+define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: vpbroadcastd %eax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
+
+define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
+
+define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: vpbroadcastq %rax, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
+
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 96254f7c95b0..3530d321b020 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -3,7 +3,7 @@
define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntd %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
@@ -28,7 +28,7 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #0
define <8 x i32> @test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntd %ymm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1}
@@ -45,7 +45,7 @@ declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) #0
define <2 x i64> @test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntq %xmm0, %xmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1}
@@ -63,7 +63,7 @@ declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0
define <4 x i64> @test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vplzcntq %ymm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1}
@@ -83,7 +83,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %xmm0, %xmm2 {%k1} {z}
; CHECK-NEXT: vpconflictd %xmm0, %xmm1 {%k1}
@@ -103,7 +103,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.conflict.d.256(<8 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictd %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vpconflictd %ymm0, %ymm0
@@ -119,7 +119,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.conflict.q.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vpconflictq %xmm0, %xmm0
@@ -135,7 +135,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.conflict.q.256(<4 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vpconflictq %ymm0, %ymm1 {%k1}
; CHECK-NEXT: vpconflictq %ymm0, %ymm0
@@ -147,46 +147,3 @@ define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i
ret <4 x i64> %res2
}
-define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
- ret <8 x i32> %res
-}
-declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
-
-define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
-; CHECK-LABEL: test_x86_vbroadcastmw_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
-
-define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
-; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
- ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
-
-define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
-; CHECK-LABEL: test_x86_broadcastmb_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
-; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
- ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index 636358fb91cb..c1e6de39a6e1 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -5,16 +5,12 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32,
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vmovd %edi, %xmm2
; CHECK-NEXT: kmovw %edi, %k0
-; CHECK-NEXT: kshiftlb $7, %k0, %k1
-; CHECK-NEXT: kshiftrb $7, %k1, %k1
-; CHECK-NEXT: kshiftlb $6, %k0, %k0
-; CHECK-NEXT: kshiftrb $7, %k0, %k0
+; CHECK-NEXT: kshiftrb $1, %k0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: kmovw %k1, %ecx
-; CHECK-NEXT: vmovd %ecx, %xmm2
; CHECK-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
; CHECK-NEXT: vpsllq $63, %xmm2, %xmm2
; CHECK-NEXT: vpsraq $63, %zmm2, %zmm2
@@ -35,8 +31,8 @@ declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <
define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -55,8 +51,8 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x fl
define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
@@ -75,8 +71,8 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x do
define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
@@ -95,8 +91,8 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
@@ -115,8 +111,8 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i3
define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
@@ -136,7 +132,7 @@ declare <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16)
define <16 x i32>@test_int_x86_avx512_cvtmask2d_512(i16 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: vpmovm2d %k0, %zmm0
; CHECK-NEXT: retq
@@ -148,10 +144,193 @@ declare <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8)
define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: vpmovm2q %k0, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0)
ret <8 x i64> %res
}
+
+declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
+ %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res1, %res2
+ %res5 = fadd <16 x float> %res3, %res4
+ ret <16 x float> %res5
+}
+
+define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+
+ %x0 = load <8 x float>, <8 x float>* %x0ptr
+ %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
+ %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res1, %res2
+ %res5 = fadd <8 x double> %res3, %res4
+ ret <8 x double> %res5
+}
+
+define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+
+ %x0 = load <2 x double>, <2 x double>* %x0ptr
+ %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
+ ret <8 x double> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
+ %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res1, %res2
+ %res5 = add <16 x i32> %res3, %res4
+ ret <16 x i32> %res5
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+
+ %x0 = load <8 x i32>, <8 x i32>* %x0ptr
+ %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
+ %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res1, %res2
+ %res5 = add <8 x i64> %res3, %res4
+ ret <8 x i64> %res5
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+
+ %x0 = load <2 x i64>, <2 x i64>* %x0ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %x3)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
+ %res3 = fadd <16 x float> %res, %res1
+ %res4 = fadd <16 x float> %res3, %res2
+ ret <16 x float> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res3, %res2
+ ret <16 x i32> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index d54208c00987..e0e7b2374477 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -1,11 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512dq,avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQVL
declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8, i32)
define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2qq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtpd2qq {rn-sae}, %zmm0, %zmm0
@@ -21,7 +22,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double>, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtpd2uqq {ru-sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtpd2uqq {rn-sae}, %zmm0, %zmm0
@@ -37,7 +38,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float>, <8 x i64>, i8,
define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2qq {ru-sae}, %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2qq {rn-sae}, %ymm0, %zmm0
@@ -53,7 +54,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float>, <8 x i64>, i8
define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtps2uqq {ru-sae}, %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtps2uqq {rn-sae}, %ymm0, %zmm0
@@ -69,7 +70,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64>, <8 x double>,
define <8 x double>@test_int_x86_avx512_mask_cvt_qq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtqq2pd %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtqq2pd {rn-sae}, %zmm0, %zmm0
@@ -85,7 +86,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64>, <8 x float>, i
define <8 x float>@test_int_x86_avx512_mask_cvt_qq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtqq2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtqq2ps {rn-sae}, %zmm0, %ymm0
@@ -101,7 +102,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double>, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2qq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvttpd2qq {sae}, %zmm0, %zmm0
@@ -117,7 +118,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttpd2uqq %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvttpd2uqq {sae}, %zmm0, %zmm0
@@ -133,7 +134,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float>, <8 x i64>, i8
define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttps2qq %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvttps2qq {sae}, %ymm0, %zmm0
@@ -149,7 +150,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float>, <8 x i64>, i
define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvttps2uqq %ymm0, %zmm1 {%k1}
; CHECK-NEXT: vcvttps2uqq {sae}, %ymm0, %zmm0
@@ -165,7 +166,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64>, <8 x double>
define <8 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtuqq2pd %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vcvtuqq2pd {rn-sae}, %zmm0, %zmm0
@@ -181,7 +182,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vcvtuqq2ps %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vcvtuqq2ps {rn-sae}, %zmm0, %ymm0
@@ -197,7 +198,7 @@ declare <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double>, i32, <8 x
define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vreducepd $8, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vreducepd $4, {sae}, %zmm0, %zmm0
@@ -213,7 +214,7 @@ declare <16 x float> @llvm.x86.avx512.mask.reduce.ps.512(<16 x float>, i32, <16
define <16 x float>@test_int_x86_avx512_mask_reduce_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vreduceps $44, {sae}, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vreduceps $11, %zmm0, %zmm0
@@ -229,7 +230,7 @@ declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x doubl
define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vrangepd $8, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vrangepd $4, {sae}, %zmm1, %zmm0, %zmm0
@@ -245,7 +246,7 @@ declare <16 x float> @llvm.x86.avx512.mask.range.ps.512(<16 x float>, <16 x floa
define <16 x float>@test_int_x86_avx512_mask_range_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vrangeps $88, %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vrangeps $4, {sae}, %zmm1, %zmm0, %zmm0
@@ -261,7 +262,7 @@ declare <4 x float> @llvm.x86.avx512.mask.reduce.ss(<4 x float>, <4 x float>,<4
define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vreducess $4, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vreducess $4, {sae}, %xmm1, %xmm0, %xmm0
@@ -276,24 +277,38 @@ define <4 x float>@test_int_x86_avx512_mask_reduce_ss(<4 x float> %x0, <4 x floa
declare <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32, i32)
define <4 x float>@test_int_x86_avx512_mask_range_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_range_ss:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
-; CHECK-NEXT: retq
+; AVX512DQ-LABEL: test_int_x86_avx512_mask_range_ss:
+; AVX512DQ: ## %bb.0:
+; AVX512DQ-NEXT: kmovw %edi, %k1
+; AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512DQ-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm3
+; AVX512DQ-NEXT: vrangess $4, %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: vaddps %xmm3, %xmm2, %xmm1
+; AVX512DQ-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512DQVL-LABEL: test_int_x86_avx512_mask_range_ss:
+; AVX512DQVL: ## %bb.0:
+; AVX512DQVL-NEXT: kmovw %edi, %k1
+; AVX512DQVL-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512DQVL-NEXT: vrangess $4, {sae}, %xmm1, %xmm0, %xmm3
+; AVX512DQVL-NEXT: vaddps %xmm3, %xmm2, %xmm2
+; AVX512DQVL-NEXT: vrangess $4, %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT: retq
%res = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4, i32 4, i32 8)
%res1 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 8)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.range.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 -1, i32 4, i32 4)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
}
declare <2 x double> @llvm.x86.avx512.mask.reduce.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32, i32)
define <2 x double>@test_int_x86_avx512_mask_reduce_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vreducesd $4, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vreducesd $4, {sae}, %xmm1, %xmm0, %xmm0
@@ -309,30 +324,34 @@ declare <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double>, <2 x double>,<
define <2 x double>@test_int_x86_avx512_mask_range_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vrangesd $4, %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vrangesd $4, {sae}, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4, i32 4, i32 4)
%res1 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 8)
- %res2 = fadd <2 x double> %res, %res1
- ret <2 x double> %res2
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.range.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 -1, i32 4, i32 4)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
}
declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfpclasspd $2, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vfpclasspd $4, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1)
@@ -343,14 +362,14 @@ declare i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float>, i32, i16)
define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfpclassps $4, %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vfpclassps $4, %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 %x1)
%res1 = call i16 @llvm.x86.avx512.mask.fpclass.ps.512(<16 x float> %x0, i32 4, i16 -1)
@@ -362,14 +381,14 @@ declare i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfpclasssd $2, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vfpclasssd $4, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1)
@@ -377,18 +396,30 @@ define i8 @test_int_x86_avx512_mask_fpclass_sd(<2 x double> %x0, i8 %x1) {
ret i8 %res2
}
+define i8 @test_int_x86_avx512_mask_fpclass_sd_load(<2 x double>* %x0ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_sd_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfpclasssd $4, (%rdi), %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq
+ %x0 = load <2 x double>, <2 x double>* %x0ptr
+ %res = call i8 @llvm.x86.avx512.mask.fpclass.sd(<2 x double> %x0, i32 4, i8 -1)
+ ret i8 %res
+}
+
declare i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vfpclassss $4, %xmm0, %k0 {%k1}
; CHECK-NEXT: kmovw %k0, %ecx
; CHECK-NEXT: vfpclassss $4, %xmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: addb %cl, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1)
@@ -396,54 +427,26 @@ define i8 @test_int_x86_avx512_mask_fpclass_ss(<4 x float> %x0, i8 %x1) {
ret i8 %res2
}
-declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %x3)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res3, %res2
- ret <16 x float> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+define i8 @test_int_x86_avx512_mask_fpclass_ss_load(<4 x float>* %x0ptr, i8 %x1) {
+; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ss_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vfpclassss $4, (%rdi), %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
- %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %x3)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
- %res3 = add <16 x i32> %res, %res1
- %res4 = add <16 x i32> %res3, %res2
- ret <16 x i32> %res4
+ %x0 = load <4 x float>, <4 x float>* %x0ptr
+ %res = call i8 @llvm.x86.avx512.mask.fpclass.ss(<4 x float> %x0, i32 4, i8 -1)
+ ret i8 %res
}
declare i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32>)
define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovd2m %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0)
ret i16 %res
@@ -453,151 +456,11 @@ declare i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovq2m %zmm0, %k0
; CHECK-NEXT: kmovw %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0)
ret i8 %res
}
-
-declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
- %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res1, %res2
- %res5 = fadd <16 x float> %res3, %res4
- ret <16 x float> %res5
-}
-
-define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %ymm1
-; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: retq
-
- %x0 = load <8 x float>, <8 x float>* %x0ptr
- %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask)
- ret <16 x float> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
- %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res1, %res2
- %res5 = fadd <8 x double> %res3, %res4
- ret <8 x double> %res5
-}
-
-define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
-; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: retq
-
- %x0 = load <2 x double>, <2 x double>* %x0ptr
- %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask)
- ret <8 x double> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
- %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
- %res4 = add <16 x i32> %res1, %res2
- %res5 = add <16 x i32> %res3, %res4
- ret <16 x i32> %res5
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa (%rdi), %ymm1
-; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; CHECK-NEXT: retq
-
- %x0 = load <8 x i32>, <8 x i32>* %x0ptr
- %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT: retq
-
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
- %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask)
- %res4 = add <8 x i64> %res1, %res2
- %res5 = add <8 x i64> %res3, %res4
- ret <8 x i64> %res5
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1
-; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: retq
-
- %x0 = load <2 x i64>, <2 x i64>* %x0ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask)
- ret <8 x i64> %res
-}
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
index f0ae1b0129a8..8f7938f6a467 100644
--- a/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -3,11 +3,11 @@
define i8 @mask8(i8 %x) {
; CHECK-LABEL: mask8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k0
; CHECK-NEXT: knotb %k0, %k0
; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%m0 = bitcast i8 %x to <8 x i1>
%m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -17,7 +17,7 @@ define i8 @mask8(i8 %x) {
define void @mask8_mem(i8* %ptr) {
; CHECK-LABEL: mask8_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovb (%rdi), %k0
; CHECK-NEXT: knotb %k0, %k0
; CHECK-NEXT: kmovb %k0, (%rdi)
@@ -32,7 +32,7 @@ define void @mask8_mem(i8* %ptr) {
define i8 @mand8(i8 %x, i8 %y) {
; CHECK-LABEL: mand8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: xorl %esi, %eax
; CHECK-NEXT: andl %esi, %edi
@@ -50,14 +50,14 @@ define i8 @mand8(i8 %x, i8 %y) {
define i8 @mand8_mem(<8 x i1>* %x, <8 x i1>* %y) {
; CHECK-LABEL: mand8_mem:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovb (%rdi), %k0
; CHECK-NEXT: kmovb (%rsi), %k1
; CHECK-NEXT: kandb %k1, %k0, %k2
; CHECK-NEXT: kxorb %k1, %k0, %k0
; CHECK-NEXT: korb %k0, %k2, %k0
; CHECK-NEXT: kmovd %k0, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%ma = load <8 x i1>, <8 x i1>* %x
%mb = load <8 x i1>, <8 x i1>* %y
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 595b3e0ebb86..3a1bce05e678 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -3,7 +3,7 @@
define <4 x float> @test_mask_andnot_ps_rr_128(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_mask_andnot_ps_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
@@ -12,7 +12,7 @@ define <4 x float> @test_mask_andnot_ps_rr_128(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -23,7 +23,7 @@ define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b,
define <4 x float> @test_mask_andnot_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -33,7 +33,7 @@ define <4 x float> @test_mask_andnot_ps_rrkz_128(<4 x float> %a, <4 x float> %b,
define <4 x float> @test_mask_andnot_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_ps_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
@@ -43,7 +43,7 @@ define <4 x float> @test_mask_andnot_ps_rm_128(<4 x float> %a, <4 x float>* %ptr
define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -55,7 +55,7 @@ define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, <4 x float>* %pt
define <4 x float> @test_mask_andnot_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -66,7 +66,7 @@ define <4 x float> @test_mask_andnot_ps_rmkz_128(<4 x float> %a, <4 x float>* %p
define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_ps_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -78,7 +78,7 @@ define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -92,7 +92,7 @@ define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b,
define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -107,7 +107,7 @@ declare <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float>, <4 x float>,
define <8 x float> @test_mask_andnot_ps_rr_256(<8 x float> %a, <8 x float> %b) {
; CHECK-LABEL: test_mask_andnot_ps_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
@@ -116,7 +116,7 @@ define <8 x float> @test_mask_andnot_ps_rr_256(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -127,7 +127,7 @@ define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b,
define <8 x float> @test_mask_andnot_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -137,7 +137,7 @@ define <8 x float> @test_mask_andnot_ps_rrkz_256(<8 x float> %a, <8 x float> %b,
define <8 x float> @test_mask_andnot_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_ps_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
@@ -147,7 +147,7 @@ define <8 x float> @test_mask_andnot_ps_rm_256(<8 x float> %a, <8 x float>* %ptr
define <8 x float> @test_mask_andnot_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -159,7 +159,7 @@ define <8 x float> @test_mask_andnot_ps_rmk_256(<8 x float> %a, <8 x float>* %pt
define <8 x float> @test_mask_andnot_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -170,7 +170,7 @@ define <8 x float> @test_mask_andnot_ps_rmkz_256(<8 x float> %a, <8 x float>* %p
define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_ps_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -182,7 +182,7 @@ define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -196,7 +196,7 @@ define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b,
define <8 x float> @test_mask_andnot_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -211,7 +211,7 @@ declare <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float>, <8 x float>,
define <16 x float> @test_mask_andnot_ps_rr_512(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_mask_andnot_ps_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
@@ -220,7 +220,7 @@ define <16 x float> @test_mask_andnot_ps_rr_512(<16 x float> %a, <16 x float> %b
define <16 x float> @test_mask_andnot_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0xd1]
; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
@@ -231,7 +231,7 @@ define <16 x float> @test_mask_andnot_ps_rrk_512(<16 x float> %a, <16 x float> %
define <16 x float> @test_mask_andnot_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -241,7 +241,7 @@ define <16 x float> @test_mask_andnot_ps_rrkz_512(<16 x float> %a, <16 x float>
define <16 x float> @test_mask_andnot_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_ps_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
@@ -251,7 +251,7 @@ define <16 x float> @test_mask_andnot_ps_rm_512(<16 x float> %a, <16 x float>* %
define <16 x float> @test_mask_andnot_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x55,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -263,7 +263,7 @@ define <16 x float> @test_mask_andnot_ps_rmk_512(<16 x float> %a, <16 x float>*
define <16 x float> @test_mask_andnot_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -274,7 +274,7 @@ define <16 x float> @test_mask_andnot_ps_rmkz_512(<16 x float> %a, <16 x float>*
define <16 x float> @test_mask_andnot_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_ps_rmb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -286,7 +286,7 @@ define <16 x float> @test_mask_andnot_ps_rmb_512(<16 x float> %a, float* %ptr_b)
define <16 x float> @test_mask_andnot_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmbk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x55,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -300,7 +300,7 @@ define <16 x float> @test_mask_andnot_ps_rmbk_512(<16 x float> %a, float* %ptr_b
define <16 x float> @test_mask_andnot_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_andnot_ps_rmbkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandnps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x55,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -315,7 +315,7 @@ declare <16 x float> @llvm.x86.avx512.mask.andn.ps.512(<16 x float>, <16 x float
define <4 x float> @test_mask_and_ps_rr_128(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_mask_and_ps_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
@@ -324,7 +324,7 @@ define <4 x float> @test_mask_and_ps_rr_128(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -335,7 +335,7 @@ define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4
define <4 x float> @test_mask_and_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -345,7 +345,7 @@ define <4 x float> @test_mask_and_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8
define <4 x float> @test_mask_and_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_and_ps_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
@@ -355,7 +355,7 @@ define <4 x float> @test_mask_and_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b)
define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -367,7 +367,7 @@ define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b
define <4 x float> @test_mask_and_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -378,7 +378,7 @@ define <4 x float> @test_mask_and_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_
define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_and_ps_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -390,7 +390,7 @@ define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -404,7 +404,7 @@ define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -419,7 +419,7 @@ declare <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float>, <4 x float>, <
define <8 x float> @test_mask_and_ps_rr_256(<8 x float> %a, <8 x float> %b) {
; CHECK-LABEL: test_mask_and_ps_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
@@ -428,7 +428,7 @@ define <8 x float> @test_mask_and_ps_rr_256(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test_mask_and_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -439,7 +439,7 @@ define <8 x float> @test_mask_and_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8
define <8 x float> @test_mask_and_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -449,7 +449,7 @@ define <8 x float> @test_mask_and_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8
define <8 x float> @test_mask_and_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_and_ps_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
@@ -459,7 +459,7 @@ define <8 x float> @test_mask_and_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b)
define <8 x float> @test_mask_and_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -471,7 +471,7 @@ define <8 x float> @test_mask_and_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b
define <8 x float> @test_mask_and_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -482,7 +482,7 @@ define <8 x float> @test_mask_and_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_
define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_and_ps_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -494,7 +494,7 @@ define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -508,7 +508,7 @@ define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
define <8 x float> @test_mask_and_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -523,7 +523,7 @@ declare <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float>, <8 x float>, <
define <16 x float> @test_mask_and_ps_rr_512(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_mask_and_ps_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
@@ -532,7 +532,7 @@ define <16 x float> @test_mask_and_ps_rr_512(<16 x float> %a, <16 x float> %b) {
define <16 x float> @test_mask_and_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_ps_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0xd1]
; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
@@ -543,7 +543,7 @@ define <16 x float> @test_mask_and_ps_rrk_512(<16 x float> %a, <16 x float> %b,
define <16 x float> @test_mask_and_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: test_mask_and_ps_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -553,7 +553,7 @@ define <16 x float> @test_mask_and_ps_rrkz_512(<16 x float> %a, <16 x float> %b,
define <16 x float> @test_mask_and_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_and_ps_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
@@ -563,7 +563,7 @@ define <16 x float> @test_mask_and_ps_rm_512(<16 x float> %a, <16 x float>* %ptr
define <16 x float> @test_mask_and_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x54,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -575,7 +575,7 @@ define <16 x float> @test_mask_and_ps_rmk_512(<16 x float> %a, <16 x float>* %pt
define <16 x float> @test_mask_and_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -586,7 +586,7 @@ define <16 x float> @test_mask_and_ps_rmkz_512(<16 x float> %a, <16 x float>* %p
define <16 x float> @test_mask_and_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_and_ps_rmb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -598,7 +598,7 @@ define <16 x float> @test_mask_and_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
define <16 x float> @test_mask_and_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmbk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x54,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -612,7 +612,7 @@ define <16 x float> @test_mask_and_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <
define <16 x float> @test_mask_and_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_and_ps_rmbkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vandps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x54,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -627,7 +627,7 @@ declare <16 x float> @llvm.x86.avx512.mask.and.ps.512(<16 x float>, <16 x float>
define <4 x float> @test_mask_or_ps_rr_128(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_mask_or_ps_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
@@ -636,7 +636,7 @@ define <4 x float> @test_mask_or_ps_rr_128(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -647,7 +647,7 @@ define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x
define <4 x float> @test_mask_or_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -657,7 +657,7 @@ define <4 x float> @test_mask_or_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8
define <4 x float> @test_mask_or_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_or_ps_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
@@ -667,7 +667,7 @@ define <4 x float> @test_mask_or_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b)
define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -679,7 +679,7 @@ define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b,
define <4 x float> @test_mask_or_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -690,7 +690,7 @@ define <4 x float> @test_mask_or_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b
define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_or_ps_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -702,7 +702,7 @@ define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -716,7 +716,7 @@ define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x
define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -731,7 +731,7 @@ declare <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float>, <4 x float>, <4
define <8 x float> @test_mask_or_ps_rr_256(<8 x float> %a, <8 x float> %b) {
; CHECK-LABEL: test_mask_or_ps_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
@@ -740,7 +740,7 @@ define <8 x float> @test_mask_or_ps_rr_256(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -751,7 +751,7 @@ define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x
define <8 x float> @test_mask_or_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -761,7 +761,7 @@ define <8 x float> @test_mask_or_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8
define <8 x float> @test_mask_or_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_or_ps_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
@@ -771,7 +771,7 @@ define <8 x float> @test_mask_or_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b)
define <8 x float> @test_mask_or_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -783,7 +783,7 @@ define <8 x float> @test_mask_or_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b,
define <8 x float> @test_mask_or_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -794,7 +794,7 @@ define <8 x float> @test_mask_or_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b
define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_or_ps_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -806,7 +806,7 @@ define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -820,7 +820,7 @@ define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x
define <8 x float> @test_mask_or_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -835,7 +835,7 @@ declare <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float>, <8 x float>, <8
define <16 x float> @test_mask_or_ps_rr_512(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_mask_or_ps_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
@@ -844,7 +844,7 @@ define <16 x float> @test_mask_or_ps_rr_512(<16 x float> %a, <16 x float> %b) {
define <16 x float> @test_mask_or_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_ps_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0xd1]
; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
@@ -855,7 +855,7 @@ define <16 x float> @test_mask_or_ps_rrk_512(<16 x float> %a, <16 x float> %b, <
define <16 x float> @test_mask_or_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: test_mask_or_ps_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -865,7 +865,7 @@ define <16 x float> @test_mask_or_ps_rrkz_512(<16 x float> %a, <16 x float> %b,
define <16 x float> @test_mask_or_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_or_ps_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
@@ -875,7 +875,7 @@ define <16 x float> @test_mask_or_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_
define <16 x float> @test_mask_or_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x56,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -887,7 +887,7 @@ define <16 x float> @test_mask_or_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr
define <16 x float> @test_mask_or_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -898,7 +898,7 @@ define <16 x float> @test_mask_or_ps_rmkz_512(<16 x float> %a, <16 x float>* %pt
define <16 x float> @test_mask_or_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_or_ps_rmb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -910,7 +910,7 @@ define <16 x float> @test_mask_or_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
define <16 x float> @test_mask_or_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmbk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x56,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -924,7 +924,7 @@ define <16 x float> @test_mask_or_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <1
define <16 x float> @test_mask_or_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_or_ps_rmbkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x56,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -939,7 +939,7 @@ declare <16 x float> @llvm.x86.avx512.mask.or.ps.512(<16 x float>, <16 x float>,
define <4 x float> @test_mask_xor_ps_rr_128(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_mask_xor_ps_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 -1)
@@ -948,7 +948,7 @@ define <4 x float> @test_mask_xor_ps_rr_128(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -959,7 +959,7 @@ define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4
define <4 x float> @test_mask_xor_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -969,7 +969,7 @@ define <4 x float> @test_mask_xor_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8
define <4 x float> @test_mask_xor_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_xor_ps_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x float>, <4 x float>* %ptr_b
@@ -979,7 +979,7 @@ define <4 x float> @test_mask_xor_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b)
define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -991,7 +991,7 @@ define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b
define <4 x float> @test_mask_xor_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1002,7 +1002,7 @@ define <4 x float> @test_mask_xor_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_
define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_xor_ps_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -1014,7 +1014,7 @@ define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -1028,7 +1028,7 @@ define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1043,7 +1043,7 @@ declare <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float>, <4 x float>, <
define <8 x float> @test_mask_xor_ps_rr_256(<8 x float> %a, <8 x float> %b) {
; CHECK-LABEL: test_mask_xor_ps_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 -1)
@@ -1052,7 +1052,7 @@ define <8 x float> @test_mask_xor_ps_rr_256(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -1063,7 +1063,7 @@ define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8
define <8 x float> @test_mask_xor_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1073,7 +1073,7 @@ define <8 x float> @test_mask_xor_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8
define <8 x float> @test_mask_xor_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_xor_ps_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x float>, <8 x float>* %ptr_b
@@ -1083,7 +1083,7 @@ define <8 x float> @test_mask_xor_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b)
define <8 x float> @test_mask_xor_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -1095,7 +1095,7 @@ define <8 x float> @test_mask_xor_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b
define <8 x float> @test_mask_xor_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1106,7 +1106,7 @@ define <8 x float> @test_mask_xor_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_
define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_xor_ps_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -1118,7 +1118,7 @@ define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -1132,7 +1132,7 @@ define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
define <8 x float> @test_mask_xor_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1147,7 +1147,7 @@ declare <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float>, <8 x float>, <
define <16 x float> @test_mask_xor_ps_rr_512(<16 x float> %a, <16 x float> %b) {
; CHECK-LABEL: test_mask_xor_ps_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> zeroinitializer, i16 -1)
@@ -1156,7 +1156,7 @@ define <16 x float> @test_mask_xor_ps_rr_512(<16 x float> %a, <16 x float> %b) {
define <16 x float> @test_mask_xor_ps_rrk_512(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0xd1]
; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2]
@@ -1167,7 +1167,7 @@ define <16 x float> @test_mask_xor_ps_rrk_512(<16 x float> %a, <16 x float> %b,
define <16 x float> @test_mask_xor_ps_rrkz_512(<16 x float> %a, <16 x float> %b, i16 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1177,7 +1177,7 @@ define <16 x float> @test_mask_xor_ps_rrkz_512(<16 x float> %a, <16 x float> %b,
define <16 x float> @test_mask_xor_ps_rm_512(<16 x float> %a, <16 x float>* %ptr_b) {
; CHECK-LABEL: test_mask_xor_ps_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <16 x float>, <16 x float>* %ptr_b
@@ -1187,7 +1187,7 @@ define <16 x float> @test_mask_xor_ps_rm_512(<16 x float> %a, <16 x float>* %ptr
define <16 x float> @test_mask_xor_ps_rmk_512(<16 x float> %a, <16 x float>* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x57,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -1199,7 +1199,7 @@ define <16 x float> @test_mask_xor_ps_rmk_512(<16 x float> %a, <16 x float>* %pt
define <16 x float> @test_mask_xor_ps_rmkz_512(<16 x float> %a, <16 x float>* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1210,7 +1210,7 @@ define <16 x float> @test_mask_xor_ps_rmkz_512(<16 x float> %a, <16 x float>* %p
define <16 x float> @test_mask_xor_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
; CHECK-LABEL: test_mask_xor_ps_rmb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x58,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_b
@@ -1222,7 +1222,7 @@ define <16 x float> @test_mask_xor_ps_rmb_512(<16 x float> %a, float* %ptr_b) {
define <16 x float> @test_mask_xor_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <16 x float> %passThru, i16 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmbk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x59,0x57,0x0f]
; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
@@ -1236,7 +1236,7 @@ define <16 x float> @test_mask_xor_ps_rmbk_512(<16 x float> %a, float* %ptr_b, <
define <16 x float> @test_mask_xor_ps_rmbkz_512(<16 x float> %a, float* %ptr_b, i16 %mask) {
; CHECK-LABEL: test_mask_xor_ps_rmbkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vxorps (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xd9,0x57,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1251,7 +1251,7 @@ declare <16 x float> @llvm.x86.avx512.mask.xor.ps.512(<16 x float>, <16 x float>
define <8 x i64> @test_mask_mullo_epi64_rr_512(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: test_mask_mullo_epi64_rr_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
@@ -1260,7 +1260,7 @@ define <8 x i64> @test_mask_mullo_epi64_rr_512(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rrk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
@@ -1271,7 +1271,7 @@ define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x
define <8 x i64> @test_mask_mullo_epi64_rrkz_512(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rrkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1281,7 +1281,7 @@ define <8 x i64> @test_mask_mullo_epi64_rrkz_512(<8 x i64> %a, <8 x i64> %b, i8
define <8 x i64> @test_mask_mullo_epi64_rm_512(<8 x i64> %a, <8 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi64_rm_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -1291,7 +1291,7 @@ define <8 x i64> @test_mask_mullo_epi64_rm_512(<8 x i64> %a, <8 x i64>* %ptr_b)
define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
@@ -1303,7 +1303,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b,
define <8 x i64> @test_mask_mullo_epi64_rmkz_512(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1314,7 +1314,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmkz_512(<8 x i64> %a, <8 x i64>* %ptr_b
define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi64_rmb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -1326,7 +1326,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmbk_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
@@ -1340,7 +1340,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x
define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1354,7 +1354,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64>, <8 x i64>, <8 x i
define <4 x i64> @test_mask_mullo_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_mask_mullo_epi64_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
@@ -1363,7 +1363,7 @@ define <4 x i64> @test_mask_mullo_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @test_mask_mullo_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1374,7 +1374,7 @@ define <4 x i64> @test_mask_mullo_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x
define <4 x i64> @test_mask_mullo_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1384,7 +1384,7 @@ define <4 x i64> @test_mask_mullo_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8
define <4 x i64> @test_mask_mullo_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi64_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
@@ -1394,7 +1394,7 @@ define <4 x i64> @test_mask_mullo_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b)
define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1406,7 +1406,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b,
define <4 x i64> @test_mask_mullo_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1417,7 +1417,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b
define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi64_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -1429,7 +1429,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x40,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1443,7 +1443,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x
define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1458,7 +1458,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64>, <4 x i64>, <4 x i
define <2 x i64> @test_mask_mullo_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_mask_mullo_epi64_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
@@ -1467,7 +1467,7 @@ define <2 x i64> @test_mask_mullo_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
define <2 x i64> @test_mask_mullo_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1478,7 +1478,7 @@ define <2 x i64> @test_mask_mullo_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x
define <2 x i64> @test_mask_mullo_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1488,7 +1488,7 @@ define <2 x i64> @test_mask_mullo_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8
define <2 x i64> @test_mask_mullo_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi64_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
@@ -1498,7 +1498,7 @@ define <2 x i64> @test_mask_mullo_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b)
define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1510,7 +1510,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b,
define <2 x i64> @test_mask_mullo_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1521,7 +1521,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b
define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mullo_epi64_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -1533,7 +1533,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x40,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1547,7 +1547,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x
define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmullq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x40,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1564,7 +1564,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32,
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
@@ -1584,7 +1584,7 @@ declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x do
define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
@@ -1604,7 +1604,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i3
define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
@@ -1624,7 +1624,7 @@ declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8)
define <4 x i32>@test_int_x86_avx512_cvtmask2d_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
; CHECK-NEXT: vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1636,7 +1636,7 @@ declare <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8)
define <8 x i32>@test_int_x86_avx512_cvtmask2d_256(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
; CHECK-NEXT: vpmovm2d %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x38,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1648,7 +1648,7 @@ declare <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8)
define <2 x i64>@test_int_x86_avx512_cvtmask2q_128(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
; CHECK-NEXT: vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1660,10 +1660,147 @@ declare <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8)
define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
; CHECK-NEXT: vpmovm2q %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x38,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0)
ret <4 x i64> %res
}
+
+declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xc8,0x01]
+; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9]
+; CHECK-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01]
+; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask)
+ %res3 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <4 x double> %res1, %res2
+ %res5 = fadd <4 x double> %res3, %res4
+ ret <4 x double> %res5
+}
+
+define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vbroadcastf64x2 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1a,0x07]
+; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+
+ %x0 = load <2 x double>, <2 x double>* %x0ptr
+ %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask)
+ ret <4 x double> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xc8,0x01]
+; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9]
+; CHECK-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01]
+; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask)
+ %res3 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i64> %res1, %res2
+ %res5 = add <4 x i64> %res3, %res4
+ ret <4 x i64> %res5
+}
+
+define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vbroadcasti64x2 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x5a,0x07]
+; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,0,1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+
+ %x0 = load <2 x i64>, <2 x i64>* %x0ptr
+ %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask)
+ ret <4 x i64> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res3, %res2
+ ret <8 x float> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vmovq (%rsi), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x16]
+; CHECK-NEXT: ## xmm2 = mem[0],zero
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x6d,0x29,0x38,0xca,0x01]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01]
+; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %y_64 = load i64, i64 * %y_ptr
+ %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0
+ %y = bitcast <2 x i64> %y_v2i64 to <4 x i32>
+ %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %y, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res3, %res2
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8]
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd0]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index 1bfdfd0e634d..f201599c4aa9 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -5,7 +5,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8]
; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0]
@@ -21,7 +21,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8]
; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0]
@@ -37,7 +37,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8]
; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0]
@@ -53,7 +53,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8]
; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0]
@@ -69,7 +69,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8]
; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0]
@@ -85,7 +85,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8]
; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0]
@@ -101,7 +101,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8]
; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0]
@@ -117,7 +117,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8]
; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0]
@@ -133,7 +133,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8]
; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0]
@@ -149,7 +149,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>,
define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8]
; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0]
@@ -165,7 +165,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8]
; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0]
@@ -179,7 +179,7 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x fl
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -199,7 +199,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8]
; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0]
@@ -215,7 +215,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8]
; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0]
@@ -231,7 +231,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8]
; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0]
@@ -247,7 +247,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>,
define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8]
; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0]
@@ -263,7 +263,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8]
; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0]
@@ -279,7 +279,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8]
; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0]
@@ -295,7 +295,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8]
; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0]
@@ -311,7 +311,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8]
; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0]
@@ -327,7 +327,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>
define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8]
; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0]
@@ -343,7 +343,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8]
; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0]
@@ -357,7 +357,7 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x f
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -377,7 +377,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8]
; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0]
@@ -393,7 +393,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8]
; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0]
@@ -409,7 +409,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8]
; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0]
@@ -425,7 +425,7 @@ declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x
define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vreducepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x56,0xc8,0x04]
; CHECK-NEXT: vreducepd $8, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x56,0xc0,0x08]
@@ -441,7 +441,7 @@ declare <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double>, i32, <4 x
define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vreducepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x56,0xc8,0x04]
; CHECK-NEXT: vreducepd $0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x56,0xc0,0x00]
@@ -457,7 +457,7 @@ declare <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float>, i32, <4 x f
define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vreduceps $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x56,0xc8,0x04]
; CHECK-NEXT: vreduceps $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x56,0xc0,0x58]
@@ -473,7 +473,7 @@ declare <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float>, i32, <8 x f
define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vreduceps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x56,0xc8,0x0b]
; CHECK-NEXT: vreduceps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x56,0xc0,0x0b]
@@ -489,7 +489,7 @@ declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x doubl
define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrangepd $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x50,0xd1,0x04]
; CHECK-NEXT: vrangepd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x50,0xc1,0x08]
@@ -505,7 +505,7 @@ declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x doubl
define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrangepd $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x50,0xd1,0x04]
; CHECK-NEXT: vrangepd $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x50,0xc1,0x58]
@@ -521,7 +521,7 @@ declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrangeps $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x50,0xd1,0x04]
; CHECK-NEXT: vrangeps $88, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x50,0xc1,0x58]
@@ -537,7 +537,7 @@ declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrangeps $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x50,0xd1,0x04]
; CHECK-NEXT: vrangeps $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x50,0xc1,0x58]
@@ -553,14 +553,14 @@ declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfpclassps $2, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x66,0xc0,0x02]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vfpclassps $4, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x66,0xc0,0x04]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1)
@@ -572,14 +572,14 @@ declare i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfpclassps $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x66,0xc0,0x02]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vfpclassps $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x66,0xc0,0x04]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1)
@@ -591,14 +591,14 @@ declare i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfpclasspd $4, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x66,0xc0,0x04]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vfpclasspd $2, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x66,0xc0,0x02]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 4, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1)
@@ -610,14 +610,14 @@ declare i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double>, i32, i8)
define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) {
; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfpclasspd $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x66,0xc0,0x02]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vfpclasspd $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x66,0xc0,0x04]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1)
%res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1)
@@ -625,81 +625,14 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) {
ret i8 %res2
}
-declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x19,0xc8]
-; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
-; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
-; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res3, %res2
- ret <8 x float> %res4
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e]
-; CHECK-NEXT: ## ymm1 {%k1} = mem[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0]
-; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc0]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %y_64 = load i64, i64 * %y_ptr
- %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0
- %y = bitcast <2 x i64> %y_v2i64 to <4 x i32>
- %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %y, <8 x i32> %x2, i8 %x3)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %x3)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res3, %res2
- ret <8 x i32> %res4
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
-; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1)
- %res3 = add <4 x i32> %res, %res1
- %res4 = add <4 x i32> %res3, %res2
- ret <4 x i32> %res4
-}
-
declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovd2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
ret i8 %res
@@ -709,10 +642,10 @@ declare i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32>)
define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovd2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0)
ret i8 %res
@@ -722,10 +655,10 @@ declare i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovq2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
ret i8 %res
@@ -735,89 +668,11 @@ declare i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64>)
define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovq2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0)
ret i8 %res
}
-
-declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00]
-; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
-; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00]
-; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1]
-; CHECK-NEXT: vshuff64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc0,0x00]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-
- %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask)
- %res3 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <4 x double> %res1, %res2
- %res5 = fadd <4 x double> %res3, %res4
- ret <4 x double> %res5
-}
-
-define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vmovapd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0f]
-; CHECK-NEXT: vshuff64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x23,0xc1,0x00]
-; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,0,1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-
- %x0 = load <2 x double>, <2 x double>* %x0ptr
- %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask)
- ret <4 x double> %res
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00]
-; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
-; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00]
-; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,0,1]
-; CHECK-NEXT: vshufi64x2 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc0,0x00]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,0,1]
-; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask)
- %res4 = add <4 x i64> %res1, %res2
- %res5 = add <4 x i64> %res3, %res4
- ret <4 x i64> %res5
-}
-
-define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
-; CHECK-NEXT: vshufi64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x43,0xc1,0x00]
-; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,0,1]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-
- %x0 = load <2 x i64>, <2 x i64>* %x0ptr
- %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask)
- ret <4 x i64> %res
-}
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index 0e4922f37bbb..cbb06dfbea68 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -3,7 +3,7 @@
define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
; CHECK-LABEL: test_rsqrt28_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
@@ -12,7 +12,7 @@ define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
; CHECK-LABEL: test1_rsqrt28_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
@@ -24,7 +24,7 @@ define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
; CHECK-LABEL: test2_rsqrt28_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
@@ -35,7 +35,7 @@ define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
; CHECK-LABEL: test3_rsqrt28_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; CHECK-NEXT: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
@@ -46,7 +46,7 @@ define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
; CHECK-LABEL: test4_rsqrt28_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
; CHECK-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; CHECK-NEXT: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
@@ -60,7 +60,7 @@ declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_rcp28_ps_512:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
@@ -70,7 +70,7 @@ declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16,
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_rcp28_pd_512:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
@@ -80,7 +80,7 @@ declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i
define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
; CHECK-LABEL: test_exp2_ps_512:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
@@ -90,7 +90,7 @@ declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i
define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
; CHECK-LABEL: test_exp2_pd_512:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
@@ -100,7 +100,7 @@ declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i3
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
; CHECK-LABEL: test_rsqrt28_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
@@ -110,7 +110,7 @@ declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x f
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
; CHECK-LABEL: test_rcp28_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
@@ -118,9 +118,29 @@ define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
}
declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+define <4 x float> @test_rcp28_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
+; CHECK-LABEL: test_rcp28_ss_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrcp28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcb,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %a1 = load <4 x float>, <4 x float>* %a1ptr
+ %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_rsqrt28_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
+; CHECK-LABEL: test_rsqrt28_ss_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vrsqrt28ss (%rdi), %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0xcd,0x07]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %a1 = load <4 x float>, <4 x float>* %a1ptr
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_ss_maskz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
@@ -130,7 +150,7 @@ define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_ss_mask:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
@@ -139,9 +159,33 @@ define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x
ret <4 x float> %res
}
+define <2 x double> @test_rcp28_sd_mask_load(<2 x double> %a0, <2 x double>* %a1ptr, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_rcp28_sd_mask_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vrcp28sd %xmm0, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xcb,0xc8]
+; CHECK-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %a1 = load <2 x double>, <2 x double>* %a1ptr
+ %res = call <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> %a2, i8 %mask, i32 4) ;
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.rcp28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_rsqrt28_sd_maskz_load(<2 x double> %a0, <2 x double>* %a1ptr, i8 %mask) {
+; CHECK-LABEL: test_rsqrt28_sd_maskz_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %a1 = load <2 x double>, <2 x double>* %a1ptr
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
+ ret <2 x double> %res
+}
+
define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
; CHECK-NEXT: retq # encoding: [0xc3]
@@ -151,7 +195,7 @@ define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_mask:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
@@ -164,7 +208,7 @@ declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2
define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
; CHECK-NEXT: retq # encoding: [0xc3]
@@ -176,7 +220,7 @@ define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i
define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) {
; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
; CHECK-NEXT: retq # encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512f-vec-test-testn.ll b/test/CodeGen/X86/avx512f-vec-test-testn.ll
new file mode 100644
index 000000000000..731f5ffa2cad
--- /dev/null
+++ b/test/CodeGen/X86/avx512f-vec-test-testn.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm512_test_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_test_epi64_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp ne <8 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast <8 x i1> %0 to i8
+ ret i8 %1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm512_test_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_test_epi32_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <16 x i32>
+ %1 = icmp ne <16 x i32> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm512_mask_test_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_test_epi64_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp ne <8 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = and <8 x i1> %0, %1
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm512_mask_test_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_test_epi32_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestmd %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <16 x i32>
+ %1 = icmp ne <16 x i32> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_testn_epi64_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp eq <8 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast <8 x i1> %0 to i8
+ ret i8 %1
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_testn_epi32_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <16 x i32>
+ %1 = icmp eq <16 x i32> %0, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ ret i16 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm512_mask_testn_epi64_mask(i8 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_testn_epi64_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmq %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = icmp eq <8 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = and <8 x i1> %0, %1
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i16 @TEST_mm512_mask_testn_epi32_mask(i16 %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: TEST_mm512_mask_testn_epi32_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vptestnmd %zmm0, %zmm1, %k0 {%k1}
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %and.i.i = and <8 x i64> %__B, %__A
+ %0 = bitcast <8 x i64> %and.i.i to <16 x i32>
+ %1 = icmp eq <16 x i32> %0, zeroinitializer
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = and <16 x i1> %1, %2
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 9659dc6d455a..1217138b2264 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -5,13 +5,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
@@ -33,13 +33,13 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
@@ -61,13 +61,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
@@ -89,13 +89,13 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>,
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
-; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
@@ -112,3 +112,155 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x2load = load i64, i64* %x2ptr
+ %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
+ %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0
+; CHECK-NEXT: retq
+
+ %x1load = load i64, i64* %x1ptr
+ %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
+ %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x2load = load i64, i64* %x2ptr
+ %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
+ %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+
+ %x1load = load i64, i64* %x1ptr
+ %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
+ %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x2 = load <8 x i64>, <8 x i64>* %x2ptr
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x2load = load i64, i64* %x2ptr
+ %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
+ %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x1 = load <8 x i64>, <8 x i64>* %x1ptr
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vpmadd52huq (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+
+ %x1load = load i64, i64* %x1ptr
+ %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
+ %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index b2fe6eba88ab..40312c9f5248 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -1,12 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
@@ -34,13 +33,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
@@ -62,7 +61,7 @@ declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>,
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
@@ -90,13 +89,13 @@ declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
@@ -118,7 +117,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
@@ -146,13 +145,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
@@ -174,7 +173,7 @@ declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>,
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
@@ -202,13 +201,13 @@ declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>,
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
index c98c7239b425..25f62a497844 100644
--- a/test/CodeGen/X86/avx512vbmi-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmi-intrinsics.ll
@@ -4,7 +4,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64
define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1}
; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm3 {%k1} {z}
@@ -24,7 +24,7 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z}
@@ -44,12 +44,12 @@ declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z}
; CHECK-NEXT: vpaddb %zmm1, %zmm4, %zmm0
; CHECK-NEXT: vpaddb %zmm0, %zmm3, %zmm0
@@ -66,12 +66,12 @@ declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1}
; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1
-; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z}
; CHECK-NEXT: vpaddb %zmm1, %zmm4, %zmm0
; CHECK-NEXT: vpaddb %zmm0, %zmm3, %zmm0
@@ -88,7 +88,7 @@ declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>,
define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovq %rdi, %k1
; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 {%k1} {z}
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vbmi2-intrinsics.ll b/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
new file mode 100644
index 000000000000..7d307a828673
--- /dev/null
+++ b/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
@@ -0,0 +1,327 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vbmi2 | FileCheck %s
+
+define <32 x i16> @test_expand_load_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_expand_load_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpexpandw (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
+ ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
+
+define void @test_compress_store_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_compress_store_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcompressw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
+ ret void
+}
+declare void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
+
+define <64 x i8> @test_expand_load_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_expand_load_b_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vpexpandb (%rdi), %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+
+define void @test_compress_store_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_compress_store_b_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vpcompressb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+ ret void
+}
+declare void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+
+define <32 x i16> @test_compress_w_512(<32 x i16> %data, <32 x i16> %src, i32 %mask) {
+; CHECK-LABEL: test_compress_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcompressw %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16> %data, <32 x i16> %src, i32 %mask)
+ ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16>, <32 x i16>, i32)
+
+define <64 x i8> @test_compress_b_512(<64 x i8> %data, <64 x i8> %src, i64 %mask) {
+; CHECK-LABEL: test_compress_b_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpcompressb %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8> %data, <64 x i8> %src, i64 %mask)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8>, <64 x i8>, i64)
+
+define <32 x i16> @test_expand_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_expand_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpexpandw %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16> %data, <32 x i16> zeroinitializer, i32 %mask)
+ ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16>, <32 x i16>, i32)
+
+define <64 x i8> @test_expand_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_expand_b_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rsi, %k1
+; CHECK-NEXT: vpexpandb %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8> zeroinitializer, i64 %mask)
+ ret <64 x i8> %res
+}
+declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8>, <64 x i8>, i64)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshld_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldd $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpshldd $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.vpshld.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshld_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldq $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpshldq $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.vpshld.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshld_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldw $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpshldw $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpshld.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshrd_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdd $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpshrdd $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+ %res2 = add <16 x i32> %res, %res1
+ ret <16 x i32> %res2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.vpshrd.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshrd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdq $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpshrdq $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+ %res2 = add <8 x i64> %res, %res1
+ ret <8 x i64> %res2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.vpshrd.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshrd_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x3, i32 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdw $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpshrdw $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0
+; CHECK-NEXT: retq
+ %res = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 %x4)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 22, <32 x i16> %x3, i32 -1)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpshrd.w.512(<32 x i16>, <32 x i16>, i32, <32 x i16>, i32)
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpshrdv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshrdv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpshrdvd (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpshrdvd %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpshrdvd %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpshrdv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshrdv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpshrdv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.maskz.vpshrdv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshrdv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2p, <8 x i64> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpshrdvq (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpshrdvq %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpshrdvq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i64>, <8 x i64>* %x2p
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpshrdv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshrdv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x4, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpshrdv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x4, i8 %x3)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.vpshrdv.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.maskz.vpshrdv.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshrdv_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16>* %x2p, <32 x i16> %x4, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpshrdvw (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpshrdvw %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpshrdvw %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vpaddw %zmm4, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <32 x i16>, <32 x i16>* %x2p
+ %res = call <32 x i16> @llvm.x86.avx512.mask.vpshrdv.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshrdv.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x4, i32 -1)
+ %res2 = call <32 x i16> @llvm.x86.avx512.maskz.vpshrdv.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x4, i32 %x3)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpshldv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpshldv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpshldv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpshldvd (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpshldvd %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpshldv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpshldv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.maskz.vpshldv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpshldv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.maskz.vpshldv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpshldv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2p, <8 x i64> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpshldvq (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpshldvq %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpshldvq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i64>, <8 x i64>* %x2p
+ %res = call <8 x i64> @llvm.x86.avx512.mask.vpshldv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpshldv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x4, i8 -1)
+ %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpshldv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x4, i8 %x3)
+ %res3 = add <8 x i64> %res, %res1
+ %res4 = add <8 x i64> %res2, %res3
+ ret <8 x i64> %res4
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.vpshldv.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.maskz.vpshldv.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpshldv_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16>* %x2p, <32 x i16> %x4, i32 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_w_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpshldvw (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT: vpshldvw %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vpshldvw %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vpaddw %zmm4, %zmm0, %zmm0
+; CHECK-NEXT: vpaddw %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <32 x i16>, <32 x i16>* %x2p
+ %res = call <32 x i16> @llvm.x86.avx512.mask.vpshldv.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpshldv.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x4, i32 -1)
+ %res2 = call <32 x i16> @llvm.x86.avx512.maskz.vpshldv.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x4, i32 %x3)
+ %res3 = add <32 x i16> %res, %res1
+ %res4 = add <32 x i16> %res2, %res3
+ ret <32 x i16> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll b/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
new file mode 100644
index 000000000000..800568b9ff37
--- /dev/null
+++ b/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
@@ -0,0 +1,657 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl,+avx512vbmi2 | FileCheck %s
+
+define <16 x i16> @test_compress_w_256(<16 x i16> %src, <16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_compress_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcompressw %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x i16> %src, i16 %mask)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
+
+define <8 x i16> @test_compress_w_128(<8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_compress_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcompressw %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %data, <8 x i16> zeroinitializer, i8 %mask)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
+
+define <32 x i8> @test_compress_b_256(<32 x i8> %src, <32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_compress_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcompressb %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8> %src, i32 %mask)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_compress_b_128(<16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_compress_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpcompressb %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %data, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @test_expand_b_256(<32 x i8> %data, <32 x i8> %src, i32 %mask) {
+; CHECK-LABEL: test_expand_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpexpandb %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %data, <32 x i8> %src, i32 %mask)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
+
+define <16 x i8> @test_expand_b_128(<16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_expand_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpexpandb %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8> zeroinitializer, i16 %mask)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
+
+define <16 x i16> @test_expand_w_256(<16 x i16> %data, <16 x i16> %src, i16 %mask) {
+; CHECK-LABEL: test_expand_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpexpandw %ymm0, %ymm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %data, <16 x i16> %src, i16 %mask)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
+
+define <8 x i16> @test_expand_w_128(<8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_expand_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpexpandw %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %data, <8 x i16> zeroinitializer, i8 %mask)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @test_expand_load_w_256(i8* %addr, <16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_expand_load_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpexpandw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+
+define <8 x i16> @test_expand_load_w_128(i8* %addr, <8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_expand_load_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpexpandw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+
+define void @test_compress_store_w_256(i8* %addr, <16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_compress_store_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcompressw %ymm0, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.compress.store.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+ ret void
+}
+declare void @llvm.x86.avx512.mask.compress.store.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+
+define void @test_compress_store_w_128(i8* %addr, <8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_compress_store_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcompressw %xmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+ ret void
+}
+declare void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+
+define <32 x i8> @test_expand_load_b_256(i8* %addr, <32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_expand_load_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpexpandb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
+
+define <16 x i8> @test_expand_load_b_128(i8* %addr, <16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_expand_load_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpexpandb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+
+define void @test_compress_store_b_256(i8* %addr, <32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_compress_store_b_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcompressb %ymm0, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
+ ret void
+}
+declare void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
+
+define void @test_compress_store_b_128(i8* %addr, <16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_compress_store_b_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vpcompressb %xmm0, (%rdi) {%k1}
+; CHECK-NEXT: retq
+ call void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+ ret void
+}
+declare void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer, i8 %x4)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+declare <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpshld_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+declare <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpshld_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpshld_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpshld_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpshldw $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+declare <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpshld_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpshldw $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+declare <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpshrd_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: vpaddd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> zeroinitializer, i8 %x4)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res3, %res2
+ ret <4 x i32> %res4
+}
+declare <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpshrd_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+declare <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpshrd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpshrd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpshrd_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdw $22, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vpshrdw $22, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 %x4)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 22, <8 x i16> %x3, i8 -1)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+declare <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16>, <8 x i16>, i32, <8 x i16>, i8)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpshrd_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x3, i16 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshrdw $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vpshrdw $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 %x4)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 22, <16 x i16> %x3, i16 -1)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+declare <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16>, <16 x i16>, i32, <16 x i16>, i16)
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpshrdv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpshrdv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpshrdvd (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpshrdvd %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpshrdvd %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpshrdv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrdv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpshrdv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpshrdv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpshrdvd (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpshrdvd %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpshrdvd %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpshrdv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrdv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.maskz.vpshrdv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpshrdv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.maskz.vpshrdv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpshrdv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64>* %x2p, <4 x i64> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpshrdvq (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpshrdvq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpshrdvq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i64>, <4 x i64>* %x2p
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpshrdv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrdv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x4, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpshrdv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x4, i8 %x3)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpshrdv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.maskz.vpshrdv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpshrdv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>* %x2p, <2 x i64> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpshrdvq (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpshrdvq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpshrdvq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <2 x i64>, <2 x i64>* %x2p
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpshrdv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrdv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x4, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpshrdv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x4, i8 %x3)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpshrdv.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.maskz.vpshrdv.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpshrdv_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16>* %x2p, <16 x i16> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpshrdvw (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpshrdvw %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpshrdvw %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddw %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i16>, <16 x i16>* %x2p
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vpshrdv.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrdv.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x4, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.maskz.vpshrdv.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x4, i16 %x3)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpshrdv.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.maskz.vpshrdv.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpshrdv_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16>* %x2p, <8 x i16> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshrdv_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpshrdvw (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpshrdvw %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpshrdvw %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddw %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i16>, <8 x i16>* %x2p
+ %res = call <8 x i16> @llvm.x86.avx512.mask.vpshrdv.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrdv.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x4, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.maskz.vpshrdv.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x4, i8 %x3)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpshldv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.maskz.vpshldv.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpshldv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpshldvd (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpshldvd %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpshldvd %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpshldv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshldv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.maskz.vpshldv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpshldv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.maskz.vpshldv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpshldv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpshldvd (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpshldvd %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpshldvd %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpshldv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshldv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.maskz.vpshldv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.vpshldv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.maskz.vpshldv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vpshldv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64>* %x2p, <4 x i64> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpshldvq (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpshldvq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpshldvq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddq %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i64>, <4 x i64>* %x2p
+ %res = call <4 x i64> @llvm.x86.avx512.mask.vpshldv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshldv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x4, i8 -1)
+ %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpshldv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x4, i8 %x3)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.vpshldv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.maskz.vpshldv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vpshldv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64>* %x2p, <2 x i64> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpshldvq (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpshldvq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpshldvq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddq %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <2 x i64>, <2 x i64>* %x2p
+ %res = call <2 x i64> @llvm.x86.avx512.mask.vpshldv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshldv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x4, i8 -1)
+ %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpshldv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x4, i8 %x3)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpshldv.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx512.maskz.vpshldv.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpshldv_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16>* %x2p, <16 x i16> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_w_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpshldvw (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovdqa %ymm0, %ymm4
+; CHECK-NEXT: vpshldvw %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpshldvw %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddw %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i16>, <16 x i16>* %x2p
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vpshldv.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshldv.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x4, i16 -1)
+ %res2 = call <16 x i16> @llvm.x86.avx512.maskz.vpshldv.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x4, i16 %x3)
+ %res3 = add <16 x i16> %res, %res1
+ %res4 = add <16 x i16> %res2, %res3
+ ret <16 x i16> %res4
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpshldv.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.avx512.maskz.vpshldv.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpshldv_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16>* %x2p, <8 x i16> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpshldv_w_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3
+; CHECK-NEXT: vpshldvw (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovdqa %xmm0, %xmm4
+; CHECK-NEXT: vpshldvw %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpshldvw %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddw %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i16>, <8 x i16>* %x2p
+ %res = call <8 x i16> @llvm.x86.avx512.mask.vpshldv.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshldv.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x4, i8 -1)
+ %res2 = call <8 x i16> @llvm.x86.avx512.maskz.vpshldv.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x4, i8 %x3)
+ %res3 = add <8 x i16> %res, %res1
+ %res4 = add <8 x i16> %res2, %res3
+ ret <8 x i16> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
index 22edbcc8e157..bb15ed190dd1 100644
--- a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl -mattr=+avx512vbmi --show-mc-encoding | FileCheck %s
+
declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8]
; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0]
@@ -25,7 +25,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32
define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0]
; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xd8]
@@ -45,7 +45,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9]
; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
@@ -65,7 +65,7 @@ declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1]
; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xd9]
@@ -85,7 +85,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x75,0xda]
@@ -107,12 +107,12 @@ declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x75,0xda]
; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x75,0xca]
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2]
; CHECK-NEXT: vpaddb %ymm1, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc1]
; CHECK-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfc,0xc0]
@@ -129,7 +129,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda]
@@ -151,12 +151,12 @@ declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xda]
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xca]
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2]
; CHECK-NEXT: vpaddb %ymm1, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc1]
; CHECK-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfc,0xc0]
@@ -173,7 +173,7 @@ declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x75,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -185,7 +185,7 @@ declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>,
define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x75,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/avx512vl-arith.ll b/test/CodeGen/X86/avx512vl-arith.ll
index 248a68245528..beaefe92aac2 100644
--- a/test/CodeGen/X86/avx512vl-arith.ll
+++ b/test/CodeGen/X86/avx512vl-arith.ll
@@ -5,7 +5,7 @@
define <4 x i64> @vpaddq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpaddq256_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = add <4 x i64> %i, %j
@@ -14,7 +14,7 @@ define <4 x i64> @vpaddq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <4 x i64> @vpaddq256_fold_test(<4 x i64> %i, <4 x i64>* %j) nounwind {
; CHECK-LABEL: vpaddq256_fold_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load <4 x i64>, <4 x i64>* %j, align 4
@@ -24,7 +24,7 @@ define <4 x i64> @vpaddq256_fold_test(<4 x i64> %i, <4 x i64>* %j) nounwind {
define <4 x i64> @vpaddq256_broadcast_test(<4 x i64> %i) nounwind {
; CHECK-LABEL: vpaddq256_broadcast_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xd4,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI2_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -34,7 +34,7 @@ define <4 x i64> @vpaddq256_broadcast_test(<4 x i64> %i) nounwind {
define <4 x i64> @vpaddq256_broadcast2_test(<4 x i64> %i, i64* %j.ptr) nounwind {
; CHECK-LABEL: vpaddq256_broadcast2_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xd4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%j = load i64, i64* %j.ptr
@@ -46,7 +46,7 @@ define <4 x i64> @vpaddq256_broadcast2_test(<4 x i64> %i, i64* %j.ptr) nounwind
define <8 x i32> @vpaddd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpaddd256_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = add <8 x i32> %i, %j
@@ -55,7 +55,7 @@ define <8 x i32> @vpaddd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <8 x i32> @vpaddd256_fold_test(<8 x i32> %i, <8 x i32>* %j) nounwind {
; CHECK-LABEL: vpaddd256_fold_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load <8 x i32>, <8 x i32>* %j, align 4
@@ -65,7 +65,7 @@ define <8 x i32> @vpaddd256_fold_test(<8 x i32> %i, <8 x i32>* %j) nounwind {
define <8 x i32> @vpaddd256_broadcast_test(<8 x i32> %i) nounwind {
; CHECK-LABEL: vpaddd256_broadcast_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI6_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -75,8 +75,8 @@ define <8 x i32> @vpaddd256_broadcast_test(<8 x i32> %i) nounwind {
define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_mask_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm3, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xef,0xdb]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -88,8 +88,8 @@ define <8 x i32> @vpaddd256_mask_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mas
define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_maskz_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm3, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xef,0xdb]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x28,0x1f,0xcb,0x04]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -101,8 +101,8 @@ define <8 x i32> @vpaddd256_maskz_test(<8 x i32> %i, <8 x i32> %j, <8 x i32> %ma
define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_mask_fold_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -115,8 +115,8 @@ define <8 x i32> @vpaddd256_mask_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x
define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_mask_broadcast_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI10_0-4, kind: reloc_riprel_4byte
@@ -129,8 +129,8 @@ define <8 x i32> @vpaddd256_mask_broadcast_test(<8 x i32> %i, <8 x i32> %mask1)
define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_maskz_fold_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -143,8 +143,8 @@ define <8 x i32> @vpaddd256_maskz_fold_test(<8 x i32> %i, <8 x i32>* %j.ptr, <8
define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd256_maskz_broadcast_test:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI12_0-4, kind: reloc_riprel_4byte
@@ -157,7 +157,7 @@ define <8 x i32> @vpaddd256_maskz_broadcast_test(<8 x i32> %i, <8 x i32> %mask1)
define <4 x i64> @vpsubq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpsubq256_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = sub <4 x i64> %i, %j
@@ -166,7 +166,7 @@ define <4 x i64> @vpsubq256_test(<4 x i64> %i, <4 x i64> %j) nounwind readnone {
define <8 x i32> @vpsubd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpsubd256_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = sub <8 x i32> %i, %j
@@ -175,7 +175,7 @@ define <8 x i32> @vpsubd256_test(<8 x i32> %i, <8 x i32> %j) nounwind readnone {
define <8 x i32> @vpmulld256_test(<8 x i32> %i, <8 x i32> %j) {
; CHECK-LABEL: vpmulld256_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = mul <8 x i32> %i, %j
@@ -184,7 +184,7 @@ define <8 x i32> @vpmulld256_test(<8 x i32> %i, <8 x i32> %j) {
define <4 x double> @test_vaddpd_256(<4 x double> %y, <4 x double> %x) {
; CHECK-LABEL: test_vaddpd_256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
entry:
@@ -194,7 +194,7 @@ entry:
define <4 x double> @test_fold_vaddpd_256(<4 x double> %y) {
; CHECK-LABEL: test_fold_vaddpd_256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI17_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -205,7 +205,7 @@ entry:
define <8 x float> @test_broadcast_vaddpd_256(<8 x float> %a) nounwind {
; CHECK-LABEL: test_broadcast_vaddpd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps {{.*}}(%rip){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x38,0x58,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI18_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -215,8 +215,8 @@ define <8 x float> @test_broadcast_vaddpd_256(<8 x float> %a) nounwind {
define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -228,8 +228,8 @@ define <8 x float> @test_mask_vaddps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vmulps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x59,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -241,8 +241,8 @@ define <8 x float> @test_mask_vmulps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1)nounwind readnone {
; CHECK-LABEL: test_mask_vminps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vminps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5d,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -255,8 +255,8 @@ define <8 x float> @test_mask_vminps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vmaxps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -269,8 +269,8 @@ define <8 x float> @test_mask_vmaxps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vsubps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5c,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -282,8 +282,8 @@ define <8 x float> @test_mask_vsubps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x float> %j, <8 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivps_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0x65,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vdivps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0x5e,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -295,8 +295,8 @@ define <8 x float> @test_mask_vdivps_256(<8 x float> %dst, <8 x float> %i, <8 x
define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vmulpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x59,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -308,8 +308,8 @@ define <4 x double> @test_mask_vmulpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vminpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5d,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -322,8 +322,8 @@ define <4 x double> @test_mask_vminpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vmaxpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5f,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -336,8 +336,8 @@ define <4 x double> @test_mask_vmaxpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vsubpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5c,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -349,8 +349,8 @@ define <4 x double> @test_mask_vsubpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vdivpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x5e,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -362,8 +362,8 @@ define <4 x double> @test_mask_vdivpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %ymm4, %ymm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x28,0x1f,0xcc,0x04]
; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -375,8 +375,8 @@ define <4 x double> @test_mask_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4
define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_maskz_vaddpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm3, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xef,0xdb]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04]
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -388,8 +388,8 @@ define <4 x double> @test_maskz_vaddpd_256(<4 x double> %i, <4 x double> %j, <4
define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_fold_vaddpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm3, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xef,0xdb]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqq %ymm3, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xcb,0x04]
; CHECK-NEXT: vaddpd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -402,8 +402,8 @@ define <4 x double> @test_mask_fold_vaddpd_256(<4 x double> %dst, <4 x double> %
define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_fold_vaddpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vaddpd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -416,7 +416,7 @@ define <4 x double> @test_maskz_fold_vaddpd_256(<4 x double> %i, <4 x double>* %
define <4 x double> @test_broadcast2_vaddpd_256(<4 x double> %i, double* %j) nounwind {
; CHECK-LABEL: test_broadcast2_vaddpd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load double, double* %j
@@ -428,8 +428,8 @@ define <4 x double> @test_broadcast2_vaddpd_256(<4 x double> %i, double* %j) nou
define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_broadcast_vaddpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm0, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0xc0]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
; CHECK-NEXT: vpcmpneqq %ymm0, %ymm2, %k1 ## encoding: [0x62,0xf3,0xed,0x28,0x1f,0xc8,0x04]
; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm1, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x39,0x58,0x0f]
; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
@@ -445,8 +445,8 @@ define <4 x double> @test_mask_broadcast_vaddpd_256(<4 x double> %dst, <4 x doub
define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j, <4 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_broadcast_vaddpd_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vaddpd (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -463,7 +463,7 @@ define <4 x double> @test_maskz_broadcast_vaddpd_256(<4 x double> %i, double* %j
define <2 x i64> @vpaddq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpaddq128_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = add <2 x i64> %i, %j
@@ -472,7 +472,7 @@ define <2 x i64> @vpaddq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
define <2 x i64> @vpaddq128_fold_test(<2 x i64> %i, <2 x i64>* %j) nounwind {
; CHECK-LABEL: vpaddq128_fold_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load <2 x i64>, <2 x i64>* %j, align 4
@@ -482,7 +482,7 @@ define <2 x i64> @vpaddq128_fold_test(<2 x i64> %i, <2 x i64>* %j) nounwind {
define <2 x i64> @vpaddq128_broadcast2_test(<2 x i64> %i, i64* %j) nounwind {
; CHECK-LABEL: vpaddq128_broadcast2_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xd4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load i64, i64* %j
@@ -494,7 +494,7 @@ define <2 x i64> @vpaddq128_broadcast2_test(<2 x i64> %i, i64* %j) nounwind {
define <4 x i32> @vpaddd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpaddd128_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = add <4 x i32> %i, %j
@@ -503,7 +503,7 @@ define <4 x i32> @vpaddd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
define <4 x i32> @vpaddd128_fold_test(<4 x i32> %i, <4 x i32>* %j) nounwind {
; CHECK-LABEL: vpaddd128_fold_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load <4 x i32>, <4 x i32>* %j, align 4
@@ -513,7 +513,7 @@ define <4 x i32> @vpaddd128_fold_test(<4 x i32> %i, <4 x i32>* %j) nounwind {
define <4 x i32> @vpaddd128_broadcast_test(<4 x i32> %i) nounwind {
; CHECK-LABEL: vpaddd128_broadcast_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI42_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -523,7 +523,7 @@ define <4 x i32> @vpaddd128_broadcast_test(<4 x i32> %i) nounwind {
define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_mask_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xc1]
@@ -536,7 +536,7 @@ define <4 x i32> @vpaddd128_mask_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mas
define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_maskz_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x08,0x1f,0xcb,0x04]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
@@ -549,7 +549,7 @@ define <4 x i32> @vpaddd128_maskz_test(<4 x i32> %i, <4 x i32> %j, <4 x i32> %ma
define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_mask_fold_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x07]
@@ -563,7 +563,7 @@ define <4 x i32> @vpaddd128_mask_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x
define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_mask_broadcast_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x05,A,A,A,A]
@@ -577,7 +577,7 @@ define <4 x i32> @vpaddd128_mask_broadcast_test(<4 x i32> %i, <4 x i32> %mask1)
define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_maskz_fold_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
@@ -591,7 +591,7 @@ define <4 x i32> @vpaddd128_maskz_fold_test(<4 x i32> %i, <4 x i32>* %j.ptr, <4
define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: vpaddd128_maskz_broadcast_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x05,A,A,A,A]
@@ -605,7 +605,7 @@ define <4 x i32> @vpaddd128_maskz_broadcast_test(<4 x i32> %i, <4 x i32> %mask1)
define <2 x i64> @vpsubq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
; CHECK-LABEL: vpsubq128_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = sub <2 x i64> %i, %j
@@ -614,7 +614,7 @@ define <2 x i64> @vpsubq128_test(<2 x i64> %i, <2 x i64> %j) nounwind readnone {
define <4 x i32> @vpsubd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
; CHECK-LABEL: vpsubd128_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = sub <4 x i32> %i, %j
@@ -623,7 +623,7 @@ define <4 x i32> @vpsubd128_test(<4 x i32> %i, <4 x i32> %j) nounwind readnone {
define <4 x i32> @vpmulld128_test(<4 x i32> %i, <4 x i32> %j) {
; CHECK-LABEL: vpmulld128_test:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x40,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%x = mul <4 x i32> %i, %j
@@ -632,7 +632,7 @@ define <4 x i32> @vpmulld128_test(<4 x i32> %i, <4 x i32> %j) {
define <2 x double> @test_vaddpd_128(<2 x double> %y, <2 x double> %x) {
; CHECK-LABEL: test_vaddpd_128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
entry:
@@ -642,7 +642,7 @@ entry:
define <2 x double> @test_fold_vaddpd_128(<2 x double> %y) {
; CHECK-LABEL: test_fold_vaddpd_128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddpd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI53_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -653,7 +653,7 @@ entry:
define <4 x float> @test_broadcast_vaddpd_128(<4 x float> %a) nounwind {
; CHECK-LABEL: test_broadcast_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x18,0x58,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI54_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -663,7 +663,7 @@ define <4 x float> @test_broadcast_vaddpd_128(<4 x float> %a) nounwind {
define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x58,0xc2]
@@ -676,7 +676,7 @@ define <4 x float> @test_mask_vaddps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vmulps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x59,0xc2]
@@ -689,7 +689,7 @@ define <4 x float> @test_mask_vmulps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vminps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5d,0xc2]
@@ -703,7 +703,7 @@ define <4 x float> @test_mask_vminps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vmaxps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5f,0xc2]
@@ -717,7 +717,7 @@ define <4 x float> @test_mask_vmaxps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vsubps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5c,0xc2]
@@ -731,7 +731,7 @@ define <4 x float> @test_mask_vsubps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x float> %j, <4 x i32> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqd %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0x65,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vdivps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x74,0x09,0x5e,0xc2]
@@ -744,7 +744,7 @@ define <4 x float> @test_mask_vdivps_128(<4 x float> %dst, <4 x float> %i, <4 x
define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmulpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vmulpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x59,0xc2]
@@ -757,7 +757,7 @@ define <2 x double> @test_mask_vmulpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vminpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vminpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5d,0xc2]
@@ -771,7 +771,7 @@ define <2 x double> @test_mask_vminpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vmaxpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vmaxpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5f,0xc2]
@@ -785,7 +785,7 @@ define <2 x double> @test_mask_vmaxpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vsubpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vsubpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5c,0xc2]
@@ -798,7 +798,7 @@ define <2 x double> @test_mask_vsubpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vdivpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vdivpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x5e,0xc2]
@@ -811,7 +811,7 @@ define <2 x double> @test_mask_vdivpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double> %j, <2 x i64> %mask1) nounwind readnone {
; CHECK-LABEL: test_mask_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vpcmpneqq %xmm4, %xmm3, %k1 ## encoding: [0x62,0xf3,0xe5,0x08,0x1f,0xcc,0x04]
; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0xc2]
@@ -824,7 +824,7 @@ define <2 x double> @test_mask_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2
define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j,
; CHECK-LABEL: test_maskz_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0xc1]
@@ -838,7 +838,7 @@ define <2 x double> @test_maskz_vaddpd_128(<2 x double> %i, <2 x double> %j,
define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_fold_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xef,0xdb]
; CHECK-NEXT: vpcmpneqq %xmm3, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xcb,0x04]
; CHECK-NEXT: vaddpd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x58,0x07]
@@ -852,7 +852,7 @@ define <2 x double> @test_mask_fold_vaddpd_128(<2 x double> %dst, <2 x double> %
define <2 x double> @test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_fold_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vaddpd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x58,0x07]
@@ -866,7 +866,7 @@ define <2 x double> @test_maskz_fold_vaddpd_128(<2 x double> %i, <2 x double>* %
define <2 x double> @test_broadcast2_vaddpd_128(<2 x double> %i, double* %j) nounwind {
; CHECK-LABEL: test_broadcast2_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0x58,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%tmp = load double, double* %j
@@ -878,7 +878,7 @@ define <2 x double> @test_broadcast2_vaddpd_128(<2 x double> %i, double* %j) nou
define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_mask_broadcast_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
; CHECK-NEXT: vpcmpneqq %xmm0, %xmm2, %k1 ## encoding: [0x62,0xf3,0xed,0x08,0x1f,0xc8,0x04]
; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm1, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x19,0x58,0x0f]
@@ -895,7 +895,7 @@ define <2 x double> @test_mask_broadcast_vaddpd_128(<2 x double> %dst, <2 x doub
define <2 x double> @test_maskz_broadcast_vaddpd_128(<2 x double> %i, double* %j, <2 x i64> %mask1) nounwind {
; CHECK-LABEL: test_maskz_broadcast_vaddpd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0x58,0x07]
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index 530e2c544cfb..fdd6f7126457 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -4,14 +4,862 @@
; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c
+define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
+; X32-LABEL: test_mm256_shuffle_f32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_f32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x float> %shuffle
+}
+
+define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
+; X32-LABEL: test_mm256_mask_shuffle_f32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_f32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
+ ret <8 x float> %1
+}
+
+define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
+; X32-LABEL: test_mm256_maskz_shuffle_f32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
+ ret <8 x float> %1
+}
+
+define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
+; X32-LABEL: test_mm256_shuffle_f64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_f64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x double> %shuffle
+}
+
+define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
+; X32-LABEL: test_mm256_mask_shuffle_f64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_f64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
+ ret <4 x double> %1
+}
+
+define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
+; X32-LABEL: test_mm256_maskz_shuffle_f64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
+ ret <4 x double> %1
+}
+
+define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_shuffle_i32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_i32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_shuffle_i32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_i32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast <4 x i64> %shuffle to <8 x i32>
+ %1 = bitcast <4 x i64> %__W to <8 x i32>
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
+ %4 = bitcast <8 x i32> %3 to <4 x i64>
+ ret <4 x i64> %4
+}
+
+define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_maskz_shuffle_i32x4:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast <4 x i64> %shuffle to <8 x i32>
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
+ %3 = bitcast <8 x i32> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_shuffle_i64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_shuffle_i64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i64> %shuffle
+}
+
+define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_shuffle_i64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_shuffle_i64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_maskz_shuffle_i64x2:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_shuffle_i64x2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
+; X64-NEXT: retq
+entry:
+ %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ %0 = bitcast i8 %__U to <8 x i1>
+ %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
+ ret <4 x i64> %1
+}
+
+define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_test_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmd %xmm0, %xmm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmd %xmm0, %xmm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp ne <4 x i32> %0, zeroinitializer
+ %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_test_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_test_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp ne <4 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = and <4 x i1> %1, %extract.i
+ %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %5 = bitcast <8 x i1> %4 to i8
+ ret i8 %5
+}
+
+define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_test_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmd %ymm0, %ymm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_test_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmd %ymm0, %ymm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp ne <8 x i32> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_test_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_test_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp ne <8 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_test_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmq %xmm0, %xmm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_test_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmq %xmm0, %xmm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_test_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_test_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = and <2 x i1> %0, %extract.i
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_test_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestmq %ymm0, %ymm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_test_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestmq %ymm0, %ymm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_test_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_test_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = and <4 x i1> %0, %extract.i
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_testn_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmd %xmm0, %xmm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testn_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp eq <4 x i32> %0, zeroinitializer
+ %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_testn_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_testn_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp eq <4 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = and <4 x i1> %1, %extract.i
+ %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %5 = bitcast <8 x i1> %4 to i8
+ ret i8 %5
+}
+
+define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_testn_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmd %ymm0, %ymm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testn_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp eq <8 x i32> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_testn_epi32_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_testn_epi32_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp eq <8 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_testn_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmq %xmm0, %xmm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_testn_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
+; X32-LABEL: test_mm_mask_testn_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_testn_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = and <2 x i1> %0, %extract.i
+ %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_testn_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vptestnmq %ymm0, %ymm1, %k0
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_testn_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
+; X32-LABEL: test_mm256_mask_testn_epi64_mask:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; X32-NEXT: kmovw %k0, %eax
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_testn_epi64_mask:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; X64-NEXT: kmovw %k0, %eax
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = and <4 x i1> %0, %extract.i
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
+; X32-LABEL: test_mm_mask_set1_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_set1_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__O to <4 x i32>
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
+ %3 = bitcast <4 x i32> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+
+define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
+; X32-LABEL: test_mm_maskz_set1_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_set1_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
+; X32-LABEL: test_mm256_mask_set1_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_set1_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__O to <8 x i32>
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
+ %3 = bitcast <8 x i32> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
+; X32-LABEL: test_mm256_maskz_set1_epi32:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_set1_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8 %__M to <8 x i1>
+ %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
+ %2 = bitcast <8 x i32> %1 to <4 x i64>
+ ret <4 x i64> %2
+}
+
+define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm_mask_set1_epi64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: vmovd %eax, %xmm1
+; X32-NEXT: vpbroadcastb %xmm1, %xmm1
+; X32-NEXT: kmovw %ecx, %k1
+; X32-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_set1_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovd %esi, %xmm1
+; X64-NEXT: vpbroadcastb %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %conv.i = trunc i64 %__A to i8
+ %vecinit.i.i = insertelement <16 x i8> undef, i8 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %vecinit15.i.i to <2 x i64>
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__O
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm_maskz_set1_epi64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: vmovd %eax, %xmm0
+; X32-NEXT: vpbroadcastb %xmm0, %xmm0
+; X32-NEXT: kmovw %ecx, %k1
+; X32-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_set1_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vmovd %esi, %xmm0
+; X64-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %conv.i = trunc i64 %__A to i8
+ %vecinit.i.i = insertelement <16 x i8> undef, i8 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i8> %vecinit.i.i, <16 x i8> undef, <16 x i32> zeroinitializer
+ %0 = bitcast <16 x i8> %vecinit15.i.i to <2 x i64>
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+ ret <2 x i64> %2
+}
+
+
+define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm256_mask_set1_epi64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT: kmovw %edx, %k1
+; X32-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_set1_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %0 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
+ ret <4 x i64> %1
+}
+
+define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
+; X32-LABEL: test_mm256_maskz_set1_epi64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: kmovw %edx, %k1
+; X32-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_set1_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ %0 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
+ ret <4 x i64> %1
+}
+
define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -22,9 +870,8 @@ define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -36,7 +883,7 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -55,9 +902,8 @@ define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %a0, i8 %a1, <2 x i64>
define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi1:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -69,7 +915,7 @@ define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -87,12 +933,12 @@ define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -103,14 +949,14 @@ define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
@@ -125,14 +971,14 @@ define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i
define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
@@ -146,12 +992,12 @@ define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -160,9 +1006,8 @@ define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_mask_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi2:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -174,7 +1019,7 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -190,9 +1035,8 @@ define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %a0, i8 %a1, <2 x i64>
define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maskz_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi3:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -204,7 +1048,7 @@ define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -220,12 +1064,12 @@ define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm256_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
@@ -234,9 +1078,8 @@ define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm256_mask_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi4:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -248,7 +1091,7 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -264,9 +1107,8 @@ define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %a0, i8 %a1, <2 x i
define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi5:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -278,7 +1120,7 @@ define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -294,12 +1136,12 @@ define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
@@ -308,9 +1150,8 @@ define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi6:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -322,7 +1163,7 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -338,9 +1179,8 @@ define <2 x double> @test_mm_mask_broadcastsd_pd(<2 x double> %a0, i8 %a1, <2 x
define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi7:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -352,7 +1192,7 @@ define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -368,12 +1208,12 @@ define <2 x double> @test_mm_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; X32-LABEL: test_mm256_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
@@ -382,9 +1222,8 @@ define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm256_mask_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi8:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -396,7 +1235,7 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -412,9 +1251,8 @@ define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %a0, i8 %a1, <2
define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastsd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi9:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -426,7 +1264,7 @@ define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -442,12 +1280,12 @@ define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
@@ -456,9 +1294,8 @@ define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi10:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -470,7 +1307,7 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -486,9 +1323,8 @@ define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %a0, i8 %a1, <4 x fl
define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi11:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -500,7 +1336,7 @@ define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -516,12 +1352,12 @@ define <4 x float> @test_mm_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm256_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
@@ -530,14 +1366,14 @@ define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm256_mask_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
@@ -549,14 +1385,14 @@ define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x
define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm256_maskz_broadcastss_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
@@ -568,12 +1404,12 @@ define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
@@ -582,9 +1418,8 @@ define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_mask_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi12:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -596,7 +1431,7 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -612,9 +1447,8 @@ define <2 x double> @test_mm_mask_movddup_pd(<2 x double> %a0, i8 %a1, <2 x doub
define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_maskz_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi13:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -626,7 +1460,7 @@ define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -642,12 +1476,12 @@ define <2 x double> @test_mm_maskz_movddup_pd(i8 %a0, <2 x double> %a1) {
define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -656,9 +1490,8 @@ define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi14:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -670,7 +1503,7 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -686,9 +1519,8 @@ define <4 x double> @test_mm256_mask_movddup_pd(<4 x double> %a0, i8 %a1, <4 x d
define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_movddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi15:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -700,7 +1532,7 @@ define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -716,12 +1548,12 @@ define <4 x double> @test_mm256_maskz_movddup_pd(i8 %a0, <4 x double> %a1) {
define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -730,9 +1562,8 @@ define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi16:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -744,7 +1575,7 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -760,9 +1591,8 @@ define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %a0, i8 %a1, <4 x float
define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi17:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -774,7 +1604,7 @@ define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -790,12 +1620,12 @@ define <4 x float> @test_mm_maskz_movehdup_ps(i8 %a0, <4 x float> %a1) {
define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -804,14 +1634,14 @@ define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) {
define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
@@ -823,14 +1653,14 @@ define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x fl
define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
@@ -842,12 +1672,12 @@ define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) {
define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -856,9 +1686,8 @@ define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_mask_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi18:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -870,7 +1699,7 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -886,9 +1715,8 @@ define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %a0, i8 %a1, <4 x float
define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_maskz_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi19:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -900,7 +1728,7 @@ define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -916,12 +1744,12 @@ define <4 x float> @test_mm_maskz_moveldup_ps(i8 %a0, <4 x float> %a1) {
define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -930,14 +1758,14 @@ define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) {
define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_mask_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -949,14 +1777,14 @@ define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x fl
define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_maskz_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -968,13 +1796,13 @@ define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) {
define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
; X32-LABEL: test_mm256_permutex_epi64:
-; X32: # BB#0:
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X32: # %bb.0:
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permutex_epi64:
-; X64: # BB#0:
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; X64: # %bb.0:
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT: retq
%res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
ret <4 x i64> %res
@@ -982,9 +1810,8 @@ define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) {
define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_mask_permutex_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi20:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -996,7 +1823,7 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1012,9 +1839,8 @@ define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %a0, i8 %a1, <4 x i64
define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi21:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1026,7 +1852,7 @@ define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1042,12 +1868,12 @@ define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 %a0, <4 x i64> %a1) {
define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_permutex_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permutex_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
@@ -1056,9 +1882,8 @@ define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) {
define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_mask_permutex_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi22:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1070,7 +1895,7 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_permutex_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1086,9 +1911,8 @@ define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %a0, i8 %a1, <4 x
define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_maskz_permutex_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi23:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1100,7 +1924,7 @@ define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_permutex_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1116,12 +1940,12 @@ define <4 x double> @test_mm256_maskz_permutex_pd(i8 %a0, <4 x double> %a1) {
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
@@ -1130,9 +1954,8 @@ define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_mm_mask_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi24:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -1144,7 +1967,7 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1160,9 +1983,8 @@ define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %a0, i8 %a1, <2 x doub
define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi25:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $3, %al
@@ -1174,7 +1996,7 @@ define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x dou
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $3, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1190,12 +2012,12 @@ define <2 x double> @test_mm_maskz_shuffle_pd(i8 %a0, <2 x double> %a1, <2 x dou
define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: test_mm256_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[2],ymm1[2]
; X64-NEXT: retq
%res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
@@ -1204,9 +2026,8 @@ define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) {
define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi26:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1218,7 +2039,7 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1234,9 +2055,8 @@ define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %a0, i8 %a1, <4 x d
define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x double> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi27:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1248,7 +2068,7 @@ define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1264,12 +2084,12 @@ define <4 x double> @test_mm256_maskz_shuffle_pd(i8 %a0, <4 x double> %a1, <4 x
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
@@ -1278,9 +2098,8 @@ define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_mm_mask_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi28:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1292,7 +2111,7 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mask_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1308,9 +2127,8 @@ define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %a0, i8 %a1, <4 x float>
define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_maskz_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: .Lcfi29:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $15, %al
@@ -1322,7 +2140,7 @@ define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskz_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $15, %dil
; X64-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
@@ -1338,12 +2156,12 @@ define <4 x float> @test_mm_maskz_shuffle_ps(i8 %a0, <4 x float> %a1, <4 x float
define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: test_mm256_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT: retq
%res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
@@ -1352,14 +2170,14 @@ define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) {
define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_mm256_mask_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4]
; X64-NEXT: retq
@@ -1371,14 +2189,14 @@ define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x flo
define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) {
; X32-LABEL: test_mm256_maskz_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: kmovw %eax, %k1
; X32-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4]
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index edcc3933bc39..00935257baab 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -1,11 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastd %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+
+declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <2 x i64> %res, %res1
+ %res4 = add <2 x i64> %res2, %res3
+ ret <2 x i64> %res4
+}
+
+
+ declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
+
+ define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastd %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+ }
+
+ declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
+
+ define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xcf]
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask)
+ %res3 = add <4 x i64> %res, %res1
+ %res4 = add <4 x i64> %res2, %res3
+ ret <4 x i64> %res4
+ }
+
+
+
declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8]
; CHECK-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0]
@@ -26,7 +110,7 @@ declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8]
@@ -46,7 +130,7 @@ declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8]
@@ -66,7 +150,7 @@ declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8]
@@ -86,7 +170,7 @@ declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4
define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8]
@@ -106,7 +190,7 @@ declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x
define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8]
@@ -126,7 +210,7 @@ declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x
define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) {
; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8]
@@ -146,7 +230,7 @@ declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsldup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -169,7 +253,7 @@ declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovsldup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x12,0xd0]
; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -192,7 +276,7 @@ declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovshdup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x16,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -215,7 +299,7 @@ declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovshdup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x16,0xd0]
; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -237,7 +321,7 @@ declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovddup %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0,0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -260,7 +344,7 @@ declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovddup %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0]
; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -283,7 +367,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4
define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06]
; CHECK-NEXT: ## ymm2 = ymm0[0,1,3,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -306,7 +390,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2
define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01]
; CHECK-NEXT: ## xmm2 = xmm0[1,0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -329,7 +413,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x
define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps $22, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16]
; CHECK-NEXT: ## ymm2 = ymm0[2,1,1,0,6,5,5,4]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -352,7 +436,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x
define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps $22, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16]
; CHECK-NEXT: ## xmm2 = xmm0[2,1,1,0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -375,7 +459,7 @@ declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x d
define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermpd $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03]
; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
@@ -398,7 +482,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03]
; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
@@ -421,7 +505,7 @@ declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8)
define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovapd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x29,0x07]
; CHECK-NEXT: vmovapd %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x29,0x06]
@@ -435,7 +519,7 @@ declare void @llvm.x86.avx512.mask.store.pd.256(i8*, <4 x double>, i8)
define void@test_int_x86_avx512_mask_store_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovapd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x29,0x07]
; CHECK-NEXT: vmovapd %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x29,0x06]
@@ -449,7 +533,7 @@ declare void @llvm.x86.avx512.mask.storeu.pd.128(i8*, <2 x double>, i8)
define void@test_int_x86_avx512_mask_storeu_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovupd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x11,0x07]
; CHECK-NEXT: vmovupd %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x11,0x06]
@@ -463,7 +547,7 @@ declare void @llvm.x86.avx512.mask.storeu.pd.256(i8*, <4 x double>, i8)
define void@test_int_x86_avx512_mask_storeu_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovupd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x11,0x07]
; CHECK-NEXT: vmovupd %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x11,0x06]
@@ -477,7 +561,7 @@ declare void @llvm.x86.avx512.mask.store.ps.128(i8*, <4 x float>, i8)
define void@test_int_x86_avx512_mask_store_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovaps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x29,0x07]
; CHECK-NEXT: vmovaps %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x06]
@@ -491,7 +575,7 @@ declare void @llvm.x86.avx512.mask.store.ps.256(i8*, <8 x float>, i8)
define void@test_int_x86_avx512_mask_store_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovaps %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x29,0x07]
; CHECK-NEXT: vmovaps %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x06]
@@ -505,7 +589,7 @@ declare void @llvm.x86.avx512.mask.storeu.ps.128(i8*, <4 x float>, i8)
define void@test_int_x86_avx512_mask_storeu_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovups %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x11,0x07]
; CHECK-NEXT: vmovups %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x06]
@@ -519,7 +603,7 @@ declare void @llvm.x86.avx512.mask.storeu.ps.256(i8*, <8 x float>, i8)
define void@test_int_x86_avx512_mask_storeu_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovups %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x11,0x07]
; CHECK-NEXT: vmovups %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06]
@@ -533,7 +617,7 @@ declare void @llvm.x86.avx512.mask.storeu.q.128(i8*, <2 x i64>, i8)
define void@test_int_x86_avx512_mask_storeu_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7f,0x07]
; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06]
@@ -547,7 +631,7 @@ declare void @llvm.x86.avx512.mask.storeu.q.256(i8*, <4 x i64>, i8)
define void@test_int_x86_avx512_mask_storeu_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
@@ -561,7 +645,7 @@ declare void @llvm.x86.avx512.mask.storeu.d.128(i8*, <4 x i32>, i8)
define void@test_int_x86_avx512_mask_storeu_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7f,0x07]
; CHECK-NEXT: vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06]
@@ -575,7 +659,7 @@ declare void @llvm.x86.avx512.mask.storeu.d.256(i8*, <8 x i32>, i8)
define void@test_int_x86_avx512_mask_storeu_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
@@ -589,7 +673,7 @@ declare void @llvm.x86.avx512.mask.store.q.128(i8*, <2 x i64>, i8)
define void@test_int_x86_avx512_mask_store_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqa64 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7f,0x07]
; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x06]
@@ -603,7 +687,7 @@ declare void @llvm.x86.avx512.mask.store.q.256(i8*, <4 x i64>, i8)
define void@test_int_x86_avx512_mask_store_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqa64 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqa %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x06]
@@ -617,7 +701,7 @@ declare void @llvm.x86.avx512.mask.store.d.128(i8*, <4 x i32>, i8)
define void@test_int_x86_avx512_mask_store_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7f,0x07]
; CHECK-NEXT: vmovdqa %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x06]
@@ -631,7 +715,7 @@ declare void @llvm.x86.avx512.mask.store.d.256(i8*, <8 x i32>, i8)
define void@test_int_x86_avx512_mask_store_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_store_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqa32 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqa %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x7f,0x06]
@@ -643,7 +727,7 @@ define void@test_int_x86_avx512_mask_store_d_256(i8* %ptr1, i8* %ptr2, <8 x i32>
define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07]
@@ -661,7 +745,7 @@ declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07]
@@ -679,7 +763,7 @@ declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
@@ -697,7 +781,7 @@ declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovupd (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x10,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
@@ -715,7 +799,7 @@ declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
@@ -733,7 +817,7 @@ declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
@@ -751,7 +835,7 @@ declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovapd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
@@ -769,7 +853,7 @@ declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovupd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x10,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
@@ -789,7 +873,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8*, <4 x i32>, i8)
define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x06]
@@ -807,7 +891,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8*, <8 x i32>, i8)
define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x06]
@@ -825,7 +909,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8*, <2 x i64>, i8)
define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x06]
@@ -843,7 +927,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8*, <4 x i64>, i8)
define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) {
; CHECK-LABEL: test_mask_load_unaligned_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
; CHECK-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x06]
@@ -861,7 +945,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.load.d.128(i8*, <4 x i32>, i8)
define <4 x i32> @test_mask_load_aligned_d_128(<4 x i32> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
@@ -879,7 +963,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.load.d.256(i8*, <8 x i32>, i8)
define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
@@ -897,7 +981,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.load.q.128(i8*, <2 x i64>, i8)
define <2 x i64> @test_mask_load_aligned_q_128(<2 x i64> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
@@ -915,7 +999,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.load.q.256(i8*, <4 x i64>, i8)
define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %mask) {
; CHECK-LABEL: test_mask_load_aligned_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0x07]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
@@ -933,7 +1017,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i32, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufd $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03]
; CHECK-NEXT: ## xmm2 = xmm0[3,0,0,0]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
@@ -956,7 +1040,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i32, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpshufd $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03]
; CHECK-NEXT: ## ymm2 = ymm0[3,0,0,0,7,4,4,4]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
@@ -977,10 +1061,10 @@ define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8
define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
ret i8 %res
@@ -988,11 +1072,11 @@ define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
ret i8 %res
@@ -1002,10 +1086,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32>, <8 x i32>, i8)
define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
ret i8 %res
@@ -1013,11 +1097,11 @@ define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
ret i8 %res
@@ -1027,10 +1111,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64>, <4 x i64>, i8)
define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
ret i8 %res
@@ -1038,11 +1122,11 @@ define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
ret i8 %res
@@ -1052,10 +1136,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32>, <8 x i32>, i8)
define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x37,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
ret i8 %res
@@ -1063,11 +1147,11 @@ define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
ret i8 %res
@@ -1077,10 +1161,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpeq_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
ret i8 %res
@@ -1088,11 +1172,11 @@ define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
ret i8 %res
@@ -1102,10 +1186,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32>, <4 x i32>, i8)
define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpeq_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
ret i8 %res
@@ -1113,11 +1197,11 @@ define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
ret i8 %res
@@ -1127,10 +1211,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64>, <2 x i64>, i8)
define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_pcmpgt_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x66,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
ret i8 %res
@@ -1138,11 +1222,11 @@ define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
ret i8 %res
@@ -1152,10 +1236,10 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32>, <4 x i32>, i8)
define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_pcmpgt_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x37,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
ret i8 %res
@@ -1163,11 +1247,11 @@ define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xc1]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
ret i8 %res
@@ -1179,7 +1263,7 @@ declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x doub
define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1197,7 +1281,7 @@ declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x doub
define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1215,7 +1299,7 @@ declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>
define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1233,7 +1317,7 @@ declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1251,7 +1335,7 @@ declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x doub
define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1269,7 +1353,7 @@ declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x doub
define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1287,7 +1371,7 @@ declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>
define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1305,7 +1389,7 @@ declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1323,7 +1407,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1341,7 +1425,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1359,7 +1443,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1377,7 +1461,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1395,7 +1479,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[1]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1413,7 +1497,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9]
; CHECK-NEXT: ## xmm3 = xmm0[0],xmm1[0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1431,7 +1515,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1449,7 +1533,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9]
; CHECK-NEXT: ## ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -1465,7 +1549,7 @@ define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i6
define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_and_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -1474,7 +1558,7 @@ define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1485,7 +1569,7 @@ define <4 x i32> @test_mask_and_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i
define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1495,7 +1579,7 @@ define <4 x i32> @test_mask_and_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %m
define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_and_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpand (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -1505,7 +1589,7 @@ define <4 x i32> @test_mask_and_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdb,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1517,7 +1601,7 @@ define <4 x i32> @test_mask_and_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <
define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1528,7 +1612,7 @@ define <4 x i32> @test_mask_and_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b,
define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_and_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -1540,7 +1624,7 @@ define <4 x i32> @test_mask_and_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdb,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1554,7 +1638,7 @@ define <4 x i32> @test_mask_and_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
define <4 x i32> @test_mask_and_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1569,7 +1653,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pand.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_and_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -1578,7 +1662,7 @@ define <8 x i32> @test_mask_and_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1589,7 +1673,7 @@ define <8 x i32> @test_mask_and_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i
define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1599,7 +1683,7 @@ define <8 x i32> @test_mask_and_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %m
define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_and_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpand (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -1609,7 +1693,7 @@ define <8 x i32> @test_mask_and_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdb,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1621,7 +1705,7 @@ define <8 x i32> @test_mask_and_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <
define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1632,7 +1716,7 @@ define <8 x i32> @test_mask_and_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b,
define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_and_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -1644,7 +1728,7 @@ define <8 x i32> @test_mask_and_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdb,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1658,7 +1742,7 @@ define <8 x i32> @test_mask_and_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
define <8 x i32> @test_mask_and_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_and_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1673,7 +1757,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pand.d.256(<8 x i32>, <8 x i32>, <8 x i3
define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_or_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -1682,7 +1766,7 @@ define <4 x i32> @test_mask_or_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1693,7 +1777,7 @@ define <4 x i32> @test_mask_or_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1703,7 +1787,7 @@ define <4 x i32> @test_mask_or_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %ma
define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_or_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpor (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -1713,7 +1797,7 @@ define <4 x i32> @test_mask_or_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xeb,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1725,7 +1809,7 @@ define <4 x i32> @test_mask_or_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4
define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1736,7 +1820,7 @@ define <4 x i32> @test_mask_or_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i
define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_or_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -1748,7 +1832,7 @@ define <4 x i32> @test_mask_or_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xeb,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1762,7 +1846,7 @@ define <4 x i32> @test_mask_or_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i3
define <4 x i32> @test_mask_or_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1777,7 +1861,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.por.d.128(<4 x i32>, <4 x i32>, <4 x i32
define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_or_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -1786,7 +1870,7 @@ define <8 x i32> @test_mask_or_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -1797,7 +1881,7 @@ define <8 x i32> @test_mask_or_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i3
define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1807,7 +1891,7 @@ define <8 x i32> @test_mask_or_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %ma
define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_or_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpor (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -1817,7 +1901,7 @@ define <8 x i32> @test_mask_or_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xeb,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1829,7 +1913,7 @@ define <8 x i32> @test_mask_or_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8
define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1840,7 +1924,7 @@ define <8 x i32> @test_mask_or_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i
define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_or_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -1852,7 +1936,7 @@ define <8 x i32> @test_mask_or_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xeb,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -1866,7 +1950,7 @@ define <8 x i32> @test_mask_or_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i3
define <8 x i32> @test_mask_or_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_or_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xeb,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1881,7 +1965,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.por.d.256(<8 x i32>, <8 x i32>, <8 x i32
define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_xor_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -1890,7 +1974,7 @@ define <4 x i32> @test_mask_xor_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -1901,7 +1985,7 @@ define <4 x i32> @test_mask_xor_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i
define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1911,7 +1995,7 @@ define <4 x i32> @test_mask_xor_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %m
define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_xor_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -1921,7 +2005,7 @@ define <4 x i32> @test_mask_xor_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xef,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1933,7 +2017,7 @@ define <4 x i32> @test_mask_xor_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <
define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1944,7 +2028,7 @@ define <4 x i32> @test_mask_xor_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b,
define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_xor_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -1956,7 +2040,7 @@ define <4 x i32> @test_mask_xor_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xef,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -1970,7 +2054,7 @@ define <4 x i32> @test_mask_xor_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -1985,7 +2069,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pxor.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_xor_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -1994,7 +2078,7 @@ define <8 x i32> @test_mask_xor_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2005,7 +2089,7 @@ define <8 x i32> @test_mask_xor_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i
define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2015,7 +2099,7 @@ define <8 x i32> @test_mask_xor_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %m
define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_xor_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -2025,7 +2109,7 @@ define <8 x i32> @test_mask_xor_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xef,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2037,7 +2121,7 @@ define <8 x i32> @test_mask_xor_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <
define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2048,7 +2132,7 @@ define <8 x i32> @test_mask_xor_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b,
define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_xor_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2060,7 +2144,7 @@ define <8 x i32> @test_mask_xor_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xef,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2074,7 +2158,7 @@ define <8 x i32> @test_mask_xor_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
define <8 x i32> @test_mask_xor_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_xor_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpxord (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xef,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2089,7 +2173,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i3
define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_andnot_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -2098,7 +2182,7 @@ define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2109,7 +2193,7 @@ define <4 x i32> @test_mask_andnot_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4
define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2119,7 +2203,7 @@ define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8
define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -2129,7 +2213,7 @@ define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2141,7 +2225,7 @@ define <4 x i32> @test_mask_andnot_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b
define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2152,7 +2236,7 @@ define <4 x i32> @test_mask_andnot_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_
define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2164,7 +2248,7 @@ define <4 x i32> @test_mask_andnot_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2178,7 +2262,7 @@ define <4 x i32> @test_mask_andnot_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4
define <4 x i32> @test_mask_andnot_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2193,7 +2277,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i
define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_andnot_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -2202,7 +2286,7 @@ define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2213,7 +2297,7 @@ define <8 x i32> @test_mask_andnot_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8
define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2223,7 +2307,7 @@ define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8
define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -2233,7 +2317,7 @@ define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b)
define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2245,7 +2329,7 @@ define <8 x i32> @test_mask_andnot_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b
define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2256,7 +2340,7 @@ define <8 x i32> @test_mask_andnot_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_
define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2268,7 +2352,7 @@ define <8 x i32> @test_mask_andnot_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2282,7 +2366,7 @@ define <8 x i32> @test_mask_andnot_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8
define <8 x i32> @test_mask_andnot_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2297,7 +2381,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i
define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test_mask_andnot_epi64_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
@@ -2306,7 +2390,7 @@ define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2317,7 +2401,7 @@ define <2 x i64> @test_mask_andnot_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2
define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2327,7 +2411,7 @@ define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8
define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi64_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <2 x i64>, <2 x i64>* %ptr_b
@@ -2337,7 +2421,7 @@ define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b)
define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2349,7 +2433,7 @@ define <2 x i64> @test_mask_andnot_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b
define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2360,7 +2444,7 @@ define <2 x i64> @test_mask_andnot_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_
define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi64_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -2372,7 +2456,7 @@ define <2 x i64> @test_mask_andnot_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2386,7 +2470,7 @@ define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2
define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2401,7 +2485,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i
define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: test_mask_andnot_epi64_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
@@ -2410,7 +2494,7 @@ define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2421,7 +2505,7 @@ define <4 x i64> @test_mask_andnot_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4
define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2431,7 +2515,7 @@ define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8
define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi64_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i64>, <4 x i64>* %ptr_b
@@ -2441,7 +2525,7 @@ define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b)
define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2453,7 +2537,7 @@ define <4 x i64> @test_mask_andnot_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b
define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2464,7 +2548,7 @@ define <4 x i64> @test_mask_andnot_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_
define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_andnot_epi64_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -2476,7 +2560,7 @@ define <4 x i64> @test_mask_andnot_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2490,7 +2574,7 @@ define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4
define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_andnot_epi64_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpandnq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2505,7 +2589,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i32> @test_mask_add_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -2514,7 +2598,7 @@ define <4 x i32> @test_mask_add_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test_mask_add_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2525,7 +2609,7 @@ define <4 x i32> @test_mask_add_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i
define <4 x i32> @test_mask_add_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2535,7 +2619,7 @@ define <4 x i32> @test_mask_add_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %m
define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -2545,7 +2629,7 @@ define <4 x i32> @test_mask_add_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfe,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2557,7 +2641,7 @@ define <4 x i32> @test_mask_add_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <
define <4 x i32> @test_mask_add_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2568,7 +2652,7 @@ define <4 x i32> @test_mask_add_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b,
define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2580,7 +2664,7 @@ define <4 x i32> @test_mask_add_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfe,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2594,7 +2678,7 @@ define <4 x i32> @test_mask_add_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
define <4 x i32> @test_mask_add_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2609,7 +2693,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.padd.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32> @test_mask_sub_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test_mask_sub_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -2618,7 +2702,7 @@ define <4 x i32> @test_mask_sub_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test_mask_sub_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -2629,7 +2713,7 @@ define <4 x i32> @test_mask_sub_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <4 x i
define <4 x i32> @test_mask_sub_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2639,7 +2723,7 @@ define <4 x i32> @test_mask_sub_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %m
define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <4 x i32>, <4 x i32>* %ptr_b
@@ -2649,7 +2733,7 @@ define <4 x i32> @test_mask_sub_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfa,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2661,7 +2745,7 @@ define <4 x i32> @test_mask_sub_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <
define <4 x i32> @test_mask_sub_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2672,7 +2756,7 @@ define <4 x i32> @test_mask_sub_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b,
define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2684,7 +2768,7 @@ define <4 x i32> @test_mask_sub_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0xfa,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -2698,7 +2782,7 @@ define <4 x i32> @test_mask_sub_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <4 x i
define <4 x i32> @test_mask_sub_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2713,7 +2797,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psub.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <8 x i32> @test_mask_sub_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_sub_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -2722,7 +2806,7 @@ define <8 x i32> @test_mask_sub_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test_mask_sub_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2733,7 +2817,7 @@ define <8 x i32> @test_mask_sub_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i
define <8 x i32> @test_mask_sub_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsubd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2743,7 +2827,7 @@ define <8 x i32> @test_mask_sub_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %m
define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -2753,7 +2837,7 @@ define <8 x i32> @test_mask_sub_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfa,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2765,7 +2849,7 @@ define <8 x i32> @test_mask_sub_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <
define <8 x i32> @test_mask_sub_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2776,7 +2860,7 @@ define <8 x i32> @test_mask_sub_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b,
define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_sub_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2788,7 +2872,7 @@ define <8 x i32> @test_mask_sub_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfa,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2802,7 +2886,7 @@ define <8 x i32> @test_mask_sub_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
define <8 x i32> @test_mask_sub_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_sub_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsubd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfa,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2817,7 +2901,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psub.d.256(<8 x i32>, <8 x i32>, <8 x i3
define <8 x i32> @test_mask_add_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
; CHECK-LABEL: test_mask_add_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -2826,7 +2910,7 @@ define <8 x i32> @test_mask_add_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test_mask_add_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -2837,7 +2921,7 @@ define <8 x i32> @test_mask_add_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <8 x i
define <8 x i32> @test_mask_add_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2847,7 +2931,7 @@ define <8 x i32> @test_mask_add_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8 %m
define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load <8 x i32>, <8 x i32>* %ptr_b
@@ -2857,7 +2941,7 @@ define <8 x i32> @test_mask_add_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfe,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2869,7 +2953,7 @@ define <8 x i32> @test_mask_add_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <
define <8 x i32> @test_mask_add_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2880,7 +2964,7 @@ define <8 x i32> @test_mask_add_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b,
define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
; CHECK-LABEL: test_mask_add_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i32, i32* %ptr_b
@@ -2892,7 +2976,7 @@ define <8 x i32> @test_mask_add_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i32> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0xfe,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -2906,7 +2990,7 @@ define <8 x i32> @test_mask_add_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <8 x i
define <8 x i32> @test_mask_add_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_add_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpaddd (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0xfe,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2921,7 +3005,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.padd.d.256(<8 x i32>, <8 x i32>, <8 x i3
define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2931,7 +3015,7 @@ define <8 x float> @test_mm512_maskz_add_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_add_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x58,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -2942,7 +3026,7 @@ define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_add_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_add_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
@@ -2952,7 +3036,7 @@ declare <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float>, <8 x float>, <
define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_add_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2962,7 +3046,7 @@ define <4 x float> @test_mm512_maskz_add_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_add_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x58,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -2973,7 +3057,7 @@ define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_add_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_add_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
@@ -2983,7 +3067,7 @@ declare <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float>, <4 x float>, <
define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_sub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -2993,7 +3077,7 @@ define <8 x float> @test_mm512_maskz_sub_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5c,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -3004,7 +3088,7 @@ define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_sub_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_sub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
@@ -3014,7 +3098,7 @@ declare <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float>, <8 x float>, <
define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_sub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3024,7 +3108,7 @@ define <4 x float> @test_mm512_maskz_sub_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_sub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5c,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -3035,7 +3119,7 @@ define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_sub_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_sub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vsubps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
@@ -3045,7 +3129,7 @@ declare <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float>, <4 x float>, <
define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_mul_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x59,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3055,7 +3139,7 @@ define <8 x float> @test_mm512_maskz_mul_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_mul_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x59,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -3066,7 +3150,7 @@ define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_mul_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_mul_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmulps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x59,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
@@ -3076,7 +3160,7 @@ declare <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float>, <8 x float>, <
define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_mul_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x59,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3086,7 +3170,7 @@ define <4 x float> @test_mm512_maskz_mul_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_mul_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x59,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -3097,7 +3181,7 @@ define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_mul_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_mul_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmulps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
@@ -3107,7 +3191,7 @@ declare <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float>, <4 x float>, <
define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3117,7 +3201,7 @@ define <8 x float> @test_mm512_maskz_div_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_div_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5e,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -3128,7 +3212,7 @@ define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_div_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_div_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
@@ -3138,7 +3222,7 @@ declare <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float>, <8 x float>, <
define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_div_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3148,7 +3232,7 @@ define <4 x float> @test_mm512_maskz_div_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_div_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5e,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -3159,7 +3243,7 @@ define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_div_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vdivps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
@@ -3167,11 +3251,87 @@ define <4 x float> @test_mm512_div_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %
}
declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float>, i32, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovaps %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0]
+; CHECK-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8]
+; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
+; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovapd %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0xd0]
+; CHECK-NEXT: vmovapd %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc8]
+; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
+; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
+ %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xd0]
+; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
+; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0]
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xd0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
declare <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8)
define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01]
; CHECK-NEXT: ## xmm3 = xmm0[1],xmm1[0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -3194,7 +3354,7 @@ declare <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06]
; CHECK-NEXT: ## ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -3212,7 +3372,7 @@ declare <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16]
; CHECK-NEXT: ## xmm3 = xmm0[2,1],xmm1[1,0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -3230,7 +3390,7 @@ declare <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16]
; CHECK-NEXT: ## ymm3 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -3248,7 +3408,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1]
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1]
@@ -3264,7 +3424,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1]
@@ -3280,7 +3440,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1]
@@ -3296,7 +3456,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1]
; CHECK-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1]
@@ -3312,7 +3472,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2,i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1]
@@ -3328,7 +3488,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1]
@@ -3344,7 +3504,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1]
@@ -3360,7 +3520,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1]
; CHECK-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1]
@@ -3376,7 +3536,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1]
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1]
@@ -3392,7 +3552,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1]
@@ -3408,7 +3568,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x39,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1]
@@ -3424,7 +3584,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1]
; CHECK-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1]
@@ -3440,7 +3600,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1]
; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1]
@@ -3456,7 +3616,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1]
@@ -3472,7 +3632,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1]
@@ -3488,7 +3648,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1]
; CHECK-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1]
@@ -3504,7 +3664,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i6
define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1]
@@ -3524,7 +3684,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1]
@@ -3544,7 +3704,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1]
@@ -3564,7 +3724,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1]
@@ -3584,7 +3744,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psra.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrad %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe2,0xd1]
@@ -3604,7 +3764,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psra.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrad %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe2,0xd1]
@@ -3624,7 +3784,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psll.d.128(<4 x i32>, <4 x i32>, <4 x i3
define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpslld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf2,0xd1]
@@ -3644,7 +3804,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psll.d.256(<8 x i32>, <4 x i32>, <8 x i3
define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpslld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf2,0xd1]
@@ -3664,7 +3824,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psll.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf3,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsllq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf3,0xd1]
@@ -3684,7 +3844,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64>, i32, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03]
@@ -3704,7 +3864,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64>, i32, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03]
@@ -3724,7 +3884,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32>, i32, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03]
@@ -3744,7 +3904,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32>, i32, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03]
@@ -3764,7 +3924,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psll.di.128(<4 x i32>, i32, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xf0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpslld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xf0,0x03]
@@ -3784,7 +3944,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psll.di.256(<8 x i32>, i32, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xf0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpslld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xf0,0x03]
@@ -3804,7 +3964,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrlv2.di(<2 x i64>, <2 x i64>, <2 x i64
define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x45,0xd1]
@@ -3824,7 +3984,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrlv4.di(<4 x i64>, <4 x i64>, <4 x i64
define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x45,0xd1]
@@ -3844,7 +4004,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrlv4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x45,0xd1]
@@ -3864,7 +4024,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrlv8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x45,0xd1]
@@ -3884,7 +4044,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x46,0xd1]
@@ -3904,7 +4064,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x46,0xd1]
@@ -3922,12 +4082,12 @@ define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1
define <8 x i32>@test_int_x86_avx512_mask_psrav8_si_const() {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si_const:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; CHECK-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI276_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI284_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; CHECK-NEXT: ## fixup A - offset: 5, value: LCPI276_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ## fixup A - offset: 5, value: LCPI284_1-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.mask.psrav8.si(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>, <8 x i32> zeroinitializer, i8 -1)
ret <8 x i32> %res
@@ -3937,7 +4097,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psllv2.di(<2 x i64>, <2 x i64>, <2 x i64
define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsllvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x47,0xd1]
@@ -3957,7 +4117,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psllv4.di(<4 x i64>, <4 x i64>, <4 x i64
define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsllvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x47,0xd1]
@@ -3977,7 +4137,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.psllv4.si(<4 x i32>, <4 x i32>, <4 x i32
define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x47,0xd1]
@@ -3997,7 +4157,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.psllv8.si(<8 x i32>, <8 x i32>, <8 x i32
define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x47,0xd1]
@@ -4017,7 +4177,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4040,7 +4200,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0]
; CHECK-NEXT: ## ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4063,7 +4223,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4086,7 +4246,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxbq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0]
; CHECK-NEXT: ## ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4109,7 +4269,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4132,7 +4292,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0]
; CHECK-NEXT: ## ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4155,7 +4315,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4178,7 +4338,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0]
; CHECK-NEXT: ## ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4201,7 +4361,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0]
; CHECK-NEXT: ## xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4224,7 +4384,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovzxwq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0]
; CHECK-NEXT: ## ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4247,7 +4407,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8]
@@ -4267,7 +4427,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8]
@@ -4287,7 +4447,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8]
@@ -4307,7 +4467,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxbq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8]
@@ -4327,7 +4487,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8]
@@ -4347,7 +4507,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8]
@@ -4367,7 +4527,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16>, <2 x i64>, i8)
define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8]
@@ -4387,7 +4547,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8)
define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmovsxwq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8]
@@ -4407,7 +4567,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i6
define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf1,0xfd,0x08,0xe2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe2,0xd1]
@@ -4427,7 +4587,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psra.q.256(<4 x i64>, <2 x i64>, <4 x i6
define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm3 ## encoding: [0x62,0xf1,0xfd,0x28,0xe2,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe2,0xd1]
@@ -4447,7 +4607,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psra.qi.128(<2 x i64>, i32, <2 x i64>, i
define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $3, %xmm0, %xmm2 ## encoding: [0x62,0xf1,0xed,0x08,0x72,0xe0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsraq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xe0,0x03]
@@ -4467,7 +4627,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psra.qi.256(<4 x i64>, i32, <4 x i64>, i
define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $3, %ymm0, %ymm2 ## encoding: [0x62,0xf1,0xed,0x28,0x72,0xe0,0x03]
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpsraq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xe0,0x03]
@@ -4487,7 +4647,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x46,0xd1]
@@ -4505,12 +4665,12 @@ define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %
define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128_const(i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128_const:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,18446744073709551607]
; CHECK-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI304_0-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ## fixup A - offset: 4, value: LCPI312_0-4, kind: reloc_riprel_4byte
; CHECK-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0x05,A,A,A,A]
-; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI304_1-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ## fixup A - offset: 6, value: LCPI312_1-4, kind: reloc_riprel_4byte
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.psrav.q.128(<2 x i64> <i64 2, i64 -9>, <2 x i64> <i64 1, i64 90>, <2 x i64> zeroinitializer, i8 -1)
ret <2 x i64> %res
@@ -4520,7 +4680,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.psrav.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x46,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x46,0xd1]
@@ -4540,7 +4700,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>,
define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8]
@@ -4556,7 +4716,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32>, <4 x double>,
define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8]
@@ -4572,7 +4732,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32>, <2 x double>
define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm2 ## encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8]
@@ -4588,7 +4748,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32>, <4 x double>
define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm2 ## encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xd0]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8]
@@ -4604,9 +4764,9 @@ declare <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32>, <4 x i32>, i32,
define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x03,0xd9,0x02]
-; CHECK-NEXT: ## xmm3 = xmm1[2,3],xmm0[0,1]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08]
+; CHECK-NEXT: ## xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1]
@@ -4627,9 +4787,9 @@ declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32,
define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x03,0xd9,0x06]
-; CHECK-NEXT: ## ymm3 = ymm1[6,7],ymm0[0,1,2,3,4,5]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
+; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06]
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5]
@@ -4645,9 +4805,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64>, <2 x i64>, i32,
define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignq $1, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0xfd,0x08,0x03,0xd9,0x01]
-; CHECK-NEXT: ## xmm3 = xmm1[1],xmm0[0]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08]
+; CHECK-NEXT: ## xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01]
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[1],xmm0[0]
@@ -4663,7 +4823,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64>, <4 x i64>, i32,
define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03]
; CHECK-NEXT: ## ymm3 = ymm1[3],ymm0[0,1,2]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
@@ -4681,7 +4841,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1]
@@ -4701,7 +4861,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1]
@@ -4721,7 +4881,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i3
define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1]
@@ -4741,7 +4901,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i3
define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1]
@@ -4761,7 +4921,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4
define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x4_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01]
@@ -4781,7 +4941,7 @@ declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x floa
define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01]
@@ -4801,7 +4961,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i3
define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01]
@@ -4820,7 +4980,7 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4830,7 +4990,7 @@ define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_max_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -4841,7 +5001,7 @@ define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_max_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
@@ -4851,7 +5011,7 @@ declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <
define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4861,7 +5021,7 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_max_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -4872,7 +5032,7 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_max_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
@@ -4882,7 +5042,7 @@ declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <
define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4892,7 +5052,7 @@ define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_min_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -4903,7 +5063,7 @@ define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_min_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
@@ -4913,7 +5073,7 @@ declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <
define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4923,7 +5083,7 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_min_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -4934,7 +5094,7 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_min_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
@@ -4944,7 +5104,7 @@ declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <
define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 ## encoding: [0x62,0xf1,0x75,0x28,0x66,0xc8]
; CHECK-NEXT: vpcmpled %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x02]
@@ -4989,7 +5149,7 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x66,0xd0]
@@ -5037,7 +5197,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwi
define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x01]
; CHECK-NEXT: vpcmpleud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x02]
@@ -5082,7 +5242,7 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
; CHECK-NEXT: vpcmpltud %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01]
@@ -5130,7 +5290,7 @@ declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounw
define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x28,0x37,0xc8]
; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02]
@@ -5174,32 +5334,32 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k6 ## encoding: [0xc5,0xf8,0x92,0xf7]
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k6} ## encoding: [0x62,0xf2,0xfd,0x2e,0x29,0xc1]
-; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k7 {%k6} ## encoding: [0x62,0xf2,0xf5,0x2e,0x37,0xf8]
-; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k1 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1f,0xc9,0x02]
-; CHECK-NEXT: kxorw %k0, %k0, %k2 ## encoding: [0xc5,0xfc,0x47,0xd0]
-; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k3 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1f,0xd9,0x04]
-; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k4 {%k6} ## encoding: [0x62,0xf3,0xf5,0x2e,0x1f,0xe0,0x02]
-; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k5 {%k6} ## encoding: [0x62,0xf2,0xfd,0x2e,0x37,0xe9]
-; CHECK-NEXT: kshiftlw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x32,0xf6,0x0c]
-; CHECK-NEXT: kshiftrw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x30,0xf6,0x0c]
-; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
+; CHECK-NEXT: vpcmpgtq %ymm0, %ymm1, %k3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x37,0xd8]
+; CHECK-NEXT: vpcmpleq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd1,0x02]
+; CHECK-NEXT: kxorw %k0, %k0, %k4 ## encoding: [0xc5,0xfc,0x47,0xe0]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpleq %ymm0, %ymm1, %k6 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x1f,0xf0,0x02]
+; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x37,0xf9]
+; CHECK-NEXT: kshiftlw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
@@ -5225,7 +5385,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwi
define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x29,0xc1]
; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01]
; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02]
@@ -5269,32 +5429,32 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k6 ## encoding: [0xc5,0xf8,0x92,0xf7]
-; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k6} ## encoding: [0x62,0xf2,0xfd,0x2e,0x29,0xc1]
-; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k7 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1e,0xf9,0x01]
-; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k1 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1e,0xc9,0x02]
-; CHECK-NEXT: kxorw %k0, %k0, %k2 ## encoding: [0xc5,0xfc,0x47,0xd0]
-; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k3 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1f,0xd9,0x04]
-; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k4 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1e,0xe1,0x05]
-; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k5 {%k6} ## encoding: [0x62,0xf3,0xfd,0x2e,0x1e,0xe9,0x06]
-; CHECK-NEXT: kshiftlw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x32,0xf6,0x0c]
-; CHECK-NEXT: kshiftrw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x30,0xf6,0x0c]
-; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x29,0xc1]
+; CHECK-NEXT: vpcmpltuq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x01]
+; CHECK-NEXT: vpcmpleuq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x02]
+; CHECK-NEXT: kxorw %k0, %k0, %k4 ## encoding: [0xc5,0xfc,0x47,0xe0]
+; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT: kshiftlw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
@@ -5320,7 +5480,7 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounw
define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_cmp_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x08,0x66,0xc8]
; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02]
@@ -5364,32 +5524,32 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k6 ## encoding: [0xc5,0xf8,0x92,0xf7]
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k6} ## encoding: [0x62,0xf1,0x7d,0x0e,0x76,0xc1]
-; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k7 {%k6} ## encoding: [0x62,0xf1,0x75,0x0e,0x66,0xf8]
-; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k1 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1f,0xc9,0x02]
-; CHECK-NEXT: kxorw %k0, %k0, %k2 ## encoding: [0xc5,0xfc,0x47,0xd0]
-; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k3 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1f,0xd9,0x04]
-; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k4 {%k6} ## encoding: [0x62,0xf3,0x75,0x0e,0x1f,0xe0,0x02]
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k5 {%k6} ## encoding: [0x62,0xf1,0x7d,0x0e,0x66,0xe9]
-; CHECK-NEXT: kshiftlw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x32,0xf6,0x0c]
-; CHECK-NEXT: kshiftrw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x30,0xf6,0x0c]
-; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
+; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k3 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x66,0xd8]
+; CHECK-NEXT: vpcmpled %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd1,0x02]
+; CHECK-NEXT: kxorw %k0, %k0, %k4 ## encoding: [0xc5,0xfc,0x47,0xe0]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpled %xmm0, %xmm1, %k6 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x1f,0xf0,0x02]
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x66,0xf9]
+; CHECK-NEXT: kshiftlw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
@@ -5415,7 +5575,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwi
define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_ucmp_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x76,0xc1]
; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01]
; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02]
@@ -5459,32 +5619,32 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k6 ## encoding: [0xc5,0xf8,0x92,0xf7]
-; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k6} ## encoding: [0x62,0xf1,0x7d,0x0e,0x76,0xc1]
-; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k7 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1e,0xf9,0x01]
-; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k1 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1e,0xc9,0x02]
-; CHECK-NEXT: kxorw %k0, %k0, %k2 ## encoding: [0xc5,0xfc,0x47,0xd0]
-; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k3 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1f,0xd9,0x04]
-; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k4 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1e,0xe1,0x05]
-; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k5 {%k6} ## encoding: [0x62,0xf3,0x7d,0x0e,0x1e,0xe9,0x06]
-; CHECK-NEXT: kshiftlw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x32,0xf6,0x0c]
-; CHECK-NEXT: kshiftrw $12, %k6, %k6 ## encoding: [0xc4,0xe3,0xf9,0x30,0xf6,0x0c]
-; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x76,0xc1]
+; CHECK-NEXT: vpcmpltud %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x01]
+; CHECK-NEXT: vpcmpleud %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x02]
+; CHECK-NEXT: kxorw %k0, %k0, %k4 ## encoding: [0xc5,0xfc,0x47,0xe0]
+; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltud %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT: kshiftlw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0c]
+; CHECK-NEXT: kshiftrw $12, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0c]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
@@ -5510,7 +5670,7 @@ declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounw
define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_cmp_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x08,0x37,0xc8]
; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02]
@@ -5554,34 +5714,32 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_q_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k7} ## encoding: [0x62,0xf2,0xfd,0x0f,0x29,0xc1]
-; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k6 {%k7} ## encoding: [0x62,0xf2,0xf5,0x0f,0x37,0xf0]
-; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k1 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xc9,0x02]
-; CHECK-NEXT: kxorw %k0, %k0, %k2 ## encoding: [0xc5,0xfc,0x47,0xd0]
-; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k3 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xd9,0x04]
-; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k4 {%k7} ## encoding: [0x62,0xf3,0xf5,0x0f,0x1f,0xe0,0x02]
-; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k5 {%k7} ## encoding: [0x62,0xf2,0xfd,0x0f,0x37,0xe9]
-; CHECK-NEXT: kshiftlw $14, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x32,0xff,0x0e]
-; CHECK-NEXT: kshiftrw $14, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x30,0xff,0x0e]
-; CHECK-NEXT: kshiftlw $12, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x32,0xff,0x0c]
-; CHECK-NEXT: kshiftrw $12, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x30,0xff,0x0c]
-; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
+; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k3 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0x37,0xd8]
+; CHECK-NEXT: vpcmpleq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd1,0x02]
+; CHECK-NEXT: kxorw %k0, %k0, %k4 ## encoding: [0xc5,0xfc,0x47,0xe0]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpleq %xmm0, %xmm1, %k6 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x1f,0xf0,0x02]
+; CHECK-NEXT: vpcmpgtq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x37,0xf9]
+; CHECK-NEXT: kshiftlw $14, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
@@ -5607,7 +5765,7 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwi
define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_ucmp_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x29,0xc1]
; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01]
; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02]
@@ -5651,34 +5809,32 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_q_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k7} ## encoding: [0x62,0xf2,0xfd,0x0f,0x29,0xc1]
-; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k6 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xf1,0x01]
-; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k1 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xc9,0x02]
-; CHECK-NEXT: kxorw %k0, %k0, %k2 ## encoding: [0xc5,0xfc,0x47,0xd0]
-; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k3 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xd9,0x04]
-; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k4 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xe1,0x05]
-; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k5 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xe9,0x06]
-; CHECK-NEXT: kshiftlw $14, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x32,0xff,0x0e]
-; CHECK-NEXT: kshiftrw $14, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x30,0xff,0x0e]
-; CHECK-NEXT: kshiftlw $12, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x32,0xff,0x0c]
-; CHECK-NEXT: kshiftrw $12, %k7, %k7 ## encoding: [0xc4,0xe3,0xf9,0x30,0xff,0x0c]
-; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x29,0xc1]
+; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x01]
+; CHECK-NEXT: vpcmpleuq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x02]
+; CHECK-NEXT: kxorw %k0, %k0, %k4 ## encoding: [0xc5,0xfc,0x47,0xe0]
+; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT: vpcmpnltuq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT: vpcmpnleuq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT: kshiftlw $14, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc9,0x0e]
+; CHECK-NEXT: kshiftrw $14, %k1, %k1 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc9,0x0e]
+; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
; CHECK-NEXT: vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
; CHECK-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
; CHECK-NEXT: kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
+; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
; CHECK-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT: kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
; CHECK-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT: kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT: kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
; CHECK-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT: kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
; CHECK-NEXT: kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT: kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
; CHECK-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
@@ -5701,3 +5857,286 @@ define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
}
declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
+
+declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01]
+; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9]
+; CHECK-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01]
+; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask)
+ %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x float> %res1, %res2
+ %res5 = fadd <8 x float> %res3, %res4
+ ret <8 x float> %res5
+}
+
+define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* %x0ptr, <8 x float> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vbroadcastf32x4 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1a,0x07]
+; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %x0 = load <4 x float>, <4 x float>* %x0ptr
+ %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask)
+ ret <8 x float> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xc8,0x01]
+; CHECK-NEXT: vpaddd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc9]
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01]
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask)
+ %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i32> %res1, %res2
+ %res5 = add <8 x i32> %res3, %res4
+ ret <8 x i32> %res5
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(<4 x i32>* %x0ptr, <8 x i32> %x2, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256_load:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT: vbroadcasti32x4 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x5a,0x07]
+; CHECK-NEXT: ## ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %x0 = load <4 x i32>, <4 x i32>* %x0ptr
+ %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask)
+ ret <8 x i32> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsq %xmm0, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x1f,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x1f,0xc8]
+; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
+ %res2 = add <2 x i64> %res, %res1
+ ret <2 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsq %ymm0, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x1f,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1f,0xc8]
+; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
+ %res2 = add <4 x i64> %res, %res1
+ ret <4 x i64> %res2
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1e,0xc8]
+; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
+ %res2 = add <4 x i32> %res, %res1
+ ret <4 x i32> %res2
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpabsd %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xd0]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vpabsd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1e,0xc8]
+; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
+ %res2 = add <8 x i32> %res, %res1
+ ret <8 x i32> %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
+
+define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8)
+
+define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
+
+define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
+
+define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestm_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32>, <8 x i32>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
+declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
+
+define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc9]
+; CHECK-NEXT: kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
+; CHECK-NEXT: retq ## encoding: [0xc3]
+ %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+ %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
+ %res2 = add i8 %res, %res1
+ ret i8 %res2
+}
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 328b40d1527b..f635342218a5 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -3,7 +3,7 @@
define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
; CHECK-LABEL: compr1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vcompresspd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -15,7 +15,7 @@ declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double>
define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
; CHECK-LABEL: compr2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vcompresspd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -27,7 +27,7 @@ declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double>
define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
; CHECK-LABEL: compr3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vcompressps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -39,7 +39,7 @@ declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float>
define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
; CHECK-LABEL: compr4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -51,7 +51,7 @@ declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <
define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
; CHECK-LABEL: compr5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -64,7 +64,7 @@ declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <
define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
; CHECK-LABEL: compr6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcompressps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8a,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -76,7 +76,7 @@ declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4
define void @compr7(i8* %addr, <8 x double> %data) {
; CHECK-LABEL: compr7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
@@ -85,7 +85,7 @@ define void @compr7(i8* %addr, <8 x double> %data) {
define <4 x float> @compr8(<4 x float> %data) {
; CHECK-LABEL: compr8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
@@ -93,7 +93,7 @@ define <4 x float> @compr8(<4 x float> %data) {
define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: compr9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -105,7 +105,7 @@ declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %da
define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
; CHECK-LABEL: compr10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -119,7 +119,7 @@ define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
define i32 @compr11() {
; CHECK-LABEL: compr11:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq _xmm@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
; CHECK-NEXT: ## fixup A - offset: 3, value: _xmm@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
; CHECK-NEXT: vmovdqa (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
@@ -150,7 +150,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32
define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
; CHECK-LABEL: expand1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -162,7 +162,7 @@ declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x do
define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
; CHECK-LABEL: expand2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -174,7 +174,7 @@ declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x do
define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
; CHECK-LABEL: expand3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -186,7 +186,7 @@ declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x flo
define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
; CHECK-LABEL: expand4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -198,7 +198,7 @@ declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8
define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
; CHECK-LABEL: expand5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -211,7 +211,7 @@ declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4
define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
; CHECK-LABEL: expand6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vexpandps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x88,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -223,7 +223,7 @@ declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x
define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
; CHECK-LABEL: expand7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
@@ -232,7 +232,7 @@ define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
define <4 x float> @expand8(<4 x float> %data) {
; CHECK-LABEL: expand8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
ret <4 x float> %res
@@ -240,7 +240,7 @@ define <4 x float> @expand8(<4 x float> %data) {
define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
; CHECK-LABEL: expand9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -252,7 +252,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %
define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
; CHECK-LABEL: expand10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -264,7 +264,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>
define <8 x i64> @expand11(i8* %addr) {
; CHECK-LABEL: expand11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> undef, i8 -1)
@@ -273,7 +273,7 @@ define <8 x i64> @expand11(i8* %addr) {
define <8 x i64> @expand12(i8* %addr, i8 %mask) {
; CHECK-LABEL: expand12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x89,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -287,7 +287,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> , <8 x i64>, i8)
define < 2 x i64> @test_mask_mul_epi32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
@@ -296,7 +296,7 @@ define < 2 x i64> @test_mask_mul_epi32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
define < 2 x i64> @test_mask_mul_epi32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -307,7 +307,7 @@ define < 2 x i64> @test_mask_mul_epi32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2
define < 2 x i64> @test_mask_mul_epi32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -317,7 +317,7 @@ define < 2 x i64> @test_mask_mul_epi32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8
define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
@@ -327,7 +327,7 @@ define < 2 x i64> @test_mask_mul_epi32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b)
define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x28,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -339,7 +339,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b
define < 2 x i64> @test_mask_mul_epi32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -350,7 +350,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_
define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x18,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -363,7 +363,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x28,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -378,7 +378,7 @@ define < 2 x i64> @test_mask_mul_epi32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
define < 2 x i64> @test_mask_mul_epi32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -394,7 +394,7 @@ declare < 2 x i64> @llvm.x86.avx512.mask.pmul.dq.128(< 4 x i32>, < 4 x i32>, < 2
define < 4 x i64> @test_mask_mul_epi32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epi32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
@@ -403,7 +403,7 @@ define < 4 x i64> @test_mask_mul_epi32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
define < 4 x i64> @test_mask_mul_epi32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -414,7 +414,7 @@ define < 4 x i64> @test_mask_mul_epi32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4
define < 4 x i64> @test_mask_mul_epi32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -424,7 +424,7 @@ define < 4 x i64> @test_mask_mul_epi32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8
define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
@@ -434,7 +434,7 @@ define < 4 x i64> @test_mask_mul_epi32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b)
define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x28,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -446,7 +446,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b
define < 4 x i64> @test_mask_mul_epi32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -457,7 +457,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_
define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epi32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x38,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -470,7 +470,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x28,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -485,7 +485,7 @@ define < 4 x i64> @test_mask_mul_epi32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
define < 4 x i64> @test_mask_mul_epi32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epi32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuldq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -501,7 +501,7 @@ declare < 4 x i64> @llvm.x86.avx512.mask.pmul.dq.256(< 8 x i32>, < 8 x i32>, < 4
define < 2 x i64> @test_mask_mul_epu32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> zeroinitializer, i8 -1)
@@ -510,7 +510,7 @@ define < 2 x i64> @test_mask_mul_epu32_rr_128(< 4 x i32> %a, < 4 x i32> %b) {
define < 2 x i64> @test_mask_mul_epu32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -521,7 +521,7 @@ define < 2 x i64> @test_mask_mul_epu32_rrk_128(< 4 x i32> %a, < 4 x i32> %b, < 2
define < 2 x i64> @test_mask_mul_epu32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -531,7 +531,7 @@ define < 2 x i64> @test_mask_mul_epu32_rrkz_128(< 4 x i32> %a, < 4 x i32> %b, i8
define < 2 x i64> @test_mask_mul_epu32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rm_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 4 x i32>, < 4 x i32>* %ptr_b
@@ -541,7 +541,7 @@ define < 2 x i64> @test_mask_mul_epu32_rm_128(< 4 x i32> %a, < 4 x i32>* %ptr_b)
define < 2 x i64> @test_mask_mul_epu32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xf4,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -553,7 +553,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmk_128(< 4 x i32> %a, < 4 x i32>* %ptr_b
define < 2 x i64> @test_mask_mul_epu32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -564,7 +564,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmkz_128(< 4 x i32> %a, < 4 x i32>* %ptr_
define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rmb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x18,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -577,7 +577,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmb_128(< 4 x i32> %a, i64* %ptr_b) {
define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x19,0xf4,0x0f]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -592,7 +592,7 @@ define < 2 x i64> @test_mask_mul_epu32_rmbk_128(< 4 x i32> %a, i64* %ptr_b, < 2
define < 2 x i64> @test_mask_mul_epu32_rmbkz_128(< 4 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x99,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -608,7 +608,7 @@ declare < 2 x i64> @llvm.x86.avx512.mask.pmulu.dq.128(< 4 x i32>, < 4 x i32>, <
define < 4 x i64> @test_mask_mul_epu32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
; CHECK-LABEL: test_mask_mul_epu32_rr_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> zeroinitializer, i8 -1)
@@ -617,7 +617,7 @@ define < 4 x i64> @test_mask_mul_epu32_rr_256(< 8 x i32> %a, < 8 x i32> %b) {
define < 4 x i64> @test_mask_mul_epu32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -628,7 +628,7 @@ define < 4 x i64> @test_mask_mul_epu32_rrk_256(< 8 x i32> %a, < 8 x i32> %b, < 4
define < 4 x i64> @test_mask_mul_epu32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rrkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -638,7 +638,7 @@ define < 4 x i64> @test_mask_mul_epu32_rrkz_256(< 8 x i32> %a, < 8 x i32> %b, i8
define < 4 x i64> @test_mask_mul_epu32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rm_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%b = load < 8 x i32>, < 8 x i32>* %ptr_b
@@ -648,7 +648,7 @@ define < 4 x i64> @test_mask_mul_epu32_rm_256(< 8 x i32> %a, < 8 x i32>* %ptr_b)
define < 4 x i64> @test_mask_mul_epu32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xf4,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -660,7 +660,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmk_256(< 8 x i32> %a, < 8 x i32>* %ptr_b
define < 4 x i64> @test_mask_mul_epu32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -671,7 +671,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmkz_256(< 8 x i32> %a, < 8 x i32>* %ptr_
define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
; CHECK-LABEL: test_mask_mul_epu32_rmb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x38,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load i64, i64* %ptr_b
@@ -684,7 +684,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmb_256(< 8 x i32> %a, i64* %ptr_b) {
define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4 x i64> %passThru, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbk_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x39,0xf4,0x0f]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -699,7 +699,7 @@ define < 4 x i64> @test_mask_mul_epu32_rmbk_256(< 8 x i32> %a, i64* %ptr_b, < 4
define < 4 x i64> @test_mask_mul_epu32_rmbkz_256(< 8 x i32> %a, i64* %ptr_b, i8 %mask) {
; CHECK-LABEL: test_mask_mul_epu32_rmbkz_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmuludq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xb9,0xf4,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -715,10 +715,10 @@ declare < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32>, < 8 x i32>, <
define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
; CHECK-LABEL: test_cmpps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)
ret i8 %res
@@ -727,10 +727,10 @@ define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: test_cmpps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)
ret i8 %res
@@ -739,10 +739,10 @@ define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: test_cmppd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)
ret i8 %res
@@ -751,10 +751,10 @@ define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: test_cmppd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)
ret i8 %res
@@ -763,7 +763,7 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -775,7 +775,7 @@ define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_max_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -788,7 +788,7 @@ define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_max_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -798,7 +798,7 @@ declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_max_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -811,7 +811,7 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_max_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -825,7 +825,7 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_max_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
@@ -835,7 +835,7 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -847,7 +847,7 @@ define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1
define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_min_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1]
; CHECK-NEXT: vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
@@ -860,7 +860,7 @@ define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1,
define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_min_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -870,7 +870,7 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_maskz_min_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -883,7 +883,7 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
; CHECK-LABEL: test_mm512_mask_min_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1]
; CHECK-NEXT: vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
@@ -897,7 +897,7 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_mm512_min_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
@@ -907,7 +907,7 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_sqrt_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -918,7 +918,7 @@ declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double
define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_sqrt_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -930,7 +930,7 @@ declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>,
define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_getexp_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexppd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x42,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -941,7 +941,7 @@ declare <4 x double> @llvm.x86.avx512.mask.getexp.pd.256(<4 x double>, <4 x doub
define <8 x float> @test_getexp_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_getexp_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vgetexpps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x42,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -953,7 +953,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <
define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda]
@@ -970,7 +970,7 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda]
@@ -987,7 +987,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <
define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda]
@@ -1004,7 +1004,7 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>,
define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda]
@@ -1021,7 +1021,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x77,0xda]
@@ -1038,7 +1038,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x77,0xda]
@@ -1055,7 +1055,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i3
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x77,0xda]
@@ -1070,7 +1070,7 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128_cast:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -1084,7 +1084,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i3
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x77,0xda]
@@ -1097,75 +1097,11 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
ret <8 x float> %res2
}
-declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vpabsq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x1f,0xc8]
-; CHECK-NEXT: vpabsq %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x1f,0xc0]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
- %res2 = add <2 x i64> %res, %res1
- ret <2 x i64> %res2
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vpabsq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x1f,0xc8]
-; CHECK-NEXT: vpabsq %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x1f,0xc0]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
- %res2 = add <4 x i64> %res, %res1
- ret <4 x i64> %res2
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vpabsd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1e,0xc8]
-; CHECK-NEXT: vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
- %res2 = add <4 x i32> %res, %res1
- ret <4 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vpabsd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1e,0xc8]
-; CHECK-NEXT: vpabsd %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0]
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x2c,0xd1]
; CHECK-NEXT: vscalefpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x2c,0xc1]
@@ -1181,7 +1117,7 @@ declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x doub
define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x2c,0xd1]
; CHECK-NEXT: vscalefpd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x2c,0xc1]
@@ -1197,7 +1133,7 @@ declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>
define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2c,0xd1]
; CHECK-NEXT: vscalefps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x2c,0xc1]
@@ -1213,7 +1149,7 @@ declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>
define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2c,0xd1]
; CHECK-NEXT: vscalefps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x2c,0xc1]
@@ -1229,7 +1165,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
; CHECK-NEXT: vpmovqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
@@ -1249,7 +1185,7 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0x07]
; CHECK-NEXT: vpmovqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0x07]
@@ -1263,7 +1199,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
; CHECK-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
@@ -1283,7 +1219,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0x07]
; CHECK-NEXT: vpmovsqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0x07]
@@ -1297,7 +1233,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
; CHECK-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
@@ -1317,7 +1253,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0x07]
; CHECK-NEXT: vpmovusqb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0x07]
@@ -1331,7 +1267,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
; CHECK-NEXT: vpmovqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
@@ -1351,7 +1287,7 @@ declare void @llvm.x86.avx512.mask.pmov.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0x07]
; CHECK-NEXT: vpmovqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0x07]
@@ -1365,7 +1301,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
; CHECK-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
@@ -1385,7 +1321,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0x07]
; CHECK-NEXT: vpmovsqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0x07]
@@ -1399,7 +1335,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
; CHECK-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
@@ -1419,7 +1355,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qb.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qb_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0x07]
; CHECK-NEXT: vpmovusqb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0x07]
@@ -1433,7 +1369,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2]
; CHECK-NEXT: vpmovqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
@@ -1453,7 +1389,7 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0x07]
; CHECK-NEXT: vpmovqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0x07]
@@ -1467,7 +1403,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2]
; CHECK-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
@@ -1487,7 +1423,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0x07]
; CHECK-NEXT: vpmovsqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0x07]
@@ -1501,7 +1437,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2]
; CHECK-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
@@ -1521,7 +1457,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0x07]
; CHECK-NEXT: vpmovusqw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0x07]
@@ -1535,7 +1471,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2]
; CHECK-NEXT: vpmovqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
@@ -1555,7 +1491,7 @@ declare void @llvm.x86.avx512.mask.pmov.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0x07]
; CHECK-NEXT: vpmovqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0x07]
@@ -1569,7 +1505,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2]
; CHECK-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
@@ -1589,7 +1525,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0x07]
; CHECK-NEXT: vpmovsqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0x07]
@@ -1603,7 +1539,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2]
; CHECK-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
@@ -1623,7 +1559,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qw.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qw_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0x07]
; CHECK-NEXT: vpmovusqw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0x07]
@@ -1637,7 +1573,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc2]
; CHECK-NEXT: vpmovqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
@@ -1657,7 +1593,7 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0x07]
; CHECK-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0x07]
@@ -1671,7 +1607,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc2]
; CHECK-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
@@ -1691,7 +1627,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0x07]
; CHECK-NEXT: vpmovsqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0x07]
@@ -1705,7 +1641,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc2]
; CHECK-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
@@ -1725,7 +1661,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.128(i8* %ptr, <2 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0x07]
; CHECK-NEXT: vpmovusqd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0x07]
@@ -1739,7 +1675,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2]
; CHECK-NEXT: vpmovqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
@@ -1759,7 +1695,7 @@ declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0x07]
; CHECK-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0x07]
@@ -1773,7 +1709,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2]
; CHECK-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1]
@@ -1793,7 +1729,7 @@ declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0x07]
; CHECK-NEXT: vpmovsqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0x07]
@@ -1807,7 +1743,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2]
; CHECK-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1]
@@ -1827,7 +1763,7 @@ declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8)
define void @test_int_x86_avx512_mask_pmovus_qd_mem_256(i8* %ptr, <4 x i64> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0x07]
; CHECK-NEXT: vpmovusqd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0x07]
@@ -1841,7 +1777,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
; CHECK-NEXT: vpmovdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
@@ -1861,7 +1797,7 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0x07]
; CHECK-NEXT: vpmovdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0x07]
@@ -1875,7 +1811,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
; CHECK-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
@@ -1895,7 +1831,7 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0x07]
; CHECK-NEXT: vpmovsdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0x07]
@@ -1909,7 +1845,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
; CHECK-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
@@ -1929,7 +1865,7 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_db_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0x07]
; CHECK-NEXT: vpmovusdb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0x07]
@@ -1943,7 +1879,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
; CHECK-NEXT: vpmovdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
@@ -1963,7 +1899,7 @@ declare void @llvm.x86.avx512.mask.pmov.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0x07]
; CHECK-NEXT: vpmovdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0x07]
@@ -1977,7 +1913,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
; CHECK-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
@@ -1997,7 +1933,7 @@ declare void @llvm.x86.avx512.mask.pmovs.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0x07]
; CHECK-NEXT: vpmovsdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0x07]
@@ -2011,7 +1947,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32>, <16 x i8>, i8)
define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
; CHECK-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
@@ -2031,7 +1967,7 @@ declare void @llvm.x86.avx512.mask.pmovus.db.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_db_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0x07]
; CHECK-NEXT: vpmovusdb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0x07]
@@ -2045,7 +1981,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2]
; CHECK-NEXT: vpmovdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
@@ -2065,7 +2001,7 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
; CHECK-NEXT: vpmovdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0x07]
@@ -2079,7 +2015,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2]
; CHECK-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
@@ -2099,7 +2035,7 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0x07]
; CHECK-NEXT: vpmovsdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0x07]
@@ -2113,7 +2049,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2]
; CHECK-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
@@ -2133,7 +2069,7 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.128(i8* %ptr, <4 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_128(i8* %ptr, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0x07]
; CHECK-NEXT: vpmovusdw %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0x07]
@@ -2147,7 +2083,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2]
; CHECK-NEXT: vpmovdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
@@ -2167,7 +2103,7 @@ declare void @llvm.x86.avx512.mask.pmov.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmov_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0x07]
; CHECK-NEXT: vpmovdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0x07]
@@ -2181,7 +2117,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovsdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2]
; CHECK-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
@@ -2201,7 +2137,7 @@ declare void @llvm.x86.avx512.mask.pmovs.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovs_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0x07]
; CHECK-NEXT: vpmovsdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0x07]
@@ -2215,7 +2151,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpmovusdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2]
; CHECK-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
@@ -2235,7 +2171,7 @@ declare void @llvm.x86.avx512.mask.pmovus.dw.mem.256(i8* %ptr, <8 x i32>, i8)
define void @test_int_x86_avx512_mask_pmovus_dw_mem_256(i8* %ptr, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_mem_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0x07]
; CHECK-NEXT: vpmovusdw %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0x07]
@@ -2249,7 +2185,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32>, <4 x float>, i
define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8]
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0]
@@ -2265,7 +2201,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32>, <8 x float>, i
define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8]
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0]
@@ -2281,7 +2217,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8]
; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0]
@@ -2295,7 +2231,7 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -2315,7 +2251,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8]
; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0]
@@ -2331,7 +2267,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double>, <4 x float>
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8]
; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0]
@@ -2347,7 +2283,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8]
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0]
@@ -2361,7 +2297,7 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x flo
define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -2381,7 +2317,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8]
; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0]
@@ -2395,7 +2331,7 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -2415,7 +2351,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8]
; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0]
@@ -2431,7 +2367,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8]
; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0]
@@ -2447,7 +2383,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8)
define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8]
; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0]
@@ -2463,7 +2399,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float>, <2 x double
define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8]
; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0]
@@ -2479,7 +2415,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float>, <4 x double
define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8]
; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xc0]
@@ -2495,7 +2431,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8]
; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0]
@@ -2511,7 +2447,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8]
; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0]
@@ -2527,7 +2463,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8]
; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0]
@@ -2541,7 +2477,7 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -2561,7 +2497,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8]
; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
@@ -2577,7 +2513,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8]
; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0]
@@ -2591,7 +2527,7 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128_zext:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8]
; CHECK-NEXT: vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
@@ -2611,7 +2547,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>,
define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8]
; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0]
@@ -2627,7 +2563,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8]
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0]
@@ -2643,7 +2579,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8]
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0]
@@ -2659,7 +2595,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i
define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8]
; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0]
@@ -2675,7 +2611,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i
define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8]
; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0]
@@ -2691,7 +2627,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32>, <4 x float>,
define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8]
; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xc0]
@@ -2707,7 +2643,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32>, <8 x float>,
define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8]
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xc0]
@@ -2723,7 +2659,7 @@ declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2
define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrndscalepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x09,0xc8,0x04]
; CHECK-NEXT: vrndscalepd $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x58]
@@ -2739,7 +2675,7 @@ declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4
define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrndscalepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x09,0xc8,0x04]
; CHECK-NEXT: vrndscalepd $88, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x09,0xc0,0x58]
@@ -2755,7 +2691,7 @@ declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x
define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrndscaleps $88, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x08,0xc8,0x58]
; CHECK-NEXT: vrndscaleps $4, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x04]
@@ -2771,7 +2707,7 @@ declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x
define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_rndscale_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrndscaleps $5, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x08,0xc8,0x05]
; CHECK-NEXT: vrndscaleps $66, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x08,0xc0,0x42]
@@ -2783,93 +2719,11 @@ define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8
ret <8 x float> %res2
}
-declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float>, i32, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16]
-; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
-; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res2, %res3
- ret <8 x float> %res4
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16]
-; CHECK-NEXT: ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
-; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
- %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
- %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4)
- %res3 = fadd <4 x double> %res, %res1
- %res4 = fadd <4 x double> %res2, %res3
- ret <4 x double> %res4
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xd1,0x16]
-; CHECK-NEXT: ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x43,0xc1,0x16]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1)
- %res2 = add <4 x i64> %res, %res1
- ret <4 x i64> %res2
-}
-
declare <2 x double> @llvm.x86.avx512.mask.getmant.pd.128(<2 x double>, i32, <2 x double>, i8)
define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x26,0xd0,0x0b]
; CHECK-NEXT: vgetmantpd $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b]
@@ -2889,7 +2743,7 @@ declare <4 x double> @llvm.x86.avx512.mask.getmant.pd.256(<4 x double>, i32, <4
define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x26,0xc8,0x0b]
; CHECK-NEXT: vgetmantpd $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x26,0xc0,0x0b]
@@ -2905,7 +2759,7 @@ declare <4 x float> @llvm.x86.avx512.mask.getmant.ps.128(<4 x float>, i32, <4 x
define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x26,0xc8,0x0b]
; CHECK-NEXT: vgetmantps $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x26,0xc0,0x0b]
@@ -2921,7 +2775,7 @@ declare <8 x float> @llvm.x86.avx512.mask.getmant.ps.256(<8 x float>, i32, <8 x
define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x26,0xc8,0x0b]
; CHECK-NEXT: vgetmantps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x26,0xc0,0x0b]
@@ -2937,7 +2791,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21]
@@ -2954,7 +2808,7 @@ declare <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32>, <4 x i32>, <4
define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
; CHECK-NEXT: vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21]
@@ -2971,7 +2825,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21]
@@ -2988,7 +2842,7 @@ declare <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
; CHECK-NEXT: vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21]
@@ -3005,7 +2859,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21]
@@ -3022,7 +2876,7 @@ declare <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64>, <2 x i64>, <2
define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
; CHECK-NEXT: vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21]
@@ -3039,7 +2893,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21]
@@ -3056,7 +2910,7 @@ declare <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
; CHECK-NEXT: vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21]
@@ -3071,7 +2925,7 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -3080,7 +2934,7 @@ define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_128_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x13,0xc8]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -3092,7 +2946,7 @@ define <4 x float> @test_x86_vcvtph2ps_128_rrk(<8 x i16> %a0,<4 x float> %a1, i8
define <4 x float> @test_x86_vcvtph2ps_128_rrkz(<8 x i16> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_128_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x13,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3104,7 +2958,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16>, <4 x float>,
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_vcvtph2ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -3113,7 +2967,7 @@ define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_256_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtph2ps %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x13,0xc8]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -3124,7 +2978,7 @@ define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8
define <8 x float> @test_x86_vcvtph2ps_256_rrkz(<8 x i16> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_vcvtph2ps_256_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x13,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3136,7 +2990,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>,
define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %src) {
; CHECK-LABEL: test_x86_vcvtps2ph_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x02]
; CHECK-NEXT: vcvtps2ph $2, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc2,0x02]
@@ -3156,7 +3010,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float>, i32, <8 x i16
define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %src) {
; CHECK-LABEL: test_x86_vcvtps2ph_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x02]
; CHECK-NEXT: vcvtps2ph $2, %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc2,0x02]
@@ -3176,7 +3030,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16
define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_256_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -3185,7 +3039,7 @@ define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) {
define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_256_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3195,7 +3049,7 @@ define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_256_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4e,0xc8]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -3206,7 +3060,7 @@ define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %
define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) {
; CHECK-LABEL: test_rsqrt_ps_128_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -3215,7 +3069,7 @@ define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) {
define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_128_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3225,7 +3079,7 @@ define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_ps_128_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4e,0xc8]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -3239,7 +3093,7 @@ declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8
define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_256_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1)
@@ -3248,7 +3102,7 @@ define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) {
define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_256_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14ps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3258,7 +3112,7 @@ define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) {
define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_256_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14ps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x4c,0xc8]
; CHECK-NEXT: vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
@@ -3269,7 +3123,7 @@ define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %ma
define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) {
; CHECK-LABEL: test_rcp_ps_128_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
@@ -3278,7 +3132,7 @@ define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) {
define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_128_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14ps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3288,7 +3142,7 @@ define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) {
define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_ps_128_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x4c,0xc8]
; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
@@ -3302,7 +3156,7 @@ declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8)
define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) {
; CHECK-LABEL: test_rsqrt_pd_256_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -3311,7 +3165,7 @@ define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) {
define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_256_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3321,7 +3175,7 @@ define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_256_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4e,0xc8]
; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
@@ -3332,7 +3186,7 @@ define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i
define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) {
; CHECK-LABEL: test_rsqrt_pd_128_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
@@ -3341,7 +3195,7 @@ define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) {
define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_128_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4e,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3351,7 +3205,7 @@ define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rsqrt_pd_128_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrsqrt14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4e,0xc8]
; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
@@ -3365,7 +3219,7 @@ declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>,
define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_256_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1)
@@ -3374,7 +3228,7 @@ define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) {
define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_256_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14pd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3384,7 +3238,7 @@ define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) {
define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_256_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4c,0xc8]
; CHECK-NEXT: vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
@@ -3395,7 +3249,7 @@ define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8
define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) {
; CHECK-LABEL: test_rcp_pd_128_rr:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1)
@@ -3404,7 +3258,7 @@ define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) {
define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_128_rrkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14pd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x4c,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -3414,7 +3268,7 @@ define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) {
define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) {
; CHECK-LABEL: test_rcp_pd_128_rrk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vrcp14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4c,0xc8]
; CHECK-NEXT: vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
@@ -3426,85 +3280,11 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8
declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone
-declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd0,0x00]
-; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xc8,0x00]
-; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshuff32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc0,0x00]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1]
-; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask)
- %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x float> %res1, %res2
- %res5 = fadd <8 x float> %res3, %res4
- ret <8 x float> %res5
-}
-
-define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* %x0ptr, <8 x float> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vmovaps (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0f]
-; CHECK-NEXT: vshuff32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x23,0xc1,0x00]
-; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %x0 = load <4 x float>, <4 x float>* %x0ptr
- %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask)
- ret <8 x float> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x43,0xd0,0x00]
-; CHECK-NEXT: ## ymm2 {%k1} {z} = ymm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x43,0xc8,0x00]
-; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vshufi32x4 $0, %ymm0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x43,0xc0,0x00]
-; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask)
- %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask)
- %res4 = add <8 x i32> %res1, %res2
- %res5 = add <8 x i32> %res3, %res4
- ret <8 x i32> %res5
-}
-
-define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(<4 x i32>* %x0ptr, <8 x i32> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256_load:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
-; CHECK-NEXT: vshufi32x4 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x43,0xc1,0x00]
-; CHECK-NEXT: ## ymm0 {%k1} = ymm1[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %x0 = load <4 x i32>, <4 x i32>* %x0ptr
- %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask)
- ret <8 x i32> %res
-}
-
declare <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xd9]
; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1]
@@ -3524,7 +3304,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xd9]
; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1]
@@ -3544,7 +3324,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xd9]
; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1]
@@ -3564,7 +3344,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xd9]
; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1]
@@ -3584,7 +3364,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i32, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03]
@@ -3604,7 +3384,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32>, i32, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03]
@@ -3624,7 +3404,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64>, i32, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03]
@@ -3644,7 +3424,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64>, i32, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03]
; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03]
@@ -3664,7 +3444,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32>, <4 x i32>, <4 x i
define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xd9]
; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1]
@@ -3684,7 +3464,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32>, <8 x i32>, <8 x i
define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xd9]
; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1]
@@ -3704,7 +3484,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64>, <2 x i64>, <2 x i
define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xd9]
; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1]
@@ -3724,7 +3504,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64>, <4 x i64>, <4 x i
define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xd9]
; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1]
@@ -3744,7 +3524,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32>, i32, <4 x i32>, i8
define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03]
@@ -3764,7 +3544,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32>, i32, <8 x i32>, i8
define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03]
@@ -3784,7 +3564,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64>, i32, <2 x i64>, i8
define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03]
@@ -3804,7 +3584,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64>, i32, <4 x i64>, i8
define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03]
; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03]
@@ -3824,7 +3604,7 @@ declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64
define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8]
; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0]
@@ -3844,7 +3624,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4
define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8]
; CHECK-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0]
@@ -3864,7 +3644,7 @@ declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>,
define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8]
; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0]
@@ -3884,7 +3664,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8
define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8]
; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0]
@@ -3904,7 +3684,7 @@ declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x do
define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05]
@@ -3926,7 +3706,7 @@ declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.pd.128(<2 x double>, <2 x d
define <2 x double>@test_int_x86_avx512_maskz_fixupimm_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xda,0x05]
@@ -3946,11 +3726,11 @@ declare <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double>, <4 x do
define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04]
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05]
; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc]
; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03]
@@ -3968,11 +3748,11 @@ declare <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double>, <4 x d
define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xda,0x05]
-; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4]
+; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8]
; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04]
; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd]
@@ -3991,7 +3771,7 @@ declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float>, <4 x floa
define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xda,0x05]
@@ -4014,7 +3794,7 @@ declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float>, <4 x flo
define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
; CHECK-NEXT: vfixupimmps $5, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xda,0x05]
@@ -4037,13 +3817,13 @@ declare <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float>, <8 x floa
define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xda,0x05]
; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0]
; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xe2,0x05]
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xc2,0x05]
; CHECK-NEXT: vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0]
; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
@@ -4060,13 +3840,13 @@ declare <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float>, <8 x flo
define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xda,0x05]
; CHECK-NEXT: vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0]
; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xe2,0x05]
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xc2,0x05]
; CHECK-NEXT: vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0]
; CHECK-NEXT: vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
@@ -4079,242 +3859,9 @@ define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8
ret <8 x float> %res4
}
-declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
-
-define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32>, <8 x i32>, i8)
-
-define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
-
-define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_q_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
-
-define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestm_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
-
-define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32>, <8 x i32>, i8 %x2)
-
-define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
-
-define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
-
-define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
-; CHECK-NEXT: vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
-; CHECK-NEXT: kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT: addb %cl, %al ## encoding: [0x00,0xc8]
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
- %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
- %res2 = add i8 %res, %res1
- ret i8 %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf]
-; CHECK-NEXT: vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
-; CHECK-NEXT: vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7]
-; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
-; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask)
- %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask)
- %res3 = add <8 x i32> %res, %res1
- %res4 = add <8 x i32> %res2, %res3
- ret <8 x i32> %res4
-}
-
-declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf]
-; CHECK-NEXT: vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
-; CHECK-NEXT: vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7]
-; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
-; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1)
- %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask)
- %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask)
- %res3 = add <4 x i32> %res, %res1
- %res4 = add <4 x i32> %res2, %res3
- ret <4 x i32> %res4
-}
-
-declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
-; CHECK-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
-; CHECK-NEXT: vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7]
-; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask)
- %res3 = add <4 x i64> %res, %res1
- %res4 = add <4 x i64> %res2, %res3
- ret <4 x i64> %res4
-}
-
-declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT: vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
-; CHECK-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
-; CHECK-NEXT: vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7]
-; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
- %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask)
- %res3 = add <2 x i64> %res, %res1
- %res4 = add <2 x i64> %res2, %res3
- ret <2 x i64> %res4
-}
-
-
define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xe2,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
@@ -4322,7 +3869,7 @@ define <2 x i64> @test_x86_avx512_psra_q_128(<2 x i64> %a0, <2 x i64> %a1) {
}
define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xe2,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -4335,7 +3882,7 @@ define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
}
define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xe2,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4350,7 +3897,7 @@ declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind rea
define <4 x i64> @test_x86_avx512_psra_q_256(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psra_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xe2,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
@@ -4358,7 +3905,7 @@ define <4 x i64> @test_x86_avx512_psra_q_256(<4 x i64> %a0, <2 x i64> %a1) {
}
define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psra_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xe2,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -4371,7 +3918,7 @@ define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
}
define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psra_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xe2,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4386,7 +3933,7 @@ declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind rea
define <2 x i64> @test_x86_avx512_psrai_q_128(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $7, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xe0,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
@@ -4394,7 +3941,7 @@ define <2 x i64> @test_x86_avx512_psrai_q_128(<2 x i64> %a0) {
}
define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq $7, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xe0,0x07]
; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
@@ -4407,7 +3954,7 @@ define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %pas
}
define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq $7, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x72,0xe0,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4422,7 +3969,7 @@ declare <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64>, i32) nounwind readnone
define <4 x i64> @test_x86_avx512_psrai_q_256(<4 x i64> %a0) {
; CHECK-LABEL: test_x86_avx512_psrai_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsraq $7, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xe0,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
@@ -4430,7 +3977,7 @@ define <4 x i64> @test_x86_avx512_psrai_q_256(<4 x i64> %a0) {
}
define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrai_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq $7, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xe0,0x07]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
@@ -4443,7 +3990,7 @@ define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %pas
}
define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrai_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsraq $7, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x72,0xe0,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4457,7 +4004,7 @@ declare <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_avx512_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x46,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
@@ -4466,7 +4013,7 @@ define <2 x i64> @test_x86_avx512_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x46,0xd1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
@@ -4480,7 +4027,7 @@ define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1,
define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x46,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4495,7 +4042,7 @@ declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind re
define <4 x i64> @test_x86_avx512_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_x86_avx512_psrav_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x46,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
@@ -4504,7 +4051,7 @@ define <4 x i64> @test_x86_avx512_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1) {
define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_mask_psrav_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x46,0xd1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
@@ -4518,7 +4065,7 @@ define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1,
define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
; CHECK-LABEL: test_x86_avx512_maskz_psrav_q_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vpsravq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x46,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4535,7 +4082,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>
define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4547,7 +4094,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>
define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4559,7 +4106,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x doub
define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4571,7 +4118,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x doub
define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd128_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4581,7 +4128,7 @@ define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2
define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
@@ -4598,7 +4145,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
@@ -4615,7 +4162,7 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
@@ -4630,7 +4177,7 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2
define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
@@ -4647,7 +4194,7 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
@@ -4664,7 +4211,7 @@ declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
@@ -4679,7 +4226,7 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4
define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
@@ -4696,7 +4243,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
@@ -4713,7 +4260,7 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
@@ -4728,7 +4275,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x
define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
@@ -4745,7 +4292,7 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
@@ -4762,7 +4309,7 @@ declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
@@ -4780,7 +4327,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x dou
define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaa,0xda]
@@ -4798,7 +4345,7 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x dou
define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xaa,0xda]
@@ -4815,7 +4362,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float
define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaa,0xda]
@@ -4832,7 +4379,7 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float
define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xaa,0xda]
@@ -4849,7 +4396,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float
define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd256_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4861,7 +4408,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float
define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd128_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4873,7 +4420,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x dou
define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd256_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4885,7 +4432,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x dou
define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmadd128_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4897,7 +4444,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float
define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub256_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4909,7 +4456,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float
define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub128_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4921,7 +4468,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x dou
define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub256_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4933,7 +4480,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x dou
define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfnmsub128_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -4944,7 +4491,7 @@ define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1,
define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda]
@@ -4961,7 +4508,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x do
define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda]
@@ -4976,7 +4523,7 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <
define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda]
@@ -4993,7 +4540,7 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x do
define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda]
@@ -5008,7 +4555,7 @@ define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <
define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda]
@@ -5025,7 +4572,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x floa
define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda]
@@ -5040,7 +4587,7 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4
define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda]
@@ -5057,7 +4604,7 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x floa
define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda]
@@ -5072,7 +4619,7 @@ define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8
define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfnmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xac,0xda]
@@ -5087,7 +4634,7 @@ define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2
define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfnmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xac,0xda]
@@ -5102,7 +4649,7 @@ define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4
define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xac,0xda]
@@ -5117,7 +4664,7 @@ define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x
define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xac,0xda]
@@ -5134,7 +4681,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x flo
define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub256_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5146,7 +4693,7 @@ declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x flo
define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmaddsub128_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5158,7 +4705,7 @@ declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x d
define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmaddsub256_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5170,7 +4717,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x d
define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmaddsub128_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5180,7 +4727,7 @@ define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a
define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
@@ -5197,7 +4744,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
@@ -5214,7 +4761,7 @@ declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
@@ -5229,7 +4776,7 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0,
define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
@@ -5246,7 +4793,7 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
@@ -5263,7 +4810,7 @@ declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
@@ -5278,7 +4825,7 @@ define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0,
define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
@@ -5295,7 +4842,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
@@ -5312,7 +4859,7 @@ declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
@@ -5327,7 +4874,7 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <
define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
@@ -5344,7 +4891,7 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
@@ -5361,7 +4908,7 @@ declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
@@ -5378,7 +4925,7 @@ declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x
define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa7,0xda]
@@ -5395,7 +4942,7 @@ declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x
define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa7,0xda]
@@ -5412,7 +4959,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x fl
define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa7,0xda]
@@ -5429,7 +4976,7 @@ declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x fl
define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
; CHECK-NEXT: vfmsubadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa7,0xda]
@@ -5445,7 +4992,7 @@ define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <
define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps_r:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5455,7 +5002,7 @@ define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <
define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
@@ -5464,7 +5011,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5475,7 +5022,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5486,7 +5033,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2
@@ -5496,7 +5043,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
@@ -5506,7 +5053,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a
define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmb:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5521,7 +5068,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmba:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5536,7 +5083,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2
@@ -5550,7 +5097,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%q = load float, float* %ptr_a2, align 4
@@ -5564,7 +5111,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_pd_r:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5574,7 +5121,7 @@ define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1
define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; CHECK-LABEL: test_mask_vfmadd128_pd_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
@@ -5583,7 +5130,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a
define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5594,7 +5141,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %
define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <2 x double>, <2 x double>* %ptr_a2
@@ -5604,7 +5151,7 @@ define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double>
define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_pd_r:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5614,7 +5161,7 @@ define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1
define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; CHECK-LABEL: test_mask_vfmadd256_pd_rz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
@@ -5623,7 +5170,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a
define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -5634,7 +5181,7 @@ define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %
define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%a2 = load <4 x double>, <4 x double>* %ptr_a2
diff --git a/test/CodeGen/X86/avx512vl-logic.ll b/test/CodeGen/X86/avx512vl-logic.ll
index 6e697cf59a4e..52b135c7c293 100644
--- a/test/CodeGen/X86/avx512vl-logic.ll
+++ b/test/CodeGen/X86/avx512vl-logic.ll
@@ -6,7 +6,7 @@
define <8 x i32> @vpandd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandd256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -19,7 +19,7 @@ entry:
define <8 x i32> @vpandnd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandnd256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm1
; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -33,7 +33,7 @@ entry:
define <8 x i32> @vpord256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpord256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -46,7 +46,7 @@ entry:
define <8 x i32> @vpxord256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpxord256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -59,7 +59,7 @@ entry:
define <4 x i64> @vpandq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandq256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -72,7 +72,7 @@ entry:
define <4 x i64> @vpandnq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandnq256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-NEXT: vpandn %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -86,7 +86,7 @@ entry:
define <4 x i64> @vporq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vporq256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -99,7 +99,7 @@ entry:
define <4 x i64> @vpxorq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpxorq256:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; CHECK-NEXT: vpxor %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -114,7 +114,7 @@ entry:
define <4 x i32> @vpandd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandd128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -127,7 +127,7 @@ entry:
define <4 x i32> @vpandnd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandnd128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -141,7 +141,7 @@ entry:
define <4 x i32> @vpord128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpord128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -154,7 +154,7 @@ entry:
define <4 x i32> @vpxord128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpxord128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -167,7 +167,7 @@ entry:
define <2 x i64> @vpandq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandq128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -180,7 +180,7 @@ entry:
define <2 x i64> @vpandnq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpandnq128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpandn %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -194,7 +194,7 @@ entry:
define <2 x i64> @vporq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vporq128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -207,7 +207,7 @@ entry:
define <2 x i64> @vpxorq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readnone ssp {
; CHECK-LABEL: vpxorq128:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -221,13 +221,13 @@ entry:
define <4 x double> @test_mm256_mask_andnot_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_mask_andnot_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnq %ymm2, %ymm1, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_andnot_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnpd %ymm2, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -245,13 +245,13 @@ entry:
define <4 x double> @test_mm256_maskz_andnot_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_maskz_andnot_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_andnot_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnpd %ymm1, %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -269,13 +269,13 @@ entry:
define <2 x double> @test_mm_mask_andnot_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_mask_andnot_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnq %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_andnot_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnpd %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -293,13 +293,13 @@ entry:
define <2 x double> @test_mm_maskz_andnot_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_maskz_andnot_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_andnot_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnpd %xmm1, %xmm0, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -317,13 +317,13 @@ entry:
define <8 x float> @test_mm256_mask_andnot_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_mask_andnot_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnd %ymm2, %ymm1, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_andnot_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnps %ymm2, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -340,13 +340,13 @@ entry:
define <8 x float> @test_mm256_maskz_andnot_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_maskz_andnot_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnd %ymm1, %ymm0, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_andnot_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnps %ymm1, %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -363,13 +363,13 @@ entry:
define <4 x float> @test_mm_mask_andnot_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_mask_andnot_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnd %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_andnot_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnps %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -387,13 +387,13 @@ entry:
define <4 x float> @test_mm_maskz_andnot_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_maskz_andnot_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandnd %xmm1, %xmm0, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_andnot_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandnps %xmm1, %xmm0, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -411,13 +411,13 @@ entry:
define <4 x double> @test_mm256_mask_and_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_mask_and_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %ymm1, %ymm2, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_and_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %ymm1, %ymm2, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -434,13 +434,13 @@ entry:
define <4 x double> @test_mm256_maskz_and_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_maskz_and_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %ymm0, %ymm1, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_and_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %ymm0, %ymm1, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -457,13 +457,13 @@ entry:
define <2 x double> @test_mm_mask_and_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_mask_and_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %xmm1, %xmm2, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_and_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %xmm1, %xmm2, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -480,13 +480,13 @@ entry:
define <2 x double> @test_mm_maskz_and_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_maskz_and_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandq %xmm0, %xmm1, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_and_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandpd %xmm0, %xmm1, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -503,13 +503,13 @@ entry:
define <8 x float> @test_mm256_mask_and_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_mask_and_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %ymm1, %ymm2, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_and_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %ymm1, %ymm2, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -525,13 +525,13 @@ entry:
define <8 x float> @test_mm256_maskz_and_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_maskz_and_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %ymm0, %ymm1, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_and_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %ymm0, %ymm1, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -547,13 +547,13 @@ entry:
define <4 x float> @test_mm_mask_and_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_mask_and_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %xmm1, %xmm2, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_and_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %xmm1, %xmm2, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -570,13 +570,13 @@ entry:
define <4 x float> @test_mm_maskz_and_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_maskz_and_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpandd %xmm0, %xmm1, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_and_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vandps %xmm0, %xmm1, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -593,13 +593,13 @@ entry:
define <4 x double> @test_mm256_mask_xor_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_mask_xor_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxorq %ymm2, %ymm1, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_xor_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorpd %ymm2, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -616,13 +616,13 @@ entry:
define <4 x double> @test_mm256_maskz_xor_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_maskz_xor_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxorq %ymm1, %ymm0, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_xor_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -639,13 +639,13 @@ entry:
define <2 x double> @test_mm_mask_xor_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_mask_xor_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxorq %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_xor_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorpd %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -662,13 +662,13 @@ entry:
define <2 x double> @test_mm_maskz_xor_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_maskz_xor_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxorq %xmm1, %xmm0, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_xor_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -685,13 +685,13 @@ entry:
define <8 x float> @test_mm256_mask_xor_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_mask_xor_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %ymm2, %ymm1, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_xor_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %ymm2, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -707,13 +707,13 @@ entry:
define <8 x float> @test_mm256_maskz_xor_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_maskz_xor_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %ymm1, %ymm0, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_xor_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %ymm1, %ymm0, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -729,13 +729,13 @@ entry:
define <4 x float> @test_mm_mask_xor_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_mask_xor_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %xmm2, %xmm1, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_xor_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -752,13 +752,13 @@ entry:
define <4 x float> @test_mm_maskz_xor_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_maskz_xor_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord %xmm1, %xmm0, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_xor_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps %xmm1, %xmm0, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -775,13 +775,13 @@ entry:
define <4 x double> @test_mm256_mask_or_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_mask_or_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vporq %ymm1, %ymm2, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_or_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorpd %ymm1, %ymm2, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -798,13 +798,13 @@ entry:
define <4 x double> @test_mm256_maskz_or_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; KNL-LABEL: test_mm256_maskz_or_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vporq %ymm0, %ymm1, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_or_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorpd %ymm0, %ymm1, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -821,13 +821,13 @@ entry:
define <2 x double> @test_mm_mask_or_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_mask_or_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vporq %xmm1, %xmm2, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_or_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorpd %xmm1, %xmm2, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -844,13 +844,13 @@ entry:
define <2 x double> @test_mm_maskz_or_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
; KNL-LABEL: test_mm_maskz_or_pd:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vporq %xmm0, %xmm1, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_or_pd:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorpd %xmm0, %xmm1, %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -867,13 +867,13 @@ entry:
define <8 x float> @test_mm256_mask_or_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_mask_or_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %ymm1, %ymm2, %ymm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_mask_or_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %ymm1, %ymm2, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -889,13 +889,13 @@ entry:
define <8 x float> @test_mm256_maskz_or_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; KNL-LABEL: test_mm256_maskz_or_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %ymm0, %ymm1, %ymm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm256_maskz_or_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %ymm0, %ymm1, %ymm0 {%k1} {z}
; SKX-NEXT: retq
@@ -911,13 +911,13 @@ entry:
define <4 x float> @test_mm_mask_or_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_mask_or_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %xmm1, %xmm2, %xmm0 {%k1}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_mask_or_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %xmm1, %xmm2, %xmm0 {%k1}
; SKX-NEXT: retq
@@ -934,13 +934,13 @@ entry:
define <4 x float> @test_mm_maskz_or_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
; KNL-LABEL: test_mm_maskz_or_ps:
-; KNL: ## BB#0: ## %entry
+; KNL: ## %bb.0: ## %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpord %xmm0, %xmm1, %xmm0 {%k1} {z}
; KNL-NEXT: retq
;
; SKX-LABEL: test_mm_maskz_or_ps:
-; SKX: ## BB#0: ## %entry
+; SKX: ## %bb.0: ## %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vorps %xmm0, %xmm1, %xmm0 {%k1} {z}
; SKX-NEXT: retq
diff --git a/test/CodeGen/X86/avx512vl-mov.ll b/test/CodeGen/X86/avx512vl-mov.ll
index af449d6628c4..f0ce312305fe 100644
--- a/test/CodeGen/X86/avx512vl-mov.ll
+++ b/test/CodeGen/X86/avx512vl-mov.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
define <8 x i32> @test_256_1(i8 * %addr) {
; CHECK-LABEL: test_256_1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
@@ -13,7 +13,7 @@ define <8 x i32> @test_256_1(i8 * %addr) {
define <8 x i32> @test_256_2(i8 * %addr) {
; CHECK-LABEL: test_256_2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
@@ -23,7 +23,7 @@ define <8 x i32> @test_256_2(i8 * %addr) {
define void @test_256_3(i8 * %addr, <4 x i64> %data) {
; CHECK-LABEL: test_256_3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
@@ -33,7 +33,7 @@ define void @test_256_3(i8 * %addr, <4 x i64> %data) {
define void @test_256_4(i8 * %addr, <8 x i32> %data) {
; CHECK-LABEL: test_256_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
@@ -43,7 +43,7 @@ define void @test_256_4(i8 * %addr, <8 x i32> %data) {
define void @test_256_5(i8 * %addr, <8 x i32> %data) {
; CHECK-LABEL: test_256_5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x i32>*
@@ -53,7 +53,7 @@ define void @test_256_5(i8 * %addr, <8 x i32> %data) {
define <4 x i64> @test_256_6(i8 * %addr) {
; CHECK-LABEL: test_256_6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
@@ -63,7 +63,7 @@ define <4 x i64> @test_256_6(i8 * %addr) {
define void @test_256_7(i8 * %addr, <4 x i64> %data) {
; CHECK-LABEL: test_256_7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
@@ -73,7 +73,7 @@ define void @test_256_7(i8 * %addr, <4 x i64> %data) {
define <4 x i64> @test_256_8(i8 * %addr) {
; CHECK-LABEL: test_256_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i64>*
@@ -83,7 +83,7 @@ define <4 x i64> @test_256_8(i8 * %addr) {
define void @test_256_9(i8 * %addr, <4 x double> %data) {
; CHECK-LABEL: test_256_9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
@@ -93,7 +93,7 @@ define void @test_256_9(i8 * %addr, <4 x double> %data) {
define <4 x double> @test_256_10(i8 * %addr) {
; CHECK-LABEL: test_256_10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
@@ -103,7 +103,7 @@ define <4 x double> @test_256_10(i8 * %addr) {
define void @test_256_11(i8 * %addr, <8 x float> %data) {
; CHECK-LABEL: test_256_11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
@@ -113,7 +113,7 @@ define void @test_256_11(i8 * %addr, <8 x float> %data) {
define <8 x float> @test_256_12(i8 * %addr) {
; CHECK-LABEL: test_256_12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
@@ -123,7 +123,7 @@ define <8 x float> @test_256_12(i8 * %addr) {
define void @test_256_13(i8 * %addr, <4 x double> %data) {
; CHECK-LABEL: test_256_13:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
@@ -133,7 +133,7 @@ define void @test_256_13(i8 * %addr, <4 x double> %data) {
define <4 x double> @test_256_14(i8 * %addr) {
; CHECK-LABEL: test_256_14:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x double>*
@@ -143,7 +143,7 @@ define <4 x double> @test_256_14(i8 * %addr) {
define void @test_256_15(i8 * %addr, <8 x float> %data) {
; CHECK-LABEL: test_256_15:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
@@ -153,7 +153,7 @@ define void @test_256_15(i8 * %addr, <8 x float> %data) {
define <8 x float> @test_256_16(i8 * %addr) {
; CHECK-LABEL: test_256_16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <8 x float>*
@@ -163,8 +163,8 @@ define <8 x float> @test_256_16(i8 * %addr) {
define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_17:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -177,8 +177,8 @@ define <8 x i32> @test_256_17(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_18:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0x75,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -191,8 +191,8 @@ define <8 x i32> @test_256_18(i8 * %addr, <8 x i32> %old, <8 x i32> %mask1) {
define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_19:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqa32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -205,8 +205,8 @@ define <8 x i32> @test_256_19(i8 * %addr, <8 x i32> %mask1) {
define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
; CHECK-LABEL: test_256_20:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -219,8 +219,8 @@ define <8 x i32> @test_256_20(i8 * %addr, <8 x i32> %mask1) {
define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_21:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -233,8 +233,8 @@ define <4 x i64> @test_256_21(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_22:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -247,8 +247,8 @@ define <4 x i64> @test_256_22(i8 * %addr, <4 x i64> %old, <4 x i64> %mask1) {
define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_23:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqa64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -261,8 +261,8 @@ define <4 x i64> @test_256_23(i8 * %addr, <4 x i64> %mask1) {
define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_24:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xa9,0x6f,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -275,10 +275,9 @@ define <4 x i64> @test_256_24(i8 * %addr, <4 x i64> %mask1) {
define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
; CHECK-LABEL: test_256_25:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
-; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
-; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; CHECK-NEXT: vcmpneq_oqps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x0c]
; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
@@ -290,10 +289,9 @@ define <8 x float> @test_256_25(i8 * %addr, <8 x float> %old, <8 x float> %mask1
define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1) {
; CHECK-LABEL: test_256_26:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
-; CHECK-NEXT: vcmpordps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x07]
-; CHECK-NEXT: vcmpneqps %ymm2, %ymm1, %k1 {%k1} ## encoding: [0x62,0xf1,0x74,0x29,0xc2,0xca,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
+; CHECK-NEXT: vcmpneq_oqps %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf1,0x74,0x28,0xc2,0xca,0x0c]
; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
@@ -305,10 +303,9 @@ define <8 x float> @test_256_26(i8 * %addr, <8 x float> %old, <8 x float> %mask1
define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
; CHECK-LABEL: test_256_27:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc2,0xc9,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
+; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x0c]
; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
@@ -320,10 +317,9 @@ define <8 x float> @test_256_27(i8 * %addr, <8 x float> %mask1) {
define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
; CHECK-LABEL: test_256_28:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
-; CHECK-NEXT: vcmpordps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x07]
-; CHECK-NEXT: vcmpneqps %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0xc2,0xc9,0x04]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
+; CHECK-NEXT: vcmpneq_oqps %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc9,0x0c]
; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%mask = fcmp one <8 x float> %mask1, zeroinitializer
@@ -335,8 +331,8 @@ define <8 x float> @test_256_28(i8 * %addr, <8 x float> %mask1) {
define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_29:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -349,8 +345,8 @@ define <4 x double> @test_256_29(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_30:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %ymm2, %ymm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x28,0x1f,0xca,0x04]
; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -363,8 +359,8 @@ define <4 x double> @test_256_30(i8 * %addr, <4 x double> %old, <4 x i64> %mask1
define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_31:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -377,8 +373,8 @@ define <4 x double> @test_256_31(i8 * %addr, <4 x i64> %mask1) {
define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
; CHECK-LABEL: test_256_32:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpxor %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xef,0xc9]
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -391,7 +387,7 @@ define <4 x double> @test_256_32(i8 * %addr, <4 x i64> %mask1) {
define <4 x i32> @test_128_1(i8 * %addr) {
; CHECK-LABEL: test_128_1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
@@ -401,7 +397,7 @@ define <4 x i32> @test_128_1(i8 * %addr) {
define <4 x i32> @test_128_2(i8 * %addr) {
; CHECK-LABEL: test_128_2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
@@ -411,7 +407,7 @@ define <4 x i32> @test_128_2(i8 * %addr) {
define void @test_128_3(i8 * %addr, <2 x i64> %data) {
; CHECK-LABEL: test_128_3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
@@ -421,7 +417,7 @@ define void @test_128_3(i8 * %addr, <2 x i64> %data) {
define void @test_128_4(i8 * %addr, <4 x i32> %data) {
; CHECK-LABEL: test_128_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
@@ -431,7 +427,7 @@ define void @test_128_4(i8 * %addr, <4 x i32> %data) {
define void @test_128_5(i8 * %addr, <4 x i32> %data) {
; CHECK-LABEL: test_128_5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x i32>*
@@ -441,7 +437,7 @@ define void @test_128_5(i8 * %addr, <4 x i32> %data) {
define <2 x i64> @test_128_6(i8 * %addr) {
; CHECK-LABEL: test_128_6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
@@ -451,7 +447,7 @@ define <2 x i64> @test_128_6(i8 * %addr) {
define void @test_128_7(i8 * %addr, <2 x i64> %data) {
; CHECK-LABEL: test_128_7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
@@ -461,7 +457,7 @@ define void @test_128_7(i8 * %addr, <2 x i64> %data) {
define <2 x i64> @test_128_8(i8 * %addr) {
; CHECK-LABEL: test_128_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x i64>*
@@ -471,7 +467,7 @@ define <2 x i64> @test_128_8(i8 * %addr) {
define void @test_128_9(i8 * %addr, <2 x double> %data) {
; CHECK-LABEL: test_128_9:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
@@ -481,7 +477,7 @@ define void @test_128_9(i8 * %addr, <2 x double> %data) {
define <2 x double> @test_128_10(i8 * %addr) {
; CHECK-LABEL: test_128_10:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
@@ -491,7 +487,7 @@ define <2 x double> @test_128_10(i8 * %addr) {
define void @test_128_11(i8 * %addr, <4 x float> %data) {
; CHECK-LABEL: test_128_11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
@@ -501,7 +497,7 @@ define void @test_128_11(i8 * %addr, <4 x float> %data) {
define <4 x float> @test_128_12(i8 * %addr) {
; CHECK-LABEL: test_128_12:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
@@ -511,7 +507,7 @@ define <4 x float> @test_128_12(i8 * %addr) {
define void @test_128_13(i8 * %addr, <2 x double> %data) {
; CHECK-LABEL: test_128_13:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
@@ -521,7 +517,7 @@ define void @test_128_13(i8 * %addr, <2 x double> %data) {
define <2 x double> @test_128_14(i8 * %addr) {
; CHECK-LABEL: test_128_14:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <2 x double>*
@@ -531,7 +527,7 @@ define <2 x double> @test_128_14(i8 * %addr) {
define void @test_128_15(i8 * %addr, <4 x float> %data) {
; CHECK-LABEL: test_128_15:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
@@ -541,7 +537,7 @@ define void @test_128_15(i8 * %addr, <4 x float> %data) {
define <4 x float> @test_128_16(i8 * %addr) {
; CHECK-LABEL: test_128_16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
; CHECK-NEXT: retq ## encoding: [0xc3]
%vaddr = bitcast i8* %addr to <4 x float>*
@@ -551,7 +547,7 @@ define <4 x float> @test_128_16(i8 * %addr) {
define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_17:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6f,0x07]
@@ -565,7 +561,7 @@ define <4 x i32> @test_128_17(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_18:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x6f,0x07]
@@ -579,7 +575,7 @@ define <4 x i32> @test_128_18(i8 * %addr, <4 x i32> %old, <4 x i32> %mask1) {
define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_19:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqa32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6f,0x07]
@@ -593,7 +589,7 @@ define <4 x i32> @test_128_19(i8 * %addr, <4 x i32> %mask1) {
define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_20:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x6f,0x07]
@@ -607,7 +603,7 @@ define <4 x i32> @test_128_20(i8 * %addr, <4 x i32> %mask1) {
define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_21:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x6f,0x07]
@@ -621,7 +617,7 @@ define <2 x i64> @test_128_21(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_22:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x6f,0x07]
@@ -635,7 +631,7 @@ define <2 x i64> @test_128_22(i8 * %addr, <2 x i64> %old, <2 x i64> %mask1) {
define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_23:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqa64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x6f,0x07]
@@ -649,7 +645,7 @@ define <2 x i64> @test_128_23(i8 * %addr, <2 x i64> %mask1) {
define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_24:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0x89,0x6f,0x07]
@@ -663,7 +659,7 @@ define <2 x i64> @test_128_24(i8 * %addr, <2 x i64> %mask1) {
define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_25:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x28,0x07]
@@ -677,7 +673,7 @@ define <4 x float> @test_128_25(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_26:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0x75,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x10,0x07]
@@ -691,7 +687,7 @@ define <4 x float> @test_128_26(i8 * %addr, <4 x float> %old, <4 x i32> %mask1)
define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_27:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x28,0x07]
@@ -705,7 +701,7 @@ define <4 x float> @test_128_27(i8 * %addr, <4 x i32> %mask1) {
define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
; CHECK-LABEL: test_128_28:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x10,0x07]
@@ -719,7 +715,7 @@ define <4 x float> @test_128_28(i8 * %addr, <4 x i32> %mask1) {
define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_29:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x28,0x07]
@@ -733,7 +729,7 @@ define <2 x double> @test_128_29(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_30:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
; CHECK-NEXT: vpcmpneqq %xmm2, %xmm1, %k1 ## encoding: [0x62,0xf3,0xf5,0x08,0x1f,0xca,0x04]
; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x10,0x07]
@@ -747,7 +743,7 @@ define <2 x double> @test_128_30(i8 * %addr, <2 x double> %old, <2 x i64> %mask1
define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_31:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x28,0x07]
@@ -761,7 +757,7 @@ define <2 x double> @test_128_31(i8 * %addr, <2 x i64> %mask1) {
define <2 x double> @test_128_32(i8 * %addr, <2 x i64> %mask1) {
; CHECK-LABEL: test_128_32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xef,0xc9]
; CHECK-NEXT: vpcmpneqq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x04]
; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x10,0x07]
diff --git a/test/CodeGen/X86/avx512vl-nontemporal.ll b/test/CodeGen/X86/avx512vl-nontemporal.ll
index a55958a73459..683cae69bcae 100644
--- a/test/CodeGen/X86/avx512vl-nontemporal.ll
+++ b/test/CodeGen/X86/avx512vl-nontemporal.ll
@@ -1,34 +1,48 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
-define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
+define i32 @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE, i32* %loadptr) {
; CHECK: vmovntps %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <8 x float>*
%A2 = fadd <8 x float> %A, %AA
store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
; CHECK: vmovntdq %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, %EE
store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
; CHECK: vmovntpd %ymm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <4 x double>*
%C2 = fadd <4 x double> %C, %CC
store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
- ret void
+ %v3 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ ret i32 %sum3
}
-define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
+define i32 @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE, i32* %loadptr) {
+ %v0 = load i32, i32* %loadptr, align 1
; CHECK: vmovntps %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
%cast = bitcast i8* %B to <4 x float>*
%A2 = fadd <4 x float> %A, %AA
store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
; CHECK: vmovntdq %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, %EE
store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
; CHECK: vmovntpd %xmm{{.*}} ## EVEX TO VEX Compression encoding: [0xc5
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <2 x double>*
%C2 = fadd <2 x double> %C, %CC
store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
- ret void
+ %v3 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ ret i32 %sum3
}
!0 = !{i32 1}
diff --git a/test/CodeGen/X86/avx512vl-vbroadcast.ll b/test/CodeGen/X86/avx512vl-vbroadcast.ll
index 38a461ff0be2..7d24b8161e50 100644
--- a/test/CodeGen/X86/avx512vl-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512vl-vbroadcast.ll
@@ -1,12 +1,11 @@
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+avx512vl| FileCheck %s
declare void @func_f32(float)
define <8 x float> @_256_broadcast_ss_spill(float %x) {
; CHECK-LABEL: _256_broadcast_ss_spill:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -23,9 +22,8 @@ define <8 x float> @_256_broadcast_ss_spill(float %x) {
define <4 x float> @_128_broadcast_ss_spill(float %x) {
; CHECK-LABEL: _128_broadcast_ss_spill:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -43,9 +41,8 @@ define <4 x float> @_128_broadcast_ss_spill(float %x) {
declare void @func_f64(double)
define <4 x double> @_256_broadcast_sd_spill(double %x) {
; CHECK-LABEL: _256_broadcast_sd_spill:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
@@ -62,7 +59,7 @@ define <4 x double> @_256_broadcast_sd_spill(double %x) {
define <8 x float> @_inreg8xfloat(float %a) {
; CHECK-LABEL: _inreg8xfloat:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: retq
%b = insertelement <8 x float> undef, float %a, i32 0
@@ -72,8 +69,8 @@ define <8 x float> @_inreg8xfloat(float %a) {
define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1) {
; CHECK-LABEL: _ss8xfloat_mask:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1
; CHECK-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -86,8 +83,8 @@ define <8 x float> @_ss8xfloat_mask(<8 x float> %i, float %a, <8 x i32> %mask1
define <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) {
; CHECK-LABEL: _ss8xfloat_maskz:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
@@ -100,7 +97,7 @@ define <8 x float> @_ss8xfloat_maskz(float %a, <8 x i32> %mask1) {
define <4 x float> @_inreg4xfloat(float %a) {
; CHECK-LABEL: _inreg4xfloat:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: retq
%b = insertelement <4 x float> undef, float %a, i32 0
@@ -110,7 +107,7 @@ define <4 x float> @_inreg4xfloat(float %a) {
define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xfloat_mask:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
; CHECK-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
@@ -124,7 +121,7 @@ define <4 x float> @_ss4xfloat_mask(<4 x float> %i, float %a, <4 x i32> %mask1
define <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xfloat_maskz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
@@ -138,7 +135,7 @@ define <4 x float> @_ss4xfloat_maskz(float %a, <4 x i32> %mask1) {
define <4 x double> @_inreg4xdouble(double %a) {
; CHECK-LABEL: _inreg4xdouble:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: retq
%b = insertelement <4 x double> undef, double %a, i32 0
@@ -148,7 +145,7 @@ define <4 x double> @_inreg4xdouble(double %a) {
define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xdouble_mask:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpneqd %xmm3, %xmm2, %k1
; CHECK-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
@@ -162,7 +159,7 @@ define <4 x double> @_ss4xdouble_mask(<4 x double> %i, double %a, <4 x i32> %m
define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
; CHECK-LABEL: _ss4xdouble_maskz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vpcmpneqd %xmm2, %xmm1, %k1
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
@@ -173,3 +170,30 @@ define <4 x double> @_ss4xdouble_maskz(double %a, <4 x i32> %mask1) {
%r = select <4 x i1> %mask, <4 x double> %c, <4 x double> zeroinitializer
ret <4 x double> %r
}
+
+define <2 x double> @test_v2f64_broadcast_fold(<2 x double> *%a0, <2 x double> %a1) {
+; CHECK-LABEL: test_v2f64_broadcast_fold:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = load <2 x double>, <2 x double> *%a0, align 16
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = fadd <2 x double> %2, %a1
+ ret <2 x double> %3
+}
+
+define <2 x double> @test_v2f64_broadcast_fold_mask(<2 x double> *%a0, <2 x double> %a1, <2 x i64> %mask1, <2 x double> %a2) {
+; CHECK-LABEL: test_v2f64_broadcast_fold_mask:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpneqq %xmm3, %xmm1, %k1
+; CHECK-NEXT: vaddpd (%rdi){1to2}, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovapd %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %mask = icmp ne <2 x i64> %mask1, zeroinitializer
+ %1 = load <2 x double>, <2 x double> *%a0, align 16
+ %2 = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = fadd <2 x double> %2, %a1
+ %4 = select <2 x i1> %mask, <2 x double> %3, <2 x double> %a2
+ ret <2 x double> %4
+}
diff --git a/test/CodeGen/X86/avx512vl-vec-cmp.ll b/test/CodeGen/X86/avx512vl-vec-cmp.ll
index 43b1f53a09fa..8af9f73c8426 100644
--- a/test/CodeGen/X86/avx512vl-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-cmp.ll
@@ -4,13 +4,13 @@
define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
; VLX-LABEL: test256_1:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1
; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_1:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm2
; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: retq
@@ -21,13 +21,13 @@ define <4 x i64> @test256_1(<4 x i64> %x, <4 x i64> %y) nounwind {
define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
; VLX-LABEL: test256_2:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_2:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; NoVLX-NEXT: retq
@@ -38,19 +38,19 @@ define <4 x i64> @test256_2(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind
define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind {
; VLX-LABEL: test256_3:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k1
; VLX-NEXT: vpblendmd %ymm2, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_3:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
; NoVLX-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%mask = icmp sge <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x1, <8 x i32> %y
@@ -59,13 +59,13 @@ define <8 x i32> @test256_3(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1) nounwind
define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind {
; VLX-LABEL: test256_4:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpnleuq %ymm1, %ymm0, %k1
; VLX-NEXT: vpblendmq %ymm2, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_4:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm4
; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0
@@ -79,19 +79,19 @@ define <4 x i64> @test256_4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1) nounwind
define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; VLX-LABEL: test256_5:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_5:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %x, %y
@@ -101,19 +101,19 @@ define <8 x i32> @test256_5(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwin
define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; VLX-LABEL: test256_5b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_5b:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpeqd %zmm0, %zmm2, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp eq <8 x i32> %y, %x
@@ -123,19 +123,19 @@ define <8 x i32> @test256_5b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test256_6:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_6:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sgt <8 x i32> %x, %y
@@ -145,19 +145,19 @@ define <8 x i32> @test256_6(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test256_6b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_6b:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp slt <8 x i32> %y, %x
@@ -167,19 +167,19 @@ define <8 x i32> @test256_6b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test256_7:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_7:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sle <8 x i32> %x, %y
@@ -189,19 +189,19 @@ define <8 x i32> @test256_7(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test256_7b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_7b:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp sge <8 x i32> %y, %x
@@ -211,19 +211,19 @@ define <8 x i32> @test256_7b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test256_8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_8:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp ule <8 x i32> %x, %y
@@ -233,19 +233,19 @@ define <8 x i32> @test256_8(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) noun
define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test256_8b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_8b:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
%mask = icmp uge <8 x i32> %y, %x
@@ -255,23 +255,22 @@ define <8 x i32> @test256_8b(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %y.ptr) nou
define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32> %y1) nounwind {
; VLX-LABEL: test256_9:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; VLX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 {%k1}
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_9:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM3<def> %YMM3<kill> %ZMM3<def>
-; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpeqd %zmm3, %zmm2, %k0
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm3 killed %ymm3 def %zmm3
+; NoVLX-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%mask1 = icmp eq <8 x i32> %x1, %y1
%mask0 = icmp eq <8 x i32> %x, %y
@@ -282,14 +281,14 @@ define <8 x i32> @test256_9(<8 x i32> %x, <8 x i32> %y, <8 x i32> %x1, <8 x i32>
define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) nounwind {
; VLX-LABEL: test256_10:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleq %ymm1, %ymm0, %k1
; VLX-NEXT: vpcmpleq %ymm2, %ymm3, %k1 {%k1}
; VLX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_10:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm3
; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
@@ -306,16 +305,16 @@ define <4 x i64> @test256_10(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64
define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
; VLX-LABEL: test256_11:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtq %ymm2, %ymm1, %k1
; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k1 {%k1}
; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_11:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm3
+; NoVLX-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; NoVLX-NEXT: vpand %ymm2, %ymm3, %ymm2
; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: retq
@@ -329,23 +328,22 @@ define <4 x i64> @test256_11(<4 x i64> %x, <4 x i64>* %y.ptr, <4 x i64> %x1, <4
define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
; VLX-LABEL: test256_12:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1
; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1 {%k1}
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_12:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0
-; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
-; NoVLX-NEXT: vpcmpleud %zmm2, %zmm0, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vmovdqu (%rdi), %ymm3
+; NoVLX-NEXT: vpcmpleud %zmm3, %zmm0, %k1
+; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k1 {%k1}
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%y = load <8 x i32>, <8 x i32>* %y.ptr, align 4
@@ -357,13 +355,13 @@ define <8 x i32> @test256_12(<8 x i32> %x, <8 x i32>* %y.ptr, <8 x i32> %x1, <8
define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind {
; VLX-LABEL: test256_13:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k1
; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_13:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm2
; NoVLX-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2
; NoVLX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -378,19 +376,19 @@ define <4 x i64> @test256_13(<4 x i64> %x, <4 x i64> %x1, i64* %yb.ptr) nounwind
define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind {
; VLX-LABEL: test256_14:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled (%rdi){1to8}, %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_14:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2
; NoVLX-NEXT: vpcmpled %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <8 x i32> undef, i32 %yb, i32 0
@@ -402,23 +400,22 @@ define <8 x i32> @test256_14(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1) nounwind
define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32> %y1) nounwind {
; VLX-LABEL: test256_15:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled %ymm1, %ymm2, %k1
; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k1 {%k1}
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_15:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k0
-; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm2
-; NoVLX-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k1
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm3
+; NoVLX-NEXT: vpcmpgtd %zmm3, %zmm0, %k1
+; NoVLX-NEXT: vpcmpled %zmm1, %zmm2, %k1 {%k1}
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%mask1 = icmp sge <8 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
@@ -432,14 +429,14 @@ define <8 x i32> @test256_15(<8 x i32> %x, i32* %yb.ptr, <8 x i32> %x1, <8 x i32
define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64> %y1) nounwind {
; VLX-LABEL: test256_16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleq %ymm1, %ymm2, %k1
; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k1 {%k1}
; VLX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_16:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm3
; NoVLX-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm3
@@ -458,19 +455,19 @@ define <4 x i64> @test256_16(<4 x i64> %x, i64* %yb.ptr, <4 x i64> %x1, <4 x i64
define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; VLX-LABEL: test256_17:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_17:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpneqd %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %x, %y
@@ -480,19 +477,19 @@ define <8 x i32> @test256_17(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; VLX-LABEL: test256_18:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpneqd (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_18:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpneqd %zmm0, %zmm2, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp ne <8 x i32> %y, %x
@@ -502,19 +499,19 @@ define <8 x i32> @test256_18(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; VLX-LABEL: test256_19:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpnltud (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_19:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpnltud %zmm2, %zmm0, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %x, %y
@@ -524,19 +521,19 @@ define <8 x i32> @test256_19(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwind {
; VLX-LABEL: test256_20:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleud (%rdi), %ymm0, %k1
; VLX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test256_20:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqu (%rdi), %ymm2
; NoVLX-NEXT: vpcmpnltud %zmm0, %zmm2, %k1
; NoVLX-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; NoVLX-NEXT: retq
%y = load <8 x i32>, <8 x i32>* %yp, align 4
%mask = icmp uge <8 x i32> %y, %x
@@ -546,13 +543,13 @@ define <8 x i32> @test256_20(<8 x i32> %x, <8 x i32> %x1, <8 x i32>* %yp) nounwi
define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
; VLX-LABEL: test128_1:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_1:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -563,13 +560,13 @@ define <2 x i64> @test128_1(<2 x i64> %x, <2 x i64> %y) nounwind {
define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
; VLX-LABEL: test128_2:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_2:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -580,13 +577,13 @@ define <2 x i64> @test128_2(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind
define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind {
; VLX-LABEL: test128_3:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k1
; VLX-NEXT: vpblendmd %xmm2, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_3:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0
@@ -599,13 +596,13 @@ define <4 x i32> @test128_3(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1) nounwind
define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind {
; VLX-LABEL: test128_4:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1
; VLX-NEXT: vpblendmq %xmm2, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_4:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm3, %xmm1, %xmm4
; NoVLX-NEXT: vpxor %xmm3, %xmm0, %xmm0
@@ -619,13 +616,13 @@ define <2 x i64> @test128_4(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1) nounwind
define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
; VLX-LABEL: test128_5:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_5:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -637,13 +634,13 @@ define <4 x i32> @test128_5(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwin
define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwind {
; VLX-LABEL: test128_5b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_5b:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -655,13 +652,13 @@ define <4 x i32> @test128_5b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %yp) nounwi
define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_6:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_6:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -673,13 +670,13 @@ define <4 x i32> @test128_6(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_6b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_6b:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -691,13 +688,13 @@ define <4 x i32> @test128_6b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_7:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_7:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -711,13 +708,13 @@ define <4 x i32> @test128_7(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_7b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_7b:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -731,13 +728,13 @@ define <4 x i32> @test128_7b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_8:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -750,13 +747,13 @@ define <4 x i32> @test128_8(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) noun
define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_8b:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_8b:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vmovdqu (%rdi), %xmm2
; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
@@ -770,14 +767,14 @@ define <4 x i32> @test128_8b(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32> %y1) nounwind {
; VLX-LABEL: test128_9:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; VLX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 {%k1}
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_9:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm3
; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
@@ -792,14 +789,14 @@ define <4 x i32> @test128_9(<4 x i32> %x, <4 x i32> %y, <4 x i32> %x1, <4 x i32>
define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) nounwind {
; VLX-LABEL: test128_10:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleq %xmm1, %xmm0, %k1
; VLX-NEXT: vpcmpleq %xmm2, %xmm3, %k1 {%k1}
; VLX-NEXT: vpblendmq %xmm0, %xmm2, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_10:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm3
; NoVLX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; NoVLX-NEXT: vpxor %xmm4, %xmm3, %xmm3
@@ -816,16 +813,16 @@ define <2 x i64> @test128_10(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64
define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
; VLX-LABEL: test128_11:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpgtq %xmm2, %xmm1, %k1
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k1 {%k1}
; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_11:
-; NoVLX: # BB#0:
-; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm3
+; NoVLX-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: retq
@@ -839,14 +836,14 @@ define <2 x i64> @test128_11(<2 x i64> %x, <2 x i64>* %y.ptr, <2 x i64> %x1, <2
define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
; VLX-LABEL: test128_12:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1
; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1 {%k1}
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_12:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; NoVLX-NEXT: vpminud (%rdi), %xmm0, %xmm3
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
@@ -863,13 +860,13 @@ define <4 x i32> @test128_12(<4 x i32> %x, <4 x i32>* %y.ptr, <4 x i32> %x1, <4
define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind {
; VLX-LABEL: test128_13:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k1
; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_13:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm2
; NoVLX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2
; NoVLX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
@@ -884,13 +881,13 @@ define <2 x i64> @test128_13(<2 x i64> %x, <2 x i64> %x1, i64* %yb.ptr) nounwind
define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind {
; VLX-LABEL: test128_14:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled (%rdi){1to4}, %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_14:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm2
; NoVLX-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
@@ -907,14 +904,14 @@ define <4 x i32> @test128_14(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1) nounwind
define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32> %y1) nounwind {
; VLX-LABEL: test128_15:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpled %xmm1, %xmm2, %k1
; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k1 {%k1}
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_15:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm2
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm3
; NoVLX-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm3
@@ -933,14 +930,14 @@ define <4 x i32> @test128_15(<4 x i32> %x, i32* %yb.ptr, <4 x i32> %x1, <4 x i32
define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64> %y1) nounwind {
; VLX-LABEL: test128_16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleq %xmm1, %xmm2, %k1
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k1 {%k1}
; VLX-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_16:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm3
; NoVLX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
@@ -959,13 +956,13 @@ define <2 x i64> @test128_16(<2 x i64> %x, i64* %yb.ptr, <2 x i64> %x1, <2 x i64
define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_17:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_17:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -979,13 +976,13 @@ define <4 x i32> @test128_17(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_18:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpneqd (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_18:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; NoVLX-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -999,13 +996,13 @@ define <4 x i32> @test128_18(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_19:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpnltud (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_19:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vpmaxud (%rdi), %xmm0, %xmm2
; NoVLX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2
; NoVLX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
@@ -1018,13 +1015,13 @@ define <4 x i32> @test128_19(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nou
define <4 x i32> @test128_20(<4 x i32> %x, <4 x i32> %x1, <4 x i32>* %y.ptr) nounwind {
; VLX-LABEL: test128_20:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpcmpleud (%rdi), %xmm0, %k1
; VLX-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; VLX-NEXT: retq
;
; NoVLX-LABEL: test128_20:
-; NoVLX: # BB#0:
+; NoVLX: # %bb.0:
; NoVLX-NEXT: vmovdqu (%rdi), %xmm2
; NoVLX-NEXT: vpmaxud %xmm0, %xmm2, %xmm3
; NoVLX-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 4d3a1495617e..9863a0a7d283 100644
--- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -1,123 +1,34 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -disable-peephole -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -disable-peephole -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi0:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi2:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi3:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi4:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi5:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi6:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi7:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -130,120 +41,31 @@ entry:
define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi8:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi9:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi10:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi11:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi12:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi13:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi14:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi15:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -257,122 +79,38 @@ entry:
define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi16:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi17:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi18:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi19:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi20:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi21:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi22:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi23:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -387,122 +125,38 @@ entry:
define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi24:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi25:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi26:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi27:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi28:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi29:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi30:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi31:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -519,125 +173,36 @@ entry:
define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi32:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi33:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi34:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi35:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi36:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi37:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi38:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi39:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -650,125 +215,36 @@ entry:
define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqb (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi40:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi41:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi42:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi43:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi44:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi45:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi46:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi47:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -782,112 +258,32 @@ entry:
define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi48:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi49:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi50:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi51:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi52:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi53:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi54:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi55:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -896,13 +292,9 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -917,112 +309,32 @@ entry:
define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi56:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi57:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi58:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi59:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi60:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi61:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi62:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi63:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1031,13 +343,9 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -1054,21 +362,18 @@ entry:
define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi64:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi65:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi66:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1083,8 +388,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -1092,6 +396,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -1104,21 +409,18 @@ entry:
define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqb (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi67:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi68:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi69:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1133,8 +435,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -1142,6 +443,7 @@ define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -1155,7 +457,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -1163,14 +465,11 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi70:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi71:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi72:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -1181,7 +480,6 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
-; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
@@ -1195,7 +493,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -1203,6 +501,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -1217,7 +516,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -1225,14 +524,11 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi73:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi74:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi75:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -1243,10 +539,9 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
; NoVLX-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
@@ -1257,7 +552,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -1265,6 +560,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -1281,22 +577,21 @@ entry:
define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1309,22 +604,21 @@ entry:
define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1338,24 +632,23 @@ entry:
define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1370,24 +663,23 @@ entry:
define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1404,20 +696,17 @@ entry:
define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi76:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi77:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi78:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1425,42 +714,32 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1468,6 +747,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1480,20 +760,17 @@ entry:
define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi79:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi80:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi81:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1501,42 +778,32 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1544,6 +811,7 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1557,21 +825,18 @@ entry:
define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi82:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi83:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi84:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1580,42 +845,32 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1623,6 +878,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1637,21 +893,18 @@ entry:
define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi85:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi86:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi87:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -1660,42 +913,32 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1703,6 +946,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1719,20 +963,17 @@ entry:
define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi88:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi89:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi90:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1740,44 +981,34 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1788,6 +1019,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1800,20 +1032,17 @@ entry:
define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi91:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi92:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi93:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1821,44 +1050,34 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1869,6 +1088,7 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1882,21 +1102,18 @@ entry:
define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi94:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi95:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi96:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1905,44 +1122,34 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1953,6 +1160,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -1967,21 +1175,18 @@ entry:
define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi97:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi98:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi99:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -1990,44 +1195,34 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2038,6 +1233,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -2054,121 +1250,34 @@ entry:
define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi100:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi101:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi102:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi103:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi104:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi105:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi106:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi107:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2181,121 +1290,34 @@ entry:
define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi108:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi109:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi110:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi111:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi112:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi113:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi114:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi115:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2309,7 +1331,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -2317,115 +1339,31 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi116:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi117:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi118:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi119:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi120:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi121:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi122:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi123:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2440,7 +1378,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -2448,115 +1386,31 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi124:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi125:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi126:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi127:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi128:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi129:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi130:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi131:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2573,111 +1427,28 @@ entry:
define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi132:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi133:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi134:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi135:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi136:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi137:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi138:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi139:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2686,13 +1457,9 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2705,111 +1472,28 @@ entry:
define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi140:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi141:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi142:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi143:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi144:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi145:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi146:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi147:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2818,13 +1502,9 @@ define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2838,7 +1518,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -2846,105 +1526,25 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi148:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi149:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi150:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi151:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi152:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi153:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi154:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi155:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -2953,13 +1553,9 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -2974,7 +1570,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -2982,105 +1578,25 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi156:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi157:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi158:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi159:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi160:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi161:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi162:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi163:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -3089,13 +1605,9 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -3112,25 +1624,22 @@ entry:
define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi164:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi165:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi166:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -3139,7 +1648,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
@@ -3303,148 +1812,19 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -3452,6 +1832,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -3464,25 +1845,22 @@ entry:
define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqw (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi167:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi168:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi169:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2
; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -3569,149 +1947,20 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpeqw 32(%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpeqw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -3719,6 +1968,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -3732,7 +1982,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -3740,14 +1990,11 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi170:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi171:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi172:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -3759,10 +2006,10 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
@@ -3908,173 +2155,44 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
-; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm8, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
-; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -4082,6 +2200,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -4096,7 +2215,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -4104,14 +2223,11 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi173:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi174:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi175:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -4123,7 +2239,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
@@ -4139,221 +2255,92 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm3
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm4, %ymm4
-; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
-; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqw 32(%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -4361,6 +2348,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -4377,49 +2365,45 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4432,49 +2416,45 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4488,68 +2468,59 @@ entry:
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4565,68 +2536,59 @@ entry:
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4644,50 +2606,46 @@ entry:
define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4702,69 +2660,60 @@ entry:
define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4783,48 +2732,45 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4837,48 +2783,45 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4892,67 +2835,59 @@ entry:
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -4968,67 +2903,59 @@ entry:
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5046,49 +2973,46 @@ entry:
define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5103,68 +3027,60 @@ entry:
define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5183,20 +3099,17 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi176:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi177:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi178:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5214,6 +3127,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5226,20 +3140,17 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi179:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi180:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi181:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5257,6 +3168,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5270,41 +3182,33 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi182:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi183:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi184:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5320,6 +3224,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5335,41 +3240,33 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi185:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi186:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi187:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5385,6 +3282,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5402,20 +3300,17 @@ entry:
define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi188:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi189:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi190:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -5434,6 +3329,7 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5448,42 +3344,34 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi191:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi192:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi193:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5499,6 +3387,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5517,26 +3406,22 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi194:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi195:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi196:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5555,6 +3440,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5567,26 +3453,22 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi197:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi198:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi199:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5605,6 +3487,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5618,45 +3501,36 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi200:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi201:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi202:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5675,6 +3549,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5690,45 +3565,36 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi203:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi204:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi205:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5747,6 +3613,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5764,27 +3631,23 @@ entry:
define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi206:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi207:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi208:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5803,6 +3666,7 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5817,46 +3681,37 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi209:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi210:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi211:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -5875,6 +3730,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -5893,22 +3749,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -5921,22 +3778,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -5950,24 +3808,25 @@ entry:
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -5982,24 +3841,25 @@ entry:
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6016,22 +3876,23 @@ entry:
define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6046,24 +3907,25 @@ entry:
define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6081,63 +3943,50 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi212:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi213:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi214:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6145,6 +3994,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6157,63 +4007,50 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi215:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi216:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi217:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6221,6 +4058,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6234,7 +4072,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -6242,58 +4080,44 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi218:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi219:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi220:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6301,6 +4125,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6315,7 +4140,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -6323,58 +4148,44 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi221:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi222:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi223:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6382,6 +4193,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6398,63 +4210,50 @@ entry:
define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi224:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi225:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi226:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6462,6 +4261,7 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6476,7 +4276,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -6484,58 +4284,44 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi227:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi228:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi229:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6543,6 +4329,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6560,65 +4347,52 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi230:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi231:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi232:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6629,6 +4403,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6641,65 +4416,52 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi233:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi234:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi235:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6710,6 +4472,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6723,7 +4486,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -6731,60 +4494,46 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi236:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi237:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi238:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6795,6 +4544,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6809,7 +4559,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -6817,60 +4567,46 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi239:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi240:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi241:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6881,6 +4617,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6897,65 +4634,52 @@ entry:
define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi242:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi243:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi244:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -6966,6 +4690,7 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -6980,7 +4705,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -6988,60 +4713,46 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi245:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi246:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi247:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -7052,6 +4763,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -7069,118 +4781,34 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi248:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi249:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi250:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi251:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi252:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi253:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi254:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi255:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7193,118 +4821,34 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi256:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi257:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi258:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi259:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi260:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi261:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi262:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi263:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7318,7 +4862,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -7326,112 +4870,28 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi264:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi265:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi266:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi267:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi268:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi269:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi270:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi271:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7446,7 +4906,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -7454,112 +4914,28 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi272:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi273:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi274:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi275:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi276:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi277:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi278:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi279:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7576,118 +4952,34 @@ entry:
define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi280:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi281:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi282:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi283:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi284:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi285:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi286:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi287:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7702,7 +4994,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -7710,112 +5002,28 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi288:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi289:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi290:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi291:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi292:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi293:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi294:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi295:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7833,108 +5041,28 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi296:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi297:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi298:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi299:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi300:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi301:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi302:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi303:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -7943,13 +5071,9 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -7962,108 +5086,28 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi304:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi305:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi306:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi307:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi308:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi309:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi310:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi311:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8072,13 +5116,9 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -8092,7 +5132,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -8100,102 +5140,22 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi312:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi313:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi314:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi315:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi316:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi317:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi318:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi319:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8204,13 +5164,9 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -8225,7 +5181,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -8233,102 +5189,22 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi320:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi321:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi322:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi323:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi324:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi325:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi326:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi327:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8337,13 +5213,9 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -8360,108 +5232,28 @@ entry:
define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi328:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi329:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi330:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi331:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi332:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi333:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi334:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi335:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8470,13 +5262,9 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -8491,7 +5279,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -8499,102 +5287,22 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi336:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi337:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi338:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi339:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi340:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi341:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi342:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi343:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -8603,13 +5311,9 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -8627,14 +5331,14 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -8642,6 +5346,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8654,14 +5359,14 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -8669,6 +5374,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8682,7 +5388,7 @@ entry:
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -8690,15 +5396,12 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -8708,6 +5411,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8723,7 +5427,7 @@ entry:
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -8731,15 +5435,12 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -8749,6 +5450,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8766,14 +5468,14 @@ entry:
define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
@@ -8782,6 +5484,7 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b)
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8796,7 +5499,7 @@ entry:
define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -8804,16 +5507,13 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -8823,6 +5523,7 @@ define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8841,33 +5542,31 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8880,33 +5579,31 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8920,44 +5617,39 @@ entry:
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -8973,44 +5665,39 @@ entry:
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9028,34 +5715,32 @@ entry:
define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9070,45 +5755,40 @@ entry:
define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9127,32 +5807,31 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9165,32 +5844,31 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9204,43 +5882,39 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9256,43 +5930,39 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9310,33 +5980,32 @@ entry:
define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9351,44 +6020,40 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9407,20 +6072,17 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi344:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi345:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi346:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9438,6 +6100,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9450,20 +6113,17 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi347:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi348:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi349:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9481,6 +6141,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9494,32 +6155,26 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi350:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi351:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi352:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -9536,6 +6191,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9551,32 +6207,26 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi353:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi354:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi355:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -9593,6 +6243,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9610,20 +6261,17 @@ entry:
define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi356:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi357:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi358:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -9642,6 +6290,7 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9656,33 +6305,27 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi359:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi360:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi361:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -9699,6 +6342,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9717,26 +6361,22 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi362:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi363:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi364:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -9755,6 +6395,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9767,26 +6408,22 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi365:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi366:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi367:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -9805,6 +6442,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9818,37 +6456,30 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi368:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi369:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi370:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -9867,6 +6498,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9882,37 +6514,30 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi371:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi372:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi373:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -9931,6 +6556,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -9948,27 +6574,23 @@ entry:
define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi374:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi375:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi376:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -9987,6 +6609,7 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -10001,38 +6624,31 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi377:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi378:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi379:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -10051,6 +6667,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -10069,51 +6686,48 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10126,51 +6740,48 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10184,70 +6795,62 @@ entry:
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10263,70 +6866,62 @@ entry:
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10344,52 +6939,49 @@ entry:
define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10404,71 +6996,63 @@ entry:
define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10487,50 +7071,48 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10543,50 +7125,48 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10600,69 +7180,62 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10678,69 +7251,62 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10758,51 +7324,49 @@ entry:
define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10817,70 +7381,63 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10899,21 +7456,18 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi380:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi381:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi382:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -10932,6 +7486,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10944,21 +7499,18 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi383:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi384:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi385:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -10977,6 +7529,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -10990,7 +7543,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -10998,35 +7551,27 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi386:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi387:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi388:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11042,6 +7587,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11057,7 +7603,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -11065,35 +7611,27 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi389:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi390:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi391:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11109,6 +7647,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11126,21 +7665,18 @@ entry:
define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi392:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi393:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi394:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -11160,6 +7696,7 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11174,7 +7711,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -11182,14 +7719,11 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi395:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi396:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi397:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -11197,21 +7731,16 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11227,6 +7756,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11245,28 +7775,24 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi398:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi399:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi400:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11285,6 +7811,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11297,28 +7824,24 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi401:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi402:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi403:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11337,6 +7860,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11350,7 +7874,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -11358,39 +7882,30 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi404:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi405:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi406:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11409,6 +7924,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11424,7 +7940,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -11432,39 +7948,30 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi407:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi408:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi409:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11483,6 +7990,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11500,29 +8008,25 @@ entry:
define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi410:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi411:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi412:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11541,6 +8045,7 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11555,7 +8060,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -11563,14 +8068,11 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi413:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi414:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi415:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -11578,25 +8080,19 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -11615,6 +8111,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -11633,18 +8130,19 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11657,18 +8155,19 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11682,20 +8181,21 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11710,20 +8210,21 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11740,18 +8241,19 @@ entry:
define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11766,20 +8268,21 @@ entry:
define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11797,61 +8300,48 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi416:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi417:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi418:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -11859,6 +8349,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11871,61 +8362,48 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi419:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi420:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi421:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -11933,6 +8411,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -11946,7 +8425,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -11954,55 +8433,42 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi422:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi423:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi424:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12010,6 +8476,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12024,7 +8491,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -12032,55 +8499,42 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi425:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi426:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi427:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12088,6 +8542,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12104,61 +8559,48 @@ entry:
define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi428:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi429:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi430:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12166,6 +8608,7 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12180,7 +8623,7 @@ entry:
define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -12188,55 +8631,42 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi431:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi432:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi433:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12244,6 +8674,7 @@ define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12261,63 +8692,50 @@ entry:
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi434:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi435:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi436:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12328,6 +8746,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12340,63 +8759,50 @@ entry:
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi437:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi438:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi439:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12407,6 +8813,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12420,7 +8827,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -12428,57 +8835,44 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi440:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi441:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi442:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12489,6 +8883,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12503,7 +8898,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -12511,57 +8906,44 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi443:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi444:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi445:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12572,6 +8954,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12588,63 +8971,50 @@ entry:
define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi446:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi447:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi448:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12655,6 +9025,7 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12669,7 +9040,7 @@ entry:
define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -12677,57 +9048,44 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi449:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi450:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi451:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -12738,6 +9096,7 @@ define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -12755,120 +9114,31 @@ entry:
define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi452:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi453:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi454:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi455:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi456:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi457:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi458:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi459:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -12881,120 +9151,31 @@ entry:
define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi460:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi461:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi462:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi463:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi464:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi465:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi466:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi467:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13008,122 +9189,38 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi468:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi469:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi470:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi471:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi472:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi473:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi474:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi475:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13138,122 +9235,38 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi476:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi477:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi478:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi479:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi480:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi481:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi482:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi483:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13270,125 +9283,36 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi484:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi485:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi486:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi487:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi488:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi489:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi490:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi491:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13401,125 +9325,36 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtb (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi492:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi493:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi494:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi495:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi496:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi497:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi498:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi499:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13533,112 +9368,32 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi500:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi501:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi502:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi503:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi504:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi505:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi506:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi507:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -13647,13 +9402,9 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13668,112 +9419,32 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi508:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi509:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi510:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi511:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi512:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi513:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi514:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi515:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -13782,13 +9453,9 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -13805,21 +9472,18 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi516:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi517:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi518:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -13834,8 +9498,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -13843,6 +9506,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -13855,21 +9519,18 @@ entry:
define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtb (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi519:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi520:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi521:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -13884,8 +9545,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -13893,6 +9553,7 @@ define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -13906,7 +9567,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -13914,14 +9575,11 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi522:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi523:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi524:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -13932,7 +9590,6 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
-; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
; NoVLX-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
@@ -13946,7 +9603,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -13954,6 +9611,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -13968,7 +9626,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -13976,14 +9634,11 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi525:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi526:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi527:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -13994,10 +9649,9 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
; NoVLX-NEXT: vpcmpgtb (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
@@ -14008,7 +9662,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -14016,6 +9670,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -14032,22 +9687,21 @@ entry:
define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14060,22 +9714,21 @@ entry:
define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14089,24 +9742,23 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14121,24 +9773,23 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14155,20 +9806,17 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi528:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi529:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi530:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14176,42 +9824,32 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14219,6 +9857,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14231,20 +9870,17 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi531:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi532:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi533:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14252,42 +9888,32 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14295,6 +9921,7 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14308,21 +9935,18 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi534:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi535:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi536:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14331,42 +9955,32 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14374,6 +9988,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14388,21 +10003,18 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi537:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi538:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi539:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -14411,42 +10023,32 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14454,6 +10056,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14470,20 +10073,17 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi540:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi541:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi542:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14491,44 +10091,34 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14539,6 +10129,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14551,20 +10142,17 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi543:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi544:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi545:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14572,44 +10160,34 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14620,6 +10198,7 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14633,21 +10212,18 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi546:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi547:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi548:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14656,44 +10232,34 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14704,6 +10270,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14718,21 +10285,18 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi549:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi550:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi551:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -14741,44 +10305,34 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -14789,6 +10343,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -14805,121 +10360,34 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi552:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi553:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi554:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi555:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi556:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi557:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi558:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi559:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -14932,121 +10400,34 @@ entry:
define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi560:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi561:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi562:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi563:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi564:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi565:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi566:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi567:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15060,7 +10441,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -15068,115 +10449,31 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi568:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi569:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi570:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi571:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi572:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi573:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi574:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi575:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15191,7 +10488,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -15199,115 +10496,31 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi576:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi577:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi578:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi579:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi580:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi581:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi582:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi583:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15324,111 +10537,28 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi584:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi585:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi586:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi587:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi588:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi589:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi590:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi591:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15437,13 +10567,9 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15456,111 +10582,28 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi592:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi593:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi594:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi595:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi596:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi597:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi598:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi599:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15569,13 +10612,9 @@ define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15589,7 +10628,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -15597,105 +10636,25 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi600:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi601:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi602:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi603:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi604:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi605:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi606:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi607:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15704,13 +10663,9 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15725,7 +10680,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -15733,105 +10688,25 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi608:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi609:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi610:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi611:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi612:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi613:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi614:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi615:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -15840,13 +10715,9 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -15863,25 +10734,22 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi616:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi617:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi618:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -15890,7 +10758,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
@@ -16054,148 +10922,19 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -16203,6 +10942,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -16215,25 +10955,22 @@ entry:
define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtw (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi619:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi620:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi621:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2
; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -16320,149 +11057,20 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw 32(%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -16470,6 +11078,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -16483,7 +11092,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -16491,14 +11100,11 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi622:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi623:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi624:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -16510,10 +11116,10 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
@@ -16659,173 +11265,44 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm8, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
-; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -16833,6 +11310,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -16847,7 +11325,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -16855,14 +11333,11 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi625:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi626:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi627:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -16874,7 +11349,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
@@ -16890,221 +11365,92 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm3
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm4, %ymm4
-; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
-; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtw 32(%rsi), %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -17112,6 +11458,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -17128,49 +11475,45 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17183,49 +11526,45 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17239,68 +11578,59 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17316,68 +11646,59 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17395,50 +11716,46 @@ entry:
define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17453,69 +11770,60 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17534,48 +11842,45 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17588,48 +11893,45 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17643,67 +11945,59 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17719,67 +12013,59 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17797,49 +12083,46 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17854,68 +12137,60 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17934,20 +12209,17 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi628:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi629:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi630:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -17965,6 +12237,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -17977,20 +12250,17 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi631:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi632:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi633:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18008,6 +12278,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18021,41 +12292,33 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi634:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi635:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi636:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18071,6 +12334,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18086,41 +12350,33 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi637:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi638:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi639:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18136,6 +12392,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18153,20 +12410,17 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi640:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi641:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi642:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -18185,6 +12439,7 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18199,42 +12454,34 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi643:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi644:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi645:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18250,6 +12497,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18268,26 +12516,22 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi646:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi647:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi648:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18306,6 +12550,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18318,26 +12563,22 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi649:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi650:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi651:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18356,6 +12597,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18369,45 +12611,36 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi652:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi653:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi654:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18426,6 +12659,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18441,45 +12675,36 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi655:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi656:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi657:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18498,6 +12723,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18515,27 +12741,23 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi658:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi659:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi660:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18554,6 +12776,7 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18568,46 +12791,37 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi661:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi662:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi663:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -18626,6 +12840,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -18644,22 +12859,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18672,22 +12888,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18701,24 +12918,25 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18733,24 +12951,25 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18767,22 +12986,23 @@ entry:
define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18797,24 +13017,25 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18832,63 +13053,50 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi664:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi665:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi666:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -18896,6 +13104,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18908,63 +13117,50 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi667:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi668:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi669:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -18972,6 +13168,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -18985,7 +13182,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -18993,58 +13190,44 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi670:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi671:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi672:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19052,6 +13235,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19066,7 +13250,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -19074,58 +13258,44 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi673:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi674:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi675:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19133,6 +13303,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19149,63 +13320,50 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi676:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi677:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi678:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19213,6 +13371,7 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19227,7 +13386,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -19235,58 +13394,44 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi679:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi680:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi681:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19294,6 +13439,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19311,65 +13457,52 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi682:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi683:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi684:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19380,6 +13513,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19392,65 +13526,52 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi685:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi686:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi687:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19461,6 +13582,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19474,7 +13596,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -19482,60 +13604,46 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi688:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi689:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi690:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19546,6 +13654,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19560,7 +13669,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -19568,60 +13677,46 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi691:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi692:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi693:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19632,6 +13727,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19648,65 +13744,52 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi694:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi695:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi696:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19717,6 +13800,7 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19731,7 +13815,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -19739,60 +13823,46 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi697:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi698:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi699:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -19803,6 +13873,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -19820,118 +13891,34 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi700:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi701:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi702:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi703:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi704:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi705:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi706:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi707:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -19944,118 +13931,34 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi708:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi709:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi710:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi711:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi712:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi713:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi714:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi715:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20069,7 +13972,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -20077,112 +13980,28 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi716:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi717:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi718:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi719:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi720:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi721:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi722:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi723:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20197,7 +14016,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -20205,112 +14024,28 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi724:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi725:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi726:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi727:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi728:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi729:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi730:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi731:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20327,118 +14062,34 @@ entry:
define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi732:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi733:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi734:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi735:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi736:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi737:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi738:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi739:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20453,7 +14104,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -20461,112 +14112,28 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi740:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi741:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi742:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi743:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi744:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi745:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi746:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi747:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20584,108 +14151,28 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi748:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi749:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi750:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi751:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi752:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi753:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi754:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi755:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20694,13 +14181,9 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20713,108 +14196,28 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi756:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi757:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi758:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi759:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi760:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi761:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi762:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi763:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20823,13 +14226,9 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20843,7 +14242,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -20851,102 +14250,22 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi764:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi765:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi766:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi767:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi768:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi769:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi770:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi771:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -20955,13 +14274,9 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -20976,7 +14291,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -20984,102 +14299,22 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi772:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi773:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi774:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi775:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi776:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi777:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi778:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi779:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -21088,13 +14323,9 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -21111,108 +14342,28 @@ entry:
define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi780:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi781:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi782:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi783:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi784:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi785:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi786:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi787:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -21221,13 +14372,9 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -21242,7 +14389,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -21250,102 +14397,22 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi788:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi789:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi790:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi791:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi792:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi793:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi794:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi795:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -21354,13 +14421,9 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -21378,14 +14441,14 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -21393,6 +14456,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21405,14 +14469,14 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -21420,6 +14484,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21433,7 +14498,7 @@ entry:
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -21441,15 +14506,12 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -21459,6 +14521,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21474,7 +14537,7 @@ entry:
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -21482,15 +14545,12 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -21500,6 +14560,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21517,14 +14578,14 @@ entry:
define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
@@ -21533,6 +14594,7 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21547,7 +14609,7 @@ entry:
define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -21555,16 +14617,13 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -21574,6 +14633,7 @@ define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21592,33 +14652,31 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21631,33 +14689,31 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21671,44 +14727,39 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21724,44 +14775,39 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21779,34 +14825,32 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21821,45 +14865,40 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21878,32 +14917,31 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21916,32 +14954,31 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -21955,43 +14992,39 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22007,43 +15040,39 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22061,33 +15090,32 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22102,44 +15130,40 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22158,20 +15182,17 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi796:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi797:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi798:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22189,6 +15210,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22201,20 +15223,17 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi799:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi800:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi801:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22232,6 +15251,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22245,32 +15265,26 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi802:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi803:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi804:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -22287,6 +15301,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22302,32 +15317,26 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi805:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi806:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi807:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -22344,6 +15353,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22361,20 +15371,17 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi808:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi809:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi810:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -22393,6 +15400,7 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22407,33 +15415,27 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi811:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi812:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi813:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -22450,6 +15452,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22468,26 +15471,22 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi814:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi815:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi816:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -22506,6 +15505,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22518,26 +15518,22 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi817:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi818:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi819:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -22556,6 +15552,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22569,37 +15566,30 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi820:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi821:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi822:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -22618,6 +15608,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22633,37 +15624,30 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi823:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi824:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi825:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rsi), %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -22682,6 +15666,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22699,27 +15684,23 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi826:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi827:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi828:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -22738,6 +15719,7 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22752,38 +15734,31 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi829:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi830:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi831:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -22802,6 +15777,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -22820,51 +15796,48 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -22877,51 +15850,48 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -22935,70 +15905,62 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23014,70 +15976,62 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23095,52 +16049,49 @@ entry:
define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23155,71 +16106,63 @@ entry:
define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23238,50 +16181,48 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23294,50 +16235,48 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23351,69 +16290,62 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23429,69 +16361,62 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23509,51 +16434,49 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23568,70 +16491,63 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23650,21 +16566,18 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi832:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi833:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi834:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23683,6 +16596,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23695,21 +16609,18 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi835:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi836:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi837:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23728,6 +16639,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23741,7 +16653,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -23749,35 +16661,27 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi838:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi839:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi840:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -23793,6 +16697,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23808,7 +16713,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -23816,35 +16721,27 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi841:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi842:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi843:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -23860,6 +16757,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23877,21 +16775,18 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi844:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi845:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi846:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23911,6 +16806,7 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23925,7 +16821,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -23933,14 +16829,11 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi847:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi848:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi849:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -23948,21 +16841,16 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -23978,6 +16866,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -23996,28 +16885,24 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi850:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi851:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi852:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -24036,6 +16921,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -24048,28 +16934,24 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi853:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi854:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi855:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -24088,6 +16970,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -24101,7 +16984,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -24109,39 +16992,30 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi856:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi857:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi858:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -24160,6 +17034,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -24175,7 +17050,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -24183,39 +17058,30 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi859:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi860:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi861:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rsi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -24234,6 +17100,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -24251,29 +17118,25 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi862:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi863:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi864:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -24292,6 +17155,7 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -24306,7 +17170,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -24314,14 +17178,11 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi865:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi866:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi867:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -24329,25 +17190,19 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -24366,6 +17221,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -24384,18 +17240,19 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24408,18 +17265,19 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24433,20 +17291,21 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24461,20 +17320,21 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24491,18 +17351,19 @@ entry:
define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24517,20 +17378,21 @@ entry:
define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24548,61 +17410,48 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi868:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi869:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi870:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24610,6 +17459,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24622,61 +17472,48 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi871:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi872:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi873:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24684,6 +17521,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24697,7 +17535,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -24705,55 +17543,42 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi874:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi875:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi876:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24761,6 +17586,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24775,7 +17601,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -24783,55 +17609,42 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi877:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi878:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi879:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24839,6 +17652,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24855,61 +17669,48 @@ entry:
define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi880:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi881:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi882:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24917,6 +17718,7 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -24931,7 +17733,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -24939,55 +17741,42 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi883:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi884:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi885:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -24995,6 +17784,7 @@ define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25012,63 +17802,50 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi886:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi887:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi888:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25079,6 +17856,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25091,63 +17869,50 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi889:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi890:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi891:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25158,6 +17923,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25171,7 +17937,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -25179,57 +17945,44 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi892:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi893:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi894:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25240,6 +17993,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25254,7 +18008,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -25262,57 +18016,44 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi895:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi896:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi897:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25323,6 +18064,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25339,63 +18081,50 @@ entry:
define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi898:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi899:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi900:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25406,6 +18135,7 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25420,7 +18150,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -25428,57 +18158,44 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi901:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi902:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi903:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -25489,6 +18206,7 @@ define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -25506,122 +18224,33 @@ entry:
define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi904:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi905:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi906:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi907:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi908:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi909:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi910:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi911:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -25634,123 +18263,34 @@ entry:
define zeroext i32 @test_vpcmpsgeb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi912:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi913:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi914:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi915:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi916:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi917:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi918:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi919:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -25764,124 +18304,40 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi920:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi921:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi922:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi923:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi924:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi925:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi926:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi927:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -25896,39 +18352,21 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi928:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi929:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi930:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi931:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi932:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi933:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi934:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi935:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25936,85 +18374,19 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -26031,127 +18403,38 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi936:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi937:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi938:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi939:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi940:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi941:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi942:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi943:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -26164,128 +18447,39 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltb (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi944:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi945:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi946:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi947:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi948:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi949:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi950:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi951:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -26299,114 +18493,34 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleb %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi952:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi953:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi954:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi955:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi956:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi957:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi958:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi959:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -26415,13 +18529,9 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -26436,39 +18546,21 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltb (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi960:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi961:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi962:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi963:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi964:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi965:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi966:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi967:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -26476,75 +18568,13 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -26553,13 +18583,9 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -26576,21 +18602,18 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi968:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi969:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi970:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -26607,8 +18630,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -26616,6 +18638,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -26628,21 +18651,18 @@ entry:
define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltb (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi971:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi972:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi973:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -26660,8 +18680,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -26669,6 +18688,7 @@ define zeroext i64 @test_vpcmpsgeb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -26682,7 +18702,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -26690,14 +18710,11 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi974:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi975:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi976:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -26708,7 +18725,6 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
-; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
@@ -26724,7 +18740,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -26732,6 +18748,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -26746,7 +18763,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltb (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -26754,14 +18771,11 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi977:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi978:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi979:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -26772,13 +18786,12 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; NoVLX-NEXT: vmovdqa (%rsi), %ymm4
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
-; NoVLX-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
-; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm3
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm3, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
@@ -26789,7 +18802,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -26797,6 +18810,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -26813,24 +18827,23 @@ entry:
define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -26843,14 +18856,14 @@ entry:
define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -26858,10 +18871,9 @@ define zeroext i16 @test_vpcmpsgew_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -26875,15 +18887,15 @@ entry:
define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -26891,10 +18903,9 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -26909,15 +18920,15 @@ entry:
define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -26926,10 +18937,9 @@ define zeroext i16 @test_masked_vpcmpsgew_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -26946,20 +18956,17 @@ entry:
define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi980:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi981:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi982:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -26969,42 +18976,32 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27012,6 +19009,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27024,20 +19022,17 @@ entry:
define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi983:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi984:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi985:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -27048,42 +19043,32 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27091,6 +19076,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27104,21 +19090,18 @@ entry:
define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi986:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi987:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi988:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -27129,42 +19112,32 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27172,6 +19145,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27186,21 +19160,18 @@ entry:
define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi989:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi990:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi991:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -27212,42 +19183,32 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27255,6 +19216,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27271,20 +19233,17 @@ entry:
define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi992:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi993:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi994:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27294,44 +19253,34 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27342,6 +19291,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27354,20 +19304,17 @@ entry:
define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi995:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi996:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi997:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27378,44 +19325,34 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27426,6 +19363,7 @@ define zeroext i64 @test_vpcmpsgew_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27439,21 +19377,18 @@ entry:
define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmplew %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi998:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi999:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1000:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27464,44 +19399,34 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27512,6 +19437,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27526,21 +19452,18 @@ entry:
define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1001:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1002:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1003:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -27552,44 +19475,34 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -27600,6 +19513,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -27616,123 +19530,36 @@ entry:
define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1004:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1005:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1006:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1007:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1008:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1009:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1010:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1011:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -27745,124 +19572,37 @@ entry:
define zeroext i32 @test_vpcmpsgew_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1012:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1013:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1014:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1015:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1016:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1017:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1018:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1019:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -27876,7 +19616,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -27884,117 +19624,33 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1020:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1021:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1022:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1023:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1024:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1025:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1026:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1027:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -28009,7 +19665,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -28017,32 +19673,14 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1028:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1029:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1030:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1031:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1032:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1033:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1034:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1035:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28050,85 +19688,19 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -28145,113 +19717,30 @@ entry:
define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1036:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1037:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1038:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1039:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1040:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1041:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1042:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1043:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28260,13 +19749,9 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -28279,114 +19764,31 @@ entry:
define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1044:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1045:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1046:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1047:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1048:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1049:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1050:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1051:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28395,13 +19797,9 @@ define zeroext i64 @test_vpcmpsgew_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -28415,7 +19813,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmplew %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -28423,107 +19821,27 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1052:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1053:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1054:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1055:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1056:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1057:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1058:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1059:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28532,13 +19850,9 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -28553,7 +19867,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -28561,32 +19875,14 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1060:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1061:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1062:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1063:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1064:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1065:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1066:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1067:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28594,75 +19890,13 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -28671,13 +19905,9 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -28694,25 +19924,22 @@ entry:
define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1068:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1069:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1070:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -28721,7 +19948,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
@@ -28883,153 +20110,24 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm2
-; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -29037,6 +20135,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -29049,25 +20148,22 @@ entry:
define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltw (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1071:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1072:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1073:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2
; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -29155,153 +20251,24 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: vmovdqa (%rdi), %ymm2
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
; NoVLX-NEXT: vmovdqa 32(%rdi), %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -29309,6 +20276,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -29322,7 +20290,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmplew %zmm0, %zmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -29330,14 +20298,11 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1074:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1075:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1076:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -29349,10 +20314,10 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
@@ -29498,176 +20463,47 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm8, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm3
-; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpand %xmm6, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm3, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -29675,6 +20511,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -29689,7 +20526,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltw (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -29697,14 +20534,11 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1077:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1078:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1079:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -29716,7 +20550,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
@@ -29732,226 +20566,97 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3
-; NoVLX-NEXT: vmovdqa (%rsi), %ymm5
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm5, %ymm3
-; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm5
-; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
-; NoVLX-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
-; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
-; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
-; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vmovdqa (%rsi), %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm3, %ymm0
+; NoVLX-NEXT: vmovdqa 32(%rsi), %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -29959,6 +20664,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -29975,51 +20681,47 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30032,52 +20734,48 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30091,68 +20789,59 @@ entry:
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30168,69 +20857,60 @@ entry:
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30248,53 +20928,48 @@ entry:
define zeroext i8 @test_vpcmpsged_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30309,70 +20984,60 @@ entry:
define zeroext i8 @test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30391,50 +21056,47 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30447,51 +21109,48 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30505,67 +21164,59 @@ entry:
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30581,68 +21232,60 @@ entry:
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30660,52 +21303,48 @@ entry:
define zeroext i16 @test_vpcmpsged_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30720,69 +21359,60 @@ entry:
define zeroext i16 @test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30801,20 +21431,17 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1080:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1081:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1082:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -30834,6 +21461,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30846,20 +21474,17 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1083:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1084:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1085:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -30880,6 +21505,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30893,41 +21519,33 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1086:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1087:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1088:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -30943,6 +21561,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -30958,42 +21577,34 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1089:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1090:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1091:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31009,6 +21620,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31026,21 +21638,17 @@ entry:
define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1092:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1093:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1094:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -31061,6 +21669,7 @@ define zeroext i32 @test_vpcmpsged_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31075,43 +21684,34 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1095:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1096:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1097:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31127,6 +21727,7 @@ define zeroext i32 @test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31145,28 +21746,24 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1098:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1099:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1100:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31185,6 +21782,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31197,20 +21795,17 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1101:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1102:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1103:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -31218,8 +21813,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31238,6 +21832,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31251,45 +21846,36 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1104:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1105:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1106:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31308,6 +21894,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31323,46 +21910,37 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1107:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1108:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1109:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31381,6 +21959,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31398,21 +21977,17 @@ entry:
define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %xmm1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1110:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1111:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1112:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -31420,8 +21995,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31440,6 +22014,7 @@ define zeroext i64 @test_vpcmpsged_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31454,47 +22029,37 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1113:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1114:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1115:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -31513,6 +22078,7 @@ define zeroext i64 @test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -31531,22 +22097,23 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31559,22 +22126,23 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31588,24 +22156,25 @@ entry:
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31620,24 +22189,25 @@ entry:
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31654,23 +22224,23 @@ entry:
define zeroext i16 @test_vpcmpsged_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
-; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31685,25 +22255,25 @@ entry:
define zeroext i16 @test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31721,63 +22291,50 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1116:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1117:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1118:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -31785,6 +22342,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31797,63 +22355,50 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1119:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1120:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1121:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -31861,6 +22406,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31874,7 +22420,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -31882,58 +22428,44 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1122:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1123:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1124:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -31941,6 +22473,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -31955,7 +22488,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -31963,58 +22496,44 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1125:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1126:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1127:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32022,6 +22541,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32038,64 +22558,50 @@ entry:
define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
-; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1128:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1129:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1130:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32103,6 +22609,7 @@ define zeroext i32 @test_vpcmpsged_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32117,67 +22624,52 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1131:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1132:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1133:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32185,6 +22677,7 @@ define zeroext i32 @test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32202,65 +22695,52 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1134:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1135:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1136:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32271,6 +22751,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32283,65 +22764,52 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1137:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1138:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1139:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32352,6 +22820,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32365,7 +22834,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -32373,60 +22842,46 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1140:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1141:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1142:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32437,6 +22892,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32451,7 +22907,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -32459,60 +22915,46 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1143:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1144:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1145:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32523,6 +22965,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32539,66 +22982,52 @@ entry:
define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %ymm1
-; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1146:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1147:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1148:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32609,6 +23038,7 @@ define zeroext i64 @test_vpcmpsged_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32623,69 +23053,54 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1149:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1150:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1151:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -32696,6 +23111,7 @@ define zeroext i64 @test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -32713,118 +23129,34 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1152:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1153:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1154:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1155:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1156:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1157:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1158:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1159:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -32837,118 +23169,34 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1160:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1161:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1162:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1163:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1164:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1165:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1166:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1167:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -32962,7 +23210,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -32970,112 +23218,28 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1168:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1169:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1170:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1171:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1172:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1173:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1174:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1175:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33090,7 +23254,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -33098,112 +23262,28 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1176:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1177:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1178:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1179:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1180:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1181:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1182:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1183:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33220,120 +23300,34 @@ entry:
define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %zmm1
-; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1184:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1185:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1186:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1187:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1188:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1189:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1190:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1191:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33348,122 +23342,36 @@ entry:
define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1192:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1193:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1194:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1195:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1196:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1197:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1198:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1199:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33481,108 +23389,28 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1200:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1201:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1202:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1203:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1204:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1205:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1206:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1207:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33591,13 +23419,9 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33610,108 +23434,28 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1208:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1209:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1210:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1211:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1212:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1213:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1214:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1215:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33720,13 +23464,9 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33740,7 +23480,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -33748,102 +23488,22 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1216:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1217:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1218:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1219:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1220:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1221:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1222:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1223:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33852,13 +23512,9 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -33873,7 +23529,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -33881,102 +23537,22 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1224:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1225:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1226:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1227:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1228:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1229:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1230:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1231:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -33985,13 +23561,9 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -34008,110 +23580,28 @@ entry:
define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rdi), %zmm1
-; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1232:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1233:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1234:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1235:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1236:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1237:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1238:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1239:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpbroadcastd (%rdi), %zmm1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -34120,13 +23610,9 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -34141,112 +23627,30 @@ entry:
define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastd (%rsi), %zmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1240:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1241:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1242:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1243:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1244:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1245:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1246:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1247:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpbroadcastd (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -34255,13 +23659,9 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -34279,14 +23679,14 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -34296,6 +23696,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34308,14 +23709,14 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -34326,6 +23727,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34339,7 +23741,7 @@ entry:
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -34347,15 +23749,12 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -34365,6 +23764,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34380,7 +23780,7 @@ entry:
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -34388,16 +23788,13 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -34407,6 +23804,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34424,15 +23822,14 @@ entry:
define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -34443,6 +23840,7 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34457,25 +23855,21 @@ entry:
define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -34485,6 +23879,7 @@ define zeroext i4 @test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34503,35 +23898,33 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34544,36 +23937,34 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34587,44 +23978,39 @@ entry:
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34640,45 +24026,40 @@ entry:
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34696,37 +24077,34 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34741,46 +24119,40 @@ entry:
define zeroext i8 @test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34799,34 +24171,33 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34839,35 +24210,34 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34881,43 +24251,39 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34933,44 +24299,40 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -34988,36 +24350,34 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35032,45 +24392,40 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35089,20 +24444,17 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1248:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1249:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1250:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -35122,6 +24474,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35134,20 +24487,17 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1251:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1252:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1253:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -35168,6 +24518,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35181,32 +24532,26 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1254:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1255:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1256:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -35223,6 +24568,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35238,33 +24584,27 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1257:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1258:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1259:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -35281,6 +24621,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35298,21 +24639,17 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1260:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1261:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1262:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -35333,6 +24670,7 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35347,34 +24685,27 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1263:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1264:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1265:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -35391,6 +24722,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35409,28 +24741,24 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1266:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1267:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1268:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -35449,6 +24777,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35461,20 +24790,17 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1269:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1270:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1271:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35482,8 +24808,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -35502,6 +24827,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35515,37 +24841,30 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1272:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1273:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1274:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -35564,6 +24883,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35579,38 +24899,31 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1275:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1276:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1277:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vmovdqa (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -35629,6 +24942,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35646,21 +24960,17 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %xmm1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1278:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1279:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1280:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -35668,8 +24978,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -35688,6 +24997,7 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35702,39 +25012,31 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %xmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %xmm0, %xmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1281:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1282:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1283:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -35753,6 +25055,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -35771,53 +25074,50 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -35830,54 +25130,51 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -35891,72 +25188,64 @@ entry:
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -35972,73 +25261,65 @@ entry:
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36056,55 +25337,51 @@ entry:
define zeroext i8 @test_vpcmpsgeq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36119,74 +25396,65 @@ entry:
define zeroext i8 @test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36205,52 +25473,50 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36263,53 +25529,51 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36323,71 +25587,64 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36403,72 +25660,65 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36486,54 +25736,51 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36548,73 +25795,65 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36633,21 +25872,18 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1284:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1285:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1286:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36668,6 +25904,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36680,21 +25917,18 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1287:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1288:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1289:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36716,6 +25950,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36729,7 +25964,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -36737,14 +25972,11 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1290:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1291:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1292:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36753,21 +25985,16 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -36783,6 +26010,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36798,7 +26026,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -36806,14 +26034,11 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1293:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1294:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1295:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36823,21 +26048,16 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -36853,6 +26073,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36870,22 +26091,18 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1296:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1297:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1298:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36907,6 +26124,7 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36921,23 +26139,19 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1299:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1300:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1301:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -36947,21 +26161,16 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -36977,6 +26186,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -36995,21 +26205,18 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1302:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1303:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1304:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37017,8 +26224,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -37037,6 +26243,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -37049,21 +26256,18 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1305:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1306:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1307:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37072,8 +26276,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -37092,6 +26295,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -37105,7 +26309,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -37113,14 +26317,11 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1308:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1309:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1310:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37129,25 +26330,19 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -37166,6 +26361,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -37181,7 +26377,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -37189,14 +26385,11 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1311:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1312:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1313:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37206,25 +26399,19 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -37243,6 +26430,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -37260,22 +26448,18 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %ymm1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1314:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1315:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1316:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37284,8 +26468,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -37304,6 +26487,7 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -37318,23 +26502,19 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %ymm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %ymm0, %ymm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1317:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1318:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1319:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -37344,25 +26524,19 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -37381,6 +26555,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -37399,18 +26574,19 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37423,18 +26599,19 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37448,20 +26625,21 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37476,20 +26654,21 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37506,20 +26685,19 @@ entry:
define zeroext i16 @test_vpcmpsgeq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
-; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
-; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37534,22 +26712,21 @@ entry:
define zeroext i16 @test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37567,61 +26744,48 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1320:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1321:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1322:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37629,6 +26793,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37641,61 +26806,48 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1323:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1324:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1325:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37703,6 +26855,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37716,7 +26869,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -37724,55 +26877,42 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1326:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1327:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1328:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37780,6 +26920,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37794,7 +26935,7 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -37802,55 +26943,42 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1329:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1330:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1331:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37858,6 +26986,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37874,63 +27003,48 @@ entry:
define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
-; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1332:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1333:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1334:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
-; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -37938,6 +27052,7 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -37952,65 +27067,50 @@ entry:
define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1335:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1336:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1337:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38018,6 +27118,7 @@ define zeroext i32 @test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38035,63 +27136,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1338:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1339:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1340:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38102,6 +27190,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38114,63 +27203,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1341:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1342:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1343:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38181,6 +27257,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38194,7 +27271,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -38202,57 +27279,44 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1344:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1345:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1346:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38263,6 +27327,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38277,7 +27342,7 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -38285,57 +27350,44 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1347:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1348:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1349:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38346,6 +27398,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38362,65 +27415,50 @@ entry:
define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rdi), %zmm1
-; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1350:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1351:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1352:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vpbroadcastq (%rdi), %zmm1
-; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpnltq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38431,6 +27469,7 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38445,67 +27484,52 @@ entry:
define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vpbroadcastq (%rsi), %zmm1
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
-; VLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
+; VLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1353:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1354:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1355:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vpbroadcastq (%rsi), %zmm1
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -38516,6 +27540,7 @@ define zeroext i64 @test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -38533,38 +27558,22 @@ entry:
define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1356:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1357:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1358:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1359:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1360:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1361:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1362:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1363:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -38572,84 +27581,11 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -38662,38 +27598,22 @@ entry:
define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1364:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1365:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1366:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1367:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1368:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1369:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1370:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1371:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
@@ -38701,84 +27621,11 @@ define zeroext i32 @test_vpcmpultb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -38792,39 +27639,21 @@ entry:
define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1372:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1373:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1374:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1375:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1376:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1377:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1378:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1379:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -38832,85 +27661,19 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -38925,39 +27688,21 @@ entry:
define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1380:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1381:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1382:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1383:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1384:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1385:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1386:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1387:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
@@ -38965,85 +27710,19 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -39060,38 +27739,24 @@ entry:
define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1388:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1389:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1390:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1391:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1392:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1393:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1394:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1395:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -39099,89 +27764,14 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %_
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -39194,38 +27784,24 @@ entry:
define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltub (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1396:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1397:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1398:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1399:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1400:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1401:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1402:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1403:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
@@ -39233,89 +27809,14 @@ define zeroext i64 @test_vpcmpultb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -39329,39 +27830,21 @@ entry:
define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltub %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1404:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1405:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1406:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1407:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1408:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1409:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1410:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1411:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -39369,75 +27852,13 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -39446,13 +27867,9 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -39467,39 +27884,21 @@ entry:
define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltub (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1412:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1413:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1414:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1415:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1416:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1417:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1418:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1419:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
@@ -39507,75 +27906,13 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -39584,13 +27921,9 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <16 x i8>
@@ -39607,21 +27940,18 @@ entry:
define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1420:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1421:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1422:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -39639,8 +27969,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -39648,6 +27977,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -39660,21 +27990,18 @@ entry:
define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltub (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1423:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1424:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1425:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -39692,8 +28019,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -39701,6 +28027,7 @@ define zeroext i64 @test_vpcmpultb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -39714,7 +28041,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -39722,14 +28049,11 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1426:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1427:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1428:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -39740,10 +28064,9 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
; NoVLX-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm3, %xmm3
-; NoVLX-NEXT: vpxord %zmm4, %zmm4, %zmm4
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
; NoVLX-NEXT: vpand %xmm3, %xmm1, %xmm1
@@ -39757,7 +28080,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -39765,6 +28088,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask(i32 zeroext %__u, <4
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -39779,7 +28103,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltub (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -39787,14 +28111,11 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1429:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1430:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1431:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -39805,13 +28126,12 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm4, %ymm4
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0
-; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm3, %ymm0
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT: vpand %xmm2, %xmm3, %xmm2
; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
@@ -39822,7 +28142,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -39830,6 +28150,7 @@ define zeroext i64 @test_masked_vpcmpultb_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <32 x i8>
@@ -39846,14 +28167,14 @@ entry:
define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -39861,10 +28182,9 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -39877,14 +28197,14 @@ entry:
define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
@@ -39892,10 +28212,9 @@ define zeroext i16 @test_vpcmpultw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -39909,15 +28228,15 @@ entry:
define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -39926,10 +28245,9 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -39944,15 +28262,15 @@ entry:
define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
@@ -39961,10 +28279,9 @@ define zeroext i16 @test_masked_vpcmpultw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kunpckbw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -39981,20 +28298,17 @@ entry:
define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1432:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1433:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1434:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -40005,42 +28319,32 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40048,6 +28352,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40060,20 +28365,17 @@ entry:
define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1435:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1436:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1437:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -40084,42 +28386,32 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40127,6 +28419,7 @@ define zeroext i32 @test_vpcmpultw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40140,21 +28433,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1438:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1439:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1440:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -40166,42 +28456,32 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40209,6 +28489,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40223,21 +28504,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1441:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1442:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1443:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -40249,42 +28527,32 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40292,6 +28560,7 @@ define zeroext i32 @test_masked_vpcmpultw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40308,20 +28577,17 @@ entry:
define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1444:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1445:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1446:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -40332,44 +28598,34 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40380,6 +28636,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40392,20 +28649,17 @@ entry:
define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1447:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1448:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1449:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -40416,44 +28670,34 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40464,6 +28708,7 @@ define zeroext i64 @test_vpcmpultw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40477,21 +28722,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1450:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1451:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1452:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -40503,44 +28745,34 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40551,6 +28783,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40565,21 +28798,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1453:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1454:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1455:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -40591,44 +28821,34 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -40639,6 +28859,7 @@ define zeroext i64 @test_masked_vpcmpultw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <8 x i16>
@@ -40655,124 +28876,37 @@ entry:
define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1456:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1457:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1458:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1459:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1460:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1461:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1462:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1463:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -40785,124 +28919,37 @@ entry:
define zeroext i32 @test_vpcmpultw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1464:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1465:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1466:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1467:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1468:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1469:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1470:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1471:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -40916,7 +28963,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -40924,32 +28971,14 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1472:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1473:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1474:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1475:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1476:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1477:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1478:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1479:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
@@ -40957,85 +28986,19 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -41050,7 +29013,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -41058,32 +29021,14 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1480:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1481:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1482:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1483:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1484:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1485:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1486:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1487:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
@@ -41091,85 +29036,19 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -41186,114 +29065,31 @@ entry:
define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1488:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1489:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1490:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1491:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1492:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1493:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1494:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1495:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41302,13 +29098,9 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -41321,114 +29113,31 @@ entry:
define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1496:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1497:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1498:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1499:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1500:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1501:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1502:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1503:
-; NoVLX-NEXT: .cfi_offset %r15, -24
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41437,13 +29146,9 @@ define zeroext i64 @test_vpcmpultw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -41457,7 +29162,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -41465,32 +29170,14 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1504:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1505:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1506:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1507:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1508:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1509:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1510:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1511:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
@@ -41498,75 +29185,13 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41575,13 +29200,9 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -41596,7 +29217,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -41604,32 +29225,14 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1512:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1513:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1514:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1515:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1516:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1517:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1518:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1519:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
@@ -41637,75 +29240,13 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -41714,13 +29255,9 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <16 x i16>
@@ -41737,25 +29274,22 @@ entry:
define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1520:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1521:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1522:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
; NoVLX-NEXT: vmovq %xmm3, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -41764,7 +29298,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm5
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm7
; NoVLX-NEXT: vextracti32x4 $3, %zmm0, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
@@ -41924,157 +29458,28 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm4, %xmm4
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm4
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT: vpxor %ymm2, %ymm4, %ymm4
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm4, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpmovsxbd %xmm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm3
+; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT: vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -42082,6 +29487,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -42094,25 +29500,22 @@ entry:
define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuw (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1523:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1524:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1525:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm2
; NoVLX-NEXT: vmovq %xmm2, %rax
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: movq %rax, %rdx
@@ -42196,157 +29599,28 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm2, %ymm2
-; NoVLX-NEXT: vpxor 32(%rdi), %ymm1, %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm2
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor 32(%rdi), %ymm2, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rdi), %ymm2, %ymm1
; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -42354,6 +29628,7 @@ define zeroext i64 @test_vpcmpultw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -42367,7 +29642,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -42375,14 +29650,11 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1526:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1527:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1528:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -42394,10 +29666,10 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: vmovd %eax, %xmm3
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT: vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT: vextracti128 $1, %ymm1, %xmm8
; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
@@ -42513,208 +29785,79 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm8, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm5, %xmm5
; NoVLX-NEXT: vpextrq $1, %xmm8, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm5, %xmm5
; NoVLX-NEXT: vmovq %xmm1, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm5, %xmm5
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm3
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm8
+; NoVLX-NEXT: vinserti128 $1, %xmm9, %ymm4, %ymm1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm4
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm0
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; NoVLX-NEXT: vpmovdb %zmm1, %xmm7
+; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm6, %xmm6
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm3, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm3
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm6 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm6, %ymm4, %ymm3
-; NoVLX-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm4
-; NoVLX-NEXT: vpxor %ymm6, %ymm8, %ymm2
-; NoVLX-NEXT: vpxor %ymm6, %ymm5, %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; NoVLX-NEXT: vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm2
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vpand %xmm7, %xmm2, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm5, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %ymm5, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm6, %xmm1, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm0, %xmm4, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -42722,6 +29865,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask(i32 zeroext %__u, <8
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -42736,7 +29880,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuw (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -42744,14 +29888,11 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1529:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1530:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1531:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $96, %rsp
@@ -42763,8 +29904,8 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: vmovd %eax, %xmm2
; NoVLX-NEXT: shrl $16, %eax
; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT: vextracti32x4 $1, %zmm0, %xmm4
-; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm5
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm4
; NoVLX-NEXT: shrq $32, %rdx
; NoVLX-NEXT: vpinsrw $2, %edx, %xmm2, %xmm2
; NoVLX-NEXT: vpextrq $1, %xmm1, %rax
@@ -42777,228 +29918,99 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm5, %rcx
+; NoVLX-NEXT: vmovq %xmm4, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm2
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vmovq %xmm3, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT: vpextrq $1, %xmm3, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm3, %xmm3
; NoVLX-NEXT: vmovq %xmm0, %rcx
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm6
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm3, %xmm3
; NoVLX-NEXT: movl %ecx, %eax
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rax
; NoVLX-NEXT: shrq $32, %rax
-; NoVLX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $2, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm0, %rax
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm0
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm7
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; NoVLX-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; NoVLX-NEXT: vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm4, %xmm4
+; NoVLX-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
+; NoVLX-NEXT: vpmovdb %zmm5, %xmm5
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm4
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm7, %xmm3
-; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm5, %ymm3, %ymm3
-; NoVLX-NEXT: vpxor (%rsi), %ymm5, %ymm6
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm6, %ymm3
-; NoVLX-NEXT: vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT: vptestmd %zmm3, %zmm3, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %eax, %xmm3
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT: vpxor %ymm5, %ymm4, %ymm4
-; NoVLX-NEXT: vpxor 32(%rsi), %ymm5, %ymm5
-; NoVLX-NEXT: vpcmpgtw %ymm4, %ymm5, %ymm4
-; NoVLX-NEXT: vpmovsxwd %ymm4, %zmm4
-; NoVLX-NEXT: vpslld $31, %zmm4, %zmm4
-; NoVLX-NEXT: vptestmd %zmm4, %zmm4, %k0
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm4
-; NoVLX-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; NoVLX-NEXT: vpand %xmm2, %xmm4, %xmm2
-; NoVLX-NEXT: vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT: vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpxor (%rsi), %ymm2, %ymm3
+; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm3, %ymm0
+; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; NoVLX-NEXT: vpxor 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT: vpand %xmm5, %xmm1, %xmm1
; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %ecx
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
@@ -43006,6 +30018,7 @@ define zeroext i64 @test_masked_vpcmpultw_v32i1_v64i1_mask_mem(i32 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <32 x i16>
@@ -43022,52 +30035,48 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43080,52 +30089,48 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43139,71 +30144,62 @@ entry:
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43219,71 +30215,62 @@ entry:
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43301,53 +30288,49 @@ entry:
define zeroext i8 @test_vpcmpultd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43362,72 +30345,63 @@ entry:
define zeroext i8 @test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43446,51 +30420,48 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43503,51 +30474,48 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43561,70 +30529,62 @@ entry:
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43640,70 +30600,62 @@ entry:
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43721,52 +30673,49 @@ entry:
define zeroext i16 @test_vpcmpultd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rdi), %xmm1
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43781,71 +30730,63 @@ entry:
define zeroext i16 @test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastd (%rsi), %xmm1
; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43864,20 +30805,17 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1532:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1533:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1534:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -43898,6 +30836,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43910,20 +30849,17 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1535:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1536:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1537:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -43944,6 +30880,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -43957,21 +30894,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1538:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1539:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1540:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -43980,21 +30914,16 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44010,6 +30939,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44025,21 +30955,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1541:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1542:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1543:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44048,21 +30975,16 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44078,6 +31000,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44095,20 +31018,17 @@ entry:
define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1544:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1545:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1546:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44130,6 +31050,7 @@ define zeroext i32 @test_vpcmpultd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44144,21 +31065,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1547:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1548:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1549:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -44168,21 +31086,16 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44198,6 +31111,7 @@ define zeroext i32 @test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44216,20 +31130,17 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1550:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1551:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1552:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44237,8 +31148,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44257,6 +31167,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44269,20 +31180,17 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1553:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1554:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1555:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44290,8 +31198,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44310,6 +31217,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44323,21 +31231,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1556:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1557:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1558:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44346,25 +31251,19 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44383,6 +31282,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44398,21 +31298,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1559:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1560:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1561:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44421,25 +31318,19 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44458,6 +31349,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44475,20 +31367,17 @@ entry:
define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1562:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1563:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1564:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44497,8 +31386,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44517,6 +31405,7 @@ define zeroext i64 @test_vpcmpultd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44531,21 +31420,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to4}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1565:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1566:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1567:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -44555,25 +31441,19 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -44592,6 +31472,7 @@ define zeroext i64 @test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x i32>
@@ -44610,22 +31491,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44638,22 +31520,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44667,24 +31550,25 @@ entry:
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44699,24 +31583,25 @@ entry:
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44733,22 +31618,23 @@ entry:
define zeroext i16 @test_vpcmpultd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44763,24 +31649,25 @@ entry:
define zeroext i16 @test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44798,63 +31685,50 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1568:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1569:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1570:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -44862,6 +31736,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44874,63 +31749,50 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1571:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1572:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1573:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -44938,6 +31800,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -44951,7 +31814,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -44959,58 +31822,44 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1574:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1575:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1576:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45018,6 +31867,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45032,7 +31882,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -45040,58 +31890,44 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1577:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1578:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1579:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45099,6 +31935,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45115,63 +31952,50 @@ entry:
define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1580:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1581:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1582:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45179,6 +32003,7 @@ define zeroext i32 @test_vpcmpultd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45193,7 +32018,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -45201,58 +32026,44 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1583:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1584:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1585:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45260,6 +32071,7 @@ define zeroext i32 @test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45277,65 +32089,52 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1586:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1587:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1588:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45346,6 +32145,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45358,65 +32158,52 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1589:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1590:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1591:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45427,6 +32214,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45440,7 +32228,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -45448,60 +32236,46 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1592:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1593:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1594:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45512,6 +32286,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45526,7 +32301,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -45534,60 +32309,46 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1595:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1596:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1597:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovdqa (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k1, %k0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45598,6 +32359,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45614,65 +32376,52 @@ entry:
define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1598:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1599:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1600:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rdi), %ymm1
; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45683,6 +32432,7 @@ define zeroext i64 @test_vpcmpultd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45697,7 +32447,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to8}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -45705,60 +32455,46 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1601:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1602:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1603:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: kandw %k0, %k1, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -45769,6 +32505,7 @@ define zeroext i64 @test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -45786,118 +32523,34 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1604:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1605:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1606:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1607:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1608:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1609:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1610:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1611:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -45910,118 +32563,34 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1612:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1613:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1614:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1615:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1616:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1617:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1618:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1619:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46035,7 +32604,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -46043,112 +32612,28 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1620:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1621:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1622:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1623:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1624:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1625:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1626:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1627:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46163,7 +32648,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -46171,112 +32656,28 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1628:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1629:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1630:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1631:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1632:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1633:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1634:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1635:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46293,118 +32694,34 @@ entry:
define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1636:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1637:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1638:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1639:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1640:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1641:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1642:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1643:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46419,7 +32736,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -46427,112 +32744,28 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1644:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1645:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1646:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1647:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1648:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1649:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1650:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1651:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46550,108 +32783,28 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1652:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1653:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1654:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1655:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1656:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1657:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1658:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1659:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46660,13 +32813,9 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46679,108 +32828,28 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1660:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1661:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1662:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1663:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1664:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1665:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1666:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1667:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46789,13 +32858,9 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46809,7 +32874,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -46817,102 +32882,22 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1668:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1669:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1670:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1671:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1672:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1673:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1674:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1675:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -46921,13 +32906,9 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -46942,7 +32923,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -46950,102 +32931,22 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1676:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1677:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1678:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1679:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1680:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1681:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1682:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1683:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47054,13 +32955,9 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -47077,108 +32974,28 @@ entry:
define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1684:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1685:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1686:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1687:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1688:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1689:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1690:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1691:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47187,13 +33004,9 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -47208,7 +33021,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -47216,102 +33029,22 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1692:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1693:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1694:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1695:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1696:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1697:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1698:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1699:
-; NoVLX-NEXT: .cfi_offset %r15, -24
; NoVLX-NEXT: kmovw %edi, %k1
-; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -47320,13 +33053,9 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x i32>
@@ -47344,14 +33073,14 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -47362,6 +33091,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47374,14 +33104,14 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
@@ -47392,6 +33122,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47405,7 +33136,7 @@ entry:
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -47413,18 +33144,15 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -47434,6 +33162,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i6
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47449,7 +33178,7 @@ entry:
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -47457,18 +33186,15 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -47478,6 +33204,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47495,14 +33222,14 @@ entry:
define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -47514,6 +33241,7 @@ define zeroext i4 @test_vpcmpultq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47528,7 +33256,7 @@ entry:
define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
@@ -47536,19 +33264,16 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -47558,6 +33283,7 @@ define zeroext i4 @test_masked_vpcmpultq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47576,36 +33302,34 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47618,36 +33342,34 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47661,47 +33383,42 @@ entry:
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47717,47 +33434,42 @@ entry:
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47775,37 +33487,35 @@ entry:
define zeroext i8 @test_vpcmpultq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47820,48 +33530,43 @@ entry:
define zeroext i8 @test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47880,35 +33585,34 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47921,35 +33625,34 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -47963,46 +33666,42 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48018,46 +33717,42 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48075,36 +33770,35 @@ entry:
define zeroext i16 @test_vpcmpultq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %xmm1
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48119,47 +33813,43 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %xmm1
; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48178,20 +33868,17 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1700:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1701:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1702:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48212,6 +33899,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48224,20 +33912,17 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1703:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1704:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1705:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48258,6 +33943,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48271,21 +33957,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1706:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1707:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1708:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48294,12 +33977,9 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -48316,6 +33996,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48331,21 +34012,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1709:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1710:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1711:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48354,12 +34032,9 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -48376,6 +34051,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48393,20 +34069,17 @@ entry:
define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1712:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1713:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1714:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48428,6 +34101,7 @@ define zeroext i32 @test_vpcmpultq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48442,21 +34116,18 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1715:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1716:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1717:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -48466,12 +34137,9 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
@@ -48488,6 +34156,7 @@ define zeroext i32 @test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48506,20 +34175,17 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1718:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1719:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1720:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48527,8 +34193,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -48547,6 +34212,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48559,20 +34225,17 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1721:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1722:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1723:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48580,8 +34243,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -48600,6 +34262,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48613,21 +34276,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1724:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1725:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1726:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48636,17 +34296,13 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -48665,6 +34321,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48680,21 +34337,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1727:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1728:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1729:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48703,17 +34357,13 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -48732,6 +34382,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48749,20 +34400,17 @@ entry:
define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1730:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1731:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1732:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48771,8 +34419,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -48791,6 +34438,7 @@ define zeroext i64 @test_vpcmpultq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48805,21 +34453,18 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to2}, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1733:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1734:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1735:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -48829,17 +34474,13 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; NoVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vmovd %ecx, %xmm1
; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -48858,6 +34499,7 @@ define zeroext i64 @test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x i64>
@@ -48876,54 +34518,51 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -48936,54 +34575,51 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -48997,73 +34633,65 @@ entry:
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49079,73 +34707,65 @@ entry:
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49163,15 +34783,15 @@ entry:
define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -49179,39 +34799,36 @@ define zeroext i8 @test_vpcmpultq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49226,16 +34843,16 @@ entry:
define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -49243,57 +34860,49 @@ define zeroext i8 @test_masked_vpcmpultq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49312,53 +34921,51 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49371,53 +34978,51 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49431,72 +35036,65 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49512,72 +35110,65 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49595,15 +35186,15 @@ entry:
define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rdi), %ymm1
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -49611,38 +35202,36 @@ define zeroext i16 @test_vpcmpultq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49657,16 +35246,16 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpbroadcastq (%rsi), %ymm1
; NoVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -49674,56 +35263,49 @@ define zeroext i16 @test_masked_vpcmpultq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49742,21 +35324,18 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1736:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1737:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1738:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49778,6 +35357,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49790,21 +35370,18 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1739:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1740:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1741:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49826,6 +35403,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49839,7 +35417,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -49847,14 +35425,11 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1742:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1743:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1744:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49864,21 +35439,16 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -49894,6 +35464,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49909,7 +35480,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -49917,14 +35488,11 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1745:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1746:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1747:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -49934,21 +35502,16 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -49964,6 +35527,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -49981,21 +35545,18 @@ entry:
define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1748:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1749:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1750:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50018,6 +35579,7 @@ define zeroext i32 @test_vpcmpultq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50032,7 +35594,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -50040,14 +35602,11 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1751:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1752:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1753:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -50058,21 +35617,16 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50088,6 +35642,7 @@ define zeroext i32 @test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50106,21 +35661,18 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1754:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1755:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1756:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50129,8 +35681,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50149,6 +35700,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50161,21 +35713,18 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1757:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1758:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1759:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50184,8 +35733,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50204,6 +35752,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50217,7 +35766,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -50225,14 +35774,11 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1760:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1761:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1762:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50242,25 +35788,19 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50279,6 +35819,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50294,7 +35835,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -50302,14 +35843,11 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1763:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1764:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1765:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50319,25 +35857,19 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50356,6 +35888,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50373,21 +35906,18 @@ entry:
define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1766:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1767:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1768:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50397,8 +35927,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50417,6 +35946,7 @@ define zeroext i64 @test_vpcmpultq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50431,7 +35961,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to4}, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -50439,14 +35969,11 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1769:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1770:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1771:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
@@ -50457,25 +35984,19 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: kmovw %edi, %k0
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kshiftlw $13, %k0, %k2
-; NoVLX-NEXT: kshiftrw $15, %k2, %k2
-; NoVLX-NEXT: kshiftlw $15, %k0, %k3
-; NoVLX-NEXT: kshiftrw $15, %k3, %k3
-; NoVLX-NEXT: kshiftlw $14, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: kmovw %k3, %ecx
-; NoVLX-NEXT: vmovd %ecx, %xmm1
-; NoVLX-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: kmovw %k2, %eax
-; NoVLX-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edx
+; NoVLX-NEXT: kmovw %k0, %esi
+; NoVLX-NEXT: vmovd %esi, %xmm1
+; NoVLX-NEXT: vpinsrb $4, %edx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; NoVLX-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
; NoVLX-NEXT: vpand %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -50494,6 +36015,7 @@ define zeroext i64 @test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x i64>
@@ -50512,18 +36034,19 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50536,18 +36059,19 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50561,20 +36085,21 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50589,20 +36114,21 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50619,18 +36145,19 @@ entry:
define zeroext i16 @test_vpcmpultq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50645,20 +36172,21 @@ entry:
define zeroext i16 @test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50676,61 +36204,48 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1772:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1773:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1774:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -50738,6 +36253,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50750,61 +36266,48 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1775:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1776:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1777:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -50812,6 +36315,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50825,7 +36329,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -50833,55 +36337,42 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1778:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1779:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1780:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -50889,6 +36380,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50903,7 +36395,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -50911,55 +36403,42 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1781:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1782:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1783:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -50967,6 +36446,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -50983,61 +36463,48 @@ entry:
define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1784:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1785:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1786:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51045,6 +36512,7 @@ define zeroext i32 @test_vpcmpultq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51059,7 +36527,7 @@ entry:
define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
@@ -51067,55 +36535,42 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1787:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1788:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1789:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51123,6 +36578,7 @@ define zeroext i32 @test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51140,63 +36596,50 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1790:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1791:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1792:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51207,6 +36650,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51219,63 +36663,50 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1793:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1794:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1795:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51286,6 +36717,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51299,7 +36731,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -51307,57 +36739,44 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1796:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1797:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1798:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51368,6 +36787,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51382,7 +36802,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -51390,57 +36810,44 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1799:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1800:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1801:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51451,6 +36858,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51467,63 +36875,50 @@ entry:
define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1802:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1803:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1804:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51534,6 +36929,7 @@ define zeroext i64 @test_vpcmpultq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %_
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51548,7 +36944,7 @@ entry:
define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
; VLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: kmovd %edi, %k1
; VLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovq %k0, %rax
@@ -51556,57 +36952,44 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1805:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1806:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1807:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -51617,6 +37000,7 @@ define zeroext i64 @test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u,
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x i64>
@@ -51635,49 +37019,45 @@ entry:
declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -51690,49 +37070,45 @@ entry:
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -51746,50 +37122,46 @@ entry:
define zeroext i8 @test_vcmpoeqps_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -51802,51 +37174,236 @@ entry:
ret i8 %4
}
-
-define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v8i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -51859,48 +37416,45 @@ entry:
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -51914,49 +37468,231 @@ entry:
define zeroext i16 @test_vcmpoeqps_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v16i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -51964,28 +37700,28 @@ entry:
%vec = insertelement <4 x float> undef, float %load, i32 0
%1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <4 x float> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
+
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1808:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1809:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1810:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52003,6 +37739,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -52015,20 +37752,17 @@ entry:
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1811:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1812:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1813:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52046,6 +37780,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -52059,20 +37794,17 @@ entry:
define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1814:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1815:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1816:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -52091,6 +37823,7 @@ define zeroext i32 @test_vcmpoeqps_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -52103,29 +37836,180 @@ entry:
ret i32 %4
}
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1817:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1818:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1819:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -52144,6 +38028,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -52156,26 +38041,22 @@ entry:
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1820:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1821:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1822:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -52194,6 +38075,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -52207,27 +38089,23 @@ entry:
define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to4}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1823:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1824:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1825:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vbroadcastss (%rdi), %xmm1
; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -52246,6 +38124,7 @@ define zeroext i64 @test_vcmpoeqps_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <4 x float>
@@ -52258,25 +38137,199 @@ entry:
ret i64 %4
}
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask(i4 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %1 = bitcast <2 x i64> %__b to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem(i4 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqps (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <4 x float>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <2 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastss (%rsi), %xmm2
+; NoVLX-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <4 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <4 x float> undef, float %load, i32 0
+ %1 = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x float> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52289,22 +38342,23 @@ entry:
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52318,22 +38372,23 @@ entry:
define zeroext i16 @test_vcmpoeqps_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; NoVLX-NEXT: kshiftlw $8, %k0, %k0
; NoVLX-NEXT: kshiftrw $8, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52346,66 +38401,156 @@ entry:
ret i16 %4
}
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vmovaps (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v16i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftlw $8, %k0, %k0
+; NoVLX-NEXT: kshiftrw $8, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1826:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1827:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1828:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -52413,6 +38558,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52425,63 +38571,50 @@ entry:
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1829:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1830:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1831:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -52489,6 +38622,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52502,63 +38636,50 @@ entry:
define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1832:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1833:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1834:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -52566,6 +38687,7 @@ define zeroext i32 @test_vcmpoeqps_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52578,68 +38700,263 @@ entry:
ret i32 %4
}
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vmovaps (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1835:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1836:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1837:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -52650,6 +38967,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52662,65 +38980,52 @@ entry:
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1838:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1839:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1840:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vmovaps (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -52731,6 +39036,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52744,65 +39050,52 @@ entry:
define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to8}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1841:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1842:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1843:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; NoVLX-NEXT: vbroadcastss (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -52813,6 +39106,7 @@ define zeroext i64 @test_vcmpoeqps_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, float*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <8 x float>
@@ -52825,121 +39119,260 @@ entry:
ret i64 %4
}
-
-define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; VLX-NEXT: kmovd %k0, %eax
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1844:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1845:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1846:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
-; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1847:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1848:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1849:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1850:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1851:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %1 = bitcast <4 x i64> %__b to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vmovaps (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <8 x float>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; NoVLX-NEXT: vbroadcastss (%rsi), %ymm1
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <8 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <8 x float> undef, float %load, i32 0
+ %1 = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x float> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
+
+define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -52952,118 +39385,34 @@ entry:
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1852:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1853:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1854:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1855:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1856:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1857:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1858:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1859:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -53077,118 +39426,34 @@ entry:
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1860:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1861:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1862:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
-; NoVLX-NEXT: .Lcfi1863:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1864:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1865:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1866:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1867:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, (%rsp)
; NoVLX-NEXT: movl (%rsp), %eax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -53201,133 +39466,208 @@ entry:
ret i32 %4
}
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <16 x float> undef, float %load, i32 0
+ %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; CHECK-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
+ %3 = zext i16 %2 to i32
+ ret i32 %3
+}
+
+define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
; VLX-NEXT: kmovw %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_sae_mask:
-; NoVLX: # BB#0: # %entry
-; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_sae_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
%1 = bitcast <8 x i64> %__b to <16 x float>
- %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 -1, i32 8)
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)
%3 = zext i16 %2 to i32
ret i32 %3
}
+
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1868:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1869:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1870:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1871:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1872:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1873:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1874:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1875:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53336,13 +39676,9 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -53355,108 +39691,28 @@ entry:
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1876:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1877:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1878:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1879:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1880:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1881:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1882:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1883:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vcmpeqps (%rdi), %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53465,13 +39721,9 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -53485,108 +39737,28 @@ entry:
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1884:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1885:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1886:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
-; NoVLX-NEXT: pushq %r15
-; NoVLX-NEXT: pushq %r14
-; NoVLX-NEXT: pushq %r13
-; NoVLX-NEXT: pushq %r12
-; NoVLX-NEXT: pushq %rbx
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
-; NoVLX-NEXT: .Lcfi1887:
-; NoVLX-NEXT: .cfi_offset %rbx, -56
-; NoVLX-NEXT: .Lcfi1888:
-; NoVLX-NEXT: .cfi_offset %r12, -48
-; NoVLX-NEXT: .Lcfi1889:
-; NoVLX-NEXT: .cfi_offset %r13, -40
-; NoVLX-NEXT: .Lcfi1890:
-; NoVLX-NEXT: .cfi_offset %r14, -32
-; NoVLX-NEXT: .Lcfi1891:
-; NoVLX-NEXT: .cfi_offset %r15, -24
-; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r11d
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r14d
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r15d
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r12d
-; NoVLX-NEXT: kshiftlw $8, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %r13d
-; NoVLX-NEXT: kshiftlw $7, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $6, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %ebx
-; NoVLX-NEXT: kshiftlw $5, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $4, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $3, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $2, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vmovd %r10d, %xmm0
-; NoVLX-NEXT: kmovw %k1, %r10d
-; NoVLX-NEXT: kshiftlw $1, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $12, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vcmpeqps (%rdi){1to16}, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -53595,13 +39767,9 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
; NoVLX-NEXT: shlq $32, %rcx
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: orq %rcx, %rax
-; NoVLX-NEXT: leaq -40(%rbp), %rsp
-; NoVLX-NEXT: popq %rbx
-; NoVLX-NEXT: popq %r12
-; NoVLX-NEXT: popq %r13
-; NoVLX-NEXT: popq %r14
-; NoVLX-NEXT: popq %r15
+; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -53614,10 +39782,161 @@ entry:
ret i64 %4
}
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps %zmm1, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi), %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <16 x float>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, float* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %load = load float, float* %__b
+ %vec = insertelement <16 x float> undef, float %load, i32 0
+ %1 = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <16 x float> %0, %1
+ %3 = bitcast i16 %__u to <16 x i1>
+ %4 = and <16 x i1> %2, %3
+ %5 = shufflevector <16 x i1> %4, <16 x i1> zeroinitializer, <64 x i32> <i32 0,i32 1,i32 2,i32 3,i32 4,i32 5,i32 6,i32 7,i32 8,i32 9,i32 10,i32 11,i32 12,i32 13,i32 14,i32 15,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31,i32 16,i32 17,i32 18,i32 19,i32 20,i32 21,i32 22,i32 23,i32 24,i32 25,i32 26,i32 27,i32 28,i32 29,i32 30,i32 31>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: movzwl %ax, %eax
@@ -53625,10 +39944,11 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_sae_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <16 x float>
@@ -53638,18 +39958,45 @@ entry:
ret i64 %3
}
+define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_sae_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzwl %ax, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_sae_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzwl %ax, %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <16 x float>
+ %1 = bitcast <8 x i64> %__b to <16 x float>
+ %2 = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %0, <16 x float> %1, i32 2, i16 %__u, i32 8)
+ %3 = zext i16 %2 to i64
+ ret i64 %3
+}
+
+
declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -53657,6 +40004,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b)
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53669,14 +40017,14 @@ entry:
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
@@ -53684,6 +40032,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>*
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53697,14 +40046,14 @@ entry:
define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
@@ -53713,6 +40062,7 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, double* %
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53725,36 +40075,150 @@ entry:
ret i4 %4
}
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+define zeroext i4 @test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v4i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT: vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = bitcast <4 x i1> %5 to i4
+ ret i4 %6
+}
+
+
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53767,33 +40231,31 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53807,34 +40269,32 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53847,35 +40307,180 @@ entry:
ret i8 %4
}
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v8i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53888,32 +40493,31 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53927,33 +40531,32 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -53966,23 +40569,166 @@ entry:
ret i16 %4
}
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v16i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1892:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1893:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1894:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -54000,6 +40746,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54012,20 +40759,17 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1895:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1896:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1897:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -54043,6 +40787,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54056,20 +40801,17 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1898:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1899:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1900:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -54088,6 +40830,7 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54100,29 +40843,180 @@ entry:
ret i32 %4
}
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1901:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1902:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1903:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -54141,6 +41035,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54153,26 +41048,22 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1904:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1905:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1906:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -54191,6 +41082,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54204,27 +41096,23 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to2}, %xmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1907:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1908:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1909:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -54243,6 +41131,7 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__a to <2 x double>
@@ -54255,54 +41144,224 @@ entry:
ret i64 %4
}
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask(i2 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %1 = bitcast <2 x i64> %__b to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem(i2 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load <2 x i64>, <2 x i64>* %__b
+ %1 = bitcast <2 x i64> %load to <2 x double>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b(i2 zeroext %__u, <2 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
+; NoVLX-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vandpd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__a to <2 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <2 x double> undef, double %load, i32 0
+ %1 = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ %2 = fcmp oeq <2 x double> %0, %1
+ %3 = bitcast i2 %__u to <2 x i1>
+ %4 = and <2 x i1> %2, %3
+ %5 = shufflevector <2 x i1> %4, <2 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54315,51 +41374,48 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54373,52 +41429,49 @@ entry:
define zeroext i8 @test_vcmpoeqpd_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kshiftlw $7, %k0, %k0
-; NoVLX-NEXT: kshiftrw $7, %k0, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpsllq $63, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
-; NoVLX-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpsllq $63, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54431,53 +41484,245 @@ entry:
ret i8 %4
}
-
-define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
-; VLX: # BB#0: # %entry
-; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %al killed %al killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
-; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+define zeroext i8 @test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v8i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+
+define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54490,50 +41735,48 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54547,51 +41790,240 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
-; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k0
; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
; NoVLX-NEXT: andl $1, %eax
-; NoVLX-NEXT: kmovw %eax, %k0
-; NoVLX-NEXT: kxorw %k0, %k0, %k1
-; NoVLX-NEXT: kshiftrw $1, %k1, %k1
-; NoVLX-NEXT: kshiftlw $1, %k1, %k1
-; NoVLX-NEXT: korw %k0, %k1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <16 x i1> %3 to i16
+ ret i16 %4
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; NoVLX-NEXT: vpslld $31, %zmm3, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v16i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; NoVLX-NEXT: kmovw %eax, %k1
-; NoVLX-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
-; NoVLX-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
-; NoVLX-NEXT: vpslld $31, %zmm2, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: andl $1, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $1, %k2, %k2
+; NoVLX-NEXT: kshiftlw $1, %k2, %k2
+; NoVLX-NEXT: korw %k1, %k2, %k1
+; NoVLX-NEXT: kshiftrw $1, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $14, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $13, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $12, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54599,29 +42031,29 @@ entry:
%vec = insertelement <4 x double> undef, double %load, i32 0
%1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%2 = fcmp oeq <4 x double> %0, %1
- %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
- %4 = bitcast <16 x i1> %3 to i16
- ret i16 %4
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
}
+
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1910:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1911:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1912:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -54640,6 +42072,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54652,21 +42085,18 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1913:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1914:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1915:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -54685,6 +42115,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54698,21 +42129,18 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1916:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1917:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1918:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
@@ -54732,6 +42160,7 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54744,31 +42173,188 @@ entry:
ret i32 %4
}
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1919:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1920:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1921:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -54787,6 +42373,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54799,28 +42386,24 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1922:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1923:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1924:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -54839,6 +42422,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54852,29 +42436,25 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to4}, %ymm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1925:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1926:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1927:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vbroadcastsd (%rdi), %ymm1
; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
-; NoVLX-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -54893,6 +42473,7 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__a to <4 x double>
@@ -54905,21 +42486,201 @@ entry:
ret i64 %4
}
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask(i4 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %1 = bitcast <4 x i64> %__b to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem(i4 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vcmpeqpd (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load <4 x i64>, <4 x i64>* %__b
+ %1 = bitcast <4 x i64> %load to <4 x double>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b(i4 zeroext %__u, <4 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; VLX-NEXT: kmovb -{{[0-9]+}}(%rsp), %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $96, %rsp
+; NoVLX-NEXT: movb %dil, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT: vbroadcastsd (%rsi), %ymm2
+; NoVLX-NEXT: vcmpeqpd %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT: vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__a to <4 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <4 x double> undef, double %load, i32 0
+ %1 = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <4 x double> %0, %1
+ %3 = bitcast i4 %__u to <4 x i1>
+ %4 = and <4 x i1> %2, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -54932,18 +42693,19 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -54957,18 +42719,19 @@ entry:
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -54981,23 +42744,115 @@ entry:
ret i16 %4
}
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <16 x i1> %5 to i16
+ ret i16 %6
+}
+
+
define zeroext i16 @test_vcmpoeqpd_v8i1_v16i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: movzbl %al, %eax
-; VLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v16i1_sae_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax
-; NoVLX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55007,64 +42862,80 @@ entry:
ret i16 %3
}
+define zeroext i16 @test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: # kill: def %ax killed %ax killed %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v16i1_sae_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: # kill: def %ax killed %ax killed %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+ %3 = zext i8 %2 to i16
+ ret i16 %3
+}
+
+
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1928:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1929:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1930:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55072,6 +42943,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55084,61 +42956,48 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1931:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1932:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1933:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55146,6 +43005,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55159,61 +43019,48 @@ entry:
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1934:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1935:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1936:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $32, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55221,6 +43068,7 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: movl (%rsp), %eax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55233,20 +43081,223 @@ entry:
ret i32 %4
}
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <32 x i1> %5 to i32
+ ret i32 %6
+}
+
+
define zeroext i32 @test_vcmpoeqpd_v8i1_v32i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: kmovb %k0, %eax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_sae_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55256,66 +43307,79 @@ entry:
ret i32 %3
}
+define zeroext i32 @test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovb %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_sae_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+ %3 = zext i8 %2 to i32
+ ret i32 %3
+}
+
+
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1937:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1938:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1939:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55326,6 +43390,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55338,63 +43403,50 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1940:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1941:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1942:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55405,6 +43457,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55418,63 +43471,50 @@ entry:
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double* %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
; VLX-NEXT: kmovq %k0, %rax
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: pushq %rbp
-; NoVLX-NEXT: .Lcfi1943:
; NoVLX-NEXT: .cfi_def_cfa_offset 16
-; NoVLX-NEXT: .Lcfi1944:
; NoVLX-NEXT: .cfi_offset %rbp, -16
; NoVLX-NEXT: movq %rsp, %rbp
-; NoVLX-NEXT: .Lcfi1945:
; NoVLX-NEXT: .cfi_def_cfa_register %rbp
; NoVLX-NEXT: andq $-32, %rsp
; NoVLX-NEXT: subq $64, %rsp
; NoVLX-NEXT: vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT: kshiftlw $15, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r8d
-; NoVLX-NEXT: kshiftlw $14, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %r9d
-; NoVLX-NEXT: kshiftlw $13, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
-; NoVLX-NEXT: kmovw %k1, %edx
-; NoVLX-NEXT: kshiftlw $12, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %esi
-; NoVLX-NEXT: kshiftlw $11, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %edi
-; NoVLX-NEXT: kshiftlw $10, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %eax
-; NoVLX-NEXT: kshiftlw $9, %k0, %k1
-; NoVLX-NEXT: kshiftrw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
; NoVLX-NEXT: kmovw %k1, %ecx
-; NoVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $4, %edi, %xmm0, %xmm0
-; NoVLX-NEXT: kshiftlw $8, %k0, %k0
-; NoVLX-NEXT: kshiftrw $15, %k0, %k0
-; NoVLX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: kmovw %k0, %eax
-; NoVLX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -55485,6 +43525,7 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, double*
; NoVLX-NEXT: orq %rcx, %rax
; NoVLX-NEXT: movq %rbp, %rsp
; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55497,10 +43538,227 @@ entry:
ret i64 %4
}
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load <8 x i64>, <8 x i64>* %__b
+ %1 = bitcast <8 x i64> %load to <8 x double>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, double* %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovq %k0, %rax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $64, %rsp
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r8d
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r9d
+; NoVLX-NEXT: kshiftrw $5, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %r10d
+; NoVLX-NEXT: kshiftrw $4, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %esi
+; NoVLX-NEXT: kshiftrw $3, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %edi
+; NoVLX-NEXT: kshiftrw $2, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %eax
+; NoVLX-NEXT: kshiftrw $1, %k0, %k1
+; NoVLX-NEXT: kmovw %k1, %ecx
+; NoVLX-NEXT: kmovw %k0, %edx
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $0, %edx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $3, %edi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $5, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT: vpinsrb $7, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT: shlq $32, %rcx
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: orq %rcx, %rax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %load = load double, double* %__b
+ %vec = insertelement <8 x double> undef, double %load, i32 0
+ %1 = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ %2 = fcmp oeq <8 x double> %0, %1
+ %3 = bitcast i8 %__u to <8 x i1>
+ %4 = and <8 x i1> %2, %3
+ %5 = shufflevector <8 x i1> %4, <8 x i1> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %6 = bitcast <64 x i1> %5 to i64
+ ret i64 %6
+}
+
+
define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
; VLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
-; VLX: # BB#0: # %entry
+; VLX: # %bb.0: # %entry
; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; VLX-NEXT: kmovd %k0, %eax
; VLX-NEXT: movzbl %al, %eax
@@ -55508,10 +43766,11 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_v64i1_sae_mask(<8 x i64> %__a, <8 x i64>
; VLX-NEXT: retq
;
; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_sae_mask:
-; NoVLX: # BB#0: # %entry
+; NoVLX: # %bb.0: # %entry
; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: vzeroupper
; NoVLX-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__a to <8 x double>
@@ -55521,5 +43780,120 @@ entry:
ret i64 %3
}
+define zeroext i64 @test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
+; VLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; VLX: # %bb.0: # %entry
+; VLX-NEXT: kmovd %edi, %k1
+; VLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: movzbl %al, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_sae_mask:
+; NoVLX: # %bb.0: # %entry
+; NoVLX-NEXT: kmovw %edi, %k1
+; NoVLX-NEXT: vcmplepd {sae}, %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: movzbl %al, %eax
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__a to <8 x double>
+ %1 = bitcast <8 x i64> %__b to <8 x double>
+ %2 = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %0, <8 x double> %1, i32 2, i8 %__u, i32 8)
+ %3 = zext i8 %2 to i64
+ ret i64 %3
+}
+; Test that we understand that cmpps with rounding zeros the upper bits of the mask register.
+define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
+; VLX-LABEL: test_cmpm_rnd_zero:
+; VLX: # %bb.0:
+; VLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: vzeroupper
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: test_cmpm_rnd_zero:
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: pushq %rbp
+; NoVLX-NEXT: .cfi_def_cfa_offset 16
+; NoVLX-NEXT: .cfi_offset %rbp, -16
+; NoVLX-NEXT: movq %rsp, %rbp
+; NoVLX-NEXT: .cfi_def_cfa_register %rbp
+; NoVLX-NEXT: andq $-32, %rsp
+; NoVLX-NEXT: subq $32, %rsp
+; NoVLX-NEXT: vcmpleps {sae}, %zmm1, %zmm0, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k0
+; NoVLX-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT: vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT: kmovw %k0, (%rsp)
+; NoVLX-NEXT: movl (%rsp), %eax
+; NoVLX-NEXT: movq %rbp, %rsp
+; NoVLX-NEXT: popq %rbp
+; NoVLX-NEXT: vzeroupper
+; NoVLX-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
+ %cast = bitcast i16 %res to <16 x i1>
+ %shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %cast2 = bitcast <32 x i1> %shuffle to i32
+ ret i32 %cast2
+}
+define i8 @mask_zero_lower(<4 x i32> %a) {
+; VLX-LABEL: mask_zero_lower:
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; VLX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
+; VLX-NEXT: kshiftlb $4, %k0, %k0
+; VLX-NEXT: kmovd %k0, %eax
+; VLX-NEXT: # kill: def %al killed %al killed %eax
+; VLX-NEXT: retq
+;
+; NoVLX-LABEL: mask_zero_lower:
+; NoVLX: # %bb.0:
+; NoVLX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k0
+; NoVLX-NEXT: vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k1
+; NoVLX-NEXT: kxorw %k0, %k0, %k2
+; NoVLX-NEXT: kshiftrw $4, %k2, %k3
+; NoVLX-NEXT: kxorw %k1, %k3, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $11, %k1, %k1
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftrw $5, %k1, %k2
+; NoVLX-NEXT: kxorw %k0, %k2, %k0
+; NoVLX-NEXT: kshiftlw $15, %k0, %k0
+; NoVLX-NEXT: kshiftrw $10, %k0, %k0
+; NoVLX-NEXT: kxorw %k1, %k0, %k0
+; NoVLX-NEXT: kshiftrw $6, %k0, %k1
+; NoVLX-NEXT: vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $9, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kshiftrw $7, %k0, %k1
+; NoVLX-NEXT: vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT: kmovw %eax, %k2
+; NoVLX-NEXT: kxorw %k2, %k1, %k1
+; NoVLX-NEXT: kshiftlw $15, %k1, %k1
+; NoVLX-NEXT: kshiftrw $8, %k1, %k1
+; NoVLX-NEXT: kxorw %k0, %k1, %k0
+; NoVLX-NEXT: kmovw %k0, %eax
+; NoVLX-NEXT: # kill: def %al killed %al killed %eax
+; NoVLX-NEXT: retq
+ %cmp = icmp ult <4 x i32> %a, zeroinitializer
+ %concat = shufflevector <4 x i1> %cmp, <4 x i1> zeroinitializer, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ %cast = bitcast <8 x i1> %concat to i8
+ ret i8 %cast
+}
diff --git a/test/CodeGen/X86/avx512vl-vec-test-testn.ll b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
new file mode 100644
index 000000000000..89791abdeea4
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vec-test-testn.ll
@@ -0,0 +1,440 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X86_64
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=I386
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_test_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_test_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_test_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_test_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp ne <4 x i32> %0, zeroinitializer
+ %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_test_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_test_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_test_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_test_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp ne <8 x i32> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_mask_test_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_mask_test_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_mask_test_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %3 = and <2 x i1> %0, %2
+ %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %5 = bitcast <8 x i1> %4 to i8
+ ret i8 %5
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_mask_test_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_mask_test_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_mask_test_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp ne <4 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = and <4 x i1> %1, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_mask_test_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_mask_test_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_mask_test_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = and <4 x i1> %0, %2
+ %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %5 = bitcast <8 x i1> %4 to i8
+ ret i8 %5
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_mask_test_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_mask_test_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_mask_test_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp ne <8 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_testn_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_testn_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_testn_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_testn_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp eq <4 x i32> %0, zeroinitializer
+ %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %3 = bitcast <8 x i1> %2 to i8
+ ret i8 %3
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_testn_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_testn_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_testn_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_testn_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp eq <8 x i32> %0, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ ret i8 %2
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_mask_testn_epi64_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_mask_testn_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_mask_testn_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %3 = and <2 x i1> %0, %2
+ %4 = shufflevector <2 x i1> %3, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
+ %5 = bitcast <8 x i1> %4 to i8
+ ret i8 %5
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm_mask_testn_epi32_mask(i8 %__U, <2 x i64> %__A, <2 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm_mask_testn_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm_mask_testn_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <2 x i64> %__B, %__A
+ %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
+ %1 = icmp eq <4 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %4 = and <4 x i1> %1, %3
+ %5 = shufflevector <4 x i1> %4, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %6 = bitcast <8 x i1> %5 to i8
+ ret i8 %6
+}
+
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_mask_testn_epi64_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_mask_testn_epi64_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = and <4 x i1> %0, %2
+ %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %5 = bitcast <8 x i1> %4 to i8
+ ret i8 %5
+}
+
+; Function Attrs: norecurse nounwind readnone
+define zeroext i8 @TEST_mm256_mask_testn_epi32_mask(i8 %__U, <4 x i64> %__A, <4 x i64> %__B) local_unnamed_addr #0 {
+; X86_64-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; X86_64: # %bb.0: # %entry
+; X86_64-NEXT: kmovw %edi, %k1
+; X86_64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
+; X86_64-NEXT: kmovw %k0, %eax
+; X86_64-NEXT: # kill: def %al killed %al killed %eax
+; X86_64-NEXT: vzeroupper
+; X86_64-NEXT: retq
+;
+; I386-LABEL: TEST_mm256_mask_testn_epi32_mask:
+; I386: # %bb.0: # %entry
+; I386-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; I386-NEXT: kmovw %eax, %k1
+; I386-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
+; I386-NEXT: kmovw %k0, %eax
+; I386-NEXT: # kill: def %al killed %al killed %eax
+; I386-NEXT: vzeroupper
+; I386-NEXT: retl
+entry:
+ %and.i.i = and <4 x i64> %__B, %__A
+ %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
+ %1 = icmp eq <8 x i32> %0, zeroinitializer
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = and <8 x i1> %1, %2
+ %4 = bitcast <8 x i1> %3 to i8
+ ret i8 %4
+}
+
diff --git a/test/CodeGen/X86/avx512vl-vpclmulqdq.ll b/test/CodeGen/X86/avx512vl-vpclmulqdq.ll
new file mode 100644
index 000000000000..777a70db5a84
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl-vpclmulqdq.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX512VL_VPCLMULQDQ
+
+define <2 x i64> @test_x86_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1) {
+; AVX512VL_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
+; AVX512VL_VPCLMULQDQ: # %bb.0:
+; AVX512VL_VPCLMULQDQ-NEXT: vpclmulqdq $1, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x44,0xc1,0x01]
+; AVX512VL_VPCLMULQDQ-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @test_x86_pclmulqdq_256(<4 x i64> %a0, <4 x i64> %a1) {
+; AVX512VL_VPCLMULQDQ-LABEL: test_x86_pclmulqdq_256:
+; AVX512VL_VPCLMULQDQ: # %bb.0:
+; AVX512VL_VPCLMULQDQ-NEXT: vpclmulqdq $16, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x44,0xc1,0x10]
+; AVX512VL_VPCLMULQDQ-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 16)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
new file mode 100644
index 000000000000..a098389f00cf
--- /dev/null
+++ b/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -0,0 +1,195 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vnni,+avx512vl| FileCheck %s
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusd_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpdpbusd (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpdpbusd %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpdpbusd %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusd_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpdpbusd (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpdpbusd %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpdpbusd %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusds_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpdpbusds (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpdpbusds %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpdpbusds %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusds_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpdpbusds (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpdpbusds %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpdpbusds %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpdpwssd (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpdpwssd (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
+
+declare <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>* %x2p, <8 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-NEXT: vpdpwssds (%rdi), %ymm1, %ymm3 {%k1}
+; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm4
+; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; CHECK-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: retq
+ %x2 = load <8 x i32>, <8 x i32>* %x2p
+ %res = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 -1)
+ %res2 = call <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4, i8 %x3)
+ %res3 = add <8 x i32> %res, %res1
+ %res4 = add <8 x i32> %res2, %res3
+ ret <8 x i32> %res4
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>* %x2p, <4 x i32> %x4, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm3
+; CHECK-NEXT: vpdpwssds (%rdi), %xmm1, %xmm3 {%k1}
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vpdpwssds %xmm2, %xmm1, %xmm4
+; CHECK-NEXT: vpdpwssds %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm4, %xmm0
+; CHECK-NEXT: vpaddd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: retq
+ %x2 = load <4 x i32>, <4 x i32>* %x2p
+ %res = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 -1)
+ %res2 = call <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4, i8 %x3)
+ %res3 = add <4 x i32> %res, %res1
+ %res4 = add <4 x i32> %res2, %res3
+ ret <4 x i32> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
new file mode 100644
index 000000000000..1f0efeefd328
--- /dev/null
+++ b/test/CodeGen/X86/avx512vlcd-intrinsics-fast-isel.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s
+
+define <2 x i64> @test_mm_broadcastmb_epi64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmb_epi64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %a to <4 x i32>
+ %1 = bitcast <2 x i64> %b to <4 x i32>
+ %2 = icmp eq <4 x i32> %0, %1
+ %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = bitcast <8 x i1> %3 to i8
+ %conv.i = zext i8 %4 to i64
+ %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0
+ %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %vecinit1.i.i
+}
+
+define <4 x i64> @test_mm256_broadcastmb_epi64(<4 x i64> %a, <4 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmb_epi64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = icmp eq <4 x i64> %a, %b
+ %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = bitcast <8 x i1> %1 to i8
+ %conv.i = zext i8 %2 to i64
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %vecinit3.i.i
+}
+
+define <2 x i64> @test_mm_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm_broadcastmw_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = bitcast <4 x i32> %vecinit3.i.i to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <4 x i64> @test_mm256_broadcastmw_epi32(<8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test_mm256_broadcastmw_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %a to <16 x i32>
+ %1 = bitcast <8 x i64> %b to <16 x i32>
+ %2 = icmp eq <16 x i32> %0, %1
+ %3 = bitcast <16 x i1> %2 to i16
+ %conv.i = zext i16 %3 to i32
+ %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ %4 = bitcast <8 x i32> %vecinit7.i.i to <4 x i64>
+ ret <4 x i64> %4
+}
+
+
diff --git a/test/CodeGen/X86/avx512vnni-intrinsics.ll b/test/CodeGen/X86/avx512vnni-intrinsics.ll
new file mode 100644
index 000000000000..0ee0ca0cde41
--- /dev/null
+++ b/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -0,0 +1,98 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vnni | FileCheck %s
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpbusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusd_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vpdpbusd (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vpdpbusd %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpdpbusd %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpbusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpbusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpbusds_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vpdpbusds (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vpdpbusds %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpdpbusds %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpbusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vpdpwssd (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vpdpwssds (%rdi), %zmm1, %zmm3 {%k1}
+; CHECK-NEXT: vmovaps %zmm0, %zmm4
+; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm4
+; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+ %x2 = load <16 x i32>, <16 x i32>* %x2p
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+ %res2 = call <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 %x3)
+ %res3 = add <16 x i32> %res, %res1
+ %res4 = add <16 x i32> %res2, %res3
+ ret <16 x i32> %res4
+}
+
diff --git a/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll
index 019c5282f63b..34330a19d829 100644
--- a/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll
@@ -10,13 +10,13 @@
define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) {
; X86_64-LABEL: test_mask_vpopcnt_d:
-; X86_64: # BB#0:
+; X86_64: # %bb.0:
; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X86_64-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1]
; X86_64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_mask_vpopcnt_d:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
@@ -28,13 +28,13 @@ define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b)
define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) {
; X86_64-LABEL: test_maskz_vpopcnt_d:
-; X86_64: # BB#0:
+; X86_64: # %bb.0:
; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X86_64-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0]
; X86_64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_maskz_vpopcnt_d:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
@@ -46,14 +46,14 @@ define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) {
define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
; X86_64-LABEL: test_mask_vpopcnt_q:
-; X86_64: # BB#0:
+; X86_64: # %bb.0:
; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X86_64-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8]
; X86_64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
; X86_64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_mask_vpopcnt_q:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8]
@@ -67,13 +67,13 @@ define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) {
; X86_64-LABEL: test_maskz_vpopcnt_q:
-; X86_64: # BB#0:
+; X86_64: # %bb.0:
; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; X86_64-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0]
; X86_64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_maskz_vpopcnt_q:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
; X86-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0]
diff --git a/test/CodeGen/X86/avx512vpopcntdq-schedule.ll b/test/CodeGen/X86/avx512vpopcntdq-schedule.ll
new file mode 100644
index 000000000000..d44038ac92f1
--- /dev/null
+++ b/test/CodeGen/X86/avx512vpopcntdq-schedule.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=icelake | FileCheck %s --check-prefix=ICELAKE
+
+define void @test_vpopcntd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16 %a3) {
+; GENERIC-LABEL: test_vpopcntd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovw %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ICELAKE-LABEL: test_vpopcntd:
+; ICELAKE: # %bb.0:
+; ICELAKE-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; ICELAKE-NEXT: #APP
+; ICELAKE-NEXT: vpopcntd %zmm1, %zmm0 # sched: [1:0.50]
+; ICELAKE-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # sched: [1:0.50]
+; ICELAKE-NEXT: vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
+; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [6:0.50]
+; ICELAKE-NEXT: #NO_APP
+; ICELAKE-NEXT: vzeroupper # sched: [4:1.00]
+; ICELAKE-NEXT: retq # sched: [7:1.00]
+ tail call void asm "vpopcntd $1, $0 \0A\09 vpopcntd $1, $0 {$3} \0A\09 vpopcntd $1, $0 {$3} {z} \0A\09 vpopcntd $2, $0 \0A\09 vpopcntd $2, $0 {$3} \0A\09 vpopcntd $2, $0 {$3} {z} \0A\09 vpopcntd $2{1to16}, $0 \0A\09 vpopcntd $2{1to16}, $0 {$3} \0A\09 vpopcntd $2{1to16}, $0 {$3} {z}", "v,v,*m,^Yk"(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16 %a3) nounwind
+ ret void
+}
+
+define void @test_vpopcntq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3) {
+; GENERIC-LABEL: test_vpopcntq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: kmovw %esi, %k1 # sched: [1:0.33]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 {%k1} # sched: [3:1.00]
+; GENERIC-NEXT: vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [7:1.00]
+; GENERIC-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ICELAKE-LABEL: test_vpopcntq:
+; ICELAKE: # %bb.0:
+; ICELAKE-NEXT: kmovd %esi, %k1 # sched: [1:1.00]
+; ICELAKE-NEXT: #APP
+; ICELAKE-NEXT: vpopcntq %zmm1, %zmm0 # sched: [1:0.50]
+; ICELAKE-NEXT: vpopcntq %zmm1, %zmm0 {%k1} # sched: [1:0.50]
+; ICELAKE-NEXT: vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
+; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 {%k1} # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [6:0.50]
+; ICELAKE-NEXT: vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [6:0.50]
+; ICELAKE-NEXT: #NO_APP
+; ICELAKE-NEXT: vzeroupper # sched: [4:1.00]
+; ICELAKE-NEXT: retq # sched: [7:1.00]
+ tail call void asm "vpopcntq $1, $0 \0A\09 vpopcntq $1, $0 {$3} \0A\09 vpopcntq $1, $0 {$3} {z} \0A\09 vpopcntq $2, $0 \0A\09 vpopcntq $2, $0 {$3} \0A\09 vpopcntq $2, $0 {$3} {z} \0A\09 vpopcntq $2{1to8}, $0 \0A\09 vpopcntq $2{1to8}, $0 {$3} \0A\09 vpopcntq $2{1to8}, $0 {$3} {z}", "v,v,*m,^Yk"(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/barrier.ll b/test/CodeGen/X86/barrier.ll
index 1f60131f33ca..9031a0eb0129 100644
--- a/test/CodeGen/X86/barrier.ll
+++ b/test/CodeGen/X86/barrier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=-sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse2 | FileCheck %s
define void @test() {
; CHECK: lock
diff --git a/test/CodeGen/X86/base-pointer-and-cmpxchg.ll b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
index 8de6d64428e3..a79509b039ac 100644
--- a/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
+++ b/test/CodeGen/X86/base-pointer-and-cmpxchg.ll
@@ -19,23 +19,23 @@
; USE_BASE_64: movq %rsp, %rbx
; USE_BASE_32: movl %esp, %ebx
;
-; Make sure the base pointer is saved before the RBX argument for
+; Make sure the base pointer is saved before the rbx argument for
; cmpxchg16b is set.
;
-; Because of how the test is written, we spill SAVE_RBX.
+; Because of how the test is written, we spill SAVE_rbx.
; However, it would have been perfectly fine to just keep it in register.
-; USE_BASE: movq %rbx, [[SAVE_RBX_SLOT:[0-9]*\(%[er]bx\)]]
+; USE_BASE: movq %rbx, [[SAVE_rbx_SLOT:[0-9]*\(%[er]bx\)]]
;
-; SAVE_RBX must be in register before we clobber rbx.
+; SAVE_rbx must be in register before we clobber rbx.
; It is fine to use any register but rbx and the ones defined and use
; by cmpxchg. Since such regex would be complicated to write, just stick
; to the numbered registers. The bottom line is: if this test case fails
; because of that regex, this is likely just the regex being too conservative.
-; USE_BASE: movq [[SAVE_RBX_SLOT]], [[SAVE_RBX:%r[0-9]+]]
+; USE_BASE: movq [[SAVE_rbx_SLOT]], [[SAVE_rbx:%r[0-9]+]]
;
; USE_BASE: movq {{[^ ]+}}, %rbx
; USE_BASE-NEXT: cmpxchg16b
-; USE_BASE-NEXT: movq [[SAVE_RBX]], %rbx
+; USE_BASE-NEXT: movq [[SAVE_rbx]], %rbx
;
; DONT_USE_BASE-NOT: movq %rsp, %rbx
; DONT_USE_BASE-NOT: movl %esp, %ebx
diff --git a/test/CodeGen/X86/basic-promote-integers.ll b/test/CodeGen/X86/basic-promote-integers.ll
index fce6b7f5565c..739727100a79 100644
--- a/test/CodeGen/X86/basic-promote-integers.ll
+++ b/test/CodeGen/X86/basic-promote-integers.ll
@@ -1,7 +1,7 @@
; Test that vectors are scalarized/lowered correctly
; (with both legalization methods).
-; RUN: llc -march=x86 < %s
-; RUN: llc -march=x86 < %s
+; RUN: llc -mtriple=i686-- < %s
+; RUN: llc -mtriple=i686-- < %s
; A simple test to check copyToParts and copyFromParts.
diff --git a/test/CodeGen/X86/bc-extract.ll b/test/CodeGen/X86/bc-extract.ll
index b43c70e303a1..506ba906800a 100644
--- a/test/CodeGen/X86/bc-extract.ll
+++ b/test/CodeGen/X86/bc-extract.ll
@@ -4,12 +4,12 @@
define float @extractFloat1() nounwind {
; X32-LABEL: extractFloat1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: fld1
; X32-NEXT: retl
;
; X64-LABEL: extractFloat1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: retq
entry:
@@ -20,12 +20,12 @@ entry:
define float @extractFloat2() nounwind {
; X32-LABEL: extractFloat2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: fldz
; X32-NEXT: retl
;
; X64-LABEL: extractFloat2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
entry:
@@ -36,12 +36,12 @@ entry:
define i32 @extractInt2() nounwind {
; X32-LABEL: extractInt2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractInt2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/bigstructret.ll b/test/CodeGen/X86/bigstructret.ll
index 3c499fae820f..d4db764c6800 100644
--- a/test/CodeGen/X86/bigstructret.ll
+++ b/test/CodeGen/X86/bigstructret.ll
@@ -1,15 +1,28 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
%0 = type { i32, i32, i32, i32 }
%1 = type { i1, i1, i1, i32 }
-; CHECK: ReturnBigStruct
-; CHECK: movl $24601, 12(%ecx)
-; CHECK: movl $48, 8(%ecx)
-; CHECK: movl $24, 4(%ecx)
-; CHECK: movl $12, (%ecx)
-
define fastcc %0 @ReturnBigStruct() nounwind readnone {
+; X86-LABEL: ReturnBigStruct:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $24601, 12(%ecx) # imm = 0x6019
+; X86-NEXT: movl $48, 8(%ecx)
+; X86-NEXT: movl $24, 4(%ecx)
+; X86-NEXT: movl $12, (%ecx)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: ReturnBigStruct:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movabsq $105660490448944, %rax # imm = 0x601900000030
+; X64-NEXT: movq %rax, 8(%rdi)
+; X64-NEXT: movabsq $103079215116, %rax # imm = 0x180000000C
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
entry:
%0 = insertvalue %0 zeroinitializer, i32 12, 0
%1 = insertvalue %0 %0, i32 24, 1
@@ -18,13 +31,23 @@ entry:
ret %0 %3
}
-; CHECK: ReturnBigStruct2
-; CHECK: movl $48, 4(%ecx)
-; CHECK: movb $1, 2(%ecx)
-; CHECK: movb $1, 1(%ecx)
-; CHECK: movb $0, (%ecx)
define fastcc %1 @ReturnBigStruct2() nounwind readnone {
+; X86-LABEL: ReturnBigStruct2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $48, 4(%ecx)
+; X86-NEXT: movb $1, 2(%ecx)
+; X86-NEXT: movw $256, (%ecx) # imm = 0x100
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: ReturnBigStruct2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl $48, 4(%rdi)
+; X64-NEXT: movb $1, 2(%rdi)
+; X64-NEXT: movw $256, (%rdi) # imm = 0x100
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
entry:
%0 = insertvalue %1 zeroinitializer, i1 false, 0
%1 = insertvalue %1 %0, i1 true, 1
diff --git a/test/CodeGen/X86/bigstructret2.ll b/test/CodeGen/X86/bigstructret2.ll
index 6a79139d9bcf..be41cd335f6c 100644
--- a/test/CodeGen/X86/bigstructret2.ll
+++ b/test/CodeGen/X86/bigstructret2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-pc-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
; CHECK: .cfi_startproc
; CHECK: .cfi_def_cfa_offset 8
diff --git a/test/CodeGen/X86/bit-test-shift.ll b/test/CodeGen/X86/bit-test-shift.ll
index 7497613f2565..8970db4027ee 100644
--- a/test/CodeGen/X86/bit-test-shift.ll
+++ b/test/CodeGen/X86/bit-test-shift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; <rdar://problem/8285015>
define i32 @x(i32 %t) nounwind readnone ssp {
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 1d78ee26a0b9..2276e5634537 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -3,47 +3,52 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512 --check-prefixes=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 --check-prefixes=AVX512BW
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
-; SSE2-LABEL: v8i16:
-; SSE2: # BB#0:
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i16:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8i16:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <8 x i16> %a, %b
%x1 = icmp sgt <8 x i16> %c, %d
%y = and <8 x i1> %x0, %x1
@@ -53,31 +58,40 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE2-SSSE3-LABEL: v4i32:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i32> %a, %b
%x1 = icmp sgt <4 x i32> %c, %d
%y = and <4 x i1> %x0, %x1
@@ -87,31 +101,40 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
; SSE2-SSSE3-LABEL: v4f32:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltps %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andps %xmm1, %xmm3
; SSE2-SSSE3-NEXT: movmskps %xmm3, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltps %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1
-; AVX512-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; AVX512F-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; AVX512BW-NEXT: vcmpltps %xmm2, %xmm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = fcmp ogt <4 x float> %a, %b
%x1 = fcmp ogt <4 x float> %c, %d
%y = and <4 x i1> %x0, %x1
@@ -121,30 +144,45 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d)
define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; SSE2-SSSE3-LABEL: v16i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %ax killed %ax killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %ax killed %ax killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtb %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtb %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <16 x i8> %a, %b
%x1 = icmp sgt <16 x i8> %c, %d
%y = and <16 x i1> %x0, %x1
@@ -154,7 +192,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: psllq $56, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSE2-SSSE3-NEXT: psrad $31, %xmm4
@@ -206,11 +244,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $24, %xmm3, %xmm3
@@ -235,11 +273,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $56, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpsrad $24, %xmm3, %xmm3
@@ -264,25 +302,42 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v2i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $56, %xmm3, %xmm3
-; AVX512-NEXT: vpsraq $56, %xmm3, %xmm3
-; AVX512-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512-NEXT: vpsraq $56, %xmm2, %xmm2
-; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllq $56, %xmm3, %xmm3
+; AVX512F-NEXT: vpsraq $56, %xmm3, %xmm3
+; AVX512F-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512F-NEXT: vpsraq $56, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $56, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsraq $56, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsllq $56, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsraq $56, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i8> %a, %b
%x1 = icmp sgt <2 x i8> %c, %d
%y = and <2 x i1> %x0, %x1
@@ -292,7 +347,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-LABEL: v2i16:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: psllq $48, %xmm2
; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm4
; SSE2-SSSE3-NEXT: psrad $31, %xmm4
@@ -344,11 +399,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpsrad $16, %xmm3, %xmm3
@@ -373,11 +428,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $48, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpsrad $16, %xmm3, %xmm3
@@ -402,25 +457,42 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v2i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $48, %xmm3, %xmm3
-; AVX512-NEXT: vpsraq $48, %xmm3, %xmm3
-; AVX512-NEXT: vpsllq $48, %xmm2, %xmm2
-; AVX512-NEXT: vpsraq $48, %xmm2, %xmm2
-; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllq $48, %xmm3, %xmm3
+; AVX512F-NEXT: vpsraq $48, %xmm3, %xmm3
+; AVX512F-NEXT: vpsllq $48, %xmm2, %xmm2
+; AVX512F-NEXT: vpsraq $48, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $48, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsraq $48, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsllq $48, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsraq $48, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i16> %a, %b
%x1 = icmp sgt <2 x i16> %c, %d
%y = and <2 x i1> %x0, %x1
@@ -430,7 +502,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-LABEL: v2i32:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: psllq $32, %xmm2
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -474,11 +546,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -499,11 +571,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3
; AVX2-NEXT: vpsrad $31, %xmm3, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -524,25 +596,42 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v2i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX512-NEXT: vpsraq $32, %xmm3, %xmm3
-; AVX512-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512-NEXT: vpsraq $32, %xmm2, %xmm2
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512F-NEXT: vpsraq $32, %xmm3, %xmm3
+; AVX512F-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512F-NEXT: vpsraq $32, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $32, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsraq $32, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsraq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i32> %a, %b
%x1 = icmp sgt <2 x i32> %c, %d
%y = and <2 x i1> %x0, %x1
@@ -552,7 +641,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-LABEL: v2i64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm0
@@ -576,26 +665,35 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskpd %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <2 x i64> %a, %b
%x1 = icmp sgt <2 x i64> %c, %d
%y = and <2 x i1> %x0, %x1
@@ -605,31 +703,40 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
; SSE2-SSSE3-LABEL: v2f64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: cmpltpd %xmm2, %xmm3
; SSE2-SSSE3-NEXT: andpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: movmskpd %xmm3, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vcmpltpd %xmm2, %xmm3, %xmm1
; AVX12-NEXT: vandpd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v2f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
-; AVX512-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k1
+; AVX512F-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k1
+; AVX512BW-NEXT: vcmpltpd %xmm2, %xmm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = fcmp ogt <2 x double> %a, %b
%x1 = fcmp ogt <2 x double> %c, %d
%y = and <2 x i1> %x0, %x1
@@ -639,7 +746,7 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double>
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-LABEL: v4i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pslld $24, %xmm3
; SSE2-SSSE3-NEXT: psrad $24, %xmm3
; SSE2-SSSE3-NEXT: pslld $24, %xmm2
@@ -652,11 +759,11 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpslld $24, %xmm3, %xmm3
; AVX12-NEXT: vpsrad $24, %xmm3, %xmm3
; AVX12-NEXT: vpslld $24, %xmm2, %xmm2
@@ -669,25 +776,42 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512-NEXT: vpsrad $24, %xmm3, %xmm3
-; AVX512-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX512-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpslld $24, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrad $24, %xmm3, %xmm3
+; AVX512F-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX512F-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512F-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpslld $24, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrad $24, %xmm3, %xmm3
+; AVX512BW-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX512BW-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i8> %a, %b
%x1 = icmp sgt <4 x i8> %c, %d
%y = and <4 x i1> %x0, %x1
@@ -697,7 +821,7 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-LABEL: v4i16:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pslld $16, %xmm3
; SSE2-SSSE3-NEXT: psrad $16, %xmm3
; SSE2-SSSE3-NEXT: pslld $16, %xmm2
@@ -710,11 +834,11 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpslld $16, %xmm3, %xmm3
; AVX12-NEXT: vpsrad $16, %xmm3, %xmm3
; AVX12-NEXT: vpslld $16, %xmm2, %xmm2
@@ -727,25 +851,42 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $16, %xmm3, %xmm3
-; AVX512-NEXT: vpsrad $16, %xmm3, %xmm3
-; AVX512-NEXT: vpslld $16, %xmm2, %xmm2
-; AVX512-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX512-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpslld $16, %xmm3, %xmm3
+; AVX512F-NEXT: vpsrad $16, %xmm3, %xmm3
+; AVX512F-NEXT: vpslld $16, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX512F-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512F-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512F-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpslld $16, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrad $16, %xmm3, %xmm3
+; AVX512BW-NEXT: vpslld $16, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i16> %a, %b
%x1 = icmp sgt <4 x i16> %c, %d
%y = and <4 x i1> %x0, %x1
@@ -754,45 +895,26 @@ define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
}
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
-; SSE2-LABEL: v8i8:
-; SSE2: # BB#0:
-; SSE2-NEXT: psllw $8, %xmm3
-; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: psllw $8, %xmm2
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i8:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: psllw $8, %xmm3
-; SSSE3-NEXT: psraw $8, %xmm3
-; SSSE3-NEXT: psllw $8, %xmm2
-; SSSE3-NEXT: psraw $8, %xmm2
-; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
-; SSSE3-NEXT: psllw $8, %xmm1
-; SSSE3-NEXT: psraw $8, %xmm1
-; SSSE3-NEXT: psllw $8, %xmm0
-; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: psllw $8, %xmm3
+; SSE2-SSSE3-NEXT: psraw $8, %xmm3
+; SSE2-SSSE3-NEXT: psllw $8, %xmm2
+; SSE2-SSSE3-NEXT: psraw $8, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: psllw $8, %xmm1
+; SSE2-SSSE3-NEXT: psraw $8, %xmm1
+; SSE2-SSSE3-NEXT: psllw $8, %xmm0
+; SSE2-SSSE3-NEXT: psraw $8, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsraw $8, %xmm3, %xmm3
; AVX12-NEXT: vpsllw $8, %xmm2, %xmm2
@@ -804,26 +926,49 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v8i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX512-NEXT: vpsraw $8, %xmm3, %xmm3
-; AVX512-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
-; AVX512-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $8, %xmm3, %xmm3
+; AVX512F-NEXT: vpsraw $8, %xmm3, %xmm3
+; AVX512F-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512F-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $8, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsraw $8, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsraw $8, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k1
+; AVX512BW-NEXT: vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <8 x i8> %a, %b
%x1 = icmp sgt <8 x i8> %c, %d
%y = and <8 x i1> %x0, %x1
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 95529686a58a..fdce65516e32 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -3,11 +3,12 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-LABEL: v4i64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm1
@@ -30,8 +31,6 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm0
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm7
; SSE2-SSSE3-NEXT: pxor %xmm8, %xmm5
; SSE2-SSSE3-NEXT: movdqa %xmm5, %xmm1
@@ -53,54 +52,62 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm4, %xmm2
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: andps %xmm0, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskps %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
-; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512F-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512BW-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <4 x i64> %a, %b
%x1 = icmp sgt <4 x i64> %c, %d
%y = and <4 x i1> %x0, %x1
@@ -110,45 +117,51 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
; SSE2-SSSE3-LABEL: v4f64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
; SSE2-SSSE3-NEXT: cmpltpd %xmm5, %xmm7
; SSE2-SSSE3-NEXT: cmpltpd %xmm4, %xmm6
; SSE2-SSSE3-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm6
-; SSE2-SSSE3-NEXT: psrad $31, %xmm6
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
+; SSE2-SSSE3-NEXT: andps %xmm2, %xmm6
; SSE2-SSSE3-NEXT: movmskps %xmm6, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vcmpltpd %ymm2, %ymm3, %ymm1
; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
-; AVX512-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k1
+; AVX512F-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k1
+; AVX512BW-NEXT: vcmpltpd %ymm2, %ymm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x0 = fcmp ogt <4 x double> %a, %b
%x1 = fcmp ogt <4 x double> %c, %d
%y = and <4 x i1> %x0, %x1
@@ -157,62 +170,21 @@ define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double>
}
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
-; SSE2-LABEL: v16i16:
-; SSE2: # BB#0:
-; SSE2-NEXT: pcmpgtw %xmm3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: psllw $7, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm7, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
-; SSE2-NEXT: psllw $7, %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v16i16:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm3, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: psllw $7, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSSE3-NEXT: pand %xmm8, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtw %xmm7, %xmm5
-; SSSE3-NEXT: pshufb %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4
-; SSSE3-NEXT: pshufb %xmm3, %xmm4
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSSE3-NEXT: psllw $7, %xmm4
-; SSSE3-NEXT: pand %xmm8, %xmm4
-; SSSE3-NEXT: pcmpgtb %xmm4, %xmm2
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v16i16:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm7, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: packsswb %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
+; SSE2-SSSE3-NEXT: # kill: def %ax killed %ax killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
@@ -225,12 +197,12 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -239,18 +211,33 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k1
-; AVX512-NEXT: vpcmpgtw %ymm3, %ymm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %k1
+; AVX512BW-NEXT: vpcmpgtw %ymm3, %ymm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <16 x i16> %a, %b
%x1 = icmp sgt <16 x i16> %c, %d
%y = and <16 x i1> %x0, %x1
@@ -259,102 +246,71 @@ define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
}
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
-; SSE2-LABEL: v8i32:
-; SSE2: # BB#0:
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm3, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: psllw $15, %xmm0
-; SSSE3-NEXT: psraw $15, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
-; SSSE3-NEXT: pshufb %xmm3, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT: pshufb %xmm3, %xmm4
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSSE3-NEXT: psllw $15, %xmm4
-; SSSE3-NEXT: psraw $15, %xmm4
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm4, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8i32:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-SSSE3-NEXT: packssdw %xmm5, %xmm4
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm4
+; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
-; AVX512-NEXT: vpcmpgtd %ymm3, %ymm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512F-NEXT: vpcmpgtd %ymm3, %ymm2, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512BW-NEXT: vpcmpgtd %ymm3, %ymm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <8 x i32> %a, %b
%x1 = icmp sgt <8 x i32> %c, %d
%y = and <8 x i1> %x0, %x1
@@ -363,83 +319,52 @@ define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
}
define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
-; SSE2-LABEL: v8f32:
-; SSE2: # BB#0:
-; SSE2-NEXT: cmpltps %xmm1, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: cmpltps %xmm0, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: cmpltps %xmm5, %xmm7
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: cmpltps %xmm4, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8f32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: cmpltps %xmm1, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm1, %xmm3
-; SSSE3-NEXT: cmpltps %xmm0, %xmm2
-; SSSE3-NEXT: pshufb %xmm1, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSSE3-NEXT: psllw $15, %xmm2
-; SSSE3-NEXT: psraw $15, %xmm2
-; SSSE3-NEXT: cmpltps %xmm5, %xmm7
-; SSSE3-NEXT: pshufb %xmm1, %xmm7
-; SSSE3-NEXT: cmpltps %xmm4, %xmm6
-; SSSE3-NEXT: pshufb %xmm1, %xmm6
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSSE3-NEXT: psllw $15, %xmm6
-; SSSE3-NEXT: psraw $15, %xmm6
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm6, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8f32:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: cmpltps %xmm5, %xmm7
+; SSE2-SSSE3-NEXT: cmpltps %xmm4, %xmm6
+; SSE2-SSSE3-NEXT: packssdw %xmm7, %xmm6
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm6
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm6
+; SSE2-SSSE3-NEXT: pmovmskb %xmm6, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v8f32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vcmpltps %ymm2, %ymm3, %ymm1
; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v8f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1
-; AVX512-NEXT: vcmpltps %ymm2, %ymm3, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltps %ymm0, %ymm1, %k1
+; AVX512F-NEXT: vcmpltps %ymm2, %ymm3, %k0 {%k1}
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltps %ymm0, %ymm1, %k1
+; AVX512BW-NEXT: vcmpltps %ymm2, %ymm3, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x0 = fcmp ogt <8 x float> %a, %b
%x1 = fcmp ogt <8 x float> %c, %d
%y = and <8 x i1> %x0, %x1
@@ -449,245 +374,40 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d)
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; SSE2-SSSE3-LABEL: v32i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtb %xmm6, %xmm4
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm4
; SSE2-SSSE3-NEXT: pcmpgtb %xmm7, %xmm5
; SSE2-SSSE3-NEXT: pand %xmm1, %xmm5
-; SSE2-SSSE3-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT: shll $16, %ecx
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm4, %ecx
+; SSE2-SSSE3-NEXT: pmovmskb %xmm5, %eax
+; SSE2-SSSE3-NEXT: shll $16, %eax
; SSE2-SSSE3-NEXT: orl %ecx, %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v32i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $32, %rsp
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: movl (%rsp), %eax
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -695,13 +415,40 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
-; AVX512-NEXT: vpcmpgtb %ymm3, %ymm2, %k0 {%k1}
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v32i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm1
+; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %eax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtb %ymm1, %ymm0, %k1
+; AVX512BW-NEXT: vpcmpgtb %ymm3, %ymm2, %k0 {%k1}
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x0 = icmp sgt <32 x i8> %a, %b
%x1 = icmp sgt <32 x i8> %c, %d
%y = and <32 x i1> %x0, %x1
diff --git a/test/CodeGen/X86/bitcast-and-setcc-512.ll b/test/CodeGen/X86/bitcast-and-setcc-512.ll
index 2eba79b0297f..dfda374aa52f 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -7,135 +7,110 @@
define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
; SSE-LABEL: v8i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: pcmpgtq %xmm7, %xmm3
; SSE-NEXT: pcmpgtq %xmm6, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE-NEXT: pslld $31, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE-NEXT: pshufb %xmm3, %xmm2
; SSE-NEXT: pcmpgtq %xmm5, %xmm1
; SSE-NEXT: pcmpgtq %xmm4, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: pslld $31, %xmm0
-; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: pshufb %xmm3, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: psllw $15, %xmm0
-; SSE-NEXT: psraw $15, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
-; SSE-NEXT: pslld $31, %xmm9
-; SSE-NEXT: psrad $31, %xmm9
-; SSE-NEXT: pshufb %xmm3, %xmm9
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm11[0,2]
+; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2]
-; SSE-NEXT: pslld $31, %xmm8
-; SSE-NEXT: psrad $31, %xmm8
-; SSE-NEXT: pshufb %xmm3, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE-NEXT: psllw $15, %xmm8
-; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[0,2]
+; SSE-NEXT: packssdw %xmm10, %xmm8
; SSE-NEXT: pand %xmm0, %xmm8
-; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: packsswb %xmm0, %xmm8
; SSE-NEXT: pmovmskb %xmm8, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0]
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm2
-; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm3
-; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpshufb %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT: vpcmpgtq %ymm7, %ymm5, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm6, %ymm4, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpacksswb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vpsllw $15, %xmm1, %xmm1
-; AVX2-NEXT: vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v8i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v8i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x0 = icmp sgt <8 x i64> %a, %b
@@ -147,95 +122,74 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d) {
; SSE-LABEL: v8f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: cmpltpd %xmm3, %xmm7
; SSE-NEXT: cmpltpd %xmm2, %xmm6
; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
-; SSE-NEXT: pslld $31, %xmm6
-; SSE-NEXT: psrad $31, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE-NEXT: pshufb %xmm2, %xmm6
; SSE-NEXT: cmpltpd %xmm1, %xmm5
; SSE-NEXT: cmpltpd %xmm0, %xmm4
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; SSE-NEXT: pslld $31, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufb %xmm2, %xmm4
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE-NEXT: psllw $15, %xmm4
-; SSE-NEXT: psraw $15, %xmm4
+; SSE-NEXT: packssdw %xmm6, %xmm4
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
-; SSE-NEXT: pslld $31, %xmm9
-; SSE-NEXT: psrad $31, %xmm9
-; SSE-NEXT: pshufb %xmm2, %xmm9
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm11[0,2]
+; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm10[0,2]
-; SSE-NEXT: pslld $31, %xmm8
-; SSE-NEXT: psrad $31, %xmm8
-; SSE-NEXT: pshufb %xmm2, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE-NEXT: psllw $15, %xmm8
-; SSE-NEXT: psraw $15, %xmm8
+; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[0,2]
+; SSE-NEXT: packssdw %xmm10, %xmm8
; SSE-NEXT: pand %xmm4, %xmm8
-; SSE-NEXT: pshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: packsswb %xmm0, %xmm8
; SSE-NEXT: pmovmskb %xmm8, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX12-LABEL: v8f64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX12-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX12-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX12-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX12-NEXT: vcmpltpd %ymm5, %ymm7, %ymm1
; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX12-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX12-NEXT: vcmpltpd %ymm4, %ymm6, %ymm2
; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX12-NEXT: vpacksswb %xmm4, %xmm2, %xmm2
+; AVX12-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
; AVX12-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX12-NEXT: vpsllw $15, %xmm1, %xmm1
-; AVX12-NEXT: vpsraw $15, %xmm1, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
; AVX512F-LABEL: v8f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512F-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v8f64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vcmpltpd %zmm2, %zmm3, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x0 = fcmp ogt <8 x double> %a, %b
@@ -247,148 +201,33 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double>
define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; SSE-LABEL: v32i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: pcmpgtw %xmm5, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE-NEXT: pshufb %xmm5, %xmm1
; SSE-NEXT: pcmpgtw %xmm4, %xmm0
-; SSE-NEXT: pshufb %xmm5, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: pcmpgtw %xmm7, %xmm3
-; SSE-NEXT: pshufb %xmm5, %xmm3
; SSE-NEXT: pcmpgtw %xmm6, %xmm2
-; SSE-NEXT: pshufb %xmm5, %xmm2
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: packsswb %xmm3, %xmm2
; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: pshufb %xmm5, %xmm11
-; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: pshufb %xmm5, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm11[0]
-; SSE-NEXT: pand %xmm0, %xmm8
; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm10
-; SSE-NEXT: pshufb %xmm5, %xmm10
+; SSE-NEXT: packsswb %xmm11, %xmm10
+; SSE-NEXT: pand %xmm0, %xmm10
; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: pshufb %xmm5, %xmm9
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
-; SSE-NEXT: pand %xmm2, %xmm9
-; SSE-NEXT: pextrb $15, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $14, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $13, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $12, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $11, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $10, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $9, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $8, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $7, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $6, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $5, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $4, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $3, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $2, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $1, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $0, %xmm9, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $15, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $14, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $13, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $12, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $11, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $10, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $9, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $8, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $7, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $6, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $5, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $4, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $3, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $2, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $1, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $0, %xmm8, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: shll $16, %ecx
-; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm8
+; SSE-NEXT: packsswb %xmm9, %xmm8
+; SSE-NEXT: pand %xmm2, %xmm8
+; SSE-NEXT: pmovmskb %xmm10, %ecx
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: shll $16, %eax
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v32i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $32, %rsp
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX1-NEXT: vpcmpgtw %xmm8, %xmm9, %xmm8
@@ -399,124 +238,27 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtw %xmm7, %xmm5, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpand %xmm1, %xmm8, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm6, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrb $15, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm1, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: movl (%rsp), %eax
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
@@ -532,298 +274,33 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: .Lcfi0:
; AVX512F-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-NEXT: .Lcfi1:
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: .Lcfi2:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
-; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm1
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm4, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm5, %ymm2
+; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
@@ -839,7 +316,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
@@ -854,154 +331,97 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
; SSE-LABEL: v16i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: pcmpgtd %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE-NEXT: pshufb %xmm7, %xmm3
; SSE-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE-NEXT: pshufb %xmm7, %xmm2
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: psllw $15, %xmm2
-; SSE-NEXT: psraw $15, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE-NEXT: pshufb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE-NEXT: pshufb %xmm7, %xmm1
; SSE-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE-NEXT: pshufb %xmm7, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: psllw $15, %xmm0
-; SSE-NEXT: psraw $15, %xmm0
-; SSE-NEXT: pshufb %xmm3, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: psllw $7, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pcmpgtb %xmm0, %xmm4
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: pshufb %xmm7, %xmm11
-; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: pshufb %xmm7, %xmm9
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; SSE-NEXT: psllw $15, %xmm9
-; SSE-NEXT: psraw $15, %xmm9
-; SSE-NEXT: pshufb %xmm3, %xmm9
; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm10
-; SSE-NEXT: pshufb %xmm7, %xmm10
+; SSE-NEXT: packssdw %xmm11, %xmm10
+; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: pshufb %xmm7, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; SSE-NEXT: psllw $15, %xmm8
-; SSE-NEXT: psraw $15, %xmm8
-; SSE-NEXT: pshufb %xmm3, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE-NEXT: psllw $7, %xmm8
-; SSE-NEXT: pand %xmm2, %xmm8
-; SSE-NEXT: pcmpgtb %xmm8, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pmovmskb %xmm1, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: packssdw %xmm9, %xmm8
+; SSE-NEXT: packsswb %xmm10, %xmm8
+; SSE-NEXT: pand %xmm0, %xmm8
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm8, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm9
+; AVX1-NEXT: vpackssdw %xmm8, %xmm1, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm9[0]
-; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT: vpand %xmm9, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm3
-; AVX1-NEXT: vpacksswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm4, %xmm3
+; AVX1-NEXT: vpackssdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; AVX2-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX2-NEXT: vpand %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtd %ymm7, %ymm5, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v16i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v16i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpcmpgtd %zmm3, %zmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x0 = icmp sgt <16 x i32> %a, %b
@@ -1013,111 +433,67 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) {
define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d) {
; SSE-LABEL: v16f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: cmpltps %xmm3, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE-NEXT: pshufb %xmm3, %xmm7
; SSE-NEXT: cmpltps %xmm2, %xmm6
-; SSE-NEXT: pshufb %xmm3, %xmm6
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSE-NEXT: psllw $15, %xmm6
-; SSE-NEXT: psraw $15, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE-NEXT: pshufb %xmm2, %xmm6
+; SSE-NEXT: packssdw %xmm7, %xmm6
; SSE-NEXT: cmpltps %xmm1, %xmm5
-; SSE-NEXT: pshufb %xmm3, %xmm5
; SSE-NEXT: cmpltps %xmm0, %xmm4
-; SSE-NEXT: pshufb %xmm3, %xmm4
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE-NEXT: psllw $15, %xmm4
-; SSE-NEXT: psraw $15, %xmm4
-; SSE-NEXT: pshufb %xmm2, %xmm4
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE-NEXT: psllw $7, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSE-NEXT: pand %xmm1, %xmm4
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm5
-; SSE-NEXT: pcmpgtb %xmm4, %xmm5
+; SSE-NEXT: packssdw %xmm5, %xmm4
+; SSE-NEXT: packsswb %xmm6, %xmm4
; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm11
-; SSE-NEXT: pshufb %xmm3, %xmm11
-; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9
-; SSE-NEXT: pshufb %xmm3, %xmm9
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; SSE-NEXT: psllw $15, %xmm9
-; SSE-NEXT: psraw $15, %xmm9
-; SSE-NEXT: pshufb %xmm2, %xmm9
; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm10
-; SSE-NEXT: pshufb %xmm3, %xmm10
+; SSE-NEXT: packssdw %xmm11, %xmm10
+; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: cmpltps {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: pshufb %xmm3, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; SSE-NEXT: psllw $15, %xmm8
-; SSE-NEXT: psraw $15, %xmm8
-; SSE-NEXT: pshufb %xmm2, %xmm8
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE-NEXT: psllw $7, %xmm8
-; SSE-NEXT: pand %xmm1, %xmm8
-; SSE-NEXT: pcmpgtb %xmm8, %xmm0
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: packssdw %xmm9, %xmm8
+; SSE-NEXT: packsswb %xmm10, %xmm8
+; SSE-NEXT: pand %xmm4, %xmm8
+; SSE-NEXT: pmovmskb %xmm8, %eax
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX12-LABEL: v16f32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX12-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
-; AVX12-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX12-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX12-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX12-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX12-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX12-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX12-NEXT: vpsllw $7, %xmm0, %xmm0
-; AVX12-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX12-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm5
-; AVX12-NEXT: vextractf128 $1, %ymm5, %xmm7
-; AVX12-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
-; AVX12-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm4
-; AVX12-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX12-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
-; AVX12-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; AVX12-NEXT: vpsllw $7, %xmm3, %xmm3
-; AVX12-NEXT: vpand %xmm1, %xmm3, %xmm1
-; AVX12-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
+; AVX12-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX12-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT: vcmpltps %ymm5, %ymm7, %ymm1
+; AVX12-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX12-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX12-NEXT: vcmpltps %ymm4, %ymm6, %ymm2
+; AVX12-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX12-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX12-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX12-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %ax killed %ax killed %eax
; AVX12-NEXT: vzeroupper
; AVX12-NEXT: retq
;
; AVX512F-LABEL: v16f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512F-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1}
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v16f32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512BW-NEXT: vcmpltps %zmm2, %zmm3, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x0 = fcmp ogt <16 x float> %a, %b
@@ -1129,7 +505,7 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x floa
define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; SSE-LABEL: v64i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
@@ -1351,14 +727,11 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; SSE-NEXT: retq
;
; AVX1-LABEL: v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi3:
; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi4:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi5:
; AVX1-NEXT: .cfi_def_cfa_register %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
@@ -1588,14 +961,11 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi1:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi2:
; AVX2-NEXT: .cfi_def_cfa_register %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
@@ -1809,14 +1179,11 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: .Lcfi3:
; AVX512F-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-NEXT: .Lcfi4:
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: .Lcfi5:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
@@ -1854,7 +1221,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
diff --git a/test/CodeGen/X86/bitcast-i256.ll b/test/CodeGen/X86/bitcast-i256.ll
index 6c3009d1d7ca..a29292e4ba16 100644
--- a/test/CodeGen/X86/bitcast-i256.ll
+++ b/test/CodeGen/X86/bitcast-i256.ll
@@ -4,19 +4,16 @@
define i256 @foo(<8 x i32> %a) {
; FAST-LABEL: foo:
-; FAST: # BB#0:
+; FAST: # %bb.0:
; FAST-NEXT: vmovups %ymm0, (%rdi)
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: vzeroupper
; FAST-NEXT: retq
;
; SLOW-LABEL: foo:
-; SLOW: # BB#0:
-; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; SLOW-NEXT: vpextrq $1, %xmm1, 24(%rdi)
-; SLOW-NEXT: vmovq %xmm1, 16(%rdi)
-; SLOW-NEXT: vpextrq $1, %xmm0, 8(%rdi)
-; SLOW-NEXT: vmovq %xmm0, (%rdi)
+; SLOW: # %bb.0:
+; SLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
+; SLOW-NEXT: vmovups %xmm0, (%rdi)
; SLOW-NEXT: movq %rdi, %rax
; SLOW-NEXT: vzeroupper
; SLOW-NEXT: retq
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 9b6401d1a76c..dcddb8e82642 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -11,43 +11,45 @@
define <2 x i64> @ext_i2_2i64(i2 %a0) {
; SSE2-SSSE3-LABEL: ext_i2_2i64:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: andb $3, %dil
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $62, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movq %rcx, %xmm1
-; SSE2-SSSE3-NEXT: shlq $63, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movq %rax, %xmm0
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: ext_i2_2i64:
-; AVX12: # BB#0:
-; AVX12-NEXT: andb $3, %dil
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $62, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vmovq %rcx, %xmm0
-; AVX12-NEXT: shlq $63, %rax
-; AVX12-NEXT: sarq $63, %rax
-; AVX12-NEXT: vmovq %rax, %xmm1
-; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX12-NEXT: retq
+; AVX1-LABEL: ext_i2_2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i2_2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i2_2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: andb $3, %dil
; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = bitcast i2 %a0 to <2 x i1>
@@ -57,61 +59,41 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
define <4 x i32> @ext_i4_4i32(i4 %a0) {
; SSE2-SSSE3-LABEL: ext_i4_4i32:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: andb $15, %dil
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $60, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $61, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $62, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shlq $63, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: ext_i4_4i32:
-; AVX12: # BB#0:
-; AVX12-NEXT: andb $15, %dil
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $62, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: movq %rax, %rdx
-; AVX12-NEXT: shlq $63, %rdx
-; AVX12-NEXT: sarq $63, %rdx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $61, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shlq $60, %rax
-; AVX12-NEXT: sarq $63, %rax
-; AVX12-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; AVX1-LABEL: ext_i4_4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i4_4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i4_4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: andb $15, %dil
; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
@@ -121,86 +103,36 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i16:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shrq $7, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $57, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $58, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $59, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $60, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $61, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: shlq $62, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: shlq $63, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: ext_i8_8i16:
-; AVX12: # BB#0:
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $62, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: movq %rax, %rdx
-; AVX12-NEXT: shlq $63, %rdx
-; AVX12-NEXT: sarq $63, %rdx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $61, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $60, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $59, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $58, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $57, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shrq $7, %rax
-; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; AVX1-LABEL: ext_i8_8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i8_8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i8_8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: retq
@@ -210,194 +142,46 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
}
define <16 x i8> @ext_i16_16i8(i16 %a0) {
-; SSE2-SSSE3-LABEL: ext_i16_16i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: pushq %rbp
-; SSE2-SSSE3-NEXT: .Lcfi0:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
-; SSE2-SSSE3-NEXT: pushq %r15
-; SSE2-SSSE3-NEXT: .Lcfi1:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
-; SSE2-SSSE3-NEXT: pushq %r14
-; SSE2-SSSE3-NEXT: .Lcfi2:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
-; SSE2-SSSE3-NEXT: pushq %r13
-; SSE2-SSSE3-NEXT: .Lcfi3:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
-; SSE2-SSSE3-NEXT: pushq %r12
-; SSE2-SSSE3-NEXT: .Lcfi4:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
-; SSE2-SSSE3-NEXT: pushq %rbx
-; SSE2-SSSE3-NEXT: .Lcfi5:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
-; SSE2-SSSE3-NEXT: .Lcfi6:
-; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
-; SSE2-SSSE3-NEXT: .Lcfi7:
-; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
-; SSE2-SSSE3-NEXT: .Lcfi8:
-; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
-; SSE2-SSSE3-NEXT: .Lcfi9:
-; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
-; SSE2-SSSE3-NEXT: .Lcfi10:
-; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
-; SSE2-SSSE3-NEXT: .Lcfi11:
-; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
-; SSE2-SSSE3-NEXT: movq %rax, %r8
-; SSE2-SSSE3-NEXT: movq %rax, %r9
-; SSE2-SSSE3-NEXT: movq %rax, %r10
-; SSE2-SSSE3-NEXT: movq %rax, %r11
-; SSE2-SSSE3-NEXT: movq %rax, %r14
-; SSE2-SSSE3-NEXT: movq %rax, %r15
-; SSE2-SSSE3-NEXT: movq %rax, %r12
-; SSE2-SSSE3-NEXT: movq %rax, %r13
-; SSE2-SSSE3-NEXT: movq %rax, %rbx
-; SSE2-SSSE3-NEXT: movq %rax, %rcx
-; SSE2-SSSE3-NEXT: movq %rax, %rdx
-; SSE2-SSSE3-NEXT: movq %rax, %rsi
-; SSE2-SSSE3-NEXT: movq %rax, %rdi
-; SSE2-SSSE3-NEXT: movq %rax, %rbp
-; SSE2-SSSE3-NEXT: shrq $15, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm0
-; SSE2-SSSE3-NEXT: movq %rax, %rbp
-; SSE2-SSSE3-NEXT: movsbq %al, %rax
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm1
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm2
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-SSSE3-NEXT: shlq $61, %rbx
-; SSE2-SSSE3-NEXT: sarq $63, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: shlq $62, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: shlq $63, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: shlq $58, %rsi
-; SSE2-SSSE3-NEXT: sarq $63, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-SSSE3-NEXT: shlq $59, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm4
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-SSSE3-NEXT: shlq $57, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm2
-; SSE2-SSSE3-NEXT: shrq $7, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: popq %rbx
-; SSE2-SSSE3-NEXT: popq %r12
-; SSE2-SSSE3-NEXT: popq %r13
-; SSE2-SSSE3-NEXT: popq %r14
-; SSE2-SSSE3-NEXT: popq %r15
-; SSE2-SSSE3-NEXT: popq %rbp
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: ext_i16_16i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ext_i16_16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
;
-; AVX12-LABEL: ext_i16_16i8:
-; AVX12: # BB#0:
-; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $62, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: movq %rax, %rdx
-; AVX12-NEXT: shlq $63, %rdx
-; AVX12-NEXT: sarq $63, %rdx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $61, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $60, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $59, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $58, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $57, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movsbq %al, %rcx
-; AVX12-NEXT: shrq $7, %rcx
-; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $55, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $54, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $53, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $52, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $51, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $50, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movq %rax, %rcx
-; AVX12-NEXT: shlq $49, %rcx
-; AVX12-NEXT: sarq $63, %rcx
-; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shrq $15, %rax
-; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; AVX2-LABEL: ext_i16_16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i16_16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
; AVX512-NEXT: retq
@@ -412,91 +196,58 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
define <4 x i64> @ext_i4_4i64(i4 %a0) {
; SSE2-SSSE3-LABEL: ext_i4_4i64:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: andb $15, %dil
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: shrl %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; SSE2-SSSE3-NEXT: psllq $63, %xmm0
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
-; SSE2-SSSE3-NEXT: psllq $63, %xmm1
-; SSE2-SSSE3-NEXT: psrad $31, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,8]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i4_4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $60, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $61, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $62, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: shlq $63, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i4_4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $60, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: shlq $63, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i4_4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: andb $15, %dil
; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
%2 = sext <4 x i1> %1 to <4 x i64>
@@ -505,131 +256,45 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
define <8 x i32> @ext_i8_8i32(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i32:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: shrl $7, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm3
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm0
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
-; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm1
-; SSE2-SSSE3-NEXT: psrad $31, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i8_8i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $58, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shlq $59, %rdx
-; AVX1-NEXT: sarq $63, %rdx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $57, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $7, %rcx
-; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $62, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shlq $63, %rdx
-; AVX1-NEXT: sarq $63, %rdx
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $61, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $60, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movsbq -{{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $58, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: movq %rax, %rdx
-; AVX2-NEXT: shlq $59, %rdx
-; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $57, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $7, %rcx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: movq %rax, %rdx
-; AVX2-NEXT: shlq $63, %rdx
-; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vmovd %edx, %xmm1
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $60, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i8_8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -641,305 +306,47 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3-LABEL: ext_i16_16i16:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm0
-; SSE2-SSSE3-NEXT: psraw $15, %xmm0
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm1
-; SSE2-SSSE3-NEXT: psraw $15, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm2, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i16_16i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_def_cfa_offset 24
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_offset 32
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .Lcfi3:
-; AVX1-NEXT: .cfi_def_cfa_offset 40
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .Lcfi4:
-; AVX1-NEXT: .cfi_def_cfa_offset 48
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .Lcfi5:
-; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .Lcfi6:
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .Lcfi7:
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .Lcfi8:
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .Lcfi9:
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .Lcfi10:
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .Lcfi11:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $55, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: movq %rax, %r8
-; AVX1-NEXT: movq %rax, %r10
-; AVX1-NEXT: movq %rax, %r11
-; AVX1-NEXT: movq %rax, %r14
-; AVX1-NEXT: movq %rax, %r15
-; AVX1-NEXT: movq %rax, %r9
-; AVX1-NEXT: movq %rax, %r12
-; AVX1-NEXT: movq %rax, %r13
-; AVX1-NEXT: movq %rax, %rbx
-; AVX1-NEXT: movq %rax, %rdi
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: movq %rax, %rsi
-; AVX1-NEXT: movsbq %al, %rbp
-; AVX1-NEXT: shlq $54, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: shlq $53, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: shlq $52, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: shlq $51, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: shlq $50, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: shlq $49, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: shrq $15, %r9
-; AVX1-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: shlq $63, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vmovd %r13d, %xmm1
-; AVX1-NEXT: shlq $62, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $61, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $60, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $59, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $58, %rdx
-; AVX1-NEXT: sarq $63, %rdx
-; AVX1-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $57, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
-; AVX1-NEXT: shrq $7, %rbp
-; AVX1-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .Lcfi1:
-; AVX2-NEXT: .cfi_def_cfa_offset 24
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .Lcfi2:
-; AVX2-NEXT: .cfi_def_cfa_offset 32
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .Lcfi3:
-; AVX2-NEXT: .cfi_def_cfa_offset 40
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .Lcfi4:
-; AVX2-NEXT: .cfi_def_cfa_offset 48
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .Lcfi5:
-; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .Lcfi6:
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .Lcfi7:
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .Lcfi8:
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .Lcfi9:
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .Lcfi10:
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .Lcfi11:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movswq -{{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $55, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: movq %rax, %r14
-; AVX2-NEXT: movq %rax, %r15
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq %rax, %r12
-; AVX2-NEXT: movq %rax, %r13
-; AVX2-NEXT: movq %rax, %rbx
-; AVX2-NEXT: movq %rax, %rdi
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: movq %rax, %rdx
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: movsbq %al, %rbp
-; AVX2-NEXT: shlq $54, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shlq $53, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: shlq $52, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrw $3, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: shlq $51, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrw $4, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: shlq $50, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
-; AVX2-NEXT: shlq $49, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrw $6, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrq $15, %r9
-; AVX2-NEXT: vpinsrw $7, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: shlq $63, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vmovd %r13d, %xmm1
-; AVX2-NEXT: shlq $62, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $61, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $60, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrw $3, %edi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $59, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $58, %rdx
-; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vpinsrw $5, %edx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $57, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrw $6, %esi, %xmm1, %xmm1
-; AVX2-NEXT: shrq $7, %rbp
-; AVX2-NEXT: vpinsrw $7, %ebp, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i16_16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
; AVX512-NEXT: retq
@@ -950,544 +357,56 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
define <32 x i8> @ext_i32_32i8(i32 %a0) {
; SSE2-SSSE3-LABEL: ext_i32_32i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: pushq %rbp
-; SSE2-SSSE3-NEXT: .Lcfi12:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
-; SSE2-SSSE3-NEXT: pushq %r15
-; SSE2-SSSE3-NEXT: .Lcfi13:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
-; SSE2-SSSE3-NEXT: pushq %r14
-; SSE2-SSSE3-NEXT: .Lcfi14:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
-; SSE2-SSSE3-NEXT: pushq %r13
-; SSE2-SSSE3-NEXT: .Lcfi15:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
-; SSE2-SSSE3-NEXT: pushq %r12
-; SSE2-SSSE3-NEXT: .Lcfi16:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
-; SSE2-SSSE3-NEXT: pushq %rbx
-; SSE2-SSSE3-NEXT: .Lcfi17:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
-; SSE2-SSSE3-NEXT: .Lcfi18:
-; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
-; SSE2-SSSE3-NEXT: .Lcfi19:
-; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
-; SSE2-SSSE3-NEXT: .Lcfi20:
-; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
-; SSE2-SSSE3-NEXT: .Lcfi21:
-; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
-; SSE2-SSSE3-NEXT: .Lcfi22:
-; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
-; SSE2-SSSE3-NEXT: .Lcfi23:
-; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: shrl $16, %edi
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx
-; SSE2-SSSE3-NEXT: movq %rbx, %r8
-; SSE2-SSSE3-NEXT: movq %rbx, %r9
-; SSE2-SSSE3-NEXT: movq %rbx, %r10
-; SSE2-SSSE3-NEXT: movq %rbx, %r11
-; SSE2-SSSE3-NEXT: movq %rbx, %r14
-; SSE2-SSSE3-NEXT: movq %rbx, %r15
-; SSE2-SSSE3-NEXT: movq %rbx, %r12
-; SSE2-SSSE3-NEXT: movq %rbx, %r13
-; SSE2-SSSE3-NEXT: movq %rbx, %rdi
-; SSE2-SSSE3-NEXT: movq %rbx, %rcx
-; SSE2-SSSE3-NEXT: movq %rbx, %rdx
-; SSE2-SSSE3-NEXT: movq %rbx, %rbp
-; SSE2-SSSE3-NEXT: movq %rbx, %rsi
-; SSE2-SSSE3-NEXT: movq %rbx, %rax
-; SSE2-SSSE3-NEXT: shrq $15, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: movq %rbx, %rax
-; SSE2-SSSE3-NEXT: movsbq %bl, %rbx
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm15
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm8
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm9
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm6
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm10
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm11
-; SSE2-SSSE3-NEXT: shlq $61, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm5
-; SSE2-SSSE3-NEXT: shlq $62, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm12
-; SSE2-SSSE3-NEXT: shlq $63, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm0
-; SSE2-SSSE3-NEXT: shlq $58, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm13
-; SSE2-SSSE3-NEXT: shlq $59, %rsi
-; SSE2-SSSE3-NEXT: sarq $63, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm7
-; SSE2-SSSE3-NEXT: shlq $57, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: shrq $7, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm14
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
-; SSE2-SSSE3-NEXT: movq %rsi, %r8
-; SSE2-SSSE3-NEXT: movq %rsi, %r9
-; SSE2-SSSE3-NEXT: movq %rsi, %r10
-; SSE2-SSSE3-NEXT: movq %rsi, %r11
-; SSE2-SSSE3-NEXT: movq %rsi, %r14
-; SSE2-SSSE3-NEXT: movq %rsi, %r15
-; SSE2-SSSE3-NEXT: movq %rsi, %r12
-; SSE2-SSSE3-NEXT: movq %rsi, %r13
-; SSE2-SSSE3-NEXT: movq %rsi, %rbx
-; SSE2-SSSE3-NEXT: movq %rsi, %rax
-; SSE2-SSSE3-NEXT: movq %rsi, %rcx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdi
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: shrq $15, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm2
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm3
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm4
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm5
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm4
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm6
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-SSSE3-NEXT: shlq $61, %rbx
-; SSE2-SSSE3-NEXT: sarq $63, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: shlq $62, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE2-SSSE3-NEXT: shlq $63, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE2-SSSE3-NEXT: shlq $58, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-SSSE3-NEXT: shlq $59, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-SSSE3-NEXT: shlq $57, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm4
-; SSE2-SSSE3-NEXT: shrq $7, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-SSSE3-NEXT: popq %rbx
-; SSE2-SSSE3-NEXT: popq %r12
-; SSE2-SSSE3-NEXT: popq %r13
-; SSE2-SSSE3-NEXT: popq %r14
-; SSE2-SSSE3-NEXT: popq %r15
-; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i32_32i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi12:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi13:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi14:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: .Lcfi15:
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .Lcfi16:
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .Lcfi17:
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .Lcfi18:
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .Lcfi19:
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: movl %edi, (%rsp)
-; AVX1-NEXT: movslq (%rsp), %rdx
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: shlq $47, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: movq %rdx, %r14
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: movq %rdx, %rax
-; AVX1-NEXT: shlq $46, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX1-NEXT: shlq $45, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: shlq $44, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: shlq $43, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: shlq $42, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: shlq $41, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: shlq $40, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: shlq $39, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: shlq $38, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: movsbq %dl, %rax
-; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: shlq $37, %r9
-; AVX1-NEXT: sarq $63, %r9
-; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: shlq $36, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: shlq $35, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r14
-; AVX1-NEXT: shlq $34, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: shlq $33, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX1-NEXT: shrq $31, %rax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rax
-; AVX1-NEXT: shlq $63, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vmovd %r8d, %xmm1
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movswq %dx, %rdx
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
-; AVX1-NEXT: shlq $62, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $61, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $60, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $59, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $58, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $57, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; AVX1-NEXT: shrq $7, %rcx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $55, %r9
-; AVX1-NEXT: sarq $63, %r9
-; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $54, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $53, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $52, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $51, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $50, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shlq $49, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
-; AVX1-NEXT: shrq $15, %rdx
-; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: leaq -40(%rbp), %rsp
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi12:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi13:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi14:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: .Lcfi15:
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .Lcfi16:
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .Lcfi17:
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .Lcfi18:
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .Lcfi19:
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: movl %edi, (%rsp)
-; AVX2-NEXT: movslq (%rsp), %rdx
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: shlq $47, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shlq $46, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: shlq $45, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: shlq $44, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: shlq $43, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: shlq $42, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: shlq $41, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: shlq $40, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: shlq $39, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: shlq $38, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: movsbq %dl, %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: shlq $37, %r9
-; AVX2-NEXT: sarq $63, %r9
-; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: shlq $36, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: shlq $35, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shlq $34, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: shlq $33, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: shrq $31, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shlq $63, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vmovd %r8d, %xmm1
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movswq %dx, %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
-; AVX2-NEXT: shlq $62, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $60, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $59, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $58, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $57, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; AVX2-NEXT: shrq $7, %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $55, %r9
-; AVX2-NEXT: sarq $63, %r9
-; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $54, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $53, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $52, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $51, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $50, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shlq $49, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
-; AVX2-NEXT: shrq $15, %rdx
-; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: leaq -40(%rbp), %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i32_32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2b %k0, %ymm0
; AVX512-NEXT: retq
@@ -1502,164 +421,74 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
define <8 x i64> @ext_i8_8i64(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i64:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: shrl $7, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: psllq $63, %xmm0
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: psllq $63, %xmm1
-; SSE2-SSSE3-NEXT: psrad $31, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: psllq $63, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: psllq $63, %xmm3
-; SSE2-SSSE3-NEXT: psrad $31, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,8]
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32]
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [64,128]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i8_8i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $3, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $5, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $6, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $3, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $4, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $5, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $6, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,32,64,128]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i8_8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -1670,267 +499,63 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
define <16 x i32> @ext_i16_16i32(i16 %a0) {
; SSE2-SSSE3-LABEL: ext_i16_16i32:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm0
-; SSE2-SSSE3-NEXT: psrad $31, %xmm0
-; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm1
-; SSE2-SSSE3-NEXT: psrad $31, %xmm1
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [256,512,1024,2048]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm2
-; SSE2-SSSE3-NEXT: psrad $31, %xmm2
-; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: pslld $31, %xmm3
-; SSE2-SSSE3-NEXT: psrad $31, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4096,8192,16384,32768]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i16_16i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $3, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $5, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $6, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $7, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $9, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $10, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $11, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $12, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $13, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $14, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $15, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $3, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $4, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $5, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $6, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $7, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $9, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $10, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $11, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $12, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $13, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $14, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $15, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i16_16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k1
; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: retq
@@ -1941,563 +566,70 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-LABEL: ext_i32_32i16:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movl %edi, %eax
-; SSE2-SSSE3-NEXT: shrl $16, %eax
-; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm2
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm0
-; SSE2-SSSE3-NEXT: psraw $15, %xmm0
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm1
-; SSE2-SSSE3-NEXT: psraw $15, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm2
-; SSE2-SSSE3-NEXT: psraw $15, %xmm2
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-SSSE3-NEXT: psllw $15, %xmm3
-; SSE2-SSSE3-NEXT: psraw $15, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i32_32i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi20:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi21:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi22:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $128, %rsp
-; AVX1-NEXT: .Lcfi23:
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .Lcfi24:
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .Lcfi25:
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .Lcfi26:
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .Lcfi27:
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, %r13d
-; AVX1-NEXT: movl %edi, %r12d
-; AVX1-NEXT: movl %edi, %r15d
-; AVX1-NEXT: movl %edi, %r14d
-; AVX1-NEXT: movl %edi, %ebx
-; AVX1-NEXT: movl %edi, %r11d
-; AVX1-NEXT: movl %edi, %r10d
-; AVX1-NEXT: movl %edi, %r9d
-; AVX1-NEXT: movl %edi, %r8d
-; AVX1-NEXT: movl %edi, %esi
-; AVX1-NEXT: movl %edi, %edx
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: andl $1, %edi
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $3, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $4, %esi
-; AVX1-NEXT: andl $1, %esi
-; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX1-NEXT: shrl $5, %r8d
-; AVX1-NEXT: andl $1, %r8d
-; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $6, %r9d
-; AVX1-NEXT: andl $1, %r9d
-; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $7, %r10d
-; AVX1-NEXT: andl $1, %r10d
-; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $8, %r11d
-; AVX1-NEXT: andl $1, %r11d
-; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $9, %ebx
-; AVX1-NEXT: andl $1, %ebx
-; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $10, %r14d
-; AVX1-NEXT: andl $1, %r14d
-; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $11, %r15d
-; AVX1-NEXT: andl $1, %r15d
-; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $12, %r12d
-; AVX1-NEXT: andl $1, %r12d
-; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $13, %r13d
-; AVX1-NEXT: andl $1, %r13d
-; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $14, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $15, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $17, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $18, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $19, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $20, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $21, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $22, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $23, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $25, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $26, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $27, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $28, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $29, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $30, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $31, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
-; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw $15, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsllw $15, %xmm2, %xmm2
-; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsllw $15, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $15, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: leaq -40(%rbp), %rsp
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi20:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi21:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi22:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $128, %rsp
-; AVX2-NEXT: .Lcfi23:
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .Lcfi24:
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .Lcfi25:
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .Lcfi26:
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .Lcfi27:
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, %r13d
-; AVX2-NEXT: movl %edi, %r12d
-; AVX2-NEXT: movl %edi, %r15d
-; AVX2-NEXT: movl %edi, %r14d
-; AVX2-NEXT: movl %edi, %ebx
-; AVX2-NEXT: movl %edi, %r11d
-; AVX2-NEXT: movl %edi, %r10d
-; AVX2-NEXT: movl %edi, %r9d
-; AVX2-NEXT: movl %edi, %r8d
-; AVX2-NEXT: movl %edi, %esi
-; AVX2-NEXT: movl %edi, %edx
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andl $1, %edi
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $3, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $4, %esi
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX2-NEXT: shrl $5, %r8d
-; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $6, %r9d
-; AVX2-NEXT: andl $1, %r9d
-; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $7, %r10d
-; AVX2-NEXT: andl $1, %r10d
-; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $8, %r11d
-; AVX2-NEXT: andl $1, %r11d
-; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $9, %ebx
-; AVX2-NEXT: andl $1, %ebx
-; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $10, %r14d
-; AVX2-NEXT: andl $1, %r14d
-; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $11, %r15d
-; AVX2-NEXT: andl $1, %r15d
-; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $12, %r12d
-; AVX2-NEXT: andl $1, %r12d
-; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $13, %r13d
-; AVX2-NEXT: andl $1, %r13d
-; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $14, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $15, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $18, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $19, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $20, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $21, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $22, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $23, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $25, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $26, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $27, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $28, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $29, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $30, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $31, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpsllw $15, %ymm0, %ymm0
-; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpsllw $15, %ymm1, %ymm1
-; AVX2-NEXT: vpsraw $15, %ymm1, %ymm1
-; AVX2-NEXT: leaq -40(%rbp), %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: shrl $16, %edi
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i32_32i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2w %k0, %zmm0
; AVX512-NEXT: retq
@@ -2508,972 +640,84 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
define <64 x i8> @ext_i64_64i8(i64 %a0) {
; SSE2-SSSE3-LABEL: ext_i64_64i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: pushq %rbp
-; SSE2-SSSE3-NEXT: .Lcfi24:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 16
-; SSE2-SSSE3-NEXT: pushq %r15
-; SSE2-SSSE3-NEXT: .Lcfi25:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 24
-; SSE2-SSSE3-NEXT: pushq %r14
-; SSE2-SSSE3-NEXT: .Lcfi26:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 32
-; SSE2-SSSE3-NEXT: pushq %r13
-; SSE2-SSSE3-NEXT: .Lcfi27:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 40
-; SSE2-SSSE3-NEXT: pushq %r12
-; SSE2-SSSE3-NEXT: .Lcfi28:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 48
-; SSE2-SSSE3-NEXT: pushq %rbx
-; SSE2-SSSE3-NEXT: .Lcfi29:
-; SSE2-SSSE3-NEXT: .cfi_def_cfa_offset 56
-; SSE2-SSSE3-NEXT: .Lcfi30:
-; SSE2-SSSE3-NEXT: .cfi_offset %rbx, -56
-; SSE2-SSSE3-NEXT: .Lcfi31:
-; SSE2-SSSE3-NEXT: .cfi_offset %r12, -48
-; SSE2-SSSE3-NEXT: .Lcfi32:
-; SSE2-SSSE3-NEXT: .cfi_offset %r13, -40
-; SSE2-SSSE3-NEXT: .Lcfi33:
-; SSE2-SSSE3-NEXT: .cfi_offset %r14, -32
-; SSE2-SSSE3-NEXT: .Lcfi34:
-; SSE2-SSSE3-NEXT: .cfi_offset %r15, -24
-; SSE2-SSSE3-NEXT: .Lcfi35:
-; SSE2-SSSE3-NEXT: .cfi_offset %rbp, -16
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movq %rdi, %rax
-; SSE2-SSSE3-NEXT: shrq $32, %rax
-; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movq %rdi, %rax
-; SSE2-SSSE3-NEXT: shrq $48, %rax
-; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: shrl $16, %edi
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rbx
-; SSE2-SSSE3-NEXT: movq %rbx, %r8
-; SSE2-SSSE3-NEXT: movq %rbx, %r9
-; SSE2-SSSE3-NEXT: movq %rbx, %r10
-; SSE2-SSSE3-NEXT: movq %rbx, %r11
-; SSE2-SSSE3-NEXT: movq %rbx, %r14
-; SSE2-SSSE3-NEXT: movq %rbx, %r15
-; SSE2-SSSE3-NEXT: movq %rbx, %r12
-; SSE2-SSSE3-NEXT: movq %rbx, %r13
-; SSE2-SSSE3-NEXT: movq %rbx, %rdi
-; SSE2-SSSE3-NEXT: movq %rbx, %rcx
-; SSE2-SSSE3-NEXT: movq %rbx, %rdx
-; SSE2-SSSE3-NEXT: movq %rbx, %rsi
-; SSE2-SSSE3-NEXT: movq %rbx, %rbp
-; SSE2-SSSE3-NEXT: movq %rbx, %rax
-; SSE2-SSSE3-NEXT: shrq $15, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: movq %rbx, %rax
-; SSE2-SSSE3-NEXT: movsbq %bl, %rbx
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm15
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm8
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm2
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm9
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm6
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm10
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm11
-; SSE2-SSSE3-NEXT: shlq $61, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm5
-; SSE2-SSSE3-NEXT: shlq $62, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm12
-; SSE2-SSSE3-NEXT: shlq $63, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm0
-; SSE2-SSSE3-NEXT: shlq $58, %rsi
-; SSE2-SSSE3-NEXT: sarq $63, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm13
-; SSE2-SSSE3-NEXT: shlq $59, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm7
-; SSE2-SSSE3-NEXT: shlq $57, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm3
-; SSE2-SSSE3-NEXT: shrq $7, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm14
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
-; SSE2-SSSE3-NEXT: movq %rsi, %r8
-; SSE2-SSSE3-NEXT: movq %rsi, %r9
-; SSE2-SSSE3-NEXT: movq %rsi, %r10
-; SSE2-SSSE3-NEXT: movq %rsi, %r11
-; SSE2-SSSE3-NEXT: movq %rsi, %r14
-; SSE2-SSSE3-NEXT: movq %rsi, %r15
-; SSE2-SSSE3-NEXT: movq %rsi, %r12
-; SSE2-SSSE3-NEXT: movq %rsi, %r13
-; SSE2-SSSE3-NEXT: movq %rsi, %rbx
-; SSE2-SSSE3-NEXT: movq %rsi, %rax
-; SSE2-SSSE3-NEXT: movq %rsi, %rcx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdi
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: shrq $15, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm1
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm13
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm1
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm8
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm15
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm9
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm10
-; SSE2-SSSE3-NEXT: shlq $61, %rbx
-; SSE2-SSSE3-NEXT: sarq $63, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm7
-; SSE2-SSSE3-NEXT: shlq $62, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm11
-; SSE2-SSSE3-NEXT: shlq $63, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shlq $58, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm12
-; SSE2-SSSE3-NEXT: shlq $59, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm5
-; SSE2-SSSE3-NEXT: shlq $57, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm1
-; SSE2-SSSE3-NEXT: shrq $7, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm14
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
-; SSE2-SSSE3-NEXT: movq %rsi, %r8
-; SSE2-SSSE3-NEXT: movq %rsi, %r9
-; SSE2-SSSE3-NEXT: movq %rsi, %r10
-; SSE2-SSSE3-NEXT: movq %rsi, %r11
-; SSE2-SSSE3-NEXT: movq %rsi, %r14
-; SSE2-SSSE3-NEXT: movq %rsi, %r15
-; SSE2-SSSE3-NEXT: movq %rsi, %r12
-; SSE2-SSSE3-NEXT: movq %rsi, %r13
-; SSE2-SSSE3-NEXT: movq %rsi, %rbx
-; SSE2-SSSE3-NEXT: movq %rsi, %rax
-; SSE2-SSSE3-NEXT: movq %rsi, %rcx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdi
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: shrq $15, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm6
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm3
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm8
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm13
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm9
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm1
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm10
-; SSE2-SSSE3-NEXT: shlq $61, %rbx
-; SSE2-SSSE3-NEXT: sarq $63, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm15
-; SSE2-SSSE3-NEXT: shlq $62, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm11
-; SSE2-SSSE3-NEXT: shlq $63, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: shlq $58, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm12
-; SSE2-SSSE3-NEXT: shlq $59, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm5
-; SSE2-SSSE3-NEXT: shlq $57, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm6
-; SSE2-SSSE3-NEXT: shrq $7, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm14
-; SSE2-SSSE3-NEXT: movswq -{{[0-9]+}}(%rsp), %rsi
-; SSE2-SSSE3-NEXT: movq %rsi, %r8
-; SSE2-SSSE3-NEXT: movq %rsi, %r9
-; SSE2-SSSE3-NEXT: movq %rsi, %r10
-; SSE2-SSSE3-NEXT: movq %rsi, %r11
-; SSE2-SSSE3-NEXT: movq %rsi, %r14
-; SSE2-SSSE3-NEXT: movq %rsi, %r15
-; SSE2-SSSE3-NEXT: movq %rsi, %r12
-; SSE2-SSSE3-NEXT: movq %rsi, %r13
-; SSE2-SSSE3-NEXT: movq %rsi, %rbx
-; SSE2-SSSE3-NEXT: movq %rsi, %rax
-; SSE2-SSSE3-NEXT: movq %rsi, %rcx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdx
-; SSE2-SSSE3-NEXT: movq %rsi, %rdi
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: shrq $15, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm7
-; SSE2-SSSE3-NEXT: movq %rsi, %rbp
-; SSE2-SSSE3-NEXT: movsbq %sil, %rsi
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3],xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
-; SSE2-SSSE3-NEXT: shlq $49, %r8
-; SSE2-SSSE3-NEXT: sarq $63, %r8
-; SSE2-SSSE3-NEXT: movd %r8d, %xmm4
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; SSE2-SSSE3-NEXT: shlq $50, %r9
-; SSE2-SSSE3-NEXT: sarq $63, %r9
-; SSE2-SSSE3-NEXT: movd %r9d, %xmm6
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; SSE2-SSSE3-NEXT: shlq $51, %r10
-; SSE2-SSSE3-NEXT: sarq $63, %r10
-; SSE2-SSSE3-NEXT: movd %r10d, %xmm5
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-SSSE3-NEXT: shlq $52, %r11
-; SSE2-SSSE3-NEXT: sarq $63, %r11
-; SSE2-SSSE3-NEXT: movd %r11d, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; SSE2-SSSE3-NEXT: shlq $53, %r14
-; SSE2-SSSE3-NEXT: sarq $63, %r14
-; SSE2-SSSE3-NEXT: movd %r14d, %xmm7
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-SSSE3-NEXT: shlq $54, %r15
-; SSE2-SSSE3-NEXT: sarq $63, %r15
-; SSE2-SSSE3-NEXT: movd %r15d, %xmm6
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-SSSE3-NEXT: shlq $55, %r12
-; SSE2-SSSE3-NEXT: sarq $63, %r12
-; SSE2-SSSE3-NEXT: movd %r12d, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
-; SSE2-SSSE3-NEXT: shlq $60, %r13
-; SSE2-SSSE3-NEXT: sarq $63, %r13
-; SSE2-SSSE3-NEXT: movd %r13d, %xmm8
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE2-SSSE3-NEXT: shlq $61, %rbx
-; SSE2-SSSE3-NEXT: sarq $63, %rbx
-; SSE2-SSSE3-NEXT: movd %ebx, %xmm6
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; SSE2-SSSE3-NEXT: shlq $62, %rax
-; SSE2-SSSE3-NEXT: sarq $63, %rax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm7
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE2-SSSE3-NEXT: shlq $63, %rcx
-; SSE2-SSSE3-NEXT: sarq $63, %rcx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; SSE2-SSSE3-NEXT: shlq $58, %rdx
-; SSE2-SSSE3-NEXT: sarq $63, %rdx
-; SSE2-SSSE3-NEXT: movd %edx, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
-; SSE2-SSSE3-NEXT: shlq $59, %rdi
-; SSE2-SSSE3-NEXT: sarq $63, %rdi
-; SSE2-SSSE3-NEXT: movd %edi, %xmm7
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-SSSE3-NEXT: shlq $57, %rbp
-; SSE2-SSSE3-NEXT: sarq $63, %rbp
-; SSE2-SSSE3-NEXT: movd %ebp, %xmm5
-; SSE2-SSSE3-NEXT: shrq $7, %rsi
-; SSE2-SSSE3-NEXT: movd %esi, %xmm6
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; SSE2-SSSE3-NEXT: popq %rbx
-; SSE2-SSSE3-NEXT: popq %r12
-; SSE2-SSSE3-NEXT: popq %r13
-; SSE2-SSSE3-NEXT: popq %r14
-; SSE2-SSSE3-NEXT: popq %r15
-; SSE2-SSSE3-NEXT: popq %rbp
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,1,1,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,3,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,5,5]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,7,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i64_64i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi28:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi29:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi30:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $128, %rsp
-; AVX1-NEXT: .Lcfi31:
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .Lcfi32:
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .Lcfi33:
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .Lcfi34:
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .Lcfi35:
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $32, %rdi
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: shlq $47, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: movq %rdx, %r14
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: movq %rdx, %rax
-; AVX1-NEXT: shlq $46, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX1-NEXT: shlq $45, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: shlq $44, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: shlq $43, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: shlq $42, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: shlq $41, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: shlq $40, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: shlq $39, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: shlq $38, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: movsbq %dl, %rax
-; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: shlq $37, %r9
-; AVX1-NEXT: sarq $63, %r9
-; AVX1-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: shlq $36, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: shlq $35, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r14
-; AVX1-NEXT: shlq $34, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: shlq $33, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX1-NEXT: shrq $31, %rax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movq %rdx, %rax
-; AVX1-NEXT: shlq $63, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vmovd %r8d, %xmm1
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movswq %dx, %rdx
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
-; AVX1-NEXT: shlq $62, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $61, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $60, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $59, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $58, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $57, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; AVX1-NEXT: shrq $7, %rcx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $55, %r9
-; AVX1-NEXT: sarq $63, %r9
-; AVX1-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $54, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $53, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $52, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $51, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $50, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shlq $49, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
-; AVX1-NEXT: shrq $15, %rdx
-; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
-; AVX1-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: shlq $47, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: movq %rdx, %r14
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: movq %rdx, %rax
-; AVX1-NEXT: shlq $46, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: shlq $45, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: shlq $44, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: shlq $43, %r9
-; AVX1-NEXT: sarq $63, %r9
-; AVX1-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r9
-; AVX1-NEXT: shlq $42, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r12
-; AVX1-NEXT: shlq $41, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %rdi
-; AVX1-NEXT: shlq $40, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: shlq $39, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: shlq $38, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2
-; AVX1-NEXT: movsbq %dl, %rax
-; AVX1-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX1-NEXT: shlq $37, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: shlq $36, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %rsi
-; AVX1-NEXT: shlq $35, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r11
-; AVX1-NEXT: shlq $34, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r14
-; AVX1-NEXT: shlq $33, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %r15
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX1-NEXT: shrq $31, %rax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdx, %rax
-; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: movq %rdx, %rcx
-; AVX1-NEXT: movswq %dx, %rdx
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: shlq $62, %r13
-; AVX1-NEXT: sarq $63, %r13
-; AVX1-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1
-; AVX1-NEXT: shlq $61, %r9
-; AVX1-NEXT: sarq $63, %r9
-; AVX1-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $60, %r12
-; AVX1-NEXT: sarq $63, %r12
-; AVX1-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $59, %rdi
-; AVX1-NEXT: sarq $63, %rdi
-; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $58, %rbx
-; AVX1-NEXT: sarq $63, %rbx
-; AVX1-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1
-; AVX1-NEXT: shlq $57, %r8
-; AVX1-NEXT: sarq $63, %r8
-; AVX1-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; AVX1-NEXT: shrq $7, %rdi
-; AVX1-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $55, %r10
-; AVX1-NEXT: sarq $63, %r10
-; AVX1-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $54, %rsi
-; AVX1-NEXT: sarq $63, %rsi
-; AVX1-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
-; AVX1-NEXT: shlq $53, %r11
-; AVX1-NEXT: sarq $63, %r11
-; AVX1-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $52, %r14
-; AVX1-NEXT: sarq $63, %r14
-; AVX1-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $51, %r15
-; AVX1-NEXT: sarq $63, %r15
-; AVX1-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1
-; AVX1-NEXT: shlq $50, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shlq $49, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shrq $15, %rdx
-; AVX1-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: leaq -40(%rbp), %rsp
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i64_64i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi28:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi29:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi30:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $128, %rsp
-; AVX2-NEXT: .Lcfi31:
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .Lcfi32:
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .Lcfi33:
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .Lcfi34:
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .Lcfi35:
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $32, %rdi
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: shlq $47, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shlq $46, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: shlq $45, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: shlq $44, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: shlq $43, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: shlq $42, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: shlq $41, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vpinsrb $6, %r13d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: shlq $40, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrb $7, %esi, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: shlq $39, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: shlq $38, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrb $9, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: movsbq %dl, %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: shlq $37, %r9
-; AVX2-NEXT: sarq $63, %r9
-; AVX2-NEXT: vpinsrb $10, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: shlq $36, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: shlq $35, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrb $12, %r14d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shlq $34, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrb $13, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: shlq $33, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrb $14, %r12d, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: shrq $31, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shlq $63, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vmovd %r8d, %xmm1
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movswq %dx, %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload
-; AVX2-NEXT: shlq $62, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrb $1, %r11d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $60, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrb $3, %edi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $59, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vpinsrb $4, %r13d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $58, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $57, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrb $6, %r10d, %xmm1, %xmm1
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
-; AVX2-NEXT: shrq $7, %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $55, %r9
-; AVX2-NEXT: sarq $63, %r9
-; AVX2-NEXT: vpinsrb $8, %r9d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $54, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrb $9, %ebx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $53, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrb $10, %r14d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $52, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrb $11, %r15d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $51, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrb $12, %r12d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $50, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shlq $49, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrb $14, %r8d, %xmm1, %xmm1
-; AVX2-NEXT: shrq $15, %rdx
-; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
-; AVX2-NEXT: movslq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: shlq $47, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shlq $46, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: shlq $45, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vpinsrb $2, %r13d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: shlq $44, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: shlq $43, %r9
-; AVX2-NEXT: sarq $63, %r9
-; AVX2-NEXT: vpinsrb $4, %r9d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r9
-; AVX2-NEXT: shlq $42, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrb $5, %r12d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r12
-; AVX2-NEXT: shlq $41, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrb $6, %edi, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: shlq $40, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrb $7, %ebx, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: shlq $39, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrb $8, %r8d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: shlq $38, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrb $9, %r10d, %xmm2, %xmm2
-; AVX2-NEXT: movsbq %dl, %rax
-; AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: shlq $37, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: shlq $36, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrb $11, %esi, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: shlq $35, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrb $12, %r11d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: shlq $34, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrb $13, %r14d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shlq $33, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrb $14, %r15d, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %r15
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: shrq $31, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shlq $63, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: movq %rdx, %rcx
-; AVX2-NEXT: movswq %dx, %rdx
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: shlq $62, %r13
-; AVX2-NEXT: sarq $63, %r13
-; AVX2-NEXT: vpinsrb $1, %r13d, %xmm3, %xmm1
-; AVX2-NEXT: shlq $61, %r9
-; AVX2-NEXT: sarq $63, %r9
-; AVX2-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $60, %r12
-; AVX2-NEXT: sarq $63, %r12
-; AVX2-NEXT: vpinsrb $3, %r12d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $59, %rdi
-; AVX2-NEXT: sarq $63, %rdi
-; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $58, %rbx
-; AVX2-NEXT: sarq $63, %rbx
-; AVX2-NEXT: vpinsrb $5, %ebx, %xmm1, %xmm1
-; AVX2-NEXT: shlq $57, %r8
-; AVX2-NEXT: sarq $63, %r8
-; AVX2-NEXT: vpinsrb $6, %r8d, %xmm1, %xmm1
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; AVX2-NEXT: shrq $7, %rdi
-; AVX2-NEXT: vpinsrb $7, %edi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $55, %r10
-; AVX2-NEXT: sarq $63, %r10
-; AVX2-NEXT: vpinsrb $8, %r10d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $54, %rsi
-; AVX2-NEXT: sarq $63, %rsi
-; AVX2-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
-; AVX2-NEXT: shlq $53, %r11
-; AVX2-NEXT: sarq $63, %r11
-; AVX2-NEXT: vpinsrb $10, %r11d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $52, %r14
-; AVX2-NEXT: sarq $63, %r14
-; AVX2-NEXT: vpinsrb $11, %r14d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $51, %r15
-; AVX2-NEXT: sarq $63, %r15
-; AVX2-NEXT: vpinsrb $12, %r15d, %xmm1, %xmm1
-; AVX2-NEXT: shlq $50, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shlq $49, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shrq $15, %rdx
-; AVX2-NEXT: vpinsrb $15, %edx, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: leaq -40(%rbp), %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: ext_i64_64i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovq %rdi, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: retq
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index aa9e60df1404..f88b540323cb 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -3,7 +3,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
;
; 128-bit vectors
@@ -11,43 +12,61 @@
define <2 x i64> @ext_i2_2i64(i2 %a0) {
; SSE2-SSSE3-LABEL: ext_i2_2i64:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: andb $3, %dil
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movq %rcx, %xmm0
-; SSE2-SSSE3-NEXT: shrl %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movq %rax, %xmm1
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: ext_i2_2i64:
-; AVX12: # BB#0:
-; AVX12-NEXT: andb $3, %dil
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vmovq %rcx, %xmm0
-; AVX12-NEXT: shrl %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: vmovq %rax, %xmm1
-; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX12-NEXT: retq
+; AVX1-LABEL: ext_i2_2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ext_i2_2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i2_2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: andb $3, %dil
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i2_2i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: andb $3, %dil
+; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i2_2i64:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: andb $3, %dil
+; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
%1 = bitcast i2 %a0 to <2 x i1>
%2 = zext <2 x i1> %1 to <2 x i64>
ret <2 x i64> %2
@@ -55,70 +74,56 @@ define <2 x i64> @ext_i2_2i64(i2 %a0) {
define <4 x i32> @ext_i4_4i32(i4 %a0) {
; SSE2-SSSE3-LABEL: ext_i4_4i32:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: andb $15, %dil
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: shrl %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrld $31, %xmm0
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i4_4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $3, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i4_4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i4_4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: andb $15, %dil
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i4_4i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: andb $15, %dil
+; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i4_4i32:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: andb $15, %dil
+; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512VLBW-NEXT: vzeroupper
+; AVX512VLBW-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
%2 = zext <4 x i1> %1 to <4 x i32>
ret <4 x i32> %2
@@ -126,389 +131,116 @@ define <4 x i32> @ext_i4_4i32(i4 %a0) {
define <8 x i16> @ext_i8_8i16(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i16:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: shrl $7, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm3
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: ext_i8_8i16:
-; AVX12: # BB#0:
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: movl %eax, %edx
-; AVX12-NEXT: andl $1, %edx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $2, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $3, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $4, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $5, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $6, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shrl $7, %eax
-; AVX12-NEXT: movzwl %ax, %eax
-; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; AVX1-LABEL: ext_i8_8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT: retq
;
-; AVX512-LABEL: ext_i8_8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k5
-; AVX512-NEXT: kshiftlw $8, %k5, %k0
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: kshiftlw $9, %k5, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kshiftlw $10, %k5, %k2
-; AVX512-NEXT: kshiftrw $15, %k2, %k2
-; AVX512-NEXT: kshiftlw $11, %k5, %k3
-; AVX512-NEXT: kshiftrw $15, %k3, %k3
-; AVX512-NEXT: kshiftlw $12, %k5, %k4
-; AVX512-NEXT: kshiftrw $15, %k4, %k4
-; AVX512-NEXT: kshiftlw $13, %k5, %k6
-; AVX512-NEXT: kshiftrw $15, %k6, %k6
-; AVX512-NEXT: kshiftlw $15, %k5, %k7
-; AVX512-NEXT: kshiftrw $15, %k7, %k7
-; AVX512-NEXT: kshiftlw $14, %k5, %k5
-; AVX512-NEXT: kshiftrw $15, %k5, %k5
-; AVX512-NEXT: kmovd %k5, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: kmovd %k7, %ecx
-; AVX512-NEXT: andl $1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kmovd %k6, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kmovd %k4, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kmovd %k3, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kmovd %k2, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kmovd %k1, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: andl $1, %eax
-; AVX512-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX2-LABEL: ext_i8_8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: ext_i8_8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i8_8i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vmovdqu16 {{.*}}(%rip), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i8 %a0 to <8 x i1>
%2 = zext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
}
define <16 x i8> @ext_i16_16i8(i16 %a0) {
-; SSE2-SSSE3-LABEL: ext_i16_16i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: ext_i16_16i8:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: ext_i16_16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $7, %xmm0
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: ext_i16_16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
;
-; AVX12-LABEL: ext_i16_16i8:
-; AVX12: # BB#0:
-; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: movl %eax, %edx
-; AVX12-NEXT: andl $1, %edx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $2, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $3, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $4, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $5, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $6, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $7, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $8, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $9, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $10, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $11, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $12, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $13, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $14, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shrl $15, %eax
-; AVX12-NEXT: movzwl %ax, %eax
-; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; AVX2-LABEL: ext_i16_16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i16_16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: .Lcfi0:
-; AVX512-NEXT: .cfi_def_cfa_offset 16
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: .Lcfi1:
-; AVX512-NEXT: .cfi_def_cfa_offset 24
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: .Lcfi2:
-; AVX512-NEXT: .cfi_def_cfa_offset 32
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: .Lcfi3:
-; AVX512-NEXT: .cfi_def_cfa_offset 40
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: .Lcfi4:
-; AVX512-NEXT: .cfi_def_cfa_offset 48
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: .Lcfi5:
-; AVX512-NEXT: .cfi_def_cfa_offset 56
-; AVX512-NEXT: .Lcfi6:
-; AVX512-NEXT: .cfi_offset %rbx, -56
-; AVX512-NEXT: .Lcfi7:
-; AVX512-NEXT: .cfi_offset %r12, -48
-; AVX512-NEXT: .Lcfi8:
-; AVX512-NEXT: .cfi_offset %r13, -40
-; AVX512-NEXT: .Lcfi9:
-; AVX512-NEXT: .cfi_offset %r14, -32
-; AVX512-NEXT: .Lcfi10:
-; AVX512-NEXT: .cfi_offset %r15, -24
-; AVX512-NEXT: .Lcfi11:
-; AVX512-NEXT: .cfi_offset %rbp, -16
-; AVX512-NEXT: kmovd %edi, %k0
-; AVX512-NEXT: kshiftlw $14, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r8d
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r9d
-; AVX512-NEXT: kshiftlw $13, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r10d
-; AVX512-NEXT: kshiftlw $12, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r11d
-; AVX512-NEXT: kshiftlw $11, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r14d
-; AVX512-NEXT: kshiftlw $10, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r15d
-; AVX512-NEXT: kshiftlw $9, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r12d
-; AVX512-NEXT: kshiftlw $8, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %r13d
-; AVX512-NEXT: kshiftlw $7, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %esi
-; AVX512-NEXT: kshiftlw $6, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %ebx
-; AVX512-NEXT: kshiftlw $5, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %ebp
-; AVX512-NEXT: kshiftlw $4, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %edi
-; AVX512-NEXT: kshiftlw $3, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %eax
-; AVX512-NEXT: kshiftlw $2, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %ecx
-; AVX512-NEXT: kshiftlw $1, %k0, %k1
-; AVX512-NEXT: kshiftrw $15, %k1, %k1
-; AVX512-NEXT: kmovd %k1, %edx
-; AVX512-NEXT: kshiftrw $15, %k0, %k0
-; AVX512-NEXT: vmovd %r9d, %xmm0
-; AVX512-NEXT: kmovd %k0, %r9d
-; AVX512-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $8, %esi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i16_16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i16_16i8:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vmovdqu8 {{.*}}(%rip), %xmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
@@ -520,87 +252,74 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
define <4 x i64> @ext_i4_4i64(i4 %a0) {
; SSE2-SSSE3-LABEL: ext_i4_4i64:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: andb $15, %dil
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: shrl %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1]
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,8]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i4_4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $3, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovq %rcx, %xmm1
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i4_4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $3, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i4_4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: andb $15, %dil
-; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX512-NEXT: kmovd %eax, %k1
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i4_4i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: andb $15, %dil
+; AVX512F-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i4_4i64:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: andb $15, %dil
+; AVX512VLBW-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; AVX512VLBW-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; AVX512VLBW-NEXT: kmovd %eax, %k1
+; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; AVX512VLBW-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
%2 = zext <4 x i1> %1 to <4 x i64>
ret <4 x i64> %2
@@ -608,118 +327,60 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
define <8 x i32> @ext_i8_8i32(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i32:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: shrl $7, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm3
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: psrld $31, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: psrld $31, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i8_8i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $5, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: shrl $4, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $6, %ecx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $7, %ecx
-; AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shrl $3, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $5, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: shrl $4, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $6, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $7, %ecx
-; AVX2-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i8_8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i8_8i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i8_8i32:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i8 %a0 to <8 x i1>
%2 = zext <8 x i1> %1 to <8 x i32>
ret <8 x i32> %2
@@ -727,237 +388,62 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
define <16 x i16> @ext_i16_16i16(i16 %a0) {
; SSE2-SSSE3-LABEL: ext_i16_16i16:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i16_16i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $9, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: shrl $8, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $10, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $11, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $12, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $13, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $14, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $15, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $3, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $5, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $6, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $9, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: shrl $8, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $10, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $11, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $12, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $13, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $14, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $15, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vmovd %edx, %xmm1
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $3, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $4, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $5, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $6, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i16_16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i16_16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i16_16i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i16>
ret <16 x i16> %2
@@ -965,457 +451,93 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
define <32 x i8> @ext_i32_32i8(i32 %a0) {
; SSE2-SSSE3-LABEL: ext_i32_32i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: shrl $16, %edi
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm1
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: psrlw $7, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: psrlw $7, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i32_32i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $32, %rsp
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $17, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $18, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $19, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $20, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $21, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $22, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $23, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $25, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $26, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $27, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $28, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $29, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $30, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $31, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $2, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $3, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $9, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $10, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $11, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $12, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $13, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $14, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shrl $15, %edi
-; AVX1-NEXT: andl $1, %edi
-; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi1:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi2:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $32, %rsp
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $18, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $19, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $20, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $21, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $22, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $23, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $25, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $26, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $27, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $28, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $29, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $30, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $31, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $2, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $4, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $5, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $6, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $9, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $10, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $11, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $12, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $13, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $14, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrl $15, %edi
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i32_32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i32_32i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: movl {{.*}}(%rip), %eax
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i32_32i8:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i32 %a0 to <32 x i1>
%2 = zext <32 x i1> %1 to <32 x i8>
ret <32 x i8> %2
@@ -1427,156 +549,93 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
define <8 x i64> @ext_i8_8i64(i8 %a0) {
; SSE2-SSSE3-LABEL: ext_i8_8i64:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: shrl $7, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,8]
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm1
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32]
+; SSE2-SSSE3-NEXT: movdqa %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [64,128]
+; SSE2-SSSE3-NEXT: pand %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,0,3,2]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i8_8i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $3, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $5, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $6, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlq $63, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlq $63, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $3, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $4, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $5, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $6, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vpinsrw $7, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,32,64,128]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $63, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i8_8i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i8_8i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i8_8i64:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i8 %a0 to <8 x i1>
%2 = zext <8 x i1> %1 to <8 x i64>
ret <8 x i64> %2
@@ -1584,261 +643,82 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
define <16 x i32> @ext_i16_16i32(i16 %a0) {
; SSE2-SSSE3-LABEL: ext_i16_16i32:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrld $31, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: psrld $31, %xmm1
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [256,512,1024,2048]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: psrld $31, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [4096,8192,16384,32768]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: psrld $31, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i16_16i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movl %eax, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $3, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $4, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $5, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $6, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $7, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $9, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $10, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $11, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $12, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $13, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $14, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $15, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrld $31, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpsrld $31, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movl %eax, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $3, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $4, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $5, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $6, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $7, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $9, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $10, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $11, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $12, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $13, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $14, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $15, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $31, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i16_16i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i16_16i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i16_16i32:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i16 %a0 to <16 x i1>
%2 = zext <16 x i1> %1 to <16 x i32>
ret <16 x i32> %2
@@ -1846,557 +726,108 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
define <32 x i16> @ext_i32_32i16(i32 %a0) {
; SSE2-SSSE3-LABEL: ext_i32_32i16:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movl %edi, %eax
-; SSE2-SSSE3-NEXT: shrl $16, %eax
-; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm2
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm1
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm3
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i32_32i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi3:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi4:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi5:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $128, %rsp
-; AVX1-NEXT: .Lcfi6:
-; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .Lcfi7:
-; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .Lcfi8:
-; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .Lcfi9:
-; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .Lcfi10:
-; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX1-NEXT: movl %edi, %r13d
-; AVX1-NEXT: movl %edi, %r12d
-; AVX1-NEXT: movl %edi, %r15d
-; AVX1-NEXT: movl %edi, %r14d
-; AVX1-NEXT: movl %edi, %ebx
-; AVX1-NEXT: movl %edi, %r11d
-; AVX1-NEXT: movl %edi, %r10d
-; AVX1-NEXT: movl %edi, %r9d
-; AVX1-NEXT: movl %edi, %r8d
-; AVX1-NEXT: movl %edi, %esi
-; AVX1-NEXT: movl %edi, %edx
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: andl $1, %edi
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $3, %edx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $4, %esi
-; AVX1-NEXT: andl $1, %esi
-; AVX1-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX1-NEXT: shrl $5, %r8d
-; AVX1-NEXT: andl $1, %r8d
-; AVX1-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $6, %r9d
-; AVX1-NEXT: andl $1, %r9d
-; AVX1-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $7, %r10d
-; AVX1-NEXT: andl $1, %r10d
-; AVX1-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $8, %r11d
-; AVX1-NEXT: andl $1, %r11d
-; AVX1-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $9, %ebx
-; AVX1-NEXT: andl $1, %ebx
-; AVX1-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $10, %r14d
-; AVX1-NEXT: andl $1, %r14d
-; AVX1-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $11, %r15d
-; AVX1-NEXT: andl $1, %r15d
-; AVX1-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $12, %r12d
-; AVX1-NEXT: andl $1, %r12d
-; AVX1-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
-; AVX1-NEXT: shrl $13, %r13d
-; AVX1-NEXT: andl $1, %r13d
-; AVX1-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $14, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $15, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $17, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $18, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $19, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $20, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $21, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $22, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $23, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $25, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $26, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $27, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $28, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $29, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $30, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX1-NEXT: shrl $31, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpsrlw $15, %xmm4, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: leaq -40(%rbp), %rsp
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r13
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi3:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi4:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi5:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $128, %rsp
-; AVX2-NEXT: .Lcfi6:
-; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .Lcfi7:
-; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .Lcfi8:
-; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .Lcfi9:
-; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .Lcfi10:
-; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, {{[0-9]+}}(%rsp) # 4-byte Spill
-; AVX2-NEXT: movl %edi, %r13d
-; AVX2-NEXT: movl %edi, %r12d
-; AVX2-NEXT: movl %edi, %r15d
-; AVX2-NEXT: movl %edi, %r14d
-; AVX2-NEXT: movl %edi, %ebx
-; AVX2-NEXT: movl %edi, %r11d
-; AVX2-NEXT: movl %edi, %r10d
-; AVX2-NEXT: movl %edi, %r9d
-; AVX2-NEXT: movl %edi, %r8d
-; AVX2-NEXT: movl %edi, %esi
-; AVX2-NEXT: movl %edi, %edx
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: andl $1, %edi
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $3, %edx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: vpinsrb $3, %edx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $4, %esi
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: vpinsrb $4, %esi, %xmm0, %xmm0
-; AVX2-NEXT: shrl $5, %r8d
-; AVX2-NEXT: andl $1, %r8d
-; AVX2-NEXT: vpinsrb $5, %r8d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $6, %r9d
-; AVX2-NEXT: andl $1, %r9d
-; AVX2-NEXT: vpinsrb $6, %r9d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $7, %r10d
-; AVX2-NEXT: andl $1, %r10d
-; AVX2-NEXT: vpinsrb $7, %r10d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $8, %r11d
-; AVX2-NEXT: andl $1, %r11d
-; AVX2-NEXT: vpinsrb $8, %r11d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $9, %ebx
-; AVX2-NEXT: andl $1, %ebx
-; AVX2-NEXT: vpinsrb $9, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $10, %r14d
-; AVX2-NEXT: andl $1, %r14d
-; AVX2-NEXT: vpinsrb $10, %r14d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $11, %r15d
-; AVX2-NEXT: andl $1, %r15d
-; AVX2-NEXT: vpinsrb $11, %r15d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $12, %r12d
-; AVX2-NEXT: andl $1, %r12d
-; AVX2-NEXT: vpinsrb $12, %r12d, %xmm0, %xmm0
-; AVX2-NEXT: shrl $13, %r13d
-; AVX2-NEXT: andl $1, %r13d
-; AVX2-NEXT: vpinsrb $13, %r13d, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $14, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $15, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $18, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $19, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $20, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $21, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $22, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $23, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $25, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $26, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $27, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $28, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $29, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $30, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax # 4-byte Reload
-; AVX2-NEXT: shrl $31, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: leaq -40(%rbp), %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
+; AVX2-NEXT: shrl $16, %edi
+; AVX2-NEXT: vmovd %edi, %xmm2
+; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2
+; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i32_32i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i32_32i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i32_32i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovd %edi, %k1
+; AVX512VLBW-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i32 %a0 to <32 x i1>
%2 = zext <32 x i1> %1 to <32 x i16>
ret <32 x i16> %2
@@ -2404,875 +835,141 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
define <64 x i8> @ext_i64_64i8(i64 %a0) {
; SSE2-SSSE3-LABEL: ext_i64_64i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movq %rdi, %rax
-; SSE2-SSSE3-NEXT: shrq $32, %rax
-; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movq %rdi, %rax
-; SSE2-SSSE3-NEXT: shrq $48, %rax
-; SSE2-SSSE3-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: shrl $16, %edi
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm6
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm6
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm5
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm7
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm3
+; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,0,1,1,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
+; SSE2-SSSE3-NEXT: psrlw $7, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,3,3,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
+; SSE2-SSSE3-NEXT: psrlw $7, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,5,5]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
+; SSE2-SSSE3-NEXT: psrlw $7, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2
+; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,7,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
+; SSE2-SSSE3-NEXT: psrlw $7, %xmm3
+; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: ext_i64_64i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi11:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi12:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi13:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $17, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $18, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $19, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $20, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $21, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $22, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $23, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $25, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $26, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $27, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $28, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $29, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $30, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $31, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $2, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $3, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $9, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $10, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $11, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $12, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $13, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $14, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $15, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $49, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movq %rdi, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $50, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $51, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $52, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $53, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $54, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $55, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $57, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $58, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $59, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $60, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $61, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $62, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $63, %rax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $33, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movq %rdi, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm2
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $34, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $35, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $36, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $37, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $38, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $39, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $40, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $41, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $42, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $43, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $44, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $45, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $46, %rax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX1-NEXT: shrq $47, %rdi
-; AVX1-NEXT: andl $1, %edi
-; AVX1-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm2
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i64_64i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi11:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi12:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi13:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $18, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $19, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $20, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $21, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $22, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $23, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $25, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $26, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $27, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $28, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $29, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $30, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $31, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $2, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $4, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $5, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $6, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $9, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $10, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $11, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $12, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $13, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $14, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $15, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $49, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movq %rdi, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $50, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $51, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $52, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $53, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $54, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $55, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $57, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $58, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $59, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $60, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $61, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $62, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $63, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $33, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movq %rdi, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm2
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $34, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $35, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $36, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $37, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $38, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $39, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $40, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $41, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $42, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $43, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $44, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $45, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $46, %rax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX2-NEXT: shrq $47, %rdi
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: vpinsrb $15, %edi, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
+; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512-LABEL: ext_i64_64i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: kmovq %rdi, %k1
-; AVX512-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512-NEXT: retq
+; AVX512F-LABEL: ext_i64_64i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: movl %edi, (%rsp)
+; AVX512F-NEXT: shrq $32, %rdi
+; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl {{.*}}(%rip), %eax
+; AVX512F-NEXT: kmovw (%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm2 {%k2} {z}
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VLBW-LABEL: ext_i64_64i8:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: kmovq %rdi, %k1
+; AVX512VLBW-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z}
+; AVX512VLBW-NEXT: retq
%1 = bitcast i64 %a0 to <64 x i1>
%2 = zext <64 x i1> %1 to <64 x i8>
ret <64 x i8> %2
diff --git a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index a190e0575522..6d9f832d861f 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -7,38 +7,47 @@
define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
; SSE2-SSSE3-LABEL: bitcast_i2_2i1:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movq %rcx, %xmm0
-; SSE2-SSSE3-NEXT: shrl %eax
-; SSE2-SSSE3-NEXT: andl $1, %eax
-; SSE2-SSSE3-NEXT: movq %rax, %xmm1
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
+; SSE2-SSSE3-NEXT: movq %rdi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrlq $63, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: bitcast_i2_2i1:
-; AVX12: # BB#0:
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vmovq %rcx, %xmm0
-; AVX12-NEXT: shrl %eax
-; AVX12-NEXT: andl $1, %eax
-; AVX12-NEXT: vmovq %rax, %xmm1
-; AVX12-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX12-NEXT: retq
+; AVX1-LABEL: bitcast_i2_2i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i2_2i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: vmovq %rdi, %xmm0
+; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlq $63, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: bitcast_i2_2i1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = bitcast i2 %a0 to <2 x i1>
@@ -47,65 +56,43 @@ define <2 x i1> @bitcast_i2_2i1(i2 zeroext %a0) {
define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) {
; SSE2-SSSE3-LABEL: bitcast_i4_4i1:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-SSSE3-NEXT: movd %eax, %xmm0
-; SSE2-SSSE3-NEXT: shrl %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm2
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrld $31, %xmm0
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: bitcast_i4_4i1:
-; AVX1: # BB#0:
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl %ecx
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $2, %ecx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shrl $3, %eax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: bitcast_i4_4i1:
-; AVX2: # BB#0:
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl %ecx
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $2, %ecx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: bitcast_i4_4i1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = bitcast i4 %a0 to <4 x i1>
@@ -114,86 +101,39 @@ define <4 x i1> @bitcast_i4_4i1(i4 zeroext %a0) {
define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
; SSE2-SSSE3-LABEL: bitcast_i8_8i1:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: shrl $7, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm3
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movd %edi, %xmm0
+; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: psrlw $15, %xmm0
; SSE2-SSSE3-NEXT: retq
;
-; AVX12-LABEL: bitcast_i8_8i1:
-; AVX12: # BB#0:
-; AVX12-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: movl %eax, %edx
-; AVX12-NEXT: andl $1, %edx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $2, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $3, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $4, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $5, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $6, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shrl $7, %eax
-; AVX12-NEXT: movzwl %ax, %eax
-; AVX12-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; AVX1-LABEL: bitcast_i8_8i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i8_8i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: bitcast_i8_8i1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: retq
@@ -202,159 +142,54 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) {
}
define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
-; SSE2-SSSE3-LABEL: bitcast_i16_16i1:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $7, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $6, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $5, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $4, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $3, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $2, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm0
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $11, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $10, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $9, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $8, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm1
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $13, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $12, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm3
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-SSSE3-NEXT: movl %eax, %ecx
-; SSE2-SSSE3-NEXT: shrl $14, %ecx
-; SSE2-SSSE3-NEXT: andl $1, %ecx
-; SSE2-SSSE3-NEXT: movd %ecx, %xmm2
-; SSE2-SSSE3-NEXT: shrl $15, %eax
-; SSE2-SSSE3-NEXT: movzwl %ax, %eax
-; SSE2-SSSE3-NEXT: movd %eax, %xmm4
-; SSE2-SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-SSSE3-NEXT: retq
+; SSE2-LABEL: bitcast_i16_16i1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: psrlw $7, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
;
-; AVX12-LABEL: bitcast_i16_16i1:
-; AVX12: # BB#0:
-; AVX12-NEXT: movw %di, -{{[0-9]+}}(%rsp)
-; AVX12-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: movl %eax, %edx
-; AVX12-NEXT: andl $1, %edx
-; AVX12-NEXT: vmovd %edx, %xmm0
-; AVX12-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $2, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $3, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $4, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $5, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $6, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $7, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $8, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $9, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $10, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $11, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $12, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $13, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: movl %eax, %ecx
-; AVX12-NEXT: shrl $14, %ecx
-; AVX12-NEXT: andl $1, %ecx
-; AVX12-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX12-NEXT: shrl $15, %eax
-; AVX12-NEXT: movzwl %ax, %eax
-; AVX12-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX12-NEXT: retq
+; SSSE3-LABEL: bitcast_i16_16i1:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd %edi, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; SSSE3-NEXT: pand %xmm1, %xmm0
+; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
+; SSSE3-NEXT: psrlw $7, %xmm0
+; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX1-LABEL: bitcast_i16_16i1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: bitcast_i16_16i1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: bitcast_i16_16i1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
; AVX512-NEXT: retq
@@ -364,297 +199,54 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
; SSE2-SSSE3-LABEL: bitcast_i32_32i1:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movl %esi, (%rdi)
; SSE2-SSSE3-NEXT: movq %rdi, %rax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: bitcast_i32_32i1:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $32, %rsp
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $17, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $18, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $19, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $20, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $21, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $22, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $23, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $25, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $26, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $27, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $28, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $29, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $30, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $31, %eax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movl %edi, %ecx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $2, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $3, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $5, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $6, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $7, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $9, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $10, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $11, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $12, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $13, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX1-NEXT: movl %edi, %eax
-; AVX1-NEXT: shrl $14, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX1-NEXT: shrl $15, %edi
-; AVX1-NEXT: andl $1, %edi
-; AVX1-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: bitcast_i32_32i1:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
-; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi1:
-; AVX2-NEXT: .cfi_offset %rbp, -16
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi2:
-; AVX2-NEXT: .cfi_def_cfa_register %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $32, %rsp
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $17, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $18, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $19, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $20, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $21, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $22, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $23, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $25, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $26, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $27, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $28, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $29, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $30, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $31, %eax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: movl %edi, %ecx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $2, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $4, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $5, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $6, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $7, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $9, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $10, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $11, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $12, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $13, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: shrl $14, %eax
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX2-NEXT: shrl $15, %edi
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: vpinsrb $15, %edi, %xmm1, %xmm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovd %edi, %xmm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: bitcast_i32_32i1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovd %edi, %k0
; AVX512-NEXT: vpmovm2b %k0, %ymm0
; AVX512-NEXT: retq
@@ -664,19 +256,19 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
define <64 x i1> @bitcast_i64_64i1(i64 %a0) {
; SSE2-SSSE3-LABEL: bitcast_i64_64i1:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movq %rsi, (%rdi)
; SSE2-SSSE3-NEXT: movq %rdi, %rax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: bitcast_i64_64i1:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: movq %rsi, (%rdi)
; AVX12-NEXT: movq %rdi, %rax
; AVX12-NEXT: retq
;
; AVX512-LABEL: bitcast_i64_64i1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: kmovq %rdi, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: retq
diff --git a/test/CodeGen/X86/bitcast-int-to-vector.ll b/test/CodeGen/X86/bitcast-int-to-vector.ll
index 4c25979dcd5e..64fcf3d4944a 100644
--- a/test/CodeGen/X86/bitcast-int-to-vector.ll
+++ b/test/CodeGen/X86/bitcast-int-to-vector.ll
@@ -1,7 +1,34 @@
-; RUN: llc < %s -march=x86
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
-define i1 @foo(i64 %a)
-{
+define i1 @foo(i64 %a) {
+; X86-LABEL: foo:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fucompp
+; X86-NEXT: fnstsw %ax
+; X86-NEXT: # kill: def %ah killed %ah killed %ax
+; X86-NEXT: sahf
+; X86-NEXT: setp %al
+; X86-NEXT: retl
+;
+; X86-SSE-LABEL: foo:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: ucomiss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT: setp %al
+; X86-SSE-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-NEXT: ucomiss %xmm1, %xmm0
+; X64-NEXT: setp %al
+; X64-NEXT: retq
%t = bitcast i64 %a to <2 x float>
%r = extractelement <2 x float> %t, i32 0
%s = extractelement <2 x float> %t, i32 1
diff --git a/test/CodeGen/X86/bitcast-mmx.ll b/test/CodeGen/X86/bitcast-mmx.ll
index f0318ede531a..9f612574a322 100644
--- a/test/CodeGen/X86/bitcast-mmx.ll
+++ b/test/CodeGen/X86/bitcast-mmx.ll
@@ -4,13 +4,13 @@
define i32 @t0(i64 %x) nounwind {
; X86-LABEL: t0:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pshufw $238, {{[0-9]+}}(%esp), %mm0 # mm0 = mem[2,3,2,3]
; X86-NEXT: movd %mm0, %eax
; X86-NEXT: retl
;
; X64-LABEL: t0:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %rdi, %mm0
; X64-NEXT: pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
; X64-NEXT: movd %mm0, %eax
@@ -29,7 +29,7 @@ entry:
define i64 @t1(i64 %x, i32 %n) nounwind {
; X86-LABEL: t1:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -45,7 +45,7 @@ define i64 @t1(i64 %x, i32 %n) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %mm0
; X64-NEXT: movd %rdi, %mm1
; X64-NEXT: psllq %mm0, %mm1
@@ -60,7 +60,7 @@ entry:
define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
; X86-LABEL: t2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -77,7 +77,7 @@ define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %mm0
; X64-NEXT: movd %edx, %mm1
; X64-NEXT: psllq %mm0, %mm1
@@ -98,7 +98,7 @@ entry:
define i64 @t3(<1 x i64>* %y, i32* %n) nounwind {
; X86-LABEL: t3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -116,7 +116,7 @@ define i64 @t3(<1 x i64>* %y, i32* %n) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psllq %mm1, %mm0
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 5616276da08d..7d0381837b70 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -3,40 +3,43 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512 --check-prefixes=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 --check-prefixes=AVX512BW
define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: v8i16:
-; SSE2: # BB#0:
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i16:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8i16:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i16:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: retq
%x = icmp sgt <8 x i16> %a, %b
%res = bitcast <8 x i1> %x to i8
ret i8 %res
@@ -44,26 +47,34 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-SSSE3-LABEL: v4i32:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i32> %a, %b
%res = bitcast <4 x i1> %x to i4
ret i4 %res
@@ -71,26 +82,34 @@ define i4 @v4i32(<4 x i32> %a, <4 x i32> %b) {
define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
; SSE2-SSSE3-LABEL: v4f32:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4f32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltps %xmm0, %xmm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = fcmp ogt <4 x float> %a, %b
%res = bitcast <4 x i1> %x to i4
ret i4 %res
@@ -98,25 +117,36 @@ define i4 @v4f32(<4 x float> %a, <4 x float> %b) {
define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-SSSE3-LABEL: v16i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %ax killed %ax killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v16i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %ax killed %ax killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512BW-NEXT: retq
%x = icmp sgt <16 x i8> %a, %b
%res = bitcast <16 x i1> %x to i16
ret i16 %res
@@ -124,7 +154,7 @@ define i16 @v16i8(<16 x i8> %a, <16 x i8> %b) {
define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-LABEL: v2i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: psllq $56, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -151,11 +181,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
@@ -168,11 +198,11 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $56, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $24, %xmm1, %xmm1
@@ -185,20 +215,32 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v2i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $56, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $56, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $56, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $56, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraq $56, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $56, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i8> %a, %b
%res = bitcast <2 x i1> %x to i2
ret i2 %res
@@ -206,7 +248,7 @@ define i2 @v2i8(<2 x i8> %a, <2 x i8> %b) {
define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-LABEL: v2i16:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: psllq $48, %xmm0
; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSE2-SSSE3-NEXT: psrad $31, %xmm2
@@ -233,11 +275,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
@@ -250,11 +292,11 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpsrad $16, %xmm1, %xmm1
@@ -267,20 +309,32 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v2i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $48, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $48, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $48, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraq $48, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllq $48, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $48, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i16> %a, %b
%res = bitcast <2 x i1> %x to i2
ret i2 %res
@@ -288,7 +342,7 @@ define i2 @v2i16(<2 x i16> %a, <2 x i16> %b) {
define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-LABEL: v2i32:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: psllq $32, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-SSSE3-NEXT: psrad $31, %xmm0
@@ -311,11 +365,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v2i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -326,11 +380,11 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovmskpd %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1
; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -341,20 +395,32 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskpd %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v2i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsraq $32, %xmm1, %xmm1
-; AVX512-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpsraq $32, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraq $32, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraq $32, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i32> %a, %b
%res = bitcast <2 x i1> %x to i2
ret i2 %res
@@ -362,7 +428,7 @@ define i2 @v2i32(<2 x i32> %a, <2 x i32> %b) {
define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-LABEL: v2i64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pxor %xmm2, %xmm0
@@ -375,23 +441,31 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2i64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2i64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtq %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <2 x i64> %a, %b
%res = bitcast <2 x i1> %x to i2
ret i2 %res
@@ -399,26 +473,34 @@ define i2 @v2i64(<2 x i64> %a, <2 x i64> %b) {
define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
; SSE2-SSSE3-LABEL: v2f64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm1
; SSE2-SSSE3-NEXT: movmskpd %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v2f64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX12-NEXT: vmovmskpd %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v2f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v2f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %xmm0, %xmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v2f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltpd %xmm0, %xmm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = fcmp ogt <2 x double> %a, %b
%res = bitcast <2 x i1> %x to i2
ret i2 %res
@@ -426,38 +508,50 @@ define i2 @v2f64(<2 x double> %a, <2 x double> %b) {
define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
; SSE2-SSSE3-LABEL: v4i8:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pslld $24, %xmm1
; SSE2-SSSE3-NEXT: psrad $24, %xmm1
; SSE2-SSSE3-NEXT: pslld $24, %xmm0
; SSE2-SSSE3-NEXT: psrad $24, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpslld $24, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $24, %xmm1, %xmm1
; AVX12-NEXT: vpslld $24, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $24, %xmm1, %xmm1
-; AVX512-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512F-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpslld $24, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX512BW-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i8> %a, %b
%res = bitcast <4 x i1> %x to i4
ret i4 %res
@@ -465,91 +559,105 @@ define i4 @v4i8(<4 x i8> %a, <4 x i8> %b) {
define i4 @v4i16(<4 x i16> %a, <4 x i16> %b) {
; SSE2-SSSE3-LABEL: v4i16:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pslld $16, %xmm1
; SSE2-SSSE3-NEXT: psrad $16, %xmm1
; SSE2-SSSE3-NEXT: pslld $16, %xmm0
; SSE2-SSSE3-NEXT: psrad $16, %xmm0
; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-SSSE3-NEXT: movmskps %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v4i16:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpslld $16, %xmm1, %xmm1
; AVX12-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX12-NEXT: vpslld $16, %xmm0, %xmm0
; AVX12-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX12-NEXT: vmovmskps %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v4i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpslld $16, %xmm1, %xmm1
-; AVX512-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX512-NEXT: vpslld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512F-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpslld $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX512BW-NEXT: vpslld $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i16> %a, %b
%res = bitcast <4 x i1> %x to i4
ret i4 %res
}
define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
-; SSE2-LABEL: v8i8:
-; SSE2: # BB#0:
-; SSE2-NEXT: psllw $8, %xmm1
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: psllw $8, %xmm0
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i8:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: psllw $8, %xmm1
-; SSSE3-NEXT: psraw $8, %xmm1
-; SSSE3-NEXT: psllw $8, %xmm0
-; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: psllw $8, %xmm1
+; SSE2-SSSE3-NEXT: psraw $8, %xmm1
+; SSE2-SSSE3-NEXT: psllw $8, %xmm0
+; SSE2-SSSE3-NEXT: psraw $8, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX12-LABEL: v8i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX12-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX12-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX12-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX12-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX12-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; AVX12-NEXT: vpmovmskb %xmm0, %eax
-; AVX12-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
; AVX12-NEXT: retq
;
-; AVX512-LABEL: v8i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX512-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512F-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: retq
%x = icmp sgt <8 x i8> %a, %b
%res = bitcast <8 x i1> %x to i8
ret i8 %res
diff --git a/test/CodeGen/X86/bitcast-setcc-256.ll b/test/CodeGen/X86/bitcast-setcc-256.ll
index 86475c42e79e..48e28c9d26ca 100644
--- a/test/CodeGen/X86/bitcast-setcc-256.ll
+++ b/test/CodeGen/X86/bitcast-setcc-256.ll
@@ -3,155 +3,149 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512 --check-prefixes=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 --check-prefixes=AVX512BW
define i16 @v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-SSSE3-LABEL: v16i16:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtw %xmm3, %xmm1
; SSE2-SSSE3-NEXT: pcmpgtw %xmm2, %xmm0
; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm0
; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSE2-SSSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %ax killed %ax killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x = icmp sgt <16 x i16> %a, %b
%res = bitcast <16 x i1> %x to i16
ret i16 %res
}
define i8 @v8i32(<8 x i32> %a, <8 x i32> %b) {
-; SSE2-LABEL: v8i32:
-; SSE2: # BB#0:
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: v8i32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: packsswb %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm0, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: v8i32:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v8i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v8i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x = icmp sgt <8 x i32> %a, %b
%res = bitcast <8 x i1> %x to i8
ret i8 %res
}
define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
-; SSE2-LABEL: v8f32:
-; SSE2: # BB#0:
-; SSE2-NEXT: cmpltps %xmm1, %xmm3
-; SSE2-NEXT: cmpltps %xmm0, %xmm2
-; SSE2-NEXT: packsswb %xmm3, %xmm2
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: pmovmskb %xmm2, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSE2-NEXT: retq
+; SSE2-SSSE3-LABEL: v8f32:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: cmpltps %xmm1, %xmm3
+; SSE2-SSSE3-NEXT: cmpltps %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pmovmskb %xmm2, %eax
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
+; SSE2-SSSE3-NEXT: retq
;
-; SSSE3-LABEL: v8f32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: cmpltps %xmm1, %xmm3
-; SSSE3-NEXT: cmpltps %xmm0, %xmm2
-; SSSE3-NEXT: packsswb %xmm3, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: pmovmskb %xmm2, %eax
-; SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; SSSE3-NEXT: retq
+; AVX12-LABEL: v8f32:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX12-NEXT: vmovmskps %ymm0, %eax
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
;
-; AVX1-LABEL: v8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: v8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovmskps %ymm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX512F-LABEL: v8f32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltps %ymm0, %ymm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
;
-; AVX512-LABEL: v8f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: v8f32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltps %ymm0, %ymm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x = fcmp ogt <8 x float> %a, %b
%res = bitcast <8 x i1> %x to i8
ret i8 %res
@@ -159,244 +153,66 @@ define i8 @v8f32(<8 x float> %a, <8 x float> %b) {
define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-SSSE3-LABEL: v32i8:
-; SSE2-SSSE3: # BB#0:
-; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: pcmpgtb %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; SSE2-SSSE3-NEXT: movb -{{[0-9]+}}(%rsp), %cl
-; SSE2-SSSE3-NEXT: andb $1, %cl
-; SSE2-SSSE3-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: andb $1, %al
-; SSE2-SSSE3-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE2-SSSE3-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-SSSE3-NEXT: shll $16, %ecx
-; SSE2-SSSE3-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-SSSE3-NEXT: pcmpgtb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pmovmskb %xmm1, %eax
+; SSE2-SSSE3-NEXT: shll $16, %eax
; SSE2-SSSE3-NEXT: orl %ecx, %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v32i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $32, %rsp
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpextrb $15, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm2, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpmovmskb %xmm2, %ecx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $15, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $13, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $11, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $9, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $7, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $5, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $3, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: andb $1, %al
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: movl (%rsp), %eax
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v32i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-NEXT: .cfi_offset %rbp, -16
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: .cfi_def_cfa_register %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $32, %rsp
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, (%rsp)
+; AVX512F-NEXT: movl (%rsp), %eax
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x = icmp sgt <32 x i8> %a, %b
%res = bitcast <32 x i1> %x to i32
ret i32 %res
@@ -404,7 +220,7 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8> %b) {
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-SSSE3-LABEL: v4i64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm3
; SSE2-SSSE3-NEXT: pxor %xmm4, %xmm1
@@ -426,39 +242,48 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-SSSE3-NEXT: por %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm1
; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
; AVX1-LABEL: v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskps %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskpd %ymm0, %eax
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: v4i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v4i64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtq %ymm1, %ymm0, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x = icmp sgt <4 x i64> %a, %b
%res = bitcast <4 x i1> %x to i4
ret i4 %res
@@ -466,40 +291,39 @@ define i4 @v4i64(<4 x i64> %a, <4 x i64> %b) {
define i4 @v4f64(<4 x double> %a, <4 x double> %b) {
; SSE2-SSSE3-LABEL: v4f64:
-; SSE2-SSSE3: # BB#0:
+; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: cmpltpd %xmm1, %xmm3
; SSE2-SSSE3-NEXT: cmpltpd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: packsswb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSE2-SSSE3-NEXT: movmskps %xmm2, %eax
-; SSE2-SSSE3-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-SSSE3-NEXT: # kill: def %al killed %al killed %eax
; SSE2-SSSE3-NEXT: retq
;
-; AVX1-LABEL: v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovmskps %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
+; AVX12-LABEL: v4f64:
+; AVX12: # %bb.0:
+; AVX12-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX12-NEXT: vmovmskpd %ymm0, %eax
+; AVX12-NEXT: # kill: def %al killed %al killed %eax
+; AVX12-NEXT: vzeroupper
+; AVX12-NEXT: retq
;
-; AVX2-LABEL: v4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovmskpd %ymm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX512F-LABEL: v4f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
;
-; AVX512-LABEL: v4f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k0
-; AVX512-NEXT: kmovd %k0, %eax
-; AVX512-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movb -{{[0-9]+}}(%rsp), %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512BW-LABEL: v4f64:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltpd %ymm0, %ymm1, %k0
+; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT: movb -{{[0-9]+}}(%rsp), %al
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%x = fcmp ogt <4 x double> %a, %b
%res = bitcast <4 x i1> %x to i4
ret i4 %res
diff --git a/test/CodeGen/X86/bitcast-setcc-512.ll b/test/CodeGen/X86/bitcast-setcc-512.ll
index 4a5ef99a8653..9914f0b93434 100644
--- a/test/CodeGen/X86/bitcast-setcc-512.ll
+++ b/test/CodeGen/X86/bitcast-setcc-512.ll
@@ -7,237 +7,40 @@
define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: v32i16:
-; SSE: # BB#0:
-; SSE-NEXT: pcmpgtw %xmm7, %xmm3
-; SSE-NEXT: pextrb $14, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $12, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $10, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $8, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $6, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $4, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $2, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $0, %xmm3, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pcmpgtw %xmm6, %xmm2
-; SSE-NEXT: pextrb $14, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $12, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $10, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $8, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $6, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $4, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $2, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $0, %xmm2, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm5, %xmm1
-; SSE-NEXT: pextrb $14, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $12, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $10, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $8, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $6, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $4, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $2, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $0, %xmm1, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
; SSE-NEXT: pcmpgtw %xmm4, %xmm0
-; SSE-NEXT: pextrb $14, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $12, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $10, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $8, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $6, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $4, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $2, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: pextrb $0, %xmm0, %eax
-; SSE-NEXT: andb $1, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT: shll $16, %ecx
-; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: pcmpgtw %xmm7, %xmm3
+; SSE-NEXT: pcmpgtw %xmm6, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: shll $16, %eax
; SSE-NEXT: orl %ecx, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v32i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
-; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi1:
-; AVX1-NEXT: .cfi_offset %rbp, -16
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi2:
-; AVX1-NEXT: .cfi_def_cfa_register %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $32, %rsp
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpextrb $14, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm4, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpcmpgtw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpextrb $14, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm1, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $14, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $12, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $10, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $8, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $6, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $4, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $2, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: movb %al, (%rsp)
-; AVX1-NEXT: movl (%rsp), %eax
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -247,155 +50,24 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: .Lcfi0:
; AVX512F-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-NEXT: .Lcfi1:
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: .Lcfi2:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -407,7 +79,7 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: vzeroupper
@@ -419,38 +91,38 @@ define i32 @v32i16(<32 x i16> %a, <32 x i16> %b) {
define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) {
; SSE-LABEL: v16i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm7, %xmm3
; SSE-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: pcmpgtd %xmm5, %xmm1
; SSE-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -458,23 +130,23 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v16i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v16i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x = icmp sgt <16 x i32> %a, %b
@@ -484,34 +156,34 @@ define i16 @v16i32(<16 x i32> %a, <16 x i32> %b) {
define i16 @v16f32(<16 x float> %a, <16 x float> %b) {
; SSE-LABEL: v16f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm3, %xmm7
; SSE-NEXT: cmpltps %xmm2, %xmm6
-; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: packssdw %xmm7, %xmm6
; SSE-NEXT: cmpltps %xmm1, %xmm5
; SSE-NEXT: cmpltps %xmm0, %xmm4
-; SSE-NEXT: packsswb %xmm5, %xmm4
+; SSE-NEXT: packssdw %xmm5, %xmm4
; SSE-NEXT: packsswb %xmm6, %xmm4
; SSE-NEXT: pmovmskb %xmm4, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v16f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -519,23 +191,23 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b) {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v16f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v16f32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x = fcmp ogt <16 x float> %a, %b
@@ -545,7 +217,7 @@ define i16 @v16f32(<16 x float> %a, <16 x float> %b) {
define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: v64i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm5, %xmm1
; SSE-NEXT: pextrb $15, %xmm1, %eax
; SSE-NEXT: andb $1, %al
@@ -755,14 +427,11 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi3:
; AVX1-NEXT: .cfi_def_cfa_offset 16
-; AVX1-NEXT: .Lcfi4:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: .Lcfi5:
; AVX1-NEXT: .cfi_def_cfa_register %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
@@ -976,14 +645,11 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
; AVX2-NEXT: .cfi_def_cfa_offset 16
-; AVX2-NEXT: .Lcfi1:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: .Lcfi2:
; AVX2-NEXT: .cfi_def_cfa_register %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
@@ -1193,14 +859,11 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: .Lcfi3:
; AVX512F-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-NEXT: .Lcfi4:
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: .Lcfi5:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
@@ -1234,7 +897,7 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: vzeroupper
@@ -1246,65 +909,61 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8> %b) {
define i8 @v8i64(<8 x i64> %a, <8 x i64> %b) {
; SSE-LABEL: v8i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm7, %xmm3
; SSE-NEXT: pcmpgtq %xmm6, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: pcmpgtq %xmm5, %xmm1
; SSE-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v8i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v8i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x = icmp sgt <8 x i64> %a, %b
@@ -1314,61 +973,57 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b) {
define i8 @v8f64(<8 x double> %a, <8 x double> %b) {
; SSE-LABEL: v8f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm3, %xmm7
; SSE-NEXT: cmpltpd %xmm2, %xmm6
-; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: packssdw %xmm7, %xmm6
; SSE-NEXT: cmpltpd %xmm1, %xmm5
; SSE-NEXT: cmpltpd %xmm0, %xmm4
-; SSE-NEXT: packsswb %xmm5, %xmm4
-; SSE-NEXT: packsswb %xmm6, %xmm4
-; SSE-NEXT: pshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE-NEXT: packssdw %xmm5, %xmm4
+; SSE-NEXT: packssdw %xmm6, %xmm4
+; SSE-NEXT: packsswb %xmm0, %xmm4
; SSE-NEXT: pmovmskb %xmm4, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: v8f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpmovmskb %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpmovmskb %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: vmovmskps %ymm0, %eax
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v8f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v8f64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT: # kill: def %al killed %al killed %eax
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%x = fcmp ogt <8 x double> %a, %b
diff --git a/test/CodeGen/X86/bitcast.ll b/test/CodeGen/X86/bitcast.ll
index c34c6753bfed..0866a0b1b2bd 100644
--- a/test/CodeGen/X86/bitcast.ll
+++ b/test/CodeGen/X86/bitcast.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=i686--
+; RUN: llc < %s -mtriple=x86_64--
; PR1033
define i64 @test1(double %t) {
diff --git a/test/CodeGen/X86/bitcast2.ll b/test/CodeGen/X86/bitcast2.ll
index b75db95869c2..febd7ba12dbb 100644
--- a/test/CodeGen/X86/bitcast2.ll
+++ b/test/CodeGen/X86/bitcast2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movq | count 2
-; RUN: llc < %s -march=x86-64 -mattr=-avx | not grep rsp
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-avx | grep movq | count 2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-avx | not grep rsp
define i64 @test1(double %A) {
%B = bitcast double %A to i64
diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll
index 06daf014c151..a393db30c9f2 100644
--- a/test/CodeGen/X86/bitreverse.ll
+++ b/test/CodeGen/X86/bitreverse.ll
@@ -9,9 +9,9 @@ declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X86-LABEL: test_bitreverse_v2i16:
-; X86: # BB#0:
-; X86-NEXT: movw {{[0-9]+}}(%esp), %cx
-; X86-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $8, %ax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: andl $3855, %edx # imm = 0xF0F
@@ -46,12 +46,12 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X86-NEXT: andl $43690, %ecx # imm = 0xAAAA
; X86-NEXT: shrl %ecx
; X86-NEXT: leal (%ecx,%edx,2), %edx
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; X86-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
+; X86-NEXT: # kill: def %dx killed %dx killed %edx
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_v2i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -98,7 +98,7 @@ declare i64 @llvm.bitreverse.i64(i64) readnone
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; X86-LABEL: test_bitreverse_i64:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: bswapl %eax
@@ -138,7 +138,7 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bswapq %rdi
; X64-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT: andq %rdi, %rax
@@ -168,7 +168,7 @@ declare i32 @llvm.bitreverse.i32(i32) readnone
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; X86-LABEL: test_bitreverse_i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: bswapl %eax
; X86-NEXT: movl %eax, %ecx
@@ -190,8 +190,8 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: bswapl %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
@@ -218,7 +218,7 @@ declare i24 @llvm.bitreverse.i24(i24) readnone
define i24 @test_bitreverse_i24(i24 %a) nounwind {
; X86-LABEL: test_bitreverse_i24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: bswapl %eax
; X86-NEXT: movl %eax, %ecx
@@ -241,8 +241,8 @@ define i24 @test_bitreverse_i24(i24 %a) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i24:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: bswapl %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
@@ -270,8 +270,8 @@ declare i16 @llvm.bitreverse.i16(i16) readnone
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X86-LABEL: test_bitreverse_i16:
-; X86: # BB#0:
-; X86-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $8, %ax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: andl $3855, %ecx # imm = 0xF0F
@@ -289,12 +289,12 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X86-NEXT: andl $43690, %eax # imm = 0xAAAA
; X86-NEXT: shrl %eax
; X86-NEXT: leal (%eax,%ecx,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i16:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: rolw $8, %di
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andl $3855, %eax # imm = 0xF0F
@@ -312,7 +312,7 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X64-NEXT: andl $43690, %eax # imm = 0xAAAA
; X64-NEXT: shrl %eax
; X64-NEXT: leal (%rax,%rcx,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%b = call i16 @llvm.bitreverse.i16(i16 %a)
ret i16 %b
@@ -322,7 +322,7 @@ declare i8 @llvm.bitreverse.i8(i8) readnone
define i8 @test_bitreverse_i8(i8 %a) {
; X86-LABEL: test_bitreverse_i8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: rolb $4, %al
; X86-NEXT: movl %eax, %ecx
@@ -340,7 +340,7 @@ define i8 @test_bitreverse_i8(i8 %a) {
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rolb $4, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
@@ -364,7 +364,7 @@ declare i4 @llvm.bitreverse.i4(i4) readnone
define i4 @test_bitreverse_i4(i4 %a) {
; X86-LABEL: test_bitreverse_i4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: rolb $4, %al
; X86-NEXT: movl %eax, %ecx
@@ -383,7 +383,7 @@ define i4 @test_bitreverse_i4(i4 %a) {
; X86-NEXT: retl
;
; X64-LABEL: test_bitreverse_i4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rolb $4, %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: andb $51, %al
@@ -408,13 +408,13 @@ define i4 @test_bitreverse_i4(i4 %a) {
define <2 x i16> @fold_v2i16() {
; X86-LABEL: fold_v2i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movw $-4096, %ax # imm = 0xF000
; X86-NEXT: movw $240, %dx
; X86-NEXT: retl
;
; X64-LABEL: fold_v2i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240]
; X64-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
@@ -423,12 +423,12 @@ define <2 x i16> @fold_v2i16() {
define i24 @fold_i24() {
; X86-LABEL: fold_i24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $2048, %eax # imm = 0x800
; X86-NEXT: retl
;
; X64-LABEL: fold_i24:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $2048, %eax # imm = 0x800
; X64-NEXT: retq
%b = call i24 @llvm.bitreverse.i24(i24 4096)
@@ -437,12 +437,12 @@ define i24 @fold_i24() {
define i8 @fold_i8() {
; X86-LABEL: fold_i8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movb $-16, %al
; X86-NEXT: retl
;
; X64-LABEL: fold_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movb $-16, %al
; X64-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 15)
@@ -451,12 +451,12 @@ define i8 @fold_i8() {
define i4 @fold_i4() {
; X86-LABEL: fold_i4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movb $1, %al
; X86-NEXT: retl
;
; X64-LABEL: fold_i4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movb $1, %al
; X64-NEXT: retq
%b = call i4 @llvm.bitreverse.i4(i4 8)
@@ -467,12 +467,12 @@ define i4 @fold_i4() {
define i8 @identity_i8(i8 %a) {
; X86-LABEL: identity_i8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: retl
;
; X64-LABEL: identity_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 %a)
@@ -482,13 +482,13 @@ define i8 @identity_i8(i8 %a) {
define <2 x i16> @identity_v2i16(<2 x i16> %a) {
; X86-LABEL: identity_v2i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-NEXT: retl
;
; X64-LABEL: identity_v2i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
%c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b)
@@ -499,11 +499,11 @@ define <2 x i16> @identity_v2i16(<2 x i16> %a) {
define i8 @undef_i8() {
; X86-LABEL: undef_i8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: retl
;
; X64-LABEL: undef_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 undef)
ret i8 %b
@@ -511,11 +511,11 @@ define i8 @undef_i8() {
define <2 x i16> @undef_v2i16() {
; X86-LABEL: undef_v2i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: retl
;
; X64-LABEL: undef_v2i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
ret <2 x i16> %b
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index b3f6534d14b3..aa2e9aac433d 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -474,11 +474,11 @@ define void @fpcmp_unanalyzable_branch(i1 %cond) {
; edge in 'entry' -> 'entry.if.then_crit_edge' -> 'if.then' -> 'if.end' is
; fall-through.
; CHECK-LABEL: fpcmp_unanalyzable_branch:
-; CHECK: # BB#0: # %entry
-; CHECK: # BB#1: # %entry.if.then_crit_edge
+; CHECK: # %bb.0: # %entry
+; CHECK: # %bb.1: # %entry.if.then_crit_edge
; CHECK: .LBB10_5: # %if.then
; CHECK: .LBB10_6: # %if.end
-; CHECK: # BB#3: # %exit
+; CHECK: # %bb.3: # %exit
; CHECK: jne .LBB10_4
; CHECK-NEXT: jnp .LBB10_6
; CHECK: jmp .LBB10_5
@@ -943,7 +943,7 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
; 2) The exiting edge from the loop which is rotated to be laid out at the
; bottom of the loop needs to be exiting into the nearest enclosing loop (to
; which there is an exit). Otherwise, we force that enclosing loop into
-; strange layouts that are siginificantly less efficient, often times maing
+; strange layouts that are significantly less efficient, oftentimes making
; it discontiguous.
;
; CHECK-LABEL: @benchmark_heapsort
diff --git a/test/CodeGen/X86/block-placement.mir b/test/CodeGen/X86/block-placement.mir
index c0cd7057d5c6..600bc13f14ca 100644
--- a/test/CodeGen/X86/block-placement.mir
+++ b/test/CodeGen/X86/block-placement.mir
@@ -46,28 +46,28 @@ liveins:
- { reg: '%rdi' }
- { reg: '%esi' }
-# CHECK: %eax = FAULTING_OP 1, %bb.3.null, 1684, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
-# CHECK-NEXT: JMP_1 %bb.2.not_null
+# CHECK: %eax = FAULTING_OP 1, %bb.3, 1684, killed %rdi, 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr)
+# CHECK-NEXT: JMP_1 %bb.2
# CHECK: bb.3.null:
# CHECK: bb.4.right:
# CHECK: bb.2.not_null:
body: |
bb.0.entry:
- successors: %bb.1.left(0x7ffff800), %bb.3.right(0x00000800)
+ successors: %bb.1(0x7ffff800), %bb.3(0x00000800)
liveins: %esi, %rdi
frame-setup PUSH64r undef %rax, implicit-def %rsp, implicit %rsp
CFI_INSTRUCTION def_cfa_offset 16
TEST8ri %sil, 1, implicit-def %eflags, implicit killed %esi
- JE_1 %bb.3.right, implicit killed %eflags
+ JE_1 %bb.3, implicit killed %eflags
bb.1.left:
- successors: %bb.2.null(0x7ffff800), %bb.4.not_null(0x00000800)
+ successors: %bb.2(0x7ffff800), %bb.4(0x00000800)
liveins: %rdi
- %eax = FAULTING_OP 1, %bb.2.null, 1684, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
- JMP_1 %bb.4.not_null
+ %eax = FAULTING_OP 1, %bb.2, 1684, killed %rdi, 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr)
+ JMP_1 %bb.4
bb.4.not_null:
liveins: %rdi, %eax
diff --git a/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
index 8b15a1591b67..f86df57b687e 100644
--- a/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/bmi-intrinsics-fast-isel-x86_64.ll
@@ -9,7 +9,7 @@
define i64 @test__andn_u64(i64 %a0, i64 %a1) {
; X64-LABEL: test__andn_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorq $-1, %rdi
; X64-NEXT: andq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -21,7 +21,7 @@ define i64 @test__andn_u64(i64 %a0, i64 %a1) {
define i64 @test__bextr_u64(i64 %a0, i64 %a1) {
; X64-LABEL: test__bextr_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bextrq %rsi, %rdi, %rax
; X64-NEXT: retq
%res = call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1)
@@ -30,7 +30,7 @@ define i64 @test__bextr_u64(i64 %a0, i64 %a1) {
define i64 @test__blsi_u64(i64 %a0) {
; X64-LABEL: test__blsi_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: subq %rdi, %rax
; X64-NEXT: andq %rdi, %rax
@@ -42,7 +42,7 @@ define i64 @test__blsi_u64(i64 %a0) {
define i64 @test__blsmsk_u64(i64 %a0) {
; X64-LABEL: test__blsmsk_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: subq $1, %rax
; X64-NEXT: xorq %rdi, %rax
@@ -54,7 +54,7 @@ define i64 @test__blsmsk_u64(i64 %a0) {
define i64 @test__blsr_u64(i64 %a0) {
; X64-LABEL: test__blsr_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: subq $1, %rax
; X64-NEXT: andq %rdi, %rax
@@ -66,7 +66,7 @@ define i64 @test__blsr_u64(i64 %a0) {
define i64 @test__tzcnt_u64(i64 %a0) {
; X64-LABEL: test__tzcnt_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $64, %ecx
; X64-NEXT: tzcntq %rdi, %rax
; X64-NEXT: cmovbq %rcx, %rax
@@ -83,7 +83,7 @@ define i64 @test__tzcnt_u64(i64 %a0) {
define i64 @test_andn_u64(i64 %a0, i64 %a1) {
; X64-LABEL: test_andn_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorq $-1, %rdi
; X64-NEXT: andq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
@@ -95,7 +95,7 @@ define i64 @test_andn_u64(i64 %a0, i64 %a1) {
define i64 @test_bextr_u64(i64 %a0, i32 %a1, i32 %a2) {
; X64-LABEL: test_bextr_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $255, %esi
; X64-NEXT: andl $255, %edx
; X64-NEXT: shll $8, %edx
@@ -114,7 +114,7 @@ define i64 @test_bextr_u64(i64 %a0, i32 %a1, i32 %a2) {
define i64 @test_blsi_u64(i64 %a0) {
; X64-LABEL: test_blsi_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: subq %rdi, %rax
; X64-NEXT: andq %rdi, %rax
@@ -126,7 +126,7 @@ define i64 @test_blsi_u64(i64 %a0) {
define i64 @test_blsmsk_u64(i64 %a0) {
; X64-LABEL: test_blsmsk_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: subq $1, %rax
; X64-NEXT: xorq %rdi, %rax
@@ -138,7 +138,7 @@ define i64 @test_blsmsk_u64(i64 %a0) {
define i64 @test_blsr_u64(i64 %a0) {
; X64-LABEL: test_blsr_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: subq $1, %rax
; X64-NEXT: andq %rdi, %rax
@@ -150,7 +150,7 @@ define i64 @test_blsr_u64(i64 %a0) {
define i64 @test_tzcnt_u64(i64 %a0) {
; X64-LABEL: test_tzcnt_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $64, %ecx
; X64-NEXT: tzcntq %rdi, %rax
; X64-NEXT: cmovbq %rcx, %rax
diff --git a/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll b/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
index 2b889dd054fa..3c183a59f9cd 100644
--- a/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
@@ -10,12 +10,12 @@
define i16 @test__tzcnt_u16(i16 %a0) {
; X32-LABEL: test__tzcnt_u16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl %ax, %ecx
; X32-NEXT: cmpl $0, %ecx
; X32-NEXT: jne .LBB0_1
-; X32-NEXT: # BB#2:
+; X32-NEXT: # %bb.2:
; X32-NEXT: movw $16, %ax
; X32-NEXT: retl
; X32-NEXT: .LBB0_1:
@@ -23,7 +23,7 @@ define i16 @test__tzcnt_u16(i16 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__tzcnt_u16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movw $16, %cx
; X64-NEXT: movzwl %di, %edx
; X64-NEXT: tzcntw %dx, %ax
@@ -39,14 +39,14 @@ define i16 @test__tzcnt_u16(i16 %a0) {
define i32 @test__andn_u32(i32 %a0, i32 %a1) {
; X32-LABEL: test__andn_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl $-1, %eax
; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test__andn_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl $-1, %edi
; X64-NEXT: andl %esi, %edi
; X64-NEXT: movl %edi, %eax
@@ -58,13 +58,13 @@ define i32 @test__andn_u32(i32 %a0, i32 %a1) {
define i32 @test__bextr_u32(i32 %a0, i32 %a1) {
; X32-LABEL: test__bextr_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: bextrl %eax, {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test__bextr_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bextrl %esi, %edi, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1)
@@ -73,7 +73,7 @@ define i32 @test__bextr_u32(i32 %a0, i32 %a1) {
define i32 @test__blsi_u32(i32 %a0) {
; X32-LABEL: test__blsi_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: subl %ecx, %eax
@@ -81,7 +81,7 @@ define i32 @test__blsi_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blsi_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: subl %edi, %eax
; X64-NEXT: andl %edi, %eax
@@ -93,7 +93,7 @@ define i32 @test__blsi_u32(i32 %a0) {
define i32 @test__blsmsk_u32(i32 %a0) {
; X32-LABEL: test__blsmsk_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: subl $1, %eax
@@ -101,7 +101,7 @@ define i32 @test__blsmsk_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blsmsk_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: subl $1, %eax
; X64-NEXT: xorl %edi, %eax
@@ -113,7 +113,7 @@ define i32 @test__blsmsk_u32(i32 %a0) {
define i32 @test__blsr_u32(i32 %a0) {
; X32-LABEL: test__blsr_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: subl $1, %eax
@@ -121,7 +121,7 @@ define i32 @test__blsr_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blsr_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: subl $1, %eax
; X64-NEXT: andl %edi, %eax
@@ -133,11 +133,11 @@ define i32 @test__blsr_u32(i32 %a0) {
define i32 @test__tzcnt_u32(i32 %a0) {
; X32-LABEL: test__tzcnt_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cmpl $0, %eax
; X32-NEXT: jne .LBB6_1
-; X32-NEXT: # BB#2:
+; X32-NEXT: # %bb.2:
; X32-NEXT: movl $32, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB6_1:
@@ -145,7 +145,7 @@ define i32 @test__tzcnt_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__tzcnt_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $32, %ecx
; X64-NEXT: tzcntl %edi, %eax
; X64-NEXT: cmovbl %ecx, %eax
@@ -162,12 +162,12 @@ define i32 @test__tzcnt_u32(i32 %a0) {
define i16 @test_tzcnt_u16(i16 %a0) {
; X32-LABEL: test_tzcnt_u16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl %ax, %ecx
; X32-NEXT: cmpl $0, %ecx
; X32-NEXT: jne .LBB7_1
-; X32-NEXT: # BB#2:
+; X32-NEXT: # %bb.2:
; X32-NEXT: movw $16, %ax
; X32-NEXT: retl
; X32-NEXT: .LBB7_1:
@@ -175,7 +175,7 @@ define i16 @test_tzcnt_u16(i16 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test_tzcnt_u16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movw $16, %cx
; X64-NEXT: movzwl %di, %edx
; X64-NEXT: tzcntw %dx, %ax
@@ -191,14 +191,14 @@ define i16 @test_tzcnt_u16(i16 %a0) {
define i32 @test_andn_u32(i32 %a0, i32 %a1) {
; X32-LABEL: test_andn_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl $-1, %eax
; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_andn_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl $-1, %edi
; X64-NEXT: andl %esi, %edi
; X64-NEXT: movl %edi, %eax
@@ -210,7 +210,7 @@ define i32 @test_andn_u32(i32 %a0, i32 %a1) {
define i32 @test_bextr_u32(i32 %a0, i32 %a1, i32 %a2) {
; X32-LABEL: test_bextr_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: andl $255, %ecx
@@ -221,7 +221,7 @@ define i32 @test_bextr_u32(i32 %a0, i32 %a1, i32 %a2) {
; X32-NEXT: retl
;
; X64-LABEL: test_bextr_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $255, %esi
; X64-NEXT: andl $255, %edx
; X64-NEXT: shll $8, %edx
@@ -238,7 +238,7 @@ define i32 @test_bextr_u32(i32 %a0, i32 %a1, i32 %a2) {
define i32 @test_blsi_u32(i32 %a0) {
; X32-LABEL: test_blsi_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: subl %ecx, %eax
@@ -246,7 +246,7 @@ define i32 @test_blsi_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test_blsi_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: subl %edi, %eax
; X64-NEXT: andl %edi, %eax
@@ -258,7 +258,7 @@ define i32 @test_blsi_u32(i32 %a0) {
define i32 @test_blsmsk_u32(i32 %a0) {
; X32-LABEL: test_blsmsk_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: subl $1, %eax
@@ -266,7 +266,7 @@ define i32 @test_blsmsk_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test_blsmsk_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: subl $1, %eax
; X64-NEXT: xorl %edi, %eax
@@ -278,7 +278,7 @@ define i32 @test_blsmsk_u32(i32 %a0) {
define i32 @test_blsr_u32(i32 %a0) {
; X32-LABEL: test_blsr_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: subl $1, %eax
@@ -286,7 +286,7 @@ define i32 @test_blsr_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test_blsr_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: subl $1, %eax
; X64-NEXT: andl %edi, %eax
@@ -298,11 +298,11 @@ define i32 @test_blsr_u32(i32 %a0) {
define i32 @test_tzcnt_u32(i32 %a0) {
; X32-LABEL: test_tzcnt_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cmpl $0, %eax
; X32-NEXT: jne .LBB13_1
-; X32-NEXT: # BB#2:
+; X32-NEXT: # %bb.2:
; X32-NEXT: movl $32, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB13_1:
@@ -310,7 +310,7 @@ define i32 @test_tzcnt_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test_tzcnt_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $32, %ecx
; X64-NEXT: tzcntl %edi, %eax
; X64-NEXT: cmovbl %ecx, %eax
diff --git a/test/CodeGen/X86/bmi-schedule.ll b/test/CodeGen/X86/bmi-schedule.ll
index 75be2d9c0f01..8d41a5100aea 100644
--- a/test/CodeGen/X86/bmi-schedule.ll
+++ b/test/CodeGen/X86/bmi-schedule.ll
@@ -1,47 +1,66 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) {
; GENERIC-LABEL: test_andn_i16:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andnl %esi, %edi, %eax
-; GENERIC-NEXT: notl %edi
-; GENERIC-NEXT: andw (%rdx), %di
-; GENERIC-NEXT: addl %edi, %eax
-; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andnl %esi, %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: notl %edi # sched: [1:0.33]
+; GENERIC-NEXT: andw (%rdx), %di # sched: [6:0.50]
+; GENERIC-NEXT: addl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andn_i16:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: notl %edi # sched: [1:0.25]
-; HASWELL-NEXT: andw (%rdx), %di # sched: [5:0.50]
+; HASWELL-NEXT: andw (%rdx), %di # sched: [6:0.50]
; HASWELL-NEXT: addl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andn_i16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: notl %edi # sched: [1:0.25]
+; BROADWELL-NEXT: andw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: addl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andn_i16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: notl %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: andw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: addl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andn_i16:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: andnl %esi, %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: notl %edi # sched: [1:0.50]
; BTVER2-NEXT: andw (%rdx), %di # sched: [4:1.00]
; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50]
-; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andn_i16:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: notl %edi # sched: [1:0.25]
; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50]
; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i16, i16 *%a2
%2 = xor i16 %a0, -1
%3 = and i16 %2, %a1
@@ -52,32 +71,46 @@ define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) {
define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_andn_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andnl %esi, %edi, %ecx
-; GENERIC-NEXT: andnl (%rdx), %edi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: andnl (%rdx), %edi, %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andn_i32:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [4:0.50]
+; HASWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [6:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andn_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50]
+; BROADWELL-NEXT: andnl (%rdx), %edi, %eax # sched: [6:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andn_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50]
+; SKYLAKE-NEXT: andnl (%rdx), %edi, %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andn_i32:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: andnl (%rdx), %edi, %eax # sched: [4:1.00]
; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andn_i32:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: andnl (%rdx), %edi, %eax # sched: [5:0.50]
; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a2
%2 = xor i32 %a0, -1
%3 = and i32 %2, %a1
@@ -88,32 +121,46 @@ define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_andn_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andnq %rsi, %rdi, %rcx
-; GENERIC-NEXT: andnq (%rdx), %rdi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andn_i64:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:0.50]
+; HASWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [6:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andn_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50]
+; BROADWELL-NEXT: andnq (%rdx), %rdi, %rax # sched: [6:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andn_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50]
+; SKYLAKE-NEXT: andnq (%rdx), %rdi, %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andn_i64:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: andnq (%rdx), %rdi, %rax # sched: [4:1.00]
; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andn_i64:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50]
; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a2
%2 = xor i64 %a0, -1
%3 = and i64 %2, %a1
@@ -124,32 +171,46 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_bextr_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx
-; GENERIC-NEXT: bextrl %edi, %esi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [5:0.50]
+; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_bextr_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [7:0.50]
; HASWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bextr_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: bextrl %edi, (%rdx), %ecx # sched: [7:0.50]
+; BROADWELL-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bextr_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: bextrl %edi, (%rdx), %ecx # sched: [7:0.50]
+; SKYLAKE-NEXT: bextrl %edi, %esi, %eax # sched: [2:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_bextr_i32:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: bextrl %edi, (%rdx), %ecx # sched: [4:1.00]
+; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [1:0.50]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_bextr_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [5:0.50]
+; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0)
%3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0)
@@ -160,32 +221,46 @@ declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_bextr_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx
-; GENERIC-NEXT: bextrq %rdi, %rsi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [5:0.50]
+; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_bextr_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [7:0.50]
; HASWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bextr_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [7:0.50]
+; BROADWELL-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bextr_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [7:0.50]
+; SKYLAKE-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_bextr_i64:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [1:0.50]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_bextr_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [5:0.50]
+; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [1:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0)
%3 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0)
@@ -196,32 +271,46 @@ declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_blsi_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blsil (%rsi), %ecx
-; GENERIC-NEXT: blsil %edi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsil (%rsi), %ecx # sched: [5:0.50]
+; GENERIC-NEXT: blsil %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blsi_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: blsil (%rsi), %ecx # sched: [6:0.50]
; HASWELL-NEXT: blsil %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blsi_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: blsil (%rsi), %ecx # sched: [6:0.50]
+; BROADWELL-NEXT: blsil %edi, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blsi_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: blsil (%rsi), %ecx # sched: [6:0.50]
+; SKYLAKE-NEXT: blsil %edi, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blsi_i32:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: blsil (%rsi), %ecx # sched: [4:1.00]
+; BTVER2-NEXT: blsil %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsi_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [6:0.50]
+; ZNVER1-NEXT: blsil %edi, %eax # sched: [2:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a1
%2 = sub i32 0, %1
%3 = sub i32 0, %a0
@@ -233,32 +322,46 @@ define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_blsi_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blsiq (%rsi), %rcx
-; GENERIC-NEXT: blsiq %rdi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsiq (%rsi), %rcx # sched: [5:0.50]
+; GENERIC-NEXT: blsiq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blsi_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: blsiq (%rsi), %rcx # sched: [6:0.50]
; HASWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blsi_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: blsiq (%rsi), %rcx # sched: [6:0.50]
+; BROADWELL-NEXT: blsiq %rdi, %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blsi_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: blsiq (%rsi), %rcx # sched: [6:0.50]
+; SKYLAKE-NEXT: blsiq %rdi, %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blsi_i64:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: blsiq (%rsi), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: blsiq %rdi, %rax # sched: [1:0.50]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsi_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [6:0.50]
+; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [2:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a1
%2 = sub i64 0, %1
%3 = sub i64 0, %a0
@@ -270,32 +373,46 @@ define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_blsmsk_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blsmskl (%rsi), %ecx
-; GENERIC-NEXT: blsmskl %edi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsmskl (%rsi), %ecx # sched: [5:0.50]
+; GENERIC-NEXT: blsmskl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blsmsk_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: blsmskl (%rsi), %ecx # sched: [6:0.50]
; HASWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blsmsk_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: blsmskl (%rsi), %ecx # sched: [6:0.50]
+; BROADWELL-NEXT: blsmskl %edi, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blsmsk_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: blsmskl (%rsi), %ecx # sched: [6:0.50]
+; SKYLAKE-NEXT: blsmskl %edi, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blsmsk_i32:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: blsmskl (%rsi), %ecx # sched: [4:1.00]
+; BTVER2-NEXT: blsmskl %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsmsk_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [6:0.50]
+; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [2:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a1
%2 = sub i32 %1, 1
%3 = sub i32 %a0, 1
@@ -307,32 +424,46 @@ define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_blsmsk_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blsmskq (%rsi), %rcx
-; GENERIC-NEXT: blsmskq %rdi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsmskq (%rsi), %rcx # sched: [5:0.50]
+; GENERIC-NEXT: blsmskq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blsmsk_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: blsmskq (%rsi), %rcx # sched: [6:0.50]
; HASWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blsmsk_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: blsmskq (%rsi), %rcx # sched: [6:0.50]
+; BROADWELL-NEXT: blsmskq %rdi, %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blsmsk_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: blsmskq (%rsi), %rcx # sched: [6:0.50]
+; SKYLAKE-NEXT: blsmskq %rdi, %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blsmsk_i64:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: blsmskq (%rsi), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [1:0.50]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsmsk_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [6:0.50]
+; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [2:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a1
%2 = sub i64 %1, 1
%3 = sub i64 %a0, 1
@@ -344,32 +475,46 @@ define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_blsr_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blsrl (%rsi), %ecx
-; GENERIC-NEXT: blsrl %edi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsrl (%rsi), %ecx # sched: [5:0.50]
+; GENERIC-NEXT: blsrl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blsr_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: blsrl (%rsi), %ecx # sched: [6:0.50]
; HASWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blsr_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: blsrl (%rsi), %ecx # sched: [6:0.50]
+; BROADWELL-NEXT: blsrl %edi, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blsr_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: blsrl (%rsi), %ecx # sched: [6:0.50]
+; SKYLAKE-NEXT: blsrl %edi, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blsr_i32:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: blsrl (%rsi), %ecx # sched: [4:1.00]
+; BTVER2-NEXT: blsrl %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsr_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [6:0.50]
+; ZNVER1-NEXT: blsrl %edi, %eax # sched: [2:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a1
%2 = sub i32 %1, 1
%3 = sub i32 %a0, 1
@@ -381,32 +526,46 @@ define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_blsr_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blsrq (%rsi), %rcx
-; GENERIC-NEXT: blsrq %rdi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsrq (%rsi), %rcx # sched: [5:0.50]
+; GENERIC-NEXT: blsrq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blsr_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: blsrq (%rsi), %rcx # sched: [6:0.50]
; HASWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blsr_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: blsrq (%rsi), %rcx # sched: [6:0.50]
+; BROADWELL-NEXT: blsrq %rdi, %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blsr_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: blsrq (%rsi), %rcx # sched: [6:0.50]
+; SKYLAKE-NEXT: blsrq %rdi, %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blsr_i64:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: blsrq (%rsi), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: blsrq %rdi, %rax # sched: [1:0.50]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsr_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [6:0.50]
+; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [2:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a1
%2 = sub i64 %1, 1
%3 = sub i64 %a0, 1
@@ -418,36 +577,52 @@ define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
; GENERIC-LABEL: test_cttz_i16:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: tzcntw (%rsi), %cx
-; GENERIC-NEXT: tzcntw %di, %ax
-; GENERIC-NEXT: orl %ecx, %eax
-; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00]
+; GENERIC-NEXT: tzcntw %di, %ax # sched: [3:1.00]
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cttz_i16:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: tzcntw (%rsi), %cx # sched: [8:1.00]
; HASWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cttz_i16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: tzcntw (%rsi), %cx # sched: [8:1.00]
+; BROADWELL-NEXT: tzcntw %di, %ax # sched: [3:1.00]
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cttz_i16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: tzcntw (%rsi), %cx # sched: [8:1.00]
+; SKYLAKE-NEXT: tzcntw %di, %ax # sched: [3:1.00]
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cttz_i16:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: tzcntw (%rsi), %cx # sched: [6:1.00]
+; BTVER2-NEXT: tzcntw %di, %ax # sched: [3:1.00]
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cttz_i16:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [6:0.50]
+; ZNVER1-NEXT: tzcntw %di, %ax # sched: [2:0.25]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i16, i16 *%a1
%2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false )
%3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false )
@@ -458,32 +633,46 @@ declare i16 @llvm.cttz.i16(i16, i1)
define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_cttz_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: tzcntl (%rsi), %ecx
-; GENERIC-NEXT: tzcntl %edi, %eax
-; GENERIC-NEXT: orl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00]
+; GENERIC-NEXT: tzcntl %edi, %eax # sched: [3:1.00]
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cttz_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: tzcntl (%rsi), %ecx # sched: [8:1.00]
; HASWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cttz_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: tzcntl (%rsi), %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: tzcntl %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cttz_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: tzcntl (%rsi), %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: tzcntl %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cttz_i32:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: tzcntl (%rsi), %ecx # sched: [6:1.00]
+; BTVER2-NEXT: tzcntl %edi, %eax # sched: [3:1.00]
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cttz_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [6:0.50]
+; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [2:0.25]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a1
%2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false )
%3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false )
@@ -494,32 +683,46 @@ declare i32 @llvm.cttz.i32(i32, i1)
define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_cttz_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: tzcntq (%rsi), %rcx
-; GENERIC-NEXT: tzcntq %rdi, %rax
-; GENERIC-NEXT: orq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00]
+; GENERIC-NEXT: tzcntq %rdi, %rax # sched: [3:1.00]
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cttz_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: tzcntq (%rsi), %rcx # sched: [8:1.00]
; HASWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cttz_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: tzcntq (%rsi), %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: tzcntq %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cttz_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: tzcntq (%rsi), %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: tzcntq %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cttz_i64:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00]
-; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: tzcntq (%rsi), %rcx # sched: [6:1.00]
+; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [3:1.00]
; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cttz_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [6:0.50]
+; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [2:0.25]
; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a1
%2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false )
%3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false )
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 94e2ee7a0aa9..b855b89183b0 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=BMI1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=BMI2
declare i8 @llvm.cttz.i8(i8, i1)
declare i16 @llvm.cttz.i16(i16, i1)
@@ -8,11 +9,11 @@ declare i64 @llvm.cttz.i64(i64, i1)
define i8 @t1(i8 %x) {
; CHECK-LABEL: t1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: orl $256, %eax # imm = 0x100
; CHECK-NEXT: tzcntl %eax, %eax
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 false )
ret i8 %tmp
@@ -20,7 +21,7 @@ define i8 @t1(i8 %x) {
define i16 @t2(i16 %x) {
; CHECK-LABEL: t2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntw %di, %ax
; CHECK-NEXT: retq
%tmp = tail call i16 @llvm.cttz.i16( i16 %x, i1 false )
@@ -29,7 +30,7 @@ define i16 @t2(i16 %x) {
define i32 @t3(i32 %x) {
; CHECK-LABEL: t3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntl %edi, %eax
; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.cttz.i32( i32 %x, i1 false )
@@ -38,7 +39,7 @@ define i32 @t3(i32 %x) {
define i32 @tzcnt32_load(i32* %x) {
; CHECK-LABEL: tzcnt32_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntl (%rdi), %eax
; CHECK-NEXT: retq
%x1 = load i32, i32* %x
@@ -48,7 +49,7 @@ define i32 @tzcnt32_load(i32* %x) {
define i64 @t4(i64 %x) {
; CHECK-LABEL: t4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntq %rdi, %rax
; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 false )
@@ -57,10 +58,10 @@ define i64 @t4(i64 %x) {
define i8 @t5(i8 %x) {
; CHECK-LABEL: t5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: tzcntl %eax, %eax
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%tmp = tail call i8 @llvm.cttz.i8( i8 %x, i1 true )
ret i8 %tmp
@@ -68,7 +69,7 @@ define i8 @t5(i8 %x) {
define i16 @t6(i16 %x) {
; CHECK-LABEL: t6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntw %di, %ax
; CHECK-NEXT: retq
%tmp = tail call i16 @llvm.cttz.i16( i16 %x, i1 true )
@@ -77,7 +78,7 @@ define i16 @t6(i16 %x) {
define i32 @t7(i32 %x) {
; CHECK-LABEL: t7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntl %edi, %eax
; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.cttz.i32( i32 %x, i1 true )
@@ -86,7 +87,7 @@ define i32 @t7(i32 %x) {
define i64 @t8(i64 %x) {
; CHECK-LABEL: t8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntq %rdi, %rax
; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 true )
@@ -95,7 +96,7 @@ define i64 @t8(i64 %x) {
define i32 @andn32(i32 %x, i32 %y) {
; CHECK-LABEL: andn32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %eax
; CHECK-NEXT: retq
%tmp1 = xor i32 %x, -1
@@ -105,7 +106,7 @@ define i32 @andn32(i32 %x, i32 %y) {
define i32 @andn32_load(i32 %x, i32* %y) {
; CHECK-LABEL: andn32_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl (%rsi), %edi, %eax
; CHECK-NEXT: retq
%y1 = load i32, i32* %y
@@ -116,7 +117,7 @@ define i32 @andn32_load(i32 %x, i32* %y) {
define i64 @andn64(i64 %x, i64 %y) {
; CHECK-LABEL: andn64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnq %rsi, %rdi, %rax
; CHECK-NEXT: retq
%tmp1 = xor i64 %x, -1
@@ -127,7 +128,7 @@ define i64 @andn64(i64 %x, i64 %y) {
; Don't choose a 'test' if an 'andn' can be used.
define i1 @andn_cmp(i32 %x, i32 %y) {
; CHECK-LABEL: andn_cmp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %eax
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
@@ -140,7 +141,7 @@ define i1 @andn_cmp(i32 %x, i32 %y) {
; Recognize a disguised andn in the following 4 tests.
define i1 @and_cmp1(i32 %x, i32 %y) {
; CHECK-LABEL: and_cmp1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %eax
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
@@ -151,7 +152,7 @@ define i1 @and_cmp1(i32 %x, i32 %y) {
define i1 @and_cmp2(i32 %x, i32 %y) {
; CHECK-LABEL: and_cmp2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %eax
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
@@ -162,7 +163,7 @@ define i1 @and_cmp2(i32 %x, i32 %y) {
define i1 @and_cmp3(i32 %x, i32 %y) {
; CHECK-LABEL: and_cmp3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %eax
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
@@ -173,7 +174,7 @@ define i1 @and_cmp3(i32 %x, i32 %y) {
define i1 @and_cmp4(i32 %x, i32 %y) {
; CHECK-LABEL: and_cmp4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %eax
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
@@ -186,7 +187,7 @@ define i1 @and_cmp4(i32 %x, i32 %y) {
; even though the BMI instruction doesn't have an immediate form.
define i1 @and_cmp_const(i32 %x) {
; CHECK-LABEL: and_cmp_const:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $43, %eax
; CHECK-NEXT: andnl %eax, %edi, %eax
; CHECK-NEXT: sete %al
@@ -199,7 +200,7 @@ define i1 @and_cmp_const(i32 %x) {
; But don't use 'andn' if the mask is a power-of-two.
define i1 @and_cmp_const_power_of_two(i32 %x, i32 %y) {
; CHECK-LABEL: and_cmp_const_power_of_two:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: setae %al
; CHECK-NEXT: retq
@@ -212,7 +213,7 @@ define i1 @and_cmp_const_power_of_two(i32 %x, i32 %y) {
; Don't transform to 'andn' if there's another use of the 'and'.
define i32 @and_cmp_not_one_use(i32 %x) {
; CHECK-LABEL: and_cmp_not_one_use:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $37, %edi
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $37, %edi
@@ -229,7 +230,7 @@ define i32 @and_cmp_not_one_use(i32 %x) {
; Verify that we're not transforming invalid comparison predicates.
define i1 @not_an_andn1(i32 %x, i32 %y) {
; CHECK-LABEL: not_an_andn1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl %esi, %edi
; CHECK-NEXT: cmpl %edi, %esi
; CHECK-NEXT: setg %al
@@ -241,7 +242,7 @@ define i1 @not_an_andn1(i32 %x, i32 %y) {
define i1 @not_an_andn2(i32 %x, i32 %y) {
; CHECK-LABEL: not_an_andn2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl %esi, %edi
; CHECK-NEXT: cmpl %edi, %esi
; CHECK-NEXT: setbe %al
@@ -254,7 +255,7 @@ define i1 @not_an_andn2(i32 %x, i32 %y) {
; Don't choose a 'test' if an 'andn' can be used.
define i1 @andn_cmp_swap_ops(i64 %x, i64 %y) {
; CHECK-LABEL: andn_cmp_swap_ops:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnq %rsi, %rdi, %rax
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
@@ -267,7 +268,7 @@ define i1 @andn_cmp_swap_ops(i64 %x, i64 %y) {
; Use a 'test' (not an 'andn') because 'andn' only works for i32/i64.
define i1 @andn_cmp_i8(i8 %x, i8 %y) {
; CHECK-LABEL: andn_cmp_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: notb %sil
; CHECK-NEXT: testb %sil, %dil
; CHECK-NEXT: sete %al
@@ -280,7 +281,7 @@ define i1 @andn_cmp_i8(i8 %x, i8 %y) {
define i32 @bextr32(i32 %x, i32 %y) {
; CHECK-LABEL: bextr32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextrl %esi, %edi, %eax
; CHECK-NEXT: retq
%tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %y)
@@ -289,7 +290,7 @@ define i32 @bextr32(i32 %x, i32 %y) {
define i32 @bextr32_load(i32* %x, i32 %y) {
; CHECK-LABEL: bextr32_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextrl %esi, (%rdi), %eax
; CHECK-NEXT: retq
%x1 = load i32, i32* %x
@@ -301,7 +302,7 @@ declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
define i32 @bextr32b(i32 %x) uwtable ssp {
; CHECK-LABEL: bextr32b:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
; CHECK-NEXT: bextrl %eax, %edi, %eax
; CHECK-NEXT: retq
@@ -310,9 +311,21 @@ define i32 @bextr32b(i32 %x) uwtable ssp {
ret i32 %2
}
+; Make sure we still use the AH subreg trick to extract 15:8
+define i32 @bextr32_subreg(i32 %x) uwtable ssp {
+; CHECK-LABEL: bextr32_subreg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: retq
+ %1 = lshr i32 %x, 8
+ %2 = and i32 %1, 255
+ ret i32 %2
+}
+
define i32 @bextr32b_load(i32* %x) uwtable ssp {
; CHECK-LABEL: bextr32b_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
; CHECK-NEXT: bextrl %eax, (%rdi), %eax
; CHECK-NEXT: retq
@@ -322,9 +335,21 @@ define i32 @bextr32b_load(i32* %x) uwtable ssp {
ret i32 %3
}
+; PR34042
+define i32 @bextr32c(i32 %x, i16 zeroext %y) {
+; CHECK-LABEL: bextr32c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movswl %si, %eax
+; CHECK-NEXT: bextrl %eax, %edi, %eax
+; CHECK-NEXT: retq
+ %tmp0 = sext i16 %y to i32
+ %tmp1 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %tmp0)
+ ret i32 %tmp1
+}
+
define i64 @bextr64(i64 %x, i64 %y) {
; CHECK-LABEL: bextr64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextrq %rsi, %rdi, %rax
; CHECK-NEXT: retq
%tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
@@ -335,7 +360,7 @@ declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
define i64 @bextr64b(i64 %x) uwtable ssp {
; CHECK-LABEL: bextr64b:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
; CHECK-NEXT: bextrl %eax, %edi, %eax
; CHECK-NEXT: retq
@@ -344,9 +369,21 @@ define i64 @bextr64b(i64 %x) uwtable ssp {
ret i64 %2
}
+; Make sure we still use the AH subreg trick to extract 15:8
+define i64 @bextr64_subreg(i64 %x) uwtable ssp {
+; CHECK-LABEL: bextr64_subreg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: retq
+ %1 = lshr i64 %x, 8
+ %2 = and i64 %1, 255
+ ret i64 %2
+}
+
define i64 @bextr64b_load(i64* %x) {
; CHECK-LABEL: bextr64b_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $3076, %eax # imm = 0xC04
; CHECK-NEXT: bextrl %eax, (%rdi), %eax
; CHECK-NEXT: retq
@@ -356,9 +393,33 @@ define i64 @bextr64b_load(i64* %x) {
ret i64 %3
}
+; PR34042
+define i64 @bextr64c(i64 %x, i32 %y) {
+; CHECK-LABEL: bextr64c:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movslq %esi, %rax
+; CHECK-NEXT: bextrq %rax, %rdi, %rax
+; CHECK-NEXT: retq
+ %tmp0 = sext i32 %y to i64
+ %tmp1 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %tmp0)
+ ret i64 %tmp1
+}
+
+define i64 @bextr64d(i64 %a) {
+; CHECK-LABEL: bextr64d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $8450, %eax # imm = 0x2102
+; CHECK-NEXT: bextrq %rax, %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %shr = lshr i64 %a, 2
+ %and = and i64 %shr, 8589934591
+ ret i64 %and
+}
+
define i32 @non_bextr32(i32 %x) {
; CHECK-LABEL: non_bextr32:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: shrl $2, %edi
; CHECK-NEXT: andl $111, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -371,7 +432,7 @@ entry:
define i64 @non_bextr64(i64 %x) {
; CHECK-LABEL: non_bextr64:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: shrq $2, %rdi
; CHECK-NEXT: movabsq $8589934590, %rax # imm = 0x1FFFFFFFE
; CHECK-NEXT: andq %rdi, %rax
@@ -382,43 +443,20 @@ entry:
ret i64 %and
}
-define i32 @bzhi32(i32 %x, i32 %y) {
-; CHECK-LABEL: bzhi32:
-; CHECK: # BB#0:
-; CHECK-NEXT: bzhil %esi, %edi, %eax
-; CHECK-NEXT: retq
- %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
- ret i32 %tmp
-}
-
-define i32 @bzhi32_load(i32* %x, i32 %y) {
-; CHECK-LABEL: bzhi32_load:
-; CHECK: # BB#0:
-; CHECK-NEXT: bzhil %esi, (%rdi), %eax
-; CHECK-NEXT: retq
- %x1 = load i32, i32* %x
- %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
- ret i32 %tmp
-}
-
-declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
-
-define i64 @bzhi64(i64 %x, i64 %y) {
-; CHECK-LABEL: bzhi64:
-; CHECK: # BB#0:
-; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
- %tmp = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %x, i64 %y)
- ret i64 %tmp
-}
-
-declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
-
define i32 @bzhi32b(i32 %x, i8 zeroext %index) {
-; CHECK-LABEL: bzhi32b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhil %esi, %edi, %eax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi32b:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $1, %eax
+; BMI1-NEXT: movl %esi, %ecx
+; BMI1-NEXT: shll %cl, %eax
+; BMI1-NEXT: decl %eax
+; BMI1-NEXT: andl %edi, %eax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi32b:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhil %esi, %edi, %eax
+; BMI2-NEXT: retq
entry:
%conv = zext i8 %index to i32
%shl = shl i32 1, %conv
@@ -428,10 +466,19 @@ entry:
}
define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) {
-; CHECK-LABEL: bzhi32b_load:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhil %esi, (%rdi), %eax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi32b_load:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $1, %eax
+; BMI1-NEXT: movl %esi, %ecx
+; BMI1-NEXT: shll %cl, %eax
+; BMI1-NEXT: decl %eax
+; BMI1-NEXT: andl (%rdi), %eax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi32b_load:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhil %esi, (%rdi), %eax
+; BMI2-NEXT: retq
entry:
%x = load i32, i32* %w
%conv = zext i8 %index to i32
@@ -442,10 +489,19 @@ entry:
}
define i32 @bzhi32c(i32 %x, i8 zeroext %index) {
-; CHECK-LABEL: bzhi32c:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhil %esi, %edi, %eax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi32c:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $1, %eax
+; BMI1-NEXT: movl %esi, %ecx
+; BMI1-NEXT: shll %cl, %eax
+; BMI1-NEXT: decl %eax
+; BMI1-NEXT: andl %edi, %eax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi32c:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhil %esi, %edi, %eax
+; BMI2-NEXT: retq
entry:
%conv = zext i8 %index to i32
%shl = shl i32 1, %conv
@@ -455,10 +511,20 @@ entry:
}
define i32 @bzhi32d(i32 %a, i32 %b) {
-; CHECK-LABEL: bzhi32d:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhil %esi, %edi, %eax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi32d:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $32, %ecx
+; BMI1-NEXT: subl %esi, %ecx
+; BMI1-NEXT: movl $-1, %eax
+; BMI1-NEXT: # kill: def %cl killed %cl killed %ecx
+; BMI1-NEXT: shrl %cl, %eax
+; BMI1-NEXT: andl %edi, %eax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi32d:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhil %esi, %edi, %eax
+; BMI2-NEXT: retq
entry:
%sub = sub i32 32, %b
%shr = lshr i32 -1, %sub
@@ -467,10 +533,20 @@ entry:
}
define i32 @bzhi32e(i32 %a, i32 %b) {
-; CHECK-LABEL: bzhi32e:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhil %esi, %edi, %eax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi32e:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $32, %ecx
+; BMI1-NEXT: subl %esi, %ecx
+; BMI1-NEXT: shll %cl, %edi
+; BMI1-NEXT: # kill: def %cl killed %cl killed %ecx
+; BMI1-NEXT: shrl %cl, %edi
+; BMI1-NEXT: movl %edi, %eax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi32e:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhil %esi, %edi, %eax
+; BMI2-NEXT: retq
entry:
%sub = sub i32 32, %b
%shl = shl i32 %a, %sub
@@ -479,11 +555,20 @@ entry:
}
define i64 @bzhi64b(i64 %x, i8 zeroext %index) {
-; CHECK-LABEL: bzhi64b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi64b:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $1, %eax
+; BMI1-NEXT: movl %esi, %ecx
+; BMI1-NEXT: shlq %cl, %rax
+; BMI1-NEXT: decq %rax
+; BMI1-NEXT: andq %rdi, %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64b:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: # kill: def %esi killed %esi def %rsi
+; BMI2-NEXT: bzhiq %rsi, %rdi, %rax
+; BMI2-NEXT: retq
entry:
%conv = zext i8 %index to i64
%shl = shl i64 1, %conv
@@ -493,10 +578,20 @@ entry:
}
define i64 @bzhi64c(i64 %a, i64 %b) {
-; CHECK-LABEL: bzhi64c:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi64c:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $64, %ecx
+; BMI1-NEXT: subl %esi, %ecx
+; BMI1-NEXT: movq $-1, %rax
+; BMI1-NEXT: # kill: def %cl killed %cl killed %ecx
+; BMI1-NEXT: shrq %cl, %rax
+; BMI1-NEXT: andq %rdi, %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64c:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhiq %rsi, %rdi, %rax
+; BMI2-NEXT: retq
entry:
%sub = sub i64 64, %b
%shr = lshr i64 -1, %sub
@@ -505,11 +600,21 @@ entry:
}
define i64 @bzhi64d(i64 %a, i32 %b) {
-; CHECK-LABEL: bzhi64d:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi64d:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $64, %ecx
+; BMI1-NEXT: subl %esi, %ecx
+; BMI1-NEXT: movq $-1, %rax
+; BMI1-NEXT: # kill: def %cl killed %cl killed %ecx
+; BMI1-NEXT: shrq %cl, %rax
+; BMI1-NEXT: andq %rdi, %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64d:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: # kill: def %esi killed %esi def %rsi
+; BMI2-NEXT: bzhiq %rsi, %rdi, %rax
+; BMI2-NEXT: retq
entry:
%sub = sub i32 64, %b
%sh_prom = zext i32 %sub to i64
@@ -519,10 +624,20 @@ entry:
}
define i64 @bzhi64e(i64 %a, i64 %b) {
-; CHECK-LABEL: bzhi64e:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi64e:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $64, %ecx
+; BMI1-NEXT: subl %esi, %ecx
+; BMI1-NEXT: shlq %cl, %rdi
+; BMI1-NEXT: # kill: def %cl killed %cl killed %ecx
+; BMI1-NEXT: shrq %cl, %rdi
+; BMI1-NEXT: movq %rdi, %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64e:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: bzhiq %rsi, %rdi, %rax
+; BMI2-NEXT: retq
entry:
%sub = sub i64 64, %b
%shl = shl i64 %a, %sub
@@ -531,11 +646,21 @@ entry:
}
define i64 @bzhi64f(i64 %a, i32 %b) {
-; CHECK-LABEL: bzhi64f:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi64f:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $64, %ecx
+; BMI1-NEXT: subl %esi, %ecx
+; BMI1-NEXT: shlq %cl, %rdi
+; BMI1-NEXT: # kill: def %cl killed %cl killed %ecx
+; BMI1-NEXT: shrq %cl, %rdi
+; BMI1-NEXT: movq %rdi, %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64f:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: # kill: def %esi killed %esi def %rsi
+; BMI2-NEXT: bzhiq %rsi, %rdi, %rax
+; BMI2-NEXT: retq
entry:
%sub = sub i32 64, %b
%sh_prom = zext i32 %sub to i64
@@ -545,19 +670,43 @@ entry:
}
define i64 @bzhi64_constant_mask(i64 %x) {
-; CHECK-LABEL: bzhi64_constant_mask:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movb $62, %al
-; CHECK-NEXT: bzhiq %rax, %rdi, %rax
-; CHECK-NEXT: retq
+; BMI1-LABEL: bzhi64_constant_mask:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $15872, %eax # imm = 0x3E00
+; BMI1-NEXT: bextrq %rax, %rdi, %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64_constant_mask:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: movb $62, %al
+; BMI2-NEXT: bzhiq %rax, %rdi, %rax
+; BMI2-NEXT: retq
entry:
%and = and i64 %x, 4611686018427387903
ret i64 %and
}
+define i64 @bzhi64_constant_mask_load(i64* %x) {
+; BMI1-LABEL: bzhi64_constant_mask_load:
+; BMI1: # %bb.0: # %entry
+; BMI1-NEXT: movl $15872, %eax # imm = 0x3E00
+; BMI1-NEXT: bextrq %rax, (%rdi), %rax
+; BMI1-NEXT: retq
+;
+; BMI2-LABEL: bzhi64_constant_mask_load:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: movb $62, %al
+; BMI2-NEXT: bzhiq %rax, (%rdi), %rax
+; BMI2-NEXT: retq
+entry:
+ %x1 = load i64, i64* %x
+ %and = and i64 %x1, 4611686018427387903
+ ret i64 %and
+}
+
define i64 @bzhi64_small_constant_mask(i64 %x) {
; CHECK-LABEL: bzhi64_small_constant_mask:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
@@ -568,7 +717,7 @@ entry:
define i32 @blsi32(i32 %x) {
; CHECK-LABEL: blsi32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsil %edi, %eax
; CHECK-NEXT: retq
%tmp = sub i32 0, %x
@@ -578,7 +727,7 @@ define i32 @blsi32(i32 %x) {
define i32 @blsi32_load(i32* %x) {
; CHECK-LABEL: blsi32_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsil (%rdi), %eax
; CHECK-NEXT: retq
%x1 = load i32, i32* %x
@@ -589,7 +738,7 @@ define i32 @blsi32_load(i32* %x) {
define i64 @blsi64(i64 %x) {
; CHECK-LABEL: blsi64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsiq %rdi, %rax
; CHECK-NEXT: retq
%tmp = sub i64 0, %x
@@ -599,7 +748,7 @@ define i64 @blsi64(i64 %x) {
define i32 @blsmsk32(i32 %x) {
; CHECK-LABEL: blsmsk32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsmskl %edi, %eax
; CHECK-NEXT: retq
%tmp = sub i32 %x, 1
@@ -609,7 +758,7 @@ define i32 @blsmsk32(i32 %x) {
define i32 @blsmsk32_load(i32* %x) {
; CHECK-LABEL: blsmsk32_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsmskl (%rdi), %eax
; CHECK-NEXT: retq
%x1 = load i32, i32* %x
@@ -620,7 +769,7 @@ define i32 @blsmsk32_load(i32* %x) {
define i64 @blsmsk64(i64 %x) {
; CHECK-LABEL: blsmsk64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsmskq %rdi, %rax
; CHECK-NEXT: retq
%tmp = sub i64 %x, 1
@@ -630,7 +779,7 @@ define i64 @blsmsk64(i64 %x) {
define i32 @blsr32(i32 %x) {
; CHECK-LABEL: blsr32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsrl %edi, %eax
; CHECK-NEXT: retq
%tmp = sub i32 %x, 1
@@ -640,7 +789,7 @@ define i32 @blsr32(i32 %x) {
define i32 @blsr32_load(i32* %x) {
; CHECK-LABEL: blsr32_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsrl (%rdi), %eax
; CHECK-NEXT: retq
%x1 = load i32, i32* %x
@@ -651,7 +800,7 @@ define i32 @blsr32_load(i32* %x) {
define i64 @blsr64(i64 %x) {
; CHECK-LABEL: blsr64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsrq %rdi, %rax
; CHECK-NEXT: retq
%tmp = sub i64 %x, 1
@@ -659,67 +808,3 @@ define i64 @blsr64(i64 %x) {
ret i64 %tmp2
}
-define i32 @pdep32(i32 %x, i32 %y) {
-; CHECK-LABEL: pdep32:
-; CHECK: # BB#0:
-; CHECK-NEXT: pdepl %esi, %edi, %eax
-; CHECK-NEXT: retq
- %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y)
- ret i32 %tmp
-}
-
-define i32 @pdep32_load(i32 %x, i32* %y) {
-; CHECK-LABEL: pdep32_load:
-; CHECK: # BB#0:
-; CHECK-NEXT: pdepl (%rsi), %edi, %eax
-; CHECK-NEXT: retq
- %y1 = load i32, i32* %y
- %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
- ret i32 %tmp
-}
-
-declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
-
-define i64 @pdep64(i64 %x, i64 %y) {
-; CHECK-LABEL: pdep64:
-; CHECK: # BB#0:
-; CHECK-NEXT: pdepq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
- %tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 %y)
- ret i64 %tmp
-}
-
-declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
-
-define i32 @pext32(i32 %x, i32 %y) {
-; CHECK-LABEL: pext32:
-; CHECK: # BB#0:
-; CHECK-NEXT: pextl %esi, %edi, %eax
-; CHECK-NEXT: retq
- %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y)
- ret i32 %tmp
-}
-
-define i32 @pext32_load(i32 %x, i32* %y) {
-; CHECK-LABEL: pext32_load:
-; CHECK: # BB#0:
-; CHECK-NEXT: pextl (%rsi), %edi, %eax
-; CHECK-NEXT: retq
- %y1 = load i32, i32* %y
- %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
- ret i32 %tmp
-}
-
-declare i32 @llvm.x86.bmi.pext.32(i32, i32)
-
-define i64 @pext64(i64 %x, i64 %y) {
-; CHECK-LABEL: pext64:
-; CHECK: # BB#0:
-; CHECK-NEXT: pextq %rsi, %rdi, %rax
-; CHECK-NEXT: retq
- %tmp = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 %y)
- ret i64 %tmp
-}
-
-declare i64 @llvm.x86.bmi.pext.64(i64, i64)
-
diff --git a/test/CodeGen/X86/bmi2-schedule.ll b/test/CodeGen/X86/bmi2-schedule.ll
index 9666dd85d853..7effa1e1fb20 100644
--- a/test/CodeGen/X86/bmi2-schedule.ll
+++ b/test/CodeGen/X86/bmi2-schedule.ll
@@ -1,31 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_bzhi_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx
-; GENERIC-NEXT: bzhil %edi, %esi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:0.50]
+; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_bzhi_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
; HASWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bzhi_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
+; BROADWELL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bzhi_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
+; SKYLAKE-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_bzhi_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
+; KNL-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.50]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_bzhi_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:0.50]
+; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0)
%3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0)
@@ -36,25 +58,46 @@ declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_bzhi_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx
-; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:0.50]
+; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_bzhi_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
; HASWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bzhi_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; BROADWELL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bzhi_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; SKYLAKE-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_bzhi_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
+; KNL-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_bzhi_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:0.50]
+; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0)
%3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0)
@@ -63,27 +106,168 @@ define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) {
}
declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
+define void @test_mulx_i32(i32 %a0, i32 %a1, i32* %a2) optsize {
+; GENERIC-LABEL: test_mulx_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: mulxl %esi, %esi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: mulxl (%rdx), %esi, %edi # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_mulx_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: mulxl %esi, %esi, %edi # sched: [5:1.00]
+; HASWELL-NEXT: mulxl (%rdx), %esi, %edi # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulx_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: mulxl %esi, %esi, %edi # sched: [5:1.00]
+; BROADWELL-NEXT: mulxl (%rdx), %esi, %edi # sched: [10:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulx_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: mulxl %esi, %esi, %edi # sched: [5:1.00]
+; SKYLAKE-NEXT: mulxl (%rdx), %esi, %edi # sched: [10:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_mulx_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: mulxl %esi, %esi, %edi # sched: [5:1.00]
+; KNL-NEXT: mulxl (%rdx), %esi, %edi # sched: [10:1.00]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_mulx_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: mulxl %esi, %esi, %edi # sched: [3:2.00]
+; ZNVER1-NEXT: mulxl (%rdx), %esi, %edi # sched: [8:2.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "mulx $1, $1, $0 \0A\09 mulx $2, $1, $0 ", "r,r,*m"(i32 %a0, i32 %a1, i32* %a2) nounwind
+ ret void
+}
+
+define i64 @test_mulx_i64(i64 %a0, i64 %a1, i64 *%a2) {
+; GENERIC-LABEL: test_mulx_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq %rdx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: movq %rdi, %rdx # sched: [1:0.33]
+; GENERIC-NEXT: mulxq %rsi, %rsi, %rcx # sched: [3:1.00]
+; GENERIC-NEXT: mulxq (%rax), %rdx, %rax # sched: [7:1.00]
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_mulx_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq %rdx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: movq %rdi, %rdx # sched: [1:0.25]
+; HASWELL-NEXT: mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
+; HASWELL-NEXT: mulxq (%rax), %rdx, %rax # sched: [9:1.00]
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulx_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq %rdx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: movq %rdi, %rdx # sched: [1:0.25]
+; BROADWELL-NEXT: mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
+; BROADWELL-NEXT: mulxq (%rax), %rdx, %rax # sched: [9:1.00]
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulx_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq %rdx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: movq %rdi, %rdx # sched: [1:0.25]
+; SKYLAKE-NEXT: mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
+; SKYLAKE-NEXT: mulxq (%rax), %rdx, %rax # sched: [9:1.00]
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_mulx_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: movq %rdx, %rax # sched: [1:0.25]
+; KNL-NEXT: movq %rdi, %rdx # sched: [1:0.25]
+; KNL-NEXT: mulxq %rsi, %rsi, %rcx # sched: [4:1.00]
+; KNL-NEXT: mulxq (%rax), %rdx, %rax # sched: [9:1.00]
+; KNL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_mulx_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq %rdx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: movq %rdi, %rdx # sched: [1:0.25]
+; ZNVER1-NEXT: mulxq %rsi, %rsi, %rcx # sched: [3:1.00]
+; ZNVER1-NEXT: mulxq (%rax), %rdx, %rax # sched: [8:1.00]
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a2
+ %2 = zext i64 %a0 to i128
+ %3 = zext i64 %a1 to i128
+ %4 = zext i64 %1 to i128
+ %5 = mul i128 %2, %3
+ %6 = mul i128 %2, %4
+ %7 = lshr i128 %5, 64
+ %8 = lshr i128 %6, 64
+ %9 = trunc i128 %7 to i64
+ %10 = trunc i128 %8 to i64
+ %11 = or i64 %9, %10
+ ret i64 %11
+}
+
define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_pdep_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pdepl (%rdx), %edi, %ecx
-; GENERIC-NEXT: pdepl %esi, %edi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pdepl (%rdx), %edi, %ecx # sched: [5:0.50]
+; GENERIC-NEXT: pdepl %esi, %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pdep_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
; HASWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pdep_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pdep_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_pdep_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
+; KNL-NEXT: pdepl %esi, %edi, %eax # sched: [3:1.00]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pdep_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [100:?]
+; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [100:?]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1)
%3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
@@ -94,25 +278,46 @@ declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_pdep_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pdepq (%rdx), %rdi, %rcx
-; GENERIC-NEXT: pdepq %rsi, %rdi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [5:0.50]
+; GENERIC-NEXT: pdepq %rsi, %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pdep_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
; HASWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pdep_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pdep_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_pdep_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
+; KNL-NEXT: pdepq %rsi, %rdi, %rax # sched: [3:1.00]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pdep_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [100:?]
+; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [100:?]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1)
%3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
@@ -123,25 +328,46 @@ declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_pext_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextl (%rdx), %edi, %ecx
-; GENERIC-NEXT: pextl %esi, %edi, %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextl (%rdx), %edi, %ecx # sched: [5:0.50]
+; GENERIC-NEXT: pextl %esi, %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pext_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [8:1.00]
; HASWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pext_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pextl (%rdx), %edi, %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pext_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pextl (%rdx), %edi, %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_pext_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: pextl (%rdx), %edi, %ecx # sched: [8:1.00]
+; KNL-NEXT: pextl %esi, %edi, %eax # sched: [3:1.00]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pext_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [100:?]
+; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [100:?]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1)
%3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
@@ -152,25 +378,46 @@ declare i32 @llvm.x86.bmi.pext.32(i32, i32)
define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_pext_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextq (%rdx), %rdi, %rcx
-; GENERIC-NEXT: pextq %rsi, %rdi, %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextq (%rdx), %rdi, %rcx # sched: [5:0.50]
+; GENERIC-NEXT: pextq %rsi, %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pext_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [7:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
; HASWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pext_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pext_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_pext_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
+; KNL-NEXT: pextq %rsi, %rdi, %rax # sched: [3:1.00]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; ZNVER1-LABEL: test_pext_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [100:?]
+; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [100:?]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1)
%3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
@@ -178,3 +425,403 @@ define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) {
ret i64 %4
}
declare i64 @llvm.x86.bmi.pext.64(i64, i64)
+
+define i32 @test_rorx_i32(i32 %a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_rorx_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.50]
+; GENERIC-NEXT: rorxl $5, (%rdx), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rorx_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.50]
+; HASWELL-NEXT: rorxl $5, (%rdx), %eax # sched: [6:0.50]
+; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rorx_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.50]
+; BROADWELL-NEXT: rorxl $5, (%rdx), %eax # sched: [6:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rorx_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.50]
+; SKYLAKE-NEXT: rorxl $5, (%rdx), %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_rorx_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.50]
+; KNL-NEXT: rorxl $5, (%rdx), %eax # sched: [6:0.50]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rorx_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rorxl $5, (%rdx), %eax # sched: [5:0.50]
+; ZNVER1-NEXT: rorxl $5, %edi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i32, i32 *%a2
+ %2 = lshr i32 %a0, 5
+ %3 = shl i32 %a0, 27
+ %4 = or i32 %2, %3
+ %5 = lshr i32 %1, 5
+ %6 = shl i32 %1, 27
+ %7 = or i32 %5, %6
+ %8 = add i32 %4, %7
+ ret i32 %8
+}
+
+define i64 @test_rorx_i64(i64 %a0, i64 %a1, i64 *%a2) {
+; GENERIC-LABEL: test_rorx_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.50]
+; GENERIC-NEXT: rorxq $5, (%rdx), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rorx_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.50]
+; HASWELL-NEXT: rorxq $5, (%rdx), %rax # sched: [6:0.50]
+; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rorx_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.50]
+; BROADWELL-NEXT: rorxq $5, (%rdx), %rax # sched: [6:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rorx_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.50]
+; SKYLAKE-NEXT: rorxq $5, (%rdx), %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_rorx_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.50]
+; KNL-NEXT: rorxq $5, (%rdx), %rax # sched: [6:0.50]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rorx_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rorxq $5, (%rdx), %rax # sched: [5:0.50]
+; ZNVER1-NEXT: rorxq $5, %rdi, %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a2
+ %2 = lshr i64 %a0, 5
+ %3 = shl i64 %a0, 59
+ %4 = or i64 %2, %3
+ %5 = lshr i64 %1, 5
+ %6 = shl i64 %1, 59
+ %7 = or i64 %5, %6
+ %8 = add i64 %4, %7
+ ret i64 %8
+}
+
+define i32 @test_sarx_i32(i32 %a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_sarx_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.50]
+; GENERIC-NEXT: sarxl %esi, (%rdx), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sarx_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.50]
+; HASWELL-NEXT: sarxl %esi, (%rdx), %eax # sched: [6:0.50]
+; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sarx_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.50]
+; BROADWELL-NEXT: sarxl %esi, (%rdx), %eax # sched: [6:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sarx_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.50]
+; SKYLAKE-NEXT: sarxl %esi, (%rdx), %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_sarx_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.50]
+; KNL-NEXT: sarxl %esi, (%rdx), %eax # sched: [6:0.50]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sarx_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sarxl %esi, (%rdx), %eax # sched: [5:0.50]
+; ZNVER1-NEXT: sarxl %esi, %edi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i32, i32 *%a2
+ %2 = ashr i32 %a0, %a1
+ %3 = ashr i32 %1, %a1
+ %4 = add i32 %2, %3
+ ret i32 %4
+}
+
+define i64 @test_sarx_i64(i64 %a0, i64 %a1, i64 *%a2) {
+; GENERIC-LABEL: test_sarx_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; GENERIC-NEXT: sarxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sarx_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; HASWELL-NEXT: sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sarx_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; BROADWELL-NEXT: sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sarx_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; SKYLAKE-NEXT: sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_sarx_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; KNL-NEXT: sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sarx_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sarxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; ZNVER1-NEXT: sarxq %rsi, %rdi, %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a2
+ %2 = ashr i64 %a0, %a1
+ %3 = ashr i64 %1, %a1
+ %4 = add i64 %2, %3
+ ret i64 %4
+}
+
+define i32 @test_shlx_i32(i32 %a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_shlx_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.50]
+; GENERIC-NEXT: shlxl %esi, (%rdx), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shlx_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.50]
+; HASWELL-NEXT: shlxl %esi, (%rdx), %eax # sched: [6:0.50]
+; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shlx_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.50]
+; BROADWELL-NEXT: shlxl %esi, (%rdx), %eax # sched: [6:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shlx_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.50]
+; SKYLAKE-NEXT: shlxl %esi, (%rdx), %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_shlx_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.50]
+; KNL-NEXT: shlxl %esi, (%rdx), %eax # sched: [6:0.50]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_shlx_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: shlxl %esi, (%rdx), %eax # sched: [5:0.50]
+; ZNVER1-NEXT: shlxl %esi, %edi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i32, i32 *%a2
+ %2 = shl i32 %a0, %a1
+ %3 = shl i32 %1, %a1
+ %4 = add i32 %2, %3
+ ret i32 %4
+}
+
+define i64 @test_shlx_i64(i64 %a0, i64 %a1, i64 *%a2) {
+; GENERIC-LABEL: test_shlx_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; GENERIC-NEXT: shlxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shlx_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; HASWELL-NEXT: shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shlx_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; BROADWELL-NEXT: shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shlx_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; SKYLAKE-NEXT: shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_shlx_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; KNL-NEXT: shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_shlx_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: shlxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; ZNVER1-NEXT: shlxq %rsi, %rdi, %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a2
+ %2 = shl i64 %a0, %a1
+ %3 = shl i64 %1, %a1
+ %4 = add i64 %2, %3
+ ret i64 %4
+}
+
+define i32 @test_shrx_i32(i32 %a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_shrx_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.50]
+; GENERIC-NEXT: shrxl %esi, (%rdx), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shrx_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.50]
+; HASWELL-NEXT: shrxl %esi, (%rdx), %eax # sched: [6:0.50]
+; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shrx_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.50]
+; BROADWELL-NEXT: shrxl %esi, (%rdx), %eax # sched: [6:0.50]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shrx_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.50]
+; SKYLAKE-NEXT: shrxl %esi, (%rdx), %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_shrx_i32:
+; KNL: # %bb.0:
+; KNL-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.50]
+; KNL-NEXT: shrxl %esi, (%rdx), %eax # sched: [6:0.50]
+; KNL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_shrx_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: shrxl %esi, (%rdx), %eax # sched: [5:0.50]
+; ZNVER1-NEXT: shrxl %esi, %edi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i32, i32 *%a2
+ %2 = lshr i32 %a0, %a1
+ %3 = lshr i32 %1, %a1
+ %4 = add i32 %2, %3
+ ret i32 %4
+}
+
+define i64 @test_shrx_i64(i64 %a0, i64 %a1, i64 *%a2) {
+; GENERIC-LABEL: test_shrx_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; GENERIC-NEXT: shrxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shrx_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; HASWELL-NEXT: shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shrx_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; BROADWELL-NEXT: shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shrx_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; SKYLAKE-NEXT: shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_shrx_i64:
+; KNL: # %bb.0:
+; KNL-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
+; KNL-NEXT: shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
+; KNL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_shrx_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: shrxq %rsi, (%rdx), %rax # sched: [5:0.50]
+; ZNVER1-NEXT: shrxq %rsi, %rdi, %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a2
+ %2 = lshr i64 %a0, %a1
+ %3 = lshr i64 %1, %a1
+ %4 = add i64 %2, %3
+ ret i64 %4
+}
diff --git a/test/CodeGen/X86/bmi2.ll b/test/CodeGen/X86/bmi2.ll
new file mode 100644
index 000000000000..226bf6531fd7
--- /dev/null
+++ b/test/CodeGen/X86/bmi2.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s
+
+define i32 @bzhi32(i32 %x, i32 %y) {
+; CHECK-LABEL: bzhi32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
+ %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
+ ret i32 %tmp
+}
+
+define i32 @bzhi32_load(i32* %x, i32 %y) {
+; CHECK-LABEL: bzhi32_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bzhil %esi, (%rdi), %eax
+; CHECK-NEXT: retq
+ %x1 = load i32, i32* %x
+ %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y)
+ ret i32 %tmp
+}
+
+declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
+
+define i64 @bzhi64(i64 %x, i64 %y) {
+; CHECK-LABEL: bzhi64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+ %tmp = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %x, i64 %y)
+ ret i64 %tmp
+}
+
+declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
+
+define i32 @pdep32(i32 %x, i32 %y) {
+; CHECK-LABEL: pdep32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pdepl %esi, %edi, %eax
+; CHECK-NEXT: retq
+ %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y)
+ ret i32 %tmp
+}
+
+define i32 @pdep32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: pdep32_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pdepl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
+ %y1 = load i32, i32* %y
+ %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1)
+ ret i32 %tmp
+}
+
+declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
+
+define i64 @pdep64(i64 %x, i64 %y) {
+; CHECK-LABEL: pdep64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pdepq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+ %tmp = tail call i64 @llvm.x86.bmi.pdep.64(i64 %x, i64 %y)
+ ret i64 %tmp
+}
+
+declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
+
+define i32 @pext32(i32 %x, i32 %y) {
+; CHECK-LABEL: pext32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextl %esi, %edi, %eax
+; CHECK-NEXT: retq
+ %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y)
+ ret i32 %tmp
+}
+
+define i32 @pext32_load(i32 %x, i32* %y) {
+; CHECK-LABEL: pext32_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextl (%rsi), %edi, %eax
+; CHECK-NEXT: retq
+ %y1 = load i32, i32* %y
+ %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1)
+ ret i32 %tmp
+}
+
+declare i32 @llvm.x86.bmi.pext.32(i32, i32)
+
+define i64 @pext64(i64 %x, i64 %y) {
+; CHECK-LABEL: pext64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+ %tmp = tail call i64 @llvm.x86.bmi.pext.64(i64 %x, i64 %y)
+ ret i64 %tmp
+}
+
+declare i64 @llvm.x86.bmi.pext.64(i64, i64)
+
diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll
index 7c1042878d59..d5711fdb3ca1 100644
--- a/test/CodeGen/X86/bool-ext-inc.ll
+++ b/test/CodeGen/X86/bool-ext-inc.ll
@@ -5,7 +5,7 @@
define i32 @sext_inc(i1 zeroext %x) nounwind {
; CHECK-LABEL: sext_inc:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorb $1, %dil
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: retq
@@ -18,7 +18,7 @@ define i32 @sext_inc(i1 zeroext %x) nounwind {
define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind {
; CHECK-LABEL: sext_inc_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -29,7 +29,7 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind {
define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: cmpgt_sext_inc_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0
@@ -42,7 +42,7 @@ define <4 x i32> @cmpgt_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind {
define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: cmpne_sext_inc_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -54,7 +54,7 @@ define <4 x i32> @cmpne_sext_inc_vec(<4 x i32> %x, <4 x i32> %y) nounwind {
define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind {
; CHECK-LABEL: cmpgt_sext_inc_vec256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0
@@ -67,7 +67,7 @@ define <4 x i64> @cmpgt_sext_inc_vec256(<4 x i64> %x, <4 x i64> %y) nounwind {
define i32 @bool_logic_and_math(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; CHECK-LABEL: bool_logic_and_math:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sete %al
; CHECK-NEXT: cmpl %ecx, %edx
@@ -85,7 +85,7 @@ define i32 @bool_logic_and_math(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
define <4 x i32> @bool_logic_and_math_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
; CHECK-LABEL: bool_logic_and_math_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm1
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 7f7f9791d903..87929ad33258 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -3,7 +3,7 @@
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: ptest %xmm0, %xmm0
; CHECK-NEXT: cmovnel %esi, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -16,10 +16,10 @@ define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
define i32 @bar(<2 x i64> %c) {
; CHECK-LABEL: bar:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ptest %xmm0, %xmm0
; CHECK-NEXT: jne .LBB1_2
-; CHECK-NEXT: # BB#1: # %if-true-block
+; CHECK-NEXT: # %bb.1: # %if-true-block
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB1_2: # %endif-block
@@ -37,7 +37,7 @@ endif-block:
define i32 @bax(<2 x i64> %c) {
; CHECK-LABEL: bax:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ptest %xmm0, %xmm0
; CHECK-NEXT: sete %al
@@ -50,12 +50,12 @@ define i32 @bax(<2 x i64> %c) {
define i16 @rnd16(i16 %arg) nounwind {
; CHECK-LABEL: rnd16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rdrandw %cx
; CHECK-NEXT: cmovbw %di, %ax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%1 = tail call { i16, i32 } @llvm.x86.rdrand.16() nounwind
%2 = extractvalue { i16, i32 } %1, 0
@@ -68,7 +68,7 @@ define i16 @rnd16(i16 %arg) nounwind {
define i32 @rnd32(i32 %arg) nounwind {
; CHECK-LABEL: rnd32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rdrandl %ecx
; CHECK-NEXT: cmovbl %edi, %eax
@@ -85,7 +85,7 @@ define i32 @rnd32(i32 %arg) nounwind {
define i64 @rnd64(i64 %arg) nounwind {
; CHECK-LABEL: rnd64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rdrandq %rcx
; CHECK-NEXT: cmovbq %rdi, %rax
@@ -102,12 +102,12 @@ define i64 @rnd64(i64 %arg) nounwind {
define i16 @seed16(i16 %arg) nounwind {
; CHECK-LABEL: seed16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rdseedw %cx
; CHECK-NEXT: cmovbw %di, %ax
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%1 = tail call { i16, i32 } @llvm.x86.rdseed.16() nounwind
%2 = extractvalue { i16, i32 } %1, 0
@@ -120,7 +120,7 @@ define i16 @seed16(i16 %arg) nounwind {
define i32 @seed32(i32 %arg) nounwind {
; CHECK-LABEL: seed32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rdseedl %ecx
; CHECK-NEXT: cmovbl %edi, %eax
@@ -137,7 +137,7 @@ define i32 @seed32(i32 %arg) nounwind {
define i64 @seed64(i64 %arg) nounwind {
; CHECK-LABEL: seed64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: rdseedq %rcx
; CHECK-NEXT: cmovbq %rdi, %rax
diff --git a/test/CodeGen/X86/bool-vector.ll b/test/CodeGen/X86/bool-vector.ll
new file mode 100644
index 000000000000..ec9e42fceceb
--- /dev/null
+++ b/test/CodeGen/X86/bool-vector.ll
@@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX2
+
+define i32 @PR15215_bad(<4 x i32> %input) {
+; X32-LABEL: PR15215_bad:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X32-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X32-NEXT: addb %ah, %ah
+; X32-NEXT: andb $1, %dl
+; X32-NEXT: orb %ah, %dl
+; X32-NEXT: shlb $2, %dl
+; X32-NEXT: addb %cl, %cl
+; X32-NEXT: andb $1, %al
+; X32-NEXT: orb %cl, %al
+; X32-NEXT: andb $3, %al
+; X32-NEXT: orb %dl, %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: andl $15, %eax
+; X32-NEXT: retl
+;
+; X32-SSE2-LABEL: PR15215_bad:
+; X32-SSE2: # %bb.0: # %entry
+; X32-SSE2-NEXT: pslld $31, %xmm0
+; X32-SSE2-NEXT: psrad $31, %xmm0
+; X32-SSE2-NEXT: movmskps %xmm0, %eax
+; X32-SSE2-NEXT: retl
+;
+; X32-AVX2-LABEL: PR15215_bad:
+; X32-AVX2: # %bb.0: # %entry
+; X32-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; X32-AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; X32-AVX2-NEXT: vmovmskps %xmm0, %eax
+; X32-AVX2-NEXT: retl
+;
+; X64-LABEL: PR15215_bad:
+; X64: # %bb.0: # %entry
+; X64-NEXT: addb %cl, %cl
+; X64-NEXT: andb $1, %dl
+; X64-NEXT: orb %cl, %dl
+; X64-NEXT: shlb $2, %dl
+; X64-NEXT: addb %sil, %sil
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: orb %sil, %dil
+; X64-NEXT: andb $3, %dil
+; X64-NEXT: orb %dl, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: andl $15, %eax
+; X64-NEXT: retq
+;
+; X64-SSE2-LABEL: PR15215_bad:
+; X64-SSE2: # %bb.0: # %entry
+; X64-SSE2-NEXT: pslld $31, %xmm0
+; X64-SSE2-NEXT: psrad $31, %xmm0
+; X64-SSE2-NEXT: movmskps %xmm0, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: PR15215_bad:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: vpslld $31, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovmskps %xmm0, %eax
+; X64-AVX2-NEXT: retq
+entry:
+ %0 = trunc <4 x i32> %input to <4 x i1>
+ %1 = bitcast <4 x i1> %0 to i4
+ %2 = zext i4 %1 to i32
+ ret i32 %2
+}
+
+define i32 @PR15215_good(<4 x i32> %input) {
+; X32-LABEL: PR15215_good:
+; X32: # %bb.0: # %entry
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: .cfi_offset %esi, -8
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andl $1, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: andl $1, %ecx
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: andl $1, %edx
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: andl $1, %esi
+; X32-NEXT: leal (%eax,%ecx,2), %eax
+; X32-NEXT: leal (%eax,%edx,4), %eax
+; X32-NEXT: leal (%eax,%esi,8), %eax
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
+; X32-SSE2-LABEL: PR15215_good:
+; X32-SSE2: # %bb.0: # %entry
+; X32-SSE2-NEXT: pushl %esi
+; X32-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X32-SSE2-NEXT: .cfi_offset %esi, -8
+; X32-SSE2-NEXT: movd %xmm0, %eax
+; X32-SSE2-NEXT: andl $1, %eax
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X32-SSE2-NEXT: movd %xmm1, %ecx
+; X32-SSE2-NEXT: andl $1, %ecx
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X32-SSE2-NEXT: movd %xmm1, %edx
+; X32-SSE2-NEXT: andl $1, %edx
+; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE2-NEXT: movd %xmm0, %esi
+; X32-SSE2-NEXT: andl $1, %esi
+; X32-SSE2-NEXT: leal (%eax,%ecx,2), %eax
+; X32-SSE2-NEXT: leal (%eax,%edx,4), %eax
+; X32-SSE2-NEXT: leal (%eax,%esi,8), %eax
+; X32-SSE2-NEXT: popl %esi
+; X32-SSE2-NEXT: retl
+;
+; X32-AVX2-LABEL: PR15215_good:
+; X32-AVX2: # %bb.0: # %entry
+; X32-AVX2-NEXT: pushl %esi
+; X32-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X32-AVX2-NEXT: .cfi_offset %esi, -8
+; X32-AVX2-NEXT: vmovd %xmm0, %eax
+; X32-AVX2-NEXT: andl $1, %eax
+; X32-AVX2-NEXT: vpextrd $1, %xmm0, %ecx
+; X32-AVX2-NEXT: andl $1, %ecx
+; X32-AVX2-NEXT: vpextrd $2, %xmm0, %edx
+; X32-AVX2-NEXT: andl $1, %edx
+; X32-AVX2-NEXT: vpextrd $3, %xmm0, %esi
+; X32-AVX2-NEXT: andl $1, %esi
+; X32-AVX2-NEXT: leal (%eax,%ecx,2), %eax
+; X32-AVX2-NEXT: leal (%eax,%edx,4), %eax
+; X32-AVX2-NEXT: leal (%eax,%esi,8), %eax
+; X32-AVX2-NEXT: popl %esi
+; X32-AVX2-NEXT: retl
+;
+; X64-LABEL: PR15215_good:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %ecx killed %ecx def %rcx
+; X64-NEXT: # kill: def %edx killed %edx def %rdx
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: andl $1, %edi
+; X64-NEXT: andl $1, %esi
+; X64-NEXT: andl $1, %edx
+; X64-NEXT: andl $1, %ecx
+; X64-NEXT: leal (%rdi,%rsi,2), %eax
+; X64-NEXT: leal (%rax,%rdx,4), %eax
+; X64-NEXT: leal (%rax,%rcx,8), %eax
+; X64-NEXT: retq
+;
+; X64-SSE2-LABEL: PR15215_good:
+; X64-SSE2: # %bb.0: # %entry
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: andl $1, %eax
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-SSE2-NEXT: movd %xmm1, %ecx
+; X64-SSE2-NEXT: andl $1, %ecx
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movd %xmm1, %edx
+; X64-SSE2-NEXT: andl $1, %edx
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X64-SSE2-NEXT: movd %xmm0, %esi
+; X64-SSE2-NEXT: andl $1, %esi
+; X64-SSE2-NEXT: leal (%rax,%rcx,2), %eax
+; X64-SSE2-NEXT: leal (%rax,%rdx,4), %eax
+; X64-SSE2-NEXT: leal (%rax,%rsi,8), %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: PR15215_good:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: andl $1, %eax
+; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx
+; X64-AVX2-NEXT: andl $1, %ecx
+; X64-AVX2-NEXT: vpextrd $2, %xmm0, %edx
+; X64-AVX2-NEXT: andl $1, %edx
+; X64-AVX2-NEXT: vpextrd $3, %xmm0, %esi
+; X64-AVX2-NEXT: andl $1, %esi
+; X64-AVX2-NEXT: leal (%rax,%rcx,2), %eax
+; X64-AVX2-NEXT: leal (%rax,%rdx,4), %eax
+; X64-AVX2-NEXT: leal (%rax,%rsi,8), %eax
+; X64-AVX2-NEXT: retq
+entry:
+ %0 = trunc <4 x i32> %input to <4 x i1>
+ %1 = extractelement <4 x i1> %0, i32 0
+ %e1 = select i1 %1, i32 1, i32 0
+ %2 = extractelement <4 x i1> %0, i32 1
+ %e2 = select i1 %2, i32 2, i32 0
+ %3 = extractelement <4 x i1> %0, i32 2
+ %e3 = select i1 %3, i32 4, i32 0
+ %4 = extractelement <4 x i1> %0, i32 3
+ %e4 = select i1 %4, i32 8, i32 0
+ %5 = or i32 %e1, %e2
+ %6 = or i32 %5, %e3
+ %7 = or i32 %6, %e4
+ ret i32 %7
+}
diff --git a/test/CodeGen/X86/bool-zext.ll b/test/CodeGen/X86/bool-zext.ll
index 5cc758c06b5d..82b6a993ac22 100644
--- a/test/CodeGen/X86/bool-zext.ll
+++ b/test/CodeGen/X86/bool-zext.ll
@@ -1,56 +1,62 @@
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
-; Check that the argument gets zero-extended before calling.
-; X86-LABEL: bar1
-; X86: movzbl
-; X86: calll
-; X64-LABEL: bar1
-; X64: movzbl
-; X64: jmp
-; WIN64-LABEL: bar1
-; WIN64: movzbl
-; WIN64: callq
+; It's not necessary to zero-extend the arg because it is specified 'zeroext'.
define void @bar1(i1 zeroext %v1) nounwind ssp {
-entry:
+; X32-LABEL: bar1:
+; X32: # %bb.0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pushl %eax
+; X32-NEXT: calll foo1
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: bar1:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: jmp foo1 # TAILCALL
%conv = zext i1 %v1 to i32
%call = tail call i32 (...) @foo1(i32 %conv) nounwind
ret void
}
; Check that on x86-64 the arguments are simply forwarded.
-; X64-LABEL: bar2
-; X64-NOT: movzbl
-; X64: jmp
-; WIN64-LABEL: bar2
-; WIN64-NOT: movzbl
-; WIN64: callq
define void @bar2(i8 zeroext %v1) nounwind ssp {
-entry:
+; X32-LABEL: bar2:
+; X32: # %bb.0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pushl %eax
+; X32-NEXT: calll foo1
+; X32-NEXT: addl $4, %esp
+; X32-NEXT: retl
+;
+; X64-LABEL: bar2:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: jmp foo1 # TAILCALL
%conv = zext i8 %v1 to i32
%call = tail call i32 (...) @foo1(i32 %conv) nounwind
ret void
}
; Check that i1 return values are not zero-extended.
-; X86-LABEL: bar3
-; X86: call
-; X86-NEXT: {{add|pop}}
-; X86-NEXT: ret
-; X64-LABEL: bar3
-; X64: call
-; X64-NEXT: {{add|pop}}
-; X64-NEXT: ret
-; WIN64-LABEL: bar3
-; WIN64: call
-; WIN64-NEXT: {{add|pop}}
-; WIN64-NEXT: ret
define zeroext i1 @bar3() nounwind ssp {
-entry:
+; X32-LABEL: bar3:
+; X32: # %bb.0:
+; X32-NEXT: calll foo2
+; X32-NEXT: retl
+;
+; X64-LABEL: bar3:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo2
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
%call = call i1 @foo2() nounwind
ret i1 %call
}
declare i32 @foo1(...)
declare zeroext i1 @foo2()
+
diff --git a/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir b/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir
new file mode 100644
index 000000000000..965014162073
--- /dev/null
+++ b/test/CodeGen/X86/branch_instruction_and_target_split_perf_nops.mir
@@ -0,0 +1,288 @@
+# RUN: llc -mcpu=haswell -filetype=obj -start-before stack-protector -O2 %s -o - | llvm-objdump -d - | FileCheck %s
+
+# Test 1:
+#
+# Source C code:
+# volatile int y;
+# volatile int x;
+#
+# int switchCase(int z, int w) {
+# int result = 0;
+# while (x > 0 && y < 0) {
+# switch(z) {
+# case 0:
+# result+=result*5;break;
+# case 1:
+# result--; break;
+# case 2:
+# result *= result; break;
+# case 3:
+# result <<= 7; break;
+# case 4:
+# result >>= 7; break;
+# case 5:
+# result = result * 16 | ~result; break;
+# }
+# }
+# return result;
+# }
+#
+# CHECK: 49: eb 4a jmp 74 <switchCase+0x95>
+# CHECK: 57: eb 3c jmp 60 <switchCase+0x95>
+# CHECK: 65: eb 2e jmp 46 <switchCase+0x95>
+# CHECK: 73: eb 20 jmp 32 <switchCase+0x95>
+# CHECK: 81: eb 12 jmp 18 <switchCase+0x95>
+# CHECK: 93: 7f 8b jg -117 <switchCase+0x20>
+
+# Test 2:
+#
+# Source C code:
+#
+# int ifElse(int z) {
+# int w = 0;
+# while(1) {
+# if(x < 0)
+# w++;
+# else if(y > 0)
+# w--;
+# else if((x & y) == 3)
+# w*=2;
+# else if ((x | y) == 18)
+# w += 2;
+# else if ((y ^ x) == 154)
+# w -= 3;
+# else if(((y ^ x) & 1) != 0)
+# break;
+# }
+# return w;
+# }
+#
+# CHECK: 129: eb 13 jmp 19 <ifElse+0x7E>
+# CHECK: 12e: eb a0 jmp -96 <ifElse+0x10>
+# CHECK: 132: eb 9c jmp -100 <ifElse+0x10>
+# CHECK: 137: eb 97 jmp -105 <ifElse+0x10>
+# CHECK: 13c: eb 92 jmp -110 <ifElse+0x10>
+--- |
+ ; ModuleID = 'D:\iusers\opaparo\dev_test\branch_instruction_and_target_split_perf_nops.ll'
+ source_filename = "D:\5C\5Ciusers\5C\5Copaparo\5C\5Cdev_test\5C\5Cbranch_instruction_and_target_split_perf_nops.c"
+ target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-pc-windows-msvc19.0.24210"
+
+ @x = common global i32 0, align 4
+ @y = common global i32 0, align 4
+
+ ; Function Attrs: norecurse nounwind uwtable
+ define i32 @switchCase(i32 %z, i32 %w) local_unnamed_addr #0 {
+ entry:
+ %0 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %cmp19 = icmp sgt i32 %0, 0
+ br i1 %cmp19, label %land.rhs.preheader, label %while.end
+
+ land.rhs.preheader: ; preds = %entry
+ br label %land.rhs
+
+ land.rhs: ; preds = %sw.epilog, %land.rhs.preheader
+ %result.020 = phi i32 [ %result.1, %sw.epilog ], [ 0, %land.rhs.preheader ]
+ %1 = load volatile i32, i32* @y, align 4, !tbaa !3
+ %cmp1 = icmp slt i32 %1, 0
+ br i1 %cmp1, label %while.body, label %while.end
+
+ while.body: ; preds = %land.rhs
+ switch i32 %z, label %sw.epilog [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb2
+ i32 2, label %sw.bb3
+ i32 3, label %sw.bb5
+ i32 4, label %sw.bb6
+ i32 5, label %sw.bb7
+ ]
+
+ sw.bb: ; preds = %while.body
+ %add = mul nsw i32 %result.020, 6
+ br label %sw.epilog
+
+ sw.bb2: ; preds = %while.body
+ %dec = add nsw i32 %result.020, -1
+ br label %sw.epilog
+
+ sw.bb3: ; preds = %while.body
+ %mul4 = mul nsw i32 %result.020, %result.020
+ br label %sw.epilog
+
+ sw.bb5: ; preds = %while.body
+ %shl = shl i32 %result.020, 7
+ br label %sw.epilog
+
+ sw.bb6: ; preds = %while.body
+ %shr = ashr i32 %result.020, 7
+ br label %sw.epilog
+
+ sw.bb7: ; preds = %while.body
+ %mul8 = shl nsw i32 %result.020, 4
+ %neg = xor i32 %result.020, -1
+ %or = or i32 %mul8, %neg
+ br label %sw.epilog
+
+ sw.epilog: ; preds = %sw.bb7, %sw.bb6, %sw.bb5, %sw.bb3, %sw.bb2, %sw.bb, %while.body
+ %result.1 = phi i32 [ %result.020, %while.body ], [ %or, %sw.bb7 ], [ %shr, %sw.bb6 ], [ %shl, %sw.bb5 ], [ %mul4, %sw.bb3 ], [ %dec, %sw.bb2 ], [ %add, %sw.bb ]
+ %2 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %cmp = icmp sgt i32 %2, 0
+ br i1 %cmp, label %land.rhs, label %while.end
+
+ while.end: ; preds = %sw.epilog, %land.rhs, %entry
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %result.020, %land.rhs ], [ %result.1, %sw.epilog ]
+ ret i32 %result.0.lcssa
+ }
+
+ ; Function Attrs: norecurse nounwind uwtable
+ define i32 @ifElse(i32 %z) local_unnamed_addr #0 {
+ entry:
+ br label %while.cond.outer
+
+ while.cond.outer: ; preds = %if.then, %if.then2, %if.then5, %if.then8, %if.then11, %entry
+ %w.0.ph = phi i32 [ 0, %entry ], [ %sub, %if.then11 ], [ %add, %if.then8 ], [ %mul, %if.then5 ], [ %dec, %if.then2 ], [ %inc, %if.then ]
+ br label %while.cond
+
+ while.cond: ; preds = %if.else12, %while.cond.outer
+ %0 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %cmp = icmp slt i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+ if.then: ; preds = %while.cond
+ %inc = add nsw i32 %w.0.ph, 1
+ br label %while.cond.outer
+
+ if.else: ; preds = %while.cond
+ %1 = load volatile i32, i32* @y, align 4, !tbaa !3
+ %cmp1 = icmp sgt i32 %1, 0
+ br i1 %cmp1, label %if.then2, label %if.else3
+
+ if.then2: ; preds = %if.else
+ %dec = add nsw i32 %w.0.ph, -1
+ br label %while.cond.outer
+
+ if.else3: ; preds = %if.else
+ %2 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %3 = load volatile i32, i32* @y, align 4, !tbaa !3
+ %and = and i32 %3, %2
+ %cmp4 = icmp eq i32 %and, 3
+ br i1 %cmp4, label %if.then5, label %if.else6
+
+ if.then5: ; preds = %if.else3
+ %mul = shl nsw i32 %w.0.ph, 1
+ br label %while.cond.outer
+
+ if.else6: ; preds = %if.else3
+ %4 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %5 = load volatile i32, i32* @y, align 4, !tbaa !3
+ %or = or i32 %5, %4
+ %cmp7 = icmp eq i32 %or, 18
+ br i1 %cmp7, label %if.then8, label %if.else9
+
+ if.then8: ; preds = %if.else6
+ %add = add nsw i32 %w.0.ph, 2
+ br label %while.cond.outer
+
+ if.else9: ; preds = %if.else6
+ %6 = load volatile i32, i32* @y, align 4, !tbaa !3
+ %7 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %xor = xor i32 %7, %6
+ %cmp10 = icmp eq i32 %xor, 154
+ br i1 %cmp10, label %if.then11, label %if.else12
+
+ if.then11: ; preds = %if.else9
+ %sub = add nsw i32 %w.0.ph, -3
+ br label %while.cond.outer
+
+ if.else12: ; preds = %if.else9
+ %8 = load volatile i32, i32* @y, align 4, !tbaa !3
+ %9 = load volatile i32, i32* @x, align 4, !tbaa !3
+ %xor13 = xor i32 %9, %8
+ %and14 = and i32 %xor13, 1
+ %cmp15 = icmp eq i32 %and14, 0
+ br i1 %cmp15, label %while.cond, label %while.end
+
+ while.end: ; preds = %if.else12
+ ret i32 %w.0.ph
+ }
+
+ attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+ !llvm.module.flags = !{!0, !1}
+ !llvm.ident = !{!2}
+
+ !0 = !{i32 1, !"wchar_size", i32 2}
+ !1 = !{i32 7, !"PIC Level", i32 2}
+ !2 = !{!"clang version 6.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_clang_worldread 3789ad4283ec09df1ed8411abbb227d76e7ef8cb) (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_llvm_worldread 42897913cc9fac0d94e8636d9aed4dc193d7864e)"}
+ !3 = !{!4, !4, i64 0}
+ !4 = !{!"int", !5, i64 0}
+ !5 = !{!"omnipotent char", !6, i64 0}
+ !6 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name: switchCase
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+
+...
+---
+name: ifElse
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+
+...
diff --git a/test/CodeGen/X86/branchfolding-undef.mir b/test/CodeGen/X86/branchfolding-undef.mir
index 1a7dfb941875..1062b343338a 100644
--- a/test/CodeGen/X86/branchfolding-undef.mir
+++ b/test/CodeGen/X86/branchfolding-undef.mir
@@ -1,4 +1,4 @@
-# RUN: llc -o - %s -march=x86 -run-pass branch-folder | FileCheck %s
+# RUN: llc -o - %s -mtriple=i686-- -run-pass branch-folder | FileCheck %s
# Test that tail merging drops undef flags that aren't present on all
# instructions to be merged.
--- |
diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll
index c54ac108819e..e8e270924786 100644
--- a/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/test/CodeGen/X86/break-anti-dependencies.ll
@@ -1,10 +1,10 @@
; Without list-burr scheduling we may not see the difference in codegen here.
; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
; breaker requires liveness information to be kept.
-; RUN: llc < %s -march=x86-64 -mcpu=atom -enable-misched=false -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=atom -enable-misched=false -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
; RUN: grep "%xmm0" %t | count 14
; RUN: not grep "%xmm1" %t
-; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
; RUN: grep "%xmm0" %t | count 7
; RUN: grep "%xmm1" %t | count 7
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
index 14bdb3853b03..a20689dae3c9 100644
--- a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -18,28 +18,28 @@
define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i16:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i16:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i16:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastw {{.*#+}} xmm1 = [256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -52,28 +52,28 @@ define <16 x i8> @f16xi8_i16(<16 x i8> %a) {
define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i32:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i32:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -86,28 +86,28 @@ define <16 x i8> @f16xi8_i32(<16 x i8> %a) {
define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
; AVX-LABEL: f16xi8_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi8_i64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi8_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi8_i64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -120,7 +120,7 @@ define <16 x i8> @f16xi8_i64(<16 x i8> %a) {
define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -130,14 +130,14 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i16:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i16:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -147,7 +147,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i16:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastw {{.*#+}} ymm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -160,7 +160,7 @@ define <32 x i8> @f32xi8_i16(<32 x i8> %a) {
define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -170,14 +170,14 @@ define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i32:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -187,7 +187,7 @@ define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i32:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -200,7 +200,7 @@ define <32 x i8> @f32xi8_i32(<32 x i8> %a) {
define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -210,14 +210,14 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -227,7 +227,7 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -240,7 +240,7 @@ define <32 x i8> @f32xi8_i64(<32 x i8> %a) {
define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-LABEL: f32xi8_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -250,7 +250,7 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f32xi8_i128:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0
@@ -258,7 +258,7 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f32xi8_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -268,7 +268,7 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f32xi8_i128:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0
@@ -282,7 +282,7 @@ define <32 x i8> @f32xi8_i128(<32 x i8> %a) {
define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -298,7 +298,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f64xi8_i16:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -307,14 +307,14 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i16:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -330,7 +330,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f64xi8_i16:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} ymm2 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -339,7 +339,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i16:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastw {{.*#+}} zmm1 = [256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -352,7 +352,7 @@ define <64 x i8> @f64xi8_i16(<64 x i8> %a) {
define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-LABEL: f64i8_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -368,7 +368,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f64i8_i32:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -377,14 +377,14 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64i8_i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64i8_i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -400,7 +400,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f64i8_i32:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -409,7 +409,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64i8_i32:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -422,7 +422,7 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) {
define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -438,7 +438,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f64xi8_i64:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -447,14 +447,14 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -470,7 +470,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f64xi8_i64:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -479,7 +479,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i64:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528,506097522914230528]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -492,7 +492,7 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) {
define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -508,7 +508,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f64xi8_i128:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
@@ -518,7 +518,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i128:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
@@ -526,7 +526,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -542,7 +542,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f64xi8_i128:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
@@ -552,7 +552,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i128:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
@@ -566,7 +566,7 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) {
define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; AVX-LABEL: f64xi8_i256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -583,7 +583,7 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f64xi8_i256:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -592,7 +592,7 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f64xi8_i256:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
@@ -600,7 +600,7 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f64xi8_i256:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2
@@ -617,7 +617,7 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f64xi8_i256:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -626,7 +626,7 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f64xi8_i256:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddb %zmm1, %zmm0, %zmm0
@@ -640,28 +640,28 @@ define <64 x i8> @f64xi8_i256(<64 x i8> %a) {
define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i32:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i32:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65536,65536,65536,65536]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -674,28 +674,28 @@ define <8 x i16> @f8xi16_i32(<8 x i16> %a) {
define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [844433520132096,844433520132096]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -708,7 +708,7 @@ define <8 x i16> @f8xi16_i64(<8 x i16> %a) {
define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -718,14 +718,14 @@ define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi16_i32:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi16_i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -735,7 +735,7 @@ define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi16_i32:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65536,65536,65536,65536,65536,65536,65536,65536]
; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -748,7 +748,7 @@ define <16 x i16> @f16xi16_i32(<16 x i16> %a) {
define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -758,14 +758,14 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi16_i64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi16_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -775,7 +775,7 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi16_i64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [844433520132096,844433520132096,844433520132096,844433520132096]
; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -788,7 +788,7 @@ define <16 x i16> @f16xi16_i64(<16 x i16> %a) {
define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
; AVX-LABEL: f16xi16_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -798,7 +798,7 @@ define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f16xi16_i128:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0
@@ -806,7 +806,7 @@ define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f16xi16_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -816,7 +816,7 @@ define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f16xi16_i128:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0
@@ -830,7 +830,7 @@ define <16 x i16> @f16xi16_i128(<16 x i16> %a) {
define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -846,7 +846,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f32xi16_i32:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -855,14 +855,14 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f32xi16_i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm3 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41]
; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -878,7 +878,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i32:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536]
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -887,7 +887,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i32:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastd {{.*#+}} zmm1 = [65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536,65536]
; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -900,7 +900,7 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) {
define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -916,7 +916,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f32xi16_i64:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -925,14 +925,14 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f32xi16_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -948,7 +948,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i64:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096]
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -957,7 +957,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i64:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096,844433520132096]
; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -970,7 +970,7 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) {
define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -986,7 +986,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f32xi16_i128:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; NO-AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
@@ -996,7 +996,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i128:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@@ -1004,7 +1004,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f32xi16_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7]
; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -1020,7 +1020,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i128:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; NO-AVX512BW-64-NEXT: # ymm2 = mem[0,1,0,1]
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
@@ -1030,7 +1030,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i128:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@@ -1044,7 +1044,7 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) {
define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; AVX-LABEL: f32xi16_i256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -1061,7 +1061,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; AVX-NEXT: retl
;
; NO-AVX512BW-LABEL: f32xi16_i256:
-; NO-AVX512BW: # BB#0:
+; NO-AVX512BW: # %bb.0:
; NO-AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -1070,7 +1070,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; NO-AVX512BW-NEXT: retl
;
; AVX512BW-LABEL: f32xi16_i256:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@@ -1078,7 +1078,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; AVX512BW-NEXT: retl
;
; AVX-64-LABEL: f32xi16_i256:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,12,13,14,15]
; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2
@@ -1095,7 +1095,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; AVX-64-NEXT: retq
;
; NO-AVX512BW-64-LABEL: f32xi16_i256:
-; NO-AVX512BW-64: # BB#0:
+; NO-AVX512BW-64: # %bb.0:
; NO-AVX512BW-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1
; NO-AVX512BW-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -1104,7 +1104,7 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
; NO-AVX512BW-64-NEXT: retq
;
; AVX512BW-64-LABEL: f32xi16_i256:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-64-NEXT: vpaddw %zmm1, %zmm0, %zmm0
@@ -1119,28 +1119,28 @@ define <32 x i16> @f32xi16_i256(<32 x i16> %a) {
define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
; AVX-LABEL: f4xi32_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f4xi32_i64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f4xi32_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f4xi32_i64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967296,4294967296]
; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1153,7 +1153,7 @@ define <4 x i32> @f4xi32_i64(<4 x i32> %a) {
define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
; AVX-LABEL: f8xi32_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -1163,14 +1163,14 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi32_i64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi32_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -1180,7 +1180,7 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi32_i64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967296,4294967296,4294967296,4294967296]
; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -1193,7 +1193,7 @@ define <8 x i32> @f8xi32_i64(<8 x i32> %a) {
define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
; AVX-LABEL: f8xi32_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -1203,7 +1203,7 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi32_i128:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -1211,7 +1211,7 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi32_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3]
; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -1221,7 +1221,7 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi32_i128:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -1235,7 +1235,7 @@ define <8 x i32> @f8xi32_i128(<8 x i32> %a) {
define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-LABEL: f16xi32_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -1251,7 +1251,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f16xi32_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
@@ -1260,14 +1260,14 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xi32_i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xi32_i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -1283,7 +1283,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f16xi32_i64:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296]
; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0
@@ -1292,7 +1292,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f16xi32_i64:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296,4294967296]
; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -1305,7 +1305,7 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) {
define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX-LABEL: f16xi32_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -1321,7 +1321,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f16xi32_i128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
@@ -1331,15 +1331,15 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xi32_i128:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xi32_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3]
; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2
@@ -1355,7 +1355,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f16xi32_i128:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3]
; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1
@@ -1365,11 +1365,11 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f16xi32_i128:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-64-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: retq
%res1 = add <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
%res2 = and <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %res1
@@ -1379,7 +1379,7 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) {
define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
; AVX-LABEL: f4xi64_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1390,14 +1390,14 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f4xi64_i128:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0]
; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f4xi64_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-64-NEXT: movl $1, %eax
; AVX-64-NEXT: vmovq %rax, %xmm2
@@ -1409,7 +1409,7 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f4xi64_i128:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
@@ -1423,7 +1423,7 @@ define <4 x i64> @f4xi64_i128(<4 x i64> %a) {
define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX-LABEL: f8xi64_i128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -1439,7 +1439,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f8xi64_i128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0]
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -1448,14 +1448,14 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f8xi64_i128:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0]
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f8xi64_i128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: movl $1, %eax
; AVX-64-NEXT: vmovq %rax, %xmm3
@@ -1474,7 +1474,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f8xi64_i128:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1]
; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
@@ -1484,7 +1484,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f8xi64_i128:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,0,1,0,1,0,1]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
@@ -1498,7 +1498,7 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) {
define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
; AVX-LABEL: f8xi64_i256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -1514,7 +1514,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f8xi64_i256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0]
; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -1523,14 +1523,14 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f8xi64_i256:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,0,0,1,0,2,0,3,0]
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f8xi64_i256:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2
@@ -1549,7 +1549,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f8xi64_i256:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0
@@ -1558,7 +1558,7 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f8xi64_i256:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
@@ -1572,28 +1572,28 @@ define <8 x i64> @f8xi64_i256(<8 x i64> %a) {
define <4 x float> @f4xf32_f64(<4 x float> %a) {
; AVX-LABEL: f4xf32_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f4xf32_f64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f4xf32_f64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f4xf32_f64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760]
; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0
@@ -1606,28 +1606,28 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) {
define <8 x float> @f8xf32_f64(<8 x float> %a) {
; AVX-LABEL: f8xf32_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xf32_f64:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xf32_f64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xf32_f64:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0
@@ -1640,7 +1640,7 @@ define <8 x float> @f8xf32_f64(<8 x float> %a) {
define <8 x float> @f8xf32_f128(<8 x float> %a) {
; AVX-LABEL: f8xf32_f128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
@@ -1648,7 +1648,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xf32_f128:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0
@@ -1656,7 +1656,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xf32_f128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0
@@ -1664,7 +1664,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xf32_f128:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0
@@ -1678,7 +1678,7 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) {
define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX-LABEL: f16xf32_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1687,7 +1687,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f16xf32_f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1696,14 +1696,14 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastsd {{.*#+}} zmm1 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xf32_f64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0.0078125018626451492,0.0078125018626451492,0.0078125018626451492,0.0078125018626451492]
; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1712,7 +1712,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f16xf32_f64:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1721,7 +1721,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f16xf32_f64:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760]
; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512F-64-NEXT: vdivps %zmm0, %zmm1, %zmm0
@@ -1734,7 +1734,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) {
define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX-LABEL: f16xf32_f128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1744,7 +1744,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f16xf32_f128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1754,7 +1754,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f128:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
@@ -1762,7 +1762,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xf32_f128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1772,7 +1772,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f16xf32_f128:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
@@ -1782,7 +1782,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f16xf32_f128:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
@@ -1796,7 +1796,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) {
define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX-LABEL: f16xf32_f256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1805,7 +1805,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f16xf32_f256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1814,7 +1814,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f16xf32_f256:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
@@ -1822,7 +1822,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f16xf32_f256:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1831,7 +1831,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f16xf32_f256:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -1840,7 +1840,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f16xf32_f256:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vaddps %zmm1, %zmm0, %zmm0
@@ -1854,7 +1854,7 @@ define <16 x float> @f16xf32_f256(<16 x float> %a) {
define <4 x double> @f4xf64_f128(<4 x double> %a) {
; AVX-LABEL: f4xf64_f128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
@@ -1862,7 +1862,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
; AVX-NEXT: retl
;
; ALL32-LABEL: f4xf64_f128:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; ALL32-NEXT: # ymm1 = mem[0,1,0,1]
; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
@@ -1870,7 +1870,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f4xf64_f128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX-64-NEXT: # ymm1 = mem[0,1,0,1]
; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
@@ -1878,7 +1878,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f4xf64_f128:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; ALL64-NEXT: # ymm1 = mem[0,1,0,1]
; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
@@ -1892,7 +1892,7 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) {
define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX-LABEL: f8xf64_f128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
@@ -1902,7 +1902,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f8xf64_f128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
@@ -1912,7 +1912,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f8xf64_f128:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
@@ -1920,7 +1920,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f8xf64_f128:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
@@ -1930,7 +1930,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f8xf64_f128:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
@@ -1940,7 +1940,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f8xf64_f128:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = [2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00,2.000000e+00,1.000000e+00]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0
@@ -1961,7 +1961,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) {
define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX-LABEL: f8xf64_f256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0
@@ -1970,7 +1970,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX-NEXT: retl
;
; AVX2-LABEL: f8xf64_f256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
@@ -1979,7 +1979,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX2-NEXT: retl
;
; AVX512-LABEL: f8xf64_f256:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
@@ -1987,7 +1987,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX512-NEXT: retl
;
; AVX-64-LABEL: f8xf64_f256:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
@@ -1996,7 +1996,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX-64-NEXT: retq
;
; AVX2-64-LABEL: f8xf64_f256:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1
; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
@@ -2005,7 +2005,7 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
; AVX2-64-NEXT: retq
;
; AVX512F-64-LABEL: f8xf64_f256:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = [4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00,1.000000e+00,2.000000e+00,3.000000e+00]
; AVX512F-64-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512F-64-NEXT: vaddpd %zmm1, %zmm0, %zmm0
@@ -2020,28 +2020,28 @@ define <8 x double> @f8xf64_f256(<8 x double> %a) {
define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) {
; AVX-LABEL: f8xi16_i32_NaN:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
;
; ALL32-LABEL: f8xi16_i32_NaN:
-; ALL32: # BB#0:
+; ALL32: # %bb.0:
; ALL32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0
; ALL32-NEXT: retl
;
; AVX-64-LABEL: f8xi16_i32_NaN:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-64-NEXT: retq
;
; ALL64-LABEL: f8xi16_i32_NaN:
-; ALL64: # BB#0:
+; ALL64: # %bb.0:
; ALL64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776]
; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/broadcastm-lowering.ll b/test/CodeGen/X86/broadcastm-lowering.ll
new file mode 100644
index 000000000000..8548d8b7677d
--- /dev/null
+++ b/test/CodeGen/X86/broadcastm-lowering.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=AVX512VLCDBW
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=X86-AVX512VLCDBW
+
+define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) {
+; AVX512CD-LABEL: test_mm_epi64:
+; AVX512CD: # %bb.0: # %entry
+; AVX512CD-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512CD-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512CD-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512CD-NEXT: kmovw %k0, %eax
+; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vzeroupper
+; AVX512CD-NEXT: retq
+;
+; AVX512VLCDBW-LABEL: test_mm_epi64:
+; AVX512VLCDBW: # %bb.0: # %entry
+; AVX512VLCDBW-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; AVX512VLCDBW-NEXT: vpbroadcastmb2q %k0, %xmm0
+; AVX512VLCDBW-NEXT: retq
+;
+; X86-AVX512VLCDBW-LABEL: test_mm_epi64:
+; X86-AVX512VLCDBW: # %bb.0: # %entry
+; X86-AVX512VLCDBW-NEXT: vpcmpeqw %xmm1, %xmm0, %k0
+; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
+; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
+; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
+; X86-AVX512VLCDBW-NEXT: retl
+entry:
+ %0 = icmp eq <8 x i16> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <2 x i64> undef, i64 %conv.i, i32 0
+ %vecinit1.i.i = shufflevector <2 x i64> %vecinit.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %vecinit1.i.i
+}
+
+define <4 x i32> @test_mm_epi32(<16 x i8> %a, <16 x i8> %b) {
+; AVX512CD-LABEL: test_mm_epi32:
+; AVX512CD: # %bb.0: # %entry
+; AVX512CD-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512CD-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512CD-NEXT: kmovw %k0, %eax
+; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vzeroupper
+; AVX512CD-NEXT: retq
+;
+; AVX512VLCDBW-LABEL: test_mm_epi32:
+; AVX512VLCDBW: # %bb.0: # %entry
+; AVX512VLCDBW-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %xmm0
+; AVX512VLCDBW-NEXT: retq
+;
+; X86-AVX512VLCDBW-LABEL: test_mm_epi32:
+; X86-AVX512VLCDBW: # %bb.0: # %entry
+; X86-AVX512VLCDBW-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
+; X86-AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %xmm0
+; X86-AVX512VLCDBW-NEXT: retl
+entry:
+ %0 = icmp eq <16 x i8> %a, %b
+ %1 = bitcast <16 x i1> %0 to i16
+ %conv.i = zext i16 %1 to i32
+ %vecinit.i.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i32> %vecinit.i.i, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %vecinit3.i.i
+}
+
+define <16 x i32> @test_mm512_epi32(<16 x i32> %a, <16 x i32> %b) {
+; AVX512CD-LABEL: test_mm512_epi32:
+; AVX512CD: # %bb.0: # %entry
+; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512CD-NEXT: vpbroadcastmw2d %k0, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512VLCDBW-LABEL: test_mm512_epi32:
+; AVX512VLCDBW: # %bb.0: # %entry
+; AVX512VLCDBW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %zmm0
+; AVX512VLCDBW-NEXT: retq
+;
+; X86-AVX512VLCDBW-LABEL: test_mm512_epi32:
+; X86-AVX512VLCDBW: # %bb.0: # %entry
+; X86-AVX512VLCDBW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; X86-AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %zmm0
+; X86-AVX512VLCDBW-NEXT: retl
+entry:
+ %0 = icmp eq <16 x i32> %a, %b
+ %1 = bitcast <16 x i1> %0 to i16
+ %conv.i = zext i16 %1 to i32
+ %vecinit.i.i = insertelement <16 x i32> undef, i32 %conv.i, i32 0
+ %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32> %vecinit15.i.i
+}
+
+define <8 x i64> @test_mm512_epi64(<8 x i32> %a, <8 x i32> %b) {
+; AVX512CD-LABEL: test_mm512_epi64:
+; AVX512CD: # %bb.0: # %entry
+; AVX512CD-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512CD-NEXT: vpbroadcastmb2q %k0, %zmm0
+; AVX512CD-NEXT: retq
+;
+; AVX512VLCDBW-LABEL: test_mm512_epi64:
+; AVX512VLCDBW: # %bb.0: # %entry
+; AVX512VLCDBW-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; AVX512VLCDBW-NEXT: vpbroadcastmb2q %k0, %zmm0
+; AVX512VLCDBW-NEXT: retq
+;
+; X86-AVX512VLCDBW-LABEL: test_mm512_epi64:
+; X86-AVX512VLCDBW: # %bb.0: # %entry
+; X86-AVX512VLCDBW-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
+; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
+; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
+; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X86-AVX512VLCDBW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X86-AVX512VLCDBW-NEXT: retl
+entry:
+ %0 = icmp eq <8 x i32> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <8 x i64> undef, i64 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %vecinit7.i.i
+}
+
+define <4 x i64> @test_mm256_epi64(<8 x i32> %a, <8 x i32> %b) {
+; AVX512CD-LABEL: test_mm256_epi64:
+; AVX512CD: # %bb.0: # %entry
+; AVX512CD-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512CD-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512CD-NEXT: kmovw %k0, %eax
+; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
+;
+; AVX512VLCDBW-LABEL: test_mm256_epi64:
+; AVX512VLCDBW: # %bb.0: # %entry
+; AVX512VLCDBW-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; AVX512VLCDBW-NEXT: vpbroadcastmb2q %k0, %ymm0
+; AVX512VLCDBW-NEXT: retq
+;
+; X86-AVX512VLCDBW-LABEL: test_mm256_epi64:
+; X86-AVX512VLCDBW: # %bb.0: # %entry
+; X86-AVX512VLCDBW-NEXT: vpcmpeqd %ymm1, %ymm0, %k0
+; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax
+; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax
+; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0
+; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0
+; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X86-AVX512VLCDBW-NEXT: retl
+entry:
+ %0 = icmp eq <8 x i32> %a, %b
+ %1 = bitcast <8 x i1> %0 to i8
+ %conv.i = zext i8 %1 to i64
+ %vecinit.i.i = insertelement <4 x i64> undef, i64 %conv.i, i32 0
+ %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %vecinit3.i.i
+}
+
+define <8 x i32> @test_mm256_epi32(<16 x i16> %a, <16 x i16> %b) {
+; AVX512CD-LABEL: test_mm256_epi32:
+; AVX512CD: # %bb.0: # %entry
+; AVX512CD-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512CD-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512CD-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512CD-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512CD-NEXT: kmovw %k0, %eax
+; AVX512CD-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512CD-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512CD-NEXT: retq
+;
+; AVX512VLCDBW-LABEL: test_mm256_epi32:
+; AVX512VLCDBW: # %bb.0: # %entry
+; AVX512VLCDBW-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %ymm0
+; AVX512VLCDBW-NEXT: retq
+;
+; X86-AVX512VLCDBW-LABEL: test_mm256_epi32:
+; X86-AVX512VLCDBW: # %bb.0: # %entry
+; X86-AVX512VLCDBW-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; X86-AVX512VLCDBW-NEXT: vpbroadcastmw2d %k0, %ymm0
+; X86-AVX512VLCDBW-NEXT: retl
+entry:
+ %0 = icmp eq <16 x i16> %a, %b
+ %1 = bitcast <16 x i1> %0 to i16
+ %conv.i = zext i16 %1 to i32
+ %vecinit.i.i = insertelement <8 x i32> undef, i32 %conv.i, i32 0
+ %vecinit7.i.i = shufflevector <8 x i32> %vecinit.i.i, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %vecinit7.i.i
+}
+
diff --git a/test/CodeGen/X86/bss_pagealigned.ll b/test/CodeGen/X86/bss_pagealigned.ll
index 4e9f9241011c..0a3bd014937c 100644
--- a/test/CodeGen/X86/bss_pagealigned.ll
+++ b/test/CodeGen/X86/bss_pagealigned.ll
@@ -1,4 +1,4 @@
-; RUN: llc --code-model=kernel -march=x86-64 <%s -asm-verbose=0 | FileCheck %s
+; RUN: llc --code-model=kernel <%s -asm-verbose=0 | FileCheck %s
; PR4933
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/bswap-rotate.ll b/test/CodeGen/X86/bswap-rotate.ll
index f686febe5645..62798ba67e28 100644
--- a/test/CodeGen/X86/bswap-rotate.ll
+++ b/test/CodeGen/X86/bswap-rotate.ll
@@ -7,13 +7,13 @@
define i16 @combine_bswap_rotate(i16 %a0) {
; X86-LABEL: combine_bswap_rotate:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: rolw $9, %ax
; X86-NEXT: retl
;
; X64-LABEL: combine_bswap_rotate:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rolw $9, %di
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 7463f5f6d086..e7cb91a42f8b 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -10,7 +10,7 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
define <8 x i16> @test1(<8 x i16> %v) {
; CHECK-NOSSSE3-LABEL: test1:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -23,17 +23,17 @@ define <8 x i16> @test1(<8 x i16> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test1:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test1:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test1:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -43,7 +43,7 @@ entry:
define <4 x i32> @test2(<4 x i32> %v) {
; CHECK-NOSSSE3-LABEL: test2:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -56,17 +56,17 @@ define <4 x i32> @test2(<4 x i32> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test2:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test2:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test2:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -76,7 +76,7 @@ entry:
define <2 x i64> @test3(<2 x i64> %v) {
; CHECK-NOSSSE3-LABEL: test3:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -91,17 +91,17 @@ define <2 x i64> @test3(<2 x i64> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test3:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test3:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test3:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -115,7 +115,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
define <16 x i16> @test4(<16 x i16> %v) {
; CHECK-NOSSSE3-LABEL: test4:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm2, %xmm2
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm3
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
@@ -136,19 +136,19 @@ define <16 x i16> @test4(<16 x i16> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test4:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm0
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test4:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test4:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -158,7 +158,7 @@ entry:
define <8 x i32> @test5(<8 x i32> %v) {
; CHECK-NOSSSE3-LABEL: test5:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm2, %xmm2
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm3
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
@@ -179,19 +179,19 @@ define <8 x i32> @test5(<8 x i32> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test5:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm0
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test5:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test5:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -201,7 +201,7 @@ entry:
define <4 x i64> @test6(<4 x i64> %v) {
; CHECK-NOSSSE3-LABEL: test6:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm2, %xmm2
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm3
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
@@ -226,19 +226,19 @@ define <4 x i64> @test6(<4 x i64> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test6:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm0
; CHECK-SSSE3-NEXT: pshufb %xmm2, %xmm1
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test6:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test6:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -250,7 +250,7 @@ declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
define <4 x i16> @test7(<4 x i16> %v) {
; CHECK-NOSSSE3-LABEL: test7:
-; CHECK-NOSSSE3: # BB#0: # %entry
+; CHECK-NOSSSE3: # %bb.0: # %entry
; CHECK-NOSSSE3-NEXT: pxor %xmm1, %xmm1
; CHECK-NOSSSE3-NEXT: movdqa %xmm0, %xmm2
; CHECK-NOSSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -264,17 +264,17 @@ define <4 x i16> @test7(<4 x i16> %v) {
; CHECK-NOSSSE3-NEXT: retq
;
; CHECK-SSSE3-LABEL: test7:
-; CHECK-SSSE3: # BB#0: # %entry
+; CHECK-SSSE3: # %bb.0: # %entry
; CHECK-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0],zero,zero,xmm0[5,4],zero,zero,xmm0[9,8],zero,zero,xmm0[13,12],zero,zero
; CHECK-SSSE3-NEXT: retq
;
; CHECK-AVX-LABEL: test7:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0],zero,zero,xmm0[5,4],zero,zero,xmm0[9,8],zero,zero,xmm0[13,12],zero,zero
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: test7:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -288,7 +288,7 @@ entry:
define <8 x i16> @identity_v8i16(<8 x i16> %v) {
; CHECK-ALL-LABEL: identity_v8i16:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
@@ -298,7 +298,7 @@ entry:
define <4 x i32> @identity_v4i32(<4 x i32> %v) {
; CHECK-ALL-LABEL: identity_v4i32:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
@@ -308,7 +308,7 @@ entry:
define <2 x i64> @identity_v2i64(<2 x i64> %v) {
; CHECK-ALL-LABEL: identity_v2i64:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
@@ -318,7 +318,7 @@ entry:
define <16 x i16> @identity_v16i16(<16 x i16> %v) {
; CHECK-ALL-LABEL: identity_v16i16:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
@@ -328,7 +328,7 @@ entry:
define <8 x i32> @identity_v8i32(<8 x i32> %v) {
; CHECK-ALL-LABEL: identity_v8i32:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
@@ -338,7 +338,7 @@ entry:
define <4 x i64> @identity_v4i64(<4 x i64> %v) {
; CHECK-ALL-LABEL: identity_v4i64:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
@@ -348,7 +348,7 @@ entry:
define <4 x i16> @identity_v4i16(<4 x i16> %v) {
; CHECK-ALL-LABEL: identity_v4i16:
-; CHECK-ALL: # BB#0: # %entry
+; CHECK-ALL: # %bb.0: # %entry
; CHECK-ALL-NEXT: retq
entry:
%bs1 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
@@ -362,17 +362,17 @@ entry:
define <8 x i16> @fold_v8i16() {
; CHECK-SSE-LABEL: fold_v8i16:
-; CHECK-SSE: # BB#0: # %entry
+; CHECK-SSE: # %bb.0: # %entry
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fold_v8i16:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: fold_v8i16:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -382,17 +382,17 @@ entry:
define <4 x i32> @fold_v4i32() {
; CHECK-SSE-LABEL: fold_v4i32:
-; CHECK-SSE: # BB#0: # %entry
+; CHECK-SSE: # %bb.0: # %entry
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fold_v4i32:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: fold_v4i32:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -402,17 +402,17 @@ entry:
define <2 x i64> @fold_v2i64() {
; CHECK-SSE-LABEL: fold_v2i64:
-; CHECK-SSE: # BB#0: # %entry
+; CHECK-SSE: # %bb.0: # %entry
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fold_v2i64:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: fold_v2i64:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -422,18 +422,18 @@ entry:
define <16 x i16> @fold_v16i16() {
; CHECK-SSE-LABEL: fold_v16i16:
-; CHECK-SSE: # BB#0: # %entry
+; CHECK-SSE: # %bb.0: # %entry
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm1 = [63999,2048,63487,2560,62975,3072,62463,3584]
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fold_v16i16:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: fold_v16i16:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -443,18 +443,18 @@ entry:
define <8 x i32> @fold_v8i32() {
; CHECK-SSE-LABEL: fold_v8i32:
-; CHECK-SSE: # BB#0: # %entry
+; CHECK-SSE: # %bb.0: # %entry
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,16777216,4294967295,33554432]
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm1 = [4261412863,67108864,4227858431,100663296]
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fold_v8i32:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: fold_v8i32:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
; CHECK-WIDE-AVX-NEXT: retq
entry:
@@ -464,18 +464,18 @@ entry:
define <4 x i64> @fold_v4i64() {
; CHECK-SSE-LABEL: fold_v4i64:
-; CHECK-SSE: # BB#0: # %entry
+; CHECK-SSE: # %bb.0: # %entry
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-SSE-NEXT: movaps {{.*#+}} xmm1 = [18446462598732840960,72056494526300160]
; CHECK-SSE-NEXT: retq
;
; CHECK-AVX-LABEL: fold_v4i64:
-; CHECK-AVX: # BB#0: # %entry
+; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
; CHECK-AVX-NEXT: retq
;
; CHECK-WIDE-AVX-LABEL: fold_v4i64:
-; CHECK-WIDE-AVX: # BB#0: # %entry
+; CHECK-WIDE-AVX: # %bb.0: # %entry
; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
; CHECK-WIDE-AVX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/bswap-wide-int.ll b/test/CodeGen/X86/bswap-wide-int.ll
index 858dbf5fd85f..8d6416158e37 100644
--- a/test/CodeGen/X86/bswap-wide-int.ll
+++ b/test/CodeGen/X86/bswap-wide-int.ll
@@ -10,7 +10,7 @@ declare i256 @llvm.bswap.i256(i256)
define i64 @bswap_i64(i64 %a0) nounwind {
; X86-LABEL: bswap_i64:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: bswapl %eax
@@ -18,19 +18,19 @@ define i64 @bswap_i64(i64 %a0) nounwind {
; X86-NEXT: retl
;
; X86-MOVBE-LABEL: bswap_i64:
-; X86-MOVBE: # BB#0:
+; X86-MOVBE: # %bb.0:
; X86-MOVBE-NEXT: movbel {{[0-9]+}}(%esp), %eax
; X86-MOVBE-NEXT: movbel {{[0-9]+}}(%esp), %edx
; X86-MOVBE-NEXT: retl
;
; X64-LABEL: bswap_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bswapq %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
;
; X64-MOVBE-LABEL: bswap_i64:
-; X64-MOVBE: # BB#0:
+; X64-MOVBE: # %bb.0:
; X64-MOVBE-NEXT: bswapq %rdi
; X64-MOVBE-NEXT: movq %rdi, %rax
; X64-MOVBE-NEXT: retq
@@ -40,7 +40,7 @@ define i64 @bswap_i64(i64 %a0) nounwind {
define i128 @bswap_i128(i128 %a0) nounwind {
; X86-LABEL: bswap_i128:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -61,7 +61,7 @@ define i128 @bswap_i128(i128 %a0) nounwind {
; X86-NEXT: retl $4
;
; X86-MOVBE-LABEL: bswap_i128:
-; X86-MOVBE: # BB#0:
+; X86-MOVBE: # %bb.0:
; X86-MOVBE-NEXT: pushl %edi
; X86-MOVBE-NEXT: pushl %esi
; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -78,7 +78,7 @@ define i128 @bswap_i128(i128 %a0) nounwind {
; X86-MOVBE-NEXT: retl $4
;
; X64-LABEL: bswap_i128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bswapq %rsi
; X64-NEXT: bswapq %rdi
; X64-NEXT: movq %rsi, %rax
@@ -86,7 +86,7 @@ define i128 @bswap_i128(i128 %a0) nounwind {
; X64-NEXT: retq
;
; X64-MOVBE-LABEL: bswap_i128:
-; X64-MOVBE: # BB#0:
+; X64-MOVBE: # %bb.0:
; X64-MOVBE-NEXT: bswapq %rsi
; X64-MOVBE-NEXT: bswapq %rdi
; X64-MOVBE-NEXT: movq %rsi, %rax
@@ -98,7 +98,7 @@ define i128 @bswap_i128(i128 %a0) nounwind {
define i256 @bswap_i256(i256 %a0) nounwind {
; X86-LABEL: bswap_i256:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: bswapl %ecx
@@ -127,7 +127,7 @@ define i256 @bswap_i256(i256 %a0) nounwind {
; X86-NEXT: retl $4
;
; X86-MOVBE-LABEL: bswap_i256:
-; X86-MOVBE: # BB#0:
+; X86-MOVBE: # %bb.0:
; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-MOVBE-NEXT: movbel %ecx, 28(%eax)
@@ -148,7 +148,7 @@ define i256 @bswap_i256(i256 %a0) nounwind {
; X86-MOVBE-NEXT: retl $4
;
; X64-LABEL: bswap_i256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bswapq %r8
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
@@ -161,7 +161,7 @@ define i256 @bswap_i256(i256 %a0) nounwind {
; X64-NEXT: retq
;
; X64-MOVBE-LABEL: bswap_i256:
-; X64-MOVBE: # BB#0:
+; X64-MOVBE: # %bb.0:
; X64-MOVBE-NEXT: movbeq %rsi, 24(%rdi)
; X64-MOVBE-NEXT: movbeq %rdx, 16(%rdi)
; X64-MOVBE-NEXT: movbeq %rcx, 8(%rdi)
diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll
index 48dc18e0ac14..336aca9a0dd8 100644
--- a/test/CodeGen/X86/bswap.ll
+++ b/test/CodeGen/X86/bswap.ll
@@ -1,7 +1,7 @@
; bswap should be constant folded when it is passed a constant argument
-; RUN: llc < %s -march=x86 -mcpu=i686 | FileCheck %s
-; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK64
+; RUN: llc < %s -mtriple=i686-- -mcpu=i686 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=CHECK64
declare i16 @llvm.bswap.i16(i16)
diff --git a/test/CodeGen/X86/bswap_tree.ll b/test/CodeGen/X86/bswap_tree.ll
index c217879d4386..acd9330458f4 100644
--- a/test/CodeGen/X86/bswap_tree.ll
+++ b/test/CodeGen/X86/bswap_tree.ll
@@ -12,14 +12,14 @@
; => (rotl (bswap x), 16)
define i32 @test1(i32 %x) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: bswapl %eax
; CHECK-NEXT: roll $16, %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test1:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: bswapl %edi
; CHECK64-NEXT: roll $16, %edi
; CHECK64-NEXT: movl %edi, %eax
@@ -45,14 +45,14 @@ define i32 @test1(i32 %x) nounwind {
; ((x >> 8) & 0x00ff0000)
define i32 @test2(i32 %x) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: bswapl %eax
; CHECK-NEXT: roll $16, %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test2:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: bswapl %edi
; CHECK64-NEXT: roll $16, %edi
; CHECK64-NEXT: movl %edi, %eax
diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll
index 1340b7662a7a..f4d75f4ef1eb 100644
--- a/test/CodeGen/X86/bswap_tree2.ll
+++ b/test/CodeGen/X86/bswap_tree2.ll
@@ -8,7 +8,7 @@
; (with only half of the swap tree valid).
define i32 @test1(i32 %x) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000
@@ -23,7 +23,7 @@
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test1:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %edi, %eax
; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000
; CHECK64-NEXT: movl %edi, %ecx
@@ -58,7 +58,7 @@
; ((x >> 8) & 0x00ff0000)
define i32 @test2(i32 %x) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrl $8, %eax
@@ -72,7 +72,7 @@ define i32 @test2(i32 %x) nounwind {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test2:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %edi, %eax
; CHECK64-NEXT: shrl $8, %eax
; CHECK64-NEXT: shll $8, %edi
@@ -100,7 +100,7 @@ define i32 @test2(i32 %x) nounwind {
; Invalid pattern involving a unary op
define i32 @test3(float %x) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subl $8, %esp
; CHECK-NEXT: flds {{[0-9]+}}(%esp)
; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp)
@@ -124,7 +124,7 @@ define i32 @test3(float %x) nounwind {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test3:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: cvttss2si %xmm0, %ecx
; CHECK64-NEXT: movl %ecx, %edx
; CHECK64-NEXT: shll $8, %edx
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 064058115684..144e9e7e50c7 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64
+
; PR3253
; The register+memory form of the BT instruction should be usable on
@@ -20,16 +22,27 @@
; - The and can be commuted.
define void @test2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB0_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB0_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: test2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB0_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB0_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB0_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -45,17 +58,29 @@ UnifiedReturnBlock:
}
define void @test2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: test2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB1_1
-; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB1_1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; X86-LABEL: test2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB1_1
+; X86-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+; X86-NEXT: .LBB1_1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: retl
+;
+; X64-LABEL: test2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB1_1
+; X64-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X64-NEXT: retq
+; X64-NEXT: .LBB1_1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -71,16 +96,27 @@ UnifiedReturnBlock:
}
define void @atest2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: atest2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB2_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB2_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: atest2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB2_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: atest2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB2_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB2_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -96,17 +132,29 @@ UnifiedReturnBlock:
}
define void @atest2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: atest2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB3_1
-; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB3_1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; X86-LABEL: atest2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB3_1
+; X86-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+; X86-NEXT: .LBB3_1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: retl
+;
+; X64-LABEL: atest2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB3_1
+; X64-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X64-NEXT: retq
+; X64-NEXT: .LBB3_1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -122,17 +170,29 @@ UnifiedReturnBlock:
}
define void @test3(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB4_1
-; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB4_1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; X86-LABEL: test3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB4_1
+; X86-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+; X86-NEXT: .LBB4_1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB4_1
+; X64-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X64-NEXT: retq
+; X64-NEXT: .LBB4_1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -148,17 +208,29 @@ UnifiedReturnBlock:
}
define void @test3b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: test3b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB5_1
-; CHECK-NEXT: # BB#2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB5_1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; X86-LABEL: test3b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB5_1
+; X86-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+; X86-NEXT: .LBB5_1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: retl
+;
+; X64-LABEL: test3b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB5_1
+; X64-NEXT: # %bb.2: # %UnifiedReturnBlock
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -174,16 +246,27 @@ UnifiedReturnBlock:
}
define void @testne2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: testne2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB6_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB6_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: testne2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB6_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: testne2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB6_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB6_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -199,16 +282,27 @@ UnifiedReturnBlock:
}
define void @testne2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: testne2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB7_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB7_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: testne2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB7_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: testne2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB7_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB7_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -224,16 +318,27 @@ UnifiedReturnBlock:
}
define void @atestne2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: atestne2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB8_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB8_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: atestne2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB8_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: atestne2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB8_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB8_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -249,16 +354,27 @@ UnifiedReturnBlock:
}
define void @atestne2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: atestne2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB9_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB9_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: atestne2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB9_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: atestne2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB9_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB9_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -274,16 +390,27 @@ UnifiedReturnBlock:
}
define void @testne3(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: testne3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB10_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB10_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: testne3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB10_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB10_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: testne3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB10_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB10_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -299,16 +426,27 @@ UnifiedReturnBlock:
}
define void @testne3b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: testne3b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB11_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB11_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: testne3b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB11_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB11_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: testne3b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB11_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB11_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -324,16 +462,27 @@ UnifiedReturnBlock:
}
define void @query2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: query2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB12_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB12_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: query2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB12_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: query2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB12_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB12_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -349,16 +498,27 @@ UnifiedReturnBlock:
}
define void @query2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: query2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB13_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB13_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: query2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB13_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: query2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB13_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB13_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -374,16 +534,27 @@ UnifiedReturnBlock:
}
define void @aquery2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: aquery2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB14_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB14_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: aquery2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB14_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: aquery2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB14_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB14_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -399,16 +570,27 @@ UnifiedReturnBlock:
}
define void @aquery2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: aquery2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB15_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB15_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: aquery2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB15_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: aquery2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB15_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB15_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -424,16 +606,27 @@ UnifiedReturnBlock:
}
define void @query3(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: query3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB16_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB16_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: query3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB16_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB16_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: query3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB16_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB16_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -449,16 +642,27 @@ UnifiedReturnBlock:
}
define void @query3b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: query3b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB17_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB17_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: query3b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB17_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB17_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: query3b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB17_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB17_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -474,16 +678,27 @@ UnifiedReturnBlock:
}
define void @query3x(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: query3x:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB18_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB18_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: query3x:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB18_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB18_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: query3x:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB18_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB18_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -499,16 +714,27 @@ UnifiedReturnBlock:
}
define void @query3bx(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: query3bx:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jae .LBB19_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB19_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: query3bx:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jae .LBB19_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB19_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: query3bx:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jae .LBB19_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB19_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -524,16 +750,27 @@ UnifiedReturnBlock:
}
define void @queryne2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: queryne2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB20_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB20_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: queryne2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB20_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: queryne2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB20_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB20_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -549,16 +786,27 @@ UnifiedReturnBlock:
}
define void @queryne2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: queryne2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB21_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB21_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: queryne2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB21_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: queryne2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB21_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB21_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = lshr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -574,16 +822,27 @@ UnifiedReturnBlock:
}
define void @aqueryne2(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: aqueryne2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB22_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB22_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: aqueryne2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB22_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: aqueryne2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB22_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB22_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 %tmp29, 1
@@ -599,16 +858,27 @@ UnifiedReturnBlock:
}
define void @aqueryne2b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: aqueryne2b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB23_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB23_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: aqueryne2b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB23_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: aqueryne2b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB23_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB23_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = ashr i32 %x, %n
%tmp3 = and i32 1, %tmp29
@@ -624,16 +894,27 @@ UnifiedReturnBlock:
}
define void @queryne3(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: queryne3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB24_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB24_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: queryne3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB24_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB24_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: queryne3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB24_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB24_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -649,16 +930,27 @@ UnifiedReturnBlock:
}
define void @queryne3b(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: queryne3b:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB25_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB25_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: queryne3b:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB25_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB25_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: queryne3b:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB25_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB25_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -674,16 +966,27 @@ UnifiedReturnBlock:
}
define void @queryne3x(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: queryne3x:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB26_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB26_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: queryne3x:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB26_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB26_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: queryne3x:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB26_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB26_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %tmp29, %x
@@ -699,16 +1002,27 @@ UnifiedReturnBlock:
}
define void @queryne3bx(i32 %x, i32 %n) nounwind {
-; CHECK-LABEL: queryne3bx:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: jb .LBB27_2
-; CHECK-NEXT: # BB#1: # %bb
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq foo
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: .LBB27_2: # %UnifiedReturnBlock
-; CHECK-NEXT: retq
+; X86-LABEL: queryne3bx:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: jb .LBB27_2
+; X86-NEXT: # %bb.1: # %bb
+; X86-NEXT: calll foo
+; X86-NEXT: .LBB27_2: # %UnifiedReturnBlock
+; X86-NEXT: retl
+;
+; X64-LABEL: queryne3bx:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: jb .LBB27_2
+; X64-NEXT: # %bb.1: # %bb
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rax
+; X64-NEXT: .LBB27_2: # %UnifiedReturnBlock
+; X64-NEXT: retq
entry:
%tmp29 = shl i32 1, %n
%tmp3 = and i32 %x, %tmp29
@@ -726,12 +1040,21 @@ UnifiedReturnBlock:
declare void @foo()
define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
-; CHECK-LABEL: invert:
-; CHECK: # BB#0:
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: btl %esi, %edi
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: retq
+; X86-LABEL: invert:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: retl
+;
+; X64-LABEL: invert:
+; X64: # %bb.0:
+; X64-NEXT: notl %edi
+; X64-NEXT: btl %esi, %edi
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%neg = xor i32 %flags, -1
%shl = shl i32 1, %flag
%and = and i32 %shl, %neg
@@ -740,11 +1063,19 @@ define zeroext i1 @invert(i32 %flags, i32 %flag) nounwind {
}
define zeroext i1 @extend(i32 %bit, i64 %bits) {
-; CHECK-LABEL: extend:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: btl %edi, %esi
-; CHECK-NEXT: setb %al
-; CHECK-NEXT: retq
+; X86-LABEL: extend:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: btl %eax, %ecx
+; X86-NEXT: setb %al
+; X86-NEXT: retl
+;
+; X64-LABEL: extend:
+; X64: # %bb.0: # %entry
+; X64-NEXT: btl %edi, %esi
+; X64-NEXT: setb %al
+; X64-NEXT: retq
entry:
%and = and i32 %bit, 31
%sh_prom = zext i32 %and to i64
@@ -753,3 +1084,63 @@ entry:
%tobool = icmp ne i64 %and1, 0
ret i1 %tobool
}
+
+; TODO: BT fails to look through to the demanded bits because c%32 has more than one use.
+; void demanded_i32(int *a, int *b, unsigned c) {
+; if ((a[c/32] >> (c % 32)) & 1)
+; b[c/32] |= 1 << (c % 32);
+; }
+define void @demanded_i32(i32* nocapture readonly, i32* nocapture, i32) nounwind {
+; X86-LABEL: demanded_i32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $5, %eax
+; X86-NEXT: movl (%edx,%eax,4), %esi
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: btl %ecx, %esi
+; X86-NEXT: jae .LBB30_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %edx, (%ecx,%eax,4)
+; X86-NEXT: .LBB30_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: demanded_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %eax
+; X64-NEXT: shrl $5, %eax
+; X64-NEXT: movl (%rdi,%rax,4), %r8d
+; X64-NEXT: movl $1, %edi
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shll %cl, %edi
+; X64-NEXT: btl %edx, %r8d
+; X64-NEXT: jae .LBB30_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: orl %edi, (%rsi,%rax,4)
+; X64-NEXT: .LBB30_2:
+; X64-NEXT: retq
+ %4 = lshr i32 %2, 5
+ %5 = zext i32 %4 to i64
+ %6 = getelementptr inbounds i32, i32* %0, i64 %5
+ %7 = load i32, i32* %6, align 4
+ %8 = and i32 %2, 31
+ %9 = shl i32 1, %8
+ %10 = and i32 %7, %9
+ %11 = icmp eq i32 %10, 0
+ br i1 %11, label %16, label %12
+
+; <label>:12:
+ %13 = getelementptr inbounds i32, i32* %1, i64 %5
+ %14 = load i32, i32* %13, align 4
+ %15 = or i32 %14, %9
+ store i32 %15, i32* %13, align 4
+ br label %16
+
+; <label>:16:
+ ret void
+}
diff --git a/test/CodeGen/X86/btq.ll b/test/CodeGen/X86/btq.ll
index add65765e389..1a17de177158 100644
--- a/test/CodeGen/X86/btq.ll
+++ b/test/CodeGen/X86/btq.ll
@@ -1,15 +1,21 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
declare void @bar()
define void @test1(i64 %foo) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: btq $32, %rdi
+; CHECK-NEXT: jb .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.end
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_2: # %if.then
+; CHECK-NEXT: jmp bar # TAILCALL
%and = and i64 %foo, 4294967296
%tobool = icmp eq i64 %and, 0
br i1 %tobool, label %if.end, label %if.then
-; CHECK-LABEL: test1:
-; CHECK: btq $32
-
if.then:
tail call void @bar() nounwind
br label %if.end
@@ -19,13 +25,18 @@ if.end:
}
define void @test2(i64 %foo) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl $-2147483648, %edi # imm = 0x80000000
+; CHECK-NEXT: jne .LBB1_2
+; CHECK-NEXT: # %bb.1: # %if.end
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_2: # %if.then
+; CHECK-NEXT: jmp bar # TAILCALL
%and = and i64 %foo, 2147483648
%tobool = icmp eq i64 %and, 0
br i1 %tobool, label %if.end, label %if.then
-; CHECK-LABEL: test2:
-; CHECK: testl $-2147483648
-
if.then:
tail call void @bar() nounwind
br label %if.end
diff --git a/test/CodeGen/X86/bug26810.ll b/test/CodeGen/X86/bug26810.ll
new file mode 100644
index 000000000000..816bc8224d8e
--- /dev/null
+++ b/test/CodeGen/X86/bug26810.ll
@@ -0,0 +1,312 @@
+; RUN: llc < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s
+; Make sure a bad eviction sequence doesn't occur.
+
+; Fix for bugzilla 26810.
+; This test is meant to make sure that a bad eviction sequence like the one
+; described below does not occur.
+;
+; movapd %xmm7, 160(%esp) # 16-byte Spill
+; movapd %xmm5, %xmm7
+; movapd %xmm4, %xmm5
+; movapd %xmm3, %xmm4
+; movapd %xmm2, %xmm3
+; some_inst
+; movapd %xmm3, %xmm2
+; movapd %xmm4, %xmm3
+; movapd %xmm5, %xmm4
+; movapd %xmm7, %xmm5
+; movapd 160(%esp), %xmm7 # 16-byte Reload
+
+; Make sure we have no redundant copies in the problematic code section
+; CHECK-LABEL: name: loop
+; CHECK: bb.2.for.body:
+; CHECK: SUBPDrr
+; CHECK-NEXT: MOVAPSmr
+; CHECK-NEXT: MOVAPSrm
+; CHECK-NEXT: MULPDrm
+; CHECK-NEXT: ADDPDrr
+; CHECK-NEXT: ADD32ri8
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-linux-gnu"
+
+%struct._iobuf = type { i8* }
+
+$"\01??_C@_01NOFIACDB@w?$AA@" = comdat any
+
+$"\01??_C@_09LAIDGMDM@?1dev?1null?$AA@" = comdat any
+
+@"\01?v@@3PAU__m128d@@A" = global [8 x <2 x double>] zeroinitializer, align 16
+@"\01?m1@@3PAU__m128d@@A" = local_unnamed_addr global [76800000 x <2 x double>] zeroinitializer, align 16
+@"\01?m2@@3PAU__m128d@@A" = local_unnamed_addr global [8 x <2 x double>] zeroinitializer, align 16
+@"\01??_C@_01NOFIACDB@w?$AA@" = linkonce_odr unnamed_addr constant [2 x i8] c"w\00", comdat, align 1
+@"\01??_C@_09LAIDGMDM@?1dev?1null?$AA@" = linkonce_odr unnamed_addr constant [10 x i8] c"/dev/null\00", comdat, align 1
+
+; Function Attrs: norecurse
+define i32 @main() local_unnamed_addr #0 {
+entry:
+ tail call void @init()
+ %0 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ %1 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ %2 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ %3 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ %4 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ %5 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ %6 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ %7 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ %.promoted.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ %.promoted51.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ %.promoted53.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ %.promoted55.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ %.promoted57.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ %.promoted59.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ %.promoted61.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ %.promoted63.i = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i, %entry
+ %add.i64.i = phi <2 x double> [ %.promoted63.i, %entry ], [ %add.i.i, %for.body.i ]
+ %add.i3662.i = phi <2 x double> [ %.promoted61.i, %entry ], [ %add.i36.i, %for.body.i ]
+ %add.i3860.i = phi <2 x double> [ %.promoted59.i, %entry ], [ %add.i38.i, %for.body.i ]
+ %add.i4058.i = phi <2 x double> [ %.promoted57.i, %entry ], [ %add.i40.i, %for.body.i ]
+ %add.i4256.i = phi <2 x double> [ %.promoted55.i, %entry ], [ %add.i42.i, %for.body.i ]
+ %add.i4454.i = phi <2 x double> [ %.promoted53.i, %entry ], [ %add.i44.i, %for.body.i ]
+ %add.i4652.i = phi <2 x double> [ %.promoted51.i, %entry ], [ %add.i46.i, %for.body.i ]
+ %add.i4850.i = phi <2 x double> [ %.promoted.i, %entry ], [ %add.i48.i, %for.body.i ]
+ %i.049.i = phi i32 [ 0, %entry ], [ %inc.i, %for.body.i ]
+ %arrayidx.i = getelementptr inbounds [76800000 x <2 x double>], [76800000 x <2 x double>]* @"\01?m1@@3PAU__m128d@@A", i32 0, i32 %i.049.i
+ %8 = load <2 x double>, <2 x double>* %arrayidx.i, align 16, !tbaa !8
+ %mul.i.i = fmul <2 x double> %0, %8
+ %add.i48.i = fadd <2 x double> %add.i4850.i, %mul.i.i
+ %mul.i47.i = fmul <2 x double> %1, %8
+ %add.i46.i = fadd <2 x double> %add.i4652.i, %mul.i47.i
+ %mul.i45.i = fmul <2 x double> %2, %8
+ %add.i44.i = fadd <2 x double> %add.i4454.i, %mul.i45.i
+ %mul.i43.i = fmul <2 x double> %3, %8
+ %add.i42.i = fadd <2 x double> %add.i4256.i, %mul.i43.i
+ %mul.i41.i = fmul <2 x double> %4, %8
+ %add.i40.i = fadd <2 x double> %add.i4058.i, %mul.i41.i
+ %mul.i39.i = fmul <2 x double> %5, %8
+ %add.i38.i = fadd <2 x double> %add.i3860.i, %mul.i39.i
+ %mul.i37.i = fmul <2 x double> %6, %8
+ %add.i36.i = fsub <2 x double> %add.i3662.i, %mul.i37.i
+ %mul.i35.i = fmul <2 x double> %7, %8
+ %add.i.i = fadd <2 x double> %add.i64.i, %mul.i35.i
+ %inc.i = add nuw nsw i32 %i.049.i, 1
+ %exitcond.i = icmp eq i32 %inc.i, 76800000
+ br i1 %exitcond.i, label %loop.exit, label %for.body.i
+
+loop.exit: ; preds = %for.body.i
+ store <2 x double> %add.i48.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ store <2 x double> %add.i46.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ store <2 x double> %add.i46.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ store <2 x double> %add.i44.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ store <2 x double> %add.i42.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ store <2 x double> %add.i40.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ store <2 x double> %add.i38.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ store <2 x double> %add.i36.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ store <2 x double> %add.i.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ %call.i = tail call %struct._iobuf* @fopen(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @"\01??_C@_09LAIDGMDM@?1dev?1null?$AA@", i32 0, i32 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @"\01??_C@_01NOFIACDB@w?$AA@", i32 0, i32 0)) #7
+ %call1.i = tail call i32 @fwrite(i8* bitcast ([8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A" to i8*), i32 16, i32 8, %struct._iobuf* %call.i) #7
+ %call2.i = tail call i32 @fclose(%struct._iobuf* %call.i) #7
+ ret i32 0
+}
+
+define void @init() local_unnamed_addr #1 {
+entry:
+ call void @llvm.memset.p0i8.i32(i8* bitcast ([8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A" to i8*), i8 0, i32 128, i32 16, i1 false)
+ %call.i = tail call i64 @_time64(i64* null)
+ %conv = trunc i64 %call.i to i32
+ tail call void @srand(i32 %conv)
+ br label %for.body6
+
+for.body6: ; preds = %for.body6, %entry
+ %i2.051 = phi i32 [ 0, %entry ], [ %inc14, %for.body6 ]
+ %call7 = tail call i32 @rand()
+ %conv8 = sitofp i32 %call7 to double
+ %tmp.sroa.0.0.vec.insert = insertelement <2 x double> undef, double %conv8, i32 0
+ %call9 = tail call i32 @rand()
+ %conv10 = sitofp i32 %call9 to double
+ %tmp.sroa.0.8.vec.insert = insertelement <2 x double> %tmp.sroa.0.0.vec.insert, double %conv10, i32 1
+ %arrayidx12 = getelementptr inbounds [76800000 x <2 x double>], [76800000 x <2 x double>]* @"\01?m1@@3PAU__m128d@@A", i32 0, i32 %i2.051
+ store <2 x double> %tmp.sroa.0.8.vec.insert, <2 x double>* %arrayidx12, align 16, !tbaa !8
+ %inc14 = add nuw nsw i32 %i2.051, 1
+ %exitcond = icmp eq i32 %inc14, 76800000
+ br i1 %exitcond, label %for.body21.preheader, label %for.body6
+
+for.body21.preheader: ; preds = %for.body6
+ %call25 = tail call i32 @rand()
+ %conv26 = sitofp i32 %call25 to double
+ %tmp23.sroa.0.0.vec.insert = insertelement <2 x double> undef, double %conv26, i32 0
+ %call28 = tail call i32 @rand()
+ %conv29 = sitofp i32 %call28 to double
+ %tmp23.sroa.0.8.vec.insert = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert, double %conv29, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ %call25.1 = tail call i32 @rand()
+ %conv26.1 = sitofp i32 %call25.1 to double
+ %tmp23.sroa.0.0.vec.insert.1 = insertelement <2 x double> undef, double %conv26.1, i32 0
+ %call28.1 = tail call i32 @rand()
+ %conv29.1 = sitofp i32 %call28.1 to double
+ %tmp23.sroa.0.8.vec.insert.1 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.1, double %conv29.1, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.1, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ %call25.2 = tail call i32 @rand()
+ %conv26.2 = sitofp i32 %call25.2 to double
+ %tmp23.sroa.0.0.vec.insert.2 = insertelement <2 x double> undef, double %conv26.2, i32 0
+ %call28.2 = tail call i32 @rand()
+ %conv29.2 = sitofp i32 %call28.2 to double
+ %tmp23.sroa.0.8.vec.insert.2 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.2, double %conv29.2, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.2, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ %call25.3 = tail call i32 @rand()
+ %conv26.3 = sitofp i32 %call25.3 to double
+ %tmp23.sroa.0.0.vec.insert.3 = insertelement <2 x double> undef, double %conv26.3, i32 0
+ %call28.3 = tail call i32 @rand()
+ %conv29.3 = sitofp i32 %call28.3 to double
+ %tmp23.sroa.0.8.vec.insert.3 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.3, double %conv29.3, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.3, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ %call25.4 = tail call i32 @rand()
+ %conv26.4 = sitofp i32 %call25.4 to double
+ %tmp23.sroa.0.0.vec.insert.4 = insertelement <2 x double> undef, double %conv26.4, i32 0
+ %call28.4 = tail call i32 @rand()
+ %conv29.4 = sitofp i32 %call28.4 to double
+ %tmp23.sroa.0.8.vec.insert.4 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.4, double %conv29.4, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.4, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ %call25.5 = tail call i32 @rand()
+ %conv26.5 = sitofp i32 %call25.5 to double
+ %tmp23.sroa.0.0.vec.insert.5 = insertelement <2 x double> undef, double %conv26.5, i32 0
+ %call28.5 = tail call i32 @rand()
+ %conv29.5 = sitofp i32 %call28.5 to double
+ %tmp23.sroa.0.8.vec.insert.5 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.5, double %conv29.5, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.5, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ %call25.6 = tail call i32 @rand()
+ %conv26.6 = sitofp i32 %call25.6 to double
+ %tmp23.sroa.0.0.vec.insert.6 = insertelement <2 x double> undef, double %conv26.6, i32 0
+ %call28.6 = tail call i32 @rand()
+ %conv29.6 = sitofp i32 %call28.6 to double
+ %tmp23.sroa.0.8.vec.insert.6 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.6, double %conv29.6, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.6, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ %call25.7 = tail call i32 @rand()
+ %conv26.7 = sitofp i32 %call25.7 to double
+ %tmp23.sroa.0.0.vec.insert.7 = insertelement <2 x double> undef, double %conv26.7, i32 0
+ %call28.7 = tail call i32 @rand()
+ %conv29.7 = sitofp i32 %call28.7 to double
+ %tmp23.sroa.0.8.vec.insert.7 = insertelement <2 x double> %tmp23.sroa.0.0.vec.insert.7, double %conv29.7, i32 1
+ store <2 x double> %tmp23.sroa.0.8.vec.insert.7, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ ret void
+}
+
+; Function Attrs: norecurse nounwind
+define void @loop() local_unnamed_addr #2 {
+entry:
+ %0 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ %1 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ %2 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ %3 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ %4 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ %5 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ %6 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ %7 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?m2@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ %.promoted = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ %.promoted51 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ %.promoted53 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ %.promoted55 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ %.promoted57 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ %.promoted59 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ %.promoted61 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ %.promoted63 = load <2 x double>, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body
+ store <2 x double> %add.i48, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 0), align 16, !tbaa !8
+ store <2 x double> %add.i46, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 1), align 16, !tbaa !8
+ store <2 x double> %add.i44, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 2), align 16, !tbaa !8
+ store <2 x double> %add.i42, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 3), align 16, !tbaa !8
+ store <2 x double> %add.i40, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 4), align 16, !tbaa !8
+ store <2 x double> %add.i38, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 5), align 16, !tbaa !8
+ store <2 x double> %add.i36, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 6), align 16, !tbaa !8
+ store <2 x double> %add.i, <2 x double>* getelementptr inbounds ([8 x <2 x double>], [8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A", i32 0, i32 7), align 16, !tbaa !8
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %add.i64 = phi <2 x double> [ %.promoted63, %entry ], [ %add.i, %for.body ]
+ %add.i3662 = phi <2 x double> [ %.promoted61, %entry ], [ %add.i36, %for.body ]
+ %add.i3860 = phi <2 x double> [ %.promoted59, %entry ], [ %add.i38, %for.body ]
+ %add.i4058 = phi <2 x double> [ %.promoted57, %entry ], [ %add.i40, %for.body ]
+ %add.i4256 = phi <2 x double> [ %.promoted55, %entry ], [ %add.i42, %for.body ]
+ %add.i4454 = phi <2 x double> [ %.promoted53, %entry ], [ %add.i44, %for.body ]
+ %add.i4652 = phi <2 x double> [ %.promoted51, %entry ], [ %add.i46, %for.body ]
+ %add.i4850 = phi <2 x double> [ %.promoted, %entry ], [ %add.i48, %for.body ]
+ %i.049 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %arrayidx = getelementptr inbounds [76800000 x <2 x double>], [76800000 x <2 x double>]* @"\01?m1@@3PAU__m128d@@A", i32 0, i32 %i.049
+ %8 = load <2 x double>, <2 x double>* %arrayidx, align 16, !tbaa !8
+ %mul.i = fmul <2 x double> %8, %0
+ %add.i48 = fadd <2 x double> %add.i4850, %mul.i
+ %mul.i47 = fmul <2 x double> %8, %1
+ %add.i46 = fadd <2 x double> %add.i4652, %mul.i47
+ %mul.i45 = fmul <2 x double> %8, %2
+ %add.i44 = fadd <2 x double> %add.i4454, %mul.i45
+ %mul.i43 = fmul <2 x double> %8, %3
+ %add.i42 = fadd <2 x double> %add.i4256, %mul.i43
+ %mul.i41 = fmul <2 x double> %8, %4
+ %add.i40 = fadd <2 x double> %add.i4058, %mul.i41
+ %mul.i39 = fmul <2 x double> %8, %5
+ %add.i38 = fadd <2 x double> %add.i3860, %mul.i39
+ %mul.i37 = fmul <2 x double> %8, %6
+ %add.i36 = fadd <2 x double> %add.i3662, %mul.i37
+ %mul.i35 = fmul <2 x double> %8, %7
+ %add.i = fadd <2 x double> %add.i64, %mul.i35
+ %inc = add nuw nsw i32 %i.049, 1
+ %exitcond = icmp eq i32 %inc, 76800000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Function Attrs: nounwind
+define void @"\01?dump@@YAXXZ"() local_unnamed_addr #3 {
+entry:
+ %call = tail call %struct._iobuf* @fopen(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @"\01??_C@_09LAIDGMDM@?1dev?1null?$AA@", i32 0, i32 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @"\01??_C@_01NOFIACDB@w?$AA@", i32 0, i32 0))
+ %call1 = tail call i32 @fwrite(i8* bitcast ([8 x <2 x double>]* @"\01?v@@3PAU__m128d@@A" to i8*), i32 16, i32 8, %struct._iobuf* %call)
+ %call2 = tail call i32 @fclose(%struct._iobuf* %call)
+ ret void
+}
+
+declare void @srand(i32) local_unnamed_addr #4
+
+declare i32 @rand() local_unnamed_addr #4
+
+; Function Attrs: nounwind
+declare noalias %struct._iobuf* @fopen(i8* nocapture readonly, i8* nocapture readonly) local_unnamed_addr #5
+
+; Function Attrs: nounwind
+declare i32 @fwrite(i8* nocapture, i32, i32, %struct._iobuf* nocapture) local_unnamed_addr #5
+
+; Function Attrs: nounwind
+declare i32 @fclose(%struct._iobuf* nocapture) local_unnamed_addr #5
+
+declare i64 @_time64(i64*) local_unnamed_addr #4
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1) #6
+
+attributes #0 = { norecurse "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #5 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #6 = { argmemonly nounwind }
+attributes #7 = { nounwind }
+
+!llvm.linker.options = !{!0, !1, !2, !3, !4}
+!llvm.module.flags = !{!5, !6}
+!llvm.ident = !{!7}
+
+!0 = !{!"/FAILIFMISMATCH:\22_MSC_VER=1900\22"}
+!1 = !{!"/FAILIFMISMATCH:\22_ITERATOR_DEBUG_LEVEL=0\22"}
+!2 = !{!"/FAILIFMISMATCH:\22RuntimeLibrary=MT_StaticRelease\22"}
+!3 = !{!"/DEFAULTLIB:libcpmt.lib"}
+!4 = !{!"/FAILIFMISMATCH:\22_CRT_STDIO_ISO_WIDE_SPECIFIERS=0\22"}
+!5 = !{i32 1, !"NumRegisterParameters", i32 0}
+!6 = !{i32 1, !"wchar_size", i32 2}
+!7 = !{!"clang version 5.0.0 (cfe/trunk 305640)"}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"omnipotent char", !10, i64 0}
+!10 = !{!"Simple C++ TBAA"}
diff --git a/test/CodeGen/X86/build-vector-128.ll b/test/CodeGen/X86/build-vector-128.ll
index 531c6de5f90c..da92fe6c3fda 100644
--- a/test/CodeGen/X86/build-vector-128.ll
+++ b/test/CodeGen/X86/build-vector-128.ll
@@ -10,23 +10,23 @@
define <2 x double> @test_buildvector_v2f64(double %a0, double %a1) {
; SSE-32-LABEL: test_buildvector_v2f64:
-; SSE-32: # BB#0:
+; SSE-32: # %bb.0:
; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_buildvector_v2f64:
-; SSE-64: # BB#0:
-; SSE-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-64: # %bb.0:
+; SSE-64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_v2f64:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v2f64:
-; AVX-64: # BB#0:
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-64-NEXT: retq
%ins0 = insertelement <2 x double> undef, double %a0, i32 0
%ins1 = insertelement <2 x double> %ins0, double %a1, i32 1
@@ -35,31 +35,31 @@ define <2 x double> @test_buildvector_v2f64(double %a0, double %a1) {
define <4 x float> @test_buildvector_v4f32(float %a0, float %a1, float %a2, float %a3) {
; SSE-32-LABEL: test_buildvector_v4f32:
-; SSE-32: # BB#0:
+; SSE-32: # %bb.0:
; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; SSE-32-NEXT: retl
;
; SSE2-64-LABEL: test_buildvector_v4f32:
-; SSE2-64: # BB#0:
+; SSE2-64: # %bb.0:
; SSE2-64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-64-NEXT: retq
;
; SSE41-64-LABEL: test_buildvector_v4f32:
-; SSE41-64: # BB#0:
+; SSE41-64: # %bb.0:
; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; SSE41-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_v4f32:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v4f32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
@@ -73,24 +73,24 @@ define <4 x float> @test_buildvector_v4f32(float %a0, float %a1, float %a2, floa
define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
; SSE-32-LABEL: test_buildvector_v2i64:
-; SSE-32: # BB#0:
+; SSE-32: # %bb.0:
; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: test_buildvector_v2i64:
-; SSE-64: # BB#0:
+; SSE-64: # %bb.0:
; SSE-64-NEXT: movq %rsi, %xmm1
; SSE-64-NEXT: movq %rdi, %xmm0
; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_v2i64:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v2i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovq %rsi, %xmm0
; AVX-64-NEXT: vmovq %rdi, %xmm1
; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -102,12 +102,12 @@ define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) {
define <4 x i32> @test_buildvector_v4i32(i32 %f0, i32 %f1, i32 %f2, i32 %f3) {
; SSE-32-LABEL: test_buildvector_v4i32:
-; SSE-32: # BB#0:
+; SSE-32: # %bb.0:
; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; SSE-32-NEXT: retl
;
; SSE2-64-LABEL: test_buildvector_v4i32:
-; SSE2-64: # BB#0:
+; SSE2-64: # %bb.0:
; SSE2-64-NEXT: movd %ecx, %xmm0
; SSE2-64-NEXT: movd %edx, %xmm1
; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -118,7 +118,7 @@ define <4 x i32> @test_buildvector_v4i32(i32 %f0, i32 %f1, i32 %f2, i32 %f3) {
; SSE2-64-NEXT: retq
;
; SSE41-64-LABEL: test_buildvector_v4i32:
-; SSE41-64: # BB#0:
+; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movd %edi, %xmm0
; SSE41-64-NEXT: pinsrd $1, %esi, %xmm0
; SSE41-64-NEXT: pinsrd $2, %edx, %xmm0
@@ -126,12 +126,12 @@ define <4 x i32> @test_buildvector_v4i32(i32 %f0, i32 %f1, i32 %f2, i32 %f3) {
; SSE41-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_v4i32:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v4i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -146,7 +146,7 @@ define <4 x i32> @test_buildvector_v4i32(i32 %f0, i32 %f1, i32 %f2, i32 %f3) {
define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) {
; SSE2-32-LABEL: test_buildvector_v8i16:
-; SSE2-32: # BB#0:
+; SSE2-32: # %bb.0:
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -165,7 +165,7 @@ define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; SSE2-32-NEXT: retl
;
; SSE2-64-LABEL: test_buildvector_v8i16:
-; SSE2-64: # BB#0:
+; SSE2-64: # %bb.0:
; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -184,7 +184,7 @@ define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; SSE2-64-NEXT: retq
;
; SSE41-32-LABEL: test_buildvector_v8i16:
-; SSE41-32: # BB#0:
+; SSE41-32: # %bb.0:
; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-32-NEXT: pinsrw $1, {{[0-9]+}}(%esp), %xmm0
; SSE41-32-NEXT: pinsrw $2, {{[0-9]+}}(%esp), %xmm0
@@ -196,7 +196,7 @@ define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; SSE41-32-NEXT: retl
;
; SSE41-64-LABEL: test_buildvector_v8i16:
-; SSE41-64: # BB#0:
+; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movd %edi, %xmm0
; SSE41-64-NEXT: pinsrw $1, %esi, %xmm0
; SSE41-64-NEXT: pinsrw $2, %edx, %xmm0
@@ -208,7 +208,7 @@ define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; SSE41-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_v8i16:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -220,7 +220,7 @@ define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v8i16:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0
; AVX-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0
@@ -243,7 +243,7 @@ define <8 x i16> @test_buildvector_v8i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16
define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) {
; SSE2-32-LABEL: test_buildvector_v16i8:
-; SSE2-32: # BB#0:
+; SSE2-32: # %bb.0:
; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -278,7 +278,7 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; SSE2-32-NEXT: retl
;
; SSE2-64-LABEL: test_buildvector_v16i8:
-; SSE2-64: # BB#0:
+; SSE2-64: # %bb.0:
; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -313,7 +313,7 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; SSE2-64-NEXT: retq
;
; SSE41-32-LABEL: test_buildvector_v16i8:
-; SSE41-32: # BB#0:
+; SSE41-32: # %bb.0:
; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; SSE41-32-NEXT: pinsrb $2, {{[0-9]+}}(%esp), %xmm0
@@ -333,7 +333,7 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; SSE41-32-NEXT: retl
;
; SSE41-64-LABEL: test_buildvector_v16i8:
-; SSE41-64: # BB#0:
+; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movd %edi, %xmm0
; SSE41-64-NEXT: pinsrb $1, %esi, %xmm0
; SSE41-64-NEXT: pinsrb $2, %edx, %xmm0
@@ -353,7 +353,7 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; SSE41-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_v16i8:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -373,7 +373,7 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v16i8:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
; AVX-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/build-vector-256.ll b/test/CodeGen/X86/build-vector-256.ll
index 942b7779abe6..f2f17710033d 100644
--- a/test/CodeGen/X86/build-vector-256.ll
+++ b/test/CodeGen/X86/build-vector-256.ll
@@ -6,14 +6,14 @@
define <4 x double> @test_buildvector_v4f64(double %a0, double %a1, double %a2, double %a3) {
; AVX-32-LABEL: test_buildvector_v4f64:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v4f64:
-; AVX-64: # BB#0:
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT: retq
%ins0 = insertelement <4 x double> undef, double %a0, i32 0
@@ -25,12 +25,12 @@ define <4 x double> @test_buildvector_v4f64(double %a0, double %a1, double %a2,
define <8 x float> @test_buildvector_v8f32(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) {
; AVX-32-LABEL: test_buildvector_v8f32:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v8f32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
@@ -52,12 +52,12 @@ define <8 x float> @test_buildvector_v8f32(float %a0, float %a1, float %a2, floa
define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
; AVX-32-LABEL: test_buildvector_v4i64:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_v4i64:
-; AVX1-64: # BB#0:
+; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vmovq %rcx, %xmm0
; AVX1-64-NEXT: vmovq %rdx, %xmm1
; AVX1-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -68,7 +68,7 @@ define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
; AVX1-64-NEXT: retq
;
; AVX2-64-LABEL: test_buildvector_v4i64:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovq %rcx, %xmm0
; AVX2-64-NEXT: vmovq %rdx, %xmm1
; AVX2-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -86,12 +86,12 @@ define <4 x i64> @test_buildvector_v4i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3) {
define <8 x i32> @test_buildvector_v8i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) {
; AVX-32-LABEL: test_buildvector_v8i32:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %ymm0
; AVX-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_v8i32:
-; AVX1-64: # BB#0:
+; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vmovd %edi, %xmm0
; AVX1-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX1-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -104,7 +104,7 @@ define <8 x i32> @test_buildvector_v8i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32
; AVX1-64-NEXT: retq
;
; AVX2-64-LABEL: test_buildvector_v8i32:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovd %edi, %xmm0
; AVX2-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX2-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -128,7 +128,7 @@ define <8 x i32> @test_buildvector_v8i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32
define <16 x i16> @test_buildvector_v16i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) {
; AVX1-32-LABEL: test_buildvector_v16i16:
-; AVX1-32: # BB#0:
+; AVX1-32: # %bb.0:
; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX1-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -149,7 +149,7 @@ define <16 x i16> @test_buildvector_v16i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_v16i16:
-; AVX1-64: # BB#0:
+; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vmovd %edi, %xmm0
; AVX1-64-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0
; AVX1-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0
@@ -170,7 +170,7 @@ define <16 x i16> @test_buildvector_v16i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
; AVX1-64-NEXT: retq
;
; AVX2-32-LABEL: test_buildvector_v16i16:
-; AVX2-32: # BB#0:
+; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX2-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -191,7 +191,7 @@ define <16 x i16> @test_buildvector_v16i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_buildvector_v16i16:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovd %edi, %xmm0
; AVX2-64-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0
; AVX2-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0
@@ -231,7 +231,7 @@ define <16 x i16> @test_buildvector_v16i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) {
; AVX1-32-LABEL: test_buildvector_v32i8:
-; AVX1-32: # BB#0:
+; AVX1-32: # %bb.0:
; AVX1-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX1-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -268,7 +268,7 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: test_buildvector_v32i8:
-; AVX1-64: # BB#0:
+; AVX1-64: # %bb.0:
; AVX1-64-NEXT: vmovd %edi, %xmm0
; AVX1-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
; AVX1-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
@@ -305,7 +305,7 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX1-64-NEXT: retq
;
; AVX2-32-LABEL: test_buildvector_v32i8:
-; AVX2-32: # BB#0:
+; AVX2-32: # %bb.0:
; AVX2-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX2-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -342,7 +342,7 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX2-32-NEXT: retl
;
; AVX2-64-LABEL: test_buildvector_v32i8:
-; AVX2-64: # BB#0:
+; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vmovd %edi, %xmm0
; AVX2-64-NEXT: vpinsrb $1, %esi, %xmm0, %xmm0
; AVX2-64-NEXT: vpinsrb $2, %edx, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/build-vector-512.ll b/test/CodeGen/X86/build-vector-512.ll
index fbfbf2d53c63..aba8b13db967 100644
--- a/test/CodeGen/X86/build-vector-512.ll
+++ b/test/CodeGen/X86/build-vector-512.ll
@@ -6,17 +6,17 @@
define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2, double %a3, double %a4, double %a5, double %a6, double %a7) {
; AVX-32-LABEL: test_buildvector_v8f64:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v8f64:
-; AVX-64: # BB#0:
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
; AVX-64-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-64-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
; AVX-64-NEXT: retq
@@ -33,12 +33,12 @@ define <8 x double> @test_buildvector_v8f64(double %a0, double %a1, double %a2,
define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8, float %a9, float %a10, float %a11, float %a12, float %a13, float %a14, float %a15) {
; AVX-32-LABEL: test_buildvector_v16f32:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v16f32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; AVX-64-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
@@ -78,12 +78,12 @@ define <16 x float> @test_buildvector_v16f32(float %a0, float %a1, float %a2, fl
define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7) {
; AVX-32-LABEL: test_buildvector_v8i64:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v8i64:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovq %rcx, %xmm0
; AVX-64-NEXT: vmovq %rdx, %xmm1
; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -110,12 +110,12 @@ define <8 x i64> @test_buildvector_v8i64(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64
define <16 x i32> @test_buildvector_v16i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15) {
; AVX-32-LABEL: test_buildvector_v16i32:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %zmm0
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_v16i32:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -157,7 +157,7 @@ define <16 x i32> @test_buildvector_v16i32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i
define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15, i16 %a16, i16 %a17, i16 %a18, i16 %a19, i16 %a20, i16 %a21, i16 %a22, i16 %a23, i16 %a24, i16 %a25, i16 %a26, i16 %a27, i16 %a28, i16 %a29, i16 %a30, i16 %a31) {
; AVX512F-32-LABEL: test_buildvector_v32i16:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -195,7 +195,7 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
; AVX512F-32-NEXT: retl
;
; AVX512F-64-LABEL: test_buildvector_v32i16:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX512F-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
@@ -233,7 +233,7 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
; AVX512F-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_buildvector_v32i16:
-; AVX512BW-32: # BB#0:
+; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512BW-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -272,7 +272,7 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_buildvector_v32i16:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-64-NEXT: vpinsrw $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpinsrw $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
@@ -346,7 +346,7 @@ define <32 x i16> @test_buildvector_v32i16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i
define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31, i8 %a32, i8 %a33, i8 %a34, i8 %a35, i8 %a36, i8 %a37, i8 %a38, i8 %a39, i8 %a40, i8 %a41, i8 %a42, i8 %a43, i8 %a44, i8 %a45, i8 %a46, i8 %a47, i8 %a48, i8 %a49, i8 %a50, i8 %a51, i8 %a52, i8 %a53, i8 %a54, i8 %a55, i8 %a56, i8 %a57, i8 %a58, i8 %a59, i8 %a60, i8 %a61, i8 %a62, i8 %a63) {
; AVX512F-32-LABEL: test_buildvector_v64i8:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512F-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -416,7 +416,7 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX512F-32-NEXT: retl
;
; AVX512F-64-LABEL: test_buildvector_v64i8:
-; AVX512F-64: # BB#0:
+; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX512F-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
@@ -486,7 +486,7 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX512F-64-NEXT: retq
;
; AVX512BW-32-LABEL: test_buildvector_v64i8:
-; AVX512BW-32: # BB#0:
+; AVX512BW-32: # %bb.0:
; AVX512BW-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-32-NEXT: vpinsrb $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512BW-32-NEXT: vpinsrb $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -557,7 +557,7 @@ define <64 x i8> @test_buildvector_v64i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
; AVX512BW-32-NEXT: retl
;
; AVX512BW-64-LABEL: test_buildvector_v64i8:
-; AVX512BW-64: # BB#0:
+; AVX512BW-64: # %bb.0:
; AVX512BW-64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-64-NEXT: vpinsrb $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX512BW-64-NEXT: vpinsrb $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index cd5abc1373b9..88b5df04c760 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -4,7 +4,7 @@
define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
; SSE2-LABEL: foo:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: movl $255, %eax
; SSE2-NEXT: movd %eax, %xmm1
@@ -17,7 +17,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: foo:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: movl $255, %eax
; SSE41-NEXT: pinsrd $3, %eax, %xmm0
@@ -36,18 +36,18 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
define <4 x float> @test_negative_zero_1(<4 x float> %A) {
; SSE2-LABEL: test_negative_zero_1:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_negative_zero_1:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2],zero
; SSE41-NEXT: retq
entry:
@@ -60,11 +60,20 @@ entry:
ret <4 x float> %5
}
+; FIXME: This could be 'movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]'.
+
define <2 x double> @test_negative_zero_2(<2 x double> %A) {
-; CHECK-LABEL: test_negative_zero_2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_negative_zero_2:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = <u,-0>
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_negative_zero_2:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
+; SSE41-NEXT: retq
entry:
%0 = extractelement <2 x double> %A, i32 0
%1 = insertelement <2 x double> undef, double %0, i32 0
@@ -74,14 +83,14 @@ entry:
define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float %f2, float %f3) {
; SSE2-LABEL: test_buildvector_v4f32_register:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_register:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
@@ -95,18 +104,18 @@ define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float
define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %p2, float* %p3) {
; SSE2-LABEL: test_buildvector_v4f32_load:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_load:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -125,15 +134,15 @@ define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %
define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) {
; SSE2-LABEL: test_buildvector_v4f32_partial_load:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4f32_partial_load:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
@@ -148,7 +157,7 @@ define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, fl
define <4 x i32> @test_buildvector_v4i32_register(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
; SSE2-LABEL: test_buildvector_v4i32_register:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd %ecx, %xmm0
; SSE2-NEXT: movd %edx, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -159,7 +168,7 @@ define <4 x i32> @test_buildvector_v4i32_register(i32 %a0, i32 %a1, i32 %a2, i32
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4i32_register:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrd $1, %esi, %xmm0
; SSE41-NEXT: pinsrd $2, %edx, %xmm0
@@ -174,7 +183,7 @@ define <4 x i32> @test_buildvector_v4i32_register(i32 %a0, i32 %a1, i32 %a2, i32
define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) {
; SSE2-LABEL: test_buildvector_v4i32_partial:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: movd %esi, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
@@ -182,7 +191,7 @@ define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v4i32_partial:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrd $3, %esi, %xmm0
; SSE41-NEXT: retq
@@ -195,7 +204,7 @@ define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) {
define <4 x i32> @test_buildvector_v4i32_register_zero(i32 %a0, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_buildvector_v4i32_register_zero:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: movd %esi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -211,7 +220,7 @@ define <4 x i32> @test_buildvector_v4i32_register_zero(i32 %a0, i32 %a2, i32 %a3
define <4 x i32> @test_buildvector_v4i32_register_zero_2(i32 %a1, i32 %a2, i32 %a3) {
; CHECK-LABEL: test_buildvector_v4i32_register_zero_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %edx, %xmm0
; CHECK-NEXT: movd %esi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -227,7 +236,7 @@ define <4 x i32> @test_buildvector_v4i32_register_zero_2(i32 %a1, i32 %a2, i32 %
define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) {
; SSE2-LABEL: test_buildvector_v8i16_register:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -246,7 +255,7 @@ define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v8i16_register:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrw $1, %esi, %xmm0
; SSE41-NEXT: pinsrw $2, %edx, %xmm0
@@ -269,7 +278,7 @@ define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16
define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_partial:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $1, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
@@ -289,7 +298,7 @@ define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16
define <8 x i16> @test_buildvector_v8i16_register_zero(i16 %a0, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_register_zero:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $0, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
@@ -309,7 +318,7 @@ define <8 x i16> @test_buildvector_v8i16_register_zero(i16 %a0, i16 %a3, i16 %a4
define <8 x i16> @test_buildvector_v8i16_register_zero_2(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
; CHECK-LABEL: test_buildvector_v8i16_register_zero_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm0, %xmm0
; CHECK-NEXT: pinsrw $1, %edi, %xmm0
; CHECK-NEXT: pinsrw $3, %esi, %xmm0
@@ -329,7 +338,7 @@ define <8 x i16> @test_buildvector_v8i16_register_zero_2(i16 %a1, i16 %a3, i16 %
define <16 x i8> @test_buildvector_v16i8_register(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -364,7 +373,7 @@ define <16 x i8> @test_buildvector_v16i8_register(i8 %a0, i8 %a1, i8 %a2, i8 %a3
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pinsrb $1, %esi, %xmm0
; SSE41-NEXT: pinsrb $2, %edx, %xmm0
@@ -403,7 +412,7 @@ define <16 x i8> @test_buildvector_v16i8_register(i8 %a0, i8 %a1, i8 %a2, i8 %a3
define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_partial:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: movzbl %sil, %eax
@@ -419,7 +428,7 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_partial:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: pinsrb $6, %esi, %xmm0
@@ -449,7 +458,7 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11
define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register_zero:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzbl %sil, %eax
; SSE2-NEXT: movzbl %dil, %esi
; SSE2-NEXT: movd %esi, %xmm0
@@ -468,7 +477,7 @@ define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register_zero:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $0, %edi, %xmm0
; SSE41-NEXT: pinsrb $4, %esi, %xmm0
@@ -499,7 +508,7 @@ define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i
define <16 x i8> @test_buildvector_v16i8_register_zero_2(i8 %a2, i8 %a3, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_register_zero_2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shll $8, %esi
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: orl %esi, %eax
@@ -519,7 +528,7 @@ define <16 x i8> @test_buildvector_v16i8_register_zero_2(i8 %a2, i8 %a3, i8 %a6,
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_buildvector_v16i8_register_zero_2:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: pinsrb $3, %esi, %xmm0
diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll
index 9f266647d8aa..a3a07519b3ea 100644
--- a/test/CodeGen/X86/bypass-slow-division-32.ll
+++ b/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -4,20 +4,20 @@
define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: Test_get_quotient:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: testl $-256, %edx
; CHECK-NEXT: je .LBB0_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB0_1:
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %cl
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: retl
@@ -27,21 +27,21 @@ define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind {
define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: Test_get_remainder:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: testl $-256, %edx
; CHECK-NEXT: je .LBB1_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB1_1:
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %cl
; CHECK-NEXT: movzbl %ah, %eax # NOREX
; CHECK-NEXT: retl
@@ -51,21 +51,21 @@ define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind {
define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: Test_get_quotient_and_remainder:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: testl $-256, %edx
; CHECK-NEXT: je .LBB2_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB2_1:
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %cl
; CHECK-NEXT: movzbl %ah, %edx # NOREX
; CHECK-NEXT: movzbl %al, %eax
@@ -79,7 +79,7 @@ define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: Test_use_div_and_idiv:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
@@ -89,7 +89,7 @@ define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: orl %ebx, %edi
; CHECK-NEXT: testl $-256, %edi
; CHECK-NEXT: je .LBB3_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ebx
@@ -103,14 +103,14 @@ define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: jmp .LBB3_6
; CHECK-NEXT: .LBB3_1:
; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %bl
; CHECK-NEXT: movzbl %al, %esi
; CHECK-NEXT: testl $-256, %edi
; CHECK-NEXT: jne .LBB3_5
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: movzbl %cl, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %bl
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: .LBB3_6:
@@ -128,7 +128,7 @@ define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
define i32 @Test_use_div_imm_imm() nounwind {
; CHECK-LABEL: Test_use_div_imm_imm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $64, %eax
; CHECK-NEXT: retl
%resultdiv = sdiv i32 256, 4
@@ -137,7 +137,7 @@ define i32 @Test_use_div_imm_imm() nounwind {
define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
; CHECK-LABEL: Test_use_div_reg_imm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $1041204193, %eax # imm = 0x3E0F83E1
; CHECK-NEXT: imull {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %edx, %eax
@@ -151,7 +151,7 @@ define i32 @Test_use_div_reg_imm(i32 %a) nounwind {
define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
; CHECK-LABEL: Test_use_rem_reg_imm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1
; CHECK-NEXT: movl %ecx, %eax
@@ -172,7 +172,7 @@ define i32 @Test_use_rem_reg_imm(i32 %a) nounwind {
define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
; CHECK-LABEL: Test_use_divrem_reg_imm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl $1041204193, %edx # imm = 0x3E0F83E1
; CHECK-NEXT: movl %ecx, %eax
@@ -196,11 +196,11 @@ define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind {
define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
; CHECK-LABEL: Test_use_div_imm_reg:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: testl $-256, %ecx
; CHECK-NEXT: je .LBB8_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: idivl %ecx
@@ -208,7 +208,7 @@ define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
; CHECK-NEXT: .LBB8_1:
; CHECK-NEXT: movb $4, %al
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %cl
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: retl
@@ -218,11 +218,11 @@ define i32 @Test_use_div_imm_reg(i32 %a) nounwind {
define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
; CHECK-LABEL: Test_use_rem_imm_reg:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: testl $-256, %ecx
; CHECK-NEXT: je .LBB9_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: idivl %ecx
@@ -230,7 +230,7 @@ define i32 @Test_use_rem_imm_reg(i32 %a) nounwind {
; CHECK-NEXT: .LBB9_1:
; CHECK-NEXT: movb $4, %al
; CHECK-NEXT: movzbl %al, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
; CHECK-NEXT: divb %cl
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/bypass-slow-division-64.ll b/test/CodeGen/X86/bypass-slow-division-64.ll
index b067f9e1503c..cf5cd70ac4fc 100644
--- a/test/CodeGen/X86/bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/bypass-slow-division-64.ll
@@ -6,12 +6,12 @@
define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: Test_get_quotient:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: je .LBB0_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: cqto
; CHECK-NEXT: idivq %rsi
@@ -20,7 +20,7 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: divl %esi
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT: # kill: def %eax killed %eax def %rax
; CHECK-NEXT: retq
%result = sdiv i64 %a, %b
ret i64 %result
@@ -28,12 +28,12 @@ define i64 @Test_get_quotient(i64 %a, i64 %b) nounwind {
define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: Test_get_remainder:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: je .LBB1_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: cqto
; CHECK-NEXT: idivq %rsi
@@ -43,7 +43,7 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: divl %esi
-; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
+; CHECK-NEXT: # kill: def %edx killed %edx def %rdx
; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: retq
%result = srem i64 %a, %b
@@ -52,12 +52,12 @@ define i64 @Test_get_remainder(i64 %a, i64 %b) nounwind {
define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: Test_get_quotient_and_remainder:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: shrq $32, %rax
; CHECK-NEXT: je .LBB2_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: cqto
; CHECK-NEXT: idivq %rsi
@@ -67,8 +67,8 @@ define i64 @Test_get_quotient_and_remainder(i64 %a, i64 %b) nounwind {
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: divl %esi
-; CHECK-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; CHECK-NEXT: # kill: def %edx killed %edx def %rdx
+; CHECK-NEXT: # kill: def %eax killed %eax def %rax
; CHECK-NEXT: addq %rdx, %rax
; CHECK-NEXT: retq
%resultdiv = sdiv i64 %a, %b
diff --git a/test/CodeGen/X86/bypass-slow-division-tune.ll b/test/CodeGen/X86/bypass-slow-division-tune.ll
index b6a53130cf23..2439f4689520 100644
--- a/test/CodeGen/X86/bypass-slow-division-tune.ll
+++ b/test/CodeGen/X86/bypass-slow-division-tune.ll
@@ -2,6 +2,7 @@
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=atom < %s | FileCheck -check-prefixes=ATOM,CHECK %s
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont < %s | FileCheck -check-prefixes=REST,CHECK %s
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck -check-prefixes=REST,CHECK %s
+; RUN: llc -profile-summary-huge-working-set-size-threshold=1 -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck -check-prefixes=HUGEWS %s
; Verify that div32 is bypassed only for Atoms.
define i32 @div32(i32 %a, i32 %b) {
@@ -36,6 +37,15 @@ entry:
define i64 @div64_optsize(i64 %a, i64 %b) optsize {
; CHECK-LABEL: div64_optsize:
; CHECK-NOT: divl
+; CHECK: ret
+ %div = sdiv i64 %a, %b
+ ret i64 %div
+}
+
+define i64 @div64_hugews(i64 %a, i64 %b) {
+; HUGEWS-LABEL: div64_hugews:
+; HUGEWS-NOT: divl
+; HUGEWS: ret
%div = sdiv i64 %a, %b
ret i64 %div
}
@@ -43,6 +53,7 @@ define i64 @div64_optsize(i64 %a, i64 %b) optsize {
define i32 @div32_optsize(i32 %a, i32 %b) optsize {
; CHECK-LABEL: div32_optsize:
; CHECK-NOT: divb
+; CHECK: ret
%div = sdiv i32 %a, %b
ret i32 %div
}
@@ -50,6 +61,23 @@ define i32 @div32_optsize(i32 %a, i32 %b) optsize {
define i32 @div32_minsize(i32 %a, i32 %b) minsize {
; CHECK-LABEL: div32_minsize:
; CHECK-NOT: divb
+; CHECK: ret
%div = sdiv i32 %a, %b
ret i32 %div
}
+
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 999000, i64 1000, i32 3}
+!14 = !{i32 999999, i64 5, i32 3}
diff --git a/test/CodeGen/X86/byval.ll b/test/CodeGen/X86/byval.ll
index f29511a54c41..359662038575 100644
--- a/test/CodeGen/X86/byval.ll
+++ b/test/CodeGen/X86/byval.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck -check-prefix=X86-64 %s
; Win64 has not supported byval yet.
-; RUN: llc < %s -march=x86 | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck -check-prefix=X86 %s
; X86: movl 4(%esp), %eax
; X86: movl 8(%esp), %edx
diff --git a/test/CodeGen/X86/byval2.ll b/test/CodeGen/X86/byval2.ll
index 5eb8b590e8da..666caf19965e 100644
--- a/test/CodeGen/X86/byval2.ll
+++ b/test/CodeGen/X86/byval2.ll
@@ -12,7 +12,7 @@
; Win64 has not supported byval yet.
-; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
; X32-NOT: movsl
; X32: rep
; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval3.ll b/test/CodeGen/X86/byval3.ll
index 85ecdaf1c67d..e1741d2e8134 100644
--- a/test/CodeGen/X86/byval3.ll
+++ b/test/CodeGen/X86/byval3.ll
@@ -12,7 +12,7 @@
; Win64 has not supported byval yet.
-; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
; X32-NOT: movsl
; X32: rep
; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval4.ll b/test/CodeGen/X86/byval4.ll
index 1e436f7903ac..eaf7e60b192d 100644
--- a/test/CodeGen/X86/byval4.ll
+++ b/test/CodeGen/X86/byval4.ll
@@ -12,7 +12,7 @@
; Win64 has not supported byval yet.
-; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
; X32-NOT: movsl
; X32: rep
; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval5.ll b/test/CodeGen/X86/byval5.ll
index 6d734a44b3c3..c93edac9787a 100644
--- a/test/CodeGen/X86/byval5.ll
+++ b/test/CodeGen/X86/byval5.ll
@@ -12,7 +12,7 @@
; Win64 has not supported byval yet.
-; RUN: llc < %s -march=x86 -mattr=-avx | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=i686-- -mattr=-avx | FileCheck %s -check-prefix=X32
; X32-NOT: movsl
; X32: rep
; X32-NOT: rep
diff --git a/test/CodeGen/X86/byval6.ll b/test/CodeGen/X86/byval6.ll
index c3e7b7ef435a..d3bd1ff9a08f 100644
--- a/test/CodeGen/X86/byval6.ll
+++ b/test/CodeGen/X86/byval6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | grep add | not grep 16
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- | grep add | not grep 16
%struct.W = type { x86_fp80, x86_fp80 }
@B = global %struct.W { x86_fp80 0xK4001A000000000000000, x86_fp80 0xK4001C000000000000000 }, align 32
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 8d5dd8c5887e..584a6a449fff 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | FileCheck %s
%struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
diff --git a/test/CodeGen/X86/call-imm.ll b/test/CodeGen/X86/call-imm.ll
index 898b4ec203ad..a91b582caff1 100644
--- a/test/CodeGen/X86/call-imm.ll
+++ b/test/CodeGen/X86/call-imm.ll
@@ -6,7 +6,7 @@
; Call to immediate is not safe on x86-64 unless we *know* that the
; call will be within 32-bits pcrel from the dest immediate.
-; RUN: llc < %s -march=x86-64 | FileCheck -check-prefix X64 %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck -check-prefix X64 %s
; PR3666
; PR3773
diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll
index 260535985e2d..ee63ec653918 100644
--- a/test/CodeGen/X86/cast-vsel.ll
+++ b/test/CodeGen/X86/cast-vsel.ll
@@ -10,44 +10,33 @@
define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %d) {
; SSE2-LABEL: sext:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: cmpltps %xmm3, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: cmpltps %xmm2, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: sext:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: cmpltps %xmm3, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm3, %xmm1
; SSE41-NEXT: cmpltps %xmm2, %xmm0
-; SSE41-NEXT: pshufb %xmm3, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5
+; SSE41-NEXT: pmovsxwd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
@@ -61,7 +50,7 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovsxwd %xmm2, %ymm1
; AVX2-NEXT: vpmovsxwd %xmm3, %ymm2
@@ -75,44 +64,34 @@ define <8 x i32> @sext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %d) {
; SSE2-LABEL: zext:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, %xmm6
; SSE2-NEXT: cmpltps %xmm3, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
-; SSE2-NEXT: cmpltps %xmm2, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT: cmpltps %xmm2, %xmm6
+; SSE2-NEXT: packssdw %xmm1, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm6, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
+; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: zext:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: cmpltps %xmm3, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm3, %xmm1
; SSE41-NEXT: cmpltps %xmm2, %xmm0
-; SSE41-NEXT: pshufb %xmm3, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pand %xmm0, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm5
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
@@ -126,7 +105,7 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
@@ -140,7 +119,7 @@ define <8 x i32> @zext(<8 x float> %a, <8 x float> %b, <8 x i16> %c, <8 x i16> %
define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4 x float> %d) {
; SSE2-LABEL: fpext:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: cmpltpd %xmm3, %xmm1
; SSE2-NEXT: cmpltpd %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -154,7 +133,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
; SSE2-NEXT: retq
;
; SSE41-LABEL: fpext:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: cmpltpd %xmm3, %xmm1
; SSE41-NEXT: cmpltpd %xmm2, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -165,7 +144,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
; SSE41-NEXT: retq
;
; AVX-LABEL: fpext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vcvtps2pd %xmm2, %ymm1
; AVX-NEXT: vcvtps2pd %xmm3, %ymm2
@@ -179,7 +158,7 @@ define <4 x double> @fpext(<4 x double> %a, <4 x double> %b, <4 x float> %c, <4
define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE2-LABEL: trunc:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm5
; SSE2-NEXT: psrad $16, %xmm5
@@ -197,7 +176,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; SSE2-NEXT: retq
;
; SSE41-LABEL: trunc:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm1, %xmm3
@@ -211,7 +190,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -227,7 +206,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
@@ -245,7 +224,7 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d)
define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4 x double> %d) {
; SSE2-LABEL: fptrunc:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: cmpltps %xmm1, %xmm0
; SSE2-NEXT: cvtpd2ps %xmm5, %xmm1
; SSE2-NEXT: cvtpd2ps %xmm4, %xmm4
@@ -259,7 +238,7 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4
; SSE2-NEXT: retq
;
; SSE41-LABEL: fptrunc:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: cmpltps %xmm1, %xmm0
; SSE41-NEXT: cvtpd2ps %xmm3, %xmm1
; SSE41-NEXT: cvtpd2ps %xmm2, %xmm2
@@ -272,7 +251,7 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4
; SSE41-NEXT: retq
;
; AVX-LABEL: fptrunc:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vcvtpd2ps %ymm2, %xmm1
; AVX-NEXT: vcvtpd2ps %ymm3, %xmm2
@@ -297,7 +276,7 @@ define <4 x float> @fptrunc(<4 x float> %a, <4 x float> %b, <4 x double> %c, <4
define void @example25() nounwind {
; SSE2-LABEL: example25:
-; SSE2: # BB#0: # %vector.ph
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE2-NEXT: .p2align 4, 0x90
@@ -306,29 +285,13 @@ define void @example25() nounwind {
; SSE2-NEXT: movaps da+4096(%rax), %xmm1
; SSE2-NEXT: movaps da+4112(%rax), %xmm2
; SSE2-NEXT: cmpltps db+4112(%rax), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: cmpltps db+4096(%rax), %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: psllw $15, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
; SSE2-NEXT: movaps dc+4096(%rax), %xmm2
; SSE2-NEXT: movaps dc+4112(%rax), %xmm3
; SSE2-NEXT: cmpltps dd+4112(%rax), %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: cmpltps dd+4096(%rax), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -339,49 +302,40 @@ define void @example25() nounwind {
; SSE2-NEXT: movdqa %xmm1, dj+4096(%rax)
; SSE2-NEXT: addq $32, %rax
; SSE2-NEXT: jne .LBB5_1
-; SSE2-NEXT: # BB#2: # %for.end
+; SSE2-NEXT: # %bb.2: # %for.end
; SSE2-NEXT: retq
;
; SSE41-LABEL: example25:
-; SSE41: # BB#0: # %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1]
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB5_1: # %vector.body
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE41-NEXT: movaps da+4096(%rax), %xmm2
-; SSE41-NEXT: movaps da+4112(%rax), %xmm3
-; SSE41-NEXT: cmpltps db+4112(%rax), %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm3
-; SSE41-NEXT: cmpltps db+4096(%rax), %xmm2
-; SSE41-NEXT: pshufb %xmm0, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: psllw $15, %xmm2
-; SSE41-NEXT: psraw $15, %xmm2
-; SSE41-NEXT: movaps dc+4096(%rax), %xmm3
-; SSE41-NEXT: movaps dc+4112(%rax), %xmm4
-; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm4
-; SSE41-NEXT: pshufb %xmm0, %xmm4
-; SSE41-NEXT: cmpltps dd+4096(%rax), %xmm3
-; SSE41-NEXT: pshufb %xmm0, %xmm3
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE41-NEXT: psllw $15, %xmm3
-; SSE41-NEXT: psraw $15, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: movaps da+4096(%rax), %xmm1
+; SSE41-NEXT: movaps da+4112(%rax), %xmm2
+; SSE41-NEXT: cmpltps db+4112(%rax), %xmm2
+; SSE41-NEXT: cmpltps db+4096(%rax), %xmm1
+; SSE41-NEXT: packssdw %xmm2, %xmm1
+; SSE41-NEXT: movaps dc+4096(%rax), %xmm2
+; SSE41-NEXT: movaps dc+4112(%rax), %xmm3
+; SSE41-NEXT: cmpltps dd+4112(%rax), %xmm3
+; SSE41-NEXT: cmpltps dd+4096(%rax), %xmm2
+; SSE41-NEXT: packssdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE41-NEXT: pand %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm3, dj+4112(%rax)
-; SSE41-NEXT: movdqa %xmm2, dj+4096(%rax)
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SSE41-NEXT: pand %xmm0, %xmm1
+; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE41-NEXT: pand %xmm0, %xmm2
+; SSE41-NEXT: movdqa %xmm2, dj+4112(%rax)
+; SSE41-NEXT: movdqa %xmm1, dj+4096(%rax)
; SSE41-NEXT: addq $32, %rax
; SSE41-NEXT: jne .LBB5_1
-; SSE41-NEXT: # BB#2: # %for.end
+; SSE41-NEXT: # %bb.2: # %for.end
; SSE41-NEXT: retq
;
; AVX1-LABEL: example25:
-; AVX1: # BB#0: # %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: .p2align 4, 0x90
@@ -396,14 +350,14 @@ define void @example25() nounwind {
; AVX1-NEXT: vmovups %ymm1, dj+4096(%rax)
; AVX1-NEXT: addq $32, %rax
; AVX1-NEXT: jne .LBB5_1
-; AVX1-NEXT: # BB#2: # %for.end
+; AVX1-NEXT: # %bb.2: # %for.end
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: example25:
-; AVX2: # BB#0: # %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: movq $-4096, %rax # imm = 0xF000
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB5_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -411,12 +365,12 @@ define void @example25() nounwind {
; AVX2-NEXT: vcmpltps db+4096(%rax), %ymm1, %ymm1
; AVX2-NEXT: vmovups dc+4096(%rax), %ymm2
; AVX2-NEXT: vcmpltps dd+4096(%rax), %ymm2, %ymm2
+; AVX2-NEXT: vandps %ymm0, %ymm2, %ymm2
; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vandps %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vmovups %ymm1, dj+4096(%rax)
; AVX2-NEXT: addq $32, %rax
; AVX2-NEXT: jne .LBB5_1
-; AVX2-NEXT: # BB#2: # %for.end
+; AVX2-NEXT: # %bb.2: # %for.end
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
@@ -453,7 +407,7 @@ for.end:
define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2-LABEL: example24:
-; SSE2: # BB#0: # %vector.ph
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -467,14 +421,8 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2-NEXT: movaps da+4096(%rax), %xmm2
; SSE2-NEXT: movaps da+4112(%rax), %xmm3
; SSE2-NEXT: cmpltps db+4112(%rax), %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: cmpltps db+4096(%rax), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm1, %xmm2
@@ -487,45 +435,40 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; SSE2-NEXT: movdqa %xmm3, dj+4096(%rax)
; SSE2-NEXT: addq $32, %rax
; SSE2-NEXT: jne .LBB6_1
-; SSE2-NEXT: # BB#2: # %for.end
+; SSE2-NEXT: # %bb.2: # %for.end
; SSE2-NEXT: retq
;
; SSE41-LABEL: example24:
-; SSE41: # BB#0: # %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm0
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE41-NEXT: movd %esi, %xmm1
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB6_1: # %vector.body
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE41-NEXT: movaps da+4096(%rax), %xmm3
-; SSE41-NEXT: movaps da+4112(%rax), %xmm4
-; SSE41-NEXT: cmpltps db+4112(%rax), %xmm4
-; SSE41-NEXT: pshufb %xmm2, %xmm4
-; SSE41-NEXT: cmpltps db+4096(%rax), %xmm3
-; SSE41-NEXT: pshufb %xmm2, %xmm3
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: pandn %xmm1, %xmm3
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm4, %xmm4
+; SSE41-NEXT: movaps da+4096(%rax), %xmm0
+; SSE41-NEXT: movaps da+4112(%rax), %xmm3
+; SSE41-NEXT: cmpltps db+4112(%rax), %xmm3
+; SSE41-NEXT: cmpltps db+4096(%rax), %xmm0
+; SSE41-NEXT: packssdw %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: pmovsxwd %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm3, dj+4096(%rax)
-; SSE41-NEXT: movdqa %xmm4, dj+4112(%rax)
+; SSE41-NEXT: movdqa %xmm0, dj+4112(%rax)
; SSE41-NEXT: addq $32, %rax
; SSE41-NEXT: jne .LBB6_1
-; SSE41-NEXT: # BB#2: # %for.end
+; SSE41-NEXT: # %bb.2: # %for.end
; SSE41-NEXT: retq
;
; AVX1-LABEL: example24:
-; AVX1: # BB#0: # %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -550,12 +493,12 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX1-NEXT: vmovups %ymm2, dj+4096(%rax)
; AVX1-NEXT: addq $32, %rax
; AVX1-NEXT: jne .LBB6_1
-; AVX1-NEXT: # BB#2: # %for.end
+; AVX1-NEXT: # %bb.2: # %for.end
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: example24:
-; AVX2: # BB#0: # %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: vmovd %esi, %xmm1
@@ -572,7 +515,7 @@ define void @example24(i16 signext %x, i16 signext %y) nounwind {
; AVX2-NEXT: vmovups %ymm2, dj+4096(%rax)
; AVX2-NEXT: addq $32, %rax
; AVX2-NEXT: jne .LBB6_1
-; AVX2-NEXT: # BB#2: # %for.end
+; AVX2-NEXT: # %bb.2: # %for.end
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
vector.ph:
diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll
index 6caf0c6012f7..c122ad2c20ad 100644
--- a/test/CodeGen/X86/catchpad-weight.ll
+++ b/test/CodeGen/X86/catchpad-weight.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=x86-64 -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s
; Check if the edge weight to the catchpad is calculated correctly.
-; CHECK: Successors according to CFG: BB#2(0x7ffff100 / 0x80000000 = 100.00%) BB#1(0x00000800 / 0x80000000 = 0.00%) BB#3(0x00000400 / 0x80000000 = 0.00%) BB#4(0x00000200 / 0x80000000 = 0.00%) BB#5(0x00000100 / 0x80000000 = 0.00%)
+; CHECK: Successors according to CFG: %bb.2(0x7ffff100 / 0x80000000 = 100.00%) %bb.1(0x00000800 / 0x80000000 = 0.00%) %bb.3(0x00000400 / 0x80000000 = 0.00%) %bb.4(0x00000200 / 0x80000000 = 0.00%) %bb.5(0x00000100 / 0x80000000 = 0.00%)
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--windows-msvc18.0.0"
diff --git a/test/CodeGen/X86/cfi-xmm.ll b/test/CodeGen/X86/cfi-xmm.ll
new file mode 100644
index 000000000000..dbb8a61320dc
--- /dev/null
+++ b/test/CodeGen/X86/cfi-xmm.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple x86_64-w64-windows-gnu -filetype=asm -exception-model=dwarf -o - %s | FileCheck %s
+
+define void @_Z1fv() {
+entry:
+ tail call void asm sideeffect "", "~{xmm10},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
+ ret void
+}
+
+; CHECK-LABEL: _Z1fv:
+; CHECK: .cfi_startproc
+; CHECK: subq $40, %rsp
+; CHECK: movaps %xmm15, 16(%rsp)
+; CHECK: movaps %xmm10, (%rsp)
+; CHECK: .cfi_def_cfa_offset 48
+; CHECK: .cfi_offset %xmm10, -48
+; CHECK: .cfi_offset %xmm15, -32
+; CHECK: movaps (%rsp), %xmm10
+; CHECK: movaps 16(%rsp), %xmm15
+; CHECK: addq $40, %rsp
+; CHECK: retq
+; CHECK: .cfi_endproc
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll
index cc48e5b6149c..b9e188f6a1b3 100644
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -4,7 +4,7 @@
; A test from pifft (after SLP-vectorization) that fails when we drop the chain on newly merged loads.
define void @cftx020(double* nocapture %a) {
; CHECK-LABEL: cftx020:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
diff --git a/test/CodeGen/X86/change-compare-stride-1.ll b/test/CodeGen/X86/change-compare-stride-1.ll
index c5480ba2b490..fe4fa1b026bd 100644
--- a/test/CodeGen/X86/change-compare-stride-1.ll
+++ b/test/CodeGen/X86/change-compare-stride-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -enable-lsr-nested | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -enable-lsr-nested | FileCheck %s
;
; Nested LSR is required to optimize this case.
; We do not expect to see this form of IR without -enable-iv-rewrite.
diff --git a/test/CodeGen/X86/change-compare-stride-trickiness-1.ll b/test/CodeGen/X86/change-compare-stride-trickiness-1.ll
index 63733abc5f34..5f3c17b32792 100644
--- a/test/CodeGen/X86/change-compare-stride-trickiness-1.ll
+++ b/test/CodeGen/X86/change-compare-stride-trickiness-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
; The comparison happens after the relevant use, so the stride can easily
; be changed. The comparison can be done in a narrower mode than the
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index e2a4368b255a..2af9ec1b813e 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
@@ -8,21 +9,27 @@
;
define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
-; SSE-LABEL: _clearupper2xi64a:
-; SSE: # BB#0:
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper2xi64a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper2xi64a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper2xi64a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper2xi64a:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%x0 = extractelement <2 x i64> %0, i32 0
%x1 = extractelement <2 x i64> %0, i32 1
@@ -36,24 +43,25 @@ define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
}
define <4 x i64> @_clearupper4xi64a(<4 x i64>) nounwind {
-; SSE-LABEL: _clearupper4xi64a:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper4xi64a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: retq
;
-; AVX1-LABEL: _clearupper4xi64a:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: retq
+; SSE42-LABEL: _clearupper4xi64a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT: retq
;
-; AVX2-LABEL: _clearupper4xi64a:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX-LABEL: _clearupper4xi64a:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX-NEXT: retq
%x0 = extractelement <4 x i64> %0, i32 0
%x1 = extractelement <4 x i64> %0, i32 1
%x2 = extractelement <4 x i64> %0, i32 2
@@ -74,13 +82,19 @@ define <4 x i64> @_clearupper4xi64a(<4 x i64>) nounwind {
}
define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
-; SSE-LABEL: _clearupper4xi32a:
-; SSE: # BB#0:
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper4xi32a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper4xi32a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE42-NEXT: retq
;
; AVX-LABEL: _clearupper4xi32a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: retq
@@ -104,21 +118,28 @@ define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
}
define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
-; SSE-LABEL: _clearupper8xi32a:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper8xi32a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper8xi32a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper8xi32a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper8xi32a:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%x0 = extractelement <8 x i32> %0, i32 0
@@ -158,12 +179,12 @@ define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
; SSE-LABEL: _clearupper8xi16a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper8xi16a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <8 x i16> %0, i32 0
@@ -203,14 +224,14 @@ define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
; SSE-LABEL: _clearupper16xi16a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper16xi16a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%x0 = extractelement <16 x i16> %0, i32 0
@@ -281,57 +302,62 @@ define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
}
define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
-; SSE-LABEL: _clearupper16xi8a:
-; SSE: # BB#0:
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper16xi8a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper16xi8a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE42-NEXT: retq
;
; AVX-LABEL: _clearupper16xi8a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%x0 = extractelement <16 x i8> %0, i32 0
@@ -402,103 +428,110 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
}
define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
-; SSE-LABEL: _clearupper32xi8a:
-; SSE: # BB#0:
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm4
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: movd %eax, %xmm6
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper32xi8a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm5
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT: movd %eax, %xmm6
+; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper32xi8a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE42-NEXT: andps %xmm2, %xmm0
+; SSE42-NEXT: andps %xmm2, %xmm1
+; SSE42-NEXT: retq
;
; AVX-LABEL: _clearupper32xi8a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%x0 = extractelement <32 x i8> %0, i32 0
@@ -633,21 +666,27 @@ define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
}
define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
-; SSE-LABEL: _clearupper2xi64b:
-; SSE: # BB#0:
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper2xi64b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper2xi64b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper2xi64b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper2xi64b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%x32 = bitcast <2 x i64> %0 to <4 x i32>
%r0 = insertelement <4 x i32> %x32, i32 zeroinitializer, i32 1
@@ -657,24 +696,25 @@ define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
}
define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind {
-; SSE-LABEL: _clearupper4xi64b:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper4xi64b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: retq
;
-; AVX1-LABEL: _clearupper4xi64b:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: retq
+; SSE42-LABEL: _clearupper4xi64b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT: retq
;
-; AVX2-LABEL: _clearupper4xi64b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX-LABEL: _clearupper4xi64b:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX-NEXT: retq
%x32 = bitcast <4 x i64> %0 to <8 x i32>
%r0 = insertelement <8 x i32> %x32, i32 zeroinitializer, i32 1
%r1 = insertelement <8 x i32> %r0, i32 zeroinitializer, i32 3
@@ -685,13 +725,19 @@ define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind {
}
define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
-; SSE-LABEL: _clearupper4xi32b:
-; SSE: # BB#0:
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper4xi32b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper4xi32b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE42-NEXT: retq
;
; AVX-LABEL: _clearupper4xi32b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: retq
@@ -705,21 +751,28 @@ define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
}
define <8 x i32> @_clearupper8xi32b(<8 x i32>) nounwind {
-; SSE-LABEL: _clearupper8xi32b:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper8xi32b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper8xi32b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper8xi32b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper8xi32b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%x16 = bitcast <8 x i32> %0 to <16 x i16>
@@ -737,12 +790,12 @@ define <8 x i32> @_clearupper8xi32b(<8 x i32>) nounwind {
define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
; SSE-LABEL: _clearupper8xi16b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper8xi16b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%x8 = bitcast <8 x i16> %0 to <16 x i8>
@@ -760,31 +813,20 @@ define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
define <16 x i16> @_clearupper16xi16b(<16 x i16>) nounwind {
; SSE-LABEL: _clearupper16xi16b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: _clearupper16xi16b:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: _clearupper16xi16b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-NEXT: retq
+; AVX-LABEL: _clearupper16xi16b:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm2
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: retq
%x8 = bitcast <16 x i16> %0 to <32 x i8>
%r0 = insertelement <32 x i8> %x8, i8 zeroinitializer, i32 1
%r1 = insertelement <32 x i8> %r0, i8 zeroinitializer, i32 3
@@ -807,84 +849,159 @@ define <16 x i16> @_clearupper16xi16b(<16 x i16>) nounwind {
}
define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
-; SSE-LABEL: _clearupper16xi8b:
-; SSE: # BB#0:
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: movq %rcx, %r9
-; SSE-NEXT: movq %rcx, %r10
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: movq %rcx, %rdx
-; SSE-NEXT: movq %rcx, %rsi
-; SSE-NEXT: movq %rcx, %rdi
-; SSE-NEXT: andb $15, %cl
-; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: shrq $56, %rdi
-; SSE-NEXT: andb $15, %dil
-; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %r11
-; SSE-NEXT: shrq $48, %rsi
-; SSE-NEXT: andb $15, %sil
-; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %r14
-; SSE-NEXT: shrq $40, %rdx
-; SSE-NEXT: andb $15, %dl
-; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rdx
-; SSE-NEXT: shrq $32, %rax
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: shrq $24, %r10
-; SSE-NEXT: andb $15, %r10b
-; SSE-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rdi
-; SSE-NEXT: shrq $16, %r9
-; SSE-NEXT: andb $15, %r9b
-; SSE-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rsi
-; SSE-NEXT: shrq $8, %r8
-; SSE-NEXT: andb $15, %r8b
-; SSE-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rbx
-; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andb $15, %cl
-; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $56, %rbx
-; SSE-NEXT: andb $15, %bl
-; SSE-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $48, %rsi
-; SSE-NEXT: andb $15, %sil
-; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $40, %rdi
-; SSE-NEXT: andb $15, %dil
-; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $32, %rax
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $24, %rdx
-; SSE-NEXT: andb $15, %dl
-; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $16, %r14
-; SSE-NEXT: andb $15, %r14b
-; SSE-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $8, %r11
-; SSE-NEXT: andb $15, %r11b
-; SSE-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper16xi8b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %r8
+; SSE2-NEXT: movq %rcx, %r9
+; SSE2-NEXT: movq %rcx, %r10
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rcx, %rdx
+; SSE2-NEXT: movq %rcx, %rsi
+; SSE2-NEXT: movq %rcx, %rdi
+; SSE2-NEXT: andb $15, %cl
+; SSE2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: shrq $56, %rdi
+; SSE2-NEXT: andb $15, %dil
+; SSE2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %r11
+; SSE2-NEXT: shrq $48, %rsi
+; SSE2-NEXT: andb $15, %sil
+; SSE2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %r14
+; SSE2-NEXT: shrq $40, %rdx
+; SSE2-NEXT: andb $15, %dl
+; SSE2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rdx
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: andb $15, %al
+; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: shrq $24, %r10
+; SSE2-NEXT: andb $15, %r10b
+; SSE2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rdi
+; SSE2-NEXT: shrq $16, %r9
+; SSE2-NEXT: andb $15, %r9b
+; SSE2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rsi
+; SSE2-NEXT: shrq $8, %r8
+; SSE2-NEXT: andb $15, %r8b
+; SSE2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rbx
+; SSE2-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: andb $15, %cl
+; SSE2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $56, %rbx
+; SSE2-NEXT: andb $15, %bl
+; SSE2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $48, %rsi
+; SSE2-NEXT: andb $15, %sil
+; SSE2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $40, %rdi
+; SSE2-NEXT: andb $15, %dil
+; SSE2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: andb $15, %al
+; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $24, %rdx
+; SSE2-NEXT: andb $15, %dl
+; SSE2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $16, %r14
+; SSE2-NEXT: andb $15, %r14b
+; SSE2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $8, %r11
+; SSE2-NEXT: andb $15, %r11b
+; SSE2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %r14
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper16xi8b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pushq %r14
+; SSE42-NEXT: pushq %rbx
+; SSE42-NEXT: movq %xmm0, %rcx
+; SSE42-NEXT: movq %rcx, %r8
+; SSE42-NEXT: movq %rcx, %r9
+; SSE42-NEXT: movq %rcx, %r10
+; SSE42-NEXT: movq %rcx, %rax
+; SSE42-NEXT: movq %rcx, %rdx
+; SSE42-NEXT: movq %rcx, %rsi
+; SSE42-NEXT: movq %rcx, %rdi
+; SSE42-NEXT: andb $15, %cl
+; SSE42-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: pextrq $1, %xmm0, %rcx
+; SSE42-NEXT: shrq $56, %rdi
+; SSE42-NEXT: andb $15, %dil
+; SSE42-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %r11
+; SSE42-NEXT: shrq $48, %rsi
+; SSE42-NEXT: andb $15, %sil
+; SSE42-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %r14
+; SSE42-NEXT: shrq $40, %rdx
+; SSE42-NEXT: andb $15, %dl
+; SSE42-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rdx
+; SSE42-NEXT: shrq $32, %rax
+; SSE42-NEXT: andb $15, %al
+; SSE42-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rax
+; SSE42-NEXT: shrq $24, %r10
+; SSE42-NEXT: andb $15, %r10b
+; SSE42-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rdi
+; SSE42-NEXT: shrq $16, %r9
+; SSE42-NEXT: andb $15, %r9b
+; SSE42-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rsi
+; SSE42-NEXT: shrq $8, %r8
+; SSE42-NEXT: andb $15, %r8b
+; SSE42-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rbx
+; SSE42-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: andb $15, %cl
+; SSE42-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $56, %rbx
+; SSE42-NEXT: andb $15, %bl
+; SSE42-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $48, %rsi
+; SSE42-NEXT: andb $15, %sil
+; SSE42-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $40, %rdi
+; SSE42-NEXT: andb $15, %dil
+; SSE42-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $32, %rax
+; SSE42-NEXT: andb $15, %al
+; SSE42-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $24, %rdx
+; SSE42-NEXT: andb $15, %dl
+; SSE42-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $16, %r14
+; SSE42-NEXT: andb $15, %r14b
+; SSE42-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $8, %r11
+; SSE42-NEXT: andb $15, %r11b
+; SSE42-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE42-NEXT: popq %rbx
+; SSE42-NEXT: popq %r14
+; SSE42-NEXT: retq
;
; AVX-LABEL: _clearupper16xi8b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
@@ -985,104 +1102,180 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
}
define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
-; SSE-LABEL: _clearupper32xi8b:
-; SSE: # BB#0:
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: movq %rcx, %r9
-; SSE-NEXT: movq %rcx, %r10
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: movq %rcx, %rdx
-; SSE-NEXT: movq %rcx, %rsi
-; SSE-NEXT: movq %rcx, %rdi
-; SSE-NEXT: andb $15, %cl
-; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %xmm2, %rcx
-; SSE-NEXT: shrq $56, %rdi
-; SSE-NEXT: andb $15, %dil
-; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %r11
-; SSE-NEXT: shrq $48, %rsi
-; SSE-NEXT: andb $15, %sil
-; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %r14
-; SSE-NEXT: shrq $40, %rdx
-; SSE-NEXT: andb $15, %dl
-; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rdx
-; SSE-NEXT: shrq $32, %rax
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: shrq $24, %r10
-; SSE-NEXT: andb $15, %r10b
-; SSE-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rdi
-; SSE-NEXT: shrq $16, %r9
-; SSE-NEXT: andb $15, %r9b
-; SSE-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rsi
-; SSE-NEXT: shrq $8, %r8
-; SSE-NEXT: andb $15, %r8b
-; SSE-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rcx, %rbx
-; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: andb $15, %cl
-; SSE-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $56, %rbx
-; SSE-NEXT: andb $15, %bl
-; SSE-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $48, %rsi
-; SSE-NEXT: andb $15, %sil
-; SSE-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $40, %rdi
-; SSE-NEXT: andb $15, %dil
-; SSE-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $32, %rax
-; SSE-NEXT: andb $15, %al
-; SSE-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $24, %rdx
-; SSE-NEXT: andb $15, %dl
-; SSE-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $16, %r14
-; SSE-NEXT: andb $15, %r14b
-; SSE-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: shrq $8, %r11
-; SSE-NEXT: andb $15, %r11b
-; SSE-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movb $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper32xi8b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %r8
+; SSE2-NEXT: movq %rcx, %r9
+; SSE2-NEXT: movq %rcx, %r10
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: movq %rcx, %rdx
+; SSE2-NEXT: movq %rcx, %rsi
+; SSE2-NEXT: movq %rcx, %rdi
+; SSE2-NEXT: andb $15, %cl
+; SSE2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: shrq $56, %rdi
+; SSE2-NEXT: andb $15, %dil
+; SSE2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %r11
+; SSE2-NEXT: shrq $48, %rsi
+; SSE2-NEXT: andb $15, %sil
+; SSE2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %r14
+; SSE2-NEXT: shrq $40, %rdx
+; SSE2-NEXT: andb $15, %dl
+; SSE2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rdx
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: andb $15, %al
+; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: shrq $24, %r10
+; SSE2-NEXT: andb $15, %r10b
+; SSE2-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rdi
+; SSE2-NEXT: shrq $16, %r9
+; SSE2-NEXT: andb $15, %r9b
+; SSE2-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rsi
+; SSE2-NEXT: shrq $8, %r8
+; SSE2-NEXT: andb $15, %r8b
+; SSE2-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq %rcx, %rbx
+; SSE2-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: andb $15, %cl
+; SSE2-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $56, %rbx
+; SSE2-NEXT: andb $15, %bl
+; SSE2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $48, %rsi
+; SSE2-NEXT: andb $15, %sil
+; SSE2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $40, %rdi
+; SSE2-NEXT: andb $15, %dil
+; SSE2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: andb $15, %al
+; SSE2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $24, %rdx
+; SSE2-NEXT: andb $15, %dl
+; SSE2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $16, %r14
+; SSE2-NEXT: andb $15, %r14b
+; SSE2-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shrq $8, %r11
+; SSE2-NEXT: andb $15, %r11b
+; SSE2-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %r14
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper32xi8b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pushq %r14
+; SSE42-NEXT: pushq %rbx
+; SSE42-NEXT: movq %xmm0, %rcx
+; SSE42-NEXT: movq %rcx, %r8
+; SSE42-NEXT: movq %rcx, %r9
+; SSE42-NEXT: movq %rcx, %r10
+; SSE42-NEXT: movq %rcx, %rax
+; SSE42-NEXT: movq %rcx, %rdx
+; SSE42-NEXT: movq %rcx, %rsi
+; SSE42-NEXT: movq %rcx, %rdi
+; SSE42-NEXT: andb $15, %cl
+; SSE42-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: pextrq $1, %xmm0, %rcx
+; SSE42-NEXT: shrq $56, %rdi
+; SSE42-NEXT: andb $15, %dil
+; SSE42-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %r11
+; SSE42-NEXT: shrq $48, %rsi
+; SSE42-NEXT: andb $15, %sil
+; SSE42-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %r14
+; SSE42-NEXT: shrq $40, %rdx
+; SSE42-NEXT: andb $15, %dl
+; SSE42-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rdx
+; SSE42-NEXT: shrq $32, %rax
+; SSE42-NEXT: andb $15, %al
+; SSE42-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rax
+; SSE42-NEXT: shrq $24, %r10
+; SSE42-NEXT: andb $15, %r10b
+; SSE42-NEXT: movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rdi
+; SSE42-NEXT: shrq $16, %r9
+; SSE42-NEXT: andb $15, %r9b
+; SSE42-NEXT: movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rsi
+; SSE42-NEXT: shrq $8, %r8
+; SSE42-NEXT: andb $15, %r8b
+; SSE42-NEXT: movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq %rcx, %rbx
+; SSE42-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: andb $15, %cl
+; SSE42-NEXT: movb %cl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $56, %rbx
+; SSE42-NEXT: andb $15, %bl
+; SSE42-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $48, %rsi
+; SSE42-NEXT: andb $15, %sil
+; SSE42-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $40, %rdi
+; SSE42-NEXT: andb $15, %dil
+; SSE42-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $32, %rax
+; SSE42-NEXT: andb $15, %al
+; SSE42-NEXT: movb %al, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $24, %rdx
+; SSE42-NEXT: andb $15, %dl
+; SSE42-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $16, %r14
+; SSE42-NEXT: andb $15, %r14b
+; SSE42-NEXT: movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: shrq $8, %r11
+; SSE42-NEXT: andb $15, %r11b
+; SSE42-NEXT: movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movb $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE42-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE42-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE42-NEXT: popq %rbx
+; SSE42-NEXT: popq %r14
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper32xi8b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; AVX1-NEXT: movq %rcx, %r8
; AVX1-NEXT: movq %rcx, %r9
; AVX1-NEXT: movq %rcx, %r10
; AVX1-NEXT: movq %rcx, %r11
; AVX1-NEXT: movq %rcx, %r14
; AVX1-NEXT: movq %rcx, %r15
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r12
; AVX1-NEXT: movq %rdx, %r13
-; AVX1-NEXT: movq %rdx, %rbx
-; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: movq %rdx, %rdi
+; AVX1-NEXT: movq %rdx, %rax
; AVX1-NEXT: movq %rdx, %rsi
+; AVX1-NEXT: movq %rdx, %rbx
; AVX1-NEXT: movq %rdx, %rbp
; AVX1-NEXT: andb $15, %dl
; AVX1-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
@@ -1092,18 +1285,18 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX1-NEXT: shrq $56, %rbp
; AVX1-NEXT: andb $15, %bpl
; AVX1-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: shrq $48, %rbx
+; AVX1-NEXT: andb $15, %bl
+; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $40, %rsi
; AVX1-NEXT: andb $15, %sil
; AVX1-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $40, %rdi
-; AVX1-NEXT: andb $15, %dil
-; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: shrq $32, %rax
; AVX1-NEXT: andb $15, %al
; AVX1-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: shrq $24, %rbx
-; AVX1-NEXT: andb $15, %bl
-; AVX1-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shrq $24, %rdi
+; AVX1-NEXT: andb $15, %dil
+; AVX1-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: shrq $16, %r13
; AVX1-NEXT: andb $15, %r13b
; AVX1-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
@@ -1232,27 +1425,28 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper32xi8b:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; AVX2-NEXT: movq %rcx, %r8
; AVX2-NEXT: movq %rcx, %r9
; AVX2-NEXT: movq %rcx, %r10
; AVX2-NEXT: movq %rcx, %r11
; AVX2-NEXT: movq %rcx, %r14
; AVX2-NEXT: movq %rcx, %r15
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r12
; AVX2-NEXT: movq %rdx, %r13
-; AVX2-NEXT: movq %rdx, %rbx
-; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: movq %rdx, %rax
; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: movq %rdx, %rbx
; AVX2-NEXT: movq %rdx, %rbp
; AVX2-NEXT: andb $15, %dl
; AVX2-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
@@ -1262,18 +1456,18 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
; AVX2-NEXT: shrq $56, %rbp
; AVX2-NEXT: andb $15, %bpl
; AVX2-NEXT: movb %bpl, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: shrq $48, %rbx
+; AVX2-NEXT: andb $15, %bl
+; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $40, %rsi
; AVX2-NEXT: andb $15, %sil
; AVX2-NEXT: movb %sil, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $40, %rdi
-; AVX2-NEXT: andb $15, %dil
-; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: shrq $32, %rax
; AVX2-NEXT: andb $15, %al
; AVX2-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: shrq $24, %rbx
-; AVX2-NEXT: andb $15, %bl
-; AVX2-NEXT: movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shrq $24, %rdi
+; AVX2-NEXT: andb $15, %dil
+; AVX2-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: shrq $16, %r13
; AVX2-NEXT: andb $15, %r13b
; AVX2-NEXT: movb %r13b, -{{[0-9]+}}(%rsp)
@@ -1438,57 +1632,70 @@ define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
}
define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
-; SSE-LABEL: _clearupper2xi64c:
-; SSE: # BB#0:
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper2xi64c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper2xi64c:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper2xi64c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper2xi64c:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%r = and <2 x i64> <i64 4294967295, i64 4294967295>, %0
ret <2 x i64> %r
}
define <4 x i64> @_clearupper4xi64c(<4 x i64>) nounwind {
-; SSE-LABEL: _clearupper4xi64c:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper4xi64c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: retq
;
-; AVX1-LABEL: _clearupper4xi64c:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: retq
+; SSE42-LABEL: _clearupper4xi64c:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT: retq
;
-; AVX2-LABEL: _clearupper4xi64c:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: retq
+; AVX-LABEL: _clearupper4xi64c:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX-NEXT: retq
%r = and <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>, %0
ret <4 x i64> %r
}
define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
-; SSE-LABEL: _clearupper4xi32c:
-; SSE: # BB#0:
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper4xi32c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper4xi32c:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm1, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE42-NEXT: retq
;
; AVX-LABEL: _clearupper4xi32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: retq
@@ -1497,21 +1704,28 @@ define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
}
define <8 x i32> @_clearupper8xi32c(<8 x i32>) nounwind {
-; SSE-LABEL: _clearupper8xi32c:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: _clearupper8xi32c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: andps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _clearupper8xi32c:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE42-NEXT: retq
;
; AVX1-LABEL: _clearupper8xi32c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper8xi32c:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%r = and <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>, %0
@@ -1520,12 +1734,12 @@ define <8 x i32> @_clearupper8xi32c(<8 x i32>) nounwind {
define <8 x i16> @_clearupper8xi16c(<8 x i16>) nounwind {
; SSE-LABEL: _clearupper8xi16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper8xi16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%r = and <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>, %0
@@ -1534,14 +1748,14 @@ define <8 x i16> @_clearupper8xi16c(<8 x i16>) nounwind {
define <16 x i16> @_clearupper16xi16c(<16 x i16>) nounwind {
; SSE-LABEL: _clearupper16xi16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper16xi16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%r = and <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>, %0
@@ -1550,12 +1764,12 @@ define <16 x i16> @_clearupper16xi16c(<16 x i16>) nounwind {
define <16 x i8> @_clearupper16xi8c(<16 x i8>) nounwind {
; SSE-LABEL: _clearupper16xi8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper16xi8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%r = and <16 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, %0
@@ -1564,14 +1778,14 @@ define <16 x i8> @_clearupper16xi8c(<16 x i8>) nounwind {
define <32 x i8> @_clearupper32xi8c(<32 x i8>) nounwind {
; SSE-LABEL: _clearupper32xi8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper32xi8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%r = and <32 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, %0
diff --git a/test/CodeGen/X86/clflushopt-schedule.ll b/test/CodeGen/X86/clflushopt-schedule.ll
new file mode 100644
index 000000000000..14b4551cabc2
--- /dev/null
+++ b/test/CodeGen/X86/clflushopt-schedule.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+clflushopt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define void @clflushopt(i8* %p) nounwind {
+; GENERIC-LABEL: clflushopt:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: clflushopt (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: clflushopt:
+; GLM: # %bb.0:
+; GLM-NEXT: clflushopt (%rdi) # sched: [3:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; SKYLAKE-LABEL: clflushopt:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: clflushopt (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: clflushopt:
+; SKX: # %bb.0:
+; SKX-NEXT: clflushopt (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: clflushopt:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: clflushopt (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.clflushopt(i8* %p)
+ ret void
+}
+declare void @llvm.x86.clflushopt(i8*) nounwind
diff --git a/test/CodeGen/X86/clflushopt.ll b/test/CodeGen/X86/clflushopt.ll
index ee416eb96c5e..decd4cc35aba 100644
--- a/test/CodeGen/X86/clflushopt.ll
+++ b/test/CodeGen/X86/clflushopt.ll
@@ -1,12 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=clflushopt | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=clflushopt | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=clflushopt | FileCheck %s --check-prefix=X64
define void @clflushopt(i8* %p) nounwind {
-; CHECK-LABEL: clflushopt:
-; CHECK: ## BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: clflushopt (%eax)
-; CHECK-NEXT: retl
+; X86-LABEL: clflushopt:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: clflushopt (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: clflushopt:
+; X64: ## %bb.0:
+; X64-NEXT: clflushopt (%rdi)
+; X64-NEXT: retq
tail call void @llvm.x86.clflushopt(i8* %p)
ret void
}
diff --git a/test/CodeGen/X86/clwb-schedule.ll b/test/CodeGen/X86/clwb-schedule.ll
new file mode 100644
index 000000000000..24931ad549db
--- /dev/null
+++ b/test/CodeGen/X86/clwb-schedule.ll
@@ -0,0 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+clwb | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
+
+define void @clwb(i8* %a0) nounwind {
+; GENERIC-LABEL: clwb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: clwb (%rdi) # sched: [4:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKX-LABEL: clwb:
+; SKX: # %bb.0:
+; SKX-NEXT: clwb (%rdi) # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+ tail call void @llvm.x86.clwb(i8* %a0)
+ ret void
+}
+declare void @llvm.x86.clwb(i8*) nounwind
diff --git a/test/CodeGen/X86/clwb.ll b/test/CodeGen/X86/clwb.ll
new file mode 100644
index 000000000000..0bbb14917f7f
--- /dev/null
+++ b/test/CodeGen/X86/clwb.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=clwb | FileCheck %s
+
+define void @clwb(i8* %p) nounwind {
+; CHECK-LABEL: clwb:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: clwb (%eax)
+; CHECK-NEXT: retl
+ tail call void @llvm.x86.clwb(i8* %p)
+ ret void
+}
+declare void @llvm.x86.clwb(i8*) nounwind
diff --git a/test/CodeGen/X86/clz.ll b/test/CodeGen/X86/clz.ll
index 9d827fc88b34..bd63a8006e51 100644
--- a/test/CodeGen/X86/clz.ll
+++ b/test/CodeGen/X86/clz.ll
@@ -16,31 +16,31 @@ declare i64 @llvm.ctlz.i64(i64, i1)
define i8 @cttz_i8(i8 %x) {
; X32-LABEL: cttz_i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: bsfl %eax, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsfl %eax, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i8:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: tzcntl %eax, %eax
-; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i8:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: tzcntl %eax, %eax
-; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X64-CLZ-NEXT: retq
%tmp = call i8 @llvm.cttz.i8( i8 %x, i1 true )
ret i8 %tmp
@@ -48,22 +48,22 @@ define i8 @cttz_i8(i8 %x) {
define i16 @cttz_i16(i16 %x) {
; X32-LABEL: cttz_i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: bsfw {{[0-9]+}}(%esp), %ax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsfw %di, %ax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i16:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i16:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: tzcntw %di, %ax
; X64-CLZ-NEXT: retq
%tmp = call i16 @llvm.cttz.i16( i16 %x, i1 true )
@@ -72,22 +72,22 @@ define i16 @cttz_i16(i16 %x) {
define i32 @cttz_i32(i32 %x) {
; X32-LABEL: cttz_i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i32:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i32:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: tzcntl %edi, %eax
; X64-CLZ-NEXT: retq
%tmp = call i32 @llvm.cttz.i32( i32 %x, i1 true )
@@ -96,11 +96,11 @@ define i32 @cttz_i32(i32 %x) {
define i64 @cttz_i64(i64 %x) {
; X32-LABEL: cttz_i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: jne .LBB3_1
-; X32-NEXT: # BB#2:
+; X32-NEXT: # %bb.2:
; X32-NEXT: bsfl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
@@ -111,16 +111,16 @@ define i64 @cttz_i64(i64 %x) {
; X32-NEXT: retl
;
; X64-LABEL: cttz_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsfq %rdi, %rax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i64:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB3_1
-; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: # %bb.2:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
@@ -131,7 +131,7 @@ define i64 @cttz_i64(i64 %x) {
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i64:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: tzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
%tmp = call i64 @llvm.cttz.i64( i64 %x, i1 true )
@@ -140,35 +140,35 @@ define i64 @cttz_i64(i64 %x) {
define i8 @ctlz_i8(i8 %x) {
; X32-LABEL: ctlz_i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $7, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsrl %eax, %eax
; X64-NEXT: xorl $7, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i8:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: addl $-24, %eax
-; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i8:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
; X64-CLZ-NEXT: addl $-24, %eax
-; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X64-CLZ-NEXT: retq
%tmp2 = call i8 @llvm.ctlz.i8( i8 %x, i1 true )
ret i8 %tmp2
@@ -176,26 +176,26 @@ define i8 @ctlz_i8(i8 %x) {
define i16 @ctlz_i16(i16 %x) {
; X32-LABEL: ctlz_i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: bsrw {{[0-9]+}}(%esp), %ax
; X32-NEXT: xorl $15, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsrw %di, %ax
; X64-NEXT: xorl $15, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i16:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i16:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntw %di, %ax
; X64-CLZ-NEXT: retq
%tmp2 = call i16 @llvm.ctlz.i16( i16 %x, i1 true )
@@ -204,24 +204,24 @@ define i16 @ctlz_i16(i16 %x) {
define i32 @ctlz_i32(i32 %x) {
; X32-LABEL: ctlz_i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i32:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i32:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: retq
%tmp = call i32 @llvm.ctlz.i32( i32 %x, i1 true )
@@ -230,11 +230,11 @@ define i32 @ctlz_i32(i32 %x) {
define i64 @ctlz_i64(i64 %x) {
; X32-LABEL: ctlz_i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: jne .LBB7_1
-; X32-NEXT: # BB#2:
+; X32-NEXT: # %bb.2:
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: addl $32, %eax
@@ -247,17 +247,17 @@ define i64 @ctlz_i64(i64 %x) {
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i64:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB7_1
-; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: # %bb.2:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
@@ -268,7 +268,7 @@ define i64 @ctlz_i64(i64 %x) {
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i64:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
%tmp = call i64 @llvm.ctlz.i64( i64 %x, i1 true )
@@ -278,50 +278,50 @@ define i64 @ctlz_i64(i64 %x) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i8 @ctlz_i8_zero_test(i8 %n) {
; X32-LABEL: ctlz_i8_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: testb %al, %al
; X32-NEXT: je .LBB8_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $7, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
; X32-NEXT: .LBB8_1:
; X32-NEXT: movb $8, %al
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i8_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testb %dil, %dil
; X64-NEXT: je .LBB8_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsrl %eax, %eax
; X64-NEXT: xorl $7, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
; X64-NEXT: .LBB8_1:
; X64-NEXT: movb $8, %al
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i8_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: addl $-24, %eax
-; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i8_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
; X64-CLZ-NEXT: addl $-24, %eax
-; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X64-CLZ-NEXT: retq
%tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
ret i8 %tmp1
@@ -330,41 +330,41 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i16 @ctlz_i16_zero_test(i16 %n) {
; X32-LABEL: ctlz_i16_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testw %ax, %ax
; X32-NEXT: je .LBB9_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: bsrw %ax, %ax
; X32-NEXT: xorl $15, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
; X32-NEXT: .LBB9_1:
; X32-NEXT: movw $16, %ax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i16_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testw %di, %di
; X64-NEXT: je .LBB9_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsrw %di, %ax
; X64-NEXT: xorl $15, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
; X64-NEXT: .LBB9_1:
; X64-NEXT: movw $16, %ax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i16_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i16_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntw %di, %ax
; X64-CLZ-NEXT: retq
%tmp1 = call i16 @llvm.ctlz.i16(i16 %n, i1 false)
@@ -374,11 +374,11 @@ define i16 @ctlz_i16_zero_test(i16 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i32 @ctlz_i32_zero_test(i32 %n) {
; X32-LABEL: ctlz_i32_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB10_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
@@ -387,10 +387,10 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i32_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB10_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
@@ -399,12 +399,12 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i32_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i32_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: retq
%tmp1 = call i32 @llvm.ctlz.i32(i32 %n, i1 false)
@@ -414,17 +414,17 @@ define i32 @ctlz_i32_zero_test(i32 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i64 @ctlz_i64_zero_test(i64 %n) {
; X32-LABEL: ctlz_i64_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl $63, %eax
; X32-NEXT: je .LBB11_2
-; X32-NEXT: # BB#1:
+; X32-NEXT: # %bb.1:
; X32-NEXT: movl %edx, %eax
; X32-NEXT: .LBB11_2:
; X32-NEXT: testl %ecx, %ecx
; X32-NEXT: jne .LBB11_3
-; X32-NEXT: # BB#4:
+; X32-NEXT: # %bb.4:
; X32-NEXT: xorl $31, %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
@@ -436,10 +436,10 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i64_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: je .LBB11_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsrq %rdi, %rax
; X64-NEXT: xorq $63, %rax
; X64-NEXT: retq
@@ -448,11 +448,11 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i64_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB11_1
-; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: # %bb.2:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
@@ -463,7 +463,7 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i64_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
%tmp1 = call i64 @llvm.ctlz.i64(i64 %n, i1 false)
@@ -473,48 +473,48 @@ define i64 @ctlz_i64_zero_test(i64 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i8 @cttz_i8_zero_test(i8 %n) {
; X32-LABEL: cttz_i8_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: testb %al, %al
; X32-NEXT: je .LBB12_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: bsfl %eax, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
; X32-NEXT: .LBB12_1
; X32-NEXT: movb $8, %al
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i8_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testb %dil, %dil
; X64-NEXT: je .LBB12_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsfl %eax, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
; X64-NEXT: .LBB12_1:
; X64-NEXT: movb $8, %al
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i8_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: orl $256, %eax # imm = 0x100
; X32-CLZ-NEXT: tzcntl %eax, %eax
-; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i8_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: orl $256, %eax # imm = 0x100
; X64-CLZ-NEXT: tzcntl %eax, %eax
-; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X64-CLZ-NEXT: retq
%tmp1 = call i8 @llvm.cttz.i8(i8 %n, i1 false)
ret i8 %tmp1
@@ -523,11 +523,11 @@ define i8 @cttz_i8_zero_test(i8 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i16 @cttz_i16_zero_test(i16 %n) {
; X32-LABEL: cttz_i16_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testw %ax, %ax
; X32-NEXT: je .LBB13_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: bsfw %ax, %ax
; X32-NEXT: retl
; X32-NEXT: .LBB13_1
@@ -535,10 +535,10 @@ define i16 @cttz_i16_zero_test(i16 %n) {
; X32-NEXT: retl
;
; X64-LABEL: cttz_i16_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testw %di, %di
; X64-NEXT: je .LBB13_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsfw %di, %ax
; X64-NEXT: retq
; X64-NEXT: .LBB13_1:
@@ -546,12 +546,12 @@ define i16 @cttz_i16_zero_test(i16 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i16_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: tzcntw {{[0-9]+}}(%esp), %ax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i16_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: tzcntw %di, %ax
; X64-CLZ-NEXT: retq
%tmp1 = call i16 @llvm.cttz.i16(i16 %n, i1 false)
@@ -561,11 +561,11 @@ define i16 @cttz_i16_zero_test(i16 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i32 @cttz_i32_zero_test(i32 %n) {
; X32-LABEL: cttz_i32_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB14_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: bsfl %eax, %eax
; X32-NEXT: retl
; X32-NEXT: .LBB14_1
@@ -573,10 +573,10 @@ define i32 @cttz_i32_zero_test(i32 %n) {
; X32-NEXT: retl
;
; X64-LABEL: cttz_i32_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB14_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsfl %edi, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB14_1:
@@ -584,12 +584,12 @@ define i32 @cttz_i32_zero_test(i32 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i32_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i32_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: tzcntl %edi, %eax
; X64-CLZ-NEXT: retq
%tmp1 = call i32 @llvm.cttz.i32(i32 %n, i1 false)
@@ -599,17 +599,17 @@ define i32 @cttz_i32_zero_test(i32 %n) {
; Generate a test and branch to handle zero inputs because bsr/bsf are very slow.
define i64 @cttz_i64_zero_test(i64 %n) {
; X32-LABEL: cttz_i64_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: bsfl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl $32, %eax
; X32-NEXT: je .LBB15_2
-; X32-NEXT: # BB#1:
+; X32-NEXT: # %bb.1:
; X32-NEXT: movl %edx, %eax
; X32-NEXT: .LBB15_2:
; X32-NEXT: testl %ecx, %ecx
; X32-NEXT: jne .LBB15_3
-; X32-NEXT: # BB#4:
+; X32-NEXT: # %bb.4:
; X32-NEXT: addl $32, %eax
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
@@ -619,10 +619,10 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; X32-NEXT: retl
;
; X64-LABEL: cttz_i64_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: je .LBB15_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsfq %rdi, %rax
; X64-NEXT: retq
; X64-NEXT: .LBB15_1:
@@ -630,11 +630,11 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i64_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: testl %eax, %eax
; X32-CLZ-NEXT: jne .LBB15_1
-; X32-CLZ-NEXT: # BB#2:
+; X32-CLZ-NEXT: # %bb.2:
; X32-CLZ-NEXT: tzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: addl $32, %eax
; X32-CLZ-NEXT: xorl %edx, %edx
@@ -645,7 +645,7 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i64_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: tzcntq %rdi, %rax
; X64-CLZ-NEXT: retq
%tmp1 = call i64 @llvm.cttz.i64(i64 %n, i1 false)
@@ -659,11 +659,11 @@ define i64 @cttz_i64_zero_test(i64 %n) {
; codegen doesn't know how to delete the movl and je.
define i32 @ctlz_i32_fold_cmov(i32 %n) {
; X32-LABEL: ctlz_i32_fold_cmov:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: orl $1, %eax
; X32-NEXT: je .LBB16_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: retl
@@ -672,10 +672,10 @@ define i32 @ctlz_i32_fold_cmov(i32 %n) {
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i32_fold_cmov:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orl $1, %edi
; X64-NEXT: je .LBB16_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: retq
@@ -684,14 +684,14 @@ define i32 @ctlz_i32_fold_cmov(i32 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i32_fold_cmov:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: orl $1, %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i32_fold_cmov:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: orl $1, %edi
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: retq
@@ -705,23 +705,23 @@ define i32 @ctlz_i32_fold_cmov(i32 %n) {
; FIXME: We should probably select BSR instead of LZCNT in these circumstances.
define i32 @ctlz_bsr(i32 %n) {
; X32-LABEL: ctlz_bsr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: bsrl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_bsr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_bsr:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: xorl $31, %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_bsr:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: xorl $31, %eax
; X64-CLZ-NEXT: retq
@@ -735,11 +735,11 @@ define i32 @ctlz_bsr(i32 %n) {
; codegen doesn't know how to combine the $32 and $31 into $63.
define i32 @ctlz_bsr_zero_test(i32 %n) {
; X32-LABEL: ctlz_bsr_zero_test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB18_1
-; X32-NEXT: # BB#2: # %cond.false
+; X32-NEXT: # %bb.2: # %cond.false
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $31, %eax
; X32-NEXT: xorl $31, %eax
@@ -750,10 +750,10 @@ define i32 @ctlz_bsr_zero_test(i32 %n) {
; X32-NEXT: retl
;
; X64-LABEL: ctlz_bsr_zero_test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testl %edi, %edi
; X64-NEXT: je .LBB18_1
-; X64-NEXT: # BB#2: # %cond.false
+; X64-NEXT: # %bb.2: # %cond.false
; X64-NEXT: bsrl %edi, %eax
; X64-NEXT: xorl $31, %eax
; X64-NEXT: xorl $31, %eax
@@ -764,13 +764,13 @@ define i32 @ctlz_bsr_zero_test(i32 %n) {
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_bsr_zero_test:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax
; X32-CLZ-NEXT: xorl $31, %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_bsr_zero_test:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: lzcntl %edi, %eax
; X64-CLZ-NEXT: xorl $31, %eax
; X64-CLZ-NEXT: retq
@@ -781,37 +781,37 @@ define i32 @ctlz_bsr_zero_test(i32 %n) {
define i8 @cttz_i8_knownbits(i8 %x) {
; X32-LABEL: cttz_i8_knownbits:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: orb $2, %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: bsfl %eax, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: cttz_i8_knownbits:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orb $2, %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsfl %eax, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: cttz_i8_knownbits:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-CLZ-NEXT: orb $2, %al
; X32-CLZ-NEXT: movzbl %al, %eax
; X32-CLZ-NEXT: tzcntl %eax, %eax
-; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: cttz_i8_knownbits:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: orb $2, %dil
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: tzcntl %eax, %eax
-; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X64-CLZ-NEXT: retq
%x2 = or i8 %x, 2
%tmp = call i8 @llvm.cttz.i8(i8 %x2, i1 true )
@@ -821,41 +821,41 @@ define i8 @cttz_i8_knownbits(i8 %x) {
define i8 @ctlz_i8_knownbits(i8 %x) {
; X32-LABEL: ctlz_i8_knownbits:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: orb $64, %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: bsrl %eax, %eax
; X32-NEXT: xorl $7, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: ctlz_i8_knownbits:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orb $64, %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: bsrl %eax, %eax
; X64-NEXT: xorl $7, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-CLZ-LABEL: ctlz_i8_knownbits:
-; X32-CLZ: # BB#0:
+; X32-CLZ: # %bb.0:
; X32-CLZ-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-CLZ-NEXT: orb $64, %al
; X32-CLZ-NEXT: movzbl %al, %eax
; X32-CLZ-NEXT: lzcntl %eax, %eax
; X32-CLZ-NEXT: addl $-24, %eax
-; X32-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X32-CLZ-NEXT: retl
;
; X64-CLZ-LABEL: ctlz_i8_knownbits:
-; X64-CLZ: # BB#0:
+; X64-CLZ: # %bb.0:
; X64-CLZ-NEXT: orb $64, %dil
; X64-CLZ-NEXT: movzbl %dil, %eax
; X64-CLZ-NEXT: lzcntl %eax, %eax
; X64-CLZ-NEXT: addl $-24, %eax
-; X64-CLZ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-CLZ-NEXT: # kill: def %al killed %al killed %eax
; X64-CLZ-NEXT: retq
%x2 = or i8 %x, 64
diff --git a/test/CodeGen/X86/clzero-schedule.ll b/test/CodeGen/X86/clzero-schedule.ll
new file mode 100644
index 000000000000..3a1c1b2cdc7d
--- /dev/null
+++ b/test/CodeGen/X86/clzero-schedule.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+clzero | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
+
+define void @test_clzero(i8* %p) {
+; GENERIC-LABEL: test_clzero:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: clzero # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ZNVER1-LABEL: test_clzero:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq (%rdi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: clzero # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.clzero(i8* %p)
+ ret void
+}
+declare void @llvm.x86.clzero(i8*)
diff --git a/test/CodeGen/X86/clzero.ll b/test/CodeGen/X86/clzero.ll
index f15d4deedeff..d08470dda925 100644
--- a/test/CodeGen/X86/clzero.ll
+++ b/test/CodeGen/X86/clzero.ll
@@ -4,13 +4,13 @@
define void @foo(i8* %p) #0 {
; X64-LABEL: foo:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: leaq (%rdi), %rax
; X64-NEXT: clzero
; X64-NEXT: retq
;
; X32-LABEL: foo:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: leal (%eax), %eax
; X32-NEXT: clzero
diff --git a/test/CodeGen/X86/cmov-fp.ll b/test/CodeGen/X86/cmov-fp.ll
index 768af943eb49..d32ccffe4932 100644
--- a/test/CodeGen/X86/cmov-fp.ll
+++ b/test/CodeGen/X86/cmov-fp.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march x86 -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE
-; RUN: llc -march x86 -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2
-; RUN: llc -march x86 -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1
-; RUN: llc -march x86 -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV
+; RUN: llc -mtriple=i686-- -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -mtriple=i686-- -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2
+; RUN: llc -mtriple=i686-- -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1
+; RUN: llc -mtriple=i686-- -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV
; PR14035
define double @test1(i32 %a, i32 %b, double %x) nounwind {
diff --git a/test/CodeGen/X86/cmov-into-branch.ll b/test/CodeGen/X86/cmov-into-branch.ll
index e38039501646..c18a9ca7459c 100644
--- a/test/CodeGen/X86/cmov-into-branch.ll
+++ b/test/CodeGen/X86/cmov-into-branch.ll
@@ -4,7 +4,7 @@
; cmp with single-use load, should not form branch.
define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: ucomisd (%rdi), %xmm0
; CHECK-NEXT: cmovbel %edx, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -18,7 +18,7 @@ define i32 @test1(double %a, double* nocapture %b, i32 %x, i32 %y) {
; Sanity check: no load.
define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: ucomisd %xmm1, %xmm0
; CHECK-NEXT: cmovbel %esi, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -31,7 +31,7 @@ define i32 @test2(double %a, double %b, i32 %x, i32 %y) {
; Multiple uses of the load.
define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: cmpl %edi, %eax
; CHECK-NEXT: cmovael %ecx, %edx
@@ -47,7 +47,7 @@ define i32 @test4(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
; Multiple uses of the cmp.
define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %edi, (%rsi)
; CHECK-NEXT: cmoval %edi, %ecx
; CHECK-NEXT: cmovael %edx, %ecx
@@ -61,10 +61,28 @@ define i32 @test5(i32 %a, i32* nocapture %b, i32 %x, i32 %y) {
ret i32 %cond5
}
+; Zero-extended select.
+define void @test6(i32 %a, i32 %x, i32* %y.ptr, i64* %z.ptr) {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmovnsl (%rdx), %esi
+; CHECK-NEXT: movq %rsi, (%rcx)
+; CHECK-NEXT: retq
+entry:
+ %y = load i32, i32* %y.ptr
+ %cmp = icmp slt i32 %a, 0
+ %z = select i1 %cmp, i32 %x, i32 %y
+ %z.ext = zext i32 %z to i64
+ store i64 %z.ext, i64* %z.ptr
+ ret void
+}
+
; If a select is not obviously predictable, don't turn it into a branch.
define i32 @weighted_select1(i32 %a, i32 %b) {
; CHECK-LABEL: weighted_select1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnel %edi, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -77,12 +95,12 @@ define i32 @weighted_select1(i32 %a, i32 %b) {
; If a select is obviously predictable, turn it into a branch.
define i32 @weighted_select2(i32 %a, i32 %b) {
; CHECK-LABEL: weighted_select2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: jne .LBB5_2
-; CHECK-NEXT: # BB#1: # %select.false
+; CHECK-NEXT: jne .LBB6_2
+; CHECK-NEXT: # %bb.1: # %select.false
; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: .LBB5_2: # %select.end
+; CHECK-NEXT: .LBB6_2: # %select.end
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%cmp = icmp ne i32 %a, 0
@@ -96,13 +114,13 @@ define i32 @weighted_select2(i32 %a, i32 %b) {
; TODO: But likely true vs. likely false should affect basic block placement?
define i32 @weighted_select3(i32 %a, i32 %b) {
; CHECK-LABEL: weighted_select3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: je .LBB6_1
-; CHECK-NEXT: # BB#2: # %select.end
+; CHECK-NEXT: je .LBB7_1
+; CHECK-NEXT: # %bb.2: # %select.end
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB6_1: # %select.false
+; CHECK-NEXT: .LBB7_1: # %select.false
; CHECK-NEXT: movl %esi, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
@@ -114,7 +132,7 @@ define i32 @weighted_select3(i32 %a, i32 %b) {
; Weightlessness is no reason to die.
define i32 @unweighted_select(i32 %a, i32 %b) {
; CHECK-LABEL: unweighted_select:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: cmovnel %edi, %esi
; CHECK-NEXT: movl %esi, %eax
diff --git a/test/CodeGen/X86/cmov-promotion.ll b/test/CodeGen/X86/cmov-promotion.ll
new file mode 100644
index 000000000000..8e34b62eadbd
--- /dev/null
+++ b/test/CodeGen/X86/cmov-promotion.ll
@@ -0,0 +1,317 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+cmov | FileCheck %s --check-prefix=CMOV
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-cmov | FileCheck %s --check-prefix=NO_CMOV
+
+define i16 @cmov_zpromotion_8_to_16(i1 %c) {
+; CMOV-LABEL: cmov_zpromotion_8_to_16:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movb $117, %al
+; CMOV-NEXT: jne .LBB0_2
+; CMOV-NEXT: # %bb.1:
+; CMOV-NEXT: movb $-19, %al
+; CMOV-NEXT: .LBB0_2:
+; CMOV-NEXT: movzbl %al, %eax
+; CMOV-NEXT: # kill: def %ax killed %ax killed %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_zpromotion_8_to_16:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movb $117, %al
+; NO_CMOV-NEXT: jne .LBB0_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movb $-19, %al
+; NO_CMOV-NEXT: .LBB0_2:
+; NO_CMOV-NEXT: movzbl %al, %eax
+; NO_CMOV-NEXT: # kill: def %ax killed %ax killed %eax
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i8 117, i8 -19
+ %ret = zext i8 %t0 to i16
+ ret i16 %ret
+}
+
+define i32 @cmov_zpromotion_8_to_32(i1 %c) {
+; CMOV-LABEL: cmov_zpromotion_8_to_32:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movb $126, %al
+; CMOV-NEXT: jne .LBB1_2
+; CMOV-NEXT: # %bb.1:
+; CMOV-NEXT: movb $-1, %al
+; CMOV-NEXT: .LBB1_2:
+; CMOV-NEXT: movzbl %al, %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_zpromotion_8_to_32:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movb $126, %al
+; NO_CMOV-NEXT: jne .LBB1_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movb $-1, %al
+; NO_CMOV-NEXT: .LBB1_2:
+; NO_CMOV-NEXT: movzbl %al, %eax
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i8 12414, i8 -1
+ %ret = zext i8 %t0 to i32
+ ret i32 %ret
+}
+
+define i64 @cmov_zpromotion_8_to_64(i1 %c) {
+; CMOV-LABEL: cmov_zpromotion_8_to_64:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movb $126, %al
+; CMOV-NEXT: jne .LBB2_2
+; CMOV-NEXT: # %bb.1:
+; CMOV-NEXT: movb $-1, %al
+; CMOV-NEXT: .LBB2_2:
+; CMOV-NEXT: movzbl %al, %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_zpromotion_8_to_64:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movb $126, %al
+; NO_CMOV-NEXT: jne .LBB2_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movb $-1, %al
+; NO_CMOV-NEXT: .LBB2_2:
+; NO_CMOV-NEXT: movzbl %al, %eax
+; NO_CMOV-NEXT: xorl %edx, %edx
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i8 12414, i8 -1
+ %ret = zext i8 %t0 to i64
+ ret i64 %ret
+}
+
+define i32 @cmov_zpromotion_16_to_32(i1 %c) {
+; CMOV-LABEL: cmov_zpromotion_16_to_32:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E
+; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF
+; CMOV-NEXT: cmovnel %ecx, %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_zpromotion_16_to_32:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; NO_CMOV-NEXT: jne .LBB3_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF
+; NO_CMOV-NEXT: .LBB3_2:
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i16 12414, i16 -1
+ %ret = zext i16 %t0 to i32
+ ret i32 %ret
+}
+
+define i64 @cmov_zpromotion_16_to_64(i1 %c) {
+; CMOV-LABEL: cmov_zpromotion_16_to_64:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E
+; CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF
+; CMOV-NEXT: cmovneq %rcx, %rax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_zpromotion_16_to_64:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; NO_CMOV-NEXT: jne .LBB4_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movl $65535, %eax # imm = 0xFFFF
+; NO_CMOV-NEXT: .LBB4_2:
+; NO_CMOV-NEXT: xorl %edx, %edx
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i16 12414, i16 -1
+ %ret = zext i16 %t0 to i64
+ ret i64 %ret
+}
+
+define i64 @cmov_zpromotion_32_to_64(i1 %c) {
+; CMOV-LABEL: cmov_zpromotion_32_to_64:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E
+; CMOV-NEXT: movl $-1, %eax
+; CMOV-NEXT: cmovnel %ecx, %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_zpromotion_32_to_64:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; NO_CMOV-NEXT: jne .LBB5_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movl $-1, %eax
+; NO_CMOV-NEXT: .LBB5_2:
+; NO_CMOV-NEXT: xorl %edx, %edx
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i32 12414, i32 -1
+ %ret = zext i32 %t0 to i64
+ ret i64 %ret
+}
+
+define i16 @cmov_spromotion_8_to_16(i1 %c) {
+; CMOV-LABEL: cmov_spromotion_8_to_16:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movb $117, %al
+; CMOV-NEXT: jne .LBB6_2
+; CMOV-NEXT: # %bb.1:
+; CMOV-NEXT: movb $-19, %al
+; CMOV-NEXT: .LBB6_2:
+; CMOV-NEXT: movsbl %al, %eax
+; CMOV-NEXT: # kill: def %ax killed %ax killed %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_spromotion_8_to_16:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movb $117, %al
+; NO_CMOV-NEXT: jne .LBB6_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movb $-19, %al
+; NO_CMOV-NEXT: .LBB6_2:
+; NO_CMOV-NEXT: movsbl %al, %eax
+; NO_CMOV-NEXT: # kill: def %ax killed %ax killed %eax
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i8 117, i8 -19
+ %ret = sext i8 %t0 to i16
+ ret i16 %ret
+}
+
+define i32 @cmov_spromotion_8_to_32(i1 %c) {
+; CMOV-LABEL: cmov_spromotion_8_to_32:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movb $126, %al
+; CMOV-NEXT: jne .LBB7_2
+; CMOV-NEXT: # %bb.1:
+; CMOV-NEXT: movb $-1, %al
+; CMOV-NEXT: .LBB7_2:
+; CMOV-NEXT: movsbl %al, %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_spromotion_8_to_32:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movb $126, %al
+; NO_CMOV-NEXT: jne .LBB7_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movb $-1, %al
+; NO_CMOV-NEXT: .LBB7_2:
+; NO_CMOV-NEXT: movsbl %al, %eax
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i8 12414, i8 -1
+ %ret = sext i8 %t0 to i32
+ ret i32 %ret
+}
+
+define i64 @cmov_spromotion_8_to_64(i1 %c) {
+; CMOV-LABEL: cmov_spromotion_8_to_64:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movb $126, %al
+; CMOV-NEXT: jne .LBB8_2
+; CMOV-NEXT: # %bb.1:
+; CMOV-NEXT: movb $-1, %al
+; CMOV-NEXT: .LBB8_2:
+; CMOV-NEXT: movsbq %al, %rax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_spromotion_8_to_64:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movb $126, %al
+; NO_CMOV-NEXT: jne .LBB8_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movb $-1, %al
+; NO_CMOV-NEXT: .LBB8_2:
+; NO_CMOV-NEXT: movsbl %al, %eax
+; NO_CMOV-NEXT: movl %eax, %edx
+; NO_CMOV-NEXT: sarl $31, %edx
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i8 12414, i8 -1
+ %ret = sext i8 %t0 to i64
+ ret i64 %ret
+}
+
+define i32 @cmov_spromotion_16_to_32(i1 %c) {
+; CMOV-LABEL: cmov_spromotion_16_to_32:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E
+; CMOV-NEXT: movl $-1, %eax
+; CMOV-NEXT: cmovnel %ecx, %eax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_spromotion_16_to_32:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; NO_CMOV-NEXT: jne .LBB9_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movl $-1, %eax
+; NO_CMOV-NEXT: .LBB9_2:
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i16 12414, i16 -1
+ %ret = sext i16 %t0 to i32
+ ret i32 %ret
+}
+
+define i64 @cmov_spromotion_16_to_64(i1 %c) {
+; CMOV-LABEL: cmov_spromotion_16_to_64:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movl $12414, %ecx # imm = 0x307E
+; CMOV-NEXT: movq $-1, %rax
+; CMOV-NEXT: cmovneq %rcx, %rax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_spromotion_16_to_64:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; NO_CMOV-NEXT: jne .LBB10_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movl $-1, %eax
+; NO_CMOV-NEXT: .LBB10_2:
+; NO_CMOV-NEXT: movl %eax, %edx
+; NO_CMOV-NEXT: sarl $31, %edx
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i16 12414, i16 -1
+ %ret = sext i16 %t0 to i64
+ ret i64 %ret
+}
+
+define i64 @cmov_spromotion_32_to_64(i1 %c) {
+; CMOV-LABEL: cmov_spromotion_32_to_64:
+; CMOV: # %bb.0:
+; CMOV-NEXT: testb $1, %dil
+; CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; CMOV-NEXT: movl $-1, %ecx
+; CMOV-NEXT: cmovnel %eax, %ecx
+; CMOV-NEXT: movslq %ecx, %rax
+; CMOV-NEXT: retq
+;
+; NO_CMOV-LABEL: cmov_spromotion_32_to_64:
+; NO_CMOV: # %bb.0:
+; NO_CMOV-NEXT: testb $1, {{[0-9]+}}(%esp)
+; NO_CMOV-NEXT: movl $12414, %eax # imm = 0x307E
+; NO_CMOV-NEXT: jne .LBB11_2
+; NO_CMOV-NEXT: # %bb.1:
+; NO_CMOV-NEXT: movl $-1, %eax
+; NO_CMOV-NEXT: .LBB11_2:
+; NO_CMOV-NEXT: movl %eax, %edx
+; NO_CMOV-NEXT: sarl $31, %edx
+; NO_CMOV-NEXT: retl
+ %t0 = select i1 %c, i32 12414, i32 -1
+ %ret = sext i32 %t0 to i64
+ ret i64 %ret
+}
diff --git a/test/CodeGen/X86/cmov-schedule.ll b/test/CodeGen/X86/cmov-schedule.ll
new file mode 100644
index 000000000000..4053f63a0a7f
--- /dev/null
+++ b/test/CodeGen/X86/cmov-schedule.ll
@@ -0,0 +1,2004 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define void @test_cmov_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_cmov_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmovow %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovnow %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovbw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovbw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovbw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovaew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovaew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovaew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovnew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovnew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovbew %si, %di # sched: [3:1.00]
+; GENERIC-NEXT: cmovbew %si, %di # sched: [3:1.00]
+; GENERIC-NEXT: cmovaw %si, %di # sched: [3:1.00]
+; GENERIC-NEXT: cmovaw %si, %di # sched: [3:1.00]
+; GENERIC-NEXT: cmovsw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovnsw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovpw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovpw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovnpw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovnpw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovlw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovlw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovgew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovgew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovlew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovlew %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovgw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovgw %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: cmovow (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovnow (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovbw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovbw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovbw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovaew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovaew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovaew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovnew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovnew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovbew (%rdx), %di # sched: [8:1.00]
+; GENERIC-NEXT: cmovbew (%rdx), %di # sched: [8:1.00]
+; GENERIC-NEXT: cmovaw (%rdx), %di # sched: [8:1.00]
+; GENERIC-NEXT: cmovaw (%rdx), %di # sched: [8:1.00]
+; GENERIC-NEXT: cmovsw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovnsw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovpw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovpw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovnpw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovnpw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovlw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovlw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovgew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovgew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovlew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovlew (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovgw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: cmovgw (%rdx), %di # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmov_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmovow %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovnow %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovbew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovbew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovaw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovaw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovsw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovnsw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; ATOM-NEXT: cmovow (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovnow (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovbw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovbw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovbw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovaew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovaew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovaew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovnew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovnew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovbew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovbew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovaw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovaw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovsw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovnsw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovpw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovpw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovnpw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovnpw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovlw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovlw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovgew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovgew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovlew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovlew (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovgw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: cmovgw (%rdx), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmov_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmovow %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovnow %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovbew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovbew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovaw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovaw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovsw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovnsw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; SLM-NEXT: cmovow (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovnow (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovbw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovbw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovbw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovaew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovaew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovaew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovnew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovnew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovbew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovbew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovaw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovaw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovsw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovnsw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovpw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovpw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovnpw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovnpw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovlw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovlw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovgew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovgew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovlew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovlew (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovgw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: cmovgw (%rdx), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmov_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmovow %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovnow %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovbw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovbw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovbw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovaew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovaew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovaew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovnew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovnew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovbew %si, %di # sched: [3:1.00]
+; SANDY-NEXT: cmovbew %si, %di # sched: [3:1.00]
+; SANDY-NEXT: cmovaw %si, %di # sched: [3:1.00]
+; SANDY-NEXT: cmovaw %si, %di # sched: [3:1.00]
+; SANDY-NEXT: cmovsw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovnsw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovpw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovpw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovnpw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovnpw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovlw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovlw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovgew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovgew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovlew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovlew %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovgw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovgw %si, %di # sched: [2:0.67]
+; SANDY-NEXT: cmovow (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovnow (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovbw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovbw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovbw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovaew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovaew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovaew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovnew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovnew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovbew (%rdx), %di # sched: [8:1.00]
+; SANDY-NEXT: cmovbew (%rdx), %di # sched: [8:1.00]
+; SANDY-NEXT: cmovaw (%rdx), %di # sched: [8:1.00]
+; SANDY-NEXT: cmovaw (%rdx), %di # sched: [8:1.00]
+; SANDY-NEXT: cmovsw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovnsw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovpw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovpw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovnpw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovnpw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovlw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovlw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovgew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovgew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovlew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovlew (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovgw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: cmovgw (%rdx), %di # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmov_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmovow %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovnow %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovbw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovbw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovbw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovaew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovaew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovaew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovnew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovnew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovbew %si, %di # sched: [3:0.75]
+; HASWELL-NEXT: cmovbew %si, %di # sched: [3:0.75]
+; HASWELL-NEXT: cmovaw %si, %di # sched: [3:0.75]
+; HASWELL-NEXT: cmovaw %si, %di # sched: [3:0.75]
+; HASWELL-NEXT: cmovsw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovnsw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovpw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovpw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovnpw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovnpw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovlw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovlw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovgew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovgew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovlew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovlew %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovgw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovgw %si, %di # sched: [2:0.50]
+; HASWELL-NEXT: cmovow (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovnow (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovbw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovbw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovbw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovaew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovaew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovaew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovnew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovnew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovbew (%rdx), %di # sched: [8:0.75]
+; HASWELL-NEXT: cmovbew (%rdx), %di # sched: [8:0.75]
+; HASWELL-NEXT: cmovaw (%rdx), %di # sched: [8:0.75]
+; HASWELL-NEXT: cmovaw (%rdx), %di # sched: [8:0.75]
+; HASWELL-NEXT: cmovsw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovnsw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovpw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovpw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovnpw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovnpw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovlw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovlw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovgew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovgew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovlew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovlew (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovgw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: cmovgw (%rdx), %di # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmov_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmovow %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnow %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbew %si, %di # sched: [2:0.50]
+; BROADWELL-NEXT: cmovbew %si, %di # sched: [2:0.50]
+; BROADWELL-NEXT: cmovaw %si, %di # sched: [2:0.50]
+; BROADWELL-NEXT: cmovaw %si, %di # sched: [2:0.50]
+; BROADWELL-NEXT: cmovsw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnsw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: cmovow (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnow (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbew (%rdx), %di # sched: [7:0.50]
+; BROADWELL-NEXT: cmovbew (%rdx), %di # sched: [7:0.50]
+; BROADWELL-NEXT: cmovaw (%rdx), %di # sched: [7:0.50]
+; BROADWELL-NEXT: cmovaw (%rdx), %di # sched: [7:0.50]
+; BROADWELL-NEXT: cmovsw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnsw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovpw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovpw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnpw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnpw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlew (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgw (%rdx), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmov_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmovow %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnow %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbew %si, %di # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovbew %si, %di # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovaw %si, %di # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovaw %si, %di # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovsw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnsw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovow (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnow (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbew (%rdx), %di # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovbew (%rdx), %di # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovaw (%rdx), %di # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovaw (%rdx), %di # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovsw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnsw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovpw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovpw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnpw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnpw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlew (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgw (%rdx), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmov_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmovow %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovnow %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovbew %si, %di # sched: [2:1.00]
+; SKX-NEXT: cmovbew %si, %di # sched: [2:1.00]
+; SKX-NEXT: cmovaw %si, %di # sched: [2:1.00]
+; SKX-NEXT: cmovaw %si, %di # sched: [2:1.00]
+; SKX-NEXT: cmovsw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovnsw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; SKX-NEXT: cmovow (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovnow (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovbw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovaew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovnew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovnew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovbew (%rdx), %di # sched: [7:1.00]
+; SKX-NEXT: cmovbew (%rdx), %di # sched: [7:1.00]
+; SKX-NEXT: cmovaw (%rdx), %di # sched: [7:1.00]
+; SKX-NEXT: cmovaw (%rdx), %di # sched: [7:1.00]
+; SKX-NEXT: cmovsw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovnsw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovpw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovpw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovnpw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovnpw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovlw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovlw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovgew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovgew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovlew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovlew (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovgw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: cmovgw (%rdx), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmov_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmovow %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovnow %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovbw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovaew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovnew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovbew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovbew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovaw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovaw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovsw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovnsw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovpw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovnpw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovlw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovgew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovlew %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovgw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmovow (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovnow (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovbw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovbw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovbw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovaew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovaew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovaew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovnew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovnew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovbew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovbew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovaw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovaw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovsw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovnsw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovpw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovpw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovnpw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovnpw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovlw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovlw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovgew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovgew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovlew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovlew (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovgw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: cmovgw (%rdx), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmov_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmovow %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnow %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovsw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnsw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovpw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovpw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnpw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnpw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlew %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmovow (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnow (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovsw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnsw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovpw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovpw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnpw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnpw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlew (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgw (%rdx), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "cmovow $1, $0 \0A\09 cmovnow $1, $0 \0A\09 cmovbw $1, $0 \0A\09 cmovcw $1, $0 \0A\09 cmovnaew $1, $0 \0A\09 cmovnbw $1, $0 \0A\09 cmovncw $1, $0 \0A\09 cmovaew $1, $0 \0A\09 cmovzw $1, $0 \0A\09 cmovew $1, $0 \0A\09 cmovnzw $1, $0 \0A\09 cmovnew $1, $0 \0A\09 cmovbew $1, $0 \0A\09 cmovnaw $1, $0 \0A\09 cmovnbew $1, $0 \0A\09 cmovaw $1, $0 \0A\09 cmovsw $1, $0 \0A\09 cmovnsw $1, $0 \0A\09 cmovpw $1, $0 \0A\09 cmovpew $1, $0 \0A\09 cmovnpw $1, $0 \0A\09 cmovpow $1, $0 \0A\09 cmovlw $1, $0 \0A\09 cmovngew $1, $0 \0A\09 cmovnlw $1, $0 \0A\09 cmovgew $1, $0 \0A\09 cmovlew $1, $0 \0A\09 cmovngw $1, $0 \0A\09 cmovnlew $1, $0 \0A\09 cmovgw $1, $0 \0A\09 cmovow $2, $0 \0A\09 cmovnow $2, $0 \0A\09 cmovbw $2, $0 \0A\09 cmovcw $2, $0 \0A\09 cmovnaew $2, $0 \0A\09 cmovnbw $2, $0 \0A\09 cmovncw $2, $0 \0A\09 cmovaew $2, $0 \0A\09 cmovzw $2, $0 \0A\09 cmovew $2, $0 \0A\09 cmovnzw $2, $0 \0A\09 cmovnew $2, $0 \0A\09 cmovbew $2, $0 \0A\09 cmovnaw $2, $0 \0A\09 cmovnbew $2, $0 \0A\09 cmovaw $2, $0 \0A\09 cmovsw $2, $0 \0A\09 cmovnsw $2, $0 \0A\09 cmovpw $2, $0 \0A\09 cmovpew $2, $0 \0A\09 cmovnpw $2, $0 \0A\09 cmovpow $2, $0 \0A\09 cmovlw $2, $0 \0A\09 cmovngew $2, $0 \0A\09 cmovnlw $2, $0 \0A\09 cmovgew $2, $0 \0A\09 cmovlew $2, $0 \0A\09 cmovngw $2, $0 \0A\09 cmovnlew $2, $0 \0A\09 cmovgw $2, $0", "r,r,*m"(i16 %a0, i16 %a1, i16 *%a2)
+ ret void
+}
+
+define void @test_cmov_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_cmov_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmovol %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnol %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovael %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovael %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovael %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbel %esi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: cmovbel %esi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: cmoval %esi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: cmoval %esi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: cmovsl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnsl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovpl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovpl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnpl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnpl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovll %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovll %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovlel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovlel %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgl %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: cmovol (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnol (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovael (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovael (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovael (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbel (%rdx), %edi # sched: [8:1.00]
+; GENERIC-NEXT: cmovbel (%rdx), %edi # sched: [8:1.00]
+; GENERIC-NEXT: cmoval (%rdx), %edi # sched: [8:1.00]
+; GENERIC-NEXT: cmoval (%rdx), %edi # sched: [8:1.00]
+; GENERIC-NEXT: cmovsl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnsl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovpl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovpl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnpl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnpl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovll (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovll (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovlel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovlel (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgl (%rdx), %edi # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmov_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmovol %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovnol %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovbel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovbel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmoval %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmoval %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovsl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovnsl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmovol (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovnol (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovbl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovbl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovbl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovael (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovael (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovael (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovnel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovnel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovbel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovbel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmoval (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmoval (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovsl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovnsl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovpl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovpl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovnpl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovnpl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovll (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovll (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovgel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovgel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovlel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovlel (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovgl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: cmovgl (%rdx), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmov_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmovol %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovnol %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovbel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovbel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmoval %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmoval %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovsl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovnsl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmovol (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovnol (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovbl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovbl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovbl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovael (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovael (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovael (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovnel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovnel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovbel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovbel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmoval (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmoval (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovsl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovnsl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovpl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovpl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovnpl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovnpl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovll (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovll (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovgel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovgel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovlel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovlel (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovgl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: cmovgl (%rdx), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmov_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmovol %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovnol %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovbl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovbl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovbl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovael %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovael %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovael %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovnel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovnel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovbel %esi, %edi # sched: [3:1.00]
+; SANDY-NEXT: cmovbel %esi, %edi # sched: [3:1.00]
+; SANDY-NEXT: cmoval %esi, %edi # sched: [3:1.00]
+; SANDY-NEXT: cmoval %esi, %edi # sched: [3:1.00]
+; SANDY-NEXT: cmovsl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovnsl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovpl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovpl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovnpl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovnpl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovll %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovll %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovgel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovgel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovlel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovlel %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovgl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovgl %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: cmovol (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovnol (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovbl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovbl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovbl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovael (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovael (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovael (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovnel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovnel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovbel (%rdx), %edi # sched: [8:1.00]
+; SANDY-NEXT: cmovbel (%rdx), %edi # sched: [8:1.00]
+; SANDY-NEXT: cmoval (%rdx), %edi # sched: [8:1.00]
+; SANDY-NEXT: cmoval (%rdx), %edi # sched: [8:1.00]
+; SANDY-NEXT: cmovsl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovnsl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovpl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovpl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovnpl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovnpl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovll (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovll (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovgel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovgel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovlel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovlel (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovgl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: cmovgl (%rdx), %edi # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmov_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmovol %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnol %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovael %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovael %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovael %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbel %esi, %edi # sched: [3:0.75]
+; HASWELL-NEXT: cmovbel %esi, %edi # sched: [3:0.75]
+; HASWELL-NEXT: cmoval %esi, %edi # sched: [3:0.75]
+; HASWELL-NEXT: cmoval %esi, %edi # sched: [3:0.75]
+; HASWELL-NEXT: cmovsl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnsl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovpl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovpl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnpl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnpl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovll %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovll %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovlel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovlel %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgl %esi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: cmovol (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnol (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovael (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovael (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovael (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbel (%rdx), %edi # sched: [8:0.75]
+; HASWELL-NEXT: cmovbel (%rdx), %edi # sched: [8:0.75]
+; HASWELL-NEXT: cmoval (%rdx), %edi # sched: [8:0.75]
+; HASWELL-NEXT: cmoval (%rdx), %edi # sched: [8:0.75]
+; HASWELL-NEXT: cmovsl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnsl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovpl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovpl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnpl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnpl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovll (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovll (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovlel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovlel (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgl (%rdx), %edi # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmov_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmovol %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnol %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbel %esi, %edi # sched: [2:0.50]
+; BROADWELL-NEXT: cmovbel %esi, %edi # sched: [2:0.50]
+; BROADWELL-NEXT: cmoval %esi, %edi # sched: [2:0.50]
+; BROADWELL-NEXT: cmoval %esi, %edi # sched: [2:0.50]
+; BROADWELL-NEXT: cmovsl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnsl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovol (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnol (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbel (%rdx), %edi # sched: [7:0.50]
+; BROADWELL-NEXT: cmovbel (%rdx), %edi # sched: [7:0.50]
+; BROADWELL-NEXT: cmoval (%rdx), %edi # sched: [7:0.50]
+; BROADWELL-NEXT: cmoval (%rdx), %edi # sched: [7:0.50]
+; BROADWELL-NEXT: cmovsl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnsl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovpl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovpl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnpl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnpl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovll (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovll (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlel (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgl (%rdx), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmov_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmovol %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnol %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbel %esi, %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovbel %esi, %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmoval %esi, %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmoval %esi, %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovsl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnsl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovol (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnol (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbel (%rdx), %edi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovbel (%rdx), %edi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmoval (%rdx), %edi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmoval (%rdx), %edi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovsl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnsl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovpl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovpl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnpl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnpl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovll (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovll (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlel (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgl (%rdx), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmov_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmovol %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovnol %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovbel %esi, %edi # sched: [2:1.00]
+; SKX-NEXT: cmovbel %esi, %edi # sched: [2:1.00]
+; SKX-NEXT: cmoval %esi, %edi # sched: [2:1.00]
+; SKX-NEXT: cmoval %esi, %edi # sched: [2:1.00]
+; SKX-NEXT: cmovsl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovnsl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: cmovol (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovnol (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovbl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovael (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovnel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovnel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovbel (%rdx), %edi # sched: [7:1.00]
+; SKX-NEXT: cmovbel (%rdx), %edi # sched: [7:1.00]
+; SKX-NEXT: cmoval (%rdx), %edi # sched: [7:1.00]
+; SKX-NEXT: cmoval (%rdx), %edi # sched: [7:1.00]
+; SKX-NEXT: cmovsl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovnsl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovpl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovpl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovnpl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovnpl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovll (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovll (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovgel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovgel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovlel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovlel (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovgl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: cmovgl (%rdx), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmov_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmovol %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnol %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovael %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmoval %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmoval %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovsl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnsl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovpl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnpl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovll %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovlel %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmovol (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnol (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovael (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovael (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovael (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmoval (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmoval (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovsl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnsl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovpl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovpl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnpl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnpl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovll (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovll (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovlel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovlel (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgl (%rdx), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmov_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmovol %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnol %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovael %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovael %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovael %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmoval %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmoval %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovsl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnsl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovpl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovpl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnpl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnpl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovll %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovll %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlel %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovol (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnol (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovael (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovael (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovael (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmoval (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmoval (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovsl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnsl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovpl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovpl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnpl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnpl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovll (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovll (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlel (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgl (%rdx), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "cmovol $1, $0 \0A\09 cmovnol $1, $0 \0A\09 cmovbl $1, $0 \0A\09 cmovcl $1, $0 \0A\09 cmovnael $1, $0 \0A\09 cmovnbl $1, $0 \0A\09 cmovncl $1, $0 \0A\09 cmovael $1, $0 \0A\09 cmovzl $1, $0 \0A\09 cmovel $1, $0 \0A\09 cmovnzl $1, $0 \0A\09 cmovnel $1, $0 \0A\09 cmovbel $1, $0 \0A\09 cmovnal $1, $0 \0A\09 cmovnbel $1, $0 \0A\09 cmoval $1, $0 \0A\09 cmovsl $1, $0 \0A\09 cmovnsl $1, $0 \0A\09 cmovpl $1, $0 \0A\09 cmovpel $1, $0 \0A\09 cmovnpl $1, $0 \0A\09 cmovpol $1, $0 \0A\09 cmovll $1, $0 \0A\09 cmovngel $1, $0 \0A\09 cmovnll $1, $0 \0A\09 cmovgel $1, $0 \0A\09 cmovlel $1, $0 \0A\09 cmovngl $1, $0 \0A\09 cmovnlel $1, $0 \0A\09 cmovgl $1, $0 \0A\09 cmovol $2, $0 \0A\09 cmovnol $2, $0 \0A\09 cmovbl $2, $0 \0A\09 cmovcl $2, $0 \0A\09 cmovnael $2, $0 \0A\09 cmovnbl $2, $0 \0A\09 cmovncl $2, $0 \0A\09 cmovael $2, $0 \0A\09 cmovzl $2, $0 \0A\09 cmovel $2, $0 \0A\09 cmovnzl $2, $0 \0A\09 cmovnel $2, $0 \0A\09 cmovbel $2, $0 \0A\09 cmovnal $2, $0 \0A\09 cmovnbel $2, $0 \0A\09 cmoval $2, $0 \0A\09 cmovsl $2, $0 \0A\09 cmovnsl $2, $0 \0A\09 cmovpl $2, $0 \0A\09 cmovpel $2, $0 \0A\09 cmovnpl $2, $0 \0A\09 cmovpol $2, $0 \0A\09 cmovll $2, $0 \0A\09 cmovngel $2, $0 \0A\09 cmovnll $2, $0 \0A\09 cmovgel $2, $0 \0A\09 cmovlel $2, $0 \0A\09 cmovngl $2, $0 \0A\09 cmovnlel $2, $0 \0A\09 cmovgl $2, $0", "r,r,*m"(i32 %a0, i32 %a1, i32 *%a2)
+ ret void
+}
+
+define void @test_cmov_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_cmov_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmovoq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnoq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmoveq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmoveq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovneq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovneq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovbeq %rsi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: cmovbeq %rsi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: cmovaq %rsi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: cmovaq %rsi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: cmovsq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnsq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovpq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovpq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnpq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovnpq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovlq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovlq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgeq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgeq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovleq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovleq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovgq %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: cmovoq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnoq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmoveq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmoveq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovneq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovneq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovbeq (%rdx), %rdi # sched: [8:1.00]
+; GENERIC-NEXT: cmovbeq (%rdx), %rdi # sched: [8:1.00]
+; GENERIC-NEXT: cmovaq (%rdx), %rdi # sched: [8:1.00]
+; GENERIC-NEXT: cmovaq (%rdx), %rdi # sched: [8:1.00]
+; GENERIC-NEXT: cmovsq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnsq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovpq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovpq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnpq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovnpq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovlq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovlq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgeq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgeq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovleq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovleq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: cmovgq (%rdx), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmov_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmovoq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovaq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovaq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovsq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmovoq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovnoq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovbq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovbq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovbq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovaeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovaeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovaeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmoveq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmoveq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovneq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovneq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovbeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovbeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovaq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovaq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovsq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovnsq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovpq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovpq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovnpq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovnpq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovlq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovlq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovgeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovgeq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovleq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovleq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovgq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: cmovgq (%rdx), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmov_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmovoq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovaq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovaq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovsq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmovoq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovnoq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovbq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovbq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovbq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovaeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovaeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovaeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmoveq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmoveq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovneq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovneq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovbeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovbeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovaq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovaq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovsq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovnsq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovpq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovpq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovnpq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovnpq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovlq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovlq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovgeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovgeq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovleq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovleq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovgq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: cmovgq (%rdx), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmov_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmovoq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovnoq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovbq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovbq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovbq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmoveq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmoveq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovneq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovneq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovbeq %rsi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: cmovbeq %rsi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: cmovaq %rsi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: cmovaq %rsi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: cmovsq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovnsq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovpq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovpq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovnpq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovnpq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovlq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovlq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovgeq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovgeq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovleq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovleq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovgq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovgq %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: cmovoq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovnoq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmoveq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmoveq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovneq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovneq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovbeq (%rdx), %rdi # sched: [8:1.00]
+; SANDY-NEXT: cmovbeq (%rdx), %rdi # sched: [8:1.00]
+; SANDY-NEXT: cmovaq (%rdx), %rdi # sched: [8:1.00]
+; SANDY-NEXT: cmovaq (%rdx), %rdi # sched: [8:1.00]
+; SANDY-NEXT: cmovsq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovnsq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovpq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovpq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovnpq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovnpq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovlq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovlq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovgeq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovgeq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovleq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovleq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovgq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: cmovgq (%rdx), %rdi # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmov_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmovoq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnoq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovaeq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmoveq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmoveq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovneq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovneq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovbeq %rsi, %rdi # sched: [3:0.75]
+; HASWELL-NEXT: cmovbeq %rsi, %rdi # sched: [3:0.75]
+; HASWELL-NEXT: cmovaq %rsi, %rdi # sched: [3:0.75]
+; HASWELL-NEXT: cmovaq %rsi, %rdi # sched: [3:0.75]
+; HASWELL-NEXT: cmovsq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnsq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovpq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovpq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnpq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovnpq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovlq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovlq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgeq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgeq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovleq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovleq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovgq %rsi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: cmovoq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnoq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovaeq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmoveq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmoveq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovneq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovneq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovbeq (%rdx), %rdi # sched: [8:0.75]
+; HASWELL-NEXT: cmovbeq (%rdx), %rdi # sched: [8:0.75]
+; HASWELL-NEXT: cmovaq (%rdx), %rdi # sched: [8:0.75]
+; HASWELL-NEXT: cmovaq (%rdx), %rdi # sched: [8:0.75]
+; HASWELL-NEXT: cmovsq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnsq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovpq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovpq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnpq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovnpq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovlq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovlq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgeq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgeq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovleq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovleq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: cmovgq (%rdx), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmov_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmovoq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovbeq %rsi, %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: cmovbeq %rsi, %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: cmovaq %rsi, %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: cmovaq %rsi, %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: cmovsq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: cmovoq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnoq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmoveq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmoveq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovneq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovneq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovbeq (%rdx), %rdi # sched: [7:0.50]
+; BROADWELL-NEXT: cmovbeq (%rdx), %rdi # sched: [7:0.50]
+; BROADWELL-NEXT: cmovaq (%rdx), %rdi # sched: [7:0.50]
+; BROADWELL-NEXT: cmovaq (%rdx), %rdi # sched: [7:0.50]
+; BROADWELL-NEXT: cmovsq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnsq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovpq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovpq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnpq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovnpq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovlq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgeq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgeq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovleq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovleq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: cmovgq (%rdx), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmov_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmovoq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovbeq %rsi, %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovbeq %rsi, %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovaq %rsi, %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovaq %rsi, %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: cmovsq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: cmovoq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnoq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmoveq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmoveq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovneq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovneq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovbeq (%rdx), %rdi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovbeq (%rdx), %rdi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovaq (%rdx), %rdi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovaq (%rdx), %rdi # sched: [7:1.00]
+; SKYLAKE-NEXT: cmovsq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnsq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovpq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovpq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnpq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovnpq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovlq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgeq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgeq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovleq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovleq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: cmovgq (%rdx), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmov_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmovoq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovbeq %rsi, %rdi # sched: [2:1.00]
+; SKX-NEXT: cmovbeq %rsi, %rdi # sched: [2:1.00]
+; SKX-NEXT: cmovaq %rsi, %rdi # sched: [2:1.00]
+; SKX-NEXT: cmovaq %rsi, %rdi # sched: [2:1.00]
+; SKX-NEXT: cmovsq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: cmovoq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovnoq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovbq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovaeq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmoveq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmoveq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovneq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovneq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovbeq (%rdx), %rdi # sched: [7:1.00]
+; SKX-NEXT: cmovbeq (%rdx), %rdi # sched: [7:1.00]
+; SKX-NEXT: cmovaq (%rdx), %rdi # sched: [7:1.00]
+; SKX-NEXT: cmovaq (%rdx), %rdi # sched: [7:1.00]
+; SKX-NEXT: cmovsq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovnsq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovpq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovpq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovnpq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovnpq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovlq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovlq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovgeq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovgeq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovleq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovleq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovgq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: cmovgq (%rdx), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmov_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmovoq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmoveq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovneq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovaq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovaq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovsq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovpq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovlq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovleq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovgq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmovoq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnoq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovaeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovaeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovaeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmoveq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmoveq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovneq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovneq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovbeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovaq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovaq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovsq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnsq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovpq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovpq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnpq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovnpq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovlq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovlq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgeq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovleq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovleq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: cmovgq (%rdx), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmov_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmovoq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnoq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmoveq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmoveq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovneq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovneq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovbeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovaq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovsq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnsq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovpq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovpq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovnpq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovlq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgeq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovleq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovleq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovgq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmovoq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnoq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmoveq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmoveq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovneq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovneq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovbeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovaq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovsq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnsq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovpq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovpq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnpq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovnpq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovlq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgeq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovleq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovleq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: cmovgq (%rdx), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "cmovoq $1, $0 \0A\09 cmovnoq $1, $0 \0A\09 cmovbq $1, $0 \0A\09 cmovcq $1, $0 \0A\09 cmovnaeq $1, $0 \0A\09 cmovnbq $1, $0 \0A\09 cmovncq $1, $0 \0A\09 cmovaeq $1, $0 \0A\09 cmovzq $1, $0 \0A\09 cmoveq $1, $0 \0A\09 cmovnzq $1, $0 \0A\09 cmovneq $1, $0 \0A\09 cmovbeq $1, $0 \0A\09 cmovnaq $1, $0 \0A\09 cmovnbeq $1, $0 \0A\09 cmovaq $1, $0 \0A\09 cmovsq $1, $0 \0A\09 cmovnsq $1, $0 \0A\09 cmovpq $1, $0 \0A\09 cmovpeq $1, $0 \0A\09 cmovnpq $1, $0 \0A\09 cmovpoq $1, $0 \0A\09 cmovlq $1, $0 \0A\09 cmovngeq $1, $0 \0A\09 cmovnlq $1, $0 \0A\09 cmovgeq $1, $0 \0A\09 cmovleq $1, $0 \0A\09 cmovngq $1, $0 \0A\09 cmovnleq $1, $0 \0A\09 cmovgq $1, $0 \0A\09 cmovoq $2, $0 \0A\09 cmovnoq $2, $0 \0A\09 cmovbq $2, $0 \0A\09 cmovcq $2, $0 \0A\09 cmovnaeq $2, $0 \0A\09 cmovnbq $2, $0 \0A\09 cmovncq $2, $0 \0A\09 cmovaeq $2, $0 \0A\09 cmovzq $2, $0 \0A\09 cmoveq $2, $0 \0A\09 cmovnzq $2, $0 \0A\09 cmovneq $2, $0 \0A\09 cmovbeq $2, $0 \0A\09 cmovnaq $2, $0 \0A\09 cmovnbeq $2, $0 \0A\09 cmovaq $2, $0 \0A\09 cmovsq $2, $0 \0A\09 cmovnsq $2, $0 \0A\09 cmovpq $2, $0 \0A\09 cmovpeq $2, $0 \0A\09 cmovnpq $2, $0 \0A\09 cmovpoq $2, $0 \0A\09 cmovlq $2, $0 \0A\09 cmovngeq $2, $0 \0A\09 cmovnlq $2, $0 \0A\09 cmovgeq $2, $0 \0A\09 cmovleq $2, $0 \0A\09 cmovngq $2, $0 \0A\09 cmovnleq $2, $0 \0A\09 cmovgq $2, $0", "r,r,*m"(i64 %a0, i64 %a1, i64 *%a2)
+ ret void
+}
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index fca39bca6c76..e860a59806eb 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -disable-cgp-select2branch -x86-cmov-converter=false | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: cmovael (%rcx), %eax
@@ -20,7 +20,7 @@ entry:
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: btl %esi, %edi
; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: cmovbl (%rcx), %eax
@@ -43,7 +43,7 @@ declare void @bar(i64) nounwind
define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: testb $1, %dl
; CHECK-NEXT: cmovel %esi, %edi
@@ -77,37 +77,37 @@ define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
define i1 @test4() nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movsbl {{.*}}(%rip), %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shrb $7, %al
; CHECK-NEXT: movzbl %al, %ecx
; CHECK-NEXT: xorl $1, %ecx
-; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; CHECK-NEXT: # kill: def %cl killed %cl killed %ecx
; CHECK-NEXT: sarl %cl, %edx
; CHECK-NEXT: movb {{.*}}(%rip), %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB3_2
-; CHECK-NEXT: # BB#1: # %bb.i.i.i
+; CHECK-NEXT: # %bb.1: # %bb.i.i.i
; CHECK-NEXT: movb {{.*}}(%rip), %cl
; CHECK-NEXT: .LBB3_2: # %func_4.exit.i
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: setne %bl
-; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: je .LBB3_4
-; CHECK-NEXT: # BB#3: # %func_4.exit.i
+; CHECK-NEXT: # %bb.3: # %func_4.exit.i
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: .LBB3_4: # %func_4.exit.i
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB3_7
-; CHECK-NEXT: # BB#5: # %func_4.exit.i
+; CHECK-NEXT: # %bb.5: # %func_4.exit.i
; CHECK-NEXT: testb %bl, %bl
; CHECK-NEXT: jne .LBB3_7
-; CHECK-NEXT: # BB#6: # %bb.i.i
+; CHECK-NEXT: # %bb.6: # %bb.i.i
; CHECK-NEXT: movb {{.*}}(%rip), %cl
; CHECK-NEXT: xorl %ebx, %ebx
-; CHECK-NEXT: movb %al, %cl
+; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: .LBB3_7: # %func_1.exit
; CHECK-NEXT: movb %cl, {{.*}}(%rip)
; CHECK-NEXT: movzbl %cl, %esi
@@ -160,7 +160,7 @@ declare i32 @printf(i8* nocapture, ...) nounwind
; rdar://6668608
define i32 @test5(i32* nocapture %P) nounwind readonly {
; CHECK-LABEL: test5:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $41, (%rdi)
; CHECK-NEXT: setg %al
@@ -175,7 +175,7 @@ entry:
define i32 @test6(i32* nocapture %P) nounwind readonly {
; CHECK-LABEL: test6:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $42, (%rdi)
; CHECK-NEXT: setl %al
@@ -193,11 +193,11 @@ entry:
; because it isn't worth it. Just use a branch instead.
define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: jne .LBB6_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: movb %dl, %sil
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: movl %edx, %esi
; CHECK-NEXT: .LBB6_2:
; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: retq
@@ -207,8 +207,9 @@ define i8 @test7(i1 inreg %c, i8 inreg %a, i8 inreg %b) nounwind {
define i32 @smin(i32 %x) {
; CHECK-LABEL: smin:
-; CHECK: # BB#0:
-; CHECK-NEXT: xorl $-1, %edi
+; CHECK: # %bb.0:
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: cmovsl %edi, %eax
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll
index 5b984d27249b..98a7eb7db0f5 100644
--- a/test/CodeGen/X86/cmovcmov.ll
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -53,8 +53,7 @@ entry:
; NOCMOV-NEXT: leal 12(%esp), %ecx
; NOCMOV-NEXT: [[TBB]]:
; NOCMOV-NEXT: movl (%ecx), %eax
-; NOCMOV-NEXT: orl $4, %ecx
-; NOCMOV-NEXT: movl (%ecx), %edx
+; NOCMOV-NEXT: movl 4(%ecx), %edx
; NOCMOV-NEXT: retl
define i64 @test_select_fcmp_oeq_i64(float %a, float %b, i64 %c, i64 %d) #0 {
entry:
@@ -82,8 +81,7 @@ entry:
; NOCMOV-NEXT: leal 20(%esp), %ecx
; NOCMOV-NEXT: [[TBB]]:
; NOCMOV-NEXT: movl (%ecx), %eax
-; NOCMOV-NEXT: orl $4, %ecx
-; NOCMOV-NEXT: movl (%ecx), %edx
+; NOCMOV-NEXT: movl 4(%ecx), %edx
; NOCMOV-NEXT: retl
define i64 @test_select_fcmp_une_i64(float %a, float %b, i64 %c, i64 %d) #0 {
entry:
@@ -229,21 +227,21 @@ attributes #0 = { nounwind }
; The following test failed because llvm had a bug where a structure like:
;
-; %vreg12<def> = CMOV_GR8 %vreg7, %vreg11 ... (lt)
-; %vreg13<def> = CMOV_GR8 %vreg12, %vreg11 ... (gt)
+; %12 = CMOV_GR8 %7, %11 ... (lt)
+; %13 = CMOV_GR8 %12, %11 ... (gt)
;
; was lowered to:
;
; The first two cmovs got expanded to:
-; BB#0:
-; JL_1 BB#9
-; BB#7:
-; JG_1 BB#9
-; BB#8:
-; BB#9:
-; vreg12 = phi(vreg7, BB#8, vreg11, BB#0, vreg12, BB#7)
-; vreg13 = COPY vreg12
-; Which was invalid as %vreg12 is not the same value as %vreg13
+; %bb.0:
+; JL_1 %bb.9
+; %bb.7:
+; JG_1 %bb.9
+; %bb.8:
+; %bb.9:
+; %12 = phi(%7, %bb.8, %11, %bb.0, %12, %bb.7)
+; %13 = COPY %12
+; Which was invalid as %12 is not the same value as %13
; CHECK-LABEL: no_cascade_opt:
; CMOV-DAG: cmpl %edx, %esi
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 5d05c699f431..1ab8421638d0 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -1,265 +1,392 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding | FileCheck %s
+
+@d = global i8 0, align 1
define i32 @test1(i32 %X, i32* %y) nounwind {
- %tmp = load i32, i32* %y ; <i32> [#uses=1]
- %tmp.upgrd.1 = icmp eq i32 %tmp, 0 ; <i1> [#uses=1]
- br i1 %tmp.upgrd.1, label %ReturnBlock, label %cond_true
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpl $0, (%rsi) # encoding: [0x83,0x3e,0x00]
+; CHECK-NEXT: je .LBB0_2 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %cond_true
+; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; CHECK-NEXT: retq # encoding: [0xc3]
+; CHECK-NEXT: .LBB0_2: # %ReturnBlock
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %tmp = load i32, i32* %y
+ %tmp.upgrd.1 = icmp eq i32 %tmp, 0
+ br i1 %tmp.upgrd.1, label %ReturnBlock, label %cond_true
-cond_true: ; preds = %0
- ret i32 1
+cond_true:
+ ret i32 1
-ReturnBlock: ; preds = %0
- ret i32 0
-; CHECK-LABEL: test1:
-; CHECK: cmpl $0, (%rsi)
+ReturnBlock:
+ ret i32 0
}
define i32 @test2(i32 %X, i32* %y) nounwind {
- %tmp = load i32, i32* %y ; <i32> [#uses=1]
- %tmp1 = shl i32 %tmp, 3 ; <i32> [#uses=1]
- %tmp1.upgrd.2 = icmp eq i32 %tmp1, 0 ; <i1> [#uses=1]
- br i1 %tmp1.upgrd.2, label %ReturnBlock, label %cond_true
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl $536870911, (%rsi) # encoding: [0xf7,0x06,0xff,0xff,0xff,0x1f]
+; CHECK-NEXT: # imm = 0x1FFFFFFF
+; CHECK-NEXT: je .LBB1_2 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %cond_true
+; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; CHECK-NEXT: retq # encoding: [0xc3]
+; CHECK-NEXT: .LBB1_2: # %ReturnBlock
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %tmp = load i32, i32* %y
+ %tmp1 = shl i32 %tmp, 3
+ %tmp1.upgrd.2 = icmp eq i32 %tmp1, 0
+ br i1 %tmp1.upgrd.2, label %ReturnBlock, label %cond_true
-cond_true: ; preds = %0
- ret i32 1
+cond_true:
+ ret i32 1
-ReturnBlock: ; preds = %0
- ret i32 0
-; CHECK-LABEL: test2:
-; CHECK: testl $536870911, (%rsi)
+ReturnBlock:
+ ret i32 0
}
define i8 @test2b(i8 %X, i8* %y) nounwind {
- %tmp = load i8, i8* %y ; <i8> [#uses=1]
- %tmp1 = shl i8 %tmp, 3 ; <i8> [#uses=1]
- %tmp1.upgrd.2 = icmp eq i8 %tmp1, 0 ; <i1> [#uses=1]
- br i1 %tmp1.upgrd.2, label %ReturnBlock, label %cond_true
+; CHECK-LABEL: test2b:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testb $31, (%rsi) # encoding: [0xf6,0x06,0x1f]
+; CHECK-NEXT: je .LBB2_2 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB2_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %cond_true
+; CHECK-NEXT: movb $1, %al # encoding: [0xb0,0x01]
+; CHECK-NEXT: retq # encoding: [0xc3]
+; CHECK-NEXT: .LBB2_2: # %ReturnBlock
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %tmp = load i8, i8* %y
+ %tmp1 = shl i8 %tmp, 3
+ %tmp1.upgrd.2 = icmp eq i8 %tmp1, 0
+ br i1 %tmp1.upgrd.2, label %ReturnBlock, label %cond_true
-cond_true: ; preds = %0
- ret i8 1
+cond_true:
+ ret i8 1
-ReturnBlock: ; preds = %0
- ret i8 0
-; CHECK-LABEL: test2b:
-; CHECK: testb $31, (%rsi)
+ReturnBlock:
+ ret i8 0
}
define i64 @test3(i64 %x) nounwind {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%t = icmp eq i64 %x, 0
%r = zext i1 %t to i64
ret i64 %r
-; CHECK-LABEL: test3:
-; CHECK: xorl %eax, %eax
-; CHECK: testq %rdi, %rdi
-; CHECK: sete %al
-; CHECK: ret
}
define i64 @test4(i64 %x) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
+; CHECK-NEXT: setle %al # encoding: [0x0f,0x9e,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%t = icmp slt i64 %x, 1
%r = zext i1 %t to i64
ret i64 %r
-; CHECK-LABEL: test4:
-; CHECK: xorl %eax, %eax
-; CHECK: testq %rdi, %rdi
-; CHECK: setle %al
-; CHECK: ret
}
+define i32 @test5(double %A) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: ucomisd {{.*}}(%rip), %xmm0 # encoding: [0x66,0x0f,0x2e,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: ja .LBB5_3 # encoding: [0x77,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB5_3-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %entry
+; CHECK-NEXT: ucomisd {{.*}}(%rip), %xmm0 # encoding: [0x66,0x0f,0x2e,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: jb .LBB5_3 # encoding: [0x72,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB5_3-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %bb12
+; CHECK-NEXT: movl $32, %eax # encoding: [0xb8,0x20,0x00,0x00,0x00]
+; CHECK-NEXT: retq # encoding: [0xc3]
+; CHECK-NEXT: .LBB5_3: # %bb8
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: jmp foo # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: foo-1, kind: FK_PCRel_1
+entry:
+ %tmp2 = fcmp ogt double %A, 1.500000e+02
+ %tmp5 = fcmp ult double %A, 7.500000e+01
+ %bothcond = or i1 %tmp2, %tmp5
+ br i1 %bothcond, label %bb8, label %bb12
-define i32 @test5(double %A) nounwind {
- entry:
- %tmp2 = fcmp ogt double %A, 1.500000e+02; <i1> [#uses=1]
- %tmp5 = fcmp ult double %A, 7.500000e+01; <i1> [#uses=1]
- %bothcond = or i1 %tmp2, %tmp5; <i1> [#uses=1]
- br i1 %bothcond, label %bb8, label %bb12
-
- bb8:; preds = %entry
- %tmp9 = tail call i32 (...) @foo( ) nounwind ; <i32> [#uses=1]
- ret i32 %tmp9
+bb8:
+ %tmp9 = tail call i32 (...) @foo() nounwind
+ ret i32 %tmp9
- bb12:; preds = %entry
- ret i32 32
-; CHECK-LABEL: test5:
-; CHECK: ucomisd LCPI5_0(%rip), %xmm0
-; CHECK: ucomisd LCPI5_1(%rip), %xmm0
+bb12:
+ ret i32 32
}
declare i32 @foo(...)
define i32 @test6() nounwind align 2 {
- %A = alloca {i64, i64}, align 8
- %B = getelementptr inbounds {i64, i64}, {i64, i64}* %A, i64 0, i32 1
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp) # encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
+; CHECK-NEXT: je .LBB6_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %F
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+; CHECK-NEXT: .LBB6_1: # %T
+; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %A = alloca { i64, i64 }, align 8
+ %B = getelementptr inbounds { i64, i64 }, { i64, i64 }* %A, i64 0, i32 1
%C = load i64, i64* %B
%D = icmp eq i64 %C, 0
br i1 %D, label %T, label %F
+
T:
ret i32 1
-
+
F:
ret i32 0
-; CHECK-LABEL: test6:
-; CHECK: cmpq $0, -8(%rsp)
-; CHECK: encoding: [0x48,0x83,0x7c,0x24,0xf8,0x00]
}
-; rdar://11866926
define i32 @test7(i64 %res) nounwind {
-entry:
; CHECK-LABEL: test7:
-; CHECK-NOT: movabsq
-; CHECK: shrq $32, %rdi
-; CHECK: sete
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%lnot = icmp ult i64 %res, 4294967296
%lnot.ext = zext i1 %lnot to i32
ret i32 %lnot.ext
}
define i32 @test8(i64 %res) nounwind {
-entry:
; CHECK-LABEL: test8:
-; CHECK-NOT: movabsq
-; CHECK: shrq $32, %rdi
-; CHECK: cmpq $3, %rdi
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: cmpq $3, %rdi # encoding: [0x48,0x83,0xff,0x03]
+; CHECK-NEXT: setb %al # encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%lnot = icmp ult i64 %res, 12884901888
%lnot.ext = zext i1 %lnot to i32
ret i32 %lnot.ext
}
define i32 @test9(i64 %res) nounwind {
-entry:
; CHECK-LABEL: test9:
-; CHECK-NOT: movabsq
-; CHECK: shrq $33, %rdi
-; CHECK: sete
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: shrq $33, %rdi # encoding: [0x48,0xc1,0xef,0x21]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%lnot = icmp ult i64 %res, 8589934592
%lnot.ext = zext i1 %lnot to i32
ret i32 %lnot.ext
}
define i32 @test10(i64 %res) nounwind {
-entry:
; CHECK-LABEL: test10:
-; CHECK-NOT: movabsq
-; CHECK: shrq $32, %rdi
-; CHECK: setne
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: shrq $32, %rdi # encoding: [0x48,0xc1,0xef,0x20]
+; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%lnot = icmp uge i64 %res, 4294967296
%lnot.ext = zext i1 %lnot to i32
ret i32 %lnot.ext
}
-; rdar://9758774
define i32 @test11(i64 %l) nounwind {
-entry:
; CHECK-LABEL: test11:
-; CHECK-NOT: movabsq
-; CHECK-NOT: andq
-; CHECK: shrq $47, %rdi
-; CHECK: cmpq $1, %rdi
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shrq $47, %rdi # encoding: [0x48,0xc1,0xef,0x2f]
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: cmpq $1, %rdi # encoding: [0x48,0x83,0xff,0x01]
+; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%shr.mask = and i64 %l, -140737488355328
%cmp = icmp eq i64 %shr.mask, 140737488355328
%conv = zext i1 %cmp to i32
ret i32 %conv
}
-define i32 @test12() uwtable ssp {
+define i32 @test12() ssp uwtable {
; CHECK-LABEL: test12:
-; CHECK: testb
- %1 = call zeroext i1 @test12b()
- br i1 %1, label %2, label %3
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax # encoding: [0x50]
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: callq test12b # encoding: [0xe8,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: test12b-4, kind: FK_PCRel_4
+; CHECK-NEXT: testb %al, %al # encoding: [0x84,0xc0]
+; CHECK-NEXT: je .LBB12_2 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB12_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %T
+; CHECK-NEXT: movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
+; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: retq # encoding: [0xc3]
+; CHECK-NEXT: .LBB12_2: # %F
+; CHECK-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
+; CHECK-NEXT: popq %rcx # encoding: [0x59]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %tmp1 = call zeroext i1 @test12b()
+ br i1 %tmp1, label %T, label %F
-; <label>:2 ; preds = %0
+T:
ret i32 1
-; <label>:3 ; preds = %0
+F:
ret i32 2
}
declare zeroext i1 @test12b()
define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
+; CHECK-LABEL: test13:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testb $8, %dil # encoding: [0x40,0xf6,0xc7,0x08]
+; CHECK-NEXT: cmovnel %edx, %esi # encoding: [0x0f,0x45,0xf2]
+; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%and = and i32 %mask, 8
%tobool = icmp ne i32 %and, 0
%cond = select i1 %tobool, i32 %intra, i32 %base
ret i32 %cond
-; CHECK-LABEL: test13:
-; CHECK: testb $8, %dil
-; CHECK: cmovnel
}
-define i32 @test14(i32 %mask, i32 %base, i32 %intra) #0 {
+define i32 @test14(i32 %mask, i32 %base, i32 %intra) {
+; CHECK-LABEL: test14:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shrl $7, %edi # encoding: [0xc1,0xef,0x07]
+; CHECK-NEXT: cmovnsl %edx, %esi # encoding: [0x0f,0x49,0xf2]
+; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%s = lshr i32 %mask, 7
%tobool = icmp sgt i32 %s, -1
%cond = select i1 %tobool, i32 %intra, i32 %base
ret i32 %cond
-; CHECK-LABEL: test14:
-; CHECK: shrl $7, %edi
-; CHECK-NEXT: cmovnsl %edx, %esi
}
; PR19964
define zeroext i1 @test15(i32 %bf.load, i32 %n) {
+; CHECK-LABEL: test15:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shrl $16, %edi # encoding: [0xc1,0xef,0x10]
+; CHECK-NEXT: sete %cl # encoding: [0x0f,0x94,0xc1]
+; CHECK-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7]
+; CHECK-NEXT: setae %al # encoding: [0x0f,0x93,0xc0]
+; CHECK-NEXT: orb %cl, %al # encoding: [0x08,0xc8]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%bf.lshr = lshr i32 %bf.load, 16
%cmp2 = icmp eq i32 %bf.lshr, 0
%cmp5 = icmp uge i32 %bf.lshr, %n
%.cmp5 = or i1 %cmp2, %cmp5
ret i1 %.cmp5
-; CHECK-LABEL: test15:
-; CHECK: shrl $16, %edi
-; CHECK: cmpl %esi, %edi
}
define i8 @test16(i16 signext %L) {
- %lshr = lshr i16 %L, 15
+; CHECK-LABEL: test16:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
+; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %lshr = lshr i16 %L, 15
%trunc = trunc i16 %lshr to i8
- %not = xor i8 %trunc, 1
+ %not = xor i8 %trunc, 1
ret i8 %not
-; CHECK-LABEL: test16:
-; CHECK: testw %di, %di
-; CHECK: setns %al
}
define i8 @test17(i32 %L) {
- %lshr = lshr i32 %L, 31
+; CHECK-LABEL: test17:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
+; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %lshr = lshr i32 %L, 31
%trunc = trunc i32 %lshr to i8
- %not = xor i8 %trunc, 1
+ %not = xor i8 %trunc, 1
ret i8 %not
-; CHECK-LABEL: test17:
-; CHECK: testl %edi, %edi
-; CHECK: setns %al
}
define i8 @test18(i64 %L) {
- %lshr = lshr i64 %L, 63
+; CHECK-LABEL: test18:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
+; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %lshr = lshr i64 %L, 63
%trunc = trunc i64 %lshr to i8
- %not = xor i8 %trunc, 1
+ %not = xor i8 %trunc, 1
ret i8 %not
-; CHECK-LABEL: test18:
-; CHECK: testq %rdi, %rdi
-; CHECK: setns %al
}
define zeroext i1 @test19(i32 %L) {
- %lshr = lshr i32 %L, 31
+; CHECK-LABEL: test19:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
+; CHECK-NEXT: setns %al # encoding: [0x0f,0x99,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %lshr = lshr i32 %L, 31
%trunc = trunc i32 %lshr to i1
- %not = xor i1 %trunc, 1
+ %not = xor i1 %trunc, true
ret i1 %not
-; CHECK-LABEL: test19:
-; CHECK: testl %edi, %edi
-; CHECK: setns %al
}
-@d = global i8 0, align 1
-
; This test failed due to incorrect handling of "shift + icmp" sequence
define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
+; CHECK-LABEL: test20:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
+; CHECK-NEXT: # imm = 0xFFFFFF
+; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; CHECK-NEXT: movzbl %sil, %ecx # encoding: [0x40,0x0f,0xb6,0xce]
+; CHECK-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
+; CHECK-NEXT: setne (%rdx) # encoding: [0x0f,0x95,0x02]
+; CHECK-NEXT: testl $16777215, %edi # encoding: [0xf7,0xc7,0xff,0xff,0xff,0x00]
+; CHECK-NEXT: # imm = 0xFFFFFF
+; CHECK-NEXT: setne {{.*}}(%rip) # encoding: [0x0f,0x95,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: d-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
%bf.shl = shl i32 %bf.load, 8
%bf.ashr = ashr exact i32 %bf.shl, 8
%tobool4 = icmp ne i32 %bf.ashr, 0
@@ -274,61 +401,64 @@ define void @test20(i32 %bf.load, i8 %x1, i8* %b_addr) {
store i8 %frombool15, i8* @d, align 1
ret void
-; CHECK-LABEL: test20
-; CHECK: andl
-; CHECK: setne
-; CHECK: addl
-; CHECK: setne
-; CHECK: testl
-; CHECK: setne
}
define i32 @test21(i64 %val) {
- %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+; CHECK-LABEL: test21:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: shrq $41, %rdi # encoding: [0x48,0xc1,0xef,0x29]
+; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %and = and i64 %val, -2199023255552
%cmp = icmp ne i64 %and, 0
%ret = zext i1 %cmp to i32
ret i32 %ret
-; CHECK-LABEL: test21
-; CHECK: shrq $41, %rdi
-; CHECK-NOT: test
-; CHECK: setne %al
-; CHECK: retq
}
; AND-to-SHR transformation is enabled for eq/ne condition codes only.
define i32 @test22(i64 %val) {
- %and = and i64 %val, -2199023255552 ; 0xFFFFFE0000000000
+; CHECK-LABEL: test22:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %and = and i64 %val, -2199023255552
%cmp = icmp ult i64 %and, 0
%ret = zext i1 %cmp to i32
ret i32 %ret
-; CHECK-LABEL: test22
-; CHECK-NOT: shrq $41
-; CHECK: retq
}
define i32 @test23(i64 %val) {
- %and = and i64 %val, -1048576 ; 0xFFFFFFFFFFF00000
+; CHECK-LABEL: test23:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: testq $-1048576, %rdi # encoding: [0x48,0xf7,0xc7,0x00,0x00,0xf0,0xff]
+; CHECK-NEXT: # imm = 0xFFF00000
+; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %and = and i64 %val, -1048576
%cmp = icmp ne i64 %and, 0
%ret = zext i1 %cmp to i32
ret i32 %ret
-; CHECK-LABEL: test23
-; CHECK: testq $-1048576, %rdi
-; CHECK: setne %al
-; CHECK: retq
}
define i32 @test24(i64 %val) {
- %and = and i64 %val, 281474976710655 ; 0x0000FFFFFFFFFFFF
+; CHECK-LABEL: test24:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; CHECK-NEXT: shlq $16, %rdi # encoding: [0x48,0xc1,0xe7,0x10]
+; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ %and = and i64 %val, 281474976710655
%cmp = icmp ne i64 %and, 0
%ret = zext i1 %cmp to i32
ret i32 %ret
-; CHECK-LABEL: test24
-; CHECK: shlq $16, %rdi
-; CHECK-NOT: test
-; CHECK: setne %al
-; CHECK: retq
}
diff --git a/test/CodeGen/X86/cmpxchg-clobber-flags.ll b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
index f2b9dee91037..8d289fa9fb03 100644
--- a/test/CodeGen/X86/cmpxchg-clobber-flags.ll
+++ b/test/CodeGen/X86/cmpxchg-clobber-flags.ll
@@ -31,18 +31,44 @@ define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
; i386-NEXT: sahf
; i386-NEXT: jne
+; In the following case we get a long chain of EFLAGS save/restore due to
+; a sequence of:
+; cmpxchg8b (implicit-def eflags)
+; eax = copy eflags
+; adjcallstackdown32
+; ...
+; use of eax
+; During PEI the adjcallstackdown32 is replaced with the subl which
+; clobbers eflags, effectively interfering with the liveness interval.
+; Is this a case we care about? Maybe not, considering this issue only
+; happens when the fast pre-regalloc scheduler is enforced. A more
+; performant scheduler would move the adjcallstackdown32 out of the
+; eflags liveness interval.
+
; i386f-LABEL: test_intervening_call:
; i386f: cmpxchg8b
-; i386f-NEXT: movl %eax, (%esp)
-; i386f-NEXT: movl %edx, 4(%esp)
-; i386f-NEXT: seto %al
+; i386f-NEXT: pushl %eax
+; i386f-NEXT: seto %al
; i386f-NEXT: lahf
-; i386f-NEXT: movl %eax, [[FLAGS:%.*]]
-; i386f-NEXT: calll bar
-; i386f-NEXT: movl [[FLAGS]], %eax
-; i386f-NEXT: addb $127, %al
+; i386f-NEXT: movl %eax, [[FLAGS:%.*]]
+; i386f-NEXT: popl %eax
+; i386f-NEXT: subl $8, %esp
+; i386f-NEXT: pushl %eax
+; i386f-NEXT: movl %ecx, %eax
+; i386f-NEXT: addb $127, %al
; i386f-NEXT: sahf
-; i386f-NEXT: jne
+; i386f-NEXT: popl %eax
+; i386f-NEXT: pushl %eax
+; i386f-NEXT: seto %al
+; i386f-NEXT: lahf
+; i386f-NEXT: movl %eax, %esi
+; i386f-NEXT: popl %eax
+; i386f-NEXT: pushl %edx
+; i386f-NEXT: pushl %eax
+; i386f-NEXT: calll bar
+; i386f-NEXT: addl $16, %esp
+; i386f-NEXT: movl %esi, %eax
+; i386f-NEXT: addb $127, %al
; x8664-LABEL: test_intervening_call:
; x8664: cmpxchgq
diff --git a/test/CodeGen/X86/cmpxchg16b.ll b/test/CodeGen/X86/cmpxchg16b.ll
index d514c0a35f5b..f070a4682ec0 100644
--- a/test/CodeGen/X86/cmpxchg16b.ll
+++ b/test/CodeGen/X86/cmpxchg16b.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 | FileCheck %s
; Basic 128-bit cmpxchg
define void @t1(i128* nocapture %p) nounwind ssp {
diff --git a/test/CodeGen/X86/cmpxchg8b_alloca_regalloc_handling.ll b/test/CodeGen/X86/cmpxchg8b_alloca_regalloc_handling.ll
index 8a325c4cbdb9..b500484a4c89 100644
--- a/test/CodeGen/X86/cmpxchg8b_alloca_regalloc_handling.ll
+++ b/test/CodeGen/X86/cmpxchg8b_alloca_regalloc_handling.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -stackrealign -O2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -stackrealign -O2 | FileCheck %s
; PR28755
; Check that register allocator is able to handle that
diff --git a/test/CodeGen/X86/coalesce_commute_movsd.ll b/test/CodeGen/X86/coalesce_commute_movsd.ll
index 2f4680755b21..bcd7f2fb9659 100644
--- a/test/CodeGen/X86/coalesce_commute_movsd.ll
+++ b/test/CodeGen/X86/coalesce_commute_movsd.ll
@@ -8,23 +8,23 @@
define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
; SSE2-LABEL: insert_f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512-NEXT: retq
%1 = insertelement <2 x double> %a1, double %a0, i32 0
@@ -33,23 +33,23 @@ define <2 x double> @insert_f64(double %a0, <2 x double> %a1) {
define <4 x float> @insert_f32(float %a0, <4 x float> %a1) {
; SSE2-LABEL: insert_f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: insert_f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
;
; AVX512-LABEL: insert_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512-NEXT: retq
%1 = insertelement <4 x float> %a1, float %a0, i32 0
diff --git a/test/CodeGen/X86/coalescer-dce.ll b/test/CodeGen/X86/coalescer-dce.ll
index 208d70660faa..90a07720e65f 100644
--- a/test/CodeGen/X86/coalescer-dce.ll
+++ b/test/CodeGen/X86/coalescer-dce.ll
@@ -4,28 +4,28 @@ target triple = "x86_64-apple-macosx10.7.0"
; This test case has a sub-register join followed by a remat:
;
-; 256L %vreg2<def> = COPY %vreg7:sub_32bit<kill>; GR32:%vreg2 GR64:%vreg7
-; Considering merging %vreg2 with %vreg7:sub_32bit
+; 256L %2 = COPY killed %7:sub_32bit; GR32:%2 GR64:%7
+; Considering merging %2 with %7:sub_32bit
; Cross-class to GR64.
-; RHS = %vreg2 = [256d,272d:0) 0@256d
-; LHS = %vreg7 = [208d,256d:0)[304L,480L:0) 0@208d
-; updated: 272L %vreg0<def> = COPY %vreg7:sub_32bit<kill>; GR32:%vreg0 GR64:%vreg7
-; Joined. Result = %vreg7 = [208d,272d:0)[304L,480L:0) 0@208d
+; RHS = %2 = [256d,272d:0) 0@256d
+; LHS = %7 = [208d,256d:0)[304L,480L:0) 0@208d
+; updated: 272L %0 = COPY killed %7:sub_32bit; GR32:%0 GR64:%7
+; Joined. Result = %7 = [208d,272d:0)[304L,480L:0) 0@208d
;
-; 272L %vreg10:sub_32bit<def> = COPY %vreg7:sub_32bit<kill>, %vreg10<imp-def>; GR64:%vreg10,%vreg7
-; Considering merging %vreg7 with %vreg10
-; RHS = %vreg7 = [208d,272d:0)[304L,480L:0) 0@208d
-; LHS = %vreg10 = [16d,64L:2)[64L,160L:1)[192L,240L:1)[272d,304L:3)[304L,352d:1)[352d,400d:0)[400d,400S:4) 0@352d 1@64L-phidef 2@16d-phikill 3@272d-phikill 4@400d
-; Remat: %vreg10<def> = MOV64r0 %vreg10<imp-def>, %EFLAGS<imp-def,dead>, %vreg10<imp-def>; GR64:%vreg10
-; Shrink: %vreg7 = [208d,272d:0)[304L,480L:0) 0@208d
+; 272L %10:sub_32bit = COPY killed %7:sub_32bit, implicit-def %10; GR64:%10,%7
+; Considering merging %7 with %10
+; RHS = %7 = [208d,272d:0)[304L,480L:0) 0@208d
+; LHS = %10 = [16d,64L:2)[64L,160L:1)[192L,240L:1)[272d,304L:3)[304L,352d:1)[352d,400d:0)[400d,400S:4) 0@352d 1@64L-phidef 2@16d-phikill 3@272d-phikill 4@400d
+; Remat: %10 = MOV64r0 implicit-def %10, implicit dead %eflags, implicit-def %10; GR64:%10
+; Shrink: %7 = [208d,272d:0)[304L,480L:0) 0@208d
; live-in at 240L
; live-in at 416L
; live-in at 320L
; live-in at 304L
-; Shrunk: %vreg7 = [208d,256d:0)[304L,480L:0) 0@208d
+; Shrunk: %7 = [208d,256d:0)[304L,480L:0) 0@208d
;
; The COPY at 256L is rewritten as a partial def, and that would artificially
-; extend the live range of %vreg7 to end at 256d. When the joined copy is
+; extend the live range of %7 to end at 256d. When the joined copy is
; removed, -verify-coalescing complains about the dangling kill.
;
; <rdar://problem/9967101>
diff --git a/test/CodeGen/X86/code_placement.ll b/test/CodeGen/X86/code_placement.ll
index 7d2358480051..7b5f4c346908 100644
--- a/test/CodeGen/X86/code_placement.ll
+++ b/test/CodeGen/X86/code_placement.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
@Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
@Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
diff --git a/test/CodeGen/X86/codegen-prepare-cast.ll b/test/CodeGen/X86/codegen-prepare-cast.ll
index c5c2d64f63d8..08371d8fa564 100644
--- a/test/CodeGen/X86/codegen-prepare-cast.ll
+++ b/test/CodeGen/X86/codegen-prepare-cast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s
; PR4297
; RUN: opt -S < %s -codegenprepare | FileCheck %s
diff --git a/test/CodeGen/X86/combine-64bit-vec-binop.ll b/test/CodeGen/X86/combine-64bit-vec-binop.ll
index 2935a2095bbf..e434bfc11c4c 100644
--- a/test/CodeGen/X86/combine-64bit-vec-binop.ll
+++ b/test/CodeGen/X86/combine-64bit-vec-binop.ll
@@ -3,7 +3,7 @@
define double @test1_add(double %A, double %B) {
; SSE41-LABEL: test1_add:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -15,7 +15,7 @@ define double @test1_add(double %A, double %B) {
define double @test2_add(double %A, double %B) {
; SSE41-LABEL: test2_add:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -27,7 +27,7 @@ define double @test2_add(double %A, double %B) {
define double @test3_add(double %A, double %B) {
; SSE41-LABEL: test3_add:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <8 x i8>
@@ -39,7 +39,7 @@ define double @test3_add(double %A, double %B) {
define double @test1_sub(double %A, double %B) {
; SSE41-LABEL: test1_sub:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -51,7 +51,7 @@ define double @test1_sub(double %A, double %B) {
define double @test2_sub(double %A, double %B) {
; SSE41-LABEL: test2_sub:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psubw %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -63,7 +63,7 @@ define double @test2_sub(double %A, double %B) {
define double @test3_sub(double %A, double %B) {
; SSE41-LABEL: test3_sub:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psubb %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <8 x i8>
@@ -75,7 +75,7 @@ define double @test3_sub(double %A, double %B) {
define double @test1_mul(double %A, double %B) {
; SSE41-LABEL: test1_mul:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -87,7 +87,7 @@ define double @test1_mul(double %A, double %B) {
define double @test2_mul(double %A, double %B) {
; SSE41-LABEL: test2_mul:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -100,7 +100,7 @@ define double @test2_mul(double %A, double %B) {
; There is no legal ISD::MUL with type MVT::v8i16.
define double @test3_mul(double %A, double %B) {
; SSE41-LABEL: test3_mul:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pmullw %xmm2, %xmm0
@@ -115,7 +115,7 @@ define double @test3_mul(double %A, double %B) {
define double @test1_and(double %A, double %B) {
; SSE41-LABEL: test1_and:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -127,7 +127,7 @@ define double @test1_and(double %A, double %B) {
define double @test2_and(double %A, double %B) {
; SSE41-LABEL: test2_and:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -139,7 +139,7 @@ define double @test2_and(double %A, double %B) {
define double @test3_and(double %A, double %B) {
; SSE41-LABEL: test3_and:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: andps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <8 x i8>
@@ -151,7 +151,7 @@ define double @test3_and(double %A, double %B) {
define double @test1_or(double %A, double %B) {
; SSE41-LABEL: test1_or:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -163,7 +163,7 @@ define double @test1_or(double %A, double %B) {
define double @test2_or(double %A, double %B) {
; SSE41-LABEL: test2_or:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -175,7 +175,7 @@ define double @test2_or(double %A, double %B) {
define double @test3_or(double %A, double %B) {
; SSE41-LABEL: test3_or:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: orps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <8 x i8>
@@ -187,7 +187,7 @@ define double @test3_or(double %A, double %B) {
define double @test1_xor(double %A, double %B) {
; SSE41-LABEL: test1_xor:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -199,7 +199,7 @@ define double @test1_xor(double %A, double %B) {
define double @test2_xor(double %A, double %B) {
; SSE41-LABEL: test2_xor:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -211,7 +211,7 @@ define double @test2_xor(double %A, double %B) {
define double @test3_xor(double %A, double %B) {
; SSE41-LABEL: test3_xor:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <8 x i8>
@@ -223,7 +223,7 @@ define double @test3_xor(double %A, double %B) {
define double @test_fadd(double %A, double %B) {
; SSE41-LABEL: test_fadd:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x float>
@@ -235,7 +235,7 @@ define double @test_fadd(double %A, double %B) {
define double @test_fsub(double %A, double %B) {
; SSE41-LABEL: test_fsub:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: subps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x float>
@@ -247,7 +247,7 @@ define double @test_fsub(double %A, double %B) {
define double @test_fmul(double %A, double %B) {
; SSE41-LABEL: test_fmul:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: mulps %xmm1, %xmm0
; SSE41-NEXT: retq
%1 = bitcast double %A to <2 x float>
diff --git a/test/CodeGen/X86/combine-abs.ll b/test/CodeGen/X86/combine-abs.ll
index 37beb438d737..dd8675380924 100644
--- a/test/CodeGen/X86/combine-abs.ll
+++ b/test/CodeGen/X86/combine-abs.ll
@@ -6,7 +6,7 @@
; fold (abs c1) -> c2
define <4 x i32> @combine_v4i32_abs_constant() {
; CHECK-LABEL: combine_v4i32_abs_constant:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,3,2147483648]
; CHECK-NEXT: retq
%1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> <i32 0, i32 -1, i32 3, i32 -2147483648>)
@@ -15,7 +15,7 @@ define <4 x i32> @combine_v4i32_abs_constant() {
define <16 x i16> @combine_v16i16_abs_constant() {
; CHECK-LABEL: combine_v16i16_abs_constant:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,1,3,3,7,7,255,255,4096,4096,32767,32767,32768,32768,0]
; CHECK-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> <i16 0, i16 1, i16 -1, i16 3, i16 -3, i16 7, i16 -7, i16 255, i16 -255, i16 4096, i16 -4096, i16 32767, i16 -32767, i16 -32768, i16 32768, i16 65536>)
@@ -23,9 +23,25 @@ define <16 x i16> @combine_v16i16_abs_constant() {
}
; fold (abs (abs x)) -> (abs x)
+define i32 @combine_i32_abs_abs(i32 %a) {
+; CHECK-LABEL: combine_i32_abs_abs:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: cmovll %edi, %eax
+; CHECK-NEXT: retq
+ %n1 = sub i32 zeroinitializer, %a
+ %b1 = icmp slt i32 %a, zeroinitializer
+ %a1 = select i1 %b1, i32 %n1, i32 %a
+ %n2 = sub i32 zeroinitializer, %a1
+ %b2 = icmp sgt i32 %a1, zeroinitializer
+ %a2 = select i1 %b2, i32 %a1, i32 %n2
+ ret i32 %a2
+}
+
define <8 x i16> @combine_v8i16_abs_abs(<8 x i16> %a) {
; CHECK-LABEL: combine_v8i16_abs_abs:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpabsw %xmm0, %xmm0
; CHECK-NEXT: retq
%a1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a)
@@ -37,7 +53,7 @@ define <8 x i16> @combine_v8i16_abs_abs(<8 x i16> %a) {
define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) {
; CHECK-LABEL: combine_v32i8_abs_abs:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpabsb %ymm0, %ymm0
; CHECK-NEXT: retq
%n1 = sub <32 x i8> zeroinitializer, %a
@@ -49,8 +65,8 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) {
define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
; AVX2-LABEL: combine_v4i64_abs_abs:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -60,14 +76,14 @@ define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_v4i64_abs_abs:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vpabsq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: combine_v4i64_abs_abs:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpabsq %ymm0, %ymm0
; AVX512VL-NEXT: retq
%n1 = sub <4 x i64> zeroinitializer, %a
@@ -81,11 +97,20 @@ define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
; fold (abs x) -> x iff not-negative
define <16 x i8> @combine_v16i8_abs_constant(<16 x i8> %a) {
-; CHECK-LABEL: combine_v16i8_abs_constant:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT: vpabsb %xmm0, %xmm0
-; CHECK-NEXT: retq
+; AVX2-LABEL: combine_v16i8_abs_constant:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_v16i8_abs_constant:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: combine_v16i8_abs_constant:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT: retq
%1 = insertelement <16 x i8> undef, i8 15, i32 0
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
%3 = and <16 x i8> %a, %2
@@ -95,7 +120,7 @@ define <16 x i8> @combine_v16i8_abs_constant(<16 x i8> %a) {
define <8 x i32> @combine_v8i32_abs_pos(<8 x i32> %a) {
; CHECK-LABEL: combine_v8i32_abs_pos:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsrld $1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = lshr <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/X86/combine-add.ll b/test/CodeGen/X86/combine-add.ll
index a4e959c0b8f9..9a9f535c6086 100644
--- a/test/CodeGen/X86/combine-add.ll
+++ b/test/CodeGen/X86/combine-add.ll
@@ -5,11 +5,11 @@
; fold (add x, 0) -> x
define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_to_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_to_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = add <4 x i32> %a, zeroinitializer
ret <4 x i32> %1
@@ -18,14 +18,14 @@ define <4 x i32> @combine_vec_add_to_zero(<4 x i32> %a) {
; fold ((c1-A)+c2) -> (c1+c2)-A
define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) {
; SSE-LABEL: combine_vec_add_constant_sub:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2,4,6]
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_constant_sub:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6]
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -37,13 +37,13 @@ define <4 x i32> @combine_vec_add_constant_sub(<4 x i32> %a) {
; fold ((0-A) + B) -> B-A
define <4 x i32> @combine_vec_add_neg0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_neg0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> zeroinitializer, %a
@@ -54,12 +54,12 @@ define <4 x i32> @combine_vec_add_neg0(<4 x i32> %a, <4 x i32> %b) {
; fold (A + (0-B)) -> A-B
define <4 x i32> @combine_vec_add_neg1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_neg1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_neg1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> zeroinitializer, %b
@@ -70,12 +70,12 @@ define <4 x i32> @combine_vec_add_neg1(<4 x i32> %a, <4 x i32> %b) {
; fold (A+(B-A)) -> B
define <4 x i32> @combine_vec_add_sub0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %b, %a
@@ -86,12 +86,12 @@ define <4 x i32> @combine_vec_add_sub0(<4 x i32> %a, <4 x i32> %b) {
; fold ((B-A)+A) -> B
define <4 x i32> @combine_vec_add_sub1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_sub1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %b, %a
@@ -102,13 +102,13 @@ define <4 x i32> @combine_vec_add_sub1(<4 x i32> %a, <4 x i32> %b) {
; fold (A+(B-(A+C))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub_add0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = add <4 x i32> %a, %c
@@ -120,13 +120,13 @@ define <4 x i32> @combine_vec_add_sub_add0(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold (A+(B-(C+A))) to (B-C)
define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub_add1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = add <4 x i32> %c, %a
@@ -138,13 +138,13 @@ define <4 x i32> @combine_vec_add_sub_add1(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold (A+((B-A)+C)) to (B+C)
define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub_add2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %b, %a
@@ -156,13 +156,13 @@ define <4 x i32> @combine_vec_add_sub_add2(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold (A+((B-A)-C)) to (B-C)
define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_add_sub_add3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub_add3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm2, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %b, %a
@@ -174,14 +174,14 @@ define <4 x i32> @combine_vec_add_sub_add3(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d) {
; SSE-LABEL: combine_vec_add_sub_sub:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: paddd %xmm2, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sub_sub:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -195,14 +195,14 @@ define <4 x i32> @combine_vec_add_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold (a+b) -> (a|b) iff a and b share no bits.
define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_add_uniquebits:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_uniquebits:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
@@ -218,13 +218,13 @@ define <4 x i32> @combine_vec_add_uniquebits(<4 x i32> %a, <4 x i32> %b) {
; fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $5, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_shl_neg0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $5, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -237,13 +237,13 @@ define <4 x i32> @combine_vec_add_shl_neg0(<4 x i32> %x, <4 x i32> %y) {
; fold (add shl(0 - y, n), x) -> sub(x, shl(y, n))
define <4 x i32> @combine_vec_add_shl_neg1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_add_shl_neg1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $5, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_shl_neg1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $5, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -257,13 +257,13 @@ define <4 x i32> @combine_vec_add_shl_neg1(<4 x i32> %x, <4 x i32> %y) {
; and similar xforms where the inner op is either ~0 or 0.
define <4 x i32> @combine_vec_add_and_compare(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; SSE-LABEL: combine_vec_add_and_compare:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm2, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_and_compare:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -277,14 +277,14 @@ define <4 x i32> @combine_vec_add_and_compare(<4 x i32> %a0, <4 x i32> %a1, <4 x
; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sext(<4 x i1> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -297,14 +297,14 @@ define <4 x i32> @combine_vec_add_sext(<4 x i1> %a0, <4 x i32> %a1) {
; add (sext i1), X -> sub X, (zext i1)
define <4 x i32> @combine_vec_add_sextinreg(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_vec_add_sextinreg:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm0
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sextinreg:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm0, %xmm0
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
index f30fa61bbfbe..e92237f524f5 100644
--- a/test/CodeGen/X86/combine-and.ll
+++ b/test/CodeGen/X86/combine-and.ll
@@ -3,7 +3,7 @@
define i32 @and_self(i32 %x) {
; CHECK-LABEL: and_self:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%and = and i32 %x, %x
@@ -12,7 +12,7 @@ define i32 @and_self(i32 %x) {
define <4 x i32> @and_self_vec(<4 x i32> %x) {
; CHECK-LABEL: and_self_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%and = and <4 x i32> %x, %x
ret <4 x i32> %and
@@ -26,7 +26,7 @@ define <4 x i32> @and_self_vec(<4 x i32> %x) {
define <4 x i32> @test1(<4 x i32> %A) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: retq
@@ -36,7 +36,7 @@ define <4 x i32> @test1(<4 x i32> %A) {
define <4 x i32> @test2(<4 x i32> %A) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; CHECK-NEXT: retq
@@ -46,7 +46,7 @@ define <4 x i32> @test2(<4 x i32> %A) {
define <4 x i32> @test3(<4 x i32> %A) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
; CHECK-NEXT: retq
@@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %A) {
define <4 x i32> @test4(<4 x i32> %A) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; CHECK-NEXT: retq
@@ -66,7 +66,7 @@ define <4 x i32> @test4(<4 x i32> %A) {
define <4 x i32> @test5(<4 x i32> %A) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-NEXT: retq
@@ -76,7 +76,7 @@ define <4 x i32> @test5(<4 x i32> %A) {
define <4 x i32> @test6(<4 x i32> %A) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-NEXT: retq
@@ -86,7 +86,7 @@ define <4 x i32> @test6(<4 x i32> %A) {
define <4 x i32> @test7(<4 x i32> %A) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
@@ -96,7 +96,7 @@ define <4 x i32> @test7(<4 x i32> %A) {
define <4 x i32> @test8(<4 x i32> %A) {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; CHECK-NEXT: retq
@@ -106,7 +106,7 @@ define <4 x i32> @test8(<4 x i32> %A) {
define <4 x i32> @test9(<4 x i32> %A) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
%1 = and <4 x i32> %A, <i32 -1, i32 -1, i32 0, i32 0>
@@ -115,7 +115,7 @@ define <4 x i32> @test9(<4 x i32> %A) {
define <4 x i32> @test10(<4 x i32> %A) {
; CHECK-LABEL: test10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; CHECK-NEXT: retq
@@ -125,7 +125,7 @@ define <4 x i32> @test10(<4 x i32> %A) {
define <4 x i32> @test11(<4 x i32> %A) {
; CHECK-LABEL: test11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; CHECK-NEXT: retq
@@ -135,7 +135,7 @@ define <4 x i32> @test11(<4 x i32> %A) {
define <4 x i32> @test12(<4 x i32> %A) {
; CHECK-LABEL: test12:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; CHECK-NEXT: retq
@@ -145,7 +145,7 @@ define <4 x i32> @test12(<4 x i32> %A) {
define <4 x i32> @test13(<4 x i32> %A) {
; CHECK-LABEL: test13:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; CHECK-NEXT: retq
@@ -155,7 +155,7 @@ define <4 x i32> @test13(<4 x i32> %A) {
define <4 x i32> @test14(<4 x i32> %A) {
; CHECK-LABEL: test14:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
@@ -165,7 +165,7 @@ define <4 x i32> @test14(<4 x i32> %A) {
define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test15:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 -1>
@@ -176,7 +176,7 @@ define <4 x i32> @test15(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; CHECK-NEXT: retq
%1 = and <4 x i32> %A, <i32 -1, i32 0, i32 -1, i32 0>
@@ -187,7 +187,7 @@ define <4 x i32> @test16(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test17:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; CHECK-NEXT: retq
%1 = and <4 x i32> %A, <i32 0, i32 -1, i32 0, i32 -1>
@@ -202,7 +202,7 @@ define <4 x i32> @test17(<4 x i32> %A, <4 x i32> %B) {
define <2 x i64> @and_or_v2i64(<2 x i64> %a0) {
; CHECK-LABEL: and_or_v2i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [8,8]
; CHECK-NEXT: retq
%1 = or <2 x i64> %a0, <i64 255, i64 255>
@@ -212,7 +212,7 @@ define <2 x i64> @and_or_v2i64(<2 x i64> %a0) {
define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
; CHECK-LABEL: and_or_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: retq
%1 = or <4 x i32> %a0, <i32 15, i32 15, i32 15, i32 15>
@@ -226,7 +226,7 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
define <2 x i64> @and_or_zext_v2i32(<2 x i32> %a0) {
; CHECK-LABEL: and_or_zext_v2i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
@@ -237,7 +237,7 @@ define <2 x i64> @and_or_zext_v2i32(<2 x i32> %a0) {
define <4 x i32> @and_or_zext_v4i16(<4 x i16> %a0) {
; CHECK-LABEL: and_or_zext_v4i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = zext <4 x i16> %a0 to <4 x i32>
@@ -252,7 +252,7 @@ define <4 x i32> @and_or_zext_v4i16(<4 x i16> %a0) {
define <8 x i16> @ashr_mask1_v8i16(<8 x i16> %a0) {
; CHECK-LABEL: ashr_mask1_v8i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrlw $15, %xmm0
; CHECK-NEXT: retq
%1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
@@ -262,7 +262,7 @@ define <8 x i16> @ashr_mask1_v8i16(<8 x i16> %a0) {
define <4 x i32> @ashr_mask7_v4i32(<4 x i32> %a0) {
; CHECK-LABEL: ashr_mask7_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: psrld $29, %xmm0
; CHECK-NEXT: retq
@@ -270,3 +270,22 @@ define <4 x i32> @ashr_mask7_v4i32(<4 x i32> %a0) {
%2 = and <4 x i32> %1, <i32 7, i32 7, i32 7, i32 7>
ret <4 x i32> %2
}
+
+;
+; SimplifyDemandedBits
+;
+
+; PR34620 - redundant PAND after vector shift of a byte vector (PSRLW)
+define <16 x i8> @PR34620(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: PR34620:
+; CHECK: # %bb.0:
+; CHECK-NEXT: psrlw $1, %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT: paddb %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %1 = lshr <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %2 = and <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = add <16 x i8> %2, %a1
+ ret <16 x i8> %3
+}
diff --git a/test/CodeGen/X86/combine-avx-intrinsics.ll b/test/CodeGen/X86/combine-avx-intrinsics.ll
index 811b1f20833c..e46a1903e81e 100644
--- a/test/CodeGen/X86/combine-avx-intrinsics.ll
+++ b/test/CodeGen/X86/combine-avx-intrinsics.ll
@@ -4,7 +4,7 @@
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a0, i32 7)
ret <4 x double> %1
@@ -12,7 +12,7 @@ define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) {
define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_blend_ps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a0, i32 7)
ret <8 x float> %1
@@ -20,7 +20,7 @@ define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) {
define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test2_x86_avx_blend_pd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
ret <4 x double> %1
@@ -28,7 +28,7 @@ define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %
define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test2_x86_avx_blend_ps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
ret <8 x float> %1
@@ -36,7 +36,7 @@ define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1)
define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test3_x86_avx_blend_pd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 -1)
@@ -45,7 +45,7 @@ define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %
define <8 x float> @test3_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test3_x86_avx_blend_ps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 -1)
diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll
index 9a548f6b7f0e..672820d86c6d 100644
--- a/test/CodeGen/X86/combine-avx2-intrinsics.ll
+++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -6,7 +6,7 @@
define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) {
; CHECK-LABEL: test_x86_avx2_pblendw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a0, i32 7)
ret <16 x i16> %res
@@ -14,7 +14,7 @@ define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) {
define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx2_pblendd_128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a0, i32 7)
ret <4 x i32> %res
@@ -22,7 +22,7 @@ define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) {
define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) {
; CHECK-LABEL: test_x86_avx2_pblendd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a0, i32 7)
ret <8 x i32> %res
@@ -30,7 +30,7 @@ define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) {
define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test2_x86_avx2_pblendw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 0)
ret <16 x i16> %res
@@ -38,7 +38,7 @@ define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test2_x86_avx2_pblendd_128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 0)
ret <4 x i32> %res
@@ -46,7 +46,7 @@ define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test2_x86_avx2_pblendd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 0)
ret <8 x i32> %res
@@ -54,7 +54,7 @@ define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test3_x86_avx2_pblendw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1)
@@ -63,7 +63,7 @@ define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test3_x86_avx2_pblendd_128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1)
@@ -72,7 +72,7 @@ define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: test3_x86_avx2_pblendd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1)
diff --git a/test/CodeGen/X86/combine-fcopysign.ll b/test/CodeGen/X86/combine-fcopysign.ll
index 43e09bfe5fea..6298192226e0 100644
--- a/test/CodeGen/X86/combine-fcopysign.ll
+++ b/test/CodeGen/X86/combine-fcopysign.ll
@@ -9,7 +9,7 @@
; copysign(x, c1) -> fabs(x) iff ispos(c1)
define <4 x float> @combine_vec_fcopysign_pos_constant0(<4 x float> %x) {
; SSE-LABEL: combine_vec_fcopysign_pos_constant0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [2.000000e+00,2.000000e+00,2.000000e+00,2.000000e+00]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
@@ -17,7 +17,7 @@ define <4 x float> @combine_vec_fcopysign_pos_constant0(<4 x float> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_pos_constant0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1
@@ -31,7 +31,7 @@ define <4 x float> @combine_vec_fcopysign_pos_constant0(<4 x float> %x) {
define <4 x float> @combine_vec_fcopysign_pos_constant1(<4 x float> %x) {
; SSE-LABEL: combine_vec_fcopysign_pos_constant1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [0.000000e+00,2.000000e+00,4.000000e+00,8.000000e+00]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
@@ -39,7 +39,7 @@ define <4 x float> @combine_vec_fcopysign_pos_constant1(<4 x float> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_pos_constant1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
@@ -52,12 +52,12 @@ define <4 x float> @combine_vec_fcopysign_pos_constant1(<4 x float> %x) {
define <4 x float> @combine_vec_fcopysign_fabs_sgn(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: combine_vec_fcopysign_fabs_sgn:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fabs_sgn:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -69,7 +69,7 @@ define <4 x float> @combine_vec_fcopysign_fabs_sgn(<4 x float> %x, <4 x float> %
; copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
define <4 x float> @combine_vec_fcopysign_neg_constant0(<4 x float> %x) {
; SSE-LABEL: combine_vec_fcopysign_neg_constant0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [-2.000000e+00,-2.000000e+00,-2.000000e+00,-2.000000e+00]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
@@ -77,7 +77,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant0(<4 x float> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_neg_constant0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1
@@ -91,7 +91,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant0(<4 x float> %x) {
define <4 x float> @combine_vec_fcopysign_neg_constant1(<4 x float> %x) {
; SSE-LABEL: combine_vec_fcopysign_neg_constant1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [-0.000000e+00,-2.000000e+00,-4.000000e+00,-8.000000e+00]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
@@ -99,7 +99,7 @@ define <4 x float> @combine_vec_fcopysign_neg_constant1(<4 x float> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_neg_constant1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
@@ -112,21 +112,16 @@ define <4 x float> @combine_vec_fcopysign_neg_constant1(<4 x float> %x) {
define <4 x float> @combine_vec_fcopysign_fneg_fabs_sgn(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: combine_vec_fcopysign_fneg_fabs_sgn:
-; SSE: # BB#0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
-; SSE-NEXT: orps %xmm2, %xmm1
-; SSE-NEXT: andps %xmm2, %xmm1
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: orps %xmm1, %xmm0
+; SSE-NEXT: orps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn:
-; AVX: # BB#0:
+; AVX: # %bb.0:
+; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
-; AVX-NEXT: vorps %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm3
-; AVX-NEXT: vandps %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
@@ -138,14 +133,14 @@ define <4 x float> @combine_vec_fcopysign_fneg_fabs_sgn(<4 x float> %x, <4 x flo
; copysign(fabs(x), y) -> copysign(x, y)
define <4 x float> @combine_vec_fcopysign_fabs_mag(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: combine_vec_fcopysign_fabs_mag:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fabs_mag:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
@@ -160,14 +155,14 @@ define <4 x float> @combine_vec_fcopysign_fabs_mag(<4 x float> %x, <4 x float> %
; copysign(fneg(x), y) -> copysign(x, y)
define <4 x float> @combine_vec_fcopysign_fneg_mag(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: combine_vec_fcopysign_fneg_mag:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fneg_mag:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
@@ -182,14 +177,14 @@ define <4 x float> @combine_vec_fcopysign_fneg_mag(<4 x float> %x, <4 x float> %
; copysign(copysign(x,z), y) -> copysign(x, y)
define <4 x float> @combine_vec_fcopysign_fcopysign_mag(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: combine_vec_fcopysign_fcopysign_mag:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fcopysign_mag:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
@@ -204,14 +199,14 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_mag(<4 x float> %x, <4 x flo
; copysign(x, copysign(y,z)) -> copysign(x, z)
define <4 x float> @combine_vec_fcopysign_fcopysign_sgn(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: combine_vec_fcopysign_fcopysign_sgn:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm2
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fcopysign_sgn:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vandps %xmm1, %xmm2, %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
@@ -226,7 +221,7 @@ define <4 x float> @combine_vec_fcopysign_fcopysign_sgn(<4 x float> %x, <4 x flo
; copysign(x, fp_extend(y)) -> copysign(x, y)
define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float> %y) {
; SSE-LABEL: combine_vec_fcopysign_fpext_sgn:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: cvtss2sd %xmm2, %xmm4
; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
@@ -245,7 +240,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
; SSE-NEXT: cvtss2sd %xmm5, %xmm4
; SSE-NEXT: andps %xmm8, %xmm4
; SSE-NEXT: orps %xmm0, %xmm4
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: andps %xmm7, %xmm0
@@ -257,12 +252,12 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
; SSE-NEXT: cvtss2sd %xmm6, %xmm0
; SSE-NEXT: andps %xmm8, %xmm0
; SSE-NEXT: orps %xmm0, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fpext_sgn:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vcvtps2pd %xmm1, %ymm1
@@ -278,7 +273,7 @@ define <4 x double> @combine_vec_fcopysign_fpext_sgn(<4 x double> %x, <4 x float
; copysign(x, fp_round(y)) -> copysign(x, y)
define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x double> %y) {
; SSE-LABEL: combine_vec_fcopysign_fptrunc_sgn:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm5
; SSE-NEXT: andps %xmm5, %xmm0
@@ -312,7 +307,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_fcopysign_fptrunc_sgn:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1
diff --git a/test/CodeGen/X86/combine-lds.ll b/test/CodeGen/X86/combine-lds.ll
index b49d081a64f1..f2c81a4959bd 100644
--- a/test/CodeGen/X86/combine-lds.ll
+++ b/test/CodeGen/X86/combine-lds.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep fldl | count 1
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | grep fldl | count 1
define double @doload64(i64 %x) nounwind {
%tmp717 = bitcast i64 %x to double
diff --git a/test/CodeGen/X86/combine-mul.ll b/test/CodeGen/X86/combine-mul.ll
index 3a8058280245..f021788e245f 100644
--- a/test/CodeGen/X86/combine-mul.ll
+++ b/test/CodeGen/X86/combine-mul.ll
@@ -5,11 +5,11 @@
; fold (mul undef, x) -> 0
define <4 x i32> @combine_vec_mul_undef0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_undef0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_undef0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = mul <4 x i32> undef, %x
ret <4 x i32> %1
@@ -18,11 +18,11 @@ define <4 x i32> @combine_vec_mul_undef0(<4 x i32> %x) {
; fold (mul x, undef) -> 0
define <4 x i32> @combine_vec_mul_undef1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_undef1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_undef1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, undef
ret <4 x i32> %1
@@ -31,12 +31,12 @@ define <4 x i32> @combine_vec_mul_undef1(<4 x i32> %x) {
; fold (mul x, 0) -> 0
define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, zeroinitializer
@@ -46,11 +46,11 @@ define <4 x i32> @combine_vec_mul_zero(<4 x i32> %x) {
; fold (mul x, 1) -> x
define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_one:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_one:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %1
@@ -59,14 +59,14 @@ define <4 x i32> @combine_vec_mul_one(<4 x i32> %x) {
; fold (mul x, -1) -> 0-x
define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negone:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_negone:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -77,12 +77,12 @@ define <4 x i32> @combine_vec_mul_negone(<4 x i32> %x) {
; fold (mul x, (1 << c)) -> x << c
define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -91,13 +91,13 @@ define <4 x i32> @combine_vec_mul_pow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_pow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2b:
-; AVX: # BB#0:
-; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 1, i32 2, i32 4, i32 16>
ret <4 x i32> %1
@@ -105,31 +105,20 @@ define <4 x i32> @combine_vec_mul_pow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_pow2c:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2]
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm0
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,16]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm3, %xmm1
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psllq $1, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psllq $4, %xmm0
+; SSE-NEXT: psllq $2, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_pow2c:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,16]
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = mul <4 x i64> %x, <i64 1, i64 2, i64 4, i64 16>
ret <4 x i64> %1
@@ -138,7 +127,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) {
; fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
@@ -146,7 +135,7 @@ define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_negpow2a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
@@ -157,12 +146,12 @@ define <4 x i32> @combine_vec_mul_negpow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_negpow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_negpow2b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 -1, i32 -2, i32 -4, i32 -16>
@@ -171,7 +160,7 @@ define <4 x i32> @combine_vec_mul_negpow2b(<4 x i32> %x) {
define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
; SSE-LABEL: combine_vec_mul_negpow2c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
@@ -195,8 +184,8 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_negpow2c:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600]
@@ -213,12 +202,12 @@ define <4 x i64> @combine_vec_mul_negpow2c(<4 x i64> %x) {
; (mul (shl X, c1), c2) -> (mul X, c2 << c1)
define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_shl_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 1, i32 2, i32 8, i32 16>
@@ -229,13 +218,13 @@ define <4 x i32> @combine_vec_mul_shl_const(<4 x i32> %x) {
; (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one use.
define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -246,13 +235,13 @@ define <4 x i32> @combine_vec_mul_shl_oneuse0(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_oneuse1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld %xmm1, %xmm0
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_oneuse1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -263,14 +252,14 @@ define <4 x i32> @combine_vec_mul_shl_oneuse1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: pmulld %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -283,14 +272,14 @@ define <4 x i32> @combine_vec_mul_shl_multiuse0(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_mul_shl_multiuse1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: pmulld %xmm0, %xmm1
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_shl_multiuse1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -305,13 +294,13 @@ define <4 x i32> @combine_vec_mul_shl_multiuse1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
; SSE-LABEL: combine_vec_mul_add:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_mul_add:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -319,3 +308,37 @@ define <4 x i32> @combine_vec_mul_add(<4 x i32> %x) {
%2 = mul <4 x i32> %1, <i32 4, i32 6, i32 2, i32 0>
ret <4 x i32> %2
}
+
+; This would infinite loop because DAGCombiner wants to turn this into a shift,
+; but x86 lowering wants to avoid non-uniform vector shift amounts.
+
+define <16 x i8> @PR35579(<16 x i8> %x) {
+; SSE-LABEL: PR35579:
+; SSE: # %bb.0:
+; SSE-NEXT: pmovsxbw %xmm0, %xmm1
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT: pmovsxbw %xmm0, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR35579:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %r = mul <16 x i8> %x, <i8 0, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1, i8 8, i8 1, i8 2, i8 1, i8 4, i8 1, i8 2, i8 1>
+ ret <16 x i8> %r
+}
+
diff --git a/test/CodeGen/X86/combine-multiplies.ll b/test/CodeGen/X86/combine-multiplies.ll
index ab30b9b489e5..98fc16ca2269 100644
--- a/test/CodeGen/X86/combine-multiplies.ll
+++ b/test/CodeGen/X86/combine-multiplies.ll
@@ -33,7 +33,7 @@
; Function Attrs: nounwind
define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind {
; CHECK-LABEL: testCombineMultiplies:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -74,7 +74,7 @@ entry:
; Output looks something like this:
;
; testCombineMultiplies_splat: # @testCombineMultiplies_splat
-; # BB#0: # %entry
+; # %bb.0: # %entry
; movdqa .LCPI1_0, %xmm1 # xmm1 = [11,11,11,11]
; paddd %xmm0, %xmm1
; movdqa .LCPI1_1, %xmm2 # xmm2 = [22,22,22,22]
@@ -104,7 +104,7 @@ entry:
; Function Attrs: nounwind
define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_splat:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11]
; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22]
@@ -138,7 +138,7 @@ entry:
; Function Attrs: nounwind
define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind {
; CHECK-LABEL: testCombineMultiplies_non_splat:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44]
; CHECK-NEXT: paddd %xmm0, %xmm1
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55]
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index d7f52d247988..1601c67dce25 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -3,7 +3,7 @@
define i32 @or_self(i32 %x) {
; CHECK-LABEL: or_self:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%or = or i32 %x, %x
@@ -12,7 +12,7 @@ define i32 @or_self(i32 %x) {
define <4 x i32> @or_self_vec(<4 x i32> %x) {
; CHECK-LABEL: or_self_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%or = or <4 x i32> %x, %x
ret <4 x i32> %or
@@ -23,7 +23,7 @@ define <4 x i32> @or_self_vec(<4 x i32> %x) {
define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
@@ -35,7 +35,7 @@ define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
@@ -47,7 +47,7 @@ define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
@@ -59,7 +59,7 @@ define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
@@ -71,7 +71,7 @@ define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
@@ -83,7 +83,7 @@ define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
@@ -95,7 +95,7 @@ define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
@@ -107,7 +107,7 @@ define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <2 x i64> %a, <i64 -1, i64 0>
@@ -119,7 +119,7 @@ define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
@@ -131,7 +131,7 @@ define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <2 x i64> %a, <i64 0, i64 -1>
@@ -143,7 +143,7 @@ define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
@@ -155,7 +155,7 @@ define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test12:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; CHECK-NEXT: retq
%and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
@@ -169,7 +169,7 @@ define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test13:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
@@ -181,8 +181,8 @@ define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test14:
-; CHECK: # BB#0:
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK: # %bb.0:
+; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
@@ -193,7 +193,7 @@ define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test15:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,1],xmm0[2,1]
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -206,9 +206,9 @@ define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test16:
-; CHECK: # BB#0:
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
%shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
%shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
@@ -222,7 +222,7 @@ define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test17:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psllq $32, %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm1 = xmm1[0],zero
; CHECK-NEXT: por %xmm1, %xmm0
@@ -236,7 +236,7 @@ define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test18:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
@@ -252,7 +252,7 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test19:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,3]
; CHECK-NEXT: pxor %xmm3, %xmm3
; CHECK-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
@@ -269,7 +269,7 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test20:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: retq
@@ -282,7 +282,7 @@ define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: test21:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
@@ -298,7 +298,7 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test22:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: retq
%bc1 = bitcast <2 x double> %a0 to <2 x i64>
@@ -313,7 +313,7 @@ define <2 x double> @test22(<2 x double> %a0, <2 x double> %a1) {
define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test23:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; CHECK-NEXT: retq
%bc1 = bitcast <4 x float> %a0 to <4 x i32>
@@ -328,7 +328,7 @@ define <4 x float> @test23(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test24:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: retq
%bc1 = bitcast <4 x float> %a0 to <2 x i64>
@@ -343,7 +343,7 @@ define <4 x float> @test24(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test25(<4 x float> %a0) {
; CHECK-LABEL: test25:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blendps {{.*#+}} xmm0 = mem[0],xmm0[1,2],mem[3]
; CHECK-NEXT: retq
%bc1 = bitcast <4 x float> %a0 to <4 x i32>
@@ -361,7 +361,7 @@ define <4 x float> @test25(<4 x float> %a0) {
; handle legal vector value types.
define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: test_crash:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
@@ -374,7 +374,7 @@ define <4 x i8> @test_crash(<4 x i8> %a, <4 x i8> %b) {
define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2b:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
@@ -385,7 +385,7 @@ define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2c:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32><i32 0, i32 0, i32 6, i32 7>
@@ -397,7 +397,7 @@ define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2d:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
@@ -410,7 +410,7 @@ define <4 x i32> @test2d(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2e:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 undef, i32 4, i32 2, i32 3>
@@ -421,7 +421,7 @@ define <4 x i32> @test2e(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: test2f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; CHECK-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>, <4 x i32><i32 4, i32 4, i32 2, i32 3>
@@ -435,7 +435,7 @@ define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
; CHECK-LABEL: or_and_v2i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
@@ -448,7 +448,7 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
; CHECK-LABEL: or_and_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [3,3,3,3]
; CHECK-NEXT: retq
%1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
@@ -460,7 +460,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
; CHECK-LABEL: or_zext_v2i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295]
; CHECK-NEXT: retq
%1 = zext <2 x i32> %a0 to <2 x i64>
@@ -470,7 +470,7 @@ define <2 x i64> @or_zext_v2i32(<2 x i32> %a0) {
define <4 x i32> @or_zext_v4i16(<4 x i16> %a0) {
; CHECK-LABEL: or_zext_v4i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [65535,65535,65535,65535]
; CHECK-NEXT: retq
%1 = zext <4 x i16> %a0 to <4 x i32>
diff --git a/test/CodeGen/X86/combine-pmuldq.ll b/test/CodeGen/X86/combine-pmuldq.ll
index 09a142aa831b..53ab87a386b3 100644
--- a/test/CodeGen/X86/combine-pmuldq.ll
+++ b/test/CodeGen/X86/combine-pmuldq.ll
@@ -5,7 +5,7 @@
; TODO - shuffle+sext are superfluous
define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_sext_pmuldq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pmovsxdq %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -14,7 +14,7 @@ define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_shuffle_sext_pmuldq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -32,7 +32,7 @@ define <2 x i64> @combine_shuffle_sext_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; TODO - shuffle+zext are superfluous
define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zext_pmuludq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -41,7 +41,7 @@ define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_shuffle_zext_pmuludq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -59,7 +59,7 @@ define <2 x i64> @combine_shuffle_zext_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; TODO - blends are superfluous
define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
@@ -67,7 +67,7 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_shuffle_zero_pmuludq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
@@ -84,7 +84,7 @@ define <2 x i64> @combine_shuffle_zero_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
; TODO - blends are superfluous
define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: combine_shuffle_zero_pmuludq_256:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm4, %xmm4
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
@@ -95,8 +95,8 @@ define <4 x i64> @combine_shuffle_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1)
; SSE-NEXT: retq
;
; AVX-LABEL: combine_shuffle_zero_pmuludq_256:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; AVX-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/combine-rotates.ll b/test/CodeGen/X86/combine-rotates.ll
index 0d74c937af33..e75973af05ed 100644
--- a/test/CodeGen/X86/combine-rotates.ll
+++ b/test/CodeGen/X86/combine-rotates.ll
@@ -5,12 +5,12 @@
; fold (rot (rot x, c1), c2) -> rot x, c1+c2
define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
; XOP-LABEL: combine_vec_rot_rot:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_rot:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
@@ -24,12 +24,12 @@ define <4 x i32> @combine_vec_rot_rot(<4 x i32> %x) {
define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
; XOP-LABEL: combine_vec_rot_rot_splat:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotd $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_rot_splat:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprold $7, %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -43,11 +43,11 @@ define <4 x i32> @combine_vec_rot_rot_splat(<4 x i32> %x) {
define <4 x i32> @combine_vec_rot_rot_splat_zero(<4 x i32> %x) {
; XOP-LABEL: combine_vec_rot_rot_splat_zero:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: retq
;
; AVX512-LABEL: combine_vec_rot_rot_splat_zero:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
%2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
diff --git a/test/CodeGen/X86/combine-sdiv.ll b/test/CodeGen/X86/combine-sdiv.ll
index ddb1786e37d5..8fb30a2594be 100644
--- a/test/CodeGen/X86/combine-sdiv.ll
+++ b/test/CodeGen/X86/combine-sdiv.ll
@@ -6,11 +6,11 @@
; fold (sdiv undef, x) -> 0
define <4 x i32> @combine_vec_sdiv_undef0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_undef0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_undef0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = sdiv <4 x i32> undef, %x
ret <4 x i32> %1
@@ -19,11 +19,11 @@ define <4 x i32> @combine_vec_sdiv_undef0(<4 x i32> %x) {
; fold (sdiv x, undef) -> undef
define <4 x i32> @combine_vec_sdiv_undef1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_undef1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_undef1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = sdiv <4 x i32> %x, undef
ret <4 x i32> %1
@@ -32,11 +32,11 @@ define <4 x i32> @combine_vec_sdiv_undef1(<4 x i32> %x) {
; fold (sdiv x, 1) -> x
define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_one:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_one:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
ret <4 x i32> %1
@@ -45,14 +45,14 @@ define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; fold (sdiv x, -1) -> 0 - x
define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_negone:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -63,13 +63,13 @@ define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pos0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -80,7 +80,7 @@ define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm1
@@ -94,7 +94,7 @@ define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
@@ -105,7 +105,7 @@ define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pos1:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -117,7 +117,7 @@ define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
@@ -127,7 +127,7 @@ define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -139,7 +139,7 @@ define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_sdiv_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pextrd $1, %xmm0, %eax
; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: sarl $31, %ecx
@@ -164,7 +164,7 @@ define <4 x i32> @combine_vec_sdiv_by_pow2b(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrd $1, %xmm0, %eax
; AVX-NEXT: movl %eax, %ecx
; AVX-NEXT: sarl $31, %ecx
diff --git a/test/CodeGen/X86/combine-sext-in-reg.ll b/test/CodeGen/X86/combine-sext-in-reg.ll
index 3e60f3bf95ef..686945a7bcd9 100644
--- a/test/CodeGen/X86/combine-sext-in-reg.ll
+++ b/test/CodeGen/X86/combine-sext-in-reg.ll
@@ -5,7 +5,7 @@
; fold sextinreg(zext) -> sext
define <4 x i64> @sextinreg_zext_v16i8_4i64(<16 x i8> %a0) {
; SSE-LABEL: sextinreg_zext_v16i8_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmovsxbq %xmm0, %xmm2
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pmovsxbq %xmm0, %xmm1
@@ -13,7 +13,7 @@ define <4 x i64> @sextinreg_zext_v16i8_4i64(<16 x i8> %a0) {
; SSE-NEXT: retq
;
; AVX-LABEL: sextinreg_zext_v16i8_4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -26,7 +26,7 @@ define <4 x i64> @sextinreg_zext_v16i8_4i64(<16 x i8> %a0) {
; fold sextinreg(zext(sext)) -> sext
define <4 x i64> @sextinreg_zext_sext_v16i8_4i64(<16 x i8> %a0) {
; SSE-LABEL: sextinreg_zext_sext_v16i8_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmovsxbq %xmm0, %xmm2
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: pmovsxbq %xmm0, %xmm1
@@ -34,7 +34,7 @@ define <4 x i64> @sextinreg_zext_sext_v16i8_4i64(<16 x i8> %a0) {
; SSE-NEXT: retq
;
; AVX-LABEL: sextinreg_zext_sext_v16i8_4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll
index a6491a0a8694..6effd2ad0441 100644
--- a/test/CodeGen/X86/combine-shl.ll
+++ b/test/CodeGen/X86/combine-shl.ll
@@ -5,12 +5,12 @@
; fold (shl 0, x) -> 0
define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> zeroinitializer, %x
@@ -20,11 +20,11 @@ define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) {
; fold (shl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_outofrange0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_outofrange0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
ret <4 x i32> %1
@@ -32,12 +32,11 @@ define <4 x i32> @combine_vec_shl_outofrange0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_outofrange1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_outofrange1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
ret <4 x i32> %1
@@ -45,11 +44,11 @@ define <4 x i32> @combine_vec_shl_outofrange1(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; SSE-LABEL: combine_vec_shl_outofrange2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_outofrange2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = and <4 x i32> %a0, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%2 = shl <4 x i32> %1, <i32 33, i32 33, i32 33, i32 33>
@@ -59,11 +58,11 @@ define <4 x i32> @combine_vec_shl_outofrange2(<4 x i32> %a0) {
; fold (shl x, 0) -> x
define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_by_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_by_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, zeroinitializer
ret <4 x i32> %1
@@ -72,12 +71,12 @@ define <4 x i32> @combine_vec_shl_by_zero(<4 x i32> %x) {
; if (shl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_known_zero0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = and <4 x i32> %x, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>
@@ -87,13 +86,13 @@ define <4 x i32> @combine_vec_shl_known_zero0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_known_zero1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_known_zero1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -105,7 +104,7 @@ define <4 x i32> @combine_vec_shl_known_zero1(<4 x i32> %x) {
; fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_shl_trunc_and:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $23, %xmm1
@@ -115,7 +114,7 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_trunc_and:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
@@ -131,12 +130,12 @@ define <4 x i32> @combine_vec_shl_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; fold (shl (shl x, c1), c2) -> (shl x, (add c1, c2))
define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $6, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -146,13 +145,12 @@ define <4 x i32> @combine_vec_shl_shl0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
@@ -163,12 +161,12 @@ define <4 x i32> @combine_vec_shl_shl1(<4 x i32> %x) {
; fold (shl (shl x, c1), c2) -> 0
define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shlr_zero0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shlr_zero0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
@@ -178,14 +176,13 @@ define <4 x i32> @combine_vec_shl_shlr_zero0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_shl_zero1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_shl_zero1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
%2 = shl <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
@@ -195,18 +192,17 @@ define <4 x i32> @combine_vec_shl_shl_zero1(<4 x i32> %x) {
; fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl0:
-; SSE: # BB#0:
-; SSE-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE-NEXT: pslld $20, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: pmovsxwd %xmm0, %xmm1
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: pslld $20, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pslld $20, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl0:
-; AVX: # BB#0:
-; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vpslld $20, %ymm0, %ymm0
; AVX-NEXT: retq
%1 = shl <8 x i16> %x, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
@@ -217,7 +213,7 @@ define <8 x i32> @combine_vec_shl_ext_shl0(<8 x i16> %x) {
define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_ext_shl1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pmovsxwd %xmm1, %xmm1
@@ -227,7 +223,7 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ext_shl1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
@@ -241,7 +237,7 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
; fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_zext_lshr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -250,7 +246,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
@@ -262,7 +258,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE-LABEL: combine_vec_shl_zext_lshr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6],xmm1[7]
@@ -283,7 +279,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_zext_lshr1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,8]
; AVX-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
@@ -301,12 +297,12 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
; fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
define <4 x i32> @combine_vec_shl_ge_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_extact0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr exact <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -316,7 +312,7 @@ define <4 x i32> @combine_vec_shl_ge_ashr_extact0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ge_ashr_extact1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $8, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -331,7 +327,7 @@ define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ge_ashr_extact1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -343,12 +339,12 @@ define <4 x i32> @combine_vec_shl_ge_ashr_extact1(<4 x i32> %x) {
; fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 > C2
define <4 x i32> @combine_vec_shl_lt_ashr_extact0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_extact0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrad $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr exact <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
@@ -358,7 +354,7 @@ define <4 x i32> @combine_vec_shl_lt_ashr_extact0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_lt_ashr_extact1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $8, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -373,7 +369,7 @@ define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_lt_ashr_extact1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -385,13 +381,13 @@ define <4 x i32> @combine_vec_shl_lt_ashr_extact1(<4 x i32> %x) {
; fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) if C2 > C1
define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -403,7 +399,7 @@ define <4 x i32> @combine_vec_shl_gt_lshr0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_gt_lshr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $8, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -418,7 +414,7 @@ define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_gt_lshr1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -430,13 +426,13 @@ define <4 x i32> @combine_vec_shl_gt_lshr1(<4 x i32> %x) {
; fold (shl (srl x, c1), c2) -> (and (srl x, (sub c1, c2)), MASK) if C1 >= C2
define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_le_lshr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816]
; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -448,7 +444,7 @@ define <4 x i32> @combine_vec_shl_le_lshr0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_le_lshr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $8, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -463,7 +459,7 @@ define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_le_lshr1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -475,12 +471,12 @@ define <4 x i32> @combine_vec_shl_le_lshr1(<4 x i32> %x) {
; fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ashr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264]
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -491,12 +487,12 @@ define <4 x i32> @combine_vec_shl_ashr0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_ashr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_ashr1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
@@ -507,13 +503,13 @@ define <4 x i32> @combine_vec_shl_ashr1(<4 x i32> %x) {
; fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_add0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -525,13 +521,13 @@ define <4 x i32> @combine_vec_shl_add0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_add1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_add1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -540,19 +536,19 @@ define <4 x i32> @combine_vec_shl_add1(<4 x i32> %x) {
ret <4 x i32> %2
}
-; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:
-; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE: # %bb.0:
; SSE-NEXT: pslld $2, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_or0:
-; AVX: # BB#0:
-; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
+; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
%2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
@@ -561,15 +557,15 @@ define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or1:
-; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_or1:
-; AVX: # BB#0:
-; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
@@ -579,12 +575,12 @@ define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_mul0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_mul0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -595,12 +591,12 @@ define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
define <4 x i32> @combine_vec_shl_mul1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_mul1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_shl_mul1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = mul <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
diff --git a/test/CodeGen/X86/combine-sra.ll b/test/CodeGen/X86/combine-sra.ll
index 49ebce4857e0..436f48e14b08 100644
--- a/test/CodeGen/X86/combine-sra.ll
+++ b/test/CodeGen/X86/combine-sra.ll
@@ -5,12 +5,12 @@
; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr <4 x i32> zeroinitializer, %x
@@ -20,12 +20,12 @@ define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; fold (sra -1, x) -> -1
define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_allones:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_allones:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
@@ -35,11 +35,11 @@ define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; fold (sra x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_outofrange0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_outofrange0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
ret <4 x i32> %1
@@ -47,13 +47,11 @@ define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_outofrange1:
-; SSE: # BB#0:
-; SSE-NEXT: psrad $31, %xmm0
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_outofrange1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
ret <4 x i32> %1
@@ -62,11 +60,11 @@ define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; fold (sra x, 0) -> x
define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_by_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_by_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, zeroinitializer
ret <4 x i32> %1
@@ -75,12 +73,12 @@ define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrad $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_ashr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrad $6, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -90,31 +88,21 @@ define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrad $10, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrad $6, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: psrad $3, %xmm0
-; SSE-NEXT: psrad $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrad $7, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrad $5, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrad $6, %xmm0
-; SSE-NEXT: psrad $4, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrad $8, %xmm1
+; SSE-NEXT: psrad $4, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_ashr1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
@@ -124,43 +112,53 @@ define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: combine_vec_ashr_ashr2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
+ %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
+ ret <4 x i32> %2
+}
+
+define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
+; SSE-LABEL: combine_vec_ashr_ashr3:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $20, %xmm1
+; SSE-NEXT: psrad $27, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $18, %xmm2
+; SSE-NEXT: psrad $5, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $19, %xmm1
-; SSE-NEXT: psrad $17, %xmm0
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: psrad $1, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $28, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $26, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $27, %xmm1
-; SSE-NEXT: psrad $25, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE-NEXT: psrad $10, %xmm1
+; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_vec_ashr_ashr2:
-; AVX: # BB#0:
+; AVX-LABEL: combine_vec_ashr_ashr3:
+; AVX: # %bb.0:
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
- %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
- %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
+ %1 = ashr <4 x i32> %x, <i32 1, i32 5, i32 50, i32 27>
+ %2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32 0>
ret <4 x i32> %2
}
; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_ashr_trunc_and:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
@@ -183,7 +181,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_trunc_and:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
@@ -200,7 +198,7 @@ define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_lshr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -216,7 +214,7 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -233,7 +231,7 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -250,7 +248,7 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
@@ -265,7 +263,7 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; If the sign bit is known to be zero, switch this to a SRL.
define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -287,7 +285,7 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_positive:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -298,12 +296,12 @@ define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive_splat:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_ashr_positive_splat:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = and <4 x i32> %x, <i32 1023, i32 1023, i32 1023, i32 1023>
diff --git a/test/CodeGen/X86/combine-srem.ll b/test/CodeGen/X86/combine-srem.ll
index 6c1956ac77c9..336c6b8ac03e 100644
--- a/test/CodeGen/X86/combine-srem.ll
+++ b/test/CodeGen/X86/combine-srem.ll
@@ -6,11 +6,11 @@
; fold (srem undef, x) -> 0
define <4 x i32> @combine_vec_srem_undef0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_srem_undef0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_srem_undef0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = srem <4 x i32> undef, %x
ret <4 x i32> %1
@@ -19,11 +19,11 @@ define <4 x i32> @combine_vec_srem_undef0(<4 x i32> %x) {
; fold (srem x, undef) -> undef
define <4 x i32> @combine_vec_srem_undef1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_srem_undef1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_srem_undef1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = srem <4 x i32> %x, undef
ret <4 x i32> %1
@@ -32,17 +32,17 @@ define <4 x i32> @combine_vec_srem_undef1(<4 x i32> %x) {
; fold (srem x, y) -> (urem x, y) iff x and y are positive
define <4 x i32> @combine_vec_srem_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_srem_by_pos0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_srem_by_pos0:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_srem_by_pos0:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -53,12 +53,12 @@ define <4 x i32> @combine_vec_srem_by_pos0(<4 x i32> %x) {
define <4 x i32> @combine_vec_srem_by_pos1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_srem_by_pos1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_srem_by_pos1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll
index 473fae19f4fd..58452855055f 100644
--- a/test/CodeGen/X86/combine-srl.ll
+++ b/test/CodeGen/X86/combine-srl.ll
@@ -5,12 +5,12 @@
; fold (srl 0, x) -> 0
define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> zeroinitializer, %x
@@ -20,11 +20,11 @@ define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) {
; fold (srl x, c >= size(x)) -> undef
define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_outofrange0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_outofrange0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
ret <4 x i32> %1
@@ -32,13 +32,11 @@ define <4 x i32> @combine_vec_lshr_outofrange0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_outofrange1:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_outofrange1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
ret <4 x i32> %1
@@ -47,11 +45,11 @@ define <4 x i32> @combine_vec_lshr_outofrange1(<4 x i32> %x) {
; fold (srl x, 0) -> x
define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_by_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_by_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, zeroinitializer
ret <4 x i32> %1
@@ -60,12 +58,12 @@ define <4 x i32> @combine_vec_lshr_by_zero(<4 x i32> %x) {
; if (srl x, c) is known to be zero, return 0
define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = and <4 x i32> %x, <i32 15, i32 15, i32 15, i32 15>
@@ -75,7 +73,7 @@ define <4 x i32> @combine_vec_lshr_known_zero0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_known_zero1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $11, %xmm1
@@ -90,7 +88,7 @@ define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_known_zero1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
@@ -103,12 +101,12 @@ define <4 x i32> @combine_vec_lshr_known_zero1(<4 x i32> %x) {
; fold (srl (srl x, c1), c2) -> (srl x, (add c1, c2))
define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $6, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $6, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -118,31 +116,21 @@ define <4 x i32> @combine_vec_lshr_lshr0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrld $10, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrld $6, %xmm2
+; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $2, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: psrld $3, %xmm0
-; SSE-NEXT: psrld $1, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $7, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrld $5, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $6, %xmm0
-; SSE-NEXT: psrld $4, %xmm1
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: psrld $8, %xmm1
+; SSE-NEXT: psrld $4, %xmm0
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
@@ -153,12 +141,12 @@ define <4 x i32> @combine_vec_lshr_lshr1(<4 x i32> %x) {
; fold (srl (srl x, c1), c2) -> 0
define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
@@ -168,33 +156,13 @@ define <4 x i32> @combine_vec_lshr_lshr_zero0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lshr_zero1:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $20, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $18, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $19, %xmm1
-; SSE-NEXT: psrld $17, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $28, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrld $26, %xmm2
-; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $27, %xmm1
-; SSE-NEXT: psrld $25, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lshr_zero1:
-; AVX: # BB#0:
-; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
%2 = lshr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
@@ -204,18 +172,17 @@ define <4 x i32> @combine_vec_lshr_lshr_zero1(<4 x i32> %x) {
; fold (srl (trunc (srl x, c1)), c2) -> (trunc (srl x, (add c1, c2)))
define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrlq $48, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: packusdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $48, %ymm0, %ymm0
-; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
@@ -226,7 +193,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr0(<4 x i64> %x) {
define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlq $35, %xmm2
; SSE-NEXT: psrlq $34, %xmm1
@@ -249,7 +216,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -265,12 +232,12 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr1(<4 x i64> %x) {
; fold (srl (trunc (srl x, c1)), c2) -> 0
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = lshr <4 x i64> %x, <i64 48, i64 48, i64 48, i64 48>
@@ -281,7 +248,7 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero0(<4 x i64> %x) {
define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-LABEL: combine_vec_lshr_trunc_lshr_zero1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlq $51, %xmm2
; SSE-NEXT: psrlq $50, %xmm1
@@ -290,13 +257,13 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-NEXT: psrlq $49, %xmm2
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: packusdw %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $27, %xmm1
-; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrld $25, %xmm2
; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $26, %xmm1
; SSE-NEXT: psrld $24, %xmm0
; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
@@ -304,10 +271,10 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_lshr_zero1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -320,12 +287,12 @@ define <4 x i32> @combine_vec_lshr_trunc_lshr_zero1(<4 x i64> %x) {
; fold (srl (shl x, c), c) -> (and x, cst2)
define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823]
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -336,12 +303,12 @@ define <4 x i32> @combine_vec_lshr_shl_mask0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_shl_mask1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_shl_mask1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shl <4 x i32> %x, <i32 2, i32 3, i32 4, i32 5>
@@ -352,12 +319,12 @@ define <4 x i32> @combine_vec_lshr_shl_mask1(<4 x i32> %x) {
; fold (srl (sra X, Y), 31) -> (srl X, 31)
define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_lshr_ashr_sign:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_ashr_sign:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = ashr <4 x i32> %x, %y
@@ -368,14 +335,14 @@ define <4 x i32> @combine_vec_lshr_ashr_sign(<4 x i32> %x, <4 x i32> %y) {
; fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: psrld $4, %xmm0
; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lzcnt_bit0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $4, %xmm0, %xmm0
@@ -390,7 +357,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit0(<4 x i32> %x) {
define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE-NEXT: movdqa %xmm0, %xmm1
@@ -422,7 +389,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
@@ -457,7 +424,7 @@ declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1)
; fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-LABEL: combine_vec_lshr_trunc_and:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
@@ -480,7 +447,7 @@ define <4 x i32> @combine_vec_lshr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_lshr_trunc_and:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
diff --git a/test/CodeGen/X86/combine-sse41-intrinsics.ll b/test/CodeGen/X86/combine-sse41-intrinsics.ll
index 0c8e7b317ec6..698e5bc423c1 100644
--- a/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -4,7 +4,7 @@
define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_pd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
ret <2 x double> %1
@@ -12,7 +12,7 @@ define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1)
define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
ret <4 x float> %1
@@ -20,7 +20,7 @@ define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblend_w:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
ret <8 x i16> %1
@@ -28,7 +28,7 @@ define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test2_x86_sse41_blend_pd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
@@ -37,7 +37,7 @@ define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1
define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test2_x86_sse41_blend_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
@@ -46,7 +46,7 @@ define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test2_x86_sse41_pblend_w:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
@@ -55,7 +55,7 @@ define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_pd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
ret <2 x double> %1
@@ -63,7 +63,7 @@ define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
ret <4 x float> %1
@@ -71,7 +71,7 @@ define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
; CHECK-LABEL: test3_x86_sse41_pblend_w:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
ret <8 x i16> %1
diff --git a/test/CodeGen/X86/combine-sub.ll b/test/CodeGen/X86/combine-sub.ll
index e062440b42ba..df5aba0f26cf 100644
--- a/test/CodeGen/X86/combine-sub.ll
+++ b/test/CodeGen/X86/combine-sub.ll
@@ -5,11 +5,11 @@
; fold (sub x, 0) -> x
define <4 x i32> @combine_vec_sub_zero(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = sub <4 x i32> %a, zeroinitializer
ret <4 x i32> %1
@@ -18,12 +18,12 @@ define <4 x i32> @combine_vec_sub_zero(<4 x i32> %a) {
; fold (sub x, x) -> 0
define <4 x i32> @combine_vec_sub_self(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_self:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_self:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %a, %a
@@ -33,12 +33,12 @@ define <4 x i32> @combine_vec_sub_self(<4 x i32> %a) {
; fold (sub x, c) -> (add x, -c)
define <4 x i32> @combine_vec_sub_constant(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sub_constant:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_constant:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
@@ -48,13 +48,13 @@ define <4 x i32> @combine_vec_sub_constant(<4 x i32> %x) {
; Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
define <4 x i32> @combine_vec_sub_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sub_negone:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_negone:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -65,12 +65,12 @@ define <4 x i32> @combine_vec_sub_negone(<4 x i32> %x) {
; fold A-(A-B) -> B
define <4 x i32> @combine_vec_sub_sub(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_sub_sub:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_sub:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %a, %b
@@ -81,12 +81,12 @@ define <4 x i32> @combine_vec_sub_sub(<4 x i32> %a, <4 x i32> %b) {
; fold (A+B)-A -> B
define <4 x i32> @combine_vec_sub_add0(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_sub_add0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_add0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = add <4 x i32> %a, %b
@@ -97,11 +97,11 @@ define <4 x i32> @combine_vec_sub_add0(<4 x i32> %a, <4 x i32> %b) {
; fold (A+B)-B -> A
define <4 x i32> @combine_vec_sub_add1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_vec_sub_add1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_add1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = add <4 x i32> %a, %b
%2 = sub <4 x i32> %1, %b
@@ -111,14 +111,14 @@ define <4 x i32> @combine_vec_sub_add1(<4 x i32> %a, <4 x i32> %b) {
; fold C2-(A+C1) -> (C2-C1)-A
define <4 x i32> @combine_vec_sub_constant_add(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_constant_add:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [3,1,4294967295,4294967293]
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_constant_add:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,1,4294967295,4294967293]
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -130,12 +130,12 @@ define <4 x i32> @combine_vec_sub_constant_add(<4 x i32> %a) {
; fold ((A+(B+C))-B) -> A+C
define <4 x i32> @combine_vec_sub_add_add(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_sub_add_add:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_add_add:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = add <4 x i32> %b, %c
@@ -147,12 +147,12 @@ define <4 x i32> @combine_vec_sub_add_add(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold ((A+(B-C))-B) -> A-C
define <4 x i32> @combine_vec_sub_add_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_sub_add_sub:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_add_sub:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %b, %c
@@ -164,12 +164,12 @@ define <4 x i32> @combine_vec_sub_add_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold ((A-(B-C))-C) -> A-B
define <4 x i32> @combine_vec_sub_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_vec_sub_sub_sub:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_sub_sub:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = sub <4 x i32> %b, %c
@@ -181,11 +181,11 @@ define <4 x i32> @combine_vec_sub_sub_sub(<4 x i32> %a, <4 x i32> %b, <4 x i32>
; fold undef-A -> undef
define <4 x i32> @combine_vec_sub_undef0(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_undef0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_undef0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = sub <4 x i32> undef, %a
ret <4 x i32> %1
@@ -194,11 +194,11 @@ define <4 x i32> @combine_vec_sub_undef0(<4 x i32> %a) {
; fold A-undef -> undef
define <4 x i32> @combine_vec_sub_undef1(<4 x i32> %a) {
; SSE-LABEL: combine_vec_sub_undef1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_undef1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = sub <4 x i32> %a, undef
ret <4 x i32> %1
@@ -207,14 +207,14 @@ define <4 x i32> @combine_vec_sub_undef1(<4 x i32> %a) {
; sub X, (sext Y i1) -> add X, (and Y 1)
define <4 x i32> @combine_vec_add_sext(<4 x i32> %x, <4 x i1> %y) {
; SSE-LABEL: combine_vec_add_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_add_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -227,14 +227,14 @@ define <4 x i32> @combine_vec_add_sext(<4 x i32> %x, <4 x i1> %y) {
; sub X, (sextinreg Y i1) -> add X, (and Y 1)
define <4 x i32> @combine_vec_sub_sextinreg(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_sub_sextinreg:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $31, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sub_sextinreg:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $31, %xmm1, %xmm1
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/combine-testm-and.ll b/test/CodeGen/X86/combine-testm-and.ll
index b10a4b5ed298..9c03bce7b6da 100644
--- a/test/CodeGen/X86/combine-testm-and.ll
+++ b/test/CodeGen/X86/combine-testm-and.ll
@@ -3,7 +3,7 @@
define i32 @combineTESTM_AND_1(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: combineTESTM_AND_1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0
; CHECK-NEXT: kmovb %k0, %eax
; CHECK-NEXT: vzeroupper
@@ -16,7 +16,7 @@ define i32 @combineTESTM_AND_1(<8 x i64> %a, <8 x i64> %b) {
define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
; CHECK-LABEL: combineTESTM_AND_2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vptestmq %zmm0, %zmm1, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
@@ -30,7 +30,7 @@ define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
; CHECK-LABEL: combineTESTM_AND_mask_3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
@@ -45,7 +45,7 @@ define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
define i32 @combineTESTM_AND_mask_4(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
; CHECK-LABEL: combineTESTM_AND_mask_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vptestmq (%rdi), %zmm0, %k0 {%k1}
; CHECK-NEXT: kmovb %k0, %eax
diff --git a/test/CodeGen/X86/combine-udiv.ll b/test/CodeGen/X86/combine-udiv.ll
index b6ae2fa6d157..d8e1ac216c99 100644
--- a/test/CodeGen/X86/combine-udiv.ll
+++ b/test/CodeGen/X86/combine-udiv.ll
@@ -6,11 +6,11 @@
; fold (udiv undef, x) -> 0
define <4 x i32> @combine_vec_udiv_undef0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_undef0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_undef0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = udiv <4 x i32> undef, %x
ret <4 x i32> %1
@@ -19,11 +19,11 @@ define <4 x i32> @combine_vec_udiv_undef0(<4 x i32> %x) {
; fold (udiv x, undef) -> undef
define <4 x i32> @combine_vec_udiv_undef1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_undef1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_undef1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = udiv <4 x i32> %x, undef
ret <4 x i32> %1
@@ -32,12 +32,12 @@ define <4 x i32> @combine_vec_udiv_undef1(<4 x i32> %x) {
; fold (udiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_udiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_pow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_udiv_by_pow2a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = udiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
@@ -46,7 +46,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_udiv_by_pow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $3, %xmm1
@@ -59,7 +59,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -69,7 +69,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2b:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = udiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
@@ -78,7 +78,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2b(<4 x i32> %x) {
define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_pow2c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: movdqa %xmm0, %xmm3
@@ -99,7 +99,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_udiv_by_pow2c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
@@ -115,7 +115,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_udiv_by_pow2c:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %y
@@ -126,7 +126,7 @@ define <4 x i32> @combine_vec_udiv_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_shl_pow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -148,7 +148,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
@@ -165,7 +165,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2]
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
@@ -177,7 +177,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_udiv_by_shl_pow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -199,7 +199,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
@@ -216,7 +216,7 @@ define <4 x i32> @combine_vec_udiv_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/combine-urem.ll b/test/CodeGen/X86/combine-urem.ll
index 4c7716bbaebe..2530136c0545 100644
--- a/test/CodeGen/X86/combine-urem.ll
+++ b/test/CodeGen/X86/combine-urem.ll
@@ -6,11 +6,11 @@
; fold (urem undef, x) -> 0
define <4 x i32> @combine_vec_urem_undef0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_undef0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_urem_undef0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = urem <4 x i32> undef, %x
ret <4 x i32> %1
@@ -19,11 +19,11 @@ define <4 x i32> @combine_vec_urem_undef0(<4 x i32> %x) {
; fold (urem x, undef) -> undef
define <4 x i32> @combine_vec_urem_undef1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_undef1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_urem_undef1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = urem <4 x i32> %x, undef
ret <4 x i32> %1
@@ -32,17 +32,17 @@ define <4 x i32> @combine_vec_urem_undef1(<4 x i32> %x) {
; fold (urem x, pow2) -> (and x, (pow2-1))
define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2a:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -52,12 +52,12 @@ define <4 x i32> @combine_vec_urem_by_pow2a(<4 x i32> %x) {
define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
; SSE-LABEL: combine_vec_urem_by_pow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_urem_by_pow2b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = urem <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
@@ -66,7 +66,7 @@ define <4 x i32> @combine_vec_urem_by_pow2b(<4 x i32> %x) {
define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $23, %xmm1
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
@@ -76,7 +76,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
@@ -86,7 +86,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2c:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1]
; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
@@ -100,7 +100,7 @@ define <4 x i32> @combine_vec_urem_by_pow2c(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_pow2d:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
@@ -125,7 +125,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_pow2d:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm2
@@ -145,7 +145,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_pow2d:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
@@ -160,7 +160,7 @@ define <4 x i32> @combine_vec_urem_by_pow2d(<4 x i32> %x, <4 x i32> %y) {
; fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $23, %xmm1
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
@@ -171,7 +171,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
@@ -182,7 +182,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2a:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
@@ -196,7 +196,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2a(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_urem_by_shl_pow2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $23, %xmm1
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
@@ -207,7 +207,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_urem_by_shl_pow2b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
@@ -218,7 +218,7 @@ define <4 x i32> @combine_vec_urem_by_shl_pow2b(<4 x i32> %x, <4 x i32> %y) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_urem_by_shl_pow2b:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,4,8,16]
; AVX2-NEXT: vpsllvd %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
diff --git a/test/CodeGen/X86/commute-3dnow.ll b/test/CodeGen/X86/commute-3dnow.ll
index b7a01efe2d3a..bf7892af44f5 100644
--- a/test/CodeGen/X86/commute-3dnow.ll
+++ b/test/CodeGen/X86/commute-3dnow.ll
@@ -4,7 +4,7 @@
define void @commute_m_pfadd(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfadd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -15,7 +15,7 @@ define void @commute_m_pfadd(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfadd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pfadd (%rsi), %mm0
; X64-NEXT: pfadd (%rdx), %mm0
@@ -33,7 +33,7 @@ declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx)
define void @commute_m_pfsub(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfsub:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -44,7 +44,7 @@ define void @commute_m_pfsub(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfsub:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pfsub (%rsi), %mm0
; X64-NEXT: pfsubr (%rdx), %mm0
@@ -62,7 +62,7 @@ declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx)
define void @commute_m_pfsubr(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfsubr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -73,7 +73,7 @@ define void @commute_m_pfsubr(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfsubr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pfsubr (%rsi), %mm0
; X64-NEXT: pfsub (%rdx), %mm0
@@ -91,7 +91,7 @@ declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx)
define void @commute_m_pfmul(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfmul:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -102,7 +102,7 @@ define void @commute_m_pfmul(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfmul:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pfmul (%rsi), %mm0
; X64-NEXT: pfmul (%rdx), %mm0
@@ -121,7 +121,7 @@ declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx)
; PFMAX can't commute without fast-math.
define void @commute_m_pfmax(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfmax:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -133,7 +133,7 @@ define void @commute_m_pfmax(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfmax:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movq (%rdx), %mm1
; X64-NEXT: pfmax (%rsi), %mm0
@@ -153,7 +153,7 @@ declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx)
; PFMIN can't commute without fast-math.
define void @commute_m_pfmin(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfmin:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -165,7 +165,7 @@ define void @commute_m_pfmin(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfmin:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movq (%rdx), %mm1
; X64-NEXT: pfmin (%rsi), %mm0
@@ -184,7 +184,7 @@ declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx)
define void @commute_m_pfcmpeq(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pfcmpeq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -195,7 +195,7 @@ define void @commute_m_pfcmpeq(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwin
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pfcmpeq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pfcmpeq (%rsi), %mm0
; X64-NEXT: pfcmpeq (%rdx), %mm0
@@ -213,7 +213,7 @@ declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx)
define void @commute_m_pavgusb(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pavgusb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -224,7 +224,7 @@ define void @commute_m_pavgusb(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwin
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pavgusb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pavgusb (%rsi), %mm0
; X64-NEXT: pavgusb (%rdx), %mm0
@@ -242,7 +242,7 @@ declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx)
define void @commute_m_pmulhrw(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
; X32-LABEL: commute_m_pmulhrw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -253,7 +253,7 @@ define void @commute_m_pmulhrw(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwin
; X32-NEXT: retl
;
; X64-LABEL: commute_m_pmulhrw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: pmulhrw (%rsi), %mm0
; X64-NEXT: pmulhrw (%rdx), %mm0
diff --git a/test/CodeGen/X86/commute-blend-avx2.ll b/test/CodeGen/X86/commute-blend-avx2.ll
index c39aa0b12b32..b3c8cefab5b2 100644
--- a/test/CodeGen/X86/commute-blend-avx2.ll
+++ b/test/CodeGen/X86/commute-blend-avx2.ll
@@ -3,7 +3,7 @@
define <8 x i16> @commute_fold_vpblendw_128(<8 x i16> %a, <8 x i16>* %b) #0 {
; CHECK-LABEL: commute_fold_vpblendw_128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
; CHECK-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %b
@@ -14,7 +14,7 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind rea
define <16 x i16> @commute_fold_vpblendw_256(<16 x i16> %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: commute_fold_vpblendw_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5,6,7],ymm0[8],mem[9,10,11],ymm0[12],mem[13,14,15]
; CHECK-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %b
@@ -25,8 +25,8 @@ declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i8) nounwind r
define <4 x i32> @commute_fold_vpblendd_128(<4 x i32> %a, <4 x i32>* %b) #0 {
; CHECK-LABEL: commute_fold_vpblendd_128:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; CHECK-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %b
%2 = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %1, <4 x i32> %a, i8 1)
@@ -36,8 +36,8 @@ declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i8) nounwind
define <8 x i32> @commute_fold_vpblendd_256(<8 x i32> %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: commute_fold_vpblendd_256:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6],ymm0[7]
; CHECK-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %b
%2 = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %1, <8 x i32> %a, i8 129)
@@ -47,7 +47,7 @@ declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i8) nounwind
define <4 x float> @commute_fold_vblendps_128(<4 x float> %a, <4 x float>* %b) #0 {
; CHECK-LABEL: commute_fold_vblendps_128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b
@@ -58,7 +58,7 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
define <8 x float> @commute_fold_vblendps_256(<8 x float> %a, <8 x float>* %b) #0 {
; CHECK-LABEL: commute_fold_vblendps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],mem[3,4,5,6,7]
; CHECK-NEXT: retq
%1 = load <8 x float>, <8 x float>* %b
@@ -69,7 +69,7 @@ declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i8) nou
define <2 x double> @commute_fold_vblendpd_128(<2 x double> %a, <2 x double>* %b) #0 {
; CHECK-LABEL: commute_fold_vblendpd_128:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double>* %b
@@ -80,7 +80,7 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nou
define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, <4 x double>* %b) #0 {
; CHECK-LABEL: commute_fold_vblendpd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],mem[3]
; CHECK-NEXT: retq
%1 = load <4 x double>, <4 x double>* %b
diff --git a/test/CodeGen/X86/commute-blend-sse41.ll b/test/CodeGen/X86/commute-blend-sse41.ll
index 14a685b179a5..d296c10fdaeb 100644
--- a/test/CodeGen/X86/commute-blend-sse41.ll
+++ b/test/CodeGen/X86/commute-blend-sse41.ll
@@ -3,7 +3,7 @@
define <8 x i16> @commute_fold_pblendw(<8 x i16> %a, <8 x i16>* %b) #0 {
; CHECK-LABEL: commute_fold_pblendw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
; CHECK-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %b
@@ -14,7 +14,7 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind rea
define <4 x float> @commute_fold_blendps(<4 x float> %a, <4 x float>* %b) #0 {
; CHECK-LABEL: commute_fold_blendps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3]
; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b
@@ -25,7 +25,7 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwi
define <2 x double> @commute_fold_blendpd(<2 x double> %a, <2 x double>* %b) #0 {
; CHECK-LABEL: commute_fold_blendpd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double>* %b
diff --git a/test/CodeGen/X86/commute-clmul.ll b/test/CodeGen/X86/commute-clmul.ll
index 84d9a914c9bb..e8c61befc399 100644
--- a/test/CodeGen/X86/commute-clmul.ll
+++ b/test/CodeGen/X86/commute-clmul.ll
@@ -1,17 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+pclmul | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+pclmul | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+sse2,+pclmul | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+avx2,+pclmul | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+avx512vl,+vpclmulqdq | FileCheck %s --check-prefix=AVX
declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
define <2 x i64> @commute_lq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
; SSE-LABEL: commute_lq_lq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $0, (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_lq_lq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a0
@@ -21,12 +22,12 @@ define <2 x i64> @commute_lq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
define <2 x i64> @commute_lq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
; SSE-LABEL: commute_lq_hq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $1, (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_lq_hq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a0
@@ -36,12 +37,12 @@ define <2 x i64> @commute_lq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
define <2 x i64> @commute_hq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
; SSE-LABEL: commute_hq_lq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $16, (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_hq_lq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $16, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a0
@@ -51,12 +52,12 @@ define <2 x i64> @commute_hq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
define <2 x i64> @commute_hq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
; SSE-LABEL: commute_hq_hq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pclmulqdq $17, (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_hq_hq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpclmulqdq $17, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a0
diff --git a/test/CodeGen/X86/commute-fcmp.ll b/test/CodeGen/X86/commute-fcmp.ll
index f05fb805b411..f5b70304d701 100644
--- a/test/CodeGen/X86/commute-fcmp.ll
+++ b/test/CodeGen/X86/commute-fcmp.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 -disable-peephole | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl -disable-peephole | FileCheck %s --check-prefix=AVX512
;
; Float Comparisons
@@ -9,14 +10,21 @@
define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_eq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpeqps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_eq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_eq:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpeqps (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp oeq <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -25,14 +33,21 @@ define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_ne:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpneqps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_ne:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpneqps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_ne:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpneqps (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp une <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -41,14 +56,21 @@ define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_ord:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpordps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_ord:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpordps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_ord:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpordps (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ord <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -57,14 +79,21 @@ define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_uno:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpunordps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_uno:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpunordps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_uno:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpunordps (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp uno <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -73,7 +102,7 @@ define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_ueq(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_ueq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: cmpeqps %xmm0, %xmm2
@@ -82,12 +111,18 @@ define <4 x i32> @commute_cmpps_ueq(<4 x float>* %a0, <4 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_ueq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vcmpeqps %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vorps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vcmpeq_uqps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_ueq:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %xmm1
+; AVX512-NEXT: vcmpeq_uqps %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ueq <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -96,7 +131,7 @@ define <4 x i32> @commute_cmpps_ueq(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_one(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_one:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: cmpneqps %xmm0, %xmm2
@@ -105,12 +140,18 @@ define <4 x i32> @commute_cmpps_one(<4 x float>* %a0, <4 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_one:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vcmpneqps %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpordps %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vcmpneq_oqps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_one:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %xmm1
+; AVX512-NEXT: vcmpneq_oqps %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp one <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -119,17 +160,25 @@ define <4 x i32> @commute_cmpps_one(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_lt:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm1
; SSE-NEXT: cmpltps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_lt:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm1
; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_lt:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %xmm1
+; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp olt <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -138,17 +187,25 @@ define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) {
define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) {
; SSE-LABEL: commute_cmpps_le:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm1
; SSE-NEXT: cmpleps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_le:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm1
; AVX-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_le:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %xmm1
+; AVX512-NEXT: vcmpleps %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %a0
%2 = fcmp ole <4 x float> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i32>
@@ -157,15 +214,22 @@ define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) {
define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_eq_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpeqps (%rdi), %xmm0
; SSE-NEXT: cmpeqps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_eq_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpeqps (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_eq_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpeqps (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp oeq <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -174,15 +238,22 @@ define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_ne_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpneqps (%rdi), %xmm0
; SSE-NEXT: cmpneqps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_ne_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpneqps (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_ne_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpneqps (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp une <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -191,15 +262,22 @@ define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_ord_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpordps (%rdi), %xmm0
; SSE-NEXT: cmpordps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_ord_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpordps (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_ord_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpordps (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ord <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -208,15 +286,22 @@ define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_uno_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpunordps (%rdi), %xmm0
; SSE-NEXT: cmpunordps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_uno_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpunordps (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_uno_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpunordps (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp uno <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -225,7 +310,7 @@ define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_ueq_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_ueq_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm2
; SSE-NEXT: movaps 16(%rdi), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm4
@@ -239,12 +324,18 @@ define <8 x i32> @commute_cmpps_ueq_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_ueq_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm1
-; AVX-NEXT: vcmpeqps %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpunordps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vcmpeq_uqps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_ueq_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %ymm1
+; AVX512-NEXT: vcmpeq_uqps %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ueq <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -253,7 +344,7 @@ define <8 x i32> @commute_cmpps_ueq_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_one_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_one_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm2
; SSE-NEXT: movaps 16(%rdi), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm4
@@ -267,12 +358,18 @@ define <8 x i32> @commute_cmpps_one_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_one_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm1
-; AVX-NEXT: vcmpneqps %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpordps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vcmpneq_oqps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_one_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %ymm1
+; AVX512-NEXT: vcmpneq_oqps %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp one <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -281,7 +378,7 @@ define <8 x i32> @commute_cmpps_one_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_lt_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm2
; SSE-NEXT: movaps 16(%rdi), %xmm3
; SSE-NEXT: cmpltps %xmm0, %xmm2
@@ -291,10 +388,18 @@ define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_lt_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_lt_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %ymm1
+; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp olt <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -303,7 +408,7 @@ define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-LABEL: commute_cmpps_le_ymm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm2
; SSE-NEXT: movaps 16(%rdi), %xmm3
; SSE-NEXT: cmpleps %xmm0, %xmm2
@@ -313,10 +418,18 @@ define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmpps_le_ymm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vcmpleps %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmpps_le_ymm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %ymm1
+; AVX512-NEXT: vcmpleps %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %a0
%2 = fcmp ole <8 x float> %1, %a1
%3 = sext <8 x i1> %2 to <8 x i32>
@@ -330,14 +443,21 @@ define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) {
define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_eq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpeqpd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_eq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_eq:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpeqpd (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp oeq <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -346,14 +466,21 @@ define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_ne:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpneqpd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_ne:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpneqpd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_ne:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpneqpd (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp une <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -362,14 +489,21 @@ define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_ord:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpordpd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_ord:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpordpd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_ord:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpordpd (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ord <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -378,7 +512,7 @@ define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_ueq(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_ueq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: cmpeqpd %xmm0, %xmm2
@@ -387,12 +521,18 @@ define <2 x i64> @commute_cmppd_ueq(<2 x double>* %a0, <2 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_ueq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %xmm1
-; AVX-NEXT: vcmpeqpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vorpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vcmpeq_uqpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_ueq:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %xmm1
+; AVX512-NEXT: vcmpeq_uqpd %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ueq <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -401,7 +541,7 @@ define <2 x i64> @commute_cmppd_ueq(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_one(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_one:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm1
; SSE-NEXT: movapd %xmm1, %xmm2
; SSE-NEXT: cmpneqpd %xmm0, %xmm2
@@ -410,12 +550,18 @@ define <2 x i64> @commute_cmppd_one(<2 x double>* %a0, <2 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_one:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %xmm1
-; AVX-NEXT: vcmpneqpd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpordpd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vcmpneq_oqpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_one:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %xmm1
+; AVX512-NEXT: vcmpneq_oqpd %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp one <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -424,14 +570,21 @@ define <2 x i64> @commute_cmppd_one(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_uno:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpunordpd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_uno:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpunordpd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_uno:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpunordpd (%rdi), %xmm0, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp uno <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -440,17 +593,25 @@ define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_lt:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm1
; SSE-NEXT: cmpltpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_lt:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %xmm1
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_lt:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %xmm1
+; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp olt <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -459,17 +620,25 @@ define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) {
define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) {
; SSE-LABEL: commute_cmppd_le:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm1
; SSE-NEXT: cmplepd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_le:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %xmm1
; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_le:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %xmm1
+; AVX512-NEXT: vcmplepd %xmm0, %xmm1, %k1
+; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %a0
%2 = fcmp ole <2 x double> %1, %a1
%3 = sext <2 x i1> %2 to <2 x i64>
@@ -478,15 +647,22 @@ define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) {
define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_eq_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpeqpd (%rdi), %xmm0
; SSE-NEXT: cmpeqpd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_eq_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpeqpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_eq_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpeqpd (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp oeq <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -495,15 +671,22 @@ define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_ne_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpneqpd (%rdi), %xmm0
; SSE-NEXT: cmpneqpd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_ne_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpneqpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_ne_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpneqpd (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp une <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -512,15 +695,22 @@ define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_ord_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpordpd (%rdi), %xmm0
; SSE-NEXT: cmpordpd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_ord_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpordpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_ord_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpordpd (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ord <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -529,15 +719,22 @@ define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_uno_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpunordpd (%rdi), %xmm0
; SSE-NEXT: cmpunordpd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_uno_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpunordpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_uno_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcmpunordpd (%rdi), %ymm0, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp uno <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -546,7 +743,7 @@ define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_ueq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_ueq_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm2
; SSE-NEXT: movapd 16(%rdi), %xmm3
; SSE-NEXT: movapd %xmm2, %xmm4
@@ -560,12 +757,18 @@ define <4 x i64> @commute_cmppd_ueq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_ueq_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %ymm1
-; AVX-NEXT: vcmpeqpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vorpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vcmpeq_uqpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_ueq_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %ymm1
+; AVX512-NEXT: vcmpeq_uqpd %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ueq <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -574,7 +777,7 @@ define <4 x i64> @commute_cmppd_ueq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_one_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_one_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm2
; SSE-NEXT: movapd 16(%rdi), %xmm3
; SSE-NEXT: movapd %xmm2, %xmm4
@@ -588,12 +791,18 @@ define <4 x i64> @commute_cmppd_one_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_one_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %ymm1
-; AVX-NEXT: vcmpneqpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpordpd %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vandpd %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vcmpneq_oqpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_one_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %ymm1
+; AVX512-NEXT: vcmpneq_oqpd %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp one <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -602,7 +811,7 @@ define <4 x i64> @commute_cmppd_one_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_lt_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm2
; SSE-NEXT: movapd 16(%rdi), %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
@@ -612,10 +821,18 @@ define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_lt_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %ymm1
; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_lt_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %ymm1
+; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp olt <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
@@ -624,7 +841,7 @@ define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) {
define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-LABEL: commute_cmppd_le_ymmm:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm2
; SSE-NEXT: movapd 16(%rdi), %xmm3
; SSE-NEXT: cmplepd %xmm0, %xmm2
@@ -634,10 +851,18 @@ define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: commute_cmppd_le_ymmm:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %ymm1
; AVX-NEXT: vcmplepd %ymm0, %ymm1, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: commute_cmppd_le_ymmm:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd (%rdi), %ymm1
+; AVX512-NEXT: vcmplepd %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %a0
%2 = fcmp ole <4 x double> %1, %a1
%3 = sext <4 x i1> %2 to <4 x i64>
diff --git a/test/CodeGen/X86/commute-vpclmulqdq-avx.ll b/test/CodeGen/X86/commute-vpclmulqdq-avx.ll
new file mode 100644
index 000000000000..0d9ea5450a08
--- /dev/null
+++ b/test/CodeGen/X86/commute-vpclmulqdq-avx.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+vpclmulqdq | FileCheck %s
+; FIXME: actual vpclmulqdq operation should be eliminated
+
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <4 x i64> @commute_v1(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_v1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $0, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 0)
+ %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 0)
+ %3 = xor <4 x i64> %1, %2
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_v2(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $16, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 16)
+ %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 1)
+ %3 = xor <4 x i64> %2, %1
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_v3(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_v3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $17, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 17)
+ %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 17)
+ %3 = xor <4 x i64> %2, %1
+ ret <4 x i64> %3
+}
+
diff --git a/test/CodeGen/X86/commute-vpclmulqdq-avx512.ll b/test/CodeGen/X86/commute-vpclmulqdq-avx512.ll
new file mode 100644
index 000000000000..400f27baca61
--- /dev/null
+++ b/test/CodeGen/X86/commute-vpclmulqdq-avx512.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+vpclmulqdq,+avx512vl | FileCheck %s
+; FIXME: actual vpclmulqdq operation should be eliminated
+
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
+declare <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+declare <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <2 x i64> @commute_xmm_v1(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: commute_xmm_v1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a1, <2 x i64> %a0, i8 0)
+ %3 = xor <2 x i64> %1, %2
+ ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_xmm_v2(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: commute_xmm_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $16, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 16)
+ %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a1, <2 x i64> %a0, i8 1)
+ %3 = xor <2 x i64> %2, %1
+ ret <2 x i64> %3
+}
+
+define <2 x i64> @commute_xmm_v3(<2 x i64> %a0, <2 x i64> %a1) {
+; CHECK-LABEL: commute_xmm_v3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %1 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 17)
+ %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a1, <2 x i64> %a0, i8 17)
+ %3 = xor <2 x i64> %2, %1
+ ret <2 x i64> %3
+}
+
+define <4 x i64> @commute_ymm_v1(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_ymm_v1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $0, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 0)
+ %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 0)
+ %3 = xor <4 x i64> %1, %2
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_ymm_v2(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_ymm_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $16, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 16)
+ %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 1)
+ %3 = xor <4 x i64> %2, %1
+ ret <4 x i64> %3
+}
+
+define <4 x i64> @commute_ymm_v3(<4 x i64> %a0, <4 x i64> %a1) {
+; CHECK-LABEL: commute_ymm_v3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $17, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %1 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a0, <4 x i64> %a1, i8 17)
+ %2 = call <4 x i64> @llvm.x86.pclmulqdq.256(<4 x i64> %a1, <4 x i64> %a0, i8 17)
+ %3 = xor <4 x i64> %2, %1
+ ret <4 x i64> %3
+}
+
+define <8 x i64> @commute_zmm_v1(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: commute_zmm_v1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $0, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpxorq %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 0)
+ %2 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a1, <8 x i64> %a0, i8 0)
+ %3 = xor <8 x i64> %1, %2
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @commute_zmm_v2(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: commute_zmm_v2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $16, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpxorq %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 16)
+ %2 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a1, <8 x i64> %a0, i8 1)
+ %3 = xor <8 x i64> %2, %1
+ ret <8 x i64> %3
+}
+
+define <8 x i64> @commute_zmm_v3(<8 x i64> %a0, <8 x i64> %a1) {
+; CHECK-LABEL: commute_zmm_v3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpclmulqdq $17, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vpxorq %zmm0, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %1 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 17)
+ %2 = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a1, <8 x i64> %a0, i8 17)
+ %3 = xor <8 x i64> %2, %1
+ ret <8 x i64> %3
+}
+
diff --git a/test/CodeGen/X86/commute-xop.ll b/test/CodeGen/X86/commute-xop.ll
index 4043155ba8d4..3dfb24db7fbf 100644
--- a/test/CodeGen/X86/commute-xop.ll
+++ b/test/CodeGen/X86/commute-xop.ll
@@ -4,13 +4,13 @@
define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
; X32-LABEL: commute_fold_vpcomb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomgtb (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomgtb (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a0
@@ -21,13 +21,13 @@ declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readno
define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
; X32-LABEL: commute_fold_vpcomd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomged (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomged (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -38,13 +38,13 @@ declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readno
define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
; X32-LABEL: commute_fold_vpcomq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomltq (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a0
@@ -55,13 +55,13 @@ declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readno
define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
; X32-LABEL: commute_fold_vpcomub:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomleub (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomub:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomleub (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a0
@@ -72,13 +72,13 @@ declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readn
define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
; X32-LABEL: commute_fold_vpcomud:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomequd (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomud:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomequd (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -89,13 +89,13 @@ declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readn
define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
; X32-LABEL: commute_fold_vpcomuq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomnequq (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomuq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomnequq (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %a0
@@ -106,13 +106,13 @@ declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readn
define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
; X32-LABEL: commute_fold_vpcomuw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomfalseuw (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomuw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomfalseuw (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -123,13 +123,13 @@ declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readn
define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
; X32-LABEL: commute_fold_vpcomw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcomtruew (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpcomw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomtruew (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -140,13 +140,13 @@ declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readno
define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
; X32-LABEL: commute_fold_vpmacsdd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacsdd %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacsdd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -157,13 +157,13 @@ declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwi
define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X32-LABEL: commute_fold_vpmacsdqh:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacsdqh %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacsdqh:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -174,13 +174,13 @@ declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounw
define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X32-LABEL: commute_fold_vpmacsdql:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacsdql %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacsdql:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -191,13 +191,13 @@ declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounw
define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
; X32-LABEL: commute_fold_vpmacssdd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacssdd %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacssdd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -208,13 +208,13 @@ declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounw
define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X32-LABEL: commute_fold_vpmacssdqh:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacssdqh %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacssdqh:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -225,13 +225,13 @@ declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) noun
define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
; X32-LABEL: commute_fold_vpmacssdql:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacssdql %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacssdql:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a0
@@ -242,13 +242,13 @@ declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) noun
define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X32-LABEL: commute_fold_vpmacsswd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacsswd %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacsswd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -259,13 +259,13 @@ declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounw
define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
; X32-LABEL: commute_fold_vpmacssww:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacssww %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacssww:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -276,13 +276,13 @@ declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounw
define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X32-LABEL: commute_fold_vpmacswd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacswd %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacswd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -293,13 +293,13 @@ declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwi
define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
; X32-LABEL: commute_fold_vpmacsww:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmacsww %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmacsww:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -310,13 +310,13 @@ declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwi
define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X32-LABEL: commute_fold_vpmadcsswd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmadcsswd %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmadcsswd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -327,13 +327,13 @@ declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) noun
define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
; X32-LABEL: commute_fold_vpmadcswd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpmadcswd %xmm1, (%eax), %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: commute_fold_vpmadcswd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
diff --git a/test/CodeGen/X86/compare-add.ll b/test/CodeGen/X86/compare-add.ll
index 358ee59c95a5..51e47ea4a71c 100644
--- a/test/CodeGen/X86/compare-add.ll
+++ b/test/CodeGen/X86/compare-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep add
+; RUN: llc < %s -mtriple=i686-- | not grep add
define i1 @X(i32 %X) {
%Y = add i32 %X, 14 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/compare-inf.ll b/test/CodeGen/X86/compare-inf.ll
index 5eb0135277d3..5beec4d76e22 100644
--- a/test/CodeGen/X86/compare-inf.ll
+++ b/test/CodeGen/X86/compare-inf.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; Convert oeq and une to ole/oge/ule/uge when comparing with infinity
; and negative infinity, because those are more efficient on x86.
diff --git a/test/CodeGen/X86/compare_folding.ll b/test/CodeGen/X86/compare_folding.ll
index 84c152d77215..0f85d644a3e6 100644
--- a/test/CodeGen/X86/compare_folding.ll
+++ b/test/CodeGen/X86/compare_folding.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | \
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | \
; RUN: grep movsd | count 1
-; RUN: llc < %s -march=x86 -mcpu=yonah | \
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | \
; RUN: grep ucomisd
declare i1 @llvm.isunordered.f64(double, double)
diff --git a/test/CodeGen/X86/complex-fastmath.ll b/test/CodeGen/X86/complex-fastmath.ll
index d31707260a0a..9c02ac6667f6 100644
--- a/test/CodeGen/X86/complex-fastmath.ll
+++ b/test/CodeGen/X86/complex-fastmath.ll
@@ -11,7 +11,7 @@
define <2 x float> @complex_square_f32(<2 x float>) #0 {
; SSE-LABEL: complex_square_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: addss %xmm2, %xmm2
@@ -23,7 +23,7 @@ define <2 x float> @complex_square_f32(<2 x float>) #0 {
; SSE-NEXT: retq
;
; AVX1-LABEL: complex_square_f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm2
@@ -34,7 +34,7 @@ define <2 x float> @complex_square_f32(<2 x float>) #0 {
; AVX1-NEXT: retq
;
; FMA-LABEL: complex_square_f32:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; FMA-NEXT: vaddss %xmm0, %xmm0, %xmm2
; FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
@@ -56,7 +56,7 @@ define <2 x float> @complex_square_f32(<2 x float>) #0 {
define <2 x double> @complex_square_f64(<2 x double>) #0 {
; SSE-LABEL: complex_square_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm0, %xmm2
@@ -69,7 +69,7 @@ define <2 x double> @complex_square_f64(<2 x double>) #0 {
; SSE-NEXT: retq
;
; AVX1-LABEL: complex_square_f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vmulsd %xmm2, %xmm1, %xmm2
@@ -80,7 +80,7 @@ define <2 x double> @complex_square_f64(<2 x double>) #0 {
; AVX1-NEXT: retq
;
; FMA-LABEL: complex_square_f64:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; FMA-NEXT: vaddsd %xmm0, %xmm0, %xmm2
; FMA-NEXT: vmulsd %xmm2, %xmm1, %xmm2
@@ -106,7 +106,7 @@ define <2 x double> @complex_square_f64(<2 x double>) #0 {
define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
; SSE-LABEL: complex_mul_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE-NEXT: movaps %xmm3, %xmm4
@@ -120,7 +120,7 @@ define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
; SSE-NEXT: retq
;
; AVX1-LABEL: complex_mul_f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT: vmulss %xmm0, %xmm3, %xmm4
@@ -133,7 +133,7 @@ define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
; AVX1-NEXT: retq
;
; FMA-LABEL: complex_mul_f32:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; FMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; FMA-NEXT: vmulss %xmm2, %xmm1, %xmm4
@@ -159,7 +159,7 @@ define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
; SSE-LABEL: complex_mul_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: movaps %xmm1, %xmm3
@@ -175,7 +175,7 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
; SSE-NEXT: retq
;
; AVX1-LABEL: complex_mul_f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; AVX1-NEXT: vmulsd %xmm0, %xmm3, %xmm4
@@ -188,7 +188,7 @@ define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
; AVX1-NEXT: retq
;
; FMA-LABEL: complex_mul_f64:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; FMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
; FMA-NEXT: vmulsd %xmm2, %xmm1, %xmm4
diff --git a/test/CodeGen/X86/complex-fca.ll b/test/CodeGen/X86/complex-fca.ll
index 78b27b7dc3f5..d1da121213d4 100644
--- a/test/CodeGen/X86/complex-fca.ll
+++ b/test/CodeGen/X86/complex-fca.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 } %z) nounwind {
entry:
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index f62e18869a98..14b41094109c 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -9,14 +9,14 @@ target triple = "x86_64-unknown-linux-gnu"
define <16 x float> @test1(float* %base) {
; SKX-LABEL: test1:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
; SKX-NEXT: retq
;
; KNL-LABEL: test1:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
@@ -27,14 +27,14 @@ define <16 x float> @test1(float* %base) {
define <16 x float> @test2(float* %base, <16 x float> %src0) {
; SKX-LABEL: test2:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movw $30719, %ax # imm = 0x77FF
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test2:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: movw $30719, %ax # imm = 0x77FF
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
@@ -45,14 +45,14 @@ define <16 x float> @test2(float* %base, <16 x float> %src0) {
define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
; SKX-LABEL: test3:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test3:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -64,19 +64,19 @@ define <8 x double> @test3(double* %base, <8 x double> %src0, <8 x i1> %mask) {
define <4 x float> @test4(float* %base, <4 x float> %src0) {
; SKX-LABEL: test4:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vexpandps (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test4:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: movw $7, %ax
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; KNL-NEXT: retq
%res = call <4 x float> @llvm.masked.expandload.v4f32(float* %base, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x float> %src0)
ret <4 x float>%res
@@ -84,19 +84,19 @@ define <4 x float> @test4(float* %base, <4 x float> %src0) {
define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
; SKX-LABEL: test5:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movb $2, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vpexpandq (%rdi), %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test5:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: movb $2, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq (%rdi), %zmm0 {%k1}
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; KNL-NEXT: retq
%res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
ret <2 x i64>%res
@@ -109,7 +109,7 @@ declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
define void @test6(float* %base, <16 x float> %V) {
; SKX-LABEL: test6:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movw $-2049, %ax # imm = 0xF7FF
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vcompressps %zmm0, (%rdi) {%k1}
@@ -117,7 +117,7 @@ define void @test6(float* %base, <16 x float> %V) {
; SKX-NEXT: retq
;
; KNL-LABEL: test6:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: movw $-2049, %ax # imm = 0xF7FF
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
@@ -128,7 +128,7 @@ define void @test6(float* %base, <16 x float> %V) {
define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; SKX-LABEL: test7:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompressps %ymm0, (%rdi) {%k1}
@@ -136,13 +136,11 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
; SKX-NEXT: retq
;
; KNL-LABEL: test7:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; KNL-NEXT: kshiftlw $8, %k0, %k0
-; KNL-NEXT: kshiftrw $8, %k0, %k1
+; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; KNL-NEXT: retq
call void @llvm.masked.compressstore.v8f32(<8 x float> %V, float* %base, <8 x i1> %mask)
@@ -151,7 +149,7 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
; SKX-LABEL: test8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
@@ -159,7 +157,7 @@ define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
; SKX-NEXT: retq
;
; KNL-LABEL: test8:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -171,7 +169,7 @@ define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
; SKX-LABEL: test9:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $15, %xmm1, %xmm1
; SKX-NEXT: vpmovw2m %xmm1, %k1
; SKX-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
@@ -179,7 +177,7 @@ define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
; SKX-NEXT: retq
;
; KNL-LABEL: test9:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxwq %xmm1, %zmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -191,7 +189,7 @@ define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
; SKX-LABEL: test10:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
@@ -199,13 +197,12 @@ define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
; SKX-NEXT: retq
;
; KNL-LABEL: test10:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL-NEXT: vpslld $31, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL-NEXT: vmovdqa %ymm1, %ymm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
@@ -216,19 +213,18 @@ define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
; SKX-LABEL: test11:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test11:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpsllq $63, %xmm1, %xmm1
; KNL-NEXT: vpsraq $63, %zmm1, %zmm1
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; KNL-NEXT: vmovdqa %xmm1, %xmm1
; KNL-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
@@ -239,19 +235,18 @@ define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
; SKX-LABEL: test12:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test12:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpslld $31, %xmm1, %xmm1
; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; KNL-NEXT: vmovdqa %xmm1, %xmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
@@ -262,7 +257,7 @@ define void @test12(float* %base, <4 x float> %V, <4 x i1> %mask) {
define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger) {
; SKX-LABEL: test13:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
@@ -270,18 +265,17 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
; SKX-NEXT: retq
;
; KNL-LABEL: test13:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; KNL-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; KNL-NEXT: vmovaps %xmm1, %xmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vexpandps (%rdi), %zmm0 {%k1}
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; KNL-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0)
@@ -290,7 +284,7 @@ define <2 x float> @test13(float* %base, <2 x float> %src0, <2 x i32> %trigger)
define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; SKX-LABEL: test14:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1
@@ -298,14 +292,13 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
; SKX-NEXT: retq
;
; KNL-LABEL: test14:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL: # %bb.0:
+; KNL-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; KNL-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; KNL-NEXT: vmovaps %xmm1, %xmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: vcompressps %zmm0, (%rdi) {%k1}
@@ -317,8 +310,8 @@ define void @test14(float* %base, <2 x float> %V, <2 x i32> %trigger) {
define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigger) {
; ALL-LABEL: test15:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; ALL: # %bb.0:
+; ALL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; ALL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
; ALL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
; ALL-NEXT: kmovw %k2, %eax
@@ -333,9 +326,9 @@ define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigge
define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %trigger) {
; SKX-LABEL: test16:
-; SKX: # BB#0:
-; SKX-NEXT: vextracti32x8 $1, %zmm2, %ymm3
-; SKX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; SKX: # %bb.0:
+; SKX-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2
; SKX-NEXT: kmovb %k2, %eax
@@ -345,8 +338,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
; SKX-NEXT: retq
;
; KNL-LABEL: test16:
-; KNL: # BB#0:
-; KNL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; KNL: # %bb.0:
+; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm4
; KNL-NEXT: vpcmpeqd %zmm3, %zmm4, %k1
; KNL-NEXT: vpcmpeqd %zmm3, %zmm2, %k2
@@ -363,8 +356,8 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
; SKX-LABEL: test17:
-; SKX: # BB#0:
-; SKX-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
; SKX-NEXT: kmovw %k2, %eax
@@ -375,8 +368,8 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
; SKX-NEXT: retq
;
; KNL-LABEL: test17:
-; KNL: # BB#0:
-; KNL-NEXT: vpxord %zmm4, %zmm4, %zmm4
+; KNL: # %bb.0:
+; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; KNL-NEXT: vpcmpeqd %zmm4, %zmm3, %k1
; KNL-NEXT: vpcmpeqd %zmm4, %zmm2, %k2
; KNL-NEXT: kmovw %k2, %eax
@@ -391,7 +384,7 @@ define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
define void @test18(double* %base, <16 x double> %V, <16 x i1> %mask) {
; SKX-LABEL: test18:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
@@ -403,7 +396,7 @@ define void @test18(double* %base, <16 x double> %V, <16 x i1> %mask) {
; SKX-NEXT: retq
;
; KNL-LABEL: test18:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vpslld $31, %zmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k1
diff --git a/test/CodeGen/X86/computeKnownBits_urem.ll b/test/CodeGen/X86/computeKnownBits_urem.ll
index f09370dc2fbf..4701ee5e0850 100644
--- a/test/CodeGen/X86/computeKnownBits_urem.ll
+++ b/test/CodeGen/X86/computeKnownBits_urem.ll
@@ -4,7 +4,7 @@
define i32 @main() nounwind {
; X86-LABEL: main:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: movl $1, (%esp)
; X86-NEXT: movl $1, %eax
@@ -12,7 +12,7 @@ define i32 @main() nounwind {
; X86-NEXT: retl
;
; X64-LABEL: main:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl $1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movl $1, %eax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/conditional-indecrement.ll b/test/CodeGen/X86/conditional-indecrement.ll
index f9e18f626972..6a681445bf89 100644
--- a/test/CodeGen/X86/conditional-indecrement.ll
+++ b/test/CodeGen/X86/conditional-indecrement.ll
@@ -3,7 +3,7 @@
define i32 @test1(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: sbbl $-1, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -16,7 +16,7 @@ define i32 @test1(i32 %a, i32 %b) nounwind readnone {
define i32 @test1_commute(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test1_commute:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: sbbl $-1, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -29,7 +29,7 @@ define i32 @test1_commute(i32 %a, i32 %b) nounwind readnone {
define i32 @test2(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: adcl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -42,7 +42,7 @@ define i32 @test2(i32 %a, i32 %b) nounwind readnone {
define i32 @test3(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: adcl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -55,7 +55,7 @@ define i32 @test3(i32 %a, i32 %b) nounwind readnone {
define i32 @test4(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: sbbl $-1, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -68,7 +68,7 @@ define i32 @test4(i32 %a, i32 %b) nounwind readnone {
define i32 @test5(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: adcl $-1, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -81,7 +81,7 @@ define i32 @test5(i32 %a, i32 %b) nounwind readnone {
define i32 @test6(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: sbbl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -94,7 +94,7 @@ define i32 @test6(i32 %a, i32 %b) nounwind readnone {
define i32 @test7(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: sbbl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -107,7 +107,7 @@ define i32 @test7(i32 %a, i32 %b) nounwind readnone {
define i32 @test8(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: adcl $-1, %esi
; CHECK-NEXT: movl %esi, %eax
diff --git a/test/CodeGen/X86/conditional-tailcall-samedest.mir b/test/CodeGen/X86/conditional-tailcall-samedest.mir
index c18a98be53f3..f975e6b65d46 100644
--- a/test/CodeGen/X86/conditional-tailcall-samedest.mir
+++ b/test/CodeGen/X86/conditional-tailcall-samedest.mir
@@ -8,7 +8,7 @@
# CHECK: body: |
# CHECK: bb.0.entry:
-# CHECK: successors: %bb.1.sw.bb(0x40000000)
+# CHECK: successors: %bb.1(0x40000000)
# CHECK: liveins: %edi
# CHECK: CMP32ri8 killed %edi, 2, implicit-def %eflags
# CHECK: TCRETURNdi64cc @mergeable_conditional_tailcall
@@ -101,27 +101,27 @@ stack:
constants:
body: |
bb.0.entry:
- successors: %bb.2.sw.bb(0x40000000), %bb.1.entry(0x40000000)
+ successors: %bb.2(0x40000000), %bb.1(0x40000000)
liveins: %edi
CMP32ri8 killed %edi, 2, implicit-def %eflags
- JB_1 %bb.2.sw.bb, implicit %eflags
- JMP_1 %bb.1.entry
+ JB_1 %bb.2, implicit %eflags
+ JMP_1 %bb.1
bb.1.entry:
- successors: %bb.4.sw.bb2(0x40000000), %bb.5.sw.epilog(0x40000000)
+ successors: %bb.4(0x40000000), %bb.5(0x40000000)
liveins: %eflags
- JE_1 %bb.4.sw.bb2, implicit killed %eflags
- JMP_1 %bb.5.sw.epilog
+ JE_1 %bb.4, implicit killed %eflags
+ JMP_1 %bb.5
bb.2.sw.bb:
- successors: %bb.3.init.check.i(0x00000800), %bb.6.return(0x7ffff800)
+ successors: %bb.3(0x00000800), %bb.6(0x7ffff800)
- %al = ACQUIRE_MOV8rm %rip, 1, _, @static_local_guard, _ :: (volatile load acquire 1 from `i8* bitcast (i64* @static_local_guard to i8*)`, align 8)
+ %al = ACQUIRE_MOV8rm %rip, 1, %noreg, @static_local_guard, %noreg :: (volatile load acquire 1 from `i8* bitcast (i64* @static_local_guard to i8*)`, align 8)
TEST8rr killed %al, %al, implicit-def %eflags
- JNE_1 %bb.6.return, implicit killed %eflags
- JMP_1 %bb.3.init.check.i
+ JNE_1 %bb.6, implicit killed %eflags
+ JMP_1 %bb.3
bb.3.init.check.i:
dead %edi = MOV32ri64 @static_local_guard, implicit-def %rdi
diff --git a/test/CodeGen/X86/constant-combines.ll b/test/CodeGen/X86/constant-combines.ll
index 4f55814958f4..85741685beb8 100644
--- a/test/CodeGen/X86/constant-combines.ll
+++ b/test/CodeGen/X86/constant-combines.ll
@@ -14,13 +14,12 @@ define void @PR22524({ float, float }* %arg) {
; being useful.
;
; CHECK-LABEL: PR22524:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $0, 4(%rdi)
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
; CHECK-NEXT: movss %xmm1, 4(%rdi)
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/constant-hoisting-and.ll b/test/CodeGen/X86/constant-hoisting-and.ll
index 611445f4a249..416a216d8fef 100644
--- a/test/CodeGen/X86/constant-hoisting-and.ll
+++ b/test/CodeGen/X86/constant-hoisting-and.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march=x86-64 |FileCheck %s
+; RUN: llc < %s -O3 -mtriple=x86_64-- |FileCheck %s
define i64 @foo(i1 %z, i64 %data1, i64 %data2)
{
; If constant 4294967294 is hoisted to a variable, then we won't be able to use
diff --git a/test/CodeGen/X86/constant-hoisting-cmp.ll b/test/CodeGen/X86/constant-hoisting-cmp.ll
index 4e9e49487287..b90080003dd0 100644
--- a/test/CodeGen/X86/constant-hoisting-cmp.ll
+++ b/test/CodeGen/X86/constant-hoisting-cmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march=x86-64 |FileCheck %s
+; RUN: llc < %s -O3 -mtriple=x86_64-- |FileCheck %s
define i64 @foo(i64 %data1, i64 %data2, i64 %data3)
{
; If constant 4294967295 is hoisted to a variable, then we won't be able to
diff --git a/test/CodeGen/X86/constant-hoisting-shift-immediate.ll b/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
index 65c26f818a6a..e406de2af4ac 100644
--- a/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
+++ b/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march=x86-64 |FileCheck %s
+; RUN: llc < %s -O3 -mtriple=x86_64-- |FileCheck %s
define i64 @foo(i1 %z, i192* %p, i192* %q)
{
; If const 128 is hoisted to a variable, then in basic block L_val2 we would
diff --git a/test/CodeGen/X86/constant-pool-remat-0.ll b/test/CodeGen/X86/constant-pool-remat-0.ll
index e42a87c6acde..5722dcc93dbe 100644
--- a/test/CodeGen/X86/constant-pool-remat-0.ll
+++ b/test/CodeGen/X86/constant-pool-remat-0.ll
@@ -10,7 +10,7 @@
; RUN: llc < %s -mtriple=x86_64-linux -o /dev/null -stats -info-output-file - | FileCheck %s -check-prefix=X64stat
; X64stat: 6 asm-printer
-; RUN: llc < %s -march=x86 -mattr=+sse2 -o /dev/null -stats -info-output-file - | FileCheck %s -check-prefix=X32stat
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -o /dev/null -stats -info-output-file - | FileCheck %s -check-prefix=X32stat
; X32stat: 12 asm-printer
declare float @qux(float %y)
diff --git a/test/CodeGen/X86/constpool.ll b/test/CodeGen/X86/constpool.ll
index 2aac486323a8..5f81c75d4fa4 100644
--- a/test/CodeGen/X86/constpool.ll
+++ b/test/CodeGen/X86/constpool.ll
@@ -1,11 +1,10 @@
-; RUN: llc < %s
-; RUN: llc < %s -fast-isel
-; RUN: llc < %s -march=x86-64
-; RUN: llc < %s -fast-isel -march=x86-64
+; RUN: llc < %s -mtriple=i386-apple-darwin9.7
+; RUN: llc < %s -mtriple=i386-apple-darwin9.7 -fast-isel
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9.7
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9.7 -fast-isel
; PR4466
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin9.7"
define i32 @main() nounwind {
entry:
diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll
index be12c016cce5..d4518f19b7e6 100644
--- a/test/CodeGen/X86/constructor.ll
+++ b/test/CodeGen/X86/constructor.ll
@@ -7,7 +7,8 @@
; RUN: llc -mtriple x86_64-unknown-nacl < %s | FileCheck --check-prefix=NACL %s
; RUN: llc -mtriple i586-intel-elfiamcu -use-ctors < %s | FileCheck %s --check-prefix=MCU-CTORS
; RUN: llc -mtriple i586-intel-elfiamcu < %s | FileCheck %s --check-prefix=MCU-INIT-ARRAY
-@llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }]
+; RUN: llc -mtriple x86_64-win32-gnu < %s | FileCheck --check-prefix=COFF-CTOR %s
+@llvm.global_ctors = appending global [3 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }, { i32, void ()*, i8* } { i32 55555, void ()* @h, i8* @v }]
@v = weak_odr global i8 0
@@ -21,9 +22,17 @@ entry:
ret void
}
+define void @h() {
+entry:
+ ret void
+}
+
; CTOR: .section .ctors.65520,"aGw",@progbits,v,comdat
; CTOR-NEXT: .p2align 3
; CTOR-NEXT: .quad g
+; CTOR-NEXT: .section .ctors.09980,"aGw",@progbits,v,comdat
+; CTOR-NEXT: .p2align 3
+; CTOR-NEXT: .quad h
; CTOR-NEXT: .section .ctors,"aw",@progbits
; CTOR-NEXT: .p2align 3
; CTOR-NEXT: .quad f
@@ -31,6 +40,9 @@ entry:
; INIT-ARRAY: .section .init_array.15,"aGw",@init_array,v,comdat
; INIT-ARRAY-NEXT: .p2align 3
; INIT-ARRAY-NEXT: .quad g
+; INIT-ARRAY-NEXT: .section .init_array.55555,"aGw",@init_array,v,comdat
+; INIT-ARRAY-NEXT: .p2align 3
+; INIT-ARRAY-NEXT: .quad h
; INIT-ARRAY-NEXT: .section .init_array,"aw",@init_array
; INIT-ARRAY-NEXT: .p2align 3
; INIT-ARRAY-NEXT: .quad f
@@ -38,9 +50,22 @@ entry:
; NACL: .section .init_array.15,"aGw",@init_array,v,comdat
; NACL-NEXT: .p2align 2
; NACL-NEXT: .long g
+; NACL-NEXT: .section .init_array.55555,"aGw",@init_array,v,comdat
+; NACL-NEXT: .p2align 2
+; NACL-NEXT: .long h
; NACL-NEXT: .section .init_array,"aw",@init_array
; NACL-NEXT: .p2align 2
; NACL-NEXT: .long f
; MCU-CTORS: .section .ctors,"aw",@progbits
; MCU-INIT-ARRAY: .section .init_array,"aw",@init_array
+
+; COFF-CTOR: .section .ctors.65520,"dw",associative,v
+; COFF-CTOR-NEXT: .p2align 3
+; COFF-CTOR-NEXT: .quad g
+; COFF-CTOR-NEXT: .section .ctors.09980,"dw",associative,v
+; COFF-CTOR-NEXT: .p2align 3
+; COFF-CTOR-NEXT: .quad h
+; COFF-CTOR-NEXT: .section .ctors,"dw"
+; COFF-CTOR-NEXT: .p2align 3
+; COFF-CTOR-NEXT: .quad f
diff --git a/test/CodeGen/X86/copysign-constant-magnitude.ll b/test/CodeGen/X86/copysign-constant-magnitude.ll
index 8af045914cf9..61cb6d0960d1 100644
--- a/test/CodeGen/X86/copysign-constant-magnitude.ll
+++ b/test/CodeGen/X86/copysign-constant-magnitude.ll
@@ -11,7 +11,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define double @mag_pos0_double(double %x) nounwind {
; CHECK-LABEL: mag_pos0_double:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andps [[SIGNMASK1]](%rip), %xmm0
; CHECK-NEXT: retq
;
@@ -24,7 +24,7 @@ define double @mag_pos0_double(double %x) nounwind {
define double @mag_neg0_double(double %x) nounwind {
; CHECK-LABEL: mag_neg0_double:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movsd [[SIGNMASK2]](%rip), %xmm1
; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
; CHECK-NEXT: andps %xmm1, %xmm0
@@ -42,7 +42,7 @@ define double @mag_neg0_double(double %x) nounwind {
define double @mag_pos1_double(double %x) nounwind {
; CHECK-LABEL: mag_pos1_double:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andps [[SIGNMASK3]](%rip), %xmm0
; CHECK-NEXT: movsd [[ONE3]](%rip), %xmm1
; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
@@ -62,7 +62,7 @@ define double @mag_pos1_double(double %x) nounwind {
define double @mag_neg1_double(double %x) nounwind {
; CHECK-LABEL: mag_neg1_double:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andps [[SIGNMASK4]](%rip), %xmm0
; CHECK-NEXT: orps [[ONE4]](%rip), %xmm0
; CHECK-NEXT: retq
@@ -77,7 +77,7 @@ define double @mag_neg1_double(double %x) nounwind {
define float @mag_pos0_float(float %x) nounwind {
; CHECK-LABEL: mag_pos0_float:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andps [[SIGNMASK5]](%rip), %xmm0
; CHECK-NEXT: retq
;
@@ -90,7 +90,7 @@ define float @mag_pos0_float(float %x) nounwind {
define float @mag_neg0_float(float %x) nounwind {
; CHECK-LABEL: mag_neg0_float:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movss [[SIGNMASK6]](%rip), %xmm1
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; CHECK-NEXT: andps %xmm1, %xmm0
@@ -108,7 +108,7 @@ define float @mag_neg0_float(float %x) nounwind {
define float @mag_pos1_float(float %x) nounwind {
; CHECK-LABEL: mag_pos1_float:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andps [[SIGNMASK7]](%rip), %xmm0
; CHECK-NEXT: movss [[ONE7]](%rip), %xmm1
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
@@ -130,7 +130,7 @@ define float @mag_pos1_float(float %x) nounwind {
define float @mag_neg1_float(float %x) nounwind {
; CHECK-LABEL: mag_neg1_float:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: andps [[SIGNMASK8]](%rip), %xmm0
; CHECK-NEXT: orps [[ONE8]](%rip), %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/cpus.ll b/test/CodeGen/X86/cpus.ll
index 7901858cb5dc..04bd74ae5c3e 100644
--- a/test/CodeGen/X86/cpus.ll
+++ b/test/CodeGen/X86/cpus.ll
@@ -6,19 +6,69 @@
;
; Now ensure the error message doesn't occur for valid CPUs.
; CHECK-NO-ERROR-NOT: not a recognized processor for this target
+
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+
;
+; Intel Targets
+;
+
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=i386 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=i486 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=i586 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium-mmx 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=i686 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentiumpro 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium3m 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium-m 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=pentium4m 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=yonah 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=prescott 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=nocona 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=core2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=penryn 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=nehalem 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=corei7 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=westmere 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=sandybridge 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=ivybridge 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=haswell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=broadwell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=skylake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=skylake-avx512 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=skx 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=cannonlake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=icelake 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=atom 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=bonnell 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=silvermont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=slm 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=goldmont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=lakemont 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=knl 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=knm 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+
+;
+; AMD Targets
+;
+
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=k6 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=k6-2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=k6-3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=athlon 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=athlon-tbird 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=athlon-4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=athlon-xp 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=athlon-mp 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=k8 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=opteron 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=athlon64 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
@@ -36,3 +86,12 @@
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=btver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver1 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+;
+; Other Targets
+;
+
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=geode 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=winchip-c6 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=winchip2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=c3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
+; RUN: llc < %s -o /dev/null -mtriple=i686-unknown-unknown -mcpu=c3-2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 4bdb2ddfab62..537a09b1c607 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc -march=x86 -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
-; RUN: llc -march=x86-64 -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
+; RUN: llc -mtriple=i686-- -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
+; RUN: llc -mtriple=x86_64-- -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
; PR6497
@@ -481,10 +481,10 @@ declare void @fn3(...)
; Check coalescing of IMPLICIT_DEF instructions:
;
-; %vreg1 = IMPLICIT_DEF
-; %vreg2 = MOV32r0
+; %1 = IMPLICIT_DEF
+; %2 = MOV32r0
;
-; When coalescing %vreg1 and %vreg2, the IMPLICIT_DEF instruction should be
+; When coalescing %1 and %2, the IMPLICIT_DEF instruction should be
; erased along with its value number.
;
define void @rdar12474033() nounwind ssp {
diff --git a/test/CodeGen/X86/critical-edge-split-2.ll b/test/CodeGen/X86/critical-edge-split-2.ll
index d5878bd1a748..4ebfddf03161 100644
--- a/test/CodeGen/X86/critical-edge-split-2.ll
+++ b/test/CodeGen/X86/critical-edge-split-2.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
%0 = type <{ %1, %1 }>
%1 = type { i8, i8, i8, i8 }
@@ -8,22 +7,34 @@ target triple = "x86_64-apple-darwin10.0.0"
@g_2 = global %0 zeroinitializer
@g_4 = global %1 zeroinitializer, align 4
-
; PR8642
define i16 @test1(i1 zeroext %C, i8** nocapture %argv) nounwind ssp {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movw $1, %ax
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jne .LBB0_2
+; CHECK-NEXT: # %bb.1: # %cond.false.i
+; CHECK-NEXT: movl $g_4, %eax
+; CHECK-NEXT: movl $g_2+4, %ecx
+; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: cmpq %rax, %rcx
+; CHECK-NEXT: sete %sil
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: .LBB0_2: # %cond.end.i
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
entry:
br i1 %C, label %cond.end.i, label %cond.false.i
-cond.false.i: ; preds = %entry
+cond.false.i:
br label %cond.end.i
-cond.end.i: ; preds = %entry
+cond.end.i:
%call1 = phi i16 [ trunc (i32 srem (i32 1, i32 zext (i1 icmp eq (%1* bitcast (i8* getelementptr inbounds (%0, %0* @g_2, i64 0, i32 1, i32 0) to %1*), %1* @g_4) to i32)) to i16), %cond.false.i ], [ 1, %entry ]
ret i16 %call1
}
-; CHECK-LABEL: test1:
-; CHECK: testb %dil, %dil
-; CHECK: jne LBB0_2
-; CHECK: divl
-; CHECK: LBB0_2:
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index bbfc2ead04c6..40dc6c464879 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -6,7 +6,7 @@ declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define i32 @test1(i64 %x) nounwind readnone {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: leaq -1(%rdi), %rcx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testq %rcx, %rdi
@@ -22,7 +22,7 @@ define i32 @test1(i64 %x) nounwind readnone {
define i32 @test2(i64 %x) nounwind readnone {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: leaq -1(%rdi), %rcx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testq %rcx, %rdi
@@ -36,7 +36,7 @@ define i32 @test2(i64 %x) nounwind readnone {
define i32 @test3(i64 %x) nounwind readnone {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: popcntq %rdi, %rcx
; CHECK-NEXT: andb $63, %cl
; CHECK-NEXT: xorl %eax, %eax
@@ -52,10 +52,10 @@ define i32 @test3(i64 %x) nounwind readnone {
define i8 @test4(i8 %x) nounwind readnone {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $127, %edi
-; CHECK-NEXT: popcntw %di, %ax
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; CHECK-NEXT: popcntl %edi, %eax
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
%x2 = and i8 %x, 127
%count = tail call i8 @llvm.ctpop.i8(i8 %x2)
diff --git a/test/CodeGen/X86/cvt16.ll b/test/CodeGen/X86/cvt16.ll
index 5ee399fc137a..3679ebb6f9af 100644
--- a/test/CodeGen/X86/cvt16.ll
+++ b/test/CodeGen/X86/cvt16.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=LIBCALL
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=F16C
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c,+soft-float | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c,+soft-float | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c | FileCheck %s -check-prefix=CHECK -check-prefix=LIBCALL
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c | FileCheck %s -check-prefix=CHECK -check-prefix=F16C
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=-f16c,+soft-float | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+f16c,+soft-float | FileCheck %s -check-prefix=CHECK -check-prefix=SOFTFLOAT
; This is a test for float to half float conversions on x86-64.
;
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
index 297692f6bd61..556c858759fc 100644
--- a/test/CodeGen/X86/cvtv2f32.ll
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -7,7 +7,7 @@
define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
; X32-LABEL: uitofp_2i32_buildvector:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
; X32-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
@@ -19,7 +19,7 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
; X32-NEXT: retl
;
; X64-LABEL: uitofp_2i32_buildvector:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: pinsrd $1, %esi, %xmm1
; X64-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
@@ -40,7 +40,7 @@ define <2 x float> @uitofp_2i32_buildvector(i32 %x, i32 %y, <2 x float> %v) {
define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
; X32-LABEL: uitofp_2i32_legalized:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pxor %xmm2, %xmm2
; X32-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X32-NEXT: movdqa {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15]
@@ -51,7 +51,7 @@ define <2 x float> @uitofp_2i32_legalized(<2 x i32> %in, <2 x float> %v) {
; X32-NEXT: retl
;
; X64-LABEL: uitofp_2i32_legalized:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; X64-NEXT: movdqa {{.*#+}} xmm0 = [4.503600e+15,4.503600e+15]
diff --git a/test/CodeGen/X86/dag-fmf-cse.ll b/test/CodeGen/X86/dag-fmf-cse.ll
index c12c49d0f40b..021459eb4bde 100644
--- a/test/CodeGen/X86/dag-fmf-cse.ll
+++ b/test/CodeGen/X86/dag-fmf-cse.ll
@@ -8,7 +8,7 @@
define float @fmf_should_not_break_cse(float %a, float %b) {
; CHECK-LABEL: fmf_should_not_break_cse:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/dag-merge-fast-accesses.ll b/test/CodeGen/X86/dag-merge-fast-accesses.ll
index e5dfccb278ce..662f74f97548 100644
--- a/test/CodeGen/X86/dag-merge-fast-accesses.ll
+++ b/test/CodeGen/X86/dag-merge-fast-accesses.ll
@@ -7,13 +7,13 @@
define void @merge_const_vec_store(i64* %ptr) {
; FAST-LABEL: merge_const_vec_store:
-; FAST: # BB#0:
+; FAST: # %bb.0:
; FAST-NEXT: xorps %xmm0, %xmm0
; FAST-NEXT: movups %xmm0, (%rdi)
; FAST-NEXT: retq
;
; SLOW-LABEL: merge_const_vec_store:
-; SLOW: # BB#0:
+; SLOW: # %bb.0:
; SLOW-NEXT: movq $0, (%rdi)
; SLOW-NEXT: movq $0, 8(%rdi)
; SLOW-NEXT: retq
@@ -29,12 +29,12 @@ define void @merge_const_vec_store(i64* %ptr) {
define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {
; FAST-LABEL: merge_vec_element_store:
-; FAST: # BB#0:
+; FAST: # %bb.0:
; FAST-NEXT: movups %xmm0, (%rdi)
; FAST-NEXT: retq
;
; SLOW-LABEL: merge_vec_element_store:
-; SLOW: # BB#0:
+; SLOW: # %bb.0:
; SLOW-NEXT: movlpd %xmm0, (%rdi)
; SLOW-NEXT: movhpd %xmm0, 8(%rdi)
; SLOW-NEXT: retq
@@ -53,13 +53,13 @@ define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {
define void @merge_vec_load_and_stores(i64 *%ptr) {
; FAST-LABEL: merge_vec_load_and_stores:
-; FAST: # BB#0:
+; FAST: # %bb.0:
; FAST-NEXT: movups (%rdi), %xmm0
; FAST-NEXT: movups %xmm0, 40(%rdi)
; FAST-NEXT: retq
;
; SLOW-LABEL: merge_vec_load_and_stores:
-; SLOW: # BB#0:
+; SLOW: # %bb.0:
; SLOW-NEXT: movq (%rdi), %rax
; SLOW-NEXT: movq 8(%rdi), %rcx
; SLOW-NEXT: movq %rax, 40(%rdi)
diff --git a/test/CodeGen/X86/dag-rauw-cse.ll b/test/CodeGen/X86/dag-rauw-cse.ll
index 12a2e626687b..5e6b6cf10e74 100644
--- a/test/CodeGen/X86/dag-rauw-cse.ll
+++ b/test/CodeGen/X86/dag-rauw-cse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; PR3018
define i32 @test(i32 %A) nounwind {
diff --git a/test/CodeGen/X86/dagcombine-buildvector.ll b/test/CodeGen/X86/dagcombine-buildvector.ll
index d60fb734e685..59f042f7f26e 100644
--- a/test/CodeGen/X86/dagcombine-buildvector.ll
+++ b/test/CodeGen/X86/dagcombine-buildvector.ll
@@ -6,10 +6,10 @@
define void @test(<2 x double>* %dst, <4 x double> %src) nounwind {
; CHECK-LABEL: test:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: movapd %xmm0, (%eax)
+; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: movaps %xmm0, (%eax)
; CHECK-NEXT: retl
entry:
%tmp7.i = shufflevector <4 x double> %src, <4 x double> undef, <2 x i32> <i32 0, i32 2>
@@ -19,7 +19,7 @@ entry:
define void @test2(<4 x i16>* %src, <4 x i32>* %dest) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index 726e30fce63b..544407e184a4 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -4,7 +4,7 @@
define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind {
; X32-LABEL: t:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx
@@ -18,9 +18,9 @@ define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) n
; X32-NEXT: retl
;
; X64-LABEL: t:
-; X64: ## BB#0: ## %entry
-; X64-NEXT: ## kill: %EDX<def> %EDX<kill> %RDX<def>
-; X64-NEXT: ## kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: ## %bb.0: ## %entry
+; X64-NEXT: ## kill: def %edx killed %edx def %rdx
+; X64-NEXT: ## kill: def %esi killed %esi def %rsi
; X64-NEXT: imull %ecx, %esi
; X64-NEXT: leal (%rsi,%rdx), %eax
; X64-NEXT: cltq
diff --git a/test/CodeGen/X86/darwin-preemption.ll b/test/CodeGen/X86/darwin-preemption.ll
new file mode 100644
index 000000000000..9df0389d0d26
--- /dev/null
+++ b/test/CodeGen/X86/darwin-preemption.ll
@@ -0,0 +1,251 @@
+; RUN: llc -mtriple x86_64-apple-darwin \
+; RUN: -relocation-model=static < %s | FileCheck %s
+; RUN: llc -mtriple x86_64-apple-darwin \
+; RUN: -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -mtriple x86_64-apple-darwin \
+; RUN: -relocation-model=dynamic-no-pic < %s | FileCheck %s
+
+; 32 bits
+
+; RUN: llc -mtriple i386-apple-darwin \
+; RUN: -relocation-model=static < %s | FileCheck --check-prefix=DARWIN32_S %s
+; RUN: llc -mtriple i386-apple-darwin \
+; RUN: -relocation-model=pic < %s | FileCheck --check-prefix=DARWIN32 %s
+; RUN: llc -mtriple i386-apple-darwin \
+; RUN: -relocation-model=dynamic-no-pic < %s | \
+; RUN: FileCheck --check-prefix=DARWIN32_DNP %s
+
+; globals
+
+@strong_default_global = global i32 42
+define i32* @get_strong_default_global() {
+ ret i32* @strong_default_global
+}
+; CHECK: leaq _strong_default_global(%rip), %rax
+; DARWIN32: leal _strong_default_global-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_default_global, %eax
+; DARWIN32_DNP: movl $_strong_default_global, %eax
+
+@weak_default_global = weak global i32 42
+define i32* @get_weak_default_global() {
+ ret i32* @weak_default_global
+}
+; CHECK: movq _weak_default_global@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_weak_default_global$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_default_global, %eax
+; DARWIN32_DNP: movl L_weak_default_global$non_lazy_ptr, %eax
+
+@external_default_global = external global i32
+define i32* @get_external_default_global() {
+ ret i32* @external_default_global
+}
+; CHECK: movq _external_default_global@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_external_default_global$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_external_default_global, %eax
+; DARWIN32_DNP: movl L_external_default_global$non_lazy_ptr, %eax
+
+@strong_local_global = dso_local global i32 42
+define i32* @get_strong_local_global() {
+ ret i32* @strong_local_global
+}
+; CHECK: leaq _strong_local_global(%rip), %rax
+; DARWIN32: leal _strong_local_global-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_local_global, %eax
+; DARWIN32_DNP: movl $_strong_local_global, %eax
+
+@weak_local_global = weak dso_local global i32 42
+define i32* @get_weak_local_global() {
+ ret i32* @weak_local_global
+}
+; CHECK: leaq _weak_local_global(%rip), %rax
+; DARWIN32: leal _weak_local_global-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_local_global, %eax
+; DARWIN32_DNP: movl $_weak_local_global, %eax
+
+@external_local_global = external dso_local global i32
+define i32* @get_external_local_global() {
+ ret i32* @external_local_global
+}
+; CHECK: leaq _external_local_global(%rip), %rax
+; DARWIN32: movl L_external_local_global$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_external_local_global, %eax
+; DARWIN32_DNP: movl $_external_local_global, %eax
+
+@strong_preemptable_global = dso_preemptable global i32 42
+define i32* @get_strong_preemptable_global() {
+ ret i32* @strong_preemptable_global
+}
+; CHECK: leaq _strong_preemptable_global(%rip), %rax
+; DARWIN32: leal _strong_preemptable_global-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_preemptable_global, %eax
+; DARWIN32_DNP: movl $_strong_preemptable_global, %eax
+
+@weak_preemptable_global = weak dso_preemptable global i32 42
+define i32* @get_weak_preemptable_global() {
+ ret i32* @weak_preemptable_global
+}
+; CHECK: movq _weak_preemptable_global@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_weak_preemptable_global$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_preemptable_global, %eax
+; DARWIN32_DNP: movl L_weak_preemptable_global$non_lazy_ptr, %eax
+
+@external_preemptable_global = external dso_preemptable global i32
+define i32* @get_external_preemptable_global() {
+ ret i32* @external_preemptable_global
+}
+; CHECK: movq _external_preemptable_global@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_external_preemptable_global$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_external_preemptable_global, %eax
+; DARWIN32_DNP: movl L_external_preemptable_global$non_lazy_ptr, %eax
+
+; aliases
+@aliasee = global i32 42
+
+@strong_default_alias = alias i32, i32* @aliasee
+define i32* @get_strong_default_alias() {
+ ret i32* @strong_default_alias
+}
+; CHECK: leaq _strong_default_alias(%rip), %rax
+; DARWIN32: leal _strong_default_alias-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_default_alias, %eax
+; DARWIN32_DNP: movl $_strong_default_alias, %eax
+
+@weak_default_alias = weak alias i32, i32* @aliasee
+define i32* @get_weak_default_alias() {
+ ret i32* @weak_default_alias
+}
+; CHECK: movq _weak_default_alias@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_weak_default_alias$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_default_alias, %eax
+; DARWIN32_DNP: movl L_weak_default_alias$non_lazy_ptr, %eax
+
+@strong_local_alias = dso_local alias i32, i32* @aliasee
+define i32* @get_strong_local_alias() {
+ ret i32* @strong_local_alias
+}
+; CHECK: leaq _strong_local_alias(%rip), %rax
+; DARWIN32: leal _strong_local_alias-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_local_alias, %eax
+; DARWIN32_DNP: movl $_strong_local_alias, %eax
+
+@weak_local_alias = weak dso_local alias i32, i32* @aliasee
+define i32* @get_weak_local_alias() {
+ ret i32* @weak_local_alias
+}
+; CHECK: leaq _weak_local_alias(%rip), %rax
+; DARWIN32: leal _weak_local_alias-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_local_alias, %eax
+; DARWIN32_DNP: movl $_weak_local_alias, %eax
+
+@strong_preemptable_alias = dso_preemptable alias i32, i32* @aliasee
+define i32* @get_strong_preemptable_alias() {
+ ret i32* @strong_preemptable_alias
+}
+; CHECK: leaq _strong_preemptable_alias(%rip), %rax
+; DARWIN32: leal _strong_preemptable_alias-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_preemptable_alias, %eax
+; DARWIN32_DNP: movl $_strong_preemptable_alias, %eax
+
+@weak_preemptable_alias = weak dso_preemptable alias i32, i32* @aliasee
+define i32* @get_weak_preemptable_alias() {
+ ret i32* @weak_preemptable_alias
+}
+; CHECK: movq _weak_preemptable_alias@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_weak_preemptable_alias$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_preemptable_alias, %eax
+; DARWIN32_DNP: movl L_weak_preemptable_alias$non_lazy_ptr, %eax
+
+; functions
+
+define void @strong_default_function() {
+ ret void
+}
+define void()* @get_strong_default_function() {
+ ret void()* @strong_default_function
+}
+; CHECK: leaq _strong_default_function(%rip), %rax
+; DARWIN32: leal _strong_default_function-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_default_function, %eax
+; DARWIN32_DNP: movl $_strong_default_function, %eax
+
+define weak void @weak_default_function() {
+ ret void
+}
+define void()* @get_weak_default_function() {
+ ret void()* @weak_default_function
+}
+; CHECK: movq _weak_default_function@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_weak_default_function$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_default_function, %eax
+; DARWIN32_DNP: movl L_weak_default_function$non_lazy_ptr, %eax
+
+declare void @external_default_function()
+define void()* @get_external_default_function() {
+ ret void()* @external_default_function
+}
+; CHECK: movq _external_default_function@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_external_default_function$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_external_default_function, %eax
+; DARWIN32_DNP: movl L_external_default_function$non_lazy_ptr, %eax
+
+define dso_local void @strong_local_function() {
+ ret void
+}
+define void()* @get_strong_local_function() {
+ ret void()* @strong_local_function
+}
+; CHECK: leaq _strong_local_function(%rip), %rax
+; DARWIN32: leal _strong_local_function-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_local_function, %eax
+; DARWIN32_DNP: movl $_strong_local_function, %eax
+
+define weak dso_local void @weak_local_function() {
+ ret void
+}
+define void()* @get_weak_local_function() {
+ ret void()* @weak_local_function
+}
+; CHECK: leaq _weak_local_function(%rip), %rax
+; DARWIN32: leal _weak_local_function-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_local_function, %eax
+; DARWIN32_DNP: movl $_weak_local_function, %eax
+
+declare dso_local void @external_local_function()
+define void()* @get_external_local_function() {
+ ret void()* @external_local_function
+}
+; CHECK: leaq _external_local_function(%rip), %rax
+; DARWIN32: movl L_external_local_function$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_external_local_function, %eax
+; DARWIN32_DNP: movl $_external_local_function, %eax
+
+define dso_preemptable void @strong_preemptable_function() {
+ ret void
+}
+define void()* @get_strong_preemptable_function() {
+ ret void()* @strong_preemptable_function
+}
+; CHECK: leaq _strong_preemptable_function(%rip), %rax
+; DARWIN32: leal _strong_preemptable_function-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_strong_preemptable_function, %eax
+; DARWIN32_DNP: movl $_strong_preemptable_function, %eax
+
+define weak dso_preemptable void @weak_preemptable_function() {
+ ret void
+}
+define void()* @get_weak_preemptable_function() {
+ ret void()* @weak_preemptable_function
+}
+; CHECK: movq _weak_preemptable_function@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_weak_preemptable_function$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_weak_preemptable_function, %eax
+; DARWIN32_DNP: movl L_weak_preemptable_function$non_lazy_ptr, %eax
+
+declare dso_preemptable void @external_preemptable_function()
+define void()* @get_external_preemptable_function() {
+ ret void()* @external_preemptable_function
+}
+; CHECK: movq _external_preemptable_function@GOTPCREL(%rip), %rax
+; DARWIN32: movl L_external_preemptable_function$non_lazy_ptr-L{{.*}}$pb(%eax), %eax
+; DARWIN32_S: movl $_external_preemptable_function, %eax
+; DARWIN32_DNP: movl L_external_preemptable_function$non_lazy_ptr, %eax
diff --git a/test/CodeGen/X86/dbg-baseptr.ll b/test/CodeGen/X86/dbg-baseptr.ll
index 893ca93a9944..436c7f42c594 100644
--- a/test/CodeGen/X86/dbg-baseptr.ll
+++ b/test/CodeGen/X86/dbg-baseptr.ll
@@ -1,5 +1,5 @@
; RUN: llc -o - %s | FileCheck %s
-; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF
+; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump -v - | FileCheck %s --check-prefix=DWARF
; This test checks that parameters on the stack pointer are correctly
; referenced by debug info.
target triple = "x86_64--"
@@ -22,12 +22,10 @@ define i32 @f0(%struct.s* byval align 8 %input) !dbg !8 {
; DWARF-LABEL: .debug_info contents:
; DWARF-LABEL: DW_TAG_subprogram
-; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 57 )
-; 0x57 -> RSP
+; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (DW_OP_reg7 RSP)
; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f0")
; DWARF: DW_TAG_formal_parameter
-; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 08 )
-; DW_OP_fbreg (0x91) 0x08
+; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (DW_OP_fbreg +8)
; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input")
@@ -48,12 +46,10 @@ define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 {
}
; DWARF-LABEL: DW_TAG_subprogram
-; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 )
-; 0x56 -> RBP
+; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (DW_OP_reg6 RBP)
; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f1")
; DWARF: DW_TAG_formal_parameter
-; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 )
-; DW_OP_fbreg (0x91) 0x10
+; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (DW_OP_fbreg +16)
; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input")
; CHECK-LABEL: f2:
@@ -75,12 +71,10 @@ define i32 @f2(%struct.s* byval align 8 %input) !dbg !22 {
; "input" should still be referred to through RBP.
; DWARF-LABEL: DW_TAG_subprogram
-; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 )
-; 0x56 -> RBP
+; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (DW_OP_reg6 RBP)
; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f2")
; DWARF: DW_TAG_formal_parameter
-; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 )
-; DW_OP_fbreg (0x91) 0x10
+; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (DW_OP_fbreg +16)
; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input")
declare void @llvm.dbg.declare(metadata, metadata, metadata)
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 7a19dd2a98d1..395a31959293 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mtriple=x86_64-linux < %s | FileCheck %s
-; RUN: opt -strip-debug < %s | llc -march=x86-64 -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -strip-debug < %s | llc -mtriple=x86_64-linux | FileCheck %s
; http://llvm.org/PR19051. Minor code-motion difference with -g.
; Presence of debug info shouldn't affect the codegen. Make sure that
; we generated the same code sequence with and without debug info.
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
index bee86b4617c7..3d01e07d79a1 100644
--- a/test/CodeGen/X86/dbg-changes-codegen.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; The Peephole optimizer should fold the load into the cmp even with debug info.
; CHECK-LABEL: _ZN3Foo3batEv
diff --git a/test/CodeGen/X86/dbg-line-0-no-discriminator.ll b/test/CodeGen/X86/dbg-line-0-no-discriminator.ll
new file mode 100644
index 000000000000..cc96c3affc84
--- /dev/null
+++ b/test/CodeGen/X86/dbg-line-0-no-discriminator.ll
@@ -0,0 +1,39 @@
+; RUN: llc -filetype=obj -use-unknown-locations=Enable -mtriple=x86_64-unknown-linux %s -o %t
+; RUN: llvm-dwarfdump -debug-line %t | FileCheck %s
+
+define void @_Z3bazv() !dbg !6 {
+ call void @_Z3foov(), !dbg !9
+ call void @_Z3foov() ; no !dbg, so will be marked as line 0
+ ret void, !dbg !11
+}
+
+declare void @_Z3foov()
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.9.0 (trunk 267219)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.cc", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 3.9.0 (trunk 267219)"}
+!6 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 3, type: !7, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null}
+!9 = !DILocation(line: 4, column: 3, scope: !10)
+!10 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 1)
+!11 = !DILocation(line: 6, column: 1, scope: !6)
+
+; Look at the lengths. We can't verify the line-number-program size
+; directly, but the difference in the two lengths should not change
+; unexpectedly.
+; CHECK: total_length: 0x00000044
+; CHECK: prologue_length: 0x0000001f
+;
+; Verify that we see a line entry with a discriminator, and the next entry
+; has line 0 and no discriminator.
+; line column file ISA discriminator
+; CHECK: 4 3 1 0 1
+; CHECK-NEXT: 0 3 1 0 0
diff --git a/test/CodeGen/X86/debug-nodebug-crash.ll b/test/CodeGen/X86/debug-nodebug-crash.ll
new file mode 100644
index 000000000000..a957626ac37d
--- /dev/null
+++ b/test/CodeGen/X86/debug-nodebug-crash.ll
@@ -0,0 +1,61 @@
+;RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+;
+; LexicalScope objects were not cleared when a nodebug function is handled in
+; LiveDebugValues. This may lead to an assertion in the constructor for LexicalScope,
+; triggered by LiveDebugValues when another (debug) function is handled later.
+;
+; This minimal example does not leave much to check for, so we just make sure we get
+; reasonable output, preserving function labels and a DBG_VALUE comment.
+;
+; CHECK-LABEL: foo:
+; CHECK-NEXT: Lfunc_begin0:
+; CHECK: Lfunc_end0:
+; CHECK-LABEL: bar:
+; CHECK-NEXT: Lfunc_begin1:
+; CHECK: #DEBUG_VALUE: foo:x <-
+; CHECK: Lfunc_end1:
+
+define i32 @foo() {
+entry:
+ ret i32 0
+}
+
+define i32 @bar(i32 %x) !dbg !50 {
+entry:
+ tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !41, metadata !43), !dbg !52
+ %tobool.i = icmp eq i32 %x, 0
+ br i1 %tobool.i, label %foo.exit, label %if.then.i
+
+if.then.i:
+ br label %foo.exit
+
+foo.exit:
+ %x.addr.0.i = phi i32 [ 1, %if.then.i ], [ 0, %entry ]
+ ret i32 %x.addr.0.i
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "foo.cpp", directory: "c:\temp")
+!2 = !{}
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!4 = !DIFile(filename: "bar.cpp", directory: "c:\temp")
+!5 = !{!"clang version 4.0.0"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!36 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !37, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !40)
+!37 = !DISubroutineType(types: !38)
+!38 = !{!39, !39}
+!39 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!40 = !{!41}
+!41 = !DILocalVariable(name: "x", arg: 1, scope: !36, file: !1, line: 1, type: !39)
+!43 = !DIExpression()
+!50 = distinct !DISubprogram(name: "bar", scope: !4, file: !4, line: 3, type: !51, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !3, variables: !2)
+!51 = !DISubroutineType(types: !2)
+!52 = !DILocation(line: 1, scope: !36, inlinedAt: !53)
+!53 = distinct !DILocation(line: 5, scope: !50)
diff --git a/test/CodeGen/X86/debugloc-no-line-0.ll b/test/CodeGen/X86/debugloc-no-line-0.ll
new file mode 100644
index 000000000000..27b72caf360c
--- /dev/null
+++ b/test/CodeGen/X86/debugloc-no-line-0.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -stop-before="regallocfast" -o - %s | FileCheck %s
+;
+; We check that all the instructions in bb4 now have a debug-location
+; annotation, and that the annotation is identical to the one on e.g.,
+; the jmp to bb4.
+;
+; CHECK: JMP{{.*}}%bb.4, debug-location ![[JUMPLOC:[0-9]+]]
+; CHECK: bb.4.entry:
+; CHECK: successors:
+; CHECK: JE{{.*}}debug-location ![[JUMPLOC]]
+; CHECK: JMP{{.*}}debug-location ![[JUMPLOC]]
+
+define i32 @main() !dbg !12 {
+entry:
+ %add = add nsw i32 undef, 1, !dbg !16
+ switch i32 %add, label %sw.epilog [
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb2
+ ], !dbg !17
+
+sw.bb: ; preds = %entry
+ br label %sw.epilog, !dbg !20
+
+sw.bb2: ; preds = %entry
+ br label %sw.epilog, !dbg !22
+
+sw.epilog: ; preds = %sw.bb2, %sw.bb, %entry
+ ret i32 4711, !dbg !23
+}
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!11}
+
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug, enums: !4)
+!3 = !DIFile(filename: "foo.c", directory: ".")
+!4 = !{}
+!7 = !DIBasicType(name: "int", size: 16, encoding: DW_ATE_signed)
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang"}
+!12 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 4, type: !13, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !2, variables: !4)
+!13 = !DISubroutineType(types: !14)
+!14 = !{!7}
+!16 = !DILocation(line: 6, column: 13, scope: !12)
+!17 = !DILocation(line: 6, column: 3, scope: !12)
+!19 = distinct !DILexicalBlock(scope: !12, file: !3, line: 7, column: 5)
+!20 = !DILocation(line: 10, column: 7, scope: !19)
+!22 = !DILocation(line: 13, column: 7, scope: !19)
+!23 = !DILocation(line: 24, column: 1, scope: !12)
diff --git a/test/CodeGen/X86/deopt-intrinsic-cconv.ll b/test/CodeGen/X86/deopt-intrinsic-cconv.ll
index 97bca1f69dbf..edf9d0e03448 100644
--- a/test/CodeGen/X86/deopt-intrinsic-cconv.ll
+++ b/test/CodeGen/X86/deopt-intrinsic-cconv.ll
@@ -12,7 +12,6 @@ define i64 @caller_1() {
; CHECK-NEXT: {{.+cfi.+}}
; CHECK-NEXT: ##{{.+}}
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: {{Lcfi[0-9]+}}:
; CHECK-NEXT: {{.+cfi.+}}
; CHECK-NEXT: movl $1140457472, (%rsp) ## imm = 0x43FA0000
; CHECK-NEXT: movl $42, %eax
diff --git a/test/CodeGen/X86/deopt-intrinsic.ll b/test/CodeGen/X86/deopt-intrinsic.ll
index 0e894516ffa3..b99482f0fb03 100644
--- a/test/CodeGen/X86/deopt-intrinsic.ll
+++ b/test/CodeGen/X86/deopt-intrinsic.ll
@@ -13,7 +13,6 @@ define i32 @caller_0() {
; CHECK-NEXT: {{.+cfi.+}}
; CHECK-NEXT: ##{{.+}}
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: {{Lcfi[0-9]+}}:
; CHECK-NEXT: {{.+cfi.+}}
; CHECK-NEXT: callq ___llvm_deoptimize
; CHECK-NEXT: {{Ltmp[0-9]+}}:
@@ -27,7 +26,6 @@ define i8 @caller_1() {
; CHECK-NEXT: {{.+cfi.+}}
; CHECK-NEXT: ##{{.+}}
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: {{Lcfi[0-9]+}}:
; CHECK-NEXT: {{.+cfi.+}}
; CHECK-NEXT: movss {{[a-zA-Z0-9_]+}}(%rip), %xmm0 ## xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movl $42, %edi
diff --git a/test/CodeGen/X86/disable-tail-calls.ll b/test/CodeGen/X86/disable-tail-calls.ll
index 80e8fd74e92d..16f838e8fdca 100644
--- a/test/CodeGen/X86/disable-tail-calls.ll
+++ b/test/CodeGen/X86/disable-tail-calls.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march x86-64 | FileCheck %s --check-prefix=NO-OPTION
-; RUN: llc < %s -march x86-64 -disable-tail-calls | FileCheck %s --check-prefix=DISABLE-TRUE
-; RUN: llc < %s -march x86-64 -disable-tail-calls=false | FileCheck %s --check-prefix=DISABLE-FALSE
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=NO-OPTION
+; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls | FileCheck %s --check-prefix=DISABLE-TRUE
+; RUN: llc < %s -mtriple=x86_64-- -disable-tail-calls=false | FileCheck %s --check-prefix=DISABLE-FALSE
; Check that command line option "-disable-tail-calls" overrides function
; attribute "disable-tail-calls".
diff --git a/test/CodeGen/X86/discontiguous-loops.ll b/test/CodeGen/X86/discontiguous-loops.ll
index 20db750d206b..01ba4dffe33b 100644
--- a/test/CodeGen/X86/discontiguous-loops.ll
+++ b/test/CodeGen/X86/discontiguous-loops.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-loop-info -verify-dom-info -march=x86-64 < %s
+; RUN: llc -verify-loop-info -verify-dom-info -mtriple=x86_64-- < %s
; PR5243
@.str96 = external constant [37 x i8], align 8 ; <[37 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/div-rem-simplify.ll b/test/CodeGen/X86/div-rem-simplify.ll
index 04cf439dc155..af43df007559 100644
--- a/test/CodeGen/X86/div-rem-simplify.ll
+++ b/test/CodeGen/X86/div-rem-simplify.ll
@@ -5,7 +5,7 @@
define i32 @srem0(i32 %x) {
; CHECK-LABEL: srem0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%rem = srem i32 %x, 0
ret i32 %rem
@@ -13,7 +13,7 @@ define i32 @srem0(i32 %x) {
define i32 @urem0(i32 %x) {
; CHECK-LABEL: urem0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%rem = urem i32 %x, 0
ret i32 %rem
@@ -21,7 +21,7 @@ define i32 @urem0(i32 %x) {
define i32 @sdiv0(i32 %x) {
; CHECK-LABEL: sdiv0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%div = sdiv i32 %x, 0
ret i32 %div
@@ -29,7 +29,7 @@ define i32 @sdiv0(i32 %x) {
define i32 @udiv0(i32 %x) {
; CHECK-LABEL: udiv0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%div = udiv i32 %x, 0
ret i32 %div
@@ -39,7 +39,7 @@ define i32 @udiv0(i32 %x) {
define <4 x i32> @srem_vec0(<4 x i32> %x) {
; CHECK-LABEL: srem_vec0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%rem = srem <4 x i32> %x, zeroinitializer
ret <4 x i32> %rem
@@ -47,7 +47,7 @@ define <4 x i32> @srem_vec0(<4 x i32> %x) {
define <4 x i32> @urem_vec0(<4 x i32> %x) {
; CHECK-LABEL: urem_vec0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%rem = urem <4 x i32> %x, zeroinitializer
ret <4 x i32> %rem
@@ -55,7 +55,7 @@ define <4 x i32> @urem_vec0(<4 x i32> %x) {
define <4 x i32> @sdiv_vec0(<4 x i32> %x) {
; CHECK-LABEL: sdiv_vec0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%div = sdiv <4 x i32> %x, zeroinitializer
ret <4 x i32> %div
@@ -63,7 +63,7 @@ define <4 x i32> @sdiv_vec0(<4 x i32> %x) {
define <4 x i32> @udiv_vec0(<4 x i32> %x) {
; CHECK-LABEL: udiv_vec0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%div = udiv <4 x i32> %x, zeroinitializer
ret <4 x i32> %div
@@ -74,7 +74,7 @@ define <4 x i32> @udiv_vec0(<4 x i32> %x) {
define i32 @sel_urem0(i1 %cond) {
; CHECK-LABEL: sel_urem0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 23, i32 234
%rem = urem i32 %sel, 0
@@ -83,7 +83,7 @@ define i32 @sel_urem0(i1 %cond) {
define i32 @sel_srem0(i1 %cond) {
; CHECK-LABEL: sel_srem0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 23, i32 234
%rem = srem i32 %sel, 0
@@ -92,7 +92,7 @@ define i32 @sel_srem0(i1 %cond) {
define i32 @sel_udiv0(i1 %cond) {
; CHECK-LABEL: sel_udiv0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 23, i32 234
%div = udiv i32 %sel, 0
@@ -101,7 +101,7 @@ define i32 @sel_udiv0(i1 %cond) {
define i32 @sel_sdiv0(i1 %cond) {
; CHECK-LABEL: sel_sdiv0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 23, i32 234
%div = sdiv i32 %sel, 0
@@ -113,7 +113,7 @@ define i32 @sel_sdiv0(i1 %cond) {
define <4 x i32> @sel_urem0_vec(i1 %cond) {
; CHECK-LABEL: sel_urem0_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
%rem = urem <4 x i32> %sel, zeroinitializer
@@ -122,7 +122,7 @@ define <4 x i32> @sel_urem0_vec(i1 %cond) {
define <4 x i32> @sel_srem0_vec(i1 %cond) {
; CHECK-LABEL: sel_srem0_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
%rem = srem <4 x i32> %sel, zeroinitializer
@@ -131,7 +131,7 @@ define <4 x i32> @sel_srem0_vec(i1 %cond) {
define <4 x i32> @sel_udiv0_vec(i1 %cond) {
; CHECK-LABEL: sel_udiv0_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
%div = udiv <4 x i32> %sel, zeroinitializer
@@ -140,7 +140,7 @@ define <4 x i32> @sel_udiv0_vec(i1 %cond) {
define <4 x i32> @sel_sdiv0_vec(i1 %cond) {
; CHECK-LABEL: sel_sdiv0_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
%div = sdiv <4 x i32> %sel, zeroinitializer
@@ -151,7 +151,7 @@ define <4 x i32> @sel_sdiv0_vec(i1 %cond) {
define <4 x i32> @sdiv0elt_vec(<4 x i32> %x) {
; CHECK-LABEL: sdiv0elt_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%zero = and <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
%some_ones = or <4 x i32> %zero, <i32 0, i32 -1, i32 0, i32 3>
@@ -161,7 +161,7 @@ define <4 x i32> @sdiv0elt_vec(<4 x i32> %x) {
define <4 x i32> @udiv0elt_vec(<4 x i32> %x) {
; CHECK-LABEL: udiv0elt_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%div = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, <i32 0, i32 3, i32 4, i32 0>
ret <4 x i32> %div
@@ -169,7 +169,7 @@ define <4 x i32> @udiv0elt_vec(<4 x i32> %x) {
define <4 x i32> @urem0elt_vec(<4 x i32> %x) {
; CHECK-LABEL: urem0elt_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%zero = and <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
%some_ones = or <4 x i32> %zero, <i32 0, i32 0, i32 0, i32 3>
@@ -179,7 +179,7 @@ define <4 x i32> @urem0elt_vec(<4 x i32> %x) {
define <4 x i32> @srem0elt_vec(<4 x i32> %x) {
; CHECK-LABEL: srem0elt_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%rem = srem <4 x i32> <i32 -11, i32 -12, i32 -13, i32 -14>, <i32 -3, i32 -3, i32 0, i32 2>
ret <4 x i32> %rem
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index ee53dd3233d3..f1322dd61454 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -4,18 +4,18 @@
define zeroext i16 @test1(i16 zeroext %x) nounwind {
; X32-LABEL: test1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: imull $63551, %eax, %eax # imm = 0xF83F
; X32-NEXT: shrl $21, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: imull $63551, %edi, %eax # imm = 0xF83F
; X64-NEXT: shrl $21, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
entry:
%div = udiv i16 %x, 33
@@ -24,18 +24,18 @@ entry:
define zeroext i16 @test2(i8 signext %x, i16 zeroext %c) nounwind readnone ssp noredzone {
; X32-LABEL: test2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: imull $43691, %eax, %eax # imm = 0xAAAB
; X32-NEXT: shrl $17, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: imull $43691, %esi, %eax # imm = 0xAAAB
; X64-NEXT: shrl $17, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
entry:
%div = udiv i16 %c, 3
@@ -45,20 +45,20 @@ entry:
define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) nounwind readnone ssp noredzone {
; X32-LABEL: test3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: imull $171, %eax, %eax
-; X32-NEXT: andl $65024, %eax # imm = 0xFE00
; X32-NEXT: shrl $9, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: imull $171, %esi, %eax
-; X64-NEXT: andl $65024, %eax # imm = 0xFE00
; X64-NEXT: shrl $9, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
entry:
%div = udiv i8 %c, 3
@@ -67,24 +67,24 @@ entry:
define signext i16 @test4(i16 signext %x) nounwind {
; X32-LABEL: test4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X32-NEXT: imull $1986, %eax, %eax # imm = 0x7C2
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $31, %ecx
; X32-NEXT: shrl $16, %eax
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: imull $1986, %edi, %eax # imm = 0x7C2
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $31, %ecx
; X64-NEXT: shrl $16, %eax
; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
entry:
%div = sdiv i16 %x, 33 ; <i32> [#uses=1]
@@ -93,7 +93,7 @@ entry:
define i32 @test5(i32 %A) nounwind {
; X32-LABEL: test5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $365384439, %eax # imm = 0x15C752F7
; X32-NEXT: mull {{[0-9]+}}(%esp)
; X32-NEXT: shrl $27, %edx
@@ -101,11 +101,11 @@ define i32 @test5(i32 %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: imulq $365384439, %rax, %rax # imm = 0x15C752F7
; X64-NEXT: shrq $59, %rax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-NEXT: # kill: def %eax killed %eax killed %rax
; X64-NEXT: retq
%tmp1 = udiv i32 %A, 1577682821 ; <i32> [#uses=1]
ret i32 %tmp1
@@ -113,24 +113,24 @@ define i32 @test5(i32 %A) nounwind {
define signext i16 @test6(i16 signext %x) nounwind {
; X32-LABEL: test6:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax
; X32-NEXT: imull $26215, %eax, %eax # imm = 0x6667
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl $31, %ecx
; X32-NEXT: sarl $18, %eax
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test6:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: imull $26215, %edi, %eax # imm = 0x6667
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $31, %ecx
; X64-NEXT: sarl $18, %eax
; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
entry:
%div = sdiv i16 %x, 10
@@ -139,7 +139,7 @@ entry:
define i32 @test7(i32 %x) nounwind {
; X32-LABEL: test7:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shrl $2, %eax
; X32-NEXT: movl $613566757, %ecx # imm = 0x24924925
@@ -148,12 +148,12 @@ define i32 @test7(i32 %x) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test7:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: shrl $2, %edi
; X64-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925
; X64-NEXT: shrq $32, %rax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-NEXT: # kill: def %eax killed %eax killed %rax
; X64-NEXT: retq
%div = udiv i32 %x, 28
ret i32 %div
@@ -162,24 +162,24 @@ define i32 @test7(i32 %x) nounwind {
; PR13326
define i8 @test8(i8 %x) nounwind {
; X32-LABEL: test8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: shrb %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: imull $211, %eax, %eax
-; X32-NEXT: andl $24576, %eax # imm = 0x6000
; X32-NEXT: shrl $13, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shrb %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $211, %eax, %eax
-; X64-NEXT: andl $24576, %eax # imm = 0x6000
; X64-NEXT: shrl $13, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
%div = udiv i8 %x, 78
ret i8 %div
@@ -187,24 +187,24 @@ define i8 @test8(i8 %x) nounwind {
define i8 @test9(i8 %x) nounwind {
; X32-LABEL: test9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: shrb $2, %al
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: imull $71, %eax, %eax
-; X32-NEXT: andl $6144, %eax # imm = 0x1800
; X32-NEXT: shrl $11, %eax
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: movzwl %ax, %eax
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shrb $2, %dil
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: imull $71, %eax, %eax
-; X64-NEXT: andl $6144, %eax # imm = 0x1800
; X64-NEXT: shrl $11, %eax
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
%div = udiv i8 %x, 116
ret i8 %div
@@ -212,7 +212,7 @@ define i8 @test9(i8 %x) nounwind {
define i32 @testsize1(i32 %x) minsize nounwind {
; X32-LABEL: testsize1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $32
; X32-NEXT: popl %ecx
@@ -221,7 +221,7 @@ define i32 @testsize1(i32 %x) minsize nounwind {
; X32-NEXT: retl
;
; X64-LABEL: testsize1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pushq $32
; X64-NEXT: popq %rcx
; X64-NEXT: movl %edi, %eax
@@ -235,7 +235,7 @@ entry:
define i32 @testsize2(i32 %x) minsize nounwind {
; X32-LABEL: testsize2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $33
; X32-NEXT: popl %ecx
@@ -244,7 +244,7 @@ define i32 @testsize2(i32 %x) minsize nounwind {
; X32-NEXT: retl
;
; X64-LABEL: testsize2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pushq $33
; X64-NEXT: popq %rcx
; X64-NEXT: movl %edi, %eax
@@ -258,13 +258,13 @@ entry:
define i32 @testsize3(i32 %x) minsize nounwind {
; X32-LABEL: testsize3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shrl $5, %eax
; X32-NEXT: retl
;
; X64-LABEL: testsize3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: shrl $5, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -275,7 +275,7 @@ entry:
define i32 @testsize4(i32 %x) minsize nounwind {
; X32-LABEL: testsize4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pushl $33
; X32-NEXT: popl %ecx
@@ -284,7 +284,7 @@ define i32 @testsize4(i32 %x) minsize nounwind {
; X32-NEXT: retl
;
; X64-LABEL: testsize4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pushq $33
; X64-NEXT: popq %rcx
; X64-NEXT: xorl %edx, %edx
@@ -298,7 +298,7 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X32-LABEL: PR23590:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: pushl $0
; X32-NEXT: pushl $12345 # imm = 0x3039
@@ -315,7 +315,7 @@ define i64 @PR23590(i64 %x) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: PR23590:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rcx
; X64-NEXT: movabsq $6120523590596543007, %rdx # imm = 0x54F077C718E7C21F
; X64-NEXT: movq %rcx, %rax
diff --git a/test/CodeGen/X86/divrem.ll b/test/CodeGen/X86/divrem.ll
index 73d16060be72..6648d34aa0ff 100644
--- a/test/CodeGen/X86/divrem.ll
+++ b/test/CodeGen/X86/divrem.ll
@@ -4,7 +4,7 @@
define void @si64(i64 %x, i64 %y, i64* %p, i64* %q) nounwind {
; X32-LABEL: si64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
@@ -38,7 +38,7 @@ define void @si64(i64 %x, i64 %y, i64* %p, i64* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: cqto
@@ -55,7 +55,7 @@ define void @si64(i64 %x, i64 %y, i64* %p, i64* %q) nounwind {
define void @si32(i32 %x, i32 %y, i32* %p, i32* %q) nounwind {
; X32-LABEL: si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -68,7 +68,7 @@ define void @si32(i32 %x, i32 %y, i32* %p, i32* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cltd
@@ -85,7 +85,7 @@ define void @si32(i32 %x, i32 %y, i32* %p, i32* %q) nounwind {
define void @si16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
; X32-LABEL: si16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -98,7 +98,7 @@ define void @si16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: si16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cwtd
@@ -115,7 +115,7 @@ define void @si16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
define void @si8(i8 %x, i8 %y, i8* %p, i8* %q) nounwind {
; X32-LABEL: si8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -129,7 +129,7 @@ define void @si8(i8 %x, i8 %y, i8* %p, i8* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: si8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cbtw
; X64-NEXT: idivb %sil
@@ -146,7 +146,7 @@ define void @si8(i8 %x, i8 %y, i8* %p, i8* %q) nounwind {
define void @ui64(i64 %x, i64 %y, i64* %p, i64* %q) nounwind {
; X32-LABEL: ui64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
@@ -180,7 +180,7 @@ define void @ui64(i64 %x, i64 %y, i64* %p, i64* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ui64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: movq %rdi, %rax
@@ -197,7 +197,7 @@ define void @ui64(i64 %x, i64 %y, i64* %p, i64* %q) nounwind {
define void @ui32(i32 %x, i32 %y, i32* %p, i32* %q) nounwind {
; X32-LABEL: ui32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -210,7 +210,7 @@ define void @ui32(i32 %x, i32 %y, i32* %p, i32* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ui32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: movl %edi, %eax
@@ -227,7 +227,7 @@ define void @ui32(i32 %x, i32 %y, i32* %p, i32* %q) nounwind {
define void @ui16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
; X32-LABEL: ui16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -240,7 +240,7 @@ define void @ui16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ui16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: movl %edi, %eax
@@ -257,12 +257,12 @@ define void @ui16(i16 %x, i16 %y, i16* %p, i16* %q) nounwind {
define void @ui8(i8 %x, i8 %y, i8* %p, i8* %q) nounwind {
; X32-LABEL: ui8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb {{[0-9]+}}(%esp)
; X32-NEXT: movzbl %ah, %ebx # NOREX
; X32-NEXT: movb %al, (%edx)
@@ -271,9 +271,9 @@ define void @ui8(i8 %x, i8 %y, i8* %p, i8* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ui8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %ah, %esi # NOREX
; X64-NEXT: movb %al, (%rdx)
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
index 7521156a370e..8b6590141e17 100644
--- a/test/CodeGen/X86/divrem8_ext.ll
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -4,9 +4,9 @@
define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
; X32-LABEL: test_udivrem_zext_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb {{[0-9]+}}(%esp)
; X32-NEXT: movzbl %ah, %ecx # NOREX
; X32-NEXT: movb %al, z
@@ -14,9 +14,9 @@ define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
; X32-NEXT: retl
;
; X64-LABEL: test_udivrem_zext_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %ah, %ecx # NOREX
; X64-NEXT: movb %al, {{.*}}(%rip)
@@ -30,21 +30,21 @@ define zeroext i8 @test_udivrem_zext_ah(i8 %x, i8 %y) {
define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
; X32-LABEL: test_urem_zext_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb {{[0-9]+}}(%esp)
; X32-NEXT: movzbl %ah, %eax # NOREX
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test_urem_zext_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %ah, %eax # NOREX
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
%1 = urem i8 %x, %y
ret i8 %1
@@ -52,24 +52,24 @@ define zeroext i8 @test_urem_zext_ah(i8 %x, i8 %y) {
define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
; X32-LABEL: test_urem_noext_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb %cl
; X32-NEXT: movzbl %ah, %eax # NOREX
; X32-NEXT: addb %cl, %al
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test_urem_noext_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %ah, %eax # NOREX
; X64-NEXT: addb %sil, %al
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
%1 = urem i8 %x, %y
%2 = add i8 %1, %y
@@ -78,21 +78,20 @@ define i8 @test_urem_noext_ah(i8 %x, i8 %y) {
define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
; X32-LABEL: test_urem_zext64_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb {{[0-9]+}}(%esp)
; X32-NEXT: movzbl %ah, %eax # NOREX
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: test_urem_zext64_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %ah, %eax # NOREX
-; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
%1 = urem i8 %x, %y
%2 = zext i8 %1 to i64
@@ -101,7 +100,7 @@ define i64 @test_urem_zext64_ah(i8 %x, i8 %y) {
define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
; X32-LABEL: test_sdivrem_sext_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: cbtw
; X32-NEXT: idivb {{[0-9]+}}(%esp)
@@ -111,7 +110,7 @@ define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
; X32-NEXT: retl
;
; X64-LABEL: test_sdivrem_sext_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cbtw
; X64-NEXT: idivb %sil
@@ -127,21 +126,21 @@ define signext i8 @test_sdivrem_sext_ah(i8 %x, i8 %y) {
define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
; X32-LABEL: test_srem_sext_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: cbtw
; X32-NEXT: idivb {{[0-9]+}}(%esp)
; X32-NEXT: movsbl %ah, %eax # NOREX
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test_srem_sext_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cbtw
; X64-NEXT: idivb %sil
; X64-NEXT: movsbl %ah, %eax # NOREX
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
%1 = srem i8 %x, %y
ret i8 %1
@@ -149,24 +148,24 @@ define signext i8 @test_srem_sext_ah(i8 %x, i8 %y) {
define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
; X32-LABEL: test_srem_noext_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-NEXT: cbtw
; X32-NEXT: idivb %cl
; X32-NEXT: movsbl %ah, %eax # NOREX
; X32-NEXT: addb %cl, %al
-; X32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X32-NEXT: # kill: def %al killed %al killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test_srem_noext_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cbtw
; X64-NEXT: idivb %sil
; X64-NEXT: movsbl %ah, %eax # NOREX
; X64-NEXT: addb %sil, %al
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
%1 = srem i8 %x, %y
%2 = add i8 %1, %y
@@ -175,7 +174,7 @@ define i8 @test_srem_noext_ah(i8 %x, i8 %y) {
define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
; X32-LABEL: test_srem_sext64_ah:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: cbtw
; X32-NEXT: idivb {{[0-9]+}}(%esp)
@@ -185,12 +184,12 @@ define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
; X32-NEXT: retl
;
; X64-LABEL: test_srem_sext64_ah:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: cbtw
; X64-NEXT: idivb %sil
; X64-NEXT: movsbl %ah, %eax # NOREX
-; X64-NEXT: movsbq %al, %rax
+; X64-NEXT: cltq
; X64-NEXT: retq
%1 = srem i8 %x, %y
%2 = sext i8 %1 to i64
@@ -199,9 +198,9 @@ define i64 @test_srem_sext64_ah(i8 %x, i8 %y) {
define i64 @pr25754(i8 %a, i8 %c) {
; X32-LABEL: pr25754:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X32-NEXT: # kill: def %eax killed %eax def %ax
; X32-NEXT: divb {{[0-9]+}}(%esp)
; X32-NEXT: movzbl %ah, %ecx # NOREX
; X32-NEXT: movzbl %al, %eax
@@ -210,12 +209,11 @@ define i64 @pr25754(i8 %a, i8 %c) {
; X32-NEXT: retl
;
; X64-LABEL: pr25754:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %ax
; X64-NEXT: divb %sil
; X64-NEXT: movzbl %ah, %ecx # NOREX
-; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
index 7ee6b4323d15..205736ada515 100644
--- a/test/CodeGen/X86/dllimport-x86_64.ll
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -1,7 +1,8 @@
; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck %s
; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck %s
;
-; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
+; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple x86_64-pc-windows-msvc -O0 < %s | FileCheck %s
; PR6275
;
; RUN: opt -mtriple x86_64-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
@@ -23,8 +24,6 @@ declare void @dummy(...)
define void @use() nounwind {
; CHECK: callq *__imp_fun(%rip)
-; FAST: movq __imp_fun(%rip), [[R:%[a-z]{3}]]
-; FAST-NEXT: callq *[[R]]
call void @fun()
; CHECK: callq *__imp_inline1(%rip)
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
index 34faaeb6fed7..7f90aa7e313a 100644
--- a/test/CodeGen/X86/dllimport.ll
+++ b/test/CodeGen/X86/dllimport.ll
@@ -1,7 +1,8 @@
; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s
;
-; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
+; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple i386-pc-windows-msvc -O0 < %s | FileCheck %s
; PR6275
;
; RUN: opt -mtriple i386-pc-win32 -O3 -S < %s | FileCheck %s -check-prefix=OPT
@@ -27,8 +28,6 @@ declare void @dummy(...)
define void @use() nounwind {
; CHECK: calll *__imp__fun
-; FAST: movl __imp__fun, [[R:%[a-z]{3}]]
-; FAST-NEXT: calll *[[R]]
call void @fun()
; CHECK: calll *__imp__inline1
diff --git a/test/CodeGen/X86/dollar-name.ll b/test/CodeGen/X86/dollar-name.ll
index a31b806c031f..78ca900c2b89 100644
--- a/test/CodeGen/X86/dollar-name.ll
+++ b/test/CodeGen/X86/dollar-name.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux | FileCheck %s
+; RUN: llc < %s -mtriple=i386-linux | FileCheck %s
; PR1339
@"$bar" = global i32 zeroinitializer
diff --git a/test/CodeGen/X86/domain-reassignment.mir b/test/CodeGen/X86/domain-reassignment.mir
new file mode 100644
index 000000000000..3cb4b5dd1396
--- /dev/null
+++ b/test/CodeGen/X86/domain-reassignment.mir
@@ -0,0 +1,754 @@
+# RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s
+--- |
+ ; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll'
+ source_filename = "../test/CodeGen/X86/gpr-to-mask.ll"
+ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-unknown"
+
+ define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 {
+ entry:
+ br i1 %cond, label %if, label %else
+
+ if: ; preds = %entry
+ %cmp1 = fcmp oeq float %f3, %f4
+ br label %exit
+
+ else: ; preds = %entry
+ %cmp2 = fcmp oeq float %f5, %f6
+ br label %exit
+
+ exit: ; preds = %else, %if
+ %val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ]
+ %selected = select i1 %val, float %f1, float %f2
+ store float %selected, float* %fptr
+ ret void
+ }
+
+ define void @test_8bitops() #0 {
+ ret void
+ }
+ define void @test_16bitops() #0 {
+ ret void
+ }
+ define void @test_32bitops() #0 {
+ ret void
+ }
+ define void @test_64bitops() #0 {
+ ret void
+ }
+ define void @test_16bitext() #0 {
+ ret void
+ }
+ define void @test_32bitext() #0 {
+ ret void
+ }
+ define void @test_64bitext() #0 {
+ ret void
+ }
+...
+---
+name: test_fcmp_storefloat
+# CHECK-LABEL: name: test_fcmp_storefloat
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr8, preferred-register: '' }
+ - { id: 1, class: gr8, preferred-register: '' }
+ - { id: 2, class: gr8, preferred-register: '' }
+ - { id: 3, class: gr32, preferred-register: '' }
+ - { id: 4, class: gr64, preferred-register: '' }
+ - { id: 5, class: vr128x, preferred-register: '' }
+ - { id: 6, class: fr32x, preferred-register: '' }
+ - { id: 7, class: fr32x, preferred-register: '' }
+ - { id: 8, class: fr32x, preferred-register: '' }
+ - { id: 9, class: fr32x, preferred-register: '' }
+ - { id: 10, class: fr32x, preferred-register: '' }
+ - { id: 11, class: gr8, preferred-register: '' }
+ - { id: 12, class: vk1, preferred-register: '' }
+ - { id: 13, class: gr32, preferred-register: '' }
+ - { id: 14, class: vk1, preferred-register: '' }
+ - { id: 15, class: gr32, preferred-register: '' }
+ - { id: 16, class: gr32, preferred-register: '' }
+ - { id: 17, class: gr32, preferred-register: '' }
+ - { id: 18, class: vk1wm, preferred-register: '' }
+ - { id: 19, class: vr128x, preferred-register: '' }
+ - { id: 20, class: fr128, preferred-register: '' }
+ - { id: 21, class: fr128, preferred-register: '' }
+ - { id: 22, class: fr32x, preferred-register: '' }
+liveins:
+ - { reg: '%edi', virtual-reg: '%3' }
+ - { reg: '%rsi', virtual-reg: '%4' }
+ - { reg: '%xmm0', virtual-reg: '%5' }
+ - { reg: '%xmm1', virtual-reg: '%6' }
+ - { reg: '%xmm2', virtual-reg: '%7' }
+ - { reg: '%xmm3', virtual-reg: '%8' }
+ - { reg: '%xmm4', virtual-reg: '%9' }
+ - { reg: '%xmm5', virtual-reg: '%10' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: %edi, %rsi, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+
+ %10 = COPY %xmm5
+ %9 = COPY %xmm4
+ %8 = COPY %xmm3
+ %7 = COPY %xmm2
+ %6 = COPY %xmm1
+ %5 = COPY %xmm0
+ %4 = COPY %rsi
+ %3 = COPY %edi
+ %11 = COPY %3.sub_8bit
+ TEST8ri killed %11, 1, implicit-def %eflags
+ JE_1 %bb.2, implicit %eflags
+ JMP_1 %bb.1
+
+ bb.1.if:
+ successors: %bb.3(0x80000000)
+
+ %14 = VCMPSSZrr %7, %8, 0
+
+ ; check that cross domain copies are replaced with same domain copies.
+ ; CHECK: %15:vk32 = COPY %14
+ ; CHECK: %0:vk8 = COPY %15
+
+ %15 = COPY %14
+ %0 = COPY %15.sub_8bit
+ JMP_1 %bb.3
+
+ bb.2.else:
+ successors: %bb.3(0x80000000)
+ %12 = VCMPSSZrr %9, %10, 0
+
+ ; check that cross domain copies are replaced with same domain copies.
+ ; CHECK: %13:vk32 = COPY %12
+ ; CHECK: %1:vk8 = COPY %13
+
+ %13 = COPY %12
+ %1 = COPY %13.sub_8bit
+
+ bb.3.exit:
+
+  ; check replacement of PHI, IMPLICIT_DEF, and INSERT_SUBREG.
+ ; CHECK: %2:vk8 = PHI %1, %bb.2, %0, %bb.1
+ ; CHECK: %16:vk32 = COPY %2
+ ; CHECK: %18:vk1wm = COPY %16
+
+ %2 = PHI %1, %bb.2, %0, %bb.1
+ %17 = IMPLICIT_DEF
+ %16 = INSERT_SUBREG %17, %2, 1
+ %18 = COPY %16
+ %19 = COPY %6
+ %21 = IMPLICIT_DEF
+ %20 = VMOVSSZrrk %19, killed %18, killed %21, %5
+ %22 = COPY %20
+ VMOVSSZmr %4, 1, %noreg, 0, %noreg, killed %22 :: (store 4 into %ir.fptr)
+ RET 0
+
+...
+---
+name: test_8bitops
+# CHECK-LABEL: name: test_8bitops
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vr512, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: vk8, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr8, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: vk8wm, preferred-register: '' }
+ - { id: 11, class: vr512, preferred-register: '' }
+ - { id: 12, class: gr8, preferred-register: '' }
+ - { id: 13, class: gr8, preferred-register: '' }
+ - { id: 14, class: gr8, preferred-register: '' }
+ - { id: 15, class: gr8, preferred-register: '' }
+ - { id: 16, class: gr8, preferred-register: '' }
+ - { id: 17, class: gr8, preferred-register: '' }
+ - { id: 18, class: gr8, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+ - { reg: '%zmm2', virtual-reg: '%3' }
+ - { reg: '%zmm3', virtual-reg: '%4' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+ %3 = COPY %zmm2
+ %4 = COPY %zmm3
+
+ %5 = VCMPPDZrri %3, %4, 0
+ ; CHECK: %6:vk32 = COPY %5
+ ; CHECK: %7:vk8 = COPY %6
+ %6 = COPY %5
+ %7 = COPY %6.sub_8bit
+
+ ; CHECK: %12:vk8 = KSHIFTRBri %7, 2
+ ; CHECK: %13:vk8 = KSHIFTLBri %12, 1
+ ; CHECK: %14:vk8 = KNOTBrr %13
+ ; CHECK: %15:vk8 = KORBrr %14, %12
+ ; CHECK: %16:vk8 = KANDBrr %15, %13
+ ; CHECK: %17:vk8 = KXORBrr %16, %12
+ ; CHECK: %18:vk8 = KADDBrr %17, %14
+ %12 = SHR8ri %7, 2, implicit-def dead %eflags
+ %13 = SHL8ri %12, 1, implicit-def dead %eflags
+ %14 = NOT8r %13
+ %15 = OR8rr %14, %12, implicit-def dead %eflags
+ %16 = AND8rr %15, %13, implicit-def dead %eflags
+ %17 = XOR8rr %16, %12, implicit-def dead %eflags
+ %18 = ADD8rr %17, %14, implicit-def dead %eflags
+
+ ; CHECK: %9:vk32 = COPY %18
+ ; CHECK: %10:vk8wm = COPY %9
+ %8 = IMPLICIT_DEF
+ %9 = INSERT_SUBREG %8, %18, 1
+ %10 = COPY %9
+ %11 = VMOVAPDZrrk %2, killed %10, %1
+ VMOVAPDZmr %0, 1, %noreg, 0, %noreg, killed %11
+
+ ; CHECK: KTESTBrr %18, %18, implicit-def %eflags
+ TEST8rr %18, %18, implicit-def %eflags
+ JE_1 %bb.1, implicit %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_16bitops
+# CHECK-LABEL: name: test_16bitops
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vr512, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: vk16, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr16, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: vk16wm, preferred-register: '' }
+ - { id: 11, class: vr512, preferred-register: '' }
+ - { id: 12, class: gr16, preferred-register: '' }
+ - { id: 13, class: gr16, preferred-register: '' }
+ - { id: 14, class: gr16, preferred-register: '' }
+ - { id: 15, class: gr16, preferred-register: '' }
+ - { id: 16, class: gr16, preferred-register: '' }
+ - { id: 17, class: gr16, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+ - { reg: '%zmm2', virtual-reg: '%3' }
+ - { reg: '%zmm3', virtual-reg: '%4' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1, %zmm2, %zmm3
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+ %3 = COPY %zmm2
+ %4 = COPY %zmm3
+
+ %5 = VCMPPSZrri %3, %4, 0
+ ; CHECK: %6:vk32 = COPY %5
+ ; CHECK: %7:vk16 = COPY %6
+ %6 = COPY %5
+ %7 = COPY %6.sub_16bit
+
+ ; CHECK: %12:vk16 = KSHIFTRWri %7, 2
+ ; CHECK: %13:vk16 = KSHIFTLWri %12, 1
+ ; CHECK: %14:vk16 = KNOTWrr %13
+ ; CHECK: %15:vk16 = KORWrr %14, %12
+ ; CHECK: %16:vk16 = KANDWrr %15, %13
+ ; CHECK: %17:vk16 = KXORWrr %16, %12
+ %12 = SHR16ri %7, 2, implicit-def dead %eflags
+ %13 = SHL16ri %12, 1, implicit-def dead %eflags
+ %14 = NOT16r %13
+ %15 = OR16rr %14, %12, implicit-def dead %eflags
+ %16 = AND16rr %15, %13, implicit-def dead %eflags
+ %17 = XOR16rr %16, %12, implicit-def dead %eflags
+
+ ; CHECK: %9:vk32 = COPY %17
+ ; CHECK: %10:vk16wm = COPY %9
+ %8 = IMPLICIT_DEF
+ %9 = INSERT_SUBREG %8, %17, 3
+ %10 = COPY %9
+ %11 = VMOVAPSZrrk %2, killed %10, %1
+ VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %11
+
+ ; CHECK: KTESTWrr %17, %17, implicit-def %eflags
+ TEST16rr %17, %17, implicit-def %eflags
+ JE_1 %bb.1, implicit %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_32bitops
+# CHECK-LABEL: name: test_32bitops
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk32wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr32, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr32, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: gr32, preferred-register: '' }
+ - { id: 11, class: gr32, preferred-register: '' }
+ - { id: 12, class: gr32, preferred-register: '' }
+ - { id: 13, class: gr32, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+
+ ; CHECK: %5:vk32 = KMOVDkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %6:vk32 = KSHIFTRDri %5, 2
+ ; CHECK: %7:vk32 = KSHIFTLDri %6, 1
+ ; CHECK: %8:vk32 = KNOTDrr %7
+ ; CHECK: %9:vk32 = KORDrr %8, %6
+ ; CHECK: %10:vk32 = KANDDrr %9, %7
+ ; CHECK: %11:vk32 = KXORDrr %10, %6
+ ; CHECK: %12:vk32 = KANDNDrr %11, %9
+ ; CHECK: %13:vk32 = KADDDrr %12, %11
+ %5 = MOV32rm %0, 1, %noreg, 0, %noreg
+ %6 = SHR32ri %5, 2, implicit-def dead %eflags
+ %7 = SHL32ri %6, 1, implicit-def dead %eflags
+ %8 = NOT32r %7
+ %9 = OR32rr %8, %6, implicit-def dead %eflags
+ %10 = AND32rr %9, %7, implicit-def dead %eflags
+ %11 = XOR32rr %10, %6, implicit-def dead %eflags
+ %12 = ANDN32rr %11, %9, implicit-def dead %eflags
+ %13 = ADD32rr %12, %11, implicit-def dead %eflags
+
+ ; CHECK: %3:vk32wm = COPY %13
+ %3 = COPY %13
+ %4 = VMOVDQU16Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
+
+ ; CHECK: KTESTDrr %13, %13, implicit-def %eflags
+ TEST32rr %13, %13, implicit-def %eflags
+ JE_1 %bb.1, implicit %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_64bitops
+# CHECK-LABEL: name: test_64bitops
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk64wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr64, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64, preferred-register: '' }
+ - { id: 8, class: gr64, preferred-register: '' }
+ - { id: 9, class: gr64, preferred-register: '' }
+ - { id: 10, class: gr64, preferred-register: '' }
+ - { id: 11, class: gr64, preferred-register: '' }
+ - { id: 12, class: gr64, preferred-register: '' }
+ - { id: 13, class: gr64, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+
+ ; CHECK: %5:vk64 = KMOVQkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %6:vk64 = KSHIFTRQri %5, 2
+ ; CHECK: %7:vk64 = KSHIFTLQri %6, 1
+ ; CHECK: %8:vk64 = KNOTQrr %7
+ ; CHECK: %9:vk64 = KORQrr %8, %6
+ ; CHECK: %10:vk64 = KANDQrr %9, %7
+ ; CHECK: %11:vk64 = KXORQrr %10, %6
+ ; CHECK: %12:vk64 = KANDNQrr %11, %9
+ ; CHECK: %13:vk64 = KADDQrr %12, %11
+ %5 = MOV64rm %0, 1, %noreg, 0, %noreg
+ %6 = SHR64ri %5, 2, implicit-def dead %eflags
+ %7 = SHL64ri %6, 1, implicit-def dead %eflags
+ %8 = NOT64r %7
+ %9 = OR64rr %8, %6, implicit-def dead %eflags
+ %10 = AND64rr %9, %7, implicit-def dead %eflags
+ %11 = XOR64rr %10, %6, implicit-def dead %eflags
+ %12 = ANDN64rr %11, %9, implicit-def dead %eflags
+ %13 = ADD64rr %12, %11, implicit-def dead %eflags
+
+ ; CHECK: %3:vk64wm = COPY %13
+ %3 = COPY %13
+ %4 = VMOVDQU8Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
+
+ ; CHECK: KTESTQrr %13, %13, implicit-def %eflags
+ TEST64rr %13, %13, implicit-def %eflags
+ JE_1 %bb.1, implicit %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_16bitext
+# CHECK-LABEL: name: test_16bitext
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk16wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr16, preferred-register: '' }
+ - { id: 6, class: gr16, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+
+ ; CHECK: %7:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %5:vk16 = COPY %7
+ ; CHECK: %6:vk16 = KNOTWrr %5
+ %5 = MOVZX16rm8 %0, 1, %noreg, 0, %noreg
+ %6 = NOT16r %5
+
+ ; CHECK: %3:vk16wm = COPY %6
+ %3 = COPY %6
+ %4 = VMOVAPSZrrk %2, killed %3, %1
+ VMOVAPSZmr %0, 1, %noreg, 0, %noreg, killed %4
+ RET 0
+
+...
+---
+name: test_32bitext
+# CHECK-LABEL: name: test_32bitext
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk64wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr32, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr32, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+
+ ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %5:vk32 = COPY %8
+ ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %6:vk32 = COPY %9
+ ; CHECK: %7:vk32 = KADDDrr %5, %6
+ %5 = MOVZX32rm8 %0, 1, %noreg, 0, %noreg
+ %6 = MOVZX32rm16 %0, 1, %noreg, 0, %noreg
+ %7 = ADD32rr %5, %6, implicit-def dead %eflags
+
+ ; CHECK: %3:vk64wm = COPY %7
+ %3 = COPY %7
+ %4 = VMOVDQU16Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
+ RET 0
+
+...
+---
+name: test_64bitext
+# CHECK-LABEL: name: test_64bitext
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk64wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr64, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64, preferred-register: '' }
+liveins:
+ - { reg: '%rdi', virtual-reg: '%0' }
+ - { reg: '%zmm0', virtual-reg: '%1' }
+ - { reg: '%zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: %rdi, %zmm0, %zmm1
+
+ %0 = COPY %rdi
+ %1 = COPY %zmm0
+ %2 = COPY %zmm1
+
+ ; CHECK: %8:vk8 = KMOVBkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %5:vk64 = COPY %8
+ ; CHECK: %9:vk16 = KMOVWkm %0, 1, %noreg, 0, %noreg
+ ; CHECK: %6:vk64 = COPY %9
+ ; CHECK: %7:vk64 = KADDQrr %5, %6
+ %5 = MOVZX64rm8 %0, 1, %noreg, 0, %noreg
+ %6 = MOVZX64rm16 %0, 1, %noreg, 0, %noreg
+ %7 = ADD64rr %5, %6, implicit-def dead %eflags
+
+ ; CHECK: %3:vk64wm = COPY %7
+ %3 = COPY %7
+ %4 = VMOVDQU8Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, %noreg, 0, %noreg, killed %4
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
index 05245d0d9e1e..e9287b8b93d4 100644
--- a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
+++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
; CHECK-LABEL: @bar
; CHECK-DAG: movl $1074339512,
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index b744a70288e5..8c4b8fdc3607 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -1,5 +1,5 @@
; RUN: llc %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -debug-dump=line %t | FileCheck %s
+; RUN: llvm-dwarfdump -debug-line %t | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll
index c2111f672a2e..9aacaca9c76d 100644
--- a/test/CodeGen/X86/dwarf-headers.ll
+++ b/test/CodeGen/X86/dwarf-headers.ll
@@ -1,18 +1,18 @@
; RUN: llc -dwarf-version=4 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
-; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-4
+; RUN: | llvm-dwarfdump -v - | FileCheck %s --check-prefix=SINGLE-4
; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=4 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
-; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-4
+; RUN: | llvm-dwarfdump -v - | FileCheck %s --check-prefix=SPLIT-4
; RUN: llc -dwarf-version=5 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
-; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-5
+; RUN: | llvm-dwarfdump -v - | FileCheck %s --check-prefix=SINGLE-5
; RUN: llc -split-dwarf-file=foo.dwo -dwarf-version=5 -generate-type-units \
; RUN: -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
-; RUN: | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-5
+; RUN: | llvm-dwarfdump -v - | FileCheck %s --check-prefix=SPLIT-5
; Looking for DWARF headers to be generated correctly.
; There are 7 variants: v4 CU, v4 TU, v5 (normal/skeleton/split) CU,
@@ -94,7 +94,7 @@ target triple = "x86_64-unknown-linux-gnu"
!llvm.module.flags = !{!10, !11}
!llvm.ident = !{!12}
-!0 = !DIGlobalVariableExpression(var: !1)
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = distinct !DIGlobalVariable(name: "s", scope: !2, file: !3, line: 5, type: !6, isLocal: false, isDefinition: true)
!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 5.0.0 (trunk 295942)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
!3 = !DIFile(filename: "t.cpp", directory: "/home/probinson/projects/scratch")
diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
index 996eec05163d..4f3238b44776 100644
--- a/test/CodeGen/X86/dynamic-alloca-lifetime.ll
+++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
@@ -41,4 +41,4 @@ if.else130: ; preds = %bb1
declare void @bar()
attributes #0 = { nounwind }
-attributes #1 = { ssp } \ No newline at end of file
+attributes #1 = { ssp }
diff --git a/test/CodeGen/X86/dynamic-allocas-VLAs.ll b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
index b976a8918679..37de41ea7db1 100644
--- a/test/CodeGen/X86/dynamic-allocas-VLAs.ll
+++ b/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -mattr=+avx -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -stackrealign -stack-alignment=32 -mattr=+avx -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
; rdar://11496434
; no VLAs or dynamic alignment
diff --git a/test/CodeGen/X86/eflags-copy-expansion.mir b/test/CodeGen/X86/eflags-copy-expansion.mir
index 28f47c3c2496..11d4c81b9253 100644
--- a/test/CodeGen/X86/eflags-copy-expansion.mir
+++ b/test/CodeGen/X86/eflags-copy-expansion.mir
@@ -48,7 +48,7 @@ body: |
; Save AL.
; CHECK: PUSH32r killed %eax
- ; Copy EDI into EFLAGS
+ ; Copy edi into EFLAGS
; CHECK-NEXT: %eax = MOV32rr %edi
; CHECK-NEXT: %al = ADD8ri %al, 127, implicit-def %eflags
; CHECK-NEXT: SAHF implicit-def %eflags, implicit %ah
diff --git a/test/CodeGen/X86/empty-functions.ll b/test/CodeGen/X86/empty-functions.ll
index 0c139534e567..520acfb8d316 100644
--- a/test/CodeGen/X86/empty-functions.ll
+++ b/test/CodeGen/X86/empty-functions.ll
@@ -18,9 +18,7 @@ entry:
; CHECK-FP-NEXT: .cfi_startproc
; CHECK-FP-NEXT: :
; CHECK-FP-NEXT: pushq %rbp
-; CHECK-FP-NEXT: :
; CHECK-FP-NEXT: .cfi_def_cfa_offset 16
-; CHECK-FP-NEXT: :
; CHECK-FP-NEXT: .cfi_offset %rbp, -16
; CHECK-FP-NEXT: movq %rsp, %rbp
; CHECK-FP-NEXT: .cfi_endproc
@@ -38,9 +36,7 @@ entry:
; LINUX-FP-NEXT: .cfi_startproc
; LINUX-FP-NEXT: {{^}}#
; LINUX-FP-NEXT: pushq %rbp
-; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
; LINUX-FP-NEXT: .cfi_def_cfa_offset 16
-; LINUX-FP-NEXT: {{^}}.L{{.*}}:{{$}}
; LINUX-FP-NEXT: .cfi_offset %rbp, -16
; LINUX-FP-NEXT: movq %rsp, %rbp
; LINUX-FP-NEXT:{{^}}.L{{.*}}:{{$}}
diff --git a/test/CodeGen/X86/empty-struct-return-type.ll b/test/CodeGen/X86/empty-struct-return-type.ll
index 34cd5d925052..1bfc2f7f467d 100644
--- a/test/CodeGen/X86/empty-struct-return-type.ll
+++ b/test/CodeGen/X86/empty-struct-return-type.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep call
+; RUN: llc < %s | grep call
; PR4688
; Return types can be empty structs, which can be awkward.
diff --git a/test/CodeGen/X86/emutls-pic.ll b/test/CodeGen/X86/emutls-pic.ll
index 50dc72653aea..a83639d0f84a 100644
--- a/test/CodeGen/X86/emutls-pic.ll
+++ b/test/CodeGen/X86/emutls-pic.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=i386-linux-android -relocation-model=pic | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -mtriple=i386-linux-android -relocation-model=pic | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck -check-prefix=X64 %s
; Use my_emutls_get_address like __emutls_get_address.
@my_emutls_v_xyz = external global i8*, align 4
diff --git a/test/CodeGen/X86/emutls-pie.ll b/test/CodeGen/X86/emutls-pie.ll
index 5db8c888a4e4..3c312a926695 100644
--- a/test/CodeGen/X86/emutls-pie.ll
+++ b/test/CodeGen/X86/emutls-pie.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-android -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mcpu=generic -mtriple=i386-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-android -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mcpu=generic -mtriple=x86_64-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
; Use my_emutls_get_address like __emutls_get_address.
diff --git a/test/CodeGen/X86/emutls.ll b/test/CodeGen/X86/emutls.ll
index 9266fe962df2..8c0ba903659b 100644
--- a/test/CodeGen/X86/emutls.ll
+++ b/test/CodeGen/X86/emutls.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=x86-linux-android | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mtriple=x86_64-linux-android | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -mtriple=i386-linux-android | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-android | FileCheck -check-prefix=X64 %s
; Copied from tls.ll; emulated TLS model is not implemented
; for *-pc-win32 and *-pc-windows targets yet.
diff --git a/test/CodeGen/X86/emutls_generic.ll b/test/CodeGen/X86/emutls_generic.ll
index 16d90001426f..0c534f370efc 100644
--- a/test/CodeGen/X86/emutls_generic.ll
+++ b/test/CodeGen/X86/emutls_generic.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -emulated-tls -mtriple=i686-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X86_32 %s
-; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-android -march=x86 -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mtriple=i686-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X86_32 %s
; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-android -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X86_64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck %s
; Make sure that TLS symbols are emitted in expected order.
diff --git a/test/CodeGen/X86/epilogue.ll b/test/CodeGen/X86/epilogue.ll
index 090680e48feb..a4fa3e38dff2 100644
--- a/test/CodeGen/X86/epilogue.ll
+++ b/test/CodeGen/X86/epilogue.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- | FileCheck %s
; CHECK-NOT: lea{{.*}}(%esp)
; CHECK: {{(mov.* %ebp, %esp)|(lea.*\(%ebp\), %esp)}}
diff --git a/test/CodeGen/X86/evex-to-vex-compress.mir b/test/CodeGen/X86/evex-to-vex-compress.mir
index 2295ddb5b2b9..d436547b9d27 100755
--- a/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -run-pass x86-evex-to-vex-compress -verify-machineinstrs -mcpu=skx -o - %s | FileCheck %s
+# RUN: llc -mtriple=x86_64-- -run-pass x86-evex-to-vex-compress -verify-machineinstrs -mcpu=skx -o - %s | FileCheck %s
# This test verifies VEX encoding for AVX-512 instructions that use registers with low indexes and
# do not use zmm or mask registers and have a corresponding AVX/AVX2 opcode
@@ -17,878 +17,878 @@
name: evex_z256_to_vex_test
body: |
bb.0:
- ; CHECK: VMOVAPDYmr %rdi, 1, _, 0, _, %ymm0
- VMOVAPDZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVAPDYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVAPDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPDYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVAPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVAPDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVAPDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVAPDYrr %ymm0
%ymm0 = VMOVAPDZ256rr %ymm0
; CHECK: %ymm0 = VMOVAPDYrr_REV %ymm0
%ymm0 = VMOVAPDZ256rr_REV %ymm0
- ; CHECK: VMOVAPSYmr %rdi, 1, _, 0, _, %ymm0
- VMOVAPSZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVAPSYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVAPSZ256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPSYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVAPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVAPSYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVAPSZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVAPSYrr %ymm0
%ymm0 = VMOVAPSZ256rr %ymm0
; CHECK: %ymm0 = VMOVAPSYrr_REV %ymm0
%ymm0 = VMOVAPSZ256rr_REV %ymm0
- ; CHECK: %ymm0 = VMOVDDUPYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDDUPZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMOVDDUPYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDDUPZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDDUPYrr %ymm0
%ymm0 = VMOVDDUPZ256rr %ymm0
- ; CHECK: VMOVDQAYmr %rdi, 1, _, 0, _, %ymm0
- VMOVDQA32Z256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVDQAYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDQA32Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQAYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVDQA32Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVDQAYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDQA32Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDQAYrr %ymm0
%ymm0 = VMOVDQA32Z256rr %ymm0
; CHECK: %ymm0 = VMOVDQAYrr_REV %ymm0
%ymm0 = VMOVDQA32Z256rr_REV %ymm0
- ; CHECK: VMOVDQAYmr %rdi, 1, _, 0, _, %ymm0
- VMOVDQA64Z256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVDQAYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDQA64Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQAYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVDQA64Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVDQAYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDQA64Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDQAYrr %ymm0
%ymm0 = VMOVDQA64Z256rr %ymm0
; CHECK: %ymm0 = VMOVDQAYrr_REV %ymm0
%ymm0 = VMOVDQA64Z256rr_REV %ymm0
- ; CHECK: VMOVDQUYmr %rdi, 1, _, 0, _, %ymm0
- VMOVDQU16Z256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDQU16Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVDQU16Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDQU16Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDQUYrr %ymm0
%ymm0 = VMOVDQU16Z256rr %ymm0
; CHECK: %ymm0 = VMOVDQUYrr_REV %ymm0
%ymm0 = VMOVDQU16Z256rr_REV %ymm0
- ; CHECK: VMOVDQUYmr %rdi, 1, _, 0, _, %ymm0
- VMOVDQU32Z256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDQU32Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVDQU32Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDQU32Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDQUYrr %ymm0
%ymm0 = VMOVDQU32Z256rr %ymm0
; CHECK: %ymm0 = VMOVDQUYrr_REV %ymm0
%ymm0 = VMOVDQU32Z256rr_REV %ymm0
- ; CHECK: VMOVDQUYmr %rdi, 1, _, 0, _, %ymm0
- VMOVDQU64Z256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDQU64Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVDQU64Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDQU64Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDQUYrr %ymm0
%ymm0 = VMOVDQU64Z256rr %ymm0
; CHECK: %ymm0 = VMOVDQUYrr_REV %ymm0
%ymm0 = VMOVDQU64Z256rr_REV %ymm0
- ; CHECK: VMOVDQUYmr %rdi, 1, _, 0, _, %ymm0
- VMOVDQU8Z256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVDQU8Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVDQU8Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVDQUYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVDQU8Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVDQUYrr %ymm0
%ymm0 = VMOVDQU8Z256rr %ymm0
; CHECK: %ymm0 = VMOVDQUYrr_REV %ymm0
%ymm0 = VMOVDQU8Z256rr_REV %ymm0
- ; CHECK: %ymm0 = VMOVNTDQAYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVNTDQAZ256rm %rip, 1, _, %rax, _
- ; CHECK: VMOVNTDQYmr %rdi, 1, _, 0, _, %ymm0
- VMOVNTDQZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: VMOVNTPDYmr %rdi, 1, _, 0, _, %ymm0
- VMOVNTPDZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: VMOVNTPSYmr %rdi, 1, _, 0, _, %ymm0
- VMOVNTPSZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVSHDUPYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVSHDUPZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMOVNTDQAYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVNTDQAZ256rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: VMOVNTDQYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVNTDQZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: VMOVNTPDYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVNTPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: VMOVNTPSYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVNTPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVSHDUPYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVSHDUPZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVSHDUPYrr %ymm0
%ymm0 = VMOVSHDUPZ256rr %ymm0
- ; CHECK: %ymm0 = VMOVSLDUPYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVSLDUPZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMOVSLDUPYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVSLDUPZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVSLDUPYrr %ymm0
%ymm0 = VMOVSLDUPZ256rr %ymm0
- ; CHECK: VMOVUPDYmr %rdi, 1, _, 0, _, %ymm0
- VMOVUPDZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VMOVUPDYrm %rip, 1, _, %rax, _
- %ymm0 = VMOVUPDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVUPDYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVUPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VMOVUPDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMOVUPDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMOVUPDYrr %ymm0
%ymm0 = VMOVUPDZ256rr %ymm0
; CHECK: %ymm0 = VMOVUPDYrr_REV %ymm0
%ymm0 = VMOVUPDZ256rr_REV %ymm0
- ; CHECK: VMOVUPSYmr %rdi, 1, _, 0, _, %ymm0
- VMOVUPSZ256mr %rdi, 1, _, 0, _, %ymm0
- ; CHECK: %ymm0 = VPANDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPANDDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: VMOVUPSYmr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ VMOVUPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm0
+ ; CHECK: %ymm0 = VPANDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPANDDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPANDYrr %ymm0, %ymm1
%ymm0 = VPANDDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPANDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPANDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPANDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPANDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPANDYrr %ymm0, %ymm1
%ymm0 = VPANDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPANDNYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPANDNDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPANDNYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPANDNDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPANDNYrr %ymm0, %ymm1
%ymm0 = VPANDNDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPANDNYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPANDNQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPANDNYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPANDNQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPANDNYrr %ymm0, %ymm1
%ymm0 = VPANDNQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPAVGBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPAVGBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPAVGBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPAVGBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPAVGBYrr %ymm0, %ymm1
%ymm0 = VPAVGBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPAVGWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPAVGWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPAVGWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPAVGWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPAVGWYrr %ymm0, %ymm1
%ymm0 = VPAVGWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDBYrr %ymm0, %ymm1
%ymm0 = VPADDBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDDYrr %ymm0, %ymm1
%ymm0 = VPADDDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDQYrr %ymm0, %ymm1
%ymm0 = VPADDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDSBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDSBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDSBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDSBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDSBYrr %ymm0, %ymm1
%ymm0 = VPADDSBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDSWYrr %ymm0, %ymm1
%ymm0 = VPADDSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDUSBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDUSBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDUSBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDUSBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDUSBYrr %ymm0, %ymm1
%ymm0 = VPADDUSBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDUSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDUSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDUSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDUSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDUSWYrr %ymm0, %ymm1
%ymm0 = VPADDUSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPADDWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPADDWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPADDWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPADDWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPADDWYrr %ymm0, %ymm1
%ymm0 = VPADDWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMULPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMULPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMULPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMULPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMULPDYrr %ymm0, %ymm1
%ymm0 = VMULPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMULPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMULPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMULPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMULPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMULPSYrr %ymm0, %ymm1
%ymm0 = VMULPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VORPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VORPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VORPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VORPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VORPDYrr %ymm0, %ymm1
%ymm0 = VORPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VORPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VORPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VORPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VORPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VORPSYrr %ymm0, %ymm1
%ymm0 = VORPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMADDUBSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMADDUBSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMADDUBSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMADDUBSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMADDUBSWYrr %ymm0, %ymm1
%ymm0 = VPMADDUBSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMADDWDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMADDWDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMADDWDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMADDWDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMADDWDYrr %ymm0, %ymm1
%ymm0 = VPMADDWDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMAXSBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMAXSBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMAXSBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMAXSBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMAXSBYrr %ymm0, %ymm1
%ymm0 = VPMAXSBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMAXSDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMAXSDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMAXSDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMAXSDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMAXSDYrr %ymm0, %ymm1
%ymm0 = VPMAXSDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMAXSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMAXSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMAXSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMAXSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMAXSWYrr %ymm0, %ymm1
%ymm0 = VPMAXSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMAXUBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMAXUBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMAXUBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMAXUBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMAXUBYrr %ymm0, %ymm1
%ymm0 = VPMAXUBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMAXUDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMAXUDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMAXUDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMAXUDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMAXUDYrr %ymm0, %ymm1
%ymm0 = VPMAXUDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMAXUWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMAXUWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMAXUWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMAXUWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMAXUWYrr %ymm0, %ymm1
%ymm0 = VPMAXUWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMINSBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMINSBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMINSBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMINSBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMINSBYrr %ymm0, %ymm1
%ymm0 = VPMINSBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMINSDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMINSDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMINSDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMINSDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMINSDYrr %ymm0, %ymm1
%ymm0 = VPMINSDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMINSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMINSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMINSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMINSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMINSWYrr %ymm0, %ymm1
%ymm0 = VPMINSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMINUBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMINUBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMINUBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMINUBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMINUBYrr %ymm0, %ymm1
%ymm0 = VPMINUBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMINUDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMINUDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMINUDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMINUDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMINUDYrr %ymm0, %ymm1
%ymm0 = VPMINUDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMINUWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMINUWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMINUWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMINUWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMINUWYrr %ymm0, %ymm1
%ymm0 = VPMINUWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULDQYrr %ymm0, %ymm1
%ymm0 = VPMULDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULHRSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULHRSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULHRSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULHRSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULHRSWYrr %ymm0, %ymm1
%ymm0 = VPMULHRSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULHUWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULHUWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULHUWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULHUWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULHUWYrr %ymm0, %ymm1
%ymm0 = VPMULHUWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULHWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULHWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULHWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULHWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULHWYrr %ymm0, %ymm1
%ymm0 = VPMULHWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULLDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULLDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULLDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULLDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULLDYrr %ymm0, %ymm1
%ymm0 = VPMULLDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULLWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULLWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULLWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULLWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULLWYrr %ymm0, %ymm1
%ymm0 = VPMULLWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPMULUDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPMULUDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMULUDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMULUDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMULUDQYrr %ymm0, %ymm1
%ymm0 = VPMULUDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPORYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPORDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPORYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPORDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPORYrr %ymm0, %ymm1
%ymm0 = VPORDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPORYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPORQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPORYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPORQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPORYrr %ymm0, %ymm1
%ymm0 = VPORQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBBYrr %ymm0, %ymm1
%ymm0 = VPSUBBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBDYrr %ymm0, %ymm1
%ymm0 = VPSUBDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBQYrr %ymm0, %ymm1
%ymm0 = VPSUBQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBSBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBSBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBSBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBSBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBSBYrr %ymm0, %ymm1
%ymm0 = VPSUBSBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBSWYrr %ymm0, %ymm1
%ymm0 = VPSUBSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBUSBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBUSBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBUSBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBUSBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBUSBYrr %ymm0, %ymm1
%ymm0 = VPSUBUSBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBUSWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBUSWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBUSWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBUSWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBUSWYrr %ymm0, %ymm1
%ymm0 = VPSUBUSWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSUBWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSUBWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSUBWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSUBWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSUBWYrr %ymm0, %ymm1
%ymm0 = VPSUBWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPXORYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPXORDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPXORYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPXORDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPXORYrr %ymm0, %ymm1
%ymm0 = VPXORDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPXORYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPXORQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPXORYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPXORQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPXORYrr %ymm0, %ymm1
%ymm0 = VPXORQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VADDPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VADDPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VADDPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VADDPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VADDPDYrr %ymm0, %ymm1
%ymm0 = VADDPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VADDPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VADDPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VADDPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VADDPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VADDPSYrr %ymm0, %ymm1
%ymm0 = VADDPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VANDNPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VANDNPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VANDNPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VANDNPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VANDNPDYrr %ymm0, %ymm1
%ymm0 = VANDNPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VANDNPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VANDNPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VANDNPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VANDNPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VANDNPSYrr %ymm0, %ymm1
%ymm0 = VANDNPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VANDPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VANDPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VANDPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VANDPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VANDPDYrr %ymm0, %ymm1
%ymm0 = VANDPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VANDPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VANDPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VANDPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VANDPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VANDPSYrr %ymm0, %ymm1
%ymm0 = VANDPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VDIVPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VDIVPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VDIVPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VDIVPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VDIVPDYrr %ymm0, %ymm1
%ymm0 = VDIVPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VDIVPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VDIVPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VDIVPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VDIVPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VDIVPSYrr %ymm0, %ymm1
%ymm0 = VDIVPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMAXCPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMAXCPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMAXCPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMAXCPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMAXCPDYrr %ymm0, %ymm1
%ymm0 = VMAXCPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMAXCPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMAXCPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMAXCPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMAXCPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMAXCPSYrr %ymm0, %ymm1
%ymm0 = VMAXCPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMAXCPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMAXPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMAXCPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMAXPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMAXCPDYrr %ymm0, %ymm1
%ymm0 = VMAXPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMAXCPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMAXPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMAXCPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMAXPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMAXCPSYrr %ymm0, %ymm1
%ymm0 = VMAXPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMINCPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMINCPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMINCPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMINCPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMINCPDYrr %ymm0, %ymm1
%ymm0 = VMINCPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMINCPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMINCPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMINCPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMINCPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMINCPSYrr %ymm0, %ymm1
%ymm0 = VMINCPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMINCPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMINPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMINCPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMINPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMINCPDYrr %ymm0, %ymm1
%ymm0 = VMINPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VMINCPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VMINPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VMINCPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VMINPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VMINCPSYrr %ymm0, %ymm1
%ymm0 = VMINPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VXORPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VXORPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VXORPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VXORPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VXORPDYrr %ymm0, %ymm1
%ymm0 = VXORPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VXORPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VXORPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VXORPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VXORPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VXORPSYrr %ymm0, %ymm1
%ymm0 = VXORPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPACKSSDWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPACKSSDWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPACKSSDWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPACKSSDWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPACKSSDWYrr %ymm0, %ymm1
%ymm0 = VPACKSSDWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPACKSSWBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPACKSSWBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPACKSSWBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPACKSSWBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPACKSSWBYrr %ymm0, %ymm1
%ymm0 = VPACKSSWBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPACKUSDWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPACKUSDWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPACKUSDWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPACKUSDWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPACKUSDWYrr %ymm0, %ymm1
%ymm0 = VPACKUSDWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPACKUSWBYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPACKUSWBZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPACKUSWBYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPACKUSWBZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPACKUSWBYrr %ymm0, %ymm1
%ymm0 = VPACKUSWBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VUNPCKHPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VUNPCKHPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VUNPCKHPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VUNPCKHPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VUNPCKHPDYrr %ymm0, %ymm1
%ymm0 = VUNPCKHPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VUNPCKHPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VUNPCKHPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VUNPCKHPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VUNPCKHPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VUNPCKHPSYrr %ymm0, %ymm1
%ymm0 = VUNPCKHPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VUNPCKLPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VUNPCKLPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VUNPCKLPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VUNPCKLPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VUNPCKLPDYrr %ymm0, %ymm1
%ymm0 = VUNPCKLPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VUNPCKLPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VUNPCKLPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VUNPCKLPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VUNPCKLPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VUNPCKLPSYrr %ymm0, %ymm1
%ymm0 = VUNPCKLPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VSUBPDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VSUBPDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VSUBPDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VSUBPDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VSUBPDYrr %ymm0, %ymm1
%ymm0 = VSUBPDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VSUBPSYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VSUBPSZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VSUBPSYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VSUBPSZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VSUBPSYrr %ymm0, %ymm1
%ymm0 = VSUBPSZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKHBWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKHBWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKHBWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKHBWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKHBWYrr %ymm0, %ymm1
%ymm0 = VPUNPCKHBWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKHDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKHDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKHDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKHDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKHDQYrr %ymm0, %ymm1
%ymm0 = VPUNPCKHDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKHQDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKHQDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKHQDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKHQDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKHQDQYrr %ymm0, %ymm1
%ymm0 = VPUNPCKHQDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKHWDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKHWDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKHWDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKHWDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKHWDYrr %ymm0, %ymm1
%ymm0 = VPUNPCKHWDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKLBWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKLBWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKLBWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKLBWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKLBWYrr %ymm0, %ymm1
%ymm0 = VPUNPCKLBWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKLDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKLDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKLDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKLDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKLDQYrr %ymm0, %ymm1
%ymm0 = VPUNPCKLDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKLQDQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKLQDQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKLQDQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKLQDQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKLQDQYrr %ymm0, %ymm1
%ymm0 = VPUNPCKLQDQZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPUNPCKLWDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPUNPCKLWDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPUNPCKLWDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPUNPCKLWDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPUNPCKLWDYrr %ymm0, %ymm1
%ymm0 = VPUNPCKLWDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VFMADD132PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADD132PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADD132PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADD132PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADD132PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADD132PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADD132PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADD132PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADD132PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADD132PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADD132PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADD132PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADD213PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADD213PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADD213PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADD213PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADD213PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADD213PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADD213PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADD213PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADD213PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADD213PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADD213PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADD213PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADD231PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADD231PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADD231PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADD231PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADD231PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADD231PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADD231PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADD231PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADD231PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADD231PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADD231PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADD231PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADDSUB132PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADDSUB132PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADDSUB132PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADDSUB132PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADDSUB132PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADDSUB132PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADDSUB132PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADDSUB132PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADDSUB132PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADDSUB132PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADDSUB132PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADDSUB132PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADDSUB213PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADDSUB213PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADDSUB213PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADDSUB213PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADDSUB213PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADDSUB213PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADDSUB213PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADDSUB213PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADDSUB213PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADDSUB213PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADDSUB213PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADDSUB213PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADDSUB231PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADDSUB231PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADDSUB231PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADDSUB231PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADDSUB231PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADDSUB231PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMADDSUB231PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMADDSUB231PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMADDSUB231PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMADDSUB231PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMADDSUB231PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMADDSUB231PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUB132PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUB132PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUB132PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUB132PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUB132PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUB132PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUB132PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUB132PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUB132PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUB132PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUB132PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUB132PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUB213PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUB213PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUB213PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUB213PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUB213PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUB213PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUB213PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUB213PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUB213PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUB213PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUB213PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUB213PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUB231PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUB231PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUB231PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUB231PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUB231PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUB231PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUB231PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUB231PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUB231PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUB231PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUB231PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUB231PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUBADD132PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUBADD132PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUBADD132PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUBADD132PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUBADD132PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUBADD132PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUBADD132PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUBADD132PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUBADD132PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUBADD132PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUBADD132PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUBADD132PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUBADD213PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUBADD213PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUBADD213PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUBADD213PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUBADD213PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUBADD213PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUBADD213PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUBADD213PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUBADD213PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUBADD213PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUBADD213PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUBADD213PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUBADD231PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUBADD231PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUBADD231PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUBADD231PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUBADD231PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUBADD231PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFMSUBADD231PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFMSUBADD231PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFMSUBADD231PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFMSUBADD231PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFMSUBADD231PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFMSUBADD231PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMADD132PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMADD132PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMADD132PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMADD132PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMADD132PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMADD132PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMADD132PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMADD132PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMADD132PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMADD132PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMADD132PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMADD132PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMADD213PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMADD213PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMADD213PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMADD213PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMADD213PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMADD213PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMADD213PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMADD213PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMADD213PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMADD213PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMADD213PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMADD213PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMADD231PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMADD231PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMADD231PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMADD231PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMADD231PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMADD231PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMADD231PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMADD231PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMADD231PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMADD231PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMADD231PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMADD231PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMSUB132PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMSUB132PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMSUB132PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMSUB132PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMSUB132PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMSUB132PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMSUB132PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMSUB132PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMSUB132PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMSUB132PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMSUB132PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMSUB132PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMSUB213PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMSUB213PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMSUB213PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMSUB213PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMSUB213PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMSUB213PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMSUB213PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMSUB213PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMSUB213PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMSUB213PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMSUB213PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMSUB213PSZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMSUB231PDYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMSUB231PDZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMSUB231PDYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMSUB231PDZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMSUB231PDYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMSUB231PDZ256r %ymm0, %ymm1, %ymm2
- ; CHECK: %ymm0 = VFNMSUB231PSYm %ymm0, %ymm0, %rsi, 1, _, 0, _
- %ymm0 = VFNMSUB231PSZ256m %ymm0, %ymm0, %rsi, 1, _, 0, _
+ ; CHECK: %ymm0 = VFNMSUB231PSYm %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
+ %ymm0 = VFNMSUB231PSZ256m %ymm0, %ymm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VFNMSUB231PSYr %ymm0, %ymm1, %ymm2
%ymm0 = VFNMSUB231PSZ256r %ymm0, %ymm1, %ymm2
; CHECK: %ymm0 = VPSRADYri %ymm0, 7
%ymm0 = VPSRADZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSRADYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRADZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRADYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRADZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRADYrr %ymm0, %xmm1
%ymm0 = VPSRADZ256rr %ymm0, %xmm1
- ; CHECK: %ymm0 = VPSRAVDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRAVDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRAVDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRAVDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRAVDYrr %ymm0, %ymm1
%ymm0 = VPSRAVDZ256rr %ymm0, %ymm1
; CHECK: %ymm0 = VPSRAWYri %ymm0, 7
%ymm0 = VPSRAWZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSRAWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRAWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRAWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRAWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRAWYrr %ymm0, %xmm1
%ymm0 = VPSRAWZ256rr %ymm0, %xmm1
; CHECK: %ymm0 = VPSRLDQYri %ymm0, %ymm1
%ymm0 = VPSRLDQZ256rr %ymm0, %ymm1
; CHECK: %ymm0 = VPSRLDYri %ymm0, 7
%ymm0 = VPSRLDZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSRLDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRLDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRLDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRLDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRLDYrr %ymm0, %xmm1
%ymm0 = VPSRLDZ256rr %ymm0, %xmm1
; CHECK: %ymm0 = VPSRLQYri %ymm0, 7
%ymm0 = VPSRLQZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSRLQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRLQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRLQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRLQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRLQYrr %ymm0, %xmm1
%ymm0 = VPSRLQZ256rr %ymm0, %xmm1
- ; CHECK: %ymm0 = VPSRLVDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRLVDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRLVDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRLVDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRLVDYrr %ymm0, %ymm1
%ymm0 = VPSRLVDZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSRLVQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRLVQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRLVQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRLVQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRLVQYrr %ymm0, %ymm1
%ymm0 = VPSRLVQZ256rr %ymm0, %ymm1
; CHECK: %ymm0 = VPSRLWYri %ymm0, 7
%ymm0 = VPSRLWZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSRLWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSRLWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSRLWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSRLWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSRLWYrr %ymm0, %xmm1
%ymm0 = VPSRLWZ256rr %ymm0, %xmm1
- ; CHECK: %ymm0 = VPMOVSXBDYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVSXBDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVSXBDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVSXBDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVSXBDYrr %xmm0
%ymm0 = VPMOVSXBDZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVSXBQYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVSXBQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVSXBQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVSXBQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVSXBQYrr %xmm0
%ymm0 = VPMOVSXBQZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVSXBWYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVSXBWZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVSXBWYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVSXBWZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVSXBWYrr %xmm0
%ymm0 = VPMOVSXBWZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVSXDQYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVSXDQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVSXDQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVSXDQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVSXDQYrr %xmm0
%ymm0 = VPMOVSXDQZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVSXWDYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVSXWDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVSXWDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVSXWDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVSXWDYrr %xmm0
%ymm0 = VPMOVSXWDZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVSXWQYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVSXWQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVSXWQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVSXWQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVSXWQYrr %xmm0
%ymm0 = VPMOVSXWQZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVZXBDYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVZXBDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVZXBDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVZXBDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVZXBDYrr %xmm0
%ymm0 = VPMOVZXBDZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVZXBQYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVZXBQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVZXBQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVZXBQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVZXBQYrr %xmm0
%ymm0 = VPMOVZXBQZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVZXBWYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVZXBWZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVZXBWYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVZXBWZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVZXBWYrr %xmm0
%ymm0 = VPMOVZXBWZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVZXDQYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVZXDQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVZXDQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVZXDQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVZXDQYrr %xmm0
%ymm0 = VPMOVZXDQZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVZXWDYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVZXWDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVZXWDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVZXWDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVZXWDYrr %xmm0
%ymm0 = VPMOVZXWDZ256rr %xmm0
- ; CHECK: %ymm0 = VPMOVZXWQYrm %rip, 1, _, %rax, _
- %ymm0 = VPMOVZXWQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPMOVZXWQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPMOVZXWQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPMOVZXWQYrr %xmm0
%ymm0 = VPMOVZXWQZ256rr %xmm0
- ; CHECK: %ymm0 = VBROADCASTF128 %rip, 1, _, %rax, _
- %ymm0 = VBROADCASTF32X4Z256rm %rip, 1, _, %rax, _
- ; CHECK: %ymm0 = VBROADCASTSDYrm %rip, 1, _, %rax, _
- %ymm0 = VBROADCASTF32X2Z256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VBROADCASTF128 %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VBROADCASTF32X4Z256rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %ymm0 = VBROADCASTSDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VBROADCASTF32X2Z256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VBROADCASTSDYrr %xmm0
%ymm0 = VBROADCASTF32X2Z256r %xmm0
- ; CHECK: %ymm0 = VBROADCASTSDYrm %rip, 1, _, %rax, _
- %ymm0 = VBROADCASTSDZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VBROADCASTSDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VBROADCASTSDZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VBROADCASTSDYrr %xmm0
%ymm0 = VBROADCASTSDZ256r %xmm0
- ; CHECK: %ymm0 = VBROADCASTSSYrm %rip, 1, _, %rax, _
- %ymm0 = VBROADCASTSSZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VBROADCASTSSYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VBROADCASTSSZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VBROADCASTSSYrr %xmm0
%ymm0 = VBROADCASTSSZ256r %xmm0
- ; CHECK: %ymm0 = VPBROADCASTBYrm %rip, 1, _, %rax, _
- %ymm0 = VPBROADCASTBZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPBROADCASTBYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPBROADCASTBZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPBROADCASTBYrr %xmm0
%ymm0 = VPBROADCASTBZ256r %xmm0
- ; CHECK: %ymm0 = VPBROADCASTDYrm %rip, 1, _, %rax, _
- %ymm0 = VPBROADCASTDZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPBROADCASTDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPBROADCASTDZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPBROADCASTDYrr %xmm0
%ymm0 = VPBROADCASTDZ256r %xmm0
- ; CHECK: %ymm0 = VPBROADCASTWYrm %rip, 1, _, %rax, _
- %ymm0 = VPBROADCASTWZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPBROADCASTWYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPBROADCASTWZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPBROADCASTWYrr %xmm0
%ymm0 = VPBROADCASTWZ256r %xmm0
- ; CHECK: %ymm0 = VBROADCASTI128 %rip, 1, _, %rax, _
- %ymm0 = VBROADCASTI32X4Z256rm %rip, 1, _, %rax, _
- ; CHECK: %ymm0 = VPBROADCASTQYrm %rip, 1, _, %rax, _
- %ymm0 = VBROADCASTI32X2Z256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VBROADCASTI128 %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VBROADCASTI32X4Z256rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %ymm0 = VPBROADCASTQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VBROADCASTI32X2Z256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPBROADCASTQYrr %xmm0
%ymm0 = VBROADCASTI32X2Z256r %xmm0
- ; CHECK: %ymm0 = VPBROADCASTQYrm %rip, 1, _, %rax, _
- %ymm0 = VPBROADCASTQZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPBROADCASTQYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPBROADCASTQZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPBROADCASTQYrr %xmm0
%ymm0 = VPBROADCASTQZ256r %xmm0
- ; CHECK: %ymm0 = VPABSBYrm %rip, 1, _, %rax, _
- %ymm0 = VPABSBZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPABSBYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPABSBZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPABSBYrr %ymm0
%ymm0 = VPABSBZ256rr %ymm0
- ; CHECK: %ymm0 = VPABSDYrm %rip, 1, _, %rax, _
- %ymm0 = VPABSDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPABSDYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPABSDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPABSDYrr %ymm0
%ymm0 = VPABSDZ256rr %ymm0
- ; CHECK: %ymm0 = VPABSWYrm %rip, 1, _, %rax, _
- %ymm0 = VPABSWZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPABSWYrm %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPABSWZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPABSWYrr %ymm0
%ymm0 = VPABSWZ256rr %ymm0
- ; CHECK: %ymm0 = VPSADBWYrm %ymm0, 1, _, %rax, _, _
- %ymm0 = VPSADBWZ256rm %ymm0, 1, _, %rax, _, _
+ ; CHECK: %ymm0 = VPSADBWYrm %ymm0, 1, %noreg, %rax, %noreg, %noreg
+ %ymm0 = VPSADBWZ256rm %ymm0, 1, %noreg, %rax, %noreg, %noreg
; CHECK: %ymm0 = VPSADBWYrr %ymm0, %ymm1
%ymm0 = VPSADBWZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPERMDYrm %ymm0, %rdi, 1, _, 0, _
- %ymm0 = VPERMDZ256rm %ymm0, %rdi, 1, _, 0, _
+ ; CHECK: %ymm0 = VPERMDYrm %ymm0, %rdi, 1, %noreg, 0, %noreg
+ %ymm0 = VPERMDZ256rm %ymm0, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VPERMDYrr %ymm1, %ymm0
%ymm0 = VPERMDZ256rr %ymm1, %ymm0
- ; CHECK: %ymm0 = VPERMILPDYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPERMILPDZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPERMILPDYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPERMILPDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPERMILPDYri %ymm0, 7
%ymm0 = VPERMILPDZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPERMILPDYrm %ymm0, %rdi, 1, _, 0, _
- %ymm0 = VPERMILPDZ256rm %ymm0, %rdi, 1, _, 0, _
+ ; CHECK: %ymm0 = VPERMILPDYrm %ymm0, %rdi, 1, %noreg, 0, %noreg
+ %ymm0 = VPERMILPDZ256rm %ymm0, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VPERMILPDYrr %ymm1, %ymm0
%ymm0 = VPERMILPDZ256rr %ymm1, %ymm0
- ; CHECK: %ymm0 = VPERMILPSYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPERMILPSZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPERMILPSYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPERMILPSZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPERMILPSYri %ymm0, 7
%ymm0 = VPERMILPSZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPERMILPSYrm %ymm0, %rdi, 1, _, 0, _
- %ymm0 = VPERMILPSZ256rm %ymm0, %rdi, 1, _, 0, _
+ ; CHECK: %ymm0 = VPERMILPSYrm %ymm0, %rdi, 1, %noreg, 0, %noreg
+ %ymm0 = VPERMILPSZ256rm %ymm0, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VPERMILPSYrr %ymm1, %ymm0
%ymm0 = VPERMILPSZ256rr %ymm1, %ymm0
- ; CHECK: %ymm0 = VPERMPDYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPERMPDZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPERMPDYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPERMPDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPERMPDYri %ymm0, 7
%ymm0 = VPERMPDZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPERMPSYrm %ymm0, %rdi, 1, _, 0, _
- %ymm0 = VPERMPSZ256rm %ymm0, %rdi, 1, _, 0, _
+ ; CHECK: %ymm0 = VPERMPSYrm %ymm0, %rdi, 1, %noreg, 0, %noreg
+ %ymm0 = VPERMPSZ256rm %ymm0, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VPERMPSYrr %ymm1, %ymm0
%ymm0 = VPERMPSZ256rr %ymm1, %ymm0
- ; CHECK: %ymm0 = VPERMQYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPERMQZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPERMQYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPERMQZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPERMQYri %ymm0, 7
%ymm0 = VPERMQZ256ri %ymm0, 7
; CHECK: %ymm0 = VPSLLDQYri %ymm0, 14
%ymm0 = VPSLLDQZ256rr %ymm0, 14
; CHECK: %ymm0 = VPSLLDYri %ymm0, 7
%ymm0 = VPSLLDZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSLLDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSLLDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSLLDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSLLDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSLLDYrr %ymm0, 14
%ymm0 = VPSLLDZ256rr %ymm0, 14
; CHECK: %ymm0 = VPSLLQYri %ymm0, 7
%ymm0 = VPSLLQZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSLLQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSLLQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSLLQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSLLQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSLLQYrr %ymm0, 14
%ymm0 = VPSLLQZ256rr %ymm0, 14
- ; CHECK: %ymm0 = VPSLLVDYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSLLVDZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSLLVDYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSLLVDZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSLLVDYrr %ymm0, 14
%ymm0 = VPSLLVDZ256rr %ymm0, 14
- ; CHECK: %ymm0 = VPSLLVQYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSLLVQZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSLLVQYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSLLVQZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSLLVQYrr %ymm0, 14
%ymm0 = VPSLLVQZ256rr %ymm0, 14
; CHECK: %ymm0 = VPSLLWYri %ymm0, 7
%ymm0 = VPSLLWZ256ri %ymm0, 7
- ; CHECK: %ymm0 = VPSLLWYrm %ymm0, %rip, 1, _, %rax, _
- %ymm0 = VPSLLWZ256rm %ymm0, %rip, 1, _, %rax, _
+ ; CHECK: %ymm0 = VPSLLWYrm %ymm0, %rip, 1, %noreg, %rax, %noreg
+ %ymm0 = VPSLLWZ256rm %ymm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm0 = VPSLLWYrr %ymm0, 14
%ymm0 = VPSLLWZ256rr %ymm0, 14
- ; CHECK: %ymm0 = VCVTDQ2PDYrm %rdi, %ymm0, 1, _, 0
- %ymm0 = VCVTDQ2PDZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %ymm0 = VCVTDQ2PDYrm %rdi, %ymm0, 1, %noreg, 0
+ %ymm0 = VCVTDQ2PDZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %ymm0 = VCVTDQ2PDYrr %xmm0
%ymm0 = VCVTDQ2PDZ256rr %xmm0
- ; CHECK: %ymm0 = VCVTDQ2PSYrm %rdi, %ymm0, 1, _, 0
- %ymm0 = VCVTDQ2PSZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %ymm0 = VCVTDQ2PSYrm %rdi, %ymm0, 1, %noreg, 0
+ %ymm0 = VCVTDQ2PSZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %ymm0 = VCVTDQ2PSYrr %ymm0
%ymm0 = VCVTDQ2PSZ256rr %ymm0
- ; CHECK: %xmm0 = VCVTPD2DQYrm %rdi, %ymm0, 1, _, 0
- %xmm0 = VCVTPD2DQZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPD2DQYrm %rdi, %ymm0, 1, %noreg, 0
+ %xmm0 = VCVTPD2DQZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPD2DQYrr %ymm0
%xmm0 = VCVTPD2DQZ256rr %ymm0
- ; CHECK: %xmm0 = VCVTPD2PSYrm %rdi, %ymm0, 1, _, 0
- %xmm0 = VCVTPD2PSZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPD2PSYrm %rdi, %ymm0, 1, %noreg, 0
+ %xmm0 = VCVTPD2PSZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPD2PSYrr %ymm0
%xmm0 = VCVTPD2PSZ256rr %ymm0
- ; CHECK: %ymm0 = VCVTPS2DQYrm %rdi, %ymm0, 1, _, 0
- %ymm0 = VCVTPS2DQZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %ymm0 = VCVTPS2DQYrm %rdi, %ymm0, 1, %noreg, 0
+ %ymm0 = VCVTPS2DQZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %ymm0 = VCVTPS2DQYrr %ymm0
%ymm0 = VCVTPS2DQZ256rr %ymm0
- ; CHECK: %ymm0 = VCVTPS2PDYrm %rdi, %ymm0, 1, _, 0
- %ymm0 = VCVTPS2PDZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %ymm0 = VCVTPS2PDYrm %rdi, %ymm0, 1, %noreg, 0
+ %ymm0 = VCVTPS2PDZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %ymm0 = VCVTPS2PDYrr %xmm0
%ymm0 = VCVTPS2PDZ256rr %xmm0
- ; CHECK: VCVTPS2PHYmr %rdi, %ymm0, 1, _, 0, _, _
- VCVTPS2PHZ256mr %rdi, %ymm0, 1, _, 0, _, _
- ; CHECK: %xmm0 = VCVTPS2PHYrr %ymm0, _
- %xmm0 = VCVTPS2PHZ256rr %ymm0, _
- ; CHECK: %ymm0 = VCVTPH2PSYrm %rdi, %ymm0, 1, _, 0
- %ymm0 = VCVTPH2PSZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: VCVTPS2PHYmr %rdi, %ymm0, 1, %noreg, 0, %noreg, %noreg
+ VCVTPS2PHZ256mr %rdi, %ymm0, 1, %noreg, 0, %noreg, %noreg
+ ; CHECK: %xmm0 = VCVTPS2PHYrr %ymm0, %noreg
+ %xmm0 = VCVTPS2PHZ256rr %ymm0, %noreg
+ ; CHECK: %ymm0 = VCVTPH2PSYrm %rdi, %ymm0, 1, %noreg, 0
+ %ymm0 = VCVTPH2PSZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %ymm0 = VCVTPH2PSYrr %xmm0
%ymm0 = VCVTPH2PSZ256rr %xmm0
- ; CHECK: %xmm0 = VCVTTPD2DQYrm %rdi, %ymm0, 1, _, 0
- %xmm0 = VCVTTPD2DQZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTTPD2DQYrm %rdi, %ymm0, 1, %noreg, 0
+ %xmm0 = VCVTTPD2DQZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTTPD2DQYrr %ymm0
%xmm0 = VCVTTPD2DQZ256rr %ymm0
- ; CHECK: %ymm0 = VCVTTPS2DQYrm %rdi, %ymm0, 1, _, 0
- %ymm0 = VCVTTPS2DQZ256rm %rdi, %ymm0, 1, _, 0
+ ; CHECK: %ymm0 = VCVTTPS2DQYrm %rdi, %ymm0, 1, %noreg, 0
+ %ymm0 = VCVTTPS2DQZ256rm %rdi, %ymm0, 1, %noreg, 0
; CHECK: %ymm0 = VCVTTPS2DQYrr %ymm0
%ymm0 = VCVTTPS2DQZ256rr %ymm0
- ; CHECK: %ymm0 = VSQRTPDYm %rdi, _, _, _, _
- %ymm0 = VSQRTPDZ256m %rdi, _, _, _, _
+ ; CHECK: %ymm0 = VSQRTPDYm %rdi, %noreg, %noreg, %noreg, %noreg
+ %ymm0 = VSQRTPDZ256m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %ymm0 = VSQRTPDYr %ymm0
%ymm0 = VSQRTPDZ256r %ymm0
- ; CHECK: %ymm0 = VSQRTPSYm %rdi, _, _, _, _
- %ymm0 = VSQRTPSZ256m %rdi, _, _, _, _
+ ; CHECK: %ymm0 = VSQRTPSYm %rdi, %noreg, %noreg, %noreg, %noreg
+ %ymm0 = VSQRTPSZ256m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %ymm0 = VSQRTPSYr %ymm0
%ymm0 = VSQRTPSZ256r %ymm0
- ; CHECK: %ymm0 = VPALIGNRYrmi %ymm0, %rdi, _, _, _, _, _
- %ymm0 = VPALIGNRZ256rmi %ymm0, %rdi, _, _, _, _, _
- ; CHECK: %ymm0 = VPALIGNRYrri %ymm0, %ymm1, _
- %ymm0 = VPALIGNRZ256rri %ymm0, %ymm1, _
- ; CHECK: %ymm0 = VMOVUPSYrm %rdi, 1, _, 0, _
- %ymm0 = VMOVUPSZ256rm %rdi, 1, _, 0, _
+ ; CHECK: %ymm0 = VPALIGNRYrmi %ymm0, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm0 = VPALIGNRZ256rmi %ymm0, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %ymm0 = VPALIGNRYrri %ymm0, %ymm1, %noreg
+ %ymm0 = VPALIGNRZ256rri %ymm0, %ymm1, %noreg
+ ; CHECK: %ymm0 = VMOVUPSYrm %rdi, 1, %noreg, 0, %noreg
+ %ymm0 = VMOVUPSZ256rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm0 = VMOVUPSYrr %ymm0
%ymm0 = VMOVUPSZ256rr %ymm0
; CHECK: %ymm0 = VMOVUPSYrr_REV %ymm0
%ymm0 = VMOVUPSZ256rr_REV %ymm0
- ; CHECK: %ymm0 = VPSHUFBYrm %ymm0, _, _, _, _, _
- %ymm0 = VPSHUFBZ256rm %ymm0, _, _, _, _, _
+ ; CHECK: %ymm0 = VPSHUFBYrm %ymm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm0 = VPSHUFBZ256rm %ymm0, %noreg, %noreg, %noreg, %noreg, %noreg
; CHECK: %ymm0 = VPSHUFBYrr %ymm0, %ymm1
%ymm0 = VPSHUFBZ256rr %ymm0, %ymm1
- ; CHECK: %ymm0 = VPSHUFDYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPSHUFDZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPSHUFDYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPSHUFDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPSHUFDYri %ymm0, -24
%ymm0 = VPSHUFDZ256ri %ymm0, -24
- ; CHECK: %ymm0 = VPSHUFHWYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPSHUFHWZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPSHUFHWYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPSHUFHWZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPSHUFHWYri %ymm0, -24
%ymm0 = VPSHUFHWZ256ri %ymm0, -24
- ; CHECK: %ymm0 = VPSHUFLWYmi %rdi, 1, _, 0, _, _
- %ymm0 = VPSHUFLWZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm0 = VPSHUFLWYmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm0 = VPSHUFLWZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm0 = VPSHUFLWYri %ymm0, -24
%ymm0 = VPSHUFLWZ256ri %ymm0, -24
- ; CHECK: %ymm0 = VSHUFPDYrmi %ymm0, _, _, _, _, _, _
- %ymm0 = VSHUFPDZ256rmi %ymm0, _, _, _, _, _, _
- ; CHECK: %ymm0 = VSHUFPDYrri %ymm0, _, _
- %ymm0 = VSHUFPDZ256rri %ymm0, _, _
- ; CHECK: %ymm0 = VSHUFPSYrmi %ymm0, _, _, _, _, _, _
- %ymm0 = VSHUFPSZ256rmi %ymm0, _, _, _, _, _, _
- ; CHECK: %ymm0 = VSHUFPSYrri %ymm0, _, _
- %ymm0 = VSHUFPSZ256rri %ymm0, _, _
+ ; CHECK: %ymm0 = VSHUFPDYrmi %ymm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm0 = VSHUFPDZ256rmi %ymm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %ymm0 = VSHUFPDYrri %ymm0, %noreg, %noreg
+ %ymm0 = VSHUFPDZ256rri %ymm0, %noreg, %noreg
+ ; CHECK: %ymm0 = VSHUFPSYrmi %ymm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm0 = VSHUFPSZ256rmi %ymm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %ymm0 = VSHUFPSYrri %ymm0, %noreg, %noreg
+ %ymm0 = VSHUFPSZ256rri %ymm0, %noreg, %noreg
RET 0, %zmm0, %zmm1
...
@@ -899,80 +899,80 @@ body: |
name: evex_z128_to_vex_test
body: |
bb.0:
- ; CHECK: VMOVAPDmr %rdi, 1, _, 0, _, %xmm0
- VMOVAPDZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVAPDrm %rip, 1, _, %rax, _
- %xmm0 = VMOVAPDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPDmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVAPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVAPDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVAPDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVAPDrr %xmm0
%xmm0 = VMOVAPDZ128rr %xmm0
- ; CHECK: VMOVAPSmr %rdi, 1, _, 0, _, %xmm0
- VMOVAPSZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVAPSrm %rip, 1, _, %rax, _
- %xmm0 = VMOVAPSZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPSmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVAPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVAPSrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVAPSZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVAPSrr %xmm0
%xmm0 = VMOVAPSZ128rr %xmm0
- ; CHECK: VMOVDQAmr %rdi, 1, _, 0, _, %xmm0
- VMOVDQA32Z128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVDQArm %rip, 1, _, %rax, _
- %xmm0 = VMOVDQA32Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQAmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVDQA32Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVDQArm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVDQA32Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVDQArr %xmm0
%xmm0 = VMOVDQA32Z128rr %xmm0
- ; CHECK: VMOVDQAmr %rdi, 1, _, 0, _, %xmm0
- VMOVDQA64Z128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVDQArm %rip, 1, _, %rax, _
- %xmm0 = VMOVDQA64Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQAmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVDQA64Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVDQArm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVDQA64Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVDQArr %xmm0
%xmm0 = VMOVDQA64Z128rr %xmm0
- ; CHECK: VMOVDQUmr %rdi, 1, _, 0, _, %xmm0
- VMOVDQU16Z128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, _, %rax, _
- %xmm0 = VMOVDQU16Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVDQU16Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVDQU16Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVDQUrr %xmm0
%xmm0 = VMOVDQU16Z128rr %xmm0
- ; CHECK: VMOVDQUmr %rdi, 1, _, 0, _, %xmm0
- VMOVDQU32Z128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, _, %rax, _
- %xmm0 = VMOVDQU32Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVDQU32Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVDQU32Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVDQUrr %xmm0
%xmm0 = VMOVDQU32Z128rr %xmm0
- ; CHECK: VMOVDQUmr %rdi, 1, _, 0, _, %xmm0
- VMOVDQU64Z128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, _, %rax, _
- %xmm0 = VMOVDQU64Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVDQU64Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVDQU64Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVDQUrr %xmm0
%xmm0 = VMOVDQU64Z128rr %xmm0
- ; CHECK: VMOVDQUmr %rdi, 1, _, 0, _, %xmm0
- VMOVDQU8Z128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, _, %rax, _
- %xmm0 = VMOVDQU8Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQUmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVDQU8Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVDQUrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVDQU8Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVDQUrr %xmm0
%xmm0 = VMOVDQU8Z128rr %xmm0
; CHECK: %xmm0 = VMOVDQUrr_REV %xmm0
%xmm0 = VMOVDQU8Z128rr_REV %xmm0
- ; CHECK: %xmm0 = VMOVNTDQArm %rip, 1, _, %rax, _
- %xmm0 = VMOVNTDQAZ128rm %rip, 1, _, %rax, _
- ; CHECK: VMOVUPDmr %rdi, 1, _, 0, _, %xmm0
- VMOVUPDZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVUPDrm %rip, 1, _, %rax, _
- %xmm0 = VMOVUPDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMOVNTDQArm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVNTDQAZ128rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: VMOVUPDmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVUPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVUPDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVUPDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVUPDrr %xmm0
%xmm0 = VMOVUPDZ128rr %xmm0
; CHECK: %xmm0 = VMOVUPDrr_REV %xmm0
%xmm0 = VMOVUPDZ128rr_REV %xmm0
- ; CHECK: VMOVUPSmr %rdi, 1, _, 0, _, %xmm0
- VMOVUPSZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVUPSrm %rip, 1, _, %rax, _
- %xmm0 = VMOVUPSZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVUPSmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVUPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVUPSrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMOVUPSZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMOVUPSrr %xmm0
%xmm0 = VMOVUPSZ128rr %xmm0
; CHECK: %xmm0 = VMOVUPSrr_REV %xmm0
%xmm0 = VMOVUPSZ128rr_REV %xmm0
- ; CHECK: VMOVNTDQmr %rdi, 1, _, 0, _, %xmm0
- VMOVNTDQZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: VMOVNTPDmr %rdi, 1, _, 0, _, %xmm0
- VMOVNTPDZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: VMOVNTPSmr %rdi, 1, _, 0, _, %xmm0
- VMOVNTPSZ128mr %rdi, 1, _, 0, _, %xmm0
+ ; CHECK: VMOVNTDQmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVNTDQZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: VMOVNTPDmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVNTPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: VMOVNTPSmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVNTPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
; CHECK: %xmm0 = VMOVAPDrr_REV %xmm0
%xmm0 = VMOVAPDZ128rr_REV %xmm0
; CHECK: %xmm0 = VMOVAPSrr_REV %xmm0
@@ -987,776 +987,776 @@ body: |
%xmm0 = VMOVDQU32Z128rr_REV %xmm0
; CHECK: %xmm0 = VMOVDQUrr_REV %xmm0
%xmm0 = VMOVDQU64Z128rr_REV %xmm0
- ; CHECK: %xmm0 = VPMOVSXBDrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVSXBDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVSXBDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVSXBDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVSXBDrr %xmm0
%xmm0 = VPMOVSXBDZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVSXBQrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVSXBQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVSXBQrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVSXBQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVSXBQrr %xmm0
%xmm0 = VPMOVSXBQZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVSXBWrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVSXBWZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVSXBWrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVSXBWZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVSXBWrr %xmm0
%xmm0 = VPMOVSXBWZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVSXDQrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVSXDQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVSXDQrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVSXDQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVSXDQrr %xmm0
%xmm0 = VPMOVSXDQZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVSXWDrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVSXWDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVSXWDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVSXWDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVSXWDrr %xmm0
%xmm0 = VPMOVSXWDZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVSXWQrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVSXWQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVSXWQrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVSXWQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVSXWQrr %xmm0
%xmm0 = VPMOVSXWQZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVZXBDrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVZXBDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVZXBDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVZXBDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVZXBDrr %xmm0
%xmm0 = VPMOVZXBDZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVZXBQrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVZXBQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVZXBQrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVZXBQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVZXBQrr %xmm0
%xmm0 = VPMOVZXBQZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVZXBWrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVZXBWZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVZXBWrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVZXBWZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVZXBWrr %xmm0
%xmm0 = VPMOVZXBWZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVZXDQrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVZXDQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVZXDQrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVZXDQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVZXDQrr %xmm0
%xmm0 = VPMOVZXDQZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVZXWDrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVZXWDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVZXWDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVZXWDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVZXWDrr %xmm0
%xmm0 = VPMOVZXWDZ128rr %xmm0
- ; CHECK: %xmm0 = VPMOVZXWQrm %rip, 1, _, %rax, _
- %xmm0 = VPMOVZXWQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMOVZXWQrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMOVZXWQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMOVZXWQrr %xmm0
%xmm0 = VPMOVZXWQZ128rr %xmm0
- ; CHECK: VMOVHPDmr %rdi, 1, _, 0, _, %xmm0
- VMOVHPDZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVHPDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VMOVHPDZ128rm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: VMOVHPSmr %rdi, 1, _, 0, _, %xmm0
- VMOVHPSZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVHPSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VMOVHPSZ128rm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: VMOVLPDmr %rdi, 1, _, 0, _, %xmm0
- VMOVLPDZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVLPDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VMOVLPDZ128rm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: VMOVLPSmr %rdi, 1, _, 0, _, %xmm0
- VMOVLPSZ128mr %rdi, 1, _, 0, _, %xmm0
- ; CHECK: %xmm0 = VMOVLPSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VMOVLPSZ128rm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VMAXCPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXCPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: VMOVHPDmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVHPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVHPDrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVHPDZ128rm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: VMOVHPSmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVHPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVHPSrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVHPSZ128rm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: VMOVLPDmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVLPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVLPDrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVLPDZ128rm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: VMOVLPSmr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ VMOVLPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm0
+ ; CHECK: %xmm0 = VMOVLPSrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVLPSZ128rm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VMAXCPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXCPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCPDrr %xmm0, %xmm1
%xmm0 = VMAXCPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXCPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXCPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCPSrr %xmm0, %xmm1
%xmm0 = VMAXCPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCPDrr %xmm0, %xmm1
%xmm0 = VMAXPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCPSrr %xmm0, %xmm1
%xmm0 = VMAXPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINCPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINCPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCPDrr %xmm0, %xmm1
%xmm0 = VMINCPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINCPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINCPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCPSrr %xmm0, %xmm1
%xmm0 = VMINCPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCPDrr %xmm0, %xmm1
%xmm0 = VMINPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCPSrr %xmm0, %xmm1
%xmm0 = VMINPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMULPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMULPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMULPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMULPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMULPDrr %xmm0, %xmm1
%xmm0 = VMULPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMULPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMULPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMULPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMULPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMULPSrr %xmm0, %xmm1
%xmm0 = VMULPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VORPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VORPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VORPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VORPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VORPDrr %xmm0, %xmm1
%xmm0 = VORPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VORPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VORPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VORPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VORPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VORPSrr %xmm0, %xmm1
%xmm0 = VORPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDBrr %xmm0, %xmm1
%xmm0 = VPADDBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDDrr %xmm0, %xmm1
%xmm0 = VPADDDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDQrr %xmm0, %xmm1
%xmm0 = VPADDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDSBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDSBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDSBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDSBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDSBrr %xmm0, %xmm1
%xmm0 = VPADDSBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDSWrr %xmm0, %xmm1
%xmm0 = VPADDSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDUSBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDUSBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDUSBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDUSBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDUSBrr %xmm0, %xmm1
%xmm0 = VPADDUSBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDUSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDUSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDUSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDUSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDUSWrr %xmm0, %xmm1
%xmm0 = VPADDUSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPADDWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPADDWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPADDWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPADDWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPADDWrr %xmm0, %xmm1
%xmm0 = VPADDWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPANDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPANDDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPANDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPANDDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPANDrr %xmm0, %xmm1
%xmm0 = VPANDDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPANDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPANDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPANDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPANDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPANDrr %xmm0, %xmm1
%xmm0 = VPANDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPANDNrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPANDNDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPANDNrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPANDNDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPANDNrr %xmm0, %xmm1
%xmm0 = VPANDNDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPANDNrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPANDNQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPANDNrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPANDNQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPANDNrr %xmm0, %xmm1
%xmm0 = VPANDNQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPAVGBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPAVGBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPAVGBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPAVGBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPAVGBrr %xmm0, %xmm1
%xmm0 = VPAVGBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPAVGWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPAVGWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPAVGWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPAVGWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPAVGWrr %xmm0, %xmm1
%xmm0 = VPAVGWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMAXSBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMAXSBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMAXSBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMAXSBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMAXSBrr %xmm0, %xmm1
%xmm0 = VPMAXSBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMAXSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMAXSDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMAXSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMAXSDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMAXSDrr %xmm0, %xmm1
%xmm0 = VPMAXSDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMAXSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMAXSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMAXSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMAXSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMAXSWrr %xmm0, %xmm1
%xmm0 = VPMAXSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMAXUBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMAXUBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMAXUBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMAXUBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMAXUBrr %xmm0, %xmm1
%xmm0 = VPMAXUBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMAXUDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMAXUDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMAXUDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMAXUDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMAXUDrr %xmm0, %xmm1
%xmm0 = VPMAXUDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMAXUWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMAXUWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMAXUWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMAXUWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMAXUWrr %xmm0, %xmm1
%xmm0 = VPMAXUWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMINSBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMINSBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMINSBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMINSBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMINSBrr %xmm0, %xmm1
%xmm0 = VPMINSBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMINSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMINSDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMINSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMINSDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMINSDrr %xmm0, %xmm1
%xmm0 = VPMINSDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMINSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMINSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMINSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMINSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMINSWrr %xmm0, %xmm1
%xmm0 = VPMINSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMINUBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMINUBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMINUBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMINUBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMINUBrr %xmm0, %xmm1
%xmm0 = VPMINUBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMINUDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMINUDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMINUDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMINUDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMINUDrr %xmm0, %xmm1
%xmm0 = VPMINUDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMINUWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMINUWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMINUWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMINUWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMINUWrr %xmm0, %xmm1
%xmm0 = VPMINUWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULDQrr %xmm0, %xmm1
%xmm0 = VPMULDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULHRSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULHRSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULHRSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULHRSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULHRSWrr %xmm0, %xmm1
%xmm0 = VPMULHRSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULHUWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULHUWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULHUWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULHUWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULHUWrr %xmm0, %xmm1
%xmm0 = VPMULHUWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULHWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULHWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULHWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULHWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULHWrr %xmm0, %xmm1
%xmm0 = VPMULHWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULLDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULLDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULLDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULLDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULLDrr %xmm0, %xmm1
%xmm0 = VPMULLDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULLWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULLWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULLWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULLWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULLWrr %xmm0, %xmm1
%xmm0 = VPMULLWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMULUDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMULUDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMULUDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMULUDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMULUDQrr %xmm0, %xmm1
%xmm0 = VPMULUDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPORrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPORDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPORrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPORDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPORrr %xmm0, %xmm1
%xmm0 = VPORDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPORrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPORQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPORrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPORQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPORrr %xmm0, %xmm1
%xmm0 = VPORQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBBrr %xmm0, %xmm1
%xmm0 = VPSUBBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBDrr %xmm0, %xmm1
%xmm0 = VPSUBDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBQrr %xmm0, %xmm1
%xmm0 = VPSUBQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBSBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBSBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBSBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBSBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBSBrr %xmm0, %xmm1
%xmm0 = VPSUBSBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBSWrr %xmm0, %xmm1
%xmm0 = VPSUBSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBUSBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBUSBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBUSBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBUSBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBUSBrr %xmm0, %xmm1
%xmm0 = VPSUBUSBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBUSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBUSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBUSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBUSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBUSWrr %xmm0, %xmm1
%xmm0 = VPSUBUSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSUBWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSUBWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSUBWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSUBWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSUBWrr %xmm0, %xmm1
%xmm0 = VPSUBWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VADDPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VADDPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VADDPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VADDPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VADDPDrr %xmm0, %xmm1
%xmm0 = VADDPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VADDPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VADDPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VADDPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VADDPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VADDPSrr %xmm0, %xmm1
%xmm0 = VADDPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VANDNPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VANDNPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VANDNPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VANDNPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VANDNPDrr %xmm0, %xmm1
%xmm0 = VANDNPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VANDNPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VANDNPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VANDNPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VANDNPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VANDNPSrr %xmm0, %xmm1
%xmm0 = VANDNPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VANDPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VANDPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VANDPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VANDPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VANDPDrr %xmm0, %xmm1
%xmm0 = VANDPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VANDPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VANDPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VANDPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VANDPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VANDPSrr %xmm0, %xmm1
%xmm0 = VANDPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VDIVPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VDIVPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VDIVPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VDIVPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VDIVPDrr %xmm0, %xmm1
%xmm0 = VDIVPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VDIVPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VDIVPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VDIVPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VDIVPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VDIVPSrr %xmm0, %xmm1
%xmm0 = VDIVPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPXORrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPXORDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPXORrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPXORDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPXORrr %xmm0, %xmm1
%xmm0 = VPXORDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPXORrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPXORQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPXORrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPXORQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPXORrr %xmm0, %xmm1
%xmm0 = VPXORQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VSUBPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VSUBPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VSUBPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VSUBPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VSUBPDrr %xmm0, %xmm1
%xmm0 = VSUBPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VSUBPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VSUBPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VSUBPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VSUBPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VSUBPSrr %xmm0, %xmm1
%xmm0 = VSUBPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VXORPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VXORPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VXORPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VXORPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VXORPDrr %xmm0, %xmm1
%xmm0 = VXORPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VXORPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VXORPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VXORPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VXORPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VXORPSrr %xmm0, %xmm1
%xmm0 = VXORPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMADDUBSWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMADDUBSWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMADDUBSWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMADDUBSWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMADDUBSWrr %xmm0, %xmm1
%xmm0 = VPMADDUBSWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPMADDWDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPMADDWDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPMADDWDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPMADDWDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPMADDWDrr %xmm0, %xmm1
%xmm0 = VPMADDWDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPACKSSDWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPACKSSDWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPACKSSDWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPACKSSDWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPACKSSDWrr %xmm0, %xmm1
%xmm0 = VPACKSSDWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPACKSSWBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPACKSSWBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPACKSSWBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPACKSSWBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPACKSSWBrr %xmm0, %xmm1
%xmm0 = VPACKSSWBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPACKUSDWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPACKUSDWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPACKUSDWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPACKUSDWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPACKUSDWrr %xmm0, %xmm1
%xmm0 = VPACKUSDWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPACKUSWBrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPACKUSWBZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPACKUSWBrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPACKUSWBZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPACKUSWBrr %xmm0, %xmm1
%xmm0 = VPACKUSWBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKHBWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKHBWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKHBWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKHBWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKHBWrr %xmm0, %xmm1
%xmm0 = VPUNPCKHBWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKHDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKHDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKHDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKHDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKHDQrr %xmm0, %xmm1
%xmm0 = VPUNPCKHDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKHQDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKHQDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKHQDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKHQDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKHQDQrr %xmm0, %xmm1
%xmm0 = VPUNPCKHQDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKHWDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKHWDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKHWDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKHWDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKHWDrr %xmm0, %xmm1
%xmm0 = VPUNPCKHWDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKLBWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKLBWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKLBWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKLBWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKLBWrr %xmm0, %xmm1
%xmm0 = VPUNPCKLBWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKLDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKLDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKLDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKLDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKLDQrr %xmm0, %xmm1
%xmm0 = VPUNPCKLDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKLQDQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKLQDQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKLQDQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKLQDQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKLQDQrr %xmm0, %xmm1
%xmm0 = VPUNPCKLQDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPUNPCKLWDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPUNPCKLWDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPUNPCKLWDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPUNPCKLWDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPUNPCKLWDrr %xmm0, %xmm1
%xmm0 = VPUNPCKLWDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VUNPCKHPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VUNPCKHPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VUNPCKHPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VUNPCKHPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VUNPCKHPDrr %xmm0, %xmm1
%xmm0 = VUNPCKHPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VUNPCKHPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VUNPCKHPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VUNPCKHPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VUNPCKHPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VUNPCKHPSrr %xmm0, %xmm1
%xmm0 = VUNPCKHPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VUNPCKLPDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VUNPCKLPDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VUNPCKLPDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VUNPCKLPDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VUNPCKLPDrr %xmm0, %xmm1
%xmm0 = VUNPCKLPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VUNPCKLPSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VUNPCKLPSZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VUNPCKLPSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VUNPCKLPSZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VUNPCKLPSrr %xmm0, %xmm1
%xmm0 = VUNPCKLPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VFMADD132PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD132PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD132PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD132PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD132PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD132PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD132PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD132PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD132PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD132PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD132PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD132PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD213PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD213PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD213PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD213PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD213PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD213PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD213PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD213PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD213PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD213PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD213PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD213PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD231PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD231PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD231PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD231PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD231PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD231PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD231PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD231PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD231PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD231PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD231PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD231PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADDSUB132PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADDSUB132PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADDSUB132PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADDSUB132PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADDSUB132PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADDSUB132PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADDSUB132PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADDSUB132PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADDSUB132PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADDSUB132PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADDSUB132PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADDSUB132PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADDSUB213PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADDSUB213PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADDSUB213PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADDSUB213PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADDSUB213PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADDSUB213PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADDSUB213PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADDSUB213PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADDSUB213PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADDSUB213PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADDSUB213PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADDSUB213PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADDSUB231PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADDSUB231PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADDSUB231PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADDSUB231PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADDSUB231PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADDSUB231PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADDSUB231PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADDSUB231PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADDSUB231PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADDSUB231PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADDSUB231PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADDSUB231PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB132PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB132PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB132PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB132PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB132PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB132PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB132PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB132PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB132PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB132PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB132PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB132PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB213PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB213PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB213PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB213PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB213PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB213PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB213PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB213PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB213PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB213PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB213PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB213PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB231PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB231PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB231PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB231PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB231PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB231PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB231PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB231PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB231PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB231PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB231PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB231PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUBADD132PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUBADD132PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUBADD132PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUBADD132PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUBADD132PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUBADD132PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUBADD132PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUBADD132PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUBADD132PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUBADD132PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUBADD132PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUBADD132PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUBADD213PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUBADD213PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUBADD213PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUBADD213PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUBADD213PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUBADD213PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUBADD213PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUBADD213PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUBADD213PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUBADD213PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUBADD213PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUBADD213PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUBADD231PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUBADD231PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUBADD231PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUBADD231PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUBADD231PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUBADD231PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUBADD231PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUBADD231PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUBADD231PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUBADD231PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUBADD231PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUBADD231PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD132PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD132PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD132PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD132PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD132PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD132PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD132PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD132PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD132PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD132PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD132PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD132PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD213PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD213PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD213PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD213PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD213PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD213PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD213PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD213PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD213PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD213PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD213PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD213PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD231PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD231PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD231PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD231PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD231PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD231PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD231PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD231PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD231PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD231PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD231PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD231PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB132PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB132PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB132PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB132PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB132PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB132PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB132PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB132PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB132PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB132PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB132PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB132PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB213PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB213PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB213PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB213PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB213PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB213PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB213PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB213PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB213PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB213PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB213PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB213PSZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB231PDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB231PDZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB231PDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB231PDZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB231PDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB231PDZ128r %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB231PSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB231PSZ128m %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB231PSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB231PSZ128m %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB231PSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB231PSZ128r %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VPSLLDri %xmm0, 7
%xmm0 = VPSLLDZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSLLDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSLLDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSLLDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSLLDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSLLDrr %xmm0, 14
%xmm0 = VPSLLDZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSLLQri %xmm0, 7
%xmm0 = VPSLLQZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSLLQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSLLQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSLLQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSLLQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSLLQrr %xmm0, 14
%xmm0 = VPSLLQZ128rr %xmm0, 14
- ; CHECK: %xmm0 = VPSLLVDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSLLVDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSLLVDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSLLVDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSLLVDrr %xmm0, 14
%xmm0 = VPSLLVDZ128rr %xmm0, 14
- ; CHECK: %xmm0 = VPSLLVQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSLLVQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSLLVQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSLLVQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSLLVQrr %xmm0, 14
%xmm0 = VPSLLVQZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSLLWri %xmm0, 7
%xmm0 = VPSLLWZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSLLWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSLLWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSLLWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSLLWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSLLWrr %xmm0, 14
%xmm0 = VPSLLWZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSRADri %xmm0, 7
%xmm0 = VPSRADZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSRADrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRADZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRADrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRADZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRADrr %xmm0, 14
%xmm0 = VPSRADZ128rr %xmm0, 14
- ; CHECK: %xmm0 = VPSRAVDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRAVDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRAVDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRAVDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRAVDrr %xmm0, 14
%xmm0 = VPSRAVDZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSRAWri %xmm0, 7
%xmm0 = VPSRAWZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSRAWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRAWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRAWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRAWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRAWrr %xmm0, 14
%xmm0 = VPSRAWZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSRLDQri %xmm0, 14
%xmm0 = VPSRLDQZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSRLDri %xmm0, 7
%xmm0 = VPSRLDZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSRLDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRLDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRLDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRLDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRLDrr %xmm0, 14
%xmm0 = VPSRLDZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSRLQri %xmm0, 7
%xmm0 = VPSRLQZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSRLQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRLQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRLQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRLQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRLQrr %xmm0, 14
%xmm0 = VPSRLQZ128rr %xmm0, 14
- ; CHECK: %xmm0 = VPSRLVDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRLVDZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRLVDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRLVDZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRLVDrr %xmm0, 14
%xmm0 = VPSRLVDZ128rr %xmm0, 14
- ; CHECK: %xmm0 = VPSRLVQrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRLVQZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRLVQrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRLVQZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRLVQrr %xmm0, 14
%xmm0 = VPSRLVQZ128rr %xmm0, 14
; CHECK: %xmm0 = VPSRLWri %xmm0, 7
%xmm0 = VPSRLWZ128ri %xmm0, 7
- ; CHECK: %xmm0 = VPSRLWrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VPSRLWZ128rm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPSRLWrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPSRLWZ128rm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPSRLWrr %xmm0, 14
%xmm0 = VPSRLWZ128rr %xmm0, 14
- ; CHECK: %xmm0 = VPERMILPDmi %rdi, 1, _, 0, _, _
- %xmm0 = VPERMILPDZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm0 = VPERMILPDmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm0 = VPERMILPDZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm0 = VPERMILPDri %xmm0, 9
%xmm0 = VPERMILPDZ128ri %xmm0, 9
- ; CHECK: %xmm0 = VPERMILPDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VPERMILPDZ128rm %xmm0, %rdi, 1, _, 0, _
+ ; CHECK: %xmm0 = VPERMILPDrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VPERMILPDZ128rm %xmm0, %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VPERMILPDrr %xmm0, %xmm1
%xmm0 = VPERMILPDZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPERMILPSmi %rdi, 1, _, 0, _, _
- %xmm0 = VPERMILPSZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm0 = VPERMILPSmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm0 = VPERMILPSZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm0 = VPERMILPSri %xmm0, 9
%xmm0 = VPERMILPSZ128ri %xmm0, 9
- ; CHECK: %xmm0 = VPERMILPSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VPERMILPSZ128rm %xmm0, %rdi, 1, _, 0, _
+ ; CHECK: %xmm0 = VPERMILPSrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VPERMILPSZ128rm %xmm0, %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VPERMILPSrr %xmm0, %xmm1
%xmm0 = VPERMILPSZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VCVTPH2PSrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTPH2PSZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPH2PSrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTPH2PSZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPH2PSrr %xmm0
%xmm0 = VCVTPH2PSZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTDQ2PDrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTDQ2PDZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTDQ2PDrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTDQ2PDZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTDQ2PDrr %xmm0
%xmm0 = VCVTDQ2PDZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTDQ2PSrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTDQ2PSZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTDQ2PSrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTDQ2PSZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTDQ2PSrr %xmm0
%xmm0 = VCVTDQ2PSZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTPD2DQrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTPD2DQZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPD2DQrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTPD2DQZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPD2DQrr %xmm0
%xmm0 = VCVTPD2DQZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTPD2PSrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTPD2PSZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPD2PSrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTPD2PSZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPD2PSrr %xmm0
%xmm0 = VCVTPD2PSZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTPS2DQrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTPS2DQZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPS2DQrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTPS2DQZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPS2DQrr %xmm0
%xmm0 = VCVTPS2DQZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTPS2PDrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTPS2PDZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPS2PDrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTPS2PDZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPS2PDrr %xmm0
%xmm0 = VCVTPS2PDZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTTPD2DQrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTTPD2DQZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTTPD2DQrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTTPD2DQZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTTPD2DQrr %xmm0
%xmm0 = VCVTTPD2DQZ128rr %xmm0
- ; CHECK: %xmm0 = VCVTTPS2DQrm %rdi, %xmm0, 1, _, 0
- %xmm0 = VCVTTPS2DQZ128rm %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VCVTTPS2DQrm %rdi, %xmm0, 1, %noreg, 0
+ %xmm0 = VCVTTPS2DQZ128rm %rdi, %xmm0, 1, %noreg, 0
; CHECK: %xmm0 = VCVTTPS2DQrr %xmm0
%xmm0 = VCVTTPS2DQZ128rr %xmm0
- ; CHECK: %xmm0 = VSQRTPDm %rdi, _, _, _, _
- %xmm0 = VSQRTPDZ128m %rdi, _, _, _, _
+ ; CHECK: %xmm0 = VSQRTPDm %rdi, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSQRTPDZ128m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VSQRTPDr %xmm0
%xmm0 = VSQRTPDZ128r %xmm0
- ; CHECK: %xmm0 = VSQRTPSm %rdi, _, _, _, _
- %xmm0 = VSQRTPSZ128m %rdi, _, _, _, _
+ ; CHECK: %xmm0 = VSQRTPSm %rdi, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSQRTPSZ128m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VSQRTPSr %xmm0
%xmm0 = VSQRTPSZ128r %xmm0
- ; CHECK: %xmm0 = VMOVDDUPrm %rdi, 1, _, 0, _
- %xmm0 = VMOVDDUPZ128rm %rdi, 1, _, 0, _
+ ; CHECK: %xmm0 = VMOVDDUPrm %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVDDUPZ128rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VMOVDDUPrr %xmm0
%xmm0 = VMOVDDUPZ128rr %xmm0
- ; CHECK: %xmm0 = VMOVSHDUPrm %rdi, 1, _, 0, _
- %xmm0 = VMOVSHDUPZ128rm %rdi, 1, _, 0, _
+ ; CHECK: %xmm0 = VMOVSHDUPrm %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVSHDUPZ128rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VMOVSHDUPrr %xmm0
%xmm0 = VMOVSHDUPZ128rr %xmm0
- ; CHECK: %xmm0 = VMOVSLDUPrm %rdi, 1, _, 0, _
- %xmm0 = VMOVSLDUPZ128rm %rdi, 1, _, 0, _
+ ; CHECK: %xmm0 = VMOVSLDUPrm %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VMOVSLDUPZ128rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VMOVSLDUPrr %xmm0
%xmm0 = VMOVSLDUPZ128rr %xmm0
- ; CHECK: %xmm0 = VPSHUFBrm %xmm0, _, _, _, _, _
- %xmm0 = VPSHUFBZ128rm %xmm0, _, _, _, _, _
+ ; CHECK: %xmm0 = VPSHUFBrm %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VPSHUFBZ128rm %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPSHUFBrr %xmm0, %xmm1
%xmm0 = VPSHUFBZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VPSHUFDmi %rdi, 1, _, 0, _, _
- %xmm0 = VPSHUFDZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm0 = VPSHUFDmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm0 = VPSHUFDZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm0 = VPSHUFDri %xmm0, -24
%xmm0 = VPSHUFDZ128ri %xmm0, -24
- ; CHECK: %xmm0 = VPSHUFHWmi %rdi, 1, _, 0, _, _
- %xmm0 = VPSHUFHWZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm0 = VPSHUFHWmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm0 = VPSHUFHWZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm0 = VPSHUFHWri %xmm0, -24
%xmm0 = VPSHUFHWZ128ri %xmm0, -24
- ; CHECK: %xmm0 = VPSHUFLWmi %rdi, 1, _, 0, _, _
- %xmm0 = VPSHUFLWZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm0 = VPSHUFLWmi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm0 = VPSHUFLWZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm0 = VPSHUFLWri %xmm0, -24
%xmm0 = VPSHUFLWZ128ri %xmm0, -24
; CHECK: %xmm0 = VPSLLDQri %xmm0, %xmm1
%xmm0 = VPSLLDQZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VSHUFPDrmi %xmm0, _, _, _, _, _, _
- %xmm0 = VSHUFPDZ128rmi %xmm0, _, _, _, _, _, _
- ; CHECK: %xmm0 = VSHUFPDrri %xmm0, _, _
- %xmm0 = VSHUFPDZ128rri %xmm0, _, _
- ; CHECK: %xmm0 = VSHUFPSrmi %xmm0, _, _, _, _, _, _
- %xmm0 = VSHUFPSZ128rmi %xmm0, _, _, _, _, _, _
- ; CHECK: %xmm0 = VSHUFPSrri %xmm0, _, _
- %xmm0 = VSHUFPSZ128rri %xmm0, _, _
- ; CHECK: %xmm0 = VPSADBWrm %xmm0, 1, _, %rax, _, _
- %xmm0 = VPSADBWZ128rm %xmm0, 1, _, %rax, _, _
+ ; CHECK: %xmm0 = VSHUFPDrmi %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSHUFPDZ128rmi %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VSHUFPDrri %xmm0, %noreg, %noreg
+ %xmm0 = VSHUFPDZ128rri %xmm0, %noreg, %noreg
+ ; CHECK: %xmm0 = VSHUFPSrmi %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSHUFPSZ128rmi %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VSHUFPSrri %xmm0, %noreg, %noreg
+ %xmm0 = VSHUFPSZ128rri %xmm0, %noreg, %noreg
+ ; CHECK: %xmm0 = VPSADBWrm %xmm0, 1, %noreg, %rax, %noreg, %noreg
+ %xmm0 = VPSADBWZ128rm %xmm0, 1, %noreg, %rax, %noreg, %noreg
; CHECK: %xmm0 = VPSADBWrr %xmm0, %xmm1
%xmm0 = VPSADBWZ128rr %xmm0, %xmm1
- ; CHECK: %xmm0 = VBROADCASTSSrm %rip, _, _, _, _
- %xmm0 = VBROADCASTSSZ128m %rip, _, _, _, _
+ ; CHECK: %xmm0 = VBROADCASTSSrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VBROADCASTSSZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VBROADCASTSSrr %xmm0
%xmm0 = VBROADCASTSSZ128r %xmm0
- ; CHECK: %xmm0 = VPBROADCASTBrm %rip, _, _, _, _
- %xmm0 = VPBROADCASTBZ128m %rip, _, _, _, _
+ ; CHECK: %xmm0 = VPBROADCASTBrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VPBROADCASTBZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPBROADCASTBrr %xmm0
%xmm0 = VPBROADCASTBZ128r %xmm0
- ; CHECK: %xmm0 = VPBROADCASTDrm %rip, _, _, _, _
- %xmm0 = VPBROADCASTDZ128m %rip, _, _, _, _
+ ; CHECK: %xmm0 = VPBROADCASTDrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VPBROADCASTDZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPBROADCASTDrr %xmm0
%xmm0 = VPBROADCASTDZ128r %xmm0
- ; CHECK: %xmm0 = VPBROADCASTQrm %rip, _, _, _, _
- %xmm0 = VPBROADCASTQZ128m %rip, _, _, _, _
+ ; CHECK: %xmm0 = VPBROADCASTQrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VPBROADCASTQZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPBROADCASTQrr %xmm0
%xmm0 = VPBROADCASTQZ128r %xmm0
- ; CHECK: %xmm0 = VPBROADCASTWrm %rip, _, _, _, _
- %xmm0 = VPBROADCASTWZ128m %rip, _, _, _, _
+ ; CHECK: %xmm0 = VPBROADCASTWrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VPBROADCASTWZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPBROADCASTWrr %xmm0
%xmm0 = VPBROADCASTWZ128r %xmm0
- ; CHECK: %xmm0 = VPBROADCASTQrm %rip, _, _, _, _
- %xmm0 = VBROADCASTI32X2Z128m %rip, _, _, _, _
+ ; CHECK: %xmm0 = VPBROADCASTQrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VBROADCASTI32X2Z128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPBROADCASTQrr %xmm0
%xmm0 = VBROADCASTI32X2Z128r %xmm0
; CHECK: %xmm0 = VCVTPS2PHrr %xmm0, 2
%xmm0 = VCVTPS2PHZ128rr %xmm0, 2
- ; CHECK: VCVTPS2PHmr %rdi, %xmm0, 1, _, 0, _, _
- VCVTPS2PHZ128mr %rdi, %xmm0, 1, _, 0, _, _
- ; CHECK: %xmm0 = VPABSBrm %rip, 1, _, %rax, _
- %xmm0 = VPABSBZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VCVTPS2PHmr %rdi, %xmm0, 1, %noreg, 0, %noreg, %noreg
+ VCVTPS2PHZ128mr %rdi, %xmm0, 1, %noreg, 0, %noreg, %noreg
+ ; CHECK: %xmm0 = VPABSBrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPABSBZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPABSBrr %xmm0
%xmm0 = VPABSBZ128rr %xmm0
- ; CHECK: %xmm0 = VPABSDrm %rip, 1, _, %rax, _
- %xmm0 = VPABSDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPABSDrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPABSDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPABSDrr %xmm0
%xmm0 = VPABSDZ128rr %xmm0
- ; CHECK: %xmm0 = VPABSWrm %rip, 1, _, %rax, _
- %xmm0 = VPABSWZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VPABSWrm %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VPABSWZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VPABSWrr %xmm0
%xmm0 = VPABSWZ128rr %xmm0
- ; CHECK: %xmm0 = VPALIGNRrmi %xmm0, _, _, _, _, _, _
- %xmm0 = VPALIGNRZ128rmi %xmm0, _, _, _, _, _, _
+ ; CHECK: %xmm0 = VPALIGNRrmi %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VPALIGNRZ128rmi %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VPALIGNRrri %xmm0, %xmm1, 15
%xmm0 = VPALIGNRZ128rri %xmm0, %xmm1, 15
@@ -1770,552 +1770,552 @@ name: evex_scalar_to_vex_test
body: |
bb.0:
- ; CHECK: %xmm0 = VADDSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VADDSDZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VADDSDrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VADDSDZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VADDSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VADDSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VADDSDrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VADDSDZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VADDSDrr %xmm0, %xmm1
%xmm0 = VADDSDZrr %xmm0, %xmm1
; CHECK: %xmm0 = VADDSDrr_Int %xmm0, %xmm1
%xmm0 = VADDSDZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VADDSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VADDSSZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VADDSSrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VADDSSZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VADDSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VADDSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VADDSSrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VADDSSZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VADDSSrr %xmm0, %xmm1
%xmm0 = VADDSSZrr %xmm0, %xmm1
; CHECK: %xmm0 = VADDSSrr_Int %xmm0, %xmm1
%xmm0 = VADDSSZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VDIVSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VDIVSDZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VDIVSDrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VDIVSDZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VDIVSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VDIVSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VDIVSDrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VDIVSDZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VDIVSDrr %xmm0, %xmm1
%xmm0 = VDIVSDZrr %xmm0, %xmm1
; CHECK: %xmm0 = VDIVSDrr_Int %xmm0, %xmm1
%xmm0 = VDIVSDZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VDIVSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VDIVSSZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VDIVSSrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VDIVSSZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VDIVSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VDIVSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VDIVSSrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VDIVSSZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VDIVSSrr %xmm0, %xmm1
%xmm0 = VDIVSSZrr %xmm0, %xmm1
; CHECK: %xmm0 = VDIVSSrr_Int %xmm0, %xmm1
%xmm0 = VDIVSSZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXCSDZrm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXCSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCSDrr %xmm0, %xmm1
%xmm0 = VMAXCSDZrr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXCSSZrm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXCSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCSSrr %xmm0, %xmm1
%xmm0 = VMAXCSSZrr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXSDZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VMAXSDrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXSDZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VMAXSDrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXSDZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCSDrr %xmm0, %xmm1
%xmm0 = VMAXSDZrr %xmm0, %xmm1
; CHECK: %xmm0 = VMAXSDrr_Int %xmm0, %xmm1
%xmm0 = VMAXSDZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VMAXCSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXSSZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VMAXSSrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMAXSSZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMAXCSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VMAXSSrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMAXSSZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMAXCSSrr %xmm0, %xmm1
%xmm0 = VMAXSSZrr %xmm0, %xmm1
; CHECK: %xmm0 = VMAXSSrr_Int %xmm0, %xmm1
%xmm0 = VMAXSSZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINCSDZrm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINCSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCSDrr %xmm0, %xmm1
%xmm0 = VMINCSDZrr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINCSSZrm %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINCSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCSSrr %xmm0, %xmm1
%xmm0 = VMINCSSZrr %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINSDZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VMINSDrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINSDZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VMINSDrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINSDZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCSDrr %xmm0, %xmm1
%xmm0 = VMINSDZrr %xmm0, %xmm1
; CHECK: %xmm0 = VMINSDrr_Int %xmm0, %xmm1
%xmm0 = VMINSDZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VMINCSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINSSZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VMINSSrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMINSSZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMINCSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VMINSSrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMINSSZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMINCSSrr %xmm0, %xmm1
%xmm0 = VMINSSZrr %xmm0, %xmm1
; CHECK: %xmm0 = VMINSSrr_Int %xmm0, %xmm1
%xmm0 = VMINSSZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VMULSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMULSDZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VMULSDrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMULSDZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMULSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMULSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VMULSDrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMULSDZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMULSDrr %xmm0, %xmm1
%xmm0 = VMULSDZrr %xmm0, %xmm1
; CHECK: %xmm0 = VMULSDrr_Int %xmm0, %xmm1
%xmm0 = VMULSDZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VMULSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMULSSZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VMULSSrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VMULSSZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VMULSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMULSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VMULSSrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VMULSSZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VMULSSrr %xmm0, %xmm1
%xmm0 = VMULSSZrr %xmm0, %xmm1
; CHECK: %xmm0 = VMULSSrr_Int %xmm0, %xmm1
%xmm0 = VMULSSZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VSUBSDrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VSUBSDZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VSUBSDrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VSUBSDZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VSUBSDrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VSUBSDZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VSUBSDrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VSUBSDZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VSUBSDrr %xmm0, %xmm1
%xmm0 = VSUBSDZrr %xmm0, %xmm1
; CHECK: %xmm0 = VSUBSDrr_Int %xmm0, %xmm1
%xmm0 = VSUBSDZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VSUBSSrm %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VSUBSSZrm %xmm0, %rip, 1, _, %rax, _
- ; CHECK: %xmm0 = VSUBSSrm_Int %xmm0, %rip, 1, _, %rax, _
- %xmm0 = VSUBSSZrm_Int %xmm0, %rip, 1, _, %rax, _
+ ; CHECK: %xmm0 = VSUBSSrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VSUBSSZrm %xmm0, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm0 = VSUBSSrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
+ %xmm0 = VSUBSSZrm_Int %xmm0, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm0 = VSUBSSrr %xmm0, %xmm1
%xmm0 = VSUBSSZrr %xmm0, %xmm1
; CHECK: %xmm0 = VSUBSSrr_Int %xmm0, %xmm1
%xmm0 = VSUBSSZrr_Int %xmm0, %xmm1
- ; CHECK: %xmm0 = VFMADD132SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD132SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMADD132SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD132SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD132SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD132SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMADD132SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD132SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD132SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD132SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMADD132SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD132SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD132SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD132SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMADD132SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD132SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD132SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD132SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMADD132SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD132SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD132SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD132SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMADD132SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD132SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD213SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD213SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMADD213SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD213SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD213SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD213SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMADD213SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD213SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD213SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD213SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMADD213SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD213SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD213SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD213SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMADD213SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD213SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD213SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD213SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMADD213SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD213SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD213SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD213SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMADD213SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD213SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD231SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD231SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMADD231SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD231SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD231SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD231SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMADD231SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD231SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD231SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD231SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMADD231SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD231SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMADD231SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD231SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMADD231SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMADD231SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMADD231SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD231SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMADD231SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMADD231SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMADD231SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD231SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMADD231SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMADD231SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB132SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB132SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMSUB132SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB132SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB132SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB132SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMSUB132SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB132SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB132SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB132SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMSUB132SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB132SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB132SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB132SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMSUB132SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB132SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB132SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB132SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMSUB132SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB132SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB132SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB132SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMSUB132SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB132SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB213SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB213SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMSUB213SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB213SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB213SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB213SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMSUB213SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB213SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB213SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB213SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMSUB213SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB213SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB213SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB213SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMSUB213SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB213SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB213SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB213SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMSUB213SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB213SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB213SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB213SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMSUB213SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB213SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB231SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB231SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMSUB231SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB231SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB231SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB231SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMSUB231SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB231SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB231SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB231SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMSUB231SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB231SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFMSUB231SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB231SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFMSUB231SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFMSUB231SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFMSUB231SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB231SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFMSUB231SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFMSUB231SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFMSUB231SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB231SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFMSUB231SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFMSUB231SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD132SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD132SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMADD132SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD132SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD132SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD132SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMADD132SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD132SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD132SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD132SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMADD132SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD132SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD132SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD132SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMADD132SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD132SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD132SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD132SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMADD132SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD132SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD132SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD132SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMADD132SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD132SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD213SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD213SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMADD213SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD213SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD213SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD213SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMADD213SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD213SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD213SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD213SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMADD213SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD213SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD213SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD213SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMADD213SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD213SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD213SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD213SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMADD213SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD213SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD213SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD213SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMADD213SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD213SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD231SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD231SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMADD231SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD231SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD231SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD231SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMADD231SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD231SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD231SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD231SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMADD231SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD231SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMADD231SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD231SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMADD231SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMADD231SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMADD231SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD231SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMADD231SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMADD231SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMADD231SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD231SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMADD231SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMADD231SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB132SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB132SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMSUB132SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB132SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB132SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB132SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMSUB132SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB132SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB132SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB132SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMSUB132SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB132SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB132SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB132SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMSUB132SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB132SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB132SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB132SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMSUB132SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB132SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB132SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB132SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMSUB132SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB132SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB213SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB213SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMSUB213SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB213SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB213SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB213SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMSUB213SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB213SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB213SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB213SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMSUB213SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB213SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB213SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB213SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMSUB213SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB213SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB213SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB213SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMSUB213SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB213SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB213SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB213SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMSUB213SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB213SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB231SDm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB231SDZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMSUB231SDm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB231SDZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB231SDm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB231SDZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMSUB231SDm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB231SDZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB231SDr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB231SDZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMSUB231SDr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB231SDZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: %xmm0 = VFNMSUB231SSm %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB231SSZm %xmm0, %xmm0, %rsi, 1, _, 0, _
- ; CHECK: %xmm0 = VFNMSUB231SSm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
- %xmm0 = VFNMSUB231SSZm_Int %xmm0, %xmm0, %rsi, 1, _, 0, _
+ ; CHECK: %xmm0 = VFNMSUB231SSm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB231SSZm %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VFNMSUB231SSm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
+ %xmm0 = VFNMSUB231SSZm_Int %xmm0, %xmm0, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm0 = VFNMSUB231SSr %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB231SSZr %xmm0, %xmm1, %xmm2
; CHECK: %xmm0 = VFNMSUB231SSr_Int %xmm0, %xmm1, %xmm2
%xmm0 = VFNMSUB231SSZr_Int %xmm0, %xmm1, %xmm2
- ; CHECK: VPEXTRBmr %rdi, 1, _, 0, _, %xmm0, 3
- VPEXTRBZmr %rdi, 1, _, 0, _, %xmm0, 3
+ ; CHECK: VPEXTRBmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
+ VPEXTRBZmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
; CHECK: %eax = VPEXTRBrr %xmm0, 1
%eax = VPEXTRBZrr %xmm0, 1
- ; CHECK: VPEXTRDmr %rdi, 1, _, 0, _, %xmm0, 3
- VPEXTRDZmr %rdi, 1, _, 0, _, %xmm0, 3
+ ; CHECK: VPEXTRDmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
+ VPEXTRDZmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
; CHECK: %eax = VPEXTRDrr %xmm0, 1
%eax = VPEXTRDZrr %xmm0, 1
- ; CHECK: VPEXTRQmr %rdi, 1, _, 0, _, %xmm0, 3
- VPEXTRQZmr %rdi, 1, _, 0, _, %xmm0, 3
+ ; CHECK: VPEXTRQmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
+ VPEXTRQZmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
; CHECK: %rax = VPEXTRQrr %xmm0, 1
%rax = VPEXTRQZrr %xmm0, 1
- ; CHECK: VPEXTRWmr %rdi, 1, _, 0, _, %xmm0, 3
- VPEXTRWZmr %rdi, 1, _, 0, _, %xmm0, 3
+ ; CHECK: VPEXTRWmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
+ VPEXTRWZmr %rdi, 1, %noreg, 0, %noreg, %xmm0, 3
; CHECK: %eax = VPEXTRWri %xmm0, 1
%eax = VPEXTRWZrr %xmm0, 1
; CHECK: %eax = VPEXTRWrr_REV %xmm0, 1
%eax = VPEXTRWZrr_REV %xmm0, 1
- ; CHECK: %xmm0 = VPINSRBrm %xmm0, %rsi, 1, _, 0, _, 3
- %xmm0 = VPINSRBZrm %xmm0, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm0 = VPINSRBrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm0 = VPINSRBZrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm0 = VPINSRBrr %xmm0, %edi, 5
%xmm0 = VPINSRBZrr %xmm0, %edi, 5
- ; CHECK: %xmm0 = VPINSRDrm %xmm0, %rsi, 1, _, 0, _, 3
- %xmm0 = VPINSRDZrm %xmm0, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm0 = VPINSRDrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm0 = VPINSRDZrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm0 = VPINSRDrr %xmm0, %edi, 5
%xmm0 = VPINSRDZrr %xmm0, %edi, 5
- ; CHECK: %xmm0 = VPINSRQrm %xmm0, %rsi, 1, _, 0, _, 3
- %xmm0 = VPINSRQZrm %xmm0, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm0 = VPINSRQrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm0 = VPINSRQZrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm0 = VPINSRQrr %xmm0, %rdi, 5
%xmm0 = VPINSRQZrr %xmm0, %rdi, 5
- ; CHECK: %xmm0 = VPINSRWrmi %xmm0, %rsi, 1, _, 0, _, 3
- %xmm0 = VPINSRWZrm %xmm0, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm0 = VPINSRWrmi %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm0 = VPINSRWZrm %xmm0, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm0 = VPINSRWrri %xmm0, %edi, 5
%xmm0 = VPINSRWZrr %xmm0, %edi, 5
- ; CHECK: %xmm0 = VSQRTSDm %xmm0, _, _, _, _, _
- %xmm0 = VSQRTSDZm %xmm0, _, _, _, _, _
- ; CHECK: %xmm0 = VSQRTSDm_Int %xmm0, _, _, _, _, _
- %xmm0 = VSQRTSDZm_Int %xmm0, _, _, _, _, _
- ; CHECK: %xmm0 = VSQRTSDr %xmm0, _
- %xmm0 = VSQRTSDZr %xmm0, _
- ; CHECK: %xmm0 = VSQRTSDr_Int %xmm0, _
- %xmm0 = VSQRTSDZr_Int %xmm0, _
- ; CHECK: %xmm0 = VSQRTSSm %xmm0, _, _, _, _, _
- %xmm0 = VSQRTSSZm %xmm0, _, _, _, _, _
- ; CHECK: %xmm0 = VSQRTSSm_Int %xmm0, _, _, _, _, _
- %xmm0 = VSQRTSSZm_Int %xmm0, _, _, _, _, _
- ; CHECK: %xmm0 = VSQRTSSr %xmm0, _
- %xmm0 = VSQRTSSZr %xmm0, _
- ; CHECK: %xmm0 = VSQRTSSr_Int %xmm0, _
- %xmm0 = VSQRTSSZr_Int %xmm0, _
- ; CHECK: %rdi = VCVTSD2SI64rr %xmm0
- %rdi = VCVTSD2SI64Zrr %xmm0
- ; CHECK: %edi = VCVTSD2SIrr %xmm0
- %edi = VCVTSD2SIZrr %xmm0
- ; CHECK: %xmm0 = VCVTSD2SSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSD2SSZrm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = Int_VCVTSD2SSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSD2SSZrm_Int %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VCVTSD2SSrr %xmm0, _
- %xmm0 = VCVTSD2SSZrr %xmm0, _
- ; CHECK: %xmm0 = Int_VCVTSD2SSrr %xmm0, _
- %xmm0 = VCVTSD2SSZrr_Int %xmm0, _
- ; CHECK: %xmm0 = VCVTSI2SDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI2SDZrm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI2SDZrm_Int %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VCVTSI2SDrr %xmm0, _
- %xmm0 = VCVTSI2SDZrr %xmm0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SDrr %xmm0, _
- %xmm0 = VCVTSI2SDZrr_Int %xmm0, _
- ; CHECK: %xmm0 = VCVTSI2SSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI2SSZrm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SSrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI2SSZrm_Int %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VCVTSI2SSrr %xmm0, _
- %xmm0 = VCVTSI2SSZrr %xmm0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SSrr %xmm0, _
- %xmm0 = VCVTSI2SSZrr_Int %xmm0, _
- ; CHECK: %xmm0 = VCVTSI2SD64rm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI642SDZrm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SD64rm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI642SDZrm_Int %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VCVTSI2SD64rr %xmm0, _
- %xmm0 = VCVTSI642SDZrr %xmm0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SD64rr %xmm0, _
- %xmm0 = VCVTSI642SDZrr_Int %xmm0, _
- ; CHECK: %xmm0 = VCVTSI2SS64rm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI642SSZrm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SS64rm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSI642SSZrm_Int %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VCVTSI2SS64rr %xmm0, _
- %xmm0 = VCVTSI642SSZrr %xmm0, _
- ; CHECK: %xmm0 = Int_VCVTSI2SS64rr %xmm0, _
- %xmm0 = VCVTSI642SSZrr_Int %xmm0, _
- ; CHECK: %xmm0 = VCVTSS2SDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSS2SDZrm %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = Int_VCVTSS2SDrm %xmm0, %rdi, 1, _, 0, _
- %xmm0 = VCVTSS2SDZrm_Int %xmm0, %rdi, 1, _, 0, _
- ; CHECK: %xmm0 = VCVTSS2SDrr %xmm0, _
- %xmm0 = VCVTSS2SDZrr %xmm0, _
- ; CHECK: %xmm0 = Int_VCVTSS2SDrr %xmm0, _
- %xmm0 = VCVTSS2SDZrr_Int %xmm0, _
- ; CHECK: %rdi = VCVTSS2SI64rm %rdi, %xmm0, 1, _, 0
- %rdi = VCVTSS2SI64Zrm %rdi, %xmm0, 1, _, 0
- ; CHECK: %rdi = VCVTSS2SI64rr %xmm0
- %rdi = VCVTSS2SI64Zrr %xmm0
- ; CHECK: %edi = VCVTSS2SIrm %rdi, %xmm0, 1, _, 0
- %edi = VCVTSS2SIZrm %rdi, %xmm0, 1, _, 0
- ; CHECK: %edi = VCVTSS2SIrr %xmm0
- %edi = VCVTSS2SIZrr %xmm0
- ; CHECK: %rdi = VCVTTSD2SI64rm %rdi, %xmm0, 1, _, 0
- %rdi = VCVTTSD2SI64Zrm %rdi, %xmm0, 1, _, 0
- ; CHECK: %rdi = Int_VCVTTSD2SI64rm %rdi, %xmm0, 1, _, 0
- %rdi = VCVTTSD2SI64Zrm_Int %rdi, %xmm0, 1, _, 0
+ ; CHECK: %xmm0 = VSQRTSDm %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSQRTSDZm %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VSQRTSDm_Int %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSQRTSDZm_Int %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VSQRTSDr %xmm0, %noreg
+ %xmm0 = VSQRTSDZr %xmm0, %noreg
+ ; CHECK: %xmm0 = VSQRTSDr_Int %xmm0, %noreg
+ %xmm0 = VSQRTSDZr_Int %xmm0, %noreg
+ ; CHECK: %xmm0 = VSQRTSSm %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSQRTSSZm %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VSQRTSSm_Int %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VSQRTSSZm_Int %xmm0, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VSQRTSSr %xmm0, %noreg
+ %xmm0 = VSQRTSSZr %xmm0, %noreg
+ ; CHECK: %xmm0 = VSQRTSSr_Int %xmm0, %noreg
+ %xmm0 = VSQRTSSZr_Int %xmm0, %noreg
+ ; CHECK: %rdi = VCVTSD2SI64rr_Int %xmm0
+ %rdi = VCVTSD2SI64Zrr_Int %xmm0
+ ; CHECK: %edi = VCVTSD2SIrr_Int %xmm0
+ %edi = VCVTSD2SIZrr_Int %xmm0
+ ; CHECK: %xmm0 = VCVTSD2SSrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSD2SSZrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSD2SSrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSD2SSZrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSD2SSrr %xmm0, %noreg
+ %xmm0 = VCVTSD2SSZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSD2SSrr_Int %xmm0, %noreg
+ %xmm0 = VCVTSD2SSZrr_Int %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SDrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI2SDZrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SDrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI2SDZrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SDrr %xmm0, %noreg
+ %xmm0 = VCVTSI2SDZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SDrr_Int %xmm0, %noreg
+ %xmm0 = VCVTSI2SDZrr_Int %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SSrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI2SSZrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SSrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI2SSZrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SSrr %xmm0, %noreg
+ %xmm0 = VCVTSI2SSZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI2SSrr_Int %xmm0, %noreg
+ %xmm0 = VCVTSI2SSZrr_Int %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SDrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI642SDZrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SDrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI642SDZrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SDrr %xmm0, %noreg
+ %xmm0 = VCVTSI642SDZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SDrr_Int %xmm0, %noreg
+ %xmm0 = VCVTSI642SDZrr_Int %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SSrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI642SSZrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SSrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSI642SSZrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SSrr %xmm0, %noreg
+ %xmm0 = VCVTSI642SSZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSI642SSrr_Int %xmm0, %noreg
+ %xmm0 = VCVTSI642SSZrr_Int %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSS2SDrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSS2SDZrm %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSS2SDrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ %xmm0 = VCVTSS2SDZrm_Int %xmm0, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm0 = VCVTSS2SDrr %xmm0, %noreg
+ %xmm0 = VCVTSS2SDZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VCVTSS2SDrr_Int %xmm0, %noreg
+ %xmm0 = VCVTSS2SDZrr_Int %xmm0, %noreg
+ ; CHECK: %rdi = VCVTSS2SI64rm_Int %rdi, %xmm0, 1, %noreg, 0
+ %rdi = VCVTSS2SI64Zrm_Int %rdi, %xmm0, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTSS2SI64rr_Int %xmm0
+ %rdi = VCVTSS2SI64Zrr_Int %xmm0
+ ; CHECK: %edi = VCVTSS2SIrm_Int %rdi, %xmm0, 1, %noreg, 0
+ %edi = VCVTSS2SIZrm_Int %rdi, %xmm0, 1, %noreg, 0
+ ; CHECK: %edi = VCVTSS2SIrr_Int %xmm0
+ %edi = VCVTSS2SIZrr_Int %xmm0
+ ; CHECK: %rdi = VCVTTSD2SI64rm %rdi, %xmm0, 1, %noreg, 0
+ %rdi = VCVTTSD2SI64Zrm %rdi, %xmm0, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTTSD2SI64rm_Int %rdi, %xmm0, 1, %noreg, 0
+ %rdi = VCVTTSD2SI64Zrm_Int %rdi, %xmm0, 1, %noreg, 0
; CHECK: %rdi = VCVTTSD2SI64rr %xmm0
%rdi = VCVTTSD2SI64Zrr %xmm0
- ; CHECK: %rdi = Int_VCVTTSD2SI64rr %xmm0
+ ; CHECK: %rdi = VCVTTSD2SI64rr_Int %xmm0
%rdi = VCVTTSD2SI64Zrr_Int %xmm0
- ; CHECK: %edi = VCVTTSD2SIrm %rdi, %xmm0, 1, _, 0
- %edi = VCVTTSD2SIZrm %rdi, %xmm0, 1, _, 0
- ; CHECK: %edi = Int_VCVTTSD2SIrm %rdi, %xmm0, 1, _, 0
- %edi = VCVTTSD2SIZrm_Int %rdi, %xmm0, 1, _, 0
+ ; CHECK: %edi = VCVTTSD2SIrm %rdi, %xmm0, 1, %noreg, 0
+ %edi = VCVTTSD2SIZrm %rdi, %xmm0, 1, %noreg, 0
+ ; CHECK: %edi = VCVTTSD2SIrm_Int %rdi, %xmm0, 1, %noreg, 0
+ %edi = VCVTTSD2SIZrm_Int %rdi, %xmm0, 1, %noreg, 0
; CHECK: %edi = VCVTTSD2SIrr %xmm0
%edi = VCVTTSD2SIZrr %xmm0
- ; CHECK: %edi = Int_VCVTTSD2SIrr %xmm0
+ ; CHECK: %edi = VCVTTSD2SIrr_Int %xmm0
%edi = VCVTTSD2SIZrr_Int %xmm0
- ; CHECK: %rdi = VCVTTSS2SI64rm %rdi, %xmm0, 1, _, 0
- %rdi = VCVTTSS2SI64Zrm %rdi, %xmm0, 1, _, 0
- ; CHECK: %rdi = Int_VCVTTSS2SI64rm %rdi, %xmm0, 1, _, 0
- %rdi = VCVTTSS2SI64Zrm_Int %rdi, %xmm0, 1, _, 0
+ ; CHECK: %rdi = VCVTTSS2SI64rm %rdi, %xmm0, 1, %noreg, 0
+ %rdi = VCVTTSS2SI64Zrm %rdi, %xmm0, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTTSS2SI64rm_Int %rdi, %xmm0, 1, %noreg, 0
+ %rdi = VCVTTSS2SI64Zrm_Int %rdi, %xmm0, 1, %noreg, 0
; CHECK: %rdi = VCVTTSS2SI64rr %xmm0
%rdi = VCVTTSS2SI64Zrr %xmm0
- ; CHECK: %rdi = Int_VCVTTSS2SI64rr %xmm0
+ ; CHECK: %rdi = VCVTTSS2SI64rr_Int %xmm0
%rdi = VCVTTSS2SI64Zrr_Int %xmm0
- ; CHECK: %edi = VCVTTSS2SIrm %rdi, %xmm0, 1, _, 0
- %edi = VCVTTSS2SIZrm %rdi, %xmm0, 1, _, 0
- ; CHECK: %edi = Int_VCVTTSS2SIrm %rdi, %xmm0, 1, _, 0
- %edi = VCVTTSS2SIZrm_Int %rdi, %xmm0, 1, _, 0
+ ; CHECK: %edi = VCVTTSS2SIrm %rdi, %xmm0, 1, %noreg, 0
+ %edi = VCVTTSS2SIZrm %rdi, %xmm0, 1, %noreg, 0
+ ; CHECK: %edi = VCVTTSS2SIrm_Int %rdi, %xmm0, 1, %noreg, 0
+ %edi = VCVTTSS2SIZrm_Int %rdi, %xmm0, 1, %noreg, 0
; CHECK: %edi = VCVTTSS2SIrr %xmm0
%edi = VCVTTSS2SIZrr %xmm0
- ; CHECK: %edi = Int_VCVTTSS2SIrr %xmm0
+ ; CHECK: %edi = VCVTTSS2SIrr_Int %xmm0
%edi = VCVTTSS2SIZrr_Int %xmm0
; CHECK: %xmm0 = VMOV64toSDrr %rdi
%xmm0 = VMOV64toSDZrr %rdi
- ; CHECK: %xmm0 = VMOVDI2SSrm %rip, _, _, _, _
- %xmm0 = VMOVDI2SSZrm %rip, _, _, _, _
+ ; CHECK: %xmm0 = VMOVDI2SSrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VMOVDI2SSZrm %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VMOVDI2SSrr %eax
%xmm0 = VMOVDI2SSZrr %eax
- ; CHECK: VMOVSDmr %rdi, %xmm0, _, _, _, _
- VMOVSDZmr %rdi, %xmm0, _, _, _, _
- ; CHECK: %xmm0 = VMOVSDrm %rip, _, _, _, _
- %xmm0 = VMOVSDZrm %rip, _, _, _, _
- ; CHECK: %xmm0 = VMOVSDrr %xmm0, _
- %xmm0 = VMOVSDZrr %xmm0, _
- ; CHECK: %xmm0 = VMOVSDrr_REV %xmm0, _
- %xmm0 = VMOVSDZrr_REV %xmm0, _
+ ; CHECK: VMOVSDmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVSDZmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VMOVSDrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VMOVSDZrm %rip, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VMOVSDrr %xmm0, %noreg
+ %xmm0 = VMOVSDZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VMOVSDrr_REV %xmm0, %noreg
+ %xmm0 = VMOVSDZrr_REV %xmm0, %noreg
; CHECK: %rax = VMOVSDto64rr %xmm0
%rax = VMOVSDto64Zrr %xmm0
- ; CHECK: VMOVSDto64mr %rdi, %xmm0, _, _, _, _
- VMOVSDto64Zmr %rdi, %xmm0, _, _, _, _
- ; CHECK: VMOVSSmr %rdi, %xmm0, _, _, _, _
- VMOVSSZmr %rdi, %xmm0, _, _, _, _
- ; CHECK: %xmm0 = VMOVSSrm %rip, _, _, _, _
- %xmm0 = VMOVSSZrm %rip, _, _, _, _
- ; CHECK: %xmm0 = VMOVSSrr %xmm0, _
- %xmm0 = VMOVSSZrr %xmm0, _
- ; CHECK: %xmm0 = VMOVSSrr_REV %xmm0, _
- %xmm0 = VMOVSSZrr_REV %xmm0, _
- ; CHECK: VMOVSS2DImr %rdi, %xmm0, _, _, _, _
- VMOVSS2DIZmr %rdi, %xmm0, _, _, _, _
+ ; CHECK: VMOVSDto64mr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVSDto64Zmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: VMOVSSmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVSSZmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VMOVSSrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VMOVSSZrm %rip, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VMOVSSrr %xmm0, %noreg
+ %xmm0 = VMOVSSZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VMOVSSrr_REV %xmm0, %noreg
+ %xmm0 = VMOVSSZrr_REV %xmm0, %noreg
+ ; CHECK: VMOVSS2DImr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVSS2DIZmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
; CHECK: %eax = VMOVSS2DIrr %xmm0
%eax = VMOVSS2DIZrr %xmm0
; CHECK: %xmm0 = VMOV64toPQIrr %rdi
%xmm0 = VMOV64toPQIZrr %rdi
- ; CHECK: %xmm0 = VMOV64toPQIrm %rdi, _, _, _, _
- %xmm0 = VMOV64toPQIZrm %rdi, _, _, _, _
+ ; CHECK: %xmm0 = VMOV64toPQIrm %rdi, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VMOV64toPQIZrm %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VMOV64toSDrr %rdi
%xmm0 = VMOV64toSDZrr %rdi
- ; CHECK: %xmm0 = VMOVDI2PDIrm %rip, _, _, _, _
- %xmm0 = VMOVDI2PDIZrm %rip, _, _, _, _
+ ; CHECK: %xmm0 = VMOVDI2PDIrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VMOVDI2PDIZrm %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VMOVDI2PDIrr %edi
%xmm0 = VMOVDI2PDIZrr %edi
- ; CHECK: %xmm0 = VMOVLHPSrr %xmm0, _
- %xmm0 = VMOVLHPSZrr %xmm0, _
- ; CHECK: %xmm0 = VMOVHLPSrr %xmm0, _
- %xmm0 = VMOVHLPSZrr %xmm0, _
- ; CHECK: VMOVPDI2DImr %rdi, %xmm0, _, _, _, _
- VMOVPDI2DIZmr %rdi, %xmm0, _, _, _, _
+ ; CHECK: %xmm0 = VMOVLHPSrr %xmm0, %noreg
+ %xmm0 = VMOVLHPSZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VMOVHLPSrr %xmm0, %noreg
+ %xmm0 = VMOVHLPSZrr %xmm0, %noreg
+ ; CHECK: VMOVPDI2DImr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVPDI2DIZmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
; CHECK: %edi = VMOVPDI2DIrr %xmm0
%edi = VMOVPDI2DIZrr %xmm0
; CHECK: %xmm0 = VMOVPQI2QIrr %xmm0
%xmm0 = VMOVPQI2QIZrr %xmm0
- ; CHECK: VMOVPQI2QImr %rdi, %xmm0, _, _, _, _
- VMOVPQI2QIZmr %rdi, %xmm0, _, _, _, _
+ ; CHECK: VMOVPQI2QImr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVPQI2QIZmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
; CHECK: %rdi = VMOVPQIto64rr %xmm0
%rdi = VMOVPQIto64Zrr %xmm0
- ; CHECK: VMOVPQIto64mr %rdi, %xmm0, _, _, _, _
- VMOVPQIto64Zmr %rdi, %xmm0, _, _, _, _
- ; CHECK: %xmm0 = VMOVQI2PQIrm %rip, _, _, _, _
- %xmm0 = VMOVQI2PQIZrm %rip, _, _, _, _
+ ; CHECK: VMOVPQIto64mr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ VMOVPQIto64Zmr %rdi, %xmm0, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VMOVQI2PQIrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VMOVQI2PQIZrm %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm0 = VMOVZPQILo2PQIrr %xmm0
%xmm0 = VMOVZPQILo2PQIZrr %xmm0
- ; CHECK: Int_VCOMISDrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- Int_VCOMISDZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VCOMISDrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VCOMISDZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VCOMISDrr %xmm0, %xmm1, implicit-def %eflags
Int_VCOMISDZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: Int_VCOMISSrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- Int_VCOMISSZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VCOMISSrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VCOMISSZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VCOMISSrr %xmm0, %xmm1, implicit-def %eflags
Int_VCOMISSZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: Int_VUCOMISDrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- Int_VUCOMISDZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VUCOMISDrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VUCOMISDZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VUCOMISDrr %xmm0, %xmm1, implicit-def %eflags
Int_VUCOMISDZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: Int_VUCOMISSrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- Int_VUCOMISSZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VUCOMISSrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VUCOMISSZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VUCOMISSrr %xmm0, %xmm1, implicit-def %eflags
Int_VUCOMISSZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: VCOMISDrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- VCOMISDZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VCOMISDrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VCOMISDZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VCOMISDrr %xmm0, %xmm1, implicit-def %eflags
VCOMISDZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: VCOMISSrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- VCOMISSZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VCOMISSrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VCOMISSZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VCOMISSrr %xmm0, %xmm1, implicit-def %eflags
VCOMISSZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: VUCOMISDrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- VUCOMISDZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VUCOMISDrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VUCOMISDZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VUCOMISDrr %xmm0, %xmm1, implicit-def %eflags
VUCOMISDZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: VUCOMISSrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
- VUCOMISSZrm %xmm0, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VUCOMISSrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VUCOMISSZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VUCOMISSrr %xmm0, %xmm1, implicit-def %eflags
VUCOMISSZrr %xmm0, %xmm1, implicit-def %eflags
- ; CHECK: VEXTRACTPSmr %rdi, 1, _, 0, _, %xmm0, _
- VEXTRACTPSZmr %rdi, 1, _, 0, _, %xmm0, _
- ; CHECK: %eax = VEXTRACTPSrr %xmm0, _
- %eax = VEXTRACTPSZrr %xmm0, _
- ; CHECK: %xmm0 = VINSERTPSrm %xmm0, %rdi, _, _, _, _, _
- %xmm0 = VINSERTPSZrm %xmm0, %rdi, _, _, _, _, _
- ; CHECK: %xmm0 = VINSERTPSrr %xmm0, %xmm0, _
- %xmm0 = VINSERTPSZrr %xmm0, %xmm0, _
+ ; CHECK: VEXTRACTPSmr %rdi, 1, %noreg, 0, %noreg, %xmm0, %noreg
+ VEXTRACTPSZmr %rdi, 1, %noreg, 0, %noreg, %xmm0, %noreg
+ ; CHECK: %eax = VEXTRACTPSrr %xmm0, %noreg
+ %eax = VEXTRACTPSZrr %xmm0, %noreg
+ ; CHECK: %xmm0 = VINSERTPSrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm0 = VINSERTPSZrm %xmm0, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm0 = VINSERTPSrr %xmm0, %xmm0, %noreg
+ %xmm0 = VINSERTPSZrr %xmm0, %xmm0, %noreg
RET 0, %zmm0, %zmm1
...
@@ -2326,878 +2326,878 @@ body: |
name: evex_z256_to_evex_test
body: |
bb.0:
- ; CHECK: VMOVAPDZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVAPDZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVAPDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVAPDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVAPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVAPDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVAPDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVAPDZ256rr %ymm16
%ymm16 = VMOVAPDZ256rr %ymm16
; CHECK: %ymm16 = VMOVAPDZ256rr_REV %ymm16
%ymm16 = VMOVAPDZ256rr_REV %ymm16
- ; CHECK: VMOVAPSZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVAPSZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVAPSZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVAPSZ256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVAPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVAPSZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVAPSZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVAPSZ256rr %ymm16
%ymm16 = VMOVAPSZ256rr %ymm16
; CHECK: %ymm16 = VMOVAPSZ256rr_REV %ymm16
%ymm16 = VMOVAPSZ256rr_REV %ymm16
- ; CHECK: %ymm16 = VMOVDDUPZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDDUPZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMOVDDUPZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDDUPZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDDUPZ256rr %ymm16
%ymm16 = VMOVDDUPZ256rr %ymm16
- ; CHECK: VMOVDQA32Z256mr %rdi, 1, _, 0, _, %ymm16
- VMOVDQA32Z256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVDQA32Z256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDQA32Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQA32Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVDQA32Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVDQA32Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDQA32Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDQA32Z256rr %ymm16
%ymm16 = VMOVDQA32Z256rr %ymm16
; CHECK: %ymm16 = VMOVDQA32Z256rr_REV %ymm16
%ymm16 = VMOVDQA32Z256rr_REV %ymm16
- ; CHECK: VMOVDQA64Z256mr %rdi, 1, _, 0, _, %ymm16
- VMOVDQA64Z256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVDQA64Z256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDQA64Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQA64Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVDQA64Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVDQA64Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDQA64Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDQA64Z256rr %ymm16
%ymm16 = VMOVDQA64Z256rr %ymm16
; CHECK: %ymm16 = VMOVDQA64Z256rr_REV %ymm16
%ymm16 = VMOVDQA64Z256rr_REV %ymm16
- ; CHECK: VMOVDQU16Z256mr %rdi, 1, _, 0, _, %ymm16
- VMOVDQU16Z256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVDQU16Z256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDQU16Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU16Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVDQU16Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVDQU16Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDQU16Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDQU16Z256rr %ymm16
%ymm16 = VMOVDQU16Z256rr %ymm16
; CHECK: %ymm16 = VMOVDQU16Z256rr_REV %ymm16
%ymm16 = VMOVDQU16Z256rr_REV %ymm16
- ; CHECK: VMOVDQU32Z256mr %rdi, 1, _, 0, _, %ymm16
- VMOVDQU32Z256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVDQU32Z256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDQU32Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU32Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVDQU32Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVDQU32Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDQU32Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDQU32Z256rr %ymm16
%ymm16 = VMOVDQU32Z256rr %ymm16
; CHECK: %ymm16 = VMOVDQU32Z256rr_REV %ymm16
%ymm16 = VMOVDQU32Z256rr_REV %ymm16
- ; CHECK: VMOVDQU64Z256mr %rdi, 1, _, 0, _, %ymm16
- VMOVDQU64Z256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVDQU64Z256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDQU64Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU64Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVDQU64Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVDQU64Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDQU64Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDQU64Z256rr %ymm16
%ymm16 = VMOVDQU64Z256rr %ymm16
; CHECK: %ymm16 = VMOVDQU64Z256rr_REV %ymm16
%ymm16 = VMOVDQU64Z256rr_REV %ymm16
- ; CHECK: VMOVDQU8Z256mr %rdi, 1, _, 0, _, %ymm16
- VMOVDQU8Z256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVDQU8Z256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVDQU8Z256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU8Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVDQU8Z256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVDQU8Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVDQU8Z256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVDQU8Z256rr %ymm16
%ymm16 = VMOVDQU8Z256rr %ymm16
; CHECK: %ymm16 = VMOVDQU8Z256rr_REV %ymm16
%ymm16 = VMOVDQU8Z256rr_REV %ymm16
- ; CHECK: %ymm16 = VMOVNTDQAZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVNTDQAZ256rm %rip, 1, _, %rax, _
- ; CHECK: VMOVNTDQZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVNTDQZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: VMOVNTPDZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVNTPDZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: VMOVNTPSZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVNTPSZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVSHDUPZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVSHDUPZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMOVNTDQAZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVNTDQAZ256rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: VMOVNTDQZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVNTDQZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: VMOVNTPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVNTPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: VMOVNTPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVNTPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVSHDUPZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVSHDUPZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVSHDUPZ256rr %ymm16
%ymm16 = VMOVSHDUPZ256rr %ymm16
- ; CHECK: %ymm16 = VMOVSLDUPZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVSLDUPZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMOVSLDUPZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVSLDUPZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVSLDUPZ256rr %ymm16
%ymm16 = VMOVSLDUPZ256rr %ymm16
- ; CHECK: VMOVUPDZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVUPDZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VMOVUPDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VMOVUPDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVUPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVUPDZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VMOVUPDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMOVUPDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMOVUPDZ256rr %ymm16
%ymm16 = VMOVUPDZ256rr %ymm16
; CHECK: %ymm16 = VMOVUPDZ256rr_REV %ymm16
%ymm16 = VMOVUPDZ256rr_REV %ymm16
- ; CHECK: VMOVUPSZ256mr %rdi, 1, _, 0, _, %ymm16
- VMOVUPSZ256mr %rdi, 1, _, 0, _, %ymm16
- ; CHECK: %ymm16 = VPANDDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPANDDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: VMOVUPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ VMOVUPSZ256mr %rdi, 1, %noreg, 0, %noreg, %ymm16
+ ; CHECK: %ymm16 = VPANDDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPANDDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPANDDZ256rr %ymm16, %ymm1
%ymm16 = VPANDDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPANDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPANDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPANDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPANDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPANDQZ256rr %ymm16, %ymm1
%ymm16 = VPANDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPANDNDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPANDNDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPANDNDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPANDNDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPANDNDZ256rr %ymm16, %ymm1
%ymm16 = VPANDNDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPANDNQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPANDNQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPANDNQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPANDNQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPANDNQZ256rr %ymm16, %ymm1
%ymm16 = VPANDNQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPAVGBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPAVGBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPAVGBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPAVGBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPAVGBZ256rr %ymm16, %ymm1
%ymm16 = VPAVGBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPAVGWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPAVGWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPAVGWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPAVGWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPAVGWZ256rr %ymm16, %ymm1
%ymm16 = VPAVGWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDBZ256rr %ymm16, %ymm1
%ymm16 = VPADDBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDDZ256rr %ymm16, %ymm1
%ymm16 = VPADDDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDQZ256rr %ymm16, %ymm1
%ymm16 = VPADDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDSBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDSBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDSBZ256rr %ymm16, %ymm1
%ymm16 = VPADDSBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDSWZ256rr %ymm16, %ymm1
%ymm16 = VPADDSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDUSBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDUSBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDUSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDUSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDUSBZ256rr %ymm16, %ymm1
%ymm16 = VPADDUSBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDUSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDUSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDUSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDUSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDUSWZ256rr %ymm16, %ymm1
%ymm16 = VPADDUSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPADDWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPADDWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPADDWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPADDWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPADDWZ256rr %ymm16, %ymm1
%ymm16 = VPADDWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMULPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMULPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMULPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMULPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMULPDZ256rr %ymm16, %ymm1
%ymm16 = VMULPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMULPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMULPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMULPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMULPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMULPSZ256rr %ymm16, %ymm1
%ymm16 = VMULPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VORPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VORPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VORPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VORPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VORPDZ256rr %ymm16, %ymm1
%ymm16 = VORPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VORPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VORPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VORPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VORPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VORPSZ256rr %ymm16, %ymm1
%ymm16 = VORPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMADDUBSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMADDUBSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMADDUBSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMADDUBSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMADDUBSWZ256rr %ymm16, %ymm1
%ymm16 = VPMADDUBSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMADDWDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMADDWDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMADDWDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMADDWDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMADDWDZ256rr %ymm16, %ymm1
%ymm16 = VPMADDWDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMAXSBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMAXSBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMAXSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMAXSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMAXSBZ256rr %ymm16, %ymm1
%ymm16 = VPMAXSBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMAXSDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMAXSDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMAXSDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMAXSDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMAXSDZ256rr %ymm16, %ymm1
%ymm16 = VPMAXSDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMAXSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMAXSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMAXSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMAXSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMAXSWZ256rr %ymm16, %ymm1
%ymm16 = VPMAXSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMAXUBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMAXUBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMAXUBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMAXUBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMAXUBZ256rr %ymm16, %ymm1
%ymm16 = VPMAXUBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMAXUDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMAXUDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMAXUDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMAXUDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMAXUDZ256rr %ymm16, %ymm1
%ymm16 = VPMAXUDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMAXUWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMAXUWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMAXUWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMAXUWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMAXUWZ256rr %ymm16, %ymm1
%ymm16 = VPMAXUWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMINSBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMINSBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMINSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMINSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMINSBZ256rr %ymm16, %ymm1
%ymm16 = VPMINSBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMINSDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMINSDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMINSDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMINSDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMINSDZ256rr %ymm16, %ymm1
%ymm16 = VPMINSDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMINSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMINSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMINSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMINSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMINSWZ256rr %ymm16, %ymm1
%ymm16 = VPMINSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMINUBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMINUBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMINUBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMINUBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMINUBZ256rr %ymm16, %ymm1
%ymm16 = VPMINUBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMINUDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMINUDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMINUDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMINUDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMINUDZ256rr %ymm16, %ymm1
%ymm16 = VPMINUDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMINUWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMINUWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMINUWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMINUWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMINUWZ256rr %ymm16, %ymm1
%ymm16 = VPMINUWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULDQZ256rr %ymm16, %ymm1
%ymm16 = VPMULDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULHRSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULHRSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULHRSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULHRSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULHRSWZ256rr %ymm16, %ymm1
%ymm16 = VPMULHRSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULHUWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULHUWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULHUWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULHUWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULHUWZ256rr %ymm16, %ymm1
%ymm16 = VPMULHUWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULHWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULHWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULHWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULHWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULHWZ256rr %ymm16, %ymm1
%ymm16 = VPMULHWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULLDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULLDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULLDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULLDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULLDZ256rr %ymm16, %ymm1
%ymm16 = VPMULLDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULLWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULLWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULLWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULLWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULLWZ256rr %ymm16, %ymm1
%ymm16 = VPMULLWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPMULUDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPMULUDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMULUDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMULUDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMULUDQZ256rr %ymm16, %ymm1
%ymm16 = VPMULUDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPORDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPORDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPORDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPORDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPORDZ256rr %ymm16, %ymm1
%ymm16 = VPORDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPORQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPORQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPORQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPORQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPORQZ256rr %ymm16, %ymm1
%ymm16 = VPORQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBBZ256rr %ymm16, %ymm1
%ymm16 = VPSUBBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBDZ256rr %ymm16, %ymm1
%ymm16 = VPSUBDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBQZ256rr %ymm16, %ymm1
%ymm16 = VPSUBQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBSBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBSBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBSBZ256rr %ymm16, %ymm1
%ymm16 = VPSUBSBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBSWZ256rr %ymm16, %ymm1
%ymm16 = VPSUBSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBUSBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBUSBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBUSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBUSBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBUSBZ256rr %ymm16, %ymm1
%ymm16 = VPSUBUSBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBUSWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBUSWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBUSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBUSWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBUSWZ256rr %ymm16, %ymm1
%ymm16 = VPSUBUSWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSUBWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSUBWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSUBWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSUBWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSUBWZ256rr %ymm16, %ymm1
%ymm16 = VPSUBWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPXORDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPXORDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPXORDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPXORDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPXORDZ256rr %ymm16, %ymm1
%ymm16 = VPXORDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPXORQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPXORQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPXORQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPXORQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPXORQZ256rr %ymm16, %ymm1
%ymm16 = VPXORQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VADDPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VADDPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VADDPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VADDPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VADDPDZ256rr %ymm16, %ymm1
%ymm16 = VADDPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VADDPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VADDPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VADDPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VADDPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VADDPSZ256rr %ymm16, %ymm1
%ymm16 = VADDPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VANDNPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VANDNPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VANDNPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VANDNPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VANDNPDZ256rr %ymm16, %ymm1
%ymm16 = VANDNPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VANDNPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VANDNPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VANDNPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VANDNPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VANDNPSZ256rr %ymm16, %ymm1
%ymm16 = VANDNPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VANDPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VANDPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VANDPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VANDPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VANDPDZ256rr %ymm16, %ymm1
%ymm16 = VANDPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VANDPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VANDPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VANDPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VANDPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VANDPSZ256rr %ymm16, %ymm1
%ymm16 = VANDPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VDIVPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VDIVPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VDIVPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VDIVPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VDIVPDZ256rr %ymm16, %ymm1
%ymm16 = VDIVPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VDIVPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VDIVPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VDIVPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VDIVPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VDIVPSZ256rr %ymm16, %ymm1
%ymm16 = VDIVPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMAXCPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMAXCPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMAXCPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMAXCPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMAXCPDZ256rr %ymm16, %ymm1
%ymm16 = VMAXCPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMAXCPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMAXCPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMAXCPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMAXCPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMAXCPSZ256rr %ymm16, %ymm1
%ymm16 = VMAXCPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMAXPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMAXPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMAXPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMAXPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMAXPDZ256rr %ymm16, %ymm1
%ymm16 = VMAXPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMAXPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMAXPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMAXPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMAXPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMAXPSZ256rr %ymm16, %ymm1
%ymm16 = VMAXPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMINCPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMINCPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMINCPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMINCPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMINCPDZ256rr %ymm16, %ymm1
%ymm16 = VMINCPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMINCPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMINCPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMINCPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMINCPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMINCPSZ256rr %ymm16, %ymm1
%ymm16 = VMINCPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMINPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMINPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMINPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMINPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMINPDZ256rr %ymm16, %ymm1
%ymm16 = VMINPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VMINPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VMINPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VMINPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VMINPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VMINPSZ256rr %ymm16, %ymm1
%ymm16 = VMINPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VXORPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VXORPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VXORPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VXORPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VXORPDZ256rr %ymm16, %ymm1
%ymm16 = VXORPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VXORPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VXORPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VXORPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VXORPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VXORPSZ256rr %ymm16, %ymm1
%ymm16 = VXORPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPACKSSDWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPACKSSDWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPACKSSDWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPACKSSDWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPACKSSDWZ256rr %ymm16, %ymm1
%ymm16 = VPACKSSDWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPACKSSWBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPACKSSWBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPACKSSWBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPACKSSWBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPACKSSWBZ256rr %ymm16, %ymm1
%ymm16 = VPACKSSWBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPACKUSDWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPACKUSDWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPACKUSDWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPACKUSDWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPACKUSDWZ256rr %ymm16, %ymm1
%ymm16 = VPACKUSDWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPACKUSWBZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPACKUSWBZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPACKUSWBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPACKUSWBZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPACKUSWBZ256rr %ymm16, %ymm1
%ymm16 = VPACKUSWBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VUNPCKHPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VUNPCKHPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VUNPCKHPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VUNPCKHPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VUNPCKHPDZ256rr %ymm16, %ymm1
%ymm16 = VUNPCKHPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VUNPCKHPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VUNPCKHPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VUNPCKHPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VUNPCKHPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VUNPCKHPSZ256rr %ymm16, %ymm1
%ymm16 = VUNPCKHPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VUNPCKLPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VUNPCKLPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VUNPCKLPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VUNPCKLPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VUNPCKLPDZ256rr %ymm16, %ymm1
%ymm16 = VUNPCKLPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VUNPCKLPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VUNPCKLPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VUNPCKLPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VUNPCKLPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VUNPCKLPSZ256rr %ymm16, %ymm1
%ymm16 = VUNPCKLPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VSUBPDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VSUBPDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VSUBPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VSUBPDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VSUBPDZ256rr %ymm16, %ymm1
%ymm16 = VSUBPDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VSUBPSZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VSUBPSZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VSUBPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VSUBPSZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VSUBPSZ256rr %ymm16, %ymm1
%ymm16 = VSUBPSZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKHBWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKHBWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKHBWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKHBWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKHBWZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKHBWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKHDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKHDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKHDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKHDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKHDQZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKHDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKHQDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKHQDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKHQDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKHQDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKHQDQZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKHQDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKHWDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKHWDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKHWDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKHWDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKHWDZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKHWDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKLBWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKLBWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKLBWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKLBWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKLBWZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKLBWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKLDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKLDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKLDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKLDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKLDQZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKLDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKLQDQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKLQDQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKLQDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKLQDQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKLQDQZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKLQDQZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPUNPCKLWDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPUNPCKLWDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPUNPCKLWDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPUNPCKLWDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPUNPCKLWDZ256rr %ymm16, %ymm1
%ymm16 = VPUNPCKLWDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VFMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADD132PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADD132PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADD132PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADD132PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADD213PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADD213PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADD213PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADD213PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADD231PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADD231PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADD231PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADD231PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADDSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADDSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADDSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADDSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADDSUB132PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADDSUB132PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADDSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADDSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADDSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADDSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADDSUB132PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADDSUB132PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADDSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADDSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADDSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADDSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADDSUB213PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADDSUB213PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADDSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADDSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADDSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADDSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADDSUB213PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADDSUB213PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADDSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADDSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADDSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADDSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADDSUB231PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADDSUB231PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMADDSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMADDSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMADDSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMADDSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMADDSUB231PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMADDSUB231PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUB132PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUB132PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUB132PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUB132PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUB213PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUB213PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUB213PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUB213PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUB231PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUB231PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUB231PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUB231PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUBADD132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUBADD132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUBADD132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUBADD132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUBADD132PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUBADD132PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUBADD132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUBADD132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUBADD132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUBADD132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUBADD132PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUBADD132PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUBADD213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUBADD213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUBADD213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUBADD213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUBADD213PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUBADD213PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUBADD213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUBADD213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUBADD213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUBADD213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUBADD213PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUBADD213PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUBADD231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUBADD231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUBADD231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUBADD231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUBADD231PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUBADD231PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFMSUBADD231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFMSUBADD231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFMSUBADD231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFMSUBADD231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFMSUBADD231PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFMSUBADD231PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMADD132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMADD132PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMADD132PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMADD132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMADD132PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMADD132PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMADD213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMADD213PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMADD213PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMADD213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMADD213PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMADD213PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMADD231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMADD231PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMADD231PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMADD231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMADD231PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMADD231PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMSUB132PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMSUB132PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMSUB132PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMSUB132PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMSUB132PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMSUB132PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMSUB213PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMSUB213PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMSUB213PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMSUB213PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMSUB213PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMSUB213PSZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMSUB231PDZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMSUB231PDZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMSUB231PDZ256r %ymm16, %ymm1, %ymm2
- ; CHECK: %ymm16 = VFNMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
- %ymm16 = VFNMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, _, 0, _
+ ; CHECK: %ymm16 = VFNMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
+ %ymm16 = VFNMSUB231PSZ256m %ymm16, %ymm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VFNMSUB231PSZ256r %ymm16, %ymm1, %ymm2
%ymm16 = VFNMSUB231PSZ256r %ymm16, %ymm1, %ymm2
; CHECK: %ymm16 = VPSRADZ256ri %ymm16, 7
%ymm16 = VPSRADZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSRADZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRADZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRADZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRADZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRADZ256rr %ymm16, %xmm1
%ymm16 = VPSRADZ256rr %ymm16, %xmm1
- ; CHECK: %ymm16 = VPSRAVDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRAVDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRAVDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRAVDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRAVDZ256rr %ymm16, %ymm1
%ymm16 = VPSRAVDZ256rr %ymm16, %ymm1
; CHECK: %ymm16 = VPSRAWZ256ri %ymm16, 7
%ymm16 = VPSRAWZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSRAWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRAWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRAWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRAWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRAWZ256rr %ymm16, %xmm1
%ymm16 = VPSRAWZ256rr %ymm16, %xmm1
; CHECK: %ymm16 = VPSRLDQZ256rr %ymm16, %ymm1
%ymm16 = VPSRLDQZ256rr %ymm16, %ymm1
; CHECK: %ymm16 = VPSRLDZ256ri %ymm16, 7
%ymm16 = VPSRLDZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSRLDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRLDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRLDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRLDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRLDZ256rr %ymm16, %xmm1
%ymm16 = VPSRLDZ256rr %ymm16, %xmm1
; CHECK: %ymm16 = VPSRLQZ256ri %ymm16, 7
%ymm16 = VPSRLQZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSRLQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRLQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRLQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRLQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRLQZ256rr %ymm16, %xmm1
%ymm16 = VPSRLQZ256rr %ymm16, %xmm1
- ; CHECK: %ymm16 = VPSRLVDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRLVDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRLVDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRLVDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRLVDZ256rr %ymm16, %ymm1
%ymm16 = VPSRLVDZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSRLVQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRLVQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRLVQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRLVQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRLVQZ256rr %ymm16, %ymm1
%ymm16 = VPSRLVQZ256rr %ymm16, %ymm1
; CHECK: %ymm16 = VPSRLWZ256ri %ymm16, 7
%ymm16 = VPSRLWZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSRLWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSRLWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSRLWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSRLWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSRLWZ256rr %ymm16, %xmm1
%ymm16 = VPSRLWZ256rr %ymm16, %xmm1
- ; CHECK: %ymm16 = VPMOVSXBDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVSXBDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVSXBDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVSXBDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVSXBDZ256rr %xmm0
%ymm16 = VPMOVSXBDZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVSXBQZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVSXBQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVSXBQZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVSXBQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVSXBQZ256rr %xmm0
%ymm16 = VPMOVSXBQZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVSXBWZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVSXBWZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVSXBWZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVSXBWZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVSXBWZ256rr %xmm0
%ymm16 = VPMOVSXBWZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVSXDQZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVSXDQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVSXDQZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVSXDQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVSXDQZ256rr %xmm0
%ymm16 = VPMOVSXDQZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVSXWDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVSXWDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVSXWDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVSXWDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVSXWDZ256rr %xmm0
%ymm16 = VPMOVSXWDZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVSXWQZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVSXWQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVSXWQZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVSXWQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVSXWQZ256rr %xmm0
%ymm16 = VPMOVSXWQZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVZXBDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVZXBDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVZXBDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVZXBDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVZXBDZ256rr %xmm0
%ymm16 = VPMOVZXBDZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVZXBQZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVZXBQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVZXBQZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVZXBQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVZXBQZ256rr %xmm0
%ymm16 = VPMOVZXBQZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVZXBWZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVZXBWZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVZXBWZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVZXBWZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVZXBWZ256rr %xmm0
%ymm16 = VPMOVZXBWZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVZXDQZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVZXDQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVZXDQZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVZXDQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVZXDQZ256rr %xmm0
%ymm16 = VPMOVZXDQZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVZXWDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVZXWDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVZXWDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVZXWDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVZXWDZ256rr %xmm0
%ymm16 = VPMOVZXWDZ256rr %xmm0
- ; CHECK: %ymm16 = VPMOVZXWQZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPMOVZXWQZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPMOVZXWQZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPMOVZXWQZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPMOVZXWQZ256rr %xmm0
%ymm16 = VPMOVZXWQZ256rr %xmm0
- ; CHECK: %ymm16 = VBROADCASTF32X2Z256m %rip, 1, _, %rax, _
- %ymm16 = VBROADCASTF32X2Z256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VBROADCASTF32X2Z256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VBROADCASTF32X2Z256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VBROADCASTF32X2Z256r %xmm16
%ymm16 = VBROADCASTF32X2Z256r %xmm16
- ; CHECK: %ymm16 = VBROADCASTF32X4Z256rm %rip, 1, _, %rax, _
- %ymm16 = VBROADCASTF32X4Z256rm %rip, 1, _, %rax, _
- ; CHECK: %ymm16 = VBROADCASTSDZ256m %rip, 1, _, %rax, _
- %ymm16 = VBROADCASTSDZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VBROADCASTF32X4Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VBROADCASTF32X4Z256rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %ymm16 = VBROADCASTSDZ256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VBROADCASTSDZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VBROADCASTSDZ256r %xmm0
%ymm16 = VBROADCASTSDZ256r %xmm0
- ; CHECK: %ymm16 = VBROADCASTSSZ256m %rip, 1, _, %rax, _
- %ymm16 = VBROADCASTSSZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VBROADCASTSSZ256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VBROADCASTSSZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VBROADCASTSSZ256r %xmm0
%ymm16 = VBROADCASTSSZ256r %xmm0
- ; CHECK: %ymm16 = VPBROADCASTBZ256m %rip, 1, _, %rax, _
- %ymm16 = VPBROADCASTBZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPBROADCASTBZ256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPBROADCASTBZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPBROADCASTBZ256r %xmm0
%ymm16 = VPBROADCASTBZ256r %xmm0
- ; CHECK: %ymm16 = VPBROADCASTDZ256m %rip, 1, _, %rax, _
- %ymm16 = VPBROADCASTDZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPBROADCASTDZ256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPBROADCASTDZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPBROADCASTDZ256r %xmm0
%ymm16 = VPBROADCASTDZ256r %xmm0
- ; CHECK: %ymm16 = VPBROADCASTWZ256m %rip, 1, _, %rax, _
- %ymm16 = VPBROADCASTWZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPBROADCASTWZ256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPBROADCASTWZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPBROADCASTWZ256r %xmm0
%ymm16 = VPBROADCASTWZ256r %xmm0
- ; CHECK: %ymm16 = VBROADCASTI32X4Z256rm %rip, 1, _, %rax, _
- %ymm16 = VBROADCASTI32X4Z256rm %rip, 1, _, %rax, _
- ; CHECK: %ymm16 = VBROADCASTI32X2Z256m %rip, 1, _, %rax, _
- %ymm16 = VBROADCASTI32X2Z256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VBROADCASTI32X4Z256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VBROADCASTI32X4Z256rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %ymm16 = VBROADCASTI32X2Z256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VBROADCASTI32X2Z256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VBROADCASTI32X2Z256r %xmm16
%ymm16 = VBROADCASTI32X2Z256r %xmm16
- ; CHECK: %ymm16 = VPBROADCASTQZ256m %rip, 1, _, %rax, _
- %ymm16 = VPBROADCASTQZ256m %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPBROADCASTQZ256m %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPBROADCASTQZ256m %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPBROADCASTQZ256r %xmm0
%ymm16 = VPBROADCASTQZ256r %xmm0
- ; CHECK: %ymm16 = VPABSBZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPABSBZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPABSBZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPABSBZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPABSBZ256rr %ymm16
%ymm16 = VPABSBZ256rr %ymm16
- ; CHECK: %ymm16 = VPABSDZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPABSDZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPABSDZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPABSDZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPABSDZ256rr %ymm16
%ymm16 = VPABSDZ256rr %ymm16
- ; CHECK: %ymm16 = VPABSWZ256rm %rip, 1, _, %rax, _
- %ymm16 = VPABSWZ256rm %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPABSWZ256rm %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPABSWZ256rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPABSWZ256rr %ymm16
%ymm16 = VPABSWZ256rr %ymm16
- ; CHECK: %ymm16 = VPSADBWZ256rm %ymm16, 1, _, %rax, _, _
- %ymm16 = VPSADBWZ256rm %ymm16, 1, _, %rax, _, _
+ ; CHECK: %ymm16 = VPSADBWZ256rm %ymm16, 1, %noreg, %rax, %noreg, %noreg
+ %ymm16 = VPSADBWZ256rm %ymm16, 1, %noreg, %rax, %noreg, %noreg
; CHECK: %ymm16 = VPSADBWZ256rr %ymm16, %ymm1
%ymm16 = VPSADBWZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPERMDZ256rm %ymm16, %rdi, 1, _, 0, _
- %ymm16 = VPERMDZ256rm %ymm16, %rdi, 1, _, 0, _
+ ; CHECK: %ymm16 = VPERMDZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
+ %ymm16 = VPERMDZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VPERMDZ256rr %ymm1, %ymm16
%ymm16 = VPERMDZ256rr %ymm1, %ymm16
- ; CHECK: %ymm16 = VPERMILPDZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPERMILPDZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPERMILPDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPERMILPDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPERMILPDZ256ri %ymm16, 7
%ymm16 = VPERMILPDZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPERMILPDZ256rm %ymm16, %rdi, 1, _, 0, _
- %ymm16 = VPERMILPDZ256rm %ymm16, %rdi, 1, _, 0, _
+ ; CHECK: %ymm16 = VPERMILPDZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
+ %ymm16 = VPERMILPDZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VPERMILPDZ256rr %ymm1, %ymm16
%ymm16 = VPERMILPDZ256rr %ymm1, %ymm16
- ; CHECK: %ymm16 = VPERMILPSZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPERMILPSZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPERMILPSZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPERMILPSZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPERMILPSZ256ri %ymm16, 7
%ymm16 = VPERMILPSZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPERMILPSZ256rm %ymm16, %rdi, 1, _, 0, _
- %ymm16 = VPERMILPSZ256rm %ymm16, %rdi, 1, _, 0, _
+ ; CHECK: %ymm16 = VPERMILPSZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
+ %ymm16 = VPERMILPSZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VPERMILPSZ256rr %ymm1, %ymm16
%ymm16 = VPERMILPSZ256rr %ymm1, %ymm16
- ; CHECK: %ymm16 = VPERMPDZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPERMPDZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPERMPDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPERMPDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPERMPDZ256ri %ymm16, 7
%ymm16 = VPERMPDZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPERMPSZ256rm %ymm16, %rdi, 1, _, 0, _
- %ymm16 = VPERMPSZ256rm %ymm16, %rdi, 1, _, 0, _
+ ; CHECK: %ymm16 = VPERMPSZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
+ %ymm16 = VPERMPSZ256rm %ymm16, %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VPERMPSZ256rr %ymm1, %ymm16
%ymm16 = VPERMPSZ256rr %ymm1, %ymm16
- ; CHECK: %ymm16 = VPERMQZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPERMQZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPERMQZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPERMQZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPERMQZ256ri %ymm16, 7
%ymm16 = VPERMQZ256ri %ymm16, 7
; CHECK: %ymm16 = VPSLLDQZ256rr %ymm16, 14
%ymm16 = VPSLLDQZ256rr %ymm16, 14
; CHECK: %ymm16 = VPSLLDZ256ri %ymm16, 7
%ymm16 = VPSLLDZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSLLDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSLLDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSLLDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSLLDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSLLDZ256rr %ymm16, 14
%ymm16 = VPSLLDZ256rr %ymm16, 14
; CHECK: %ymm16 = VPSLLQZ256ri %ymm16, 7
%ymm16 = VPSLLQZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSLLQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSLLQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSLLQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSLLQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSLLQZ256rr %ymm16, 14
%ymm16 = VPSLLQZ256rr %ymm16, 14
- ; CHECK: %ymm16 = VPSLLVDZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSLLVDZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSLLVDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSLLVDZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSLLVDZ256rr %ymm16, 14
%ymm16 = VPSLLVDZ256rr %ymm16, 14
- ; CHECK: %ymm16 = VPSLLVQZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSLLVQZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSLLVQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSLLVQZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSLLVQZ256rr %ymm16, 14
%ymm16 = VPSLLVQZ256rr %ymm16, 14
; CHECK: %ymm16 = VPSLLWZ256ri %ymm16, 7
%ymm16 = VPSLLWZ256ri %ymm16, 7
- ; CHECK: %ymm16 = VPSLLWZ256rm %ymm16, %rip, 1, _, %rax, _
- %ymm16 = VPSLLWZ256rm %ymm16, %rip, 1, _, %rax, _
+ ; CHECK: %ymm16 = VPSLLWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
+ %ymm16 = VPSLLWZ256rm %ymm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %ymm16 = VPSLLWZ256rr %ymm16, 14
%ymm16 = VPSLLWZ256rr %ymm16, 14
- ; CHECK: %ymm16 = VCVTDQ2PDZ256rm %rdi, %ymm16, 1, _, 0
- %ymm16 = VCVTDQ2PDZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %ymm16 = VCVTDQ2PDZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %ymm16 = VCVTDQ2PDZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %ymm16 = VCVTDQ2PDZ256rr %xmm0
%ymm16 = VCVTDQ2PDZ256rr %xmm0
- ; CHECK: %ymm16 = VCVTDQ2PSZ256rm %rdi, %ymm16, 1, _, 0
- %ymm16 = VCVTDQ2PSZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %ymm16 = VCVTDQ2PSZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %ymm16 = VCVTDQ2PSZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %ymm16 = VCVTDQ2PSZ256rr %ymm16
%ymm16 = VCVTDQ2PSZ256rr %ymm16
- ; CHECK: %xmm0 = VCVTPD2DQZ256rm %rdi, %ymm16, 1, _, 0
- %xmm0 = VCVTPD2DQZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPD2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %xmm0 = VCVTPD2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPD2DQZ256rr %ymm16
%xmm0 = VCVTPD2DQZ256rr %ymm16
- ; CHECK: %xmm0 = VCVTPD2PSZ256rm %rdi, %ymm16, 1, _, 0
- %xmm0 = VCVTPD2PSZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %xmm0 = VCVTPD2PSZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %xmm0 = VCVTPD2PSZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %xmm0 = VCVTPD2PSZ256rr %ymm16
%xmm0 = VCVTPD2PSZ256rr %ymm16
- ; CHECK: %ymm16 = VCVTPS2DQZ256rm %rdi, %ymm16, 1, _, 0
- %ymm16 = VCVTPS2DQZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %ymm16 = VCVTPS2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %ymm16 = VCVTPS2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %ymm16 = VCVTPS2DQZ256rr %ymm16
%ymm16 = VCVTPS2DQZ256rr %ymm16
- ; CHECK: %ymm16 = VCVTPS2PDZ256rm %rdi, %ymm16, 1, _, 0
- %ymm16 = VCVTPS2PDZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %ymm16 = VCVTPS2PDZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %ymm16 = VCVTPS2PDZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %ymm16 = VCVTPS2PDZ256rr %xmm0
%ymm16 = VCVTPS2PDZ256rr %xmm0
- ; CHECK: VCVTPS2PHZ256mr %rdi, %ymm16, 1, _, 0, _, _
- VCVTPS2PHZ256mr %rdi, %ymm16, 1, _, 0, _, _
- ; CHECK: %xmm0 = VCVTPS2PHZ256rr %ymm16, _
- %xmm0 = VCVTPS2PHZ256rr %ymm16, _
- ; CHECK: %ymm16 = VCVTPH2PSZ256rm %rdi, %ymm16, 1, _, 0
- %ymm16 = VCVTPH2PSZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: VCVTPS2PHZ256mr %rdi, %ymm16, 1, %noreg, 0, %noreg, %noreg
+ VCVTPS2PHZ256mr %rdi, %ymm16, 1, %noreg, 0, %noreg, %noreg
+ ; CHECK: %xmm0 = VCVTPS2PHZ256rr %ymm16, %noreg
+ %xmm0 = VCVTPS2PHZ256rr %ymm16, %noreg
+ ; CHECK: %ymm16 = VCVTPH2PSZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %ymm16 = VCVTPH2PSZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %ymm16 = VCVTPH2PSZ256rr %xmm0
%ymm16 = VCVTPH2PSZ256rr %xmm0
- ; CHECK: %xmm0 = VCVTTPD2DQZ256rm %rdi, %ymm16, 1, _, 0
- %xmm0 = VCVTTPD2DQZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %xmm0 = VCVTTPD2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %xmm0 = VCVTTPD2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %xmm0 = VCVTTPD2DQZ256rr %ymm16
%xmm0 = VCVTTPD2DQZ256rr %ymm16
- ; CHECK: %ymm16 = VCVTTPS2DQZ256rm %rdi, %ymm16, 1, _, 0
- %ymm16 = VCVTTPS2DQZ256rm %rdi, %ymm16, 1, _, 0
+ ; CHECK: %ymm16 = VCVTTPS2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
+ %ymm16 = VCVTTPS2DQZ256rm %rdi, %ymm16, 1, %noreg, 0
; CHECK: %ymm16 = VCVTTPS2DQZ256rr %ymm16
%ymm16 = VCVTTPS2DQZ256rr %ymm16
- ; CHECK: %ymm16 = VSQRTPDZ256m %rdi, _, _, _, _
- %ymm16 = VSQRTPDZ256m %rdi, _, _, _, _
+ ; CHECK: %ymm16 = VSQRTPDZ256m %rdi, %noreg, %noreg, %noreg, %noreg
+ %ymm16 = VSQRTPDZ256m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %ymm16 = VSQRTPDZ256r %ymm16
%ymm16 = VSQRTPDZ256r %ymm16
- ; CHECK: %ymm16 = VSQRTPSZ256m %rdi, _, _, _, _
- %ymm16 = VSQRTPSZ256m %rdi, _, _, _, _
+ ; CHECK: %ymm16 = VSQRTPSZ256m %rdi, %noreg, %noreg, %noreg, %noreg
+ %ymm16 = VSQRTPSZ256m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %ymm16 = VSQRTPSZ256r %ymm16
%ymm16 = VSQRTPSZ256r %ymm16
- ; CHECK: %ymm16 = VPALIGNRZ256rmi %ymm16, %rdi, _, _, _, _, _
- %ymm16 = VPALIGNRZ256rmi %ymm16, %rdi, _, _, _, _, _
- ; CHECK: %ymm16 = VPALIGNRZ256rri %ymm16, %ymm1, _
- %ymm16 = VPALIGNRZ256rri %ymm16, %ymm1, _
- ; CHECK: %ymm16 = VMOVUPSZ256rm %rdi, 1, _, 0, _
- %ymm16 = VMOVUPSZ256rm %rdi, 1, _, 0, _
+ ; CHECK: %ymm16 = VPALIGNRZ256rmi %ymm16, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm16 = VPALIGNRZ256rmi %ymm16, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %ymm16 = VPALIGNRZ256rri %ymm16, %ymm1, %noreg
+ %ymm16 = VPALIGNRZ256rri %ymm16, %ymm1, %noreg
+ ; CHECK: %ymm16 = VMOVUPSZ256rm %rdi, 1, %noreg, 0, %noreg
+ %ymm16 = VMOVUPSZ256rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %ymm16 = VMOVUPSZ256rr %ymm16
%ymm16 = VMOVUPSZ256rr %ymm16
; CHECK: %ymm16 = VMOVUPSZ256rr_REV %ymm16
%ymm16 = VMOVUPSZ256rr_REV %ymm16
- ; CHECK: %ymm16 = VPSHUFBZ256rm %ymm16, _, _, _, _, _
- %ymm16 = VPSHUFBZ256rm %ymm16, _, _, _, _, _
+ ; CHECK: %ymm16 = VPSHUFBZ256rm %ymm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm16 = VPSHUFBZ256rm %ymm16, %noreg, %noreg, %noreg, %noreg, %noreg
; CHECK: %ymm16 = VPSHUFBZ256rr %ymm16, %ymm1
%ymm16 = VPSHUFBZ256rr %ymm16, %ymm1
- ; CHECK: %ymm16 = VPSHUFDZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPSHUFDZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPSHUFDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPSHUFDZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPSHUFDZ256ri %ymm16, -24
%ymm16 = VPSHUFDZ256ri %ymm16, -24
- ; CHECK: %ymm16 = VPSHUFHWZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPSHUFHWZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPSHUFHWZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPSHUFHWZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPSHUFHWZ256ri %ymm16, -24
%ymm16 = VPSHUFHWZ256ri %ymm16, -24
- ; CHECK: %ymm16 = VPSHUFLWZ256mi %rdi, 1, _, 0, _, _
- %ymm16 = VPSHUFLWZ256mi %rdi, 1, _, 0, _, _
+ ; CHECK: %ymm16 = VPSHUFLWZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %ymm16 = VPSHUFLWZ256mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %ymm16 = VPSHUFLWZ256ri %ymm16, -24
%ymm16 = VPSHUFLWZ256ri %ymm16, -24
- ; CHECK: %ymm16 = VSHUFPDZ256rmi %ymm16, _, _, _, _, _, _
- %ymm16 = VSHUFPDZ256rmi %ymm16, _, _, _, _, _, _
- ; CHECK: %ymm16 = VSHUFPDZ256rri %ymm16, _, _
- %ymm16 = VSHUFPDZ256rri %ymm16, _, _
- ; CHECK: %ymm16 = VSHUFPSZ256rmi %ymm16, _, _, _, _, _, _
- %ymm16 = VSHUFPSZ256rmi %ymm16, _, _, _, _, _, _
- ; CHECK: %ymm16 = VSHUFPSZ256rri %ymm16, _, _
- %ymm16 = VSHUFPSZ256rri %ymm16, _, _
+ ; CHECK: %ymm16 = VSHUFPDZ256rmi %ymm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm16 = VSHUFPDZ256rmi %ymm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %ymm16 = VSHUFPDZ256rri %ymm16, %noreg, %noreg
+ %ymm16 = VSHUFPDZ256rri %ymm16, %noreg, %noreg
+ ; CHECK: %ymm16 = VSHUFPSZ256rmi %ymm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %ymm16 = VSHUFPSZ256rmi %ymm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %ymm16 = VSHUFPSZ256rri %ymm16, %noreg, %noreg
+ %ymm16 = VSHUFPSZ256rri %ymm16, %noreg, %noreg
RET 0, %zmm0, %zmm1
...
@@ -3208,80 +3208,80 @@ body: |
name: evex_z128_to_evex_test
body: |
bb.0:
- ; CHECK: VMOVAPDZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVAPDZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVAPDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVAPDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVAPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVAPDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVAPDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVAPDZ128rr %xmm16
%xmm16 = VMOVAPDZ128rr %xmm16
- ; CHECK: VMOVAPSZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVAPSZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVAPSZ128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVAPSZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVAPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVAPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVAPSZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVAPSZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVAPSZ128rr %xmm16
%xmm16 = VMOVAPSZ128rr %xmm16
- ; CHECK: VMOVDQA32Z128mr %rdi, 1, _, 0, _, %xmm16
- VMOVDQA32Z128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVDQA32Z128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVDQA32Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQA32Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVDQA32Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVDQA32Z128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVDQA32Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVDQA32Z128rr %xmm16
%xmm16 = VMOVDQA32Z128rr %xmm16
- ; CHECK: VMOVDQA64Z128mr %rdi, 1, _, 0, _, %xmm16
- VMOVDQA64Z128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVDQA64Z128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVDQA64Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQA64Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVDQA64Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVDQA64Z128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVDQA64Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVDQA64Z128rr %xmm16
%xmm16 = VMOVDQA64Z128rr %xmm16
- ; CHECK: VMOVDQU16Z128mr %rdi, 1, _, 0, _, %xmm16
- VMOVDQU16Z128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVDQU16Z128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVDQU16Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU16Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVDQU16Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVDQU16Z128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVDQU16Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVDQU16Z128rr %xmm16
%xmm16 = VMOVDQU16Z128rr %xmm16
- ; CHECK: VMOVDQU32Z128mr %rdi, 1, _, 0, _, %xmm16
- VMOVDQU32Z128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVDQU32Z128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVDQU32Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU32Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVDQU32Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVDQU32Z128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVDQU32Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVDQU32Z128rr %xmm16
%xmm16 = VMOVDQU32Z128rr %xmm16
- ; CHECK: VMOVDQU64Z128mr %rdi, 1, _, 0, _, %xmm16
- VMOVDQU64Z128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVDQU64Z128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVDQU64Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU64Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVDQU64Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVDQU64Z128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVDQU64Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVDQU64Z128rr %xmm16
%xmm16 = VMOVDQU64Z128rr %xmm16
- ; CHECK: VMOVDQU8Z128mr %rdi, 1, _, 0, _, %xmm16
- VMOVDQU8Z128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVDQU8Z128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVDQU8Z128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVDQU8Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVDQU8Z128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVDQU8Z128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVDQU8Z128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVDQU8Z128rr %xmm16
%xmm16 = VMOVDQU8Z128rr %xmm16
; CHECK: %xmm16 = VMOVDQU8Z128rr_REV %xmm16
%xmm16 = VMOVDQU8Z128rr_REV %xmm16
- ; CHECK: %xmm16 = VMOVNTDQAZ128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVNTDQAZ128rm %rip, 1, _, %rax, _
- ; CHECK: VMOVUPDZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVUPDZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVUPDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVUPDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMOVNTDQAZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVNTDQAZ128rm %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: VMOVUPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVUPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVUPDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVUPDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVUPDZ128rr %xmm16
%xmm16 = VMOVUPDZ128rr %xmm16
; CHECK: %xmm16 = VMOVUPDZ128rr_REV %xmm16
%xmm16 = VMOVUPDZ128rr_REV %xmm16
- ; CHECK: VMOVUPSZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVUPSZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVUPSZ128rm %rip, 1, _, %rax, _
- %xmm16 = VMOVUPSZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VMOVUPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVUPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVUPSZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMOVUPSZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMOVUPSZ128rr %xmm16
%xmm16 = VMOVUPSZ128rr %xmm16
; CHECK: %xmm16 = VMOVUPSZ128rr_REV %xmm16
%xmm16 = VMOVUPSZ128rr_REV %xmm16
- ; CHECK: VMOVNTDQZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVNTDQZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: VMOVNTPDZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVNTPDZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: VMOVNTPSZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVNTPSZ128mr %rdi, 1, _, 0, _, %xmm16
+ ; CHECK: VMOVNTDQZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVNTDQZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: VMOVNTPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVNTPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: VMOVNTPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVNTPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
; CHECK: %xmm16 = VMOVAPDZ128rr_REV %xmm16
%xmm16 = VMOVAPDZ128rr_REV %xmm16
; CHECK: %xmm16 = VMOVAPSZ128rr_REV %xmm16
@@ -3296,786 +3296,786 @@ body: |
%xmm16 = VMOVDQU32Z128rr_REV %xmm16
; CHECK: %xmm16 = VMOVDQU64Z128rr_REV %xmm16
%xmm16 = VMOVDQU64Z128rr_REV %xmm16
- ; CHECK: %xmm16 = VPMOVSXBDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVSXBDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVSXBDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVSXBDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVSXBDZ128rr %xmm16
%xmm16 = VPMOVSXBDZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVSXBQZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVSXBQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVSXBQZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVSXBQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVSXBQZ128rr %xmm16
%xmm16 = VPMOVSXBQZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVSXBWZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVSXBWZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVSXBWZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVSXBWZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVSXBWZ128rr %xmm16
%xmm16 = VPMOVSXBWZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVSXDQZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVSXDQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVSXDQZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVSXDQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVSXDQZ128rr %xmm16
%xmm16 = VPMOVSXDQZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVSXWDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVSXWDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVSXWDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVSXWDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVSXWDZ128rr %xmm16
%xmm16 = VPMOVSXWDZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVSXWQZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVSXWQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVSXWQZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVSXWQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVSXWQZ128rr %xmm16
%xmm16 = VPMOVSXWQZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVZXBDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVZXBDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVZXBDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVZXBDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVZXBDZ128rr %xmm16
%xmm16 = VPMOVZXBDZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVZXBQZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVZXBQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVZXBQZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVZXBQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVZXBQZ128rr %xmm16
%xmm16 = VPMOVZXBQZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVZXBWZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVZXBWZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVZXBWZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVZXBWZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVZXBWZ128rr %xmm16
%xmm16 = VPMOVZXBWZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVZXDQZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVZXDQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVZXDQZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVZXDQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVZXDQZ128rr %xmm16
%xmm16 = VPMOVZXDQZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVZXWDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVZXWDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVZXWDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVZXWDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVZXWDZ128rr %xmm16
%xmm16 = VPMOVZXWDZ128rr %xmm16
- ; CHECK: %xmm16 = VPMOVZXWQZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPMOVZXWQZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMOVZXWQZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMOVZXWQZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMOVZXWQZ128rr %xmm16
%xmm16 = VPMOVZXWQZ128rr %xmm16
- ; CHECK: VMOVHPDZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVHPDZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVHPDZ128rm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VMOVHPDZ128rm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: VMOVHPSZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVHPSZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVHPSZ128rm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VMOVHPSZ128rm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: VMOVLPDZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVLPDZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVLPDZ128rm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VMOVLPDZ128rm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: VMOVLPSZ128mr %rdi, 1, _, 0, _, %xmm16
- VMOVLPSZ128mr %rdi, 1, _, 0, _, %xmm16
- ; CHECK: %xmm16 = VMOVLPSZ128rm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VMOVLPSZ128rm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VMAXCPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXCPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: VMOVHPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVHPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVHPDZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVHPDZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: VMOVHPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVHPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVHPSZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVHPSZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: VMOVLPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVLPDZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVLPDZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVLPDZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: VMOVLPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ VMOVLPSZ128mr %rdi, 1, %noreg, 0, %noreg, %xmm16
+ ; CHECK: %xmm16 = VMOVLPSZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVLPSZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VMAXCPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXCPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXCPDZ128rr %xmm16, %xmm1
%xmm16 = VMAXCPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXCPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXCPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXCPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXCPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXCPSZ128rr %xmm16, %xmm1
%xmm16 = VMAXCPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXPDZ128rr %xmm16, %xmm1
%xmm16 = VMAXPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXPSZ128rr %xmm16, %xmm1
%xmm16 = VMAXPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINCPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINCPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINCPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINCPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINCPDZ128rr %xmm16, %xmm1
%xmm16 = VMINCPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINCPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINCPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINCPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINCPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINCPSZ128rr %xmm16, %xmm1
%xmm16 = VMINCPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINPDZ128rr %xmm16, %xmm1
%xmm16 = VMINPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINPSZ128rr %xmm16, %xmm1
%xmm16 = VMINPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMULPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMULPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMULPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMULPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMULPDZ128rr %xmm16, %xmm1
%xmm16 = VMULPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMULPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMULPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMULPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMULPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMULPSZ128rr %xmm16, %xmm1
%xmm16 = VMULPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VORPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VORPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VORPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VORPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VORPDZ128rr %xmm16, %xmm1
%xmm16 = VORPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VORPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VORPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VORPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VORPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VORPSZ128rr %xmm16, %xmm1
%xmm16 = VORPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDBZ128rr %xmm16, %xmm1
%xmm16 = VPADDBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDDZ128rr %xmm16, %xmm1
%xmm16 = VPADDDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDQZ128rr %xmm16, %xmm1
%xmm16 = VPADDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDSBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDSBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDSBZ128rr %xmm16, %xmm1
%xmm16 = VPADDSBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDSWZ128rr %xmm16, %xmm1
%xmm16 = VPADDSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDUSBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDUSBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDUSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDUSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDUSBZ128rr %xmm16, %xmm1
%xmm16 = VPADDUSBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDUSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDUSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDUSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDUSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDUSWZ128rr %xmm16, %xmm1
%xmm16 = VPADDUSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPADDWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPADDWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPADDWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPADDWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPADDWZ128rr %xmm16, %xmm1
%xmm16 = VPADDWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPANDDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPANDDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPANDDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPANDDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPANDDZ128rr %xmm16, %xmm1
%xmm16 = VPANDDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPANDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPANDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPANDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPANDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPANDQZ128rr %xmm16, %xmm1
%xmm16 = VPANDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPANDNDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPANDNDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPANDNDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPANDNDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPANDNDZ128rr %xmm16, %xmm1
%xmm16 = VPANDNDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPANDNQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPANDNQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPANDNQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPANDNQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPANDNQZ128rr %xmm16, %xmm1
%xmm16 = VPANDNQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPAVGBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPAVGBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPAVGBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPAVGBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPAVGBZ128rr %xmm16, %xmm1
%xmm16 = VPAVGBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPAVGWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPAVGWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPAVGWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPAVGWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPAVGWZ128rr %xmm16, %xmm1
%xmm16 = VPAVGWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMAXSBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMAXSBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMAXSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMAXSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMAXSBZ128rr %xmm16, %xmm1
%xmm16 = VPMAXSBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMAXSDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMAXSDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMAXSDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMAXSDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMAXSDZ128rr %xmm16, %xmm1
%xmm16 = VPMAXSDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMAXSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMAXSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMAXSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMAXSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMAXSWZ128rr %xmm16, %xmm1
%xmm16 = VPMAXSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMAXUBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMAXUBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMAXUBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMAXUBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMAXUBZ128rr %xmm16, %xmm1
%xmm16 = VPMAXUBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMAXUDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMAXUDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMAXUDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMAXUDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMAXUDZ128rr %xmm16, %xmm1
%xmm16 = VPMAXUDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMAXUWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMAXUWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMAXUWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMAXUWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMAXUWZ128rr %xmm16, %xmm1
%xmm16 = VPMAXUWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMINSBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMINSBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMINSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMINSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMINSBZ128rr %xmm16, %xmm1
%xmm16 = VPMINSBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMINSDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMINSDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMINSDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMINSDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMINSDZ128rr %xmm16, %xmm1
%xmm16 = VPMINSDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMINSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMINSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMINSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMINSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMINSWZ128rr %xmm16, %xmm1
%xmm16 = VPMINSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMINUBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMINUBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMINUBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMINUBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMINUBZ128rr %xmm16, %xmm1
%xmm16 = VPMINUBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMINUDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMINUDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMINUDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMINUDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMINUDZ128rr %xmm16, %xmm1
%xmm16 = VPMINUDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMINUWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMINUWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMINUWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMINUWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMINUWZ128rr %xmm16, %xmm1
%xmm16 = VPMINUWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULDQZ128rr %xmm16, %xmm1
%xmm16 = VPMULDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULHRSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULHRSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULHRSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULHRSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULHRSWZ128rr %xmm16, %xmm1
%xmm16 = VPMULHRSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULHUWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULHUWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULHUWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULHUWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULHUWZ128rr %xmm16, %xmm1
%xmm16 = VPMULHUWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULHWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULHWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULHWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULHWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULHWZ128rr %xmm16, %xmm1
%xmm16 = VPMULHWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULLDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULLDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULLDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULLDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULLDZ128rr %xmm16, %xmm1
%xmm16 = VPMULLDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULLWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULLWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULLWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULLWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULLWZ128rr %xmm16, %xmm1
%xmm16 = VPMULLWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMULUDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMULUDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMULUDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMULUDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMULUDQZ128rr %xmm16, %xmm1
%xmm16 = VPMULUDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPORDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPORDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPORDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPORDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPORDZ128rr %xmm16, %xmm1
%xmm16 = VPORDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPORQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPORQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPORQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPORQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPORQZ128rr %xmm16, %xmm1
%xmm16 = VPORQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBBZ128rr %xmm16, %xmm1
%xmm16 = VPSUBBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBDZ128rr %xmm16, %xmm1
%xmm16 = VPSUBDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBQZ128rr %xmm16, %xmm1
%xmm16 = VPSUBQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBSBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBSBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBSBZ128rr %xmm16, %xmm1
%xmm16 = VPSUBSBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBSWZ128rr %xmm16, %xmm1
%xmm16 = VPSUBSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBUSBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBUSBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBUSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBUSBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBUSBZ128rr %xmm16, %xmm1
%xmm16 = VPSUBUSBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBUSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBUSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBUSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBUSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBUSWZ128rr %xmm16, %xmm1
%xmm16 = VPSUBUSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSUBWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSUBWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSUBWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSUBWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSUBWZ128rr %xmm16, %xmm1
%xmm16 = VPSUBWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VADDPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VADDPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VADDPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VADDPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VADDPDZ128rr %xmm16, %xmm1
%xmm16 = VADDPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VADDPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VADDPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VADDPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VADDPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VADDPSZ128rr %xmm16, %xmm1
%xmm16 = VADDPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VANDNPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VANDNPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VANDNPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VANDNPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VANDNPDZ128rr %xmm16, %xmm1
%xmm16 = VANDNPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VANDNPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VANDNPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VANDNPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VANDNPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VANDNPSZ128rr %xmm16, %xmm1
%xmm16 = VANDNPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VANDPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VANDPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VANDPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VANDPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VANDPDZ128rr %xmm16, %xmm1
%xmm16 = VANDPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VANDPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VANDPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VANDPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VANDPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VANDPSZ128rr %xmm16, %xmm1
%xmm16 = VANDPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VDIVPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VDIVPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VDIVPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VDIVPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VDIVPDZ128rr %xmm16, %xmm1
%xmm16 = VDIVPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VDIVPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VDIVPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VDIVPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VDIVPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VDIVPSZ128rr %xmm16, %xmm1
%xmm16 = VDIVPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPXORDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPXORDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPXORDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPXORDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPXORDZ128rr %xmm16, %xmm1
%xmm16 = VPXORDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPXORQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPXORQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPXORQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPXORQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPXORQZ128rr %xmm16, %xmm1
%xmm16 = VPXORQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VSUBPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VSUBPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VSUBPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VSUBPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VSUBPDZ128rr %xmm16, %xmm1
%xmm16 = VSUBPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VSUBPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VSUBPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VSUBPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VSUBPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VSUBPSZ128rr %xmm16, %xmm1
%xmm16 = VSUBPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VXORPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VXORPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VXORPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VXORPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VXORPDZ128rr %xmm16, %xmm1
%xmm16 = VXORPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VXORPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VXORPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VXORPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VXORPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VXORPSZ128rr %xmm16, %xmm1
%xmm16 = VXORPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMADDUBSWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMADDUBSWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMADDUBSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMADDUBSWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMADDUBSWZ128rr %xmm16, %xmm1
%xmm16 = VPMADDUBSWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPMADDWDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPMADDWDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPMADDWDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPMADDWDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPMADDWDZ128rr %xmm16, %xmm1
%xmm16 = VPMADDWDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPACKSSDWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPACKSSDWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPACKSSDWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPACKSSDWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPACKSSDWZ128rr %xmm16, %xmm1
%xmm16 = VPACKSSDWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPACKSSWBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPACKSSWBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPACKSSWBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPACKSSWBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPACKSSWBZ128rr %xmm16, %xmm1
%xmm16 = VPACKSSWBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPACKUSDWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPACKUSDWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPACKUSDWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPACKUSDWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPACKUSDWZ128rr %xmm16, %xmm1
%xmm16 = VPACKUSDWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPACKUSWBZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPACKUSWBZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPACKUSWBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPACKUSWBZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPACKUSWBZ128rr %xmm16, %xmm1
%xmm16 = VPACKUSWBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKHBWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKHBWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKHBWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKHBWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKHBWZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKHBWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKHDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKHDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKHDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKHDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKHDQZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKHDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKHQDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKHQDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKHQDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKHQDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKHQDQZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKHQDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKHWDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKHWDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKHWDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKHWDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKHWDZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKHWDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKLBWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKLBWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKLBWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKLBWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKLBWZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKLBWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKLDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKLDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKLDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKLDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKLDQZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKLDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKLQDQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKLQDQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKLQDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKLQDQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKLQDQZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKLQDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPUNPCKLWDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPUNPCKLWDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPUNPCKLWDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPUNPCKLWDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPUNPCKLWDZ128rr %xmm16, %xmm1
%xmm16 = VPUNPCKLWDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VUNPCKHPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VUNPCKHPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VUNPCKHPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VUNPCKHPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VUNPCKHPDZ128rr %xmm16, %xmm1
%xmm16 = VUNPCKHPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VUNPCKHPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VUNPCKHPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VUNPCKHPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VUNPCKHPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VUNPCKHPSZ128rr %xmm16, %xmm1
%xmm16 = VUNPCKHPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VUNPCKLPDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VUNPCKLPDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VUNPCKLPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VUNPCKLPDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VUNPCKLPDZ128rr %xmm16, %xmm1
%xmm16 = VUNPCKLPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VUNPCKLPSZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VUNPCKLPSZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VUNPCKLPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VUNPCKLPSZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VUNPCKLPSZ128rr %xmm16, %xmm1
%xmm16 = VUNPCKLPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VFMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD132PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD132PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD132PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD132PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD213PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD213PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD213PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD213PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD231PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD231PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD231PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD231PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADDSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADDSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADDSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADDSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADDSUB132PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADDSUB132PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADDSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADDSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADDSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADDSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADDSUB132PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADDSUB132PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADDSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADDSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADDSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADDSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADDSUB213PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADDSUB213PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADDSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADDSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADDSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADDSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADDSUB213PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADDSUB213PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADDSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADDSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADDSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADDSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADDSUB231PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADDSUB231PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADDSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADDSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADDSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADDSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADDSUB231PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMADDSUB231PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB132PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB132PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB132PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB132PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB213PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB213PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB213PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB213PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB231PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB231PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB231PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB231PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUBADD132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUBADD132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUBADD132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUBADD132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUBADD132PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUBADD132PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUBADD132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUBADD132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUBADD132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUBADD132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUBADD132PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUBADD132PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUBADD213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUBADD213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUBADD213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUBADD213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUBADD213PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUBADD213PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUBADD213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUBADD213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUBADD213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUBADD213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUBADD213PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUBADD213PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUBADD231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUBADD231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUBADD231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUBADD231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUBADD231PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUBADD231PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUBADD231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUBADD231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUBADD231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUBADD231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUBADD231PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUBADD231PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD132PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD132PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD132PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD132PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD213PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD213PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD213PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD213PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD231PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD231PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD231PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD231PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB132PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB132PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB132PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB132PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB132PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB132PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB213PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB213PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB213PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB213PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB213PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB213PSZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB231PDZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB231PDZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB231PDZ128r %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB231PSZ128m %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB231PSZ128r %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB231PSZ128r %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VPSLLDZ128ri %xmm16, 7
%xmm16 = VPSLLDZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSLLDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSLLDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSLLDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSLLDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSLLDZ128rr %xmm16, 14
%xmm16 = VPSLLDZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSLLQZ128ri %xmm16, 7
%xmm16 = VPSLLQZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSLLQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSLLQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSLLQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSLLQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSLLQZ128rr %xmm16, 14
%xmm16 = VPSLLQZ128rr %xmm16, 14
- ; CHECK: %xmm16 = VPSLLVDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSLLVDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSLLVDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSLLVDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSLLVDZ128rr %xmm16, 14
%xmm16 = VPSLLVDZ128rr %xmm16, 14
- ; CHECK: %xmm16 = VPSLLVQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSLLVQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSLLVQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSLLVQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSLLVQZ128rr %xmm16, 14
%xmm16 = VPSLLVQZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSLLWZ128ri %xmm16, 7
%xmm16 = VPSLLWZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSLLWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSLLWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSLLWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSLLWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSLLWZ128rr %xmm16, 14
%xmm16 = VPSLLWZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSRADZ128ri %xmm16, 7
%xmm16 = VPSRADZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSRADZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRADZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRADZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRADZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRADZ128rr %xmm16, 14
%xmm16 = VPSRADZ128rr %xmm16, 14
- ; CHECK: %xmm16 = VPSRAVDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRAVDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRAVDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRAVDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRAVDZ128rr %xmm16, 14
%xmm16 = VPSRAVDZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSRAWZ128ri %xmm16, 7
%xmm16 = VPSRAWZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSRAWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRAWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRAWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRAWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRAWZ128rr %xmm16, 14
%xmm16 = VPSRAWZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSRLDQZ128rr %xmm16, 14
%xmm16 = VPSRLDQZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSRLDZ128ri %xmm16, 7
%xmm16 = VPSRLDZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSRLDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRLDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRLDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRLDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRLDZ128rr %xmm16, 14
%xmm16 = VPSRLDZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSRLQZ128ri %xmm16, 7
%xmm16 = VPSRLQZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSRLQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRLQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRLQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRLQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRLQZ128rr %xmm16, 14
%xmm16 = VPSRLQZ128rr %xmm16, 14
- ; CHECK: %xmm16 = VPSRLVDZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRLVDZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRLVDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRLVDZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRLVDZ128rr %xmm16, 14
%xmm16 = VPSRLVDZ128rr %xmm16, 14
- ; CHECK: %xmm16 = VPSRLVQZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRLVQZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRLVQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRLVQZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRLVQZ128rr %xmm16, 14
%xmm16 = VPSRLVQZ128rr %xmm16, 14
; CHECK: %xmm16 = VPSRLWZ128ri %xmm16, 7
%xmm16 = VPSRLWZ128ri %xmm16, 7
- ; CHECK: %xmm16 = VPSRLWZ128rm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VPSRLWZ128rm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPSRLWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPSRLWZ128rm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPSRLWZ128rr %xmm16, 14
%xmm16 = VPSRLWZ128rr %xmm16, 14
- ; CHECK: %xmm16 = VPERMILPDZ128mi %rdi, 1, _, 0, _, _
- %xmm16 = VPERMILPDZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm16 = VPERMILPDZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm16 = VPERMILPDZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm16 = VPERMILPDZ128ri %xmm16, 9
%xmm16 = VPERMILPDZ128ri %xmm16, 9
- ; CHECK: %xmm16 = VPERMILPDZ128rm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VPERMILPDZ128rm %xmm16, %rdi, 1, _, 0, _
+ ; CHECK: %xmm16 = VPERMILPDZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VPERMILPDZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VPERMILPDZ128rr %xmm16, %xmm1
%xmm16 = VPERMILPDZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPERMILPSZ128mi %rdi, 1, _, 0, _, _
- %xmm16 = VPERMILPSZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm16 = VPERMILPSZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm16 = VPERMILPSZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm16 = VPERMILPSZ128ri %xmm16, 9
%xmm16 = VPERMILPSZ128ri %xmm16, 9
- ; CHECK: %xmm16 = VPERMILPSZ128rm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VPERMILPSZ128rm %xmm16, %rdi, 1, _, 0, _
+ ; CHECK: %xmm16 = VPERMILPSZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VPERMILPSZ128rm %xmm16, %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VPERMILPSZ128rr %xmm16, %xmm1
%xmm16 = VPERMILPSZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VCVTPH2PSZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTPH2PSZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTPH2PSZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTPH2PSZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTPH2PSZ128rr %xmm16
%xmm16 = VCVTPH2PSZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTDQ2PDZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTDQ2PDZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTDQ2PDZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTDQ2PDZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTDQ2PDZ128rr %xmm16
%xmm16 = VCVTDQ2PDZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTDQ2PSZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTDQ2PSZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTDQ2PSZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTDQ2PSZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTDQ2PSZ128rr %xmm16
%xmm16 = VCVTDQ2PSZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTPD2DQZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTPD2DQZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTPD2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTPD2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTPD2DQZ128rr %xmm16
%xmm16 = VCVTPD2DQZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTPD2PSZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTPD2PSZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTPD2PSZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTPD2PSZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTPD2PSZ128rr %xmm16
%xmm16 = VCVTPD2PSZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTPS2DQZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTPS2DQZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTPS2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTPS2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTPS2DQZ128rr %xmm16
%xmm16 = VCVTPS2DQZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTPS2PDZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTPS2PDZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTPS2PDZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTPS2PDZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTPS2PDZ128rr %xmm16
%xmm16 = VCVTPS2PDZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTTPD2DQZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTTPD2DQZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTTPD2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTTPD2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTTPD2DQZ128rr %xmm16
%xmm16 = VCVTTPD2DQZ128rr %xmm16
- ; CHECK: %xmm16 = VCVTTPS2DQZ128rm %rdi, %xmm16, 1, _, 0
- %xmm16 = VCVTTPS2DQZ128rm %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VCVTTPS2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
+ %xmm16 = VCVTTPS2DQZ128rm %rdi, %xmm16, 1, %noreg, 0
; CHECK: %xmm16 = VCVTTPS2DQZ128rr %xmm16
%xmm16 = VCVTTPS2DQZ128rr %xmm16
- ; CHECK: %xmm16 = VSQRTPDZ128m %rdi, _, _, _, _
- %xmm16 = VSQRTPDZ128m %rdi, _, _, _, _
+ ; CHECK: %xmm16 = VSQRTPDZ128m %rdi, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSQRTPDZ128m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VSQRTPDZ128r %xmm16
%xmm16 = VSQRTPDZ128r %xmm16
- ; CHECK: %xmm16 = VSQRTPSZ128m %rdi, _, _, _, _
- %xmm16 = VSQRTPSZ128m %rdi, _, _, _, _
+ ; CHECK: %xmm16 = VSQRTPSZ128m %rdi, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSQRTPSZ128m %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VSQRTPSZ128r %xmm16
%xmm16 = VSQRTPSZ128r %xmm16
- ; CHECK: %xmm16 = VMOVDDUPZ128rm %rdi, 1, _, 0, _
- %xmm16 = VMOVDDUPZ128rm %rdi, 1, _, 0, _
+ ; CHECK: %xmm16 = VMOVDDUPZ128rm %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVDDUPZ128rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VMOVDDUPZ128rr %xmm16
%xmm16 = VMOVDDUPZ128rr %xmm16
- ; CHECK: %xmm16 = VMOVSHDUPZ128rm %rdi, 1, _, 0, _
- %xmm16 = VMOVSHDUPZ128rm %rdi, 1, _, 0, _
+ ; CHECK: %xmm16 = VMOVSHDUPZ128rm %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVSHDUPZ128rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VMOVSHDUPZ128rr %xmm16
%xmm16 = VMOVSHDUPZ128rr %xmm16
- ; CHECK: %xmm16 = VMOVSLDUPZ128rm %rdi, 1, _, 0, _
- %xmm16 = VMOVSLDUPZ128rm %rdi, 1, _, 0, _
+ ; CHECK: %xmm16 = VMOVSLDUPZ128rm %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VMOVSLDUPZ128rm %rdi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VMOVSLDUPZ128rr %xmm16
%xmm16 = VMOVSLDUPZ128rr %xmm16
- ; CHECK: %xmm16 = VPSHUFBZ128rm %xmm16, _, _, _, _, _
- %xmm16 = VPSHUFBZ128rm %xmm16, _, _, _, _, _
+ ; CHECK: %xmm16 = VPSHUFBZ128rm %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VPSHUFBZ128rm %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VPSHUFBZ128rr %xmm16, %xmm1
%xmm16 = VPSHUFBZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VPSHUFDZ128mi %rdi, 1, _, 0, _, _
- %xmm16 = VPSHUFDZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm16 = VPSHUFDZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm16 = VPSHUFDZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm16 = VPSHUFDZ128ri %xmm16, -24
%xmm16 = VPSHUFDZ128ri %xmm16, -24
- ; CHECK: %xmm16 = VPSHUFHWZ128mi %rdi, 1, _, 0, _, _
- %xmm16 = VPSHUFHWZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm16 = VPSHUFHWZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm16 = VPSHUFHWZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm16 = VPSHUFHWZ128ri %xmm16, -24
%xmm16 = VPSHUFHWZ128ri %xmm16, -24
- ; CHECK: %xmm16 = VPSHUFLWZ128mi %rdi, 1, _, 0, _, _
- %xmm16 = VPSHUFLWZ128mi %rdi, 1, _, 0, _, _
+ ; CHECK: %xmm16 = VPSHUFLWZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
+ %xmm16 = VPSHUFLWZ128mi %rdi, 1, %noreg, 0, %noreg, %noreg
; CHECK: %xmm16 = VPSHUFLWZ128ri %xmm16, -24
%xmm16 = VPSHUFLWZ128ri %xmm16, -24
; CHECK: %xmm16 = VPSLLDQZ128rr %xmm16, %xmm1
%xmm16 = VPSLLDQZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VSHUFPDZ128rmi %xmm16, _, _, _, _, _, _
- %xmm16 = VSHUFPDZ128rmi %xmm16, _, _, _, _, _, _
- ; CHECK: %xmm16 = VSHUFPDZ128rri %xmm16, _, _
- %xmm16 = VSHUFPDZ128rri %xmm16, _, _
- ; CHECK: %xmm16 = VSHUFPSZ128rmi %xmm16, _, _, _, _, _, _
- %xmm16 = VSHUFPSZ128rmi %xmm16, _, _, _, _, _, _
- ; CHECK: %xmm16 = VSHUFPSZ128rri %xmm16, _, _
- %xmm16 = VSHUFPSZ128rri %xmm16, _, _
- ; CHECK: %xmm16 = VPSADBWZ128rm %xmm16, 1, _, %rax, _, _
- %xmm16 = VPSADBWZ128rm %xmm16, 1, _, %rax, _, _
+ ; CHECK: %xmm16 = VSHUFPDZ128rmi %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSHUFPDZ128rmi %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VSHUFPDZ128rri %xmm16, %noreg, %noreg
+ %xmm16 = VSHUFPDZ128rri %xmm16, %noreg, %noreg
+ ; CHECK: %xmm16 = VSHUFPSZ128rmi %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSHUFPSZ128rmi %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VSHUFPSZ128rri %xmm16, %noreg, %noreg
+ %xmm16 = VSHUFPSZ128rri %xmm16, %noreg, %noreg
+ ; CHECK: %xmm16 = VPSADBWZ128rm %xmm16, 1, %noreg, %rax, %noreg, %noreg
+ %xmm16 = VPSADBWZ128rm %xmm16, 1, %noreg, %rax, %noreg, %noreg
; CHECK: %xmm16 = VPSADBWZ128rr %xmm16, %xmm1
%xmm16 = VPSADBWZ128rr %xmm16, %xmm1
- ; CHECK: %xmm16 = VBROADCASTSSZ128m %rip, _, _, _, _
- %xmm16 = VBROADCASTSSZ128m %rip, _, _, _, _
+ ; CHECK: %xmm16 = VBROADCASTSSZ128m %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VBROADCASTSSZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VBROADCASTSSZ128r %xmm16
%xmm16 = VBROADCASTSSZ128r %xmm16
- ; CHECK: %xmm16 = VPBROADCASTBZ128m %rip, _, _, _, _
- %xmm16 = VPBROADCASTBZ128m %rip, _, _, _, _
+ ; CHECK: %xmm16 = VPBROADCASTBZ128m %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VPBROADCASTBZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VPBROADCASTBZ128r %xmm16
%xmm16 = VPBROADCASTBZ128r %xmm16
- ; CHECK: %xmm16 = VPBROADCASTDZ128m %rip, _, _, _, _
- %xmm16 = VPBROADCASTDZ128m %rip, _, _, _, _
+ ; CHECK: %xmm16 = VPBROADCASTDZ128m %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VPBROADCASTDZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VPBROADCASTDZ128r %xmm16
%xmm16 = VPBROADCASTDZ128r %xmm16
- ; CHECK: %xmm16 = VPBROADCASTQZ128m %rip, _, _, _, _
- %xmm16 = VPBROADCASTQZ128m %rip, _, _, _, _
+ ; CHECK: %xmm16 = VPBROADCASTQZ128m %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VPBROADCASTQZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VPBROADCASTQZ128r %xmm16
%xmm16 = VPBROADCASTQZ128r %xmm16
- ; CHECK: %xmm16 = VPBROADCASTWZ128m %rip, _, _, _, _
- %xmm16 = VPBROADCASTWZ128m %rip, _, _, _, _
+ ; CHECK: %xmm16 = VPBROADCASTWZ128m %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VPBROADCASTWZ128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VPBROADCASTWZ128r %xmm16
%xmm16 = VPBROADCASTWZ128r %xmm16
- ; CHECK: %xmm16 = VBROADCASTI32X2Z128m %rip, _, _, _, _
- %xmm16 = VBROADCASTI32X2Z128m %rip, _, _, _, _
+ ; CHECK: %xmm16 = VBROADCASTI32X2Z128m %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VBROADCASTI32X2Z128m %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VBROADCASTI32X2Z128r %xmm0
%xmm16 = VBROADCASTI32X2Z128r %xmm0
; CHECK: %xmm16 = VCVTPS2PHZ128rr %xmm16, 2
%xmm16 = VCVTPS2PHZ128rr %xmm16, 2
- ; CHECK: VCVTPS2PHZ128mr %rdi, %xmm16, 1, _, 0, _, _
- VCVTPS2PHZ128mr %rdi, %xmm16, 1, _, 0, _, _
- ; CHECK: %xmm16 = VPABSBZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPABSBZ128rm %rip, 1, _, %rax, _
+ ; CHECK: VCVTPS2PHZ128mr %rdi, %xmm16, 1, %noreg, 0, %noreg, %noreg
+ VCVTPS2PHZ128mr %rdi, %xmm16, 1, %noreg, 0, %noreg, %noreg
+ ; CHECK: %xmm16 = VPABSBZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPABSBZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPABSBZ128rr %xmm16
%xmm16 = VPABSBZ128rr %xmm16
- ; CHECK: %xmm16 = VPABSDZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPABSDZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPABSDZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPABSDZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPABSDZ128rr %xmm16
%xmm16 = VPABSDZ128rr %xmm16
- ; CHECK: %xmm16 = VPABSWZ128rm %rip, 1, _, %rax, _
- %xmm16 = VPABSWZ128rm %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VPABSWZ128rm %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VPABSWZ128rm %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VPABSWZ128rr %xmm16
%xmm16 = VPABSWZ128rr %xmm16
- ; CHECK: %xmm16 = VPALIGNRZ128rmi %xmm16, _, _, _, _, _, _
- %xmm16 = VPALIGNRZ128rmi %xmm16, _, _, _, _, _, _
+ ; CHECK: %xmm16 = VPALIGNRZ128rmi %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VPALIGNRZ128rmi %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VPALIGNRZ128rri %xmm16, %xmm1, 15
%xmm16 = VPALIGNRZ128rri %xmm16, %xmm1, 15
- ; CHECK: VEXTRACTPSZmr %rdi, 1, _, 0, _, %xmm16, _
- VEXTRACTPSZmr %rdi, 1, _, 0, _, %xmm16, _
- ; CHECK: %eax = VEXTRACTPSZrr %xmm16, _
- %eax = VEXTRACTPSZrr %xmm16, _
- ; CHECK: %xmm16 = VINSERTPSZrm %xmm16, %rdi, _, _, _, _, _
- %xmm16 = VINSERTPSZrm %xmm16, %rdi, _, _, _, _, _
- ; CHECK: %xmm16 = VINSERTPSZrr %xmm16, %xmm16, _
- %xmm16 = VINSERTPSZrr %xmm16, %xmm16, _
+ ; CHECK: VEXTRACTPSZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, %noreg
+ VEXTRACTPSZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, %noreg
+ ; CHECK: %eax = VEXTRACTPSZrr %xmm16, %noreg
+ %eax = VEXTRACTPSZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VINSERTPSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VINSERTPSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VINSERTPSZrr %xmm16, %xmm16, %noreg
+ %xmm16 = VINSERTPSZrr %xmm16, %xmm16, %noreg
RET 0, %zmm0, %zmm1
...
@@ -4086,548 +4086,548 @@ body: |
name: evex_scalar_to_evex_test
body: |
bb.0:
- ; CHECK: %xmm16 = VADDSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VADDSDZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VADDSDZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VADDSDZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VADDSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VADDSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VADDSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VADDSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VADDSDZrr %xmm16, %xmm1
%xmm16 = VADDSDZrr %xmm16, %xmm1
; CHECK: %xmm16 = VADDSDZrr_Int %xmm16, %xmm1
%xmm16 = VADDSDZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VADDSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VADDSSZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VADDSSZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VADDSSZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VADDSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VADDSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VADDSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VADDSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VADDSSZrr %xmm16, %xmm1
%xmm16 = VADDSSZrr %xmm16, %xmm1
; CHECK: %xmm16 = VADDSSZrr_Int %xmm16, %xmm1
%xmm16 = VADDSSZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VDIVSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VDIVSDZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VDIVSDZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VDIVSDZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VDIVSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VDIVSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VDIVSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VDIVSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VDIVSDZrr %xmm16, %xmm1
%xmm16 = VDIVSDZrr %xmm16, %xmm1
; CHECK: %xmm16 = VDIVSDZrr_Int %xmm16, %xmm1
%xmm16 = VDIVSDZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VDIVSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VDIVSSZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VDIVSSZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VDIVSSZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VDIVSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VDIVSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VDIVSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VDIVSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VDIVSSZrr %xmm16, %xmm1
%xmm16 = VDIVSSZrr %xmm16, %xmm1
; CHECK: %xmm16 = VDIVSSZrr_Int %xmm16, %xmm1
%xmm16 = VDIVSSZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXCSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXCSDZrm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXCSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXCSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXCSDZrr %xmm16, %xmm1
%xmm16 = VMAXCSDZrr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXCSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXCSSZrm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXCSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXCSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXCSSZrr %xmm16, %xmm1
%xmm16 = VMAXCSSZrr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXSDZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VMAXSDZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXSDZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VMAXSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXSDZrr %xmm16, %xmm1
%xmm16 = VMAXSDZrr %xmm16, %xmm1
; CHECK: %xmm16 = VMAXSDZrr_Int %xmm16, %xmm1
%xmm16 = VMAXSDZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VMAXSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXSSZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VMAXSSZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMAXSSZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMAXSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VMAXSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMAXSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMAXSSZrr %xmm16, %xmm1
%xmm16 = VMAXSSZrr %xmm16, %xmm1
; CHECK: %xmm16 = VMAXSSZrr_Int %xmm16, %xmm1
%xmm16 = VMAXSSZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINCSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINCSDZrm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINCSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINCSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINCSDZrr %xmm16, %xmm1
%xmm16 = VMINCSDZrr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINCSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINCSSZrm %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINCSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINCSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINCSSZrr %xmm16, %xmm1
%xmm16 = VMINCSSZrr %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINSDZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VMINSDZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINSDZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VMINSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINSDZrr %xmm16, %xmm1
%xmm16 = VMINSDZrr %xmm16, %xmm1
; CHECK: %xmm16 = VMINSDZrr_Int %xmm16, %xmm1
%xmm16 = VMINSDZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VMINSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINSSZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VMINSSZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMINSSZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMINSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VMINSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMINSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMINSSZrr %xmm16, %xmm1
%xmm16 = VMINSSZrr %xmm16, %xmm1
; CHECK: %xmm16 = VMINSSZrr_Int %xmm16, %xmm1
%xmm16 = VMINSSZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VMULSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMULSDZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VMULSDZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMULSDZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMULSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMULSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VMULSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMULSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMULSDZrr %xmm16, %xmm1
%xmm16 = VMULSDZrr %xmm16, %xmm1
; CHECK: %xmm16 = VMULSDZrr_Int %xmm16, %xmm1
%xmm16 = VMULSDZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VMULSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMULSSZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VMULSSZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VMULSSZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VMULSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMULSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VMULSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VMULSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VMULSSZrr %xmm16, %xmm1
%xmm16 = VMULSSZrr %xmm16, %xmm1
; CHECK: %xmm16 = VMULSSZrr_Int %xmm16, %xmm1
%xmm16 = VMULSSZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VSUBSDZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VSUBSDZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VSUBSDZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VSUBSDZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VSUBSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VSUBSDZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VSUBSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VSUBSDZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VSUBSDZrr %xmm16, %xmm1
%xmm16 = VSUBSDZrr %xmm16, %xmm1
; CHECK: %xmm16 = VSUBSDZrr_Int %xmm16, %xmm1
%xmm16 = VSUBSDZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VSUBSSZrm %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VSUBSSZrm %xmm16, %rip, 1, _, %rax, _
- ; CHECK: %xmm16 = VSUBSSZrm_Int %xmm16, %rip, 1, _, %rax, _
- %xmm16 = VSUBSSZrm_Int %xmm16, %rip, 1, _, %rax, _
+ ; CHECK: %xmm16 = VSUBSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VSUBSSZrm %xmm16, %rip, 1, %noreg, %rax, %noreg
+ ; CHECK: %xmm16 = VSUBSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
+ %xmm16 = VSUBSSZrm_Int %xmm16, %rip, 1, %noreg, %rax, %noreg
; CHECK: %xmm16 = VSUBSSZrr %xmm16, %xmm1
%xmm16 = VSUBSSZrr %xmm16, %xmm1
; CHECK: %xmm16 = VSUBSSZrr_Int %xmm16, %xmm1
%xmm16 = VSUBSSZrr_Int %xmm16, %xmm1
- ; CHECK: %xmm16 = VFMADD132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD132SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD132SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMADD132SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD132SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD132SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD132SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMADD132SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD132SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD213SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD213SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMADD213SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD213SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD213SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD213SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMADD213SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD213SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD231SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD231SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMADD231SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD231SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMADD231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMADD231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMADD231SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD231SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMADD231SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMADD231SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB132SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB132SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMSUB132SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB132SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB132SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB132SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMSUB132SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB132SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB213SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB213SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMSUB213SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB213SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB213SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB213SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMSUB213SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB213SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB231SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB231SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMSUB231SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB231SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFMSUB231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFMSUB231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFMSUB231SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB231SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFMSUB231SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFMSUB231SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD132SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD132SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMADD132SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD132SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD132SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD132SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMADD132SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD132SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD213SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD213SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMADD213SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD213SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD213SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD213SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMADD213SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD213SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD231SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD231SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMADD231SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD231SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMADD231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMADD231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMADD231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMADD231SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD231SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMADD231SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMADD231SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB132SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB132SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB132SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB132SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB132SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMSUB132SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB132SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB132SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB132SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB132SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB132SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB132SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMSUB132SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB132SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB213SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB213SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB213SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB213SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB213SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMSUB213SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB213SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB213SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB213SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB213SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB213SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB213SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMSUB213SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB213SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB231SDZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB231SDZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB231SDZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB231SDZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB231SDZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMSUB231SDZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB231SDZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: %xmm16 = VFNMSUB231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB231SSZm %xmm16, %xmm16, %rsi, 1, _, 0, _
- ; CHECK: %xmm16 = VFNMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
- %xmm16 = VFNMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, _, 0, _
+ ; CHECK: %xmm16 = VFNMSUB231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB231SSZm %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VFNMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
+ %xmm16 = VFNMSUB231SSZm_Int %xmm16, %xmm16, %rsi, 1, %noreg, 0, %noreg
; CHECK: %xmm16 = VFNMSUB231SSZr %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB231SSZr %xmm16, %xmm1, %xmm2
; CHECK: %xmm16 = VFNMSUB231SSZr_Int %xmm16, %xmm1, %xmm2
%xmm16 = VFNMSUB231SSZr_Int %xmm16, %xmm1, %xmm2
- ; CHECK: VPEXTRBZmr %rdi, 1, _, 0, _, %xmm16, 3
- VPEXTRBZmr %rdi, 1, _, 0, _, %xmm16, 3
+ ; CHECK: VPEXTRBZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
+ VPEXTRBZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
; CHECK: %eax = VPEXTRBZrr %xmm16, 1
%eax = VPEXTRBZrr %xmm16, 1
- ; CHECK: VPEXTRDZmr %rdi, 1, _, 0, _, %xmm16, 3
- VPEXTRDZmr %rdi, 1, _, 0, _, %xmm16, 3
+ ; CHECK: VPEXTRDZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
+ VPEXTRDZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
; CHECK: %eax = VPEXTRDZrr %xmm16, 1
%eax = VPEXTRDZrr %xmm16, 1
- ; CHECK: VPEXTRQZmr %rdi, 1, _, 0, _, %xmm16, 3
- VPEXTRQZmr %rdi, 1, _, 0, _, %xmm16, 3
+ ; CHECK: VPEXTRQZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
+ VPEXTRQZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
; CHECK: %rax = VPEXTRQZrr %xmm16, 1
%rax = VPEXTRQZrr %xmm16, 1
- ; CHECK: VPEXTRWZmr %rdi, 1, _, 0, _, %xmm16, 3
- VPEXTRWZmr %rdi, 1, _, 0, _, %xmm16, 3
+ ; CHECK: VPEXTRWZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
+ VPEXTRWZmr %rdi, 1, %noreg, 0, %noreg, %xmm16, 3
; CHECK: %eax = VPEXTRWZrr %xmm16, 1
%eax = VPEXTRWZrr %xmm16, 1
; CHECK: %eax = VPEXTRWZrr_REV %xmm16, 1
%eax = VPEXTRWZrr_REV %xmm16, 1
- ; CHECK: %xmm16 = VPINSRBZrm %xmm16, %rsi, 1, _, 0, _, 3
- %xmm16 = VPINSRBZrm %xmm16, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm16 = VPINSRBZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm16 = VPINSRBZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm16 = VPINSRBZrr %xmm16, %edi, 5
%xmm16 = VPINSRBZrr %xmm16, %edi, 5
- ; CHECK: %xmm16 = VPINSRDZrm %xmm16, %rsi, 1, _, 0, _, 3
- %xmm16 = VPINSRDZrm %xmm16, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm16 = VPINSRDZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm16 = VPINSRDZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm16 = VPINSRDZrr %xmm16, %edi, 5
%xmm16 = VPINSRDZrr %xmm16, %edi, 5
- ; CHECK: %xmm16 = VPINSRQZrm %xmm16, %rsi, 1, _, 0, _, 3
- %xmm16 = VPINSRQZrm %xmm16, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm16 = VPINSRQZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm16 = VPINSRQZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm16 = VPINSRQZrr %xmm16, %rdi, 5
%xmm16 = VPINSRQZrr %xmm16, %rdi, 5
- ; CHECK: %xmm16 = VPINSRWZrm %xmm16, %rsi, 1, _, 0, _, 3
- %xmm16 = VPINSRWZrm %xmm16, %rsi, 1, _, 0, _, 3
+ ; CHECK: %xmm16 = VPINSRWZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
+ %xmm16 = VPINSRWZrm %xmm16, %rsi, 1, %noreg, 0, %noreg, 3
; CHECK: %xmm16 = VPINSRWZrr %xmm16, %edi, 5
%xmm16 = VPINSRWZrr %xmm16, %edi, 5
- ; CHECK: %xmm16 = VSQRTSDZm %xmm16, _, _, _, _, _
- %xmm16 = VSQRTSDZm %xmm16, _, _, _, _, _
- ; CHECK: %xmm16 = VSQRTSDZm_Int %xmm16, _, _, _, _, _
- %xmm16 = VSQRTSDZm_Int %xmm16, _, _, _, _, _
- ; CHECK: %xmm16 = VSQRTSDZr %xmm16, _
- %xmm16 = VSQRTSDZr %xmm16, _
- ; CHECK: %xmm16 = VSQRTSDZr_Int %xmm16, _
- %xmm16 = VSQRTSDZr_Int %xmm16, _
- ; CHECK: %xmm16 = VSQRTSSZm %xmm16, _, _, _, _, _
- %xmm16 = VSQRTSSZm %xmm16, _, _, _, _, _
- ; CHECK: %xmm16 = VSQRTSSZm_Int %xmm16, _, _, _, _, _
- %xmm16 = VSQRTSSZm_Int %xmm16, _, _, _, _, _
- ; CHECK: %xmm16 = VSQRTSSZr %xmm16, _
- %xmm16 = VSQRTSSZr %xmm16, _
- ; CHECK: %xmm16 = VSQRTSSZr_Int %xmm16, _
- %xmm16 = VSQRTSSZr_Int %xmm16, _
- ; CHECK: %rdi = VCVTSD2SI64Zrm %rdi, %xmm16, 1, _, 0
- %rdi = VCVTSD2SI64Zrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %rdi = VCVTSD2SI64Zrr %xmm16
- %rdi = VCVTSD2SI64Zrr %xmm16
- ; CHECK: %edi = VCVTSD2SIZrm %rdi, %xmm16, 1, _, 0
- %edi = VCVTSD2SIZrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %edi = VCVTSD2SIZrr %xmm16
- %edi = VCVTSD2SIZrr %xmm16
- ; CHECK: %xmm16 = VCVTSD2SSZrm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSD2SSZrm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSD2SSZrm_Int %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSD2SSZrm_Int %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSD2SSZrr %xmm16, _
- %xmm16 = VCVTSD2SSZrr %xmm16, _
- ; CHECK: %xmm16 = VCVTSD2SSZrr_Int %xmm16, _
- %xmm16 = VCVTSD2SSZrr_Int %xmm16, _
- ; CHECK: %xmm16 = VCVTSI2SDZrm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI2SDZrm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI2SDZrm_Int %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI2SDZrm_Int %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI2SDZrr %xmm16, _
- %xmm16 = VCVTSI2SDZrr %xmm16, _
- ; CHECK: %xmm16 = VCVTSI2SDZrr_Int %xmm16, _
- %xmm16 = VCVTSI2SDZrr_Int %xmm16, _
- ; CHECK: %xmm16 = VCVTSI2SSZrm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI2SSZrm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI2SSZrm_Int %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI2SSZrm_Int %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI2SSZrr %xmm16, _
- %xmm16 = VCVTSI2SSZrr %xmm16, _
- ; CHECK: %xmm16 = VCVTSI2SSZrr_Int %xmm16, _
- %xmm16 = VCVTSI2SSZrr_Int %xmm16, _
- ; CHECK: %xmm16 = VCVTSI642SDZrm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI642SDZrm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI642SDZrm_Int %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI642SDZrm_Int %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI642SDZrr %xmm16, _
- %xmm16 = VCVTSI642SDZrr %xmm16, _
- ; CHECK: %xmm16 = VCVTSI642SDZrr_Int %xmm16, _
- %xmm16 = VCVTSI642SDZrr_Int %xmm16, _
- ; CHECK: %xmm16 = VCVTSI642SSZrm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI642SSZrm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI642SSZrm_Int %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSI642SSZrm_Int %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSI642SSZrr %xmm16, _
- %xmm16 = VCVTSI642SSZrr %xmm16, _
- ; CHECK: %xmm16 = VCVTSI642SSZrr_Int %xmm16, _
- %xmm16 = VCVTSI642SSZrr_Int %xmm16, _
- ; CHECK: %xmm16 = VCVTSS2SDZrm %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSS2SDZrm %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSS2SDZrm_Int %xmm16, %rdi, 1, _, 0, _
- %xmm16 = VCVTSS2SDZrm_Int %xmm16, %rdi, 1, _, 0, _
- ; CHECK: %xmm16 = VCVTSS2SDZrr %xmm16, _
- %xmm16 = VCVTSS2SDZrr %xmm16, _
- ; CHECK: %xmm16 = VCVTSS2SDZrr_Int %xmm16, _
- %xmm16 = VCVTSS2SDZrr_Int %xmm16, _
- ; CHECK: %rdi = VCVTSS2SI64Zrm %rdi, %xmm16, 1, _, 0
- %rdi = VCVTSS2SI64Zrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %rdi = VCVTSS2SI64Zrr %xmm16
- %rdi = VCVTSS2SI64Zrr %xmm16
- ; CHECK: %edi = VCVTSS2SIZrm %rdi, %xmm16, 1, _, 0
- %edi = VCVTSS2SIZrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %edi = VCVTSS2SIZrr %xmm16
- %edi = VCVTSS2SIZrr %xmm16
- ; CHECK: %rdi = VCVTTSD2SI64Zrm %rdi, %xmm16, 1, _, 0
- %rdi = VCVTTSD2SI64Zrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %rdi = VCVTTSD2SI64Zrm_Int %rdi, %xmm16, 1, _, 0
- %rdi = VCVTTSD2SI64Zrm_Int %rdi, %xmm16, 1, _, 0
+ ; CHECK: %xmm16 = VSQRTSDZm %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSQRTSDZm %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VSQRTSDZm_Int %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSQRTSDZm_Int %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VSQRTSDZr %xmm16, %noreg
+ %xmm16 = VSQRTSDZr %xmm16, %noreg
+ ; CHECK: %xmm16 = VSQRTSDZr_Int %xmm16, %noreg
+ %xmm16 = VSQRTSDZr_Int %xmm16, %noreg
+ ; CHECK: %xmm16 = VSQRTSSZm %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSQRTSSZm %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VSQRTSSZm_Int %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VSQRTSSZm_Int %xmm16, %noreg, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VSQRTSSZr %xmm16, %noreg
+ %xmm16 = VSQRTSSZr %xmm16, %noreg
+ ; CHECK: %xmm16 = VSQRTSSZr_Int %xmm16, %noreg
+ %xmm16 = VSQRTSSZr_Int %xmm16, %noreg
+ ; CHECK: %rdi = VCVTSD2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %rdi = VCVTSD2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTSD2SI64Zrr_Int %xmm16
+ %rdi = VCVTSD2SI64Zrr_Int %xmm16
+ ; CHECK: %edi = VCVTSD2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %edi = VCVTSD2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %edi = VCVTSD2SIZrr_Int %xmm16
+ %edi = VCVTSD2SIZrr_Int %xmm16
+ ; CHECK: %xmm16 = VCVTSD2SSZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSD2SSZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSD2SSZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSD2SSZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSD2SSZrr %xmm16, %noreg
+ %xmm16 = VCVTSD2SSZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSD2SSZrr_Int %xmm16, %noreg
+ %xmm16 = VCVTSD2SSZrr_Int %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SDZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI2SDZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SDZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI2SDZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SDZrr %xmm16, %noreg
+ %xmm16 = VCVTSI2SDZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SDZrr_Int %xmm16, %noreg
+ %xmm16 = VCVTSI2SDZrr_Int %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SSZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI2SSZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SSZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI2SSZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SSZrr %xmm16, %noreg
+ %xmm16 = VCVTSI2SSZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI2SSZrr_Int %xmm16, %noreg
+ %xmm16 = VCVTSI2SSZrr_Int %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SDZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI642SDZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SDZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI642SDZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SDZrr %xmm16, %noreg
+ %xmm16 = VCVTSI642SDZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SDZrr_Int %xmm16, %noreg
+ %xmm16 = VCVTSI642SDZrr_Int %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SSZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI642SSZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SSZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSI642SSZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SSZrr %xmm16, %noreg
+ %xmm16 = VCVTSI642SSZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSI642SSZrr_Int %xmm16, %noreg
+ %xmm16 = VCVTSI642SSZrr_Int %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSS2SDZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSS2SDZrm %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSS2SDZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ %xmm16 = VCVTSS2SDZrm_Int %xmm16, %rdi, 1, %noreg, 0, %noreg
+ ; CHECK: %xmm16 = VCVTSS2SDZrr %xmm16, %noreg
+ %xmm16 = VCVTSS2SDZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VCVTSS2SDZrr_Int %xmm16, %noreg
+ %xmm16 = VCVTSS2SDZrr_Int %xmm16, %noreg
+ ; CHECK: %rdi = VCVTSS2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %rdi = VCVTSS2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTSS2SI64Zrr_Int %xmm16
+ %rdi = VCVTSS2SI64Zrr_Int %xmm16
+ ; CHECK: %edi = VCVTSS2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %edi = VCVTSS2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %edi = VCVTSS2SIZrr_Int %xmm16
+ %edi = VCVTSS2SIZrr_Int %xmm16
+ ; CHECK: %rdi = VCVTTSD2SI64Zrm %rdi, %xmm16, 1, %noreg, 0
+ %rdi = VCVTTSD2SI64Zrm %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTTSD2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %rdi = VCVTTSD2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
; CHECK: %rdi = VCVTTSD2SI64Zrr %xmm16
%rdi = VCVTTSD2SI64Zrr %xmm16
; CHECK: %rdi = VCVTTSD2SI64Zrr_Int %xmm16
%rdi = VCVTTSD2SI64Zrr_Int %xmm16
- ; CHECK: %edi = VCVTTSD2SIZrm %rdi, %xmm16, 1, _, 0
- %edi = VCVTTSD2SIZrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %edi = VCVTTSD2SIZrm_Int %rdi, %xmm16, 1, _, 0
- %edi = VCVTTSD2SIZrm_Int %rdi, %xmm16, 1, _, 0
+ ; CHECK: %edi = VCVTTSD2SIZrm %rdi, %xmm16, 1, %noreg, 0
+ %edi = VCVTTSD2SIZrm %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %edi = VCVTTSD2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %edi = VCVTTSD2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
; CHECK: %edi = VCVTTSD2SIZrr %xmm16
%edi = VCVTTSD2SIZrr %xmm16
; CHECK: %edi = VCVTTSD2SIZrr_Int %xmm16
%edi = VCVTTSD2SIZrr_Int %xmm16
- ; CHECK: %rdi = VCVTTSS2SI64Zrm %rdi, %xmm16, 1, _, 0
- %rdi = VCVTTSS2SI64Zrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %rdi = VCVTTSS2SI64Zrm_Int %rdi, %xmm16, 1, _, 0
- %rdi = VCVTTSS2SI64Zrm_Int %rdi, %xmm16, 1, _, 0
+ ; CHECK: %rdi = VCVTTSS2SI64Zrm %rdi, %xmm16, 1, %noreg, 0
+ %rdi = VCVTTSS2SI64Zrm %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %rdi = VCVTTSS2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %rdi = VCVTTSS2SI64Zrm_Int %rdi, %xmm16, 1, %noreg, 0
; CHECK: %rdi = VCVTTSS2SI64Zrr %xmm16
%rdi = VCVTTSS2SI64Zrr %xmm16
; CHECK: %rdi = VCVTTSS2SI64Zrr_Int %xmm16
%rdi = VCVTTSS2SI64Zrr_Int %xmm16
- ; CHECK: %edi = VCVTTSS2SIZrm %rdi, %xmm16, 1, _, 0
- %edi = VCVTTSS2SIZrm %rdi, %xmm16, 1, _, 0
- ; CHECK: %edi = VCVTTSS2SIZrm_Int %rdi, %xmm16, 1, _, 0
- %edi = VCVTTSS2SIZrm_Int %rdi, %xmm16, 1, _, 0
+ ; CHECK: %edi = VCVTTSS2SIZrm %rdi, %xmm16, 1, %noreg, 0
+ %edi = VCVTTSS2SIZrm %rdi, %xmm16, 1, %noreg, 0
+ ; CHECK: %edi = VCVTTSS2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
+ %edi = VCVTTSS2SIZrm_Int %rdi, %xmm16, 1, %noreg, 0
; CHECK: %edi = VCVTTSS2SIZrr %xmm16
%edi = VCVTTSS2SIZrr %xmm16
; CHECK: %edi = VCVTTSS2SIZrr_Int %xmm16
%edi = VCVTTSS2SIZrr_Int %xmm16
; CHECK: %xmm16 = VMOV64toSDZrr %rdi
%xmm16 = VMOV64toSDZrr %rdi
- ; CHECK: %xmm16 = VMOVDI2SSZrm %rip, _, _, _, _
- %xmm16 = VMOVDI2SSZrm %rip, _, _, _, _
+ ; CHECK: %xmm16 = VMOVDI2SSZrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VMOVDI2SSZrm %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VMOVDI2SSZrr %eax
%xmm16 = VMOVDI2SSZrr %eax
- ; CHECK: VMOVSDZmr %rdi, %xmm16, _, _, _, _
- VMOVSDZmr %rdi, %xmm16, _, _, _, _
- ; CHECK: %xmm16 = VMOVSDZrm %rip, _, _, _, _
- %xmm16 = VMOVSDZrm %rip, _, _, _, _
- ; CHECK: %xmm16 = VMOVSDZrr %xmm16, _
- %xmm16 = VMOVSDZrr %xmm16, _
- ; CHECK: %xmm16 = VMOVSDZrr_REV %xmm16, _
- %xmm16 = VMOVSDZrr_REV %xmm16, _
+ ; CHECK: VMOVSDZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVSDZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VMOVSDZrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VMOVSDZrm %rip, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VMOVSDZrr %xmm16, %noreg
+ %xmm16 = VMOVSDZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VMOVSDZrr_REV %xmm16, %noreg
+ %xmm16 = VMOVSDZrr_REV %xmm16, %noreg
; CHECK: %rax = VMOVSDto64Zrr %xmm16
%rax = VMOVSDto64Zrr %xmm16
- ; CHECK: VMOVSDto64Zmr %rdi, %xmm16, _, _, _, _
- VMOVSDto64Zmr %rdi, %xmm16, _, _, _, _
- ; CHECK: VMOVSSZmr %rdi, %xmm16, _, _, _, _
- VMOVSSZmr %rdi, %xmm16, _, _, _, _
- ; CHECK: %xmm16 = VMOVSSZrm %rip, _, _, _, _
- %xmm16 = VMOVSSZrm %rip, _, _, _, _
- ; CHECK: %xmm16 = VMOVSSZrr %xmm16, _
- %xmm16 = VMOVSSZrr %xmm16, _
- ; CHECK: %xmm16 = VMOVSSZrr_REV %xmm16, _
- %xmm16 = VMOVSSZrr_REV %xmm16, _
- ; CHECK: VMOVSS2DIZmr %rdi, %xmm16, _, _, _, _
- VMOVSS2DIZmr %rdi, %xmm16, _, _, _, _
+ ; CHECK: VMOVSDto64Zmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVSDto64Zmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: VMOVSSZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVSSZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VMOVSSZrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VMOVSSZrm %rip, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VMOVSSZrr %xmm16, %noreg
+ %xmm16 = VMOVSSZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VMOVSSZrr_REV %xmm16, %noreg
+ %xmm16 = VMOVSSZrr_REV %xmm16, %noreg
+ ; CHECK: VMOVSS2DIZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVSS2DIZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
; CHECK: %eax = VMOVSS2DIZrr %xmm16
%eax = VMOVSS2DIZrr %xmm16
; CHECK: %xmm16 = VMOV64toPQIZrr %rdi
%xmm16 = VMOV64toPQIZrr %rdi
- ; CHECK: %xmm16 = VMOV64toPQIZrm %rdi, _, _, _, _
- %xmm16 = VMOV64toPQIZrm %rdi, _, _, _, _
+ ; CHECK: %xmm16 = VMOV64toPQIZrm %rdi, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VMOV64toPQIZrm %rdi, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VMOV64toSDZrr %rdi
%xmm16 = VMOV64toSDZrr %rdi
- ; CHECK: %xmm16 = VMOVDI2PDIZrm %rip, _, _, _, _
- %xmm16 = VMOVDI2PDIZrm %rip, _, _, _, _
+ ; CHECK: %xmm16 = VMOVDI2PDIZrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VMOVDI2PDIZrm %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VMOVDI2PDIZrr %edi
%xmm16 = VMOVDI2PDIZrr %edi
- ; CHECK: %xmm16 = VMOVLHPSZrr %xmm16, _
- %xmm16 = VMOVLHPSZrr %xmm16, _
- ; CHECK: %xmm16 = VMOVHLPSZrr %xmm16, _
- %xmm16 = VMOVHLPSZrr %xmm16, _
- ; CHECK: VMOVPDI2DIZmr %rdi, %xmm16, _, _, _, _
- VMOVPDI2DIZmr %rdi, %xmm16, _, _, _, _
+ ; CHECK: %xmm16 = VMOVLHPSZrr %xmm16, %noreg
+ %xmm16 = VMOVLHPSZrr %xmm16, %noreg
+ ; CHECK: %xmm16 = VMOVHLPSZrr %xmm16, %noreg
+ %xmm16 = VMOVHLPSZrr %xmm16, %noreg
+ ; CHECK: VMOVPDI2DIZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVPDI2DIZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
; CHECK: %edi = VMOVPDI2DIZrr %xmm16
%edi = VMOVPDI2DIZrr %xmm16
; CHECK: %xmm16 = VMOVPQI2QIZrr %xmm16
%xmm16 = VMOVPQI2QIZrr %xmm16
- ; CHECK: VMOVPQI2QIZmr %rdi, %xmm16, _, _, _, _
- VMOVPQI2QIZmr %rdi, %xmm16, _, _, _, _
+ ; CHECK: VMOVPQI2QIZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVPQI2QIZmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
; CHECK: %rdi = VMOVPQIto64Zrr %xmm16
%rdi = VMOVPQIto64Zrr %xmm16
- ; CHECK: VMOVPQIto64Zmr %rdi, %xmm16, _, _, _, _
- VMOVPQIto64Zmr %rdi, %xmm16, _, _, _, _
- ; CHECK: %xmm16 = VMOVQI2PQIZrm %rip, _, _, _, _
- %xmm16 = VMOVQI2PQIZrm %rip, _, _, _, _
+ ; CHECK: VMOVPQIto64Zmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ VMOVPQIto64Zmr %rdi, %xmm16, %noreg, %noreg, %noreg, %noreg
+ ; CHECK: %xmm16 = VMOVQI2PQIZrm %rip, %noreg, %noreg, %noreg, %noreg
+ %xmm16 = VMOVQI2PQIZrm %rip, %noreg, %noreg, %noreg, %noreg
; CHECK: %xmm16 = VMOVZPQILo2PQIZrr %xmm16
%xmm16 = VMOVZPQILo2PQIZrr %xmm16
- ; CHECK: Int_VCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- Int_VCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
Int_VCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: Int_VCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- Int_VCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
Int_VCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: Int_VUCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- Int_VUCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VUCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VUCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VUCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
Int_VUCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: Int_VUCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- Int_VUCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: Int_VUCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ Int_VUCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: Int_VUCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
Int_VUCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: VCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- VCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
VCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: VCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- VCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
VCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: VUCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- VUCOMISDZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VUCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VUCOMISDZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VUCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
VUCOMISDZrr %xmm16, %xmm1, implicit-def %eflags
- ; CHECK: VUCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
- VUCOMISSZrm %xmm16, %rdi, _, _, _, _, implicit-def %eflags
+ ; CHECK: VUCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
+ VUCOMISSZrm %xmm16, %rdi, %noreg, %noreg, %noreg, %noreg, implicit-def %eflags
; CHECK: VUCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
VUCOMISSZrr %xmm16, %xmm1, implicit-def %eflags
-
- RET 0, %zmm0, %zmm1
+
+ RET 0, %zmm0, %zmm1
...
diff --git a/test/CodeGen/X86/exedeps-movq.ll b/test/CodeGen/X86/exedeps-movq.ll
index c1c60981edf5..cc56be672db3 100644
--- a/test/CodeGen/X86/exedeps-movq.ll
+++ b/test/CodeGen/X86/exedeps-movq.ll
@@ -12,13 +12,13 @@
define void @store_floats(<4 x float> %x, i64* %p) {
; SSE-LABEL: store_floats:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm0, %xmm0
; SSE-NEXT: movlps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_floats:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovlps %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -31,13 +31,13 @@ define void @store_floats(<4 x float> %x, i64* %p) {
define void @store_double(<2 x double> %x, i64* %p) {
; SSE-LABEL: store_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm0, %xmm0
; SSE-NEXT: movlpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovlpd %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -50,13 +50,13 @@ define void @store_double(<2 x double> %x, i64* %p) {
define void @store_int(<4 x i32> %x, <2 x float>* %p) {
; SSE-LABEL: store_int:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: movq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_int:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -69,13 +69,13 @@ define void @store_int(<4 x i32> %x, <2 x float>* %p) {
define void @store_h_double(<2 x double> %x, i64* %p) {
; SSE-LABEL: store_h_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm0, %xmm0
; SSE-NEXT: movhpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: store_h_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovhpd %xmm0, (%rdi)
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
index e67bb0f9b7ae..2fcbdd39f4a8 100644
--- a/test/CodeGen/X86/exedepsfix-broadcast.ll
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -6,7 +6,7 @@
define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) {
; CHECK-LABEL: ExeDepsFix_broadcastss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0
@@ -21,7 +21,7 @@ define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2)
define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) {
; CHECK-LABEL: ExeDepsFix_broadcastss256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0
@@ -36,7 +36,7 @@ define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg
define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %arg2, i32 %broadcastvalue) {
; CHECK-LABEL: ExeDepsFix_broadcastss_inreg:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm2
; CHECK-NEXT: vpbroadcastd %xmm2, %xmm2
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -54,7 +54,7 @@ define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %
define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float> %arg2, i32 %broadcastvalue) {
; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovd %edi, %xmm2
; CHECK-NEXT: vpbroadcastd %xmm2, %ymm2
; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -73,7 +73,7 @@ define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float
; In that case the broadcast is directly folded into vandpd.
define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) {
; CHECK-LABEL: ExeDepsFix_broadcastsd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -87,7 +87,7 @@ define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg
define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) {
; CHECK-LABEL: ExeDepsFix_broadcastsd256:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
; CHECK-NEXT: vandpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
@@ -104,7 +104,7 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %
; vpand and there is nothing more you can do to match vmaxpd.
define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %rdi, %xmm2
; CHECK-NEXT: vpbroadcastq %xmm2, %xmm2
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -122,7 +122,7 @@ define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double
define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x double> %arg2, i64 %broadcastvalue) {
; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vmovq %rdi, %xmm2
; CHECK-NEXT: vpbroadcastq %xmm2, %ymm2
; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/expand-vr64-gr64-copy.mir b/test/CodeGen/X86/expand-vr64-gr64-copy.mir
index 3598c045ad53..ceb7d394af25 100644
--- a/test/CodeGen/X86/expand-vr64-gr64-copy.mir
+++ b/test/CodeGen/X86/expand-vr64-gr64-copy.mir
@@ -23,14 +23,14 @@ body: |
liveins: %xmm0
%xmm0 = PSHUFDri killed %xmm0, -24
- MOVPQI2QImr %rsp, 1, _, -8, _, killed %xmm0
- %mm0 = PSWAPDrm %rsp, 1, _, -8, _
+ MOVPQI2QImr %rsp, 1, %noreg, -8, %noreg, killed %xmm0
+ %mm0 = PSWAPDrm %rsp, 1, %noreg, -8, %noreg
; CHECK: %rax = MMX_MOVD64from64rr %mm0
; CHECK-NEXT: %mm0 = MMX_MOVD64to64rr %rax
%rax = COPY %mm0
%mm0 = COPY %rax
- MMX_MOVQ64mr %rsp, 1, _, -16, _, killed %mm0
- %xmm0 = MOVQI2PQIrm %rsp, 1, _, -16, _
+ MMX_MOVQ64mr %rsp, 1, %noreg, -16, %noreg, killed %mm0
+ %xmm0 = MOVQI2PQIrm %rsp, 1, %noreg, -16, %noreg
%xmm0 = PSHUFDri killed %xmm0, -44
RETQ %xmm0
...
diff --git a/test/CodeGen/X86/extend.ll b/test/CodeGen/X86/extend.ll
index d349e782d5d0..399d05eaa572 100644
--- a/test/CodeGen/X86/extend.ll
+++ b/test/CodeGen/X86/extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | grep movzx | count 1
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | grep movsx | count 1
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | grep movzx | count 1
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | grep movsx | count 1
@G1 = internal global i8 0 ; <i8*> [#uses=1]
@G2 = internal global i8 0 ; <i8*> [#uses=1]
diff --git a/test/CodeGen/X86/extended-fma-contraction.ll b/test/CodeGen/X86/extended-fma-contraction.ll
index 858eabcb7dc6..8ac47bd77f7e 100644
--- a/test/CodeGen/X86/extended-fma-contraction.ll
+++ b/test/CodeGen/X86/extended-fma-contraction.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86 -mcpu=bdver2 -mattr=-fma -mtriple=x86_64-apple-darwin < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
+; RUN: llc -mcpu=bdver2 -mattr=-fma -mtriple=i686-apple-darwin < %s | FileCheck %s
+; RUN: llc -mcpu=bdver2 -mattr=-fma,-fma4 -mtriple=i686-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
; CHECK-LABEL: fmafunc
define <3 x float> @fmafunc(<3 x float> %a, <3 x float> %b, <3 x float> %c) {
diff --git a/test/CodeGen/X86/extmul128.ll b/test/CodeGen/X86/extmul128.ll
index 9b598299e536..1ccdbb55059f 100644
--- a/test/CodeGen/X86/extmul128.ll
+++ b/test/CodeGen/X86/extmul128.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mul | count 2
+; RUN: llc < %s -mtriple=x86_64-- | grep mul | count 2
define i128 @i64_sext_i128(i64 %a, i64 %b) {
%aa = sext i64 %a to i128
diff --git a/test/CodeGen/X86/extmul64.ll b/test/CodeGen/X86/extmul64.ll
index 9e20ded1111f..7e3d2fca74c6 100644
--- a/test/CodeGen/X86/extmul64.ll
+++ b/test/CodeGen/X86/extmul64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mul | count 2
+; RUN: llc < %s -mtriple=i686-- | grep mul | count 2
define i64 @i32_sext_i64(i32 %a, i32 %b) {
%aa = sext i32 %a to i64
diff --git a/test/CodeGen/X86/extract-combine.ll b/test/CodeGen/X86/extract-combine.ll
index 2040e872f7fe..7b38a0157808 100644
--- a/test/CodeGen/X86/extract-combine.ll
+++ b/test/CodeGen/X86/extract-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -o %t
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -o %t
; RUN: not grep unpcklps %t
define i32 @foo() nounwind {
diff --git a/test/CodeGen/X86/extract-extract.ll b/test/CodeGen/X86/extract-extract.ll
index 9f1516356203..aeb3566cb1c8 100644
--- a/test/CodeGen/X86/extract-extract.ll
+++ b/test/CodeGen/X86/extract-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 >/dev/null
+; RUN: llc < %s -mtriple=i686-- >/dev/null
; PR4699
; Handle this extractvalue-of-extractvalue case without getting in
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
index 4ea6b7801fb3..0601c773fa53 100644
--- a/test/CodeGen/X86/extract-store.ll
+++ b/test/CodeGen/X86/extract-store.ll
@@ -10,42 +10,42 @@
define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-LABEL: extract_i8_0:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: movd %xmm0, %ecx
; SSE2-X32-NEXT: movb %cl, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i8_0:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: movd %xmm0, %eax
; SSE2-X64-NEXT: movb %al, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i8_0:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-X32-NEXT: pextrb $0, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_i8_0:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: pextrb $0, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i8_0:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vpextrb $0, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i8_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vpextrb $0, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i8_0:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: movd %xmm0, %eax
; SSE-F128-NEXT: movb %al, (%rdi)
; SSE-F128-NEXT: retq
@@ -56,7 +56,7 @@ define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind {
define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-LABEL: extract_i8_3:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: movd %xmm0, %ecx
; SSE2-X32-NEXT: shrl $24, %ecx
@@ -64,36 +64,36 @@ define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i8_3:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: movd %xmm0, %eax
; SSE2-X64-NEXT: shrl $24, %eax
; SSE2-X64-NEXT: movb %al, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i8_3:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-X32-NEXT: pextrb $3, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_i8_3:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: pextrb $3, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i8_3:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vpextrb $3, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i8_3:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vpextrb $3, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i8_3:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: movd %xmm0, %eax
; SSE-F128-NEXT: shrl $24, %eax
; SSE-F128-NEXT: movb %al, (%rdi)
@@ -105,42 +105,42 @@ define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; SSE2-X32-LABEL: extract_i8_15:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: pextrw $7, %xmm0, %ecx
; SSE2-X32-NEXT: movb %ch, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i8_15:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: pextrw $7, %xmm0, %eax
; SSE2-X64-NEXT: movb %ah, (%rdi) # NOREX
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i8_15:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-X32-NEXT: pextrb $15, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_i8_15:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: pextrb $15, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i8_15:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vpextrb $15, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i8_15:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vpextrb $15, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i8_15:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: pextrw $7, %xmm0, %eax
; SSE-F128-NEXT: movb %ah, (%rdi) # NOREX
; SSE-F128-NEXT: retq
@@ -151,42 +151,42 @@ define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) nounwind {
define void @extract_i16_0(i16* nocapture %dst, <8 x i16> %foo) nounwind {
; SSE2-X32-LABEL: extract_i16_0:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: movd %xmm0, %ecx
; SSE2-X32-NEXT: movw %cx, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i16_0:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: movd %xmm0, %eax
; SSE2-X64-NEXT: movw %ax, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i16_0:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-X32-NEXT: pextrw $0, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_i16_0:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: pextrw $0, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i16_0:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vpextrw $0, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i16_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i16_0:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: movd %xmm0, %eax
; SSE-F128-NEXT: movw %ax, (%rdi)
; SSE-F128-NEXT: retq
@@ -197,42 +197,42 @@ define void @extract_i16_0(i16* nocapture %dst, <8 x i16> %foo) nounwind {
define void @extract_i16_7(i16* nocapture %dst, <8 x i16> %foo) nounwind {
; SSE2-X32-LABEL: extract_i16_7:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: pextrw $7, %xmm0, %ecx
; SSE2-X32-NEXT: movw %cx, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i16_7:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: pextrw $7, %xmm0, %eax
; SSE2-X64-NEXT: movw %ax, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i16_7:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-X32-NEXT: pextrw $7, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_i16_7:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: pextrw $7, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i16_7:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vpextrw $7, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i16_7:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vpextrw $7, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i16_7:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: pextrw $7, %xmm0, %eax
; SSE-F128-NEXT: movw %ax, (%rdi)
; SSE-F128-NEXT: retq
@@ -243,24 +243,24 @@ define void @extract_i16_7(i16* nocapture %dst, <8 x i16> %foo) nounwind {
define void @extract_i32_0(i32* nocapture %dst, <4 x i32> %foo) nounwind {
; SSE-X32-LABEL: extract_i32_0:
-; SSE-X32: # BB#0:
+; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-X32-NEXT: movss %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE-X64-LABEL: extract_i32_0:
-; SSE-X64: # BB#0:
+; SSE-X64: # %bb.0:
; SSE-X64-NEXT: movss %xmm0, (%rdi)
; SSE-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i32_0:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vmovss %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i32_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vmovss %xmm0, (%rdi)
; AVX-X64-NEXT: retq
%vecext = extractelement <4 x i32> %foo, i32 0
@@ -270,42 +270,42 @@ define void @extract_i32_0(i32* nocapture %dst, <4 x i32> %foo) nounwind {
define void @extract_i32_3(i32* nocapture %dst, <4 x i32> %foo) nounwind {
; SSE2-X32-LABEL: extract_i32_3:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-X32-NEXT: movd %xmm0, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i32_3:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-X64-NEXT: movd %xmm0, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_i32_3:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SSE41-X32-NEXT: pextrd $3, %xmm0, (%eax)
+; SSE41-X32-NEXT: extractps $3, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_i32_3:
-; SSE41-X64: # BB#0:
-; SSE41-X64-NEXT: pextrd $3, %xmm0, (%rdi)
+; SSE41-X64: # %bb.0:
+; SSE41-X64-NEXT: extractps $3, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i32_3:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-X32-NEXT: vpextrd $3, %xmm0, (%eax)
+; AVX-X32-NEXT: vextractps $3, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i32_3:
-; AVX-X64: # BB#0:
-; AVX-X64-NEXT: vpextrd $3, %xmm0, (%rdi)
+; AVX-X64: # %bb.0:
+; AVX-X64-NEXT: vextractps $3, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i32_3:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-F128-NEXT: movd %xmm0, (%rdi)
; SSE-F128-NEXT: retq
@@ -316,24 +316,24 @@ define void @extract_i32_3(i32* nocapture %dst, <4 x i32> %foo) nounwind {
define void @extract_i64_0(i64* nocapture %dst, <2 x i64> %foo) nounwind {
; SSE-X32-LABEL: extract_i64_0:
-; SSE-X32: # BB#0:
+; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-X32-NEXT: movlps %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE-X64-LABEL: extract_i64_0:
-; SSE-X64: # BB#0:
+; SSE-X64: # %bb.0:
; SSE-X64-NEXT: movlps %xmm0, (%rdi)
; SSE-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i64_0:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vmovlps %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i64_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vmovlps %xmm0, (%rdi)
; AVX-X64-NEXT: retq
%vecext = extractelement <2 x i64> %foo, i32 0
@@ -343,37 +343,37 @@ define void @extract_i64_0(i64* nocapture %dst, <2 x i64> %foo) nounwind {
define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
; SSE-X32-LABEL: extract_i64_1:
-; SSE-X32: # BB#0:
+; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-X32-NEXT: movq %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_i64_1:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-X64-NEXT: movq %xmm0, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X64-LABEL: extract_i64_1:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: pextrq $1, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_i64_1:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; AVX-X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-X32-NEXT: vmovq %xmm0, (%eax)
+; AVX-X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-X32-NEXT: vmovlps %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_i64_1:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vpextrq $1, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_i64_1:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-F128-NEXT: movq %xmm0, (%rdi)
; SSE-F128-NEXT: retq
@@ -384,24 +384,24 @@ define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
define void @extract_f32_0(float* nocapture %dst, <4 x float> %foo) nounwind {
; SSE-X32-LABEL: extract_f32_0:
-; SSE-X32: # BB#0:
+; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-X32-NEXT: movss %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE-X64-LABEL: extract_f32_0:
-; SSE-X64: # BB#0:
+; SSE-X64: # %bb.0:
; SSE-X64-NEXT: movss %xmm0, (%rdi)
; SSE-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_f32_0:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vmovss %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_f32_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vmovss %xmm0, (%rdi)
; AVX-X64-NEXT: retq
%vecext = extractelement <4 x float> %foo, i32 0
@@ -411,42 +411,42 @@ define void @extract_f32_0(float* nocapture %dst, <4 x float> %foo) nounwind {
define void @extract_f32_3(float* nocapture %dst, <4 x float> %foo) nounwind {
; SSE2-X32-LABEL: extract_f32_3:
-; SSE2-X32: # BB#0:
+; SSE2-X32: # %bb.0:
; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE2-X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-X32-NEXT: movss %xmm0, (%eax)
; SSE2-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_f32_3:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-X64-NEXT: movss %xmm0, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X32-LABEL: extract_f32_3:
-; SSE41-X32: # BB#0:
+; SSE41-X32: # %bb.0:
; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-X32-NEXT: extractps $3, %xmm0, (%eax)
; SSE41-X32-NEXT: retl
;
; SSE41-X64-LABEL: extract_f32_3:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: extractps $3, %xmm0, (%rdi)
; SSE41-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_f32_3:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vextractps $3, %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_f32_3:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vextractps $3, %xmm0, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_f32_3:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-F128-NEXT: movss %xmm0, (%rdi)
; SSE-F128-NEXT: retq
@@ -457,24 +457,24 @@ define void @extract_f32_3(float* nocapture %dst, <4 x float> %foo) nounwind {
define void @extract_f64_0(double* nocapture %dst, <2 x double> %foo) nounwind {
; SSE-X32-LABEL: extract_f64_0:
-; SSE-X32: # BB#0:
+; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-X32-NEXT: movlps %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE-X64-LABEL: extract_f64_0:
-; SSE-X64: # BB#0:
+; SSE-X64: # %bb.0:
; SSE-X64-NEXT: movlps %xmm0, (%rdi)
; SSE-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_f64_0:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vmovlps %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_f64_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vmovlps %xmm0, (%rdi)
; AVX-X64-NEXT: retq
%vecext = extractelement <2 x double> %foo, i32 0
@@ -484,24 +484,24 @@ define void @extract_f64_0(double* nocapture %dst, <2 x double> %foo) nounwind {
define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind {
; SSE-X32-LABEL: extract_f64_1:
-; SSE-X32: # BB#0:
+; SSE-X32: # %bb.0:
; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-X32-NEXT: movhpd %xmm0, (%eax)
; SSE-X32-NEXT: retl
;
; SSE-X64-LABEL: extract_f64_1:
-; SSE-X64: # BB#0:
+; SSE-X64: # %bb.0:
; SSE-X64-NEXT: movhpd %xmm0, (%rdi)
; SSE-X64-NEXT: retq
;
; AVX-X32-LABEL: extract_f64_1:
-; AVX-X32: # BB#0:
+; AVX-X32: # %bb.0:
; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-X32-NEXT: vmovhpd %xmm0, (%eax)
; AVX-X32-NEXT: retl
;
; AVX-X64-LABEL: extract_f64_1:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: vmovhpd %xmm0, (%rdi)
; AVX-X64-NEXT: retq
%vecext = extractelement <2 x double> %foo, i32 1
@@ -510,43 +510,50 @@ define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind {
}
define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_0:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_0:
+; SSE-X32: # %bb.0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_f128_0:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: movq %rdx, 8(%rdi)
; SSE2-X64-NEXT: movq %rsi, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X64-LABEL: extract_f128_0:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: movq %rdx, 8(%rdi)
; SSE41-X64-NEXT: movq %rsi, (%rdi)
; SSE41-X64-NEXT: retq
;
+; AVX-X32-LABEL: extract_f128_0:
+; AVX-X32: # %bb.0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
; AVX-X64-LABEL: extract_f128_0:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: movq %rdx, 8(%rdi)
; AVX-X64-NEXT: movq %rsi, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_f128_0:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: movaps %xmm0, (%rdi)
; SSE-F128-NEXT: retq
%vecext = extractelement <2 x fp128> %foo, i32 0
@@ -555,43 +562,50 @@ define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
}
define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_1:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_1:
+; SSE-X32: # %bb.0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
;
; SSE2-X64-LABEL: extract_f128_1:
-; SSE2-X64: # BB#0:
+; SSE2-X64: # %bb.0:
; SSE2-X64-NEXT: movq %r8, 8(%rdi)
; SSE2-X64-NEXT: movq %rcx, (%rdi)
; SSE2-X64-NEXT: retq
;
; SSE41-X64-LABEL: extract_f128_1:
-; SSE41-X64: # BB#0:
+; SSE41-X64: # %bb.0:
; SSE41-X64-NEXT: movq %r8, 8(%rdi)
; SSE41-X64-NEXT: movq %rcx, (%rdi)
; SSE41-X64-NEXT: retq
;
+; AVX-X32-LABEL: extract_f128_1:
+; AVX-X32: # %bb.0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
; AVX-X64-LABEL: extract_f128_1:
-; AVX-X64: # BB#0:
+; AVX-X64: # %bb.0:
; AVX-X64-NEXT: movq %r8, 8(%rdi)
; AVX-X64-NEXT: movq %rcx, (%rdi)
; AVX-X64-NEXT: retq
;
; SSE-F128-LABEL: extract_f128_1:
-; SSE-F128: # BB#0:
+; SSE-F128: # %bb.0:
; SSE-F128-NEXT: movaps %xmm1, (%rdi)
; SSE-F128-NEXT: retq
%vecext = extractelement <2 x fp128> %foo, i32 1
@@ -601,11 +615,11 @@ define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
define void @extract_i8_undef(i8* nocapture %dst, <16 x i8> %foo) nounwind {
; X32-LABEL: extract_i8_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_i8_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <16 x i8> %foo, i32 16 ; undef
store i8 %vecext, i8* %dst, align 1
@@ -614,11 +628,11 @@ define void @extract_i8_undef(i8* nocapture %dst, <16 x i8> %foo) nounwind {
define void @extract_i16_undef(i16* nocapture %dst, <8 x i16> %foo) nounwind {
; X32-LABEL: extract_i16_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_i16_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <8 x i16> %foo, i32 9 ; undef
store i16 %vecext, i16* %dst, align 1
@@ -627,11 +641,11 @@ define void @extract_i16_undef(i16* nocapture %dst, <8 x i16> %foo) nounwind {
define void @extract_i32_undef(i32* nocapture %dst, <4 x i32> %foo) nounwind {
; X32-LABEL: extract_i32_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_i32_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <4 x i32> %foo, i32 6 ; undef
store i32 %vecext, i32* %dst, align 1
@@ -640,11 +654,11 @@ define void @extract_i32_undef(i32* nocapture %dst, <4 x i32> %foo) nounwind {
define void @extract_i64_undef(i64* nocapture %dst, <2 x i64> %foo) nounwind {
; X32-LABEL: extract_i64_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_i64_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <2 x i64> %foo, i32 2 ; undef
store i64 %vecext, i64* %dst, align 1
@@ -653,11 +667,11 @@ define void @extract_i64_undef(i64* nocapture %dst, <2 x i64> %foo) nounwind {
define void @extract_f32_undef(float* nocapture %dst, <4 x float> %foo) nounwind {
; X32-LABEL: extract_f32_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_f32_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <4 x float> %foo, i32 6 ; undef
store float %vecext, float* %dst, align 1
@@ -666,11 +680,11 @@ define void @extract_f32_undef(float* nocapture %dst, <4 x float> %foo) nounwind
define void @extract_f64_undef(double* nocapture %dst, <2 x double> %foo) nounwind {
; X32-LABEL: extract_f64_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_f64_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <2 x double> %foo, i32 2 ; undef
store double %vecext, double* %dst, align 1
@@ -679,11 +693,11 @@ define void @extract_f64_undef(double* nocapture %dst, <2 x double> %foo) nounwi
define void @extract_f128_undef(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
; X32-LABEL: extract_f128_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: extract_f128_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%vecext = extractelement <2 x fp128> %foo, i32 2 ; undef
store fp128 %vecext, fp128* %dst, align 1
diff --git a/test/CodeGen/X86/extractelement-from-arg.ll b/test/CodeGen/X86/extractelement-from-arg.ll
index 4ea37f0c46d3..1f97d6b33f36 100644
--- a/test/CodeGen/X86/extractelement-from-arg.ll
+++ b/test/CodeGen/X86/extractelement-from-arg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2
define void @test(float* %R, <4 x float> %X) nounwind {
%tmp = extractelement <4 x float> %X, i32 3
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 228ce70b4009..4d24a15fe2e1 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -10,22 +10,22 @@
define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v16i8_1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: def %al killed %al killed %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v16i8_1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: # kill: def %al killed %al killed %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v16i8_1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %al killed %al killed %eax
; AVX-NEXT: retq
%b = extractelement <16 x i8> %a, i256 1
ret i8 %b
@@ -33,22 +33,22 @@ define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v16i8_11:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $5, %xmm0, %eax
; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: def %al killed %al killed %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v16i8_11:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $11, %xmm0, %eax
-; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: # kill: def %al killed %al killed %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v16i8_11:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrb $11, %xmm0, %eax
-; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %al killed %al killed %eax
; AVX-NEXT: retq
%b = extractelement <16 x i8> %a, i256 11
ret i8 %b
@@ -56,21 +56,21 @@ define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v16i8_14:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $7, %xmm0, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: def %al killed %al killed %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v16i8_14:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $14, %xmm0, %eax
-; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: # kill: def %al killed %al killed %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v16i8_14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrb $14, %xmm0, %eax
-; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %al killed %al killed %eax
; AVX-NEXT: retq
%b = extractelement <16 x i8> %a, i256 14
ret i8 %b
@@ -78,22 +78,22 @@ define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v32i8_1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: def %al killed %al killed %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v32i8_1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $1, %xmm0, %eax
-; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: # kill: def %al killed %al killed %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v32i8_1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrb $1, %xmm0, %eax
-; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %al killed %al killed %eax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 1
@@ -102,31 +102,31 @@ define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
; SSE2-LABEL: extractelement_v32i8_17:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: shrl $8, %eax
-; SSE2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE2-NEXT: # kill: def %al killed %al killed %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v32i8_17:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrb $1, %xmm1, %eax
-; SSE41-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE41-NEXT: # kill: def %al killed %al killed %eax
; SSE41-NEXT: retq
;
; AVX1-LABEL: extractelement_v32i8_17:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrb $1, %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: extractelement_v32i8_17:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrb $1, %xmm0, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%b = extractelement <32 x i8> %a, i256 17
@@ -135,15 +135,15 @@ define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
define i16 @extractelement_v8i16_0(<8 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i16_0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i16_0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %ax killed %ax killed %eax
; AVX-NEXT: retq
%b = extractelement <8 x i16> %a, i256 0
ret i16 %b
@@ -151,15 +151,15 @@ define i16 @extractelement_v8i16_0(<8 x i16> %a, i256 %i) nounwind {
define i16 @extractelement_v8i16_3(<8 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i16_3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i16_3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %ax killed %ax killed %eax
; AVX-NEXT: retq
%b = extractelement <8 x i16> %a, i256 3
ret i16 %b
@@ -167,15 +167,15 @@ define i16 @extractelement_v8i16_3(<8 x i16> %a, i256 %i) nounwind {
define i16 @extractelement_v16i16_0(<16 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i16_0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i16_0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %ax killed %ax killed %eax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%b = extractelement <16 x i16> %a, i256 0
@@ -184,24 +184,24 @@ define i16 @extractelement_v16i16_0(<16 x i16> %a, i256 %i) nounwind {
define i16 @extractelement_v16i16_13(<16 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i16_13:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pextrw $5, %xmm1, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: extractelement_v16i16_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrw $5, %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: extractelement_v16i16_13:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrw $5, %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%b = extractelement <16 x i16> %a, i256 13
@@ -210,12 +210,12 @@ define i16 @extractelement_v16i16_13(<16 x i16> %a, i256 %i) nounwind {
define i32 @extractelement_v4i32_0(<4 x i32> %a) nounwind {
; SSE-LABEL: extractelement_v4i32_0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i32_0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
%b = extractelement <4 x i32> %a, i256 0
@@ -224,19 +224,19 @@ define i32 @extractelement_v4i32_0(<4 x i32> %a) nounwind {
define i32 @extractelement_v4i32_3(<4 x i32> %a) nounwind {
; SSE2-LABEL: extractelement_v4i32_3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v4i32_3:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41: # %bb.0:
+; SSE41-NEXT: extractps $3, %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v4i32_3:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX: # %bb.0:
+; AVX-NEXT: vextractps $3, %xmm0, %eax
; AVX-NEXT: retq
%b = extractelement <4 x i32> %a, i256 3
ret i32 %b
@@ -244,19 +244,19 @@ define i32 @extractelement_v4i32_3(<4 x i32> %a) nounwind {
define i32 @extractelement_v8i32_0(<8 x i32> %a) nounwind {
; SSE-LABEL: extractelement_v8i32_0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: extractelement_v8i32_0:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: extractelement_v8i32_0:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
@@ -267,19 +267,19 @@ define i32 @extractelement_v8i32_0(<8 x i32> %a) nounwind {
define i32 @extractelement_v8i32_4(<8 x i32> %a) nounwind {
; SSE-LABEL: extractelement_v8i32_4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: extractelement_v8i32_4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: extractelement_v8i32_4:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: vzeroupper
@@ -290,41 +290,34 @@ define i32 @extractelement_v8i32_4(<8 x i32> %a) nounwind {
define i32 @extractelement_v8i32_7(<8 x i32> %a) nounwind {
; SSE2-LABEL: extractelement_v8i32_7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v8i32_7:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $3, %xmm1, %eax
+; SSE41: # %bb.0:
+; SSE41-NEXT: extractps $3, %xmm1, %eax
; SSE41-NEXT: retq
;
-; AVX1-LABEL: extractelement_v8i32_7:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: extractelement_v8i32_7:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpextrd $3, %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: extractelement_v8i32_7:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vextractps $3, %xmm0, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%b = extractelement <8 x i32> %a, i64 7
ret i32 %b
}
define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v2i64_0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v2i64_0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
%b = extractelement <2 x i64> %a, i256 0
@@ -333,18 +326,18 @@ define i64 @extractelement_v2i64_0(<2 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v2i64_1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v2i64_1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v2i64_1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: retq
%b = extractelement <2 x i64> %a, i256 1
@@ -353,18 +346,18 @@ define i64 @extractelement_v2i64_1(<2 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v4i64_1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v4i64_1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: extractelement_v4i64_1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -374,25 +367,25 @@ define i64 @extractelement_v4i64_1(<4 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
; SSE2-LABEL: extractelement_v4i64_3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSE41-LABEL: extractelement_v4i64_3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm1, %rax
; SSE41-NEXT: retq
;
; AVX1-LABEL: extractelement_v4i64_3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: extractelement_v4i64_3:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vzeroupper
@@ -407,7 +400,7 @@ define i64 @extractelement_v4i64_3(<4 x i64> %a, i256 %i) nounwind {
define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i8_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andl $15, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
@@ -415,7 +408,7 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i8_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
@@ -427,7 +420,7 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v32i8_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
@@ -442,7 +435,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v32i8_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
@@ -461,14 +454,14 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i16_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andl $7, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movzwl -24(%rsp,%rdi,2), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i16_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: andl $7, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movzwl -24(%rsp,%rdi,2), %eax
@@ -479,7 +472,7 @@ define i16 @extractelement_v8i16_var(<8 x i16> %a, i256 %i) nounwind {
define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i16_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
@@ -493,7 +486,7 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i16_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
@@ -511,14 +504,14 @@ define i16 @extractelement_v16i16_var(<16 x i16> %a, i256 %i) nounwind {
define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i32_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl -24(%rsp,%rdi,4), %eax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i32_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movl -24(%rsp,%rdi,4), %eax
@@ -529,7 +522,7 @@ define i32 @extractelement_v4i32_var(<4 x i32> %a, i256 %i) nounwind {
define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v8i32_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
@@ -543,7 +536,7 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i32_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
@@ -561,14 +554,14 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v2i64_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -24(%rsp,%rdi,8), %rax
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v2i64_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq -24(%rsp,%rdi,8), %rax
@@ -579,7 +572,7 @@ define i64 @extractelement_v2i64_var(<2 x i64> %a, i256 %i) nounwind {
define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i64_var:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %rbp
; SSE-NEXT: movq %rsp, %rbp
; SSE-NEXT: andq $-32, %rsp
@@ -593,7 +586,7 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i64_var:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushq %rbp
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
@@ -615,11 +608,11 @@ define i64 @extractelement_v4i64_var(<4 x i64> %a, i256 %i) nounwind {
define i8 @extractelement_32i8_m1(<32 x i8> %a) nounwind {
; SSE-LABEL: extractelement_32i8_m1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_32i8_m1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%b = extractelement <32 x i8> %a, i256 -1
ret i8 %b
@@ -627,11 +620,11 @@ define i8 @extractelement_32i8_m1(<32 x i8> %a) nounwind {
define i16 @extractelement_v16i16_m4(<16 x i16> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v16i16_m4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v16i16_m4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%b = extractelement <16 x i16> %a, i256 -4
ret i16 %b
@@ -639,11 +632,11 @@ define i16 @extractelement_v16i16_m4(<16 x i16> %a, i256 %i) nounwind {
define i32 @extractelement_v8i32_15(<8 x i32> %a) nounwind {
; SSE-LABEL: extractelement_v8i32_15:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v8i32_15:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%b = extractelement <8 x i32> %a, i64 15
ret i32 %b
@@ -651,11 +644,11 @@ define i32 @extractelement_v8i32_15(<8 x i32> %a) nounwind {
define i64 @extractelement_v4i64_4(<4 x i64> %a, i256 %i) nounwind {
; SSE-LABEL: extractelement_v4i64_4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: extractelement_v4i64_4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%b = extractelement <4 x i64> %a, i256 4
ret i64 %b
diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index 9d0900f3b424..a2aa23bbb916 100644
--- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -9,24 +9,24 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
; CHECK-LABEL: test_extractelement_legalization_storereuse:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: paddd (%ecx), %xmm0
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: paddd (%edx), %xmm0
-; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 4(%edx), %edi
-; CHECK-NEXT: shll $4, %ecx
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 12(%edx), %edx
-; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edi, (%eax,%ecx)
-; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
+; CHECK-NEXT: movdqa %xmm0, (%ecx)
+; CHECK-NEXT: movl (%ecx), %esi
+; CHECK-NEXT: movl 4(%ecx), %edi
+; CHECK-NEXT: shll $4, %edx
+; CHECK-NEXT: movl 8(%ecx), %ebx
+; CHECK-NEXT: movl 12(%ecx), %ecx
+; CHECK-NEXT: movl %esi, 12(%eax,%edx)
+; CHECK-NEXT: movl %edi, (%eax,%edx)
+; CHECK-NEXT: movl %ebx, 8(%eax,%edx)
+; CHECK-NEXT: movl %ecx, 4(%eax,%edx)
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index c3542bff4ccc..8cde110383b3 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -7,18 +7,18 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @t(<2 x i64>* %val) nounwind {
; X32-SSE2-LABEL: t:
-; X32-SSE2: # BB#0:
+; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl 8(%eax), %eax
; X32-SSE2-NEXT: retl
;
; X64-SSSE3-LABEL: t:
-; X64-SSSE3: # BB#0:
+; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movl 8(%rdi), %eax
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movl 8(%rdi), %eax
; X64-AVX-NEXT: retq
%tmp2 = load <2 x i64>, <2 x i64>* %val, align 16 ; <<2 x i64>> [#uses=1]
@@ -31,15 +31,15 @@ define i32 @t(<2 x i64>* %val) nounwind {
; (Making sure this doesn't crash.)
define i32 @t2(<8 x i32>* %xp) {
; X32-SSE2-LABEL: t2:
-; X32-SSE2: # BB#0:
+; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: retl
;
; X64-SSSE3-LABEL: t2:
-; X64-SSSE3: # BB#0:
+; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t2:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: retq
%x = load <8 x i32>, <8 x i32>* %xp
%Shuff68 = shufflevector <8 x i32> %x, <8 x i32> undef, <8 x i32> <i32 undef, i32 7, i32 9, i32 undef, i32 13, i32 15, i32 1, i32 3>
@@ -57,17 +57,17 @@ define i32 @t2(<8 x i32>* %xp) {
define void @t3() {
; X32-SSE2-LABEL: t3:
-; X32-SSE2: # BB#0: # %bb
+; X32-SSE2: # %bb.0: # %bb
; X32-SSE2-NEXT: movupd (%eax), %xmm0
; X32-SSE2-NEXT: movhpd %xmm0, (%eax)
;
; X64-SSSE3-LABEL: t3:
-; X64-SSSE3: # BB#0: # %bb
+; X64-SSSE3: # %bb.0: # %bb
; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X64-SSSE3-NEXT: movlpd %xmm0, (%rax)
;
; X64-AVX-LABEL: t3:
-; X64-AVX: # BB#0: # %bb
+; X64-AVX: # %bb.0: # %bb
; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-AVX-NEXT: vmovlpd %xmm0, (%rax)
bb:
@@ -83,7 +83,7 @@ bb:
; second shuffle operand was a post-bitcast type instead of a pre-bitcast type.
define i64 @t4(<2 x double>* %a) {
; X32-SSE2-LABEL: t4:
-; X32-SSE2: # BB#0:
+; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movapd (%eax), %xmm0
; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -94,12 +94,12 @@ define i64 @t4(<2 x double>* %a) {
; X32-SSE2-NEXT: retl
;
; X64-SSSE3-LABEL: t4:
-; X64-SSSE3: # BB#0:
+; X64-SSSE3: # %bb.0:
; X64-SSSE3-NEXT: movq (%rdi), %rax
; X64-SSSE3-NEXT: retq
;
; X64-AVX-LABEL: t4:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: movq (%rdi), %rax
; X64-AVX-NEXT: retq
%b = load <2 x double>, <2 x double>* %a, align 16
diff --git a/test/CodeGen/X86/extractps.ll b/test/CodeGen/X86/extractps.ll
index 7d4c2cf619a1..586099d74887 100644
--- a/test/CodeGen/X86/extractps.ll
+++ b/test/CodeGen/X86/extractps.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn > %t
+; RUN: llc < %s -mtriple=i686-- -mcpu=penryn > %t
; RUN: not grep movd %t
; RUN: grep "movss %xmm" %t | count 1
; RUN: grep "extractps \$1, %xmm0, " %t | count 1
diff --git a/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
index f66c53e8ee63..47cd1ba95bc7 100644
--- a/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -6,7 +6,7 @@
define float @test_cvtsh_ss(i16 %a0) nounwind {
; X32-LABEL: test_cvtsh_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovd %eax, %xmm0
@@ -17,7 +17,7 @@ define float @test_cvtsh_ss(i16 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_cvtsh_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -37,22 +37,22 @@ define float @test_cvtsh_ss(i16 %a0) nounwind {
define i16 @test_cvtss_sh(float %a0) nounwind {
; X32-LABEL: test_cvtss_sh:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X32-NEXT: vmovd %xmm0, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test_cvtss_sh:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X64-NEXT: vmovd %xmm0, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%ins0 = insertelement <4 x float> undef, float %a0, i32 0
%ins1 = insertelement <4 x float> %ins0, float 0.000000e+00, i32 1
@@ -65,12 +65,12 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
define <4 x float> @test_mm_cvtph_ps(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtph_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtph2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtph_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtph2ps %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -80,12 +80,12 @@ define <4 x float> @test_mm_cvtph_ps(<2 x i64> %a0) nounwind {
define <8 x float> @test_mm256_cvtph_ps(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm256_cvtph_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtph2ps %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtph_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtph2ps %xmm0, %ymm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -95,12 +95,12 @@ define <8 x float> @test_mm256_cvtph_ps(<2 x i64> %a0) nounwind {
define <2 x i64> @test_mm_cvtps_ph(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtps_ph:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtps_ph:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
; X64-NEXT: retq
%cvt = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
@@ -110,13 +110,13 @@ define <2 x i64> @test_mm_cvtps_ph(<4 x float> %a0) nounwind {
define <2 x i64> @test_mm256_cvtps_ph(<8 x float> %a0) nounwind {
; X32-LABEL: test_mm256_cvtps_ph:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cvtps_ph:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 712fe810d2a9..20ea67529a91 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -1,33 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X32-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -show-mc-encoding -disable-peephole | FileCheck %s --check-prefix=X64-AVX512VL
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_128:
-; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %xmm0
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_128:
-; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %xmm0
-; X64-NEXT: retq
+; X64: # %bb.0:
+; X64-NEXT: vcvtph2ps %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
+define <4 x float> @test_x86_vcvtph2ps_128_m(<8 x i16>* nocapture %a) {
+; X32-LABEL: test_x86_vcvtph2ps_128_m:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vcvtph2ps_128_m:
+; X64: # %bb.0:
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_128_m:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
+ %res = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %load) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
define <8 x float> @test_x86_vcvtph2ps_256(<8 x i16> %a0) {
; X32-LABEL: test_x86_vcvtph2ps_256:
-; X32: # BB#0:
-; X32-NEXT: vcvtph2ps %xmm0, %ymm0
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256:
-; X64: # BB#0:
-; X64-NEXT: vcvtph2ps %xmm0, %ymm0
-; X64-NEXT: retq
+; X64: # %bb.0:
+; X64-NEXT: vcvtph2ps %xmm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0xc0]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
@@ -35,30 +83,51 @@ declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
define <8 x float> @test_x86_vcvtph2ps_256_m(<8 x i16>* nocapture %a) nounwind {
; X32-LABEL: test_x86_vcvtph2ps_256_m:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %ymm0
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtph2ps_256_m:
-; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %ymm0
-; X64-NEXT: retq
- %load = load <8 x i16>, <8 x i16>* %a, align 16
+; X64: # %bb.0:
+; X64-NEXT: vcvtph2ps (%rdi), %ymm0 # encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtph2ps_256_m:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
+ %load = load <8 x i16>, <8 x i16>* %a
%res = tail call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %load)
ret <8 x float> %res
}
define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_128:
-; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128:
-; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
-; X64-NEXT: retq
+; X64: # %bb.0:
+; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -66,16 +135,28 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
; X32-LABEL: test_x86_vcvtps2ph_256:
-; X32: # BB#0:
-; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256:
-; X64: # BB#0:
-; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64: # %bb.0:
+; X64-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc0,0x00]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
@@ -83,15 +164,26 @@ declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar:
-; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64: # %bb.0:
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins1 = insertelement <2 x i64> undef, i64 %load, i32 0
%ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1
@@ -102,15 +194,26 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
; X32-LABEL: test_x86_vcvtps2ph_128_scalar2:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtph2ps (%eax), %xmm0
-; X32-NEXT: retl
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtph2ps (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_scalar2:
-; X64: # BB#0:
-; X64-NEXT: vcvtph2ps (%rdi), %xmm0
-; X64-NEXT: retq
+; X64: # %bb.0:
+; X64-NEXT: vcvtph2ps (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X32-AVX512VL: # %bb.0:
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtph2ps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_scalar2:
+; X64-AVX512VL: # %bb.0:
+; X64-AVX512VL-NEXT: vcvtph2ps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x13,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
%load = load i64, i64* %ptr
%ins = insertelement <2 x i64> undef, i64 %load, i32 0
%bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -120,17 +223,30 @@ define <4 x float> @test_x86_vcvtps2ph_128_scalar2(i64* %ptr) {
define void @test_x86_vcvtps2ph_256_m(<8 x i16>* nocapture %d, <8 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_256_m:
-; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax)
-; X32-NEXT: vzeroupper
-; X32-NEXT: retl
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %ymm0, (%eax) # encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_256_m:
-; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi)
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; X64: # %bb.0: # %entry
+; X64-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X32-AVX512VL: # %bb.0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_256_m:
+; X64-AVX512VL: # %bb.0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %ymm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a, i32 3)
store <8 x i16> %0, <8 x i16>* %d, align 16
@@ -139,15 +255,32 @@ entry:
define void @test_x86_vcvtps2ph_128_m(<4 x i16>* nocapture %d, <4 x float> %a) nounwind {
; X32-LABEL: test_x86_vcvtps2ph_128_m:
-; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m:
-; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64: # %bb.0: # %entry
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X32-AVX512VL: # %bb.0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X32-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X32-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-AVX512VL-NEXT: vpmovdw %xmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x00]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m:
+; X64-AVX512VL: # %bb.0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc0,0x03]
+; X64-AVX512VL-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0]
+; X64-AVX512VL-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX512VL-NEXT: vpmovdw %xmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x08,0x33,0x07]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a, i32 3)
%1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -157,15 +290,26 @@ entry:
define void @test_x86_vcvtps2ph_128_m2(double* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m2:
-; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m2:
-; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64: # %bb.0: # %entry
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X32-AVX512VL: # %bb.0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m2:
+; X64-AVX512VL: # %bb.0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x double>
@@ -176,15 +320,26 @@ entry:
define void @test_x86_vcvtps2ph_128_m3(i64* nocapture %hf4x16, <4 x float> %f4x32) #0 {
; X32-LABEL: test_x86_vcvtps2ph_128_m3:
-; X32: # BB#0: # %entry
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax)
-; X32-NEXT: retl
+; X32: # %bb.0: # %entry
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-NEXT: vcvtps2ph $3, %xmm0, (%eax) # encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_x86_vcvtps2ph_128_m3:
-; X64: # BB#0: # %entry
-; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi)
-; X64-NEXT: retq
+; X64: # %bb.0: # %entry
+; X64-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-NEXT: retq # encoding: [0xc3]
+;
+; X32-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X32-AVX512VL: # %bb.0: # %entry
+; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X32-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x00,0x03]
+; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
+;
+; X64-AVX512VL-LABEL: test_x86_vcvtps2ph_128_m3:
+; X64-AVX512VL: # %bb.0: # %entry
+; X64-AVX512VL-NEXT: vcvtps2ph $3, %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0x07,0x03]
+; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %f4x32, i32 3)
%1 = bitcast <8 x i16> %0 to <2 x i64>
diff --git a/test/CodeGen/X86/f16c-schedule.ll b/test/CodeGen/X86/f16c-schedule.ll
new file mode 100644
index 000000000000..0f03ed6d09f5
--- /dev/null
+++ b/test/CodeGen/X86/f16c-schedule.ll
@@ -0,0 +1,227 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+f16c | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=IVY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_vcvtph2ps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00]
+; GENERIC-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; IVY-LABEL: test_vcvtph2ps_128:
+; IVY: # %bb.0:
+; IVY-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [7:1.00]
+; IVY-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00]
+; IVY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vcvtph2ps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [2:1.00]
+; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vcvtph2ps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vcvtph2ps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [9:0.50]
+; SKYLAKE-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_vcvtph2ps_128:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_vcvtph2ps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [100:?]
+; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <8 x i16>, <8 x i16> *%a1
+ %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
+ %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
+ %4 = fadd <4 x float> %2, %3
+ ret <4 x float> %4
+}
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>)
+
+define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) {
+; GENERIC-LABEL: test_vcvtph2ps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00]
+; GENERIC-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; IVY-LABEL: test_vcvtph2ps_256:
+; IVY: # %bb.0:
+; IVY-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00]
+; IVY-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00]
+; IVY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vcvtph2ps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [2:1.00]
+; HASWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vcvtph2ps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vcvtph2ps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [10:0.50]
+; SKYLAKE-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_vcvtph2ps_256:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:2.00]
+; BTVER2-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_vcvtph2ps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [100:?]
+; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [100:?]
+; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <8 x i16>, <8 x i16> *%a1
+ %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
+ %3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
+ %4 = fadd <8 x float> %2, %3
+ ret <8 x float> %4
+}
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>)
+
+define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16> *%a2) {
+; GENERIC-LABEL: test_vcvtps2ph_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; IVY-LABEL: test_vcvtps2ph_128:
+; IVY: # %bb.0:
+; IVY-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
+; IVY-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [7:1.00]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vcvtps2ph_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [5:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vcvtps2ph_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vcvtps2ph_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [6:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_vcvtps2ph_128:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_vcvtps2ph_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
+ %2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0)
+ %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i16> %3, <4 x i16> *%a2
+ ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32)
+
+define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16> *%a2) {
+; GENERIC-LABEL: test_vcvtps2ph_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00]
+; GENERIC-NEXT: vzeroupper
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; IVY-LABEL: test_vcvtps2ph_256:
+; IVY: # %bb.0:
+; IVY-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
+; IVY-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00]
+; IVY-NEXT: vzeroupper
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vcvtps2ph_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [7:1.00]
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vcvtps2ph_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [4:1.00]
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vcvtps2ph_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00]
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_vcvtps2ph_256:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [6:2.00]
+; BTVER2-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [11:2.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_vcvtps2ph_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
+ %2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0)
+ store <8 x i16> %2, <8 x i16> *%a2
+ ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32)
diff --git a/test/CodeGen/X86/fadd-combines.ll b/test/CodeGen/X86/fadd-combines.ll
index 28f72f42d01d..ce7ee94e0fbd 100644
--- a/test/CodeGen/X86/fadd-combines.ll
+++ b/test/CodeGen/X86/fadd-combines.ll
@@ -3,7 +3,7 @@
define float @fadd_zero_f32(float %x) #0 {
; CHECK-LABEL: fadd_zero_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%y = fadd float %x, 0.0
ret float %y
@@ -11,7 +11,7 @@ define float @fadd_zero_f32(float %x) #0 {
define <4 x float> @fadd_zero_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_zero_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, zeroinitializer
ret <4 x float> %y
@@ -20,7 +20,7 @@ define <4 x float> @fadd_zero_4f32(<4 x float> %x) #0 {
; CHECK: float 3
define float @fadd_2const_f32(float %x) #0 {
; CHECK-LABEL: fadd_2const_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd float %x, 1.0
@@ -34,7 +34,7 @@ define float @fadd_2const_f32(float %x) #0 {
; CHECK: float 5
define <4 x float> @fadd_2const_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_2const_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
@@ -45,7 +45,7 @@ define <4 x float> @fadd_2const_4f32(<4 x float> %x) #0 {
; CHECK: float 3
define float @fadd_x_fmul_x_c_f32(float %x) #0 {
; CHECK-LABEL: fadd_x_fmul_x_c_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fmul float %x, 2.0
@@ -59,7 +59,7 @@ define float @fadd_x_fmul_x_c_f32(float %x) #0 {
; CHECK: float 5
define <4 x float> @fadd_x_fmul_x_c_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_x_fmul_x_c_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
@@ -70,7 +70,7 @@ define <4 x float> @fadd_x_fmul_x_c_4f32(<4 x float> %x) #0 {
; CHECK: float 3
define float @fadd_fmul_x_c_x_f32(float %x) #0 {
; CHECK-LABEL: fadd_fmul_x_c_x_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fmul float %x, 2.0
@@ -84,7 +84,7 @@ define float @fadd_fmul_x_c_x_f32(float %x) #0 {
; CHECK: float 5
define <4 x float> @fadd_fmul_x_c_x_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_fmul_x_c_x_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
@@ -95,7 +95,7 @@ define <4 x float> @fadd_fmul_x_c_x_4f32(<4 x float> %x) #0 {
; CHECK: float 4
define float @fadd_fadd_x_x_fmul_x_c_f32(float %x) #0 {
; CHECK-LABEL: fadd_fadd_x_x_fmul_x_c_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd float %x, %x
@@ -110,7 +110,7 @@ define float @fadd_fadd_x_x_fmul_x_c_f32(float %x) #0 {
; CHECK: float 6
define <4 x float> @fadd_fadd_x_x_fmul_x_c_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_fadd_x_x_fmul_x_c_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, %x
@@ -122,7 +122,7 @@ define <4 x float> @fadd_fadd_x_x_fmul_x_c_4f32(<4 x float> %x) #0 {
; CHECK: float 4
define float @fadd_fmul_x_c_fadd_x_x_f32(float %x) #0 {
; CHECK-LABEL: fadd_fmul_x_c_fadd_x_x_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd float %x, %x
@@ -137,7 +137,7 @@ define float @fadd_fmul_x_c_fadd_x_x_f32(float %x) #0 {
; CHECK: float 6
define <4 x float> @fadd_fmul_x_c_fadd_x_x_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_fmul_x_c_fadd_x_x_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, %x
@@ -149,7 +149,7 @@ define <4 x float> @fadd_fmul_x_c_fadd_x_x_4f32(<4 x float> %x) #0 {
; CHECK: float 3
define float @fadd_x_fadd_x_x_f32(float %x) #0 {
; CHECK-LABEL: fadd_x_fadd_x_x_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd float %x, %x
@@ -163,7 +163,7 @@ define float @fadd_x_fadd_x_x_f32(float %x) #0 {
; CHECK: float 3
define <4 x float> @fadd_x_fadd_x_x_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_x_fadd_x_x_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, %x
@@ -174,7 +174,7 @@ define <4 x float> @fadd_x_fadd_x_x_4f32(<4 x float> %x) #0 {
; CHECK: float 3
define float @fadd_fadd_x_x_x_f32(float %x) #0 {
; CHECK-LABEL: fadd_fadd_x_x_x_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd float %x, %x
@@ -188,7 +188,7 @@ define float @fadd_fadd_x_x_x_f32(float %x) #0 {
; CHECK: float 3
define <4 x float> @fadd_fadd_x_x_x_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_fadd_x_x_x_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, %x
@@ -199,7 +199,7 @@ define <4 x float> @fadd_fadd_x_x_x_4f32(<4 x float> %x) #0 {
; CHECK: float 4
define float @fadd_fadd_x_x_fadd_x_x_f32(float %x) #0 {
; CHECK-LABEL: fadd_fadd_x_x_fadd_x_x_f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulss {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd float %x, %x
@@ -213,7 +213,7 @@ define float @fadd_fadd_x_x_fadd_x_x_f32(float %x) #0 {
; CHECK: float 4
define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 {
; CHECK-LABEL: fadd_fadd_x_x_fadd_x_x_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%y = fadd <4 x float> %x, %x
diff --git a/test/CodeGen/X86/fast-cc-callee-pops.ll b/test/CodeGen/X86/fast-cc-callee-pops.ll
index 2c5b80ac4af0..a0ec6ce25d75 100644
--- a/test/CodeGen/X86/fast-cc-callee-pops.ll
+++ b/test/CodeGen/X86/fast-cc-callee-pops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel -mcpu=yonah | FileCheck %s
; Check that a fastcc function pops its stack variables before returning.
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index d9cfaa4c2656..436be5b7a736 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mcpu=generic -x86-asm-syntax=intel | FileCheck %s
; CHECK: add esp, 8
target triple = "i686-pc-linux-gnu"
diff --git a/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
index ac898e69dfe1..1fe2515bef88 100644
--- a/test/CodeGen/X86/fast-cc-pass-in-regs.ll
+++ b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | FileCheck %s
; check that fastcc is passing stuff in regs.
declare x86_fastcallcc i64 @callee(i64 inreg)
diff --git a/test/CodeGen/X86/fast-isel-agg-constant.ll b/test/CodeGen/X86/fast-isel-agg-constant.ll
index ce0dff75cf0e..d782ec4c51c2 100644
--- a/test/CodeGen/X86/fast-isel-agg-constant.ll
+++ b/test/CodeGen/X86/fast-isel-agg-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -O0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -O0 | FileCheck %s
; Make sure fast-isel doesn't screw up aggregate constants.
; (Failing out is okay, as long as we don't miscompile.)
diff --git a/test/CodeGen/X86/fast-isel-atomic.ll b/test/CodeGen/X86/fast-isel-atomic.ll
index 5f761ddb858f..b2b63434ca66 100644
--- a/test/CodeGen/X86/fast-isel-atomic.ll
+++ b/test/CodeGen/X86/fast-isel-atomic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=x86-64
+; RUN: llc < %s -O0 -mtriple=x86_64--
; rdar://8204072
; PR7652
diff --git a/test/CodeGen/X86/fast-isel-bail.ll b/test/CodeGen/X86/fast-isel-bail.ll
index a485827be96d..32a5ffdf5cc5 100644
--- a/test/CodeGen/X86/fast-isel-bail.ll
+++ b/test/CodeGen/X86/fast-isel-bail.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -O0
+; RUN: llc < %s -mtriple=i686-- -O0
; This file is for regression tests for cases where FastISel needs
; to gracefully bail out and let SelectionDAGISel take over.
diff --git a/test/CodeGen/X86/fast-isel-bc.ll b/test/CodeGen/X86/fast-isel-bc.ll
index 8ac15cdbc03f..fb3693d0b3fa 100644
--- a/test/CodeGen/X86/fast-isel-bc.ll
+++ b/test/CodeGen/X86/fast-isel-bc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+; RUN: llc < %s -O0 -mattr=+mmx,+sse2 | FileCheck %s
; PR4684
target datalayout =
diff --git a/test/CodeGen/X86/fast-isel-call-cleanup.ll b/test/CodeGen/X86/fast-isel-call-cleanup.ll
new file mode 100644
index 000000000000..724d53dbfe0e
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-call-cleanup.ll
@@ -0,0 +1,19 @@
+; RUN: llc -fast-isel -O0 -code-model=large -mcpu=generic -mtriple=x86_64-apple-darwin10 -relocation-model=pic < %s | FileCheck %s
+
+; Check that fast-isel cleans up when it fails to lower a call instruction.
+define void @fastiselcall() {
+entry:
+ %call = call i32 @targetfn(i32 42)
+ ret void
+; CHECK-LABEL: fastiselcall:
+; Local value area is still there:
+; CHECK: movl $42, {{%[a-z]+}}
+; Fast-ISel's arg mov is not here:
+; CHECK-NOT: movl $42, (%esp)
+; SDag-ISel's arg mov:
+; CHECK: movabsq $_targetfn, %[[REG:[^ ]*]]
+; CHECK: movl $42, %edi
+; CHECK: callq *%[[REG]]
+
+}
+declare i32 @targetfn(i32)
diff --git a/test/CodeGen/X86/fast-isel-call.ll b/test/CodeGen/X86/fast-isel-call.ll
index ee70404bcedf..3f394514e2ce 100644
--- a/test/CodeGen/X86/fast-isel-call.ll
+++ b/test/CodeGen/X86/fast-isel-call.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 -mtriple=i686-apple-darwin8 2>/dev/null | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort=1 -march=x86 -mtriple=i686-apple-darwin8 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
+; RUN: llc < %s -O0 -fast-isel-abort=1 -mtriple=i686-apple-darwin8 2>/dev/null | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort=1 -mtriple=i686-apple-darwin8 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
%struct.s = type {i32, i32, i32}
diff --git a/test/CodeGen/X86/fast-isel-cmp.ll b/test/CodeGen/X86/fast-isel-cmp.ll
index 59c536369849..355e6eb1b1e1 100644
--- a/test/CodeGen/X86/fast-isel-cmp.ll
+++ b/test/CodeGen/X86/fast-isel-cmp.ll
@@ -1,276 +1,394 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=ALL --check-prefix=SDAG
-; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=ALL --check-prefix=FAST --check-prefix=FAST_NOAVX
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -mattr=avx | FileCheck %s --check-prefix=ALL --check-prefix=FAST --check-prefix=FAST_AVX
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -mattr=avx512f | FileCheck %s --check-prefix=ALL --check-prefix=FAST --check-prefix=FAST_AVX
define zeroext i1 @fcmp_oeq(float %x, float %y) {
; SDAG-LABEL: fcmp_oeq:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpeqss %xmm1, %xmm0
; SDAG-NEXT: movd %xmm0, %eax
; SDAG-NEXT: andl $1, %eax
-; SDAG-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT: ## kill: def %al killed %al killed %eax
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_oeq:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: setnp %cl
-; FAST-NEXT: andb %al, %cl
-; FAST-NEXT: andb $1, %cl
-; FAST-NEXT: movzbl %cl, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_oeq:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: sete %al
+; FAST_NOAVX-NEXT: setnp %cl
+; FAST_NOAVX-NEXT: andb %al, %cl
+; FAST_NOAVX-NEXT: andb $1, %cl
+; FAST_NOAVX-NEXT: movzbl %cl, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_oeq:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: sete %al
+; FAST_AVX-NEXT: setnp %cl
+; FAST_AVX-NEXT: andb %al, %cl
+; FAST_AVX-NEXT: andb $1, %cl
+; FAST_AVX-NEXT: movzbl %cl, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp oeq float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ogt(float %x, float %y) {
; SDAG-LABEL: fcmp_ogt:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: seta %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ogt:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: seta %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ogt:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: seta %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ogt:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: seta %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ogt float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_oge(float %x, float %y) {
; SDAG-LABEL: fcmp_oge:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setae %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_oge:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setae %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_oge:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setae %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_oge:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setae %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp oge float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_olt(float %x, float %y) {
; SDAG-LABEL: fcmp_olt:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: seta %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_olt:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: seta %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_olt:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: seta %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_olt:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: seta %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp olt float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ole(float %x, float %y) {
; SDAG-LABEL: fcmp_ole:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: setae %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ole:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: setae %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ole:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: setae %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ole:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: setae %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ole float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_one(float %x, float %y) {
; SDAG-LABEL: fcmp_one:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setne %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_one:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_one:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setne %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_one:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setne %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp one float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ord(float %x, float %y) {
; SDAG-LABEL: fcmp_ord:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setnp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ord:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setnp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ord:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setnp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ord:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ord float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_uno(float %x, float %y) {
; SDAG-LABEL: fcmp_uno:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_uno:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_uno:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_uno:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp uno float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ueq(float %x, float %y) {
; SDAG-LABEL: fcmp_ueq:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: sete %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ueq:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ueq:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: sete %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ueq:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: sete %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ueq float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ugt(float %x, float %y) {
; SDAG-LABEL: fcmp_ugt:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: setb %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ugt:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: setb %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ugt:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: setb %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ugt:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: setb %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ugt float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_uge(float %x, float %y) {
; SDAG-LABEL: fcmp_uge:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: setbe %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_uge:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: setbe %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_uge:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: setbe %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_uge:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: setbe %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp uge float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ult(float %x, float %y) {
; SDAG-LABEL: fcmp_ult:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setb %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ult:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setb %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ult:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setb %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ult:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setb %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ult float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_ule(float %x, float %y) {
; SDAG-LABEL: fcmp_ule:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setbe %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ule:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setbe %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ule:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setbe %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ule:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setbe %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ule float %x, %y
ret i1 %1
}
define zeroext i1 @fcmp_une(float %x, float %y) {
; SDAG-LABEL: fcmp_une:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpneqss %xmm1, %xmm0
; SDAG-NEXT: movd %xmm0, %eax
; SDAG-NEXT: andl $1, %eax
-; SDAG-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT: ## kill: def %al killed %al killed %eax
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_une:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: setp %cl
-; FAST-NEXT: orb %al, %cl
-; FAST-NEXT: andb $1, %cl
-; FAST-NEXT: movzbl %cl, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_une:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setne %al
+; FAST_NOAVX-NEXT: setp %cl
+; FAST_NOAVX-NEXT: orb %al, %cl
+; FAST_NOAVX-NEXT: andb $1, %cl
+; FAST_NOAVX-NEXT: movzbl %cl, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_une:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setne %al
+; FAST_AVX-NEXT: setp %cl
+; FAST_AVX-NEXT: orb %al, %cl
+; FAST_AVX-NEXT: andb $1, %cl
+; FAST_AVX-NEXT: movzbl %cl, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp une float %x, %y
ret i1 %1
}
define zeroext i1 @icmp_eq(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_eq:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: sete %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_eq:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: sete %al
; FAST-NEXT: andb $1, %al
@@ -282,13 +400,13 @@ define zeroext i1 @icmp_eq(i32 %x, i32 %y) {
define zeroext i1 @icmp_ne(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_ne:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setne %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ne:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setne %al
; FAST-NEXT: andb $1, %al
@@ -300,13 +418,13 @@ define zeroext i1 @icmp_ne(i32 %x, i32 %y) {
define zeroext i1 @icmp_ugt(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_ugt:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: seta %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ugt:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: seta %al
; FAST-NEXT: andb $1, %al
@@ -318,13 +436,13 @@ define zeroext i1 @icmp_ugt(i32 %x, i32 %y) {
define zeroext i1 @icmp_uge(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_uge:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setae %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_uge:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setae %al
; FAST-NEXT: andb $1, %al
@@ -336,13 +454,13 @@ define zeroext i1 @icmp_uge(i32 %x, i32 %y) {
define zeroext i1 @icmp_ult(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_ult:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setb %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ult:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setb %al
; FAST-NEXT: andb $1, %al
@@ -354,13 +472,13 @@ define zeroext i1 @icmp_ult(i32 %x, i32 %y) {
define zeroext i1 @icmp_ule(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_ule:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setbe %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ule:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setbe %al
; FAST-NEXT: andb $1, %al
@@ -372,13 +490,13 @@ define zeroext i1 @icmp_ule(i32 %x, i32 %y) {
define zeroext i1 @icmp_sgt(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_sgt:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setg %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_sgt:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setg %al
; FAST-NEXT: andb $1, %al
@@ -390,13 +508,13 @@ define zeroext i1 @icmp_sgt(i32 %x, i32 %y) {
define zeroext i1 @icmp_sge(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_sge:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setge %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_sge:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setge %al
; FAST-NEXT: andb $1, %al
@@ -408,13 +526,13 @@ define zeroext i1 @icmp_sge(i32 %x, i32 %y) {
define zeroext i1 @icmp_slt(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_slt:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setl %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_slt:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setl %al
; FAST-NEXT: andb $1, %al
@@ -426,13 +544,13 @@ define zeroext i1 @icmp_slt(i32 %x, i32 %y) {
define zeroext i1 @icmp_sle(i32 %x, i32 %y) {
; SDAG-LABEL: icmp_sle:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: setle %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_sle:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: setle %al
; FAST-NEXT: andb $1, %al
@@ -445,54 +563,73 @@ define zeroext i1 @icmp_sle(i32 %x, i32 %y) {
; Test cmp folding and condition optimization.
define zeroext i1 @fcmp_oeq2(float %x) {
; SDAG-LABEL: fcmp_oeq2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setnp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_oeq2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setnp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_oeq2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setnp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_oeq2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp oeq float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_oeq3(float %x) {
; SDAG-LABEL: fcmp_oeq3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: cmpeqss %xmm0, %xmm1
; SDAG-NEXT: movd %xmm1, %eax
; SDAG-NEXT: andl $1, %eax
-; SDAG-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT: ## kill: def %al killed %al killed %eax
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_oeq3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: setnp %cl
-; FAST-NEXT: andb %al, %cl
-; FAST-NEXT: andb $1, %cl
-; FAST-NEXT: movzbl %cl, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_oeq3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: sete %al
+; FAST_NOAVX-NEXT: setnp %cl
+; FAST_NOAVX-NEXT: andb %al, %cl
+; FAST_NOAVX-NEXT: andb $1, %cl
+; FAST_NOAVX-NEXT: movzbl %cl, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_oeq3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: sete %al
+; FAST_AVX-NEXT: setnp %cl
+; FAST_AVX-NEXT: andb %al, %cl
+; FAST_AVX-NEXT: andb $1, %cl
+; FAST_AVX-NEXT: movzbl %cl, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp oeq float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ogt2(float %x) {
; SDAG-LABEL: fcmp_ogt2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: fcmp_ogt2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -503,70 +640,96 @@ define zeroext i1 @fcmp_ogt2(float %x) {
define zeroext i1 @fcmp_ogt3(float %x) {
; SDAG-LABEL: fcmp_ogt3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: seta %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ogt3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: seta %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ogt3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: seta %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ogt3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: seta %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ogt float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_oge2(float %x) {
; SDAG-LABEL: fcmp_oge2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setnp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_oge2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setnp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_oge2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setnp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_oge2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp oge float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_oge3(float %x) {
; SDAG-LABEL: fcmp_oge3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setae %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_oge3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setae %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_oge3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setae %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_oge3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setae %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp oge float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_olt2(float %x) {
; SDAG-LABEL: fcmp_olt2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: fcmp_olt2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -577,70 +740,96 @@ define zeroext i1 @fcmp_olt2(float %x) {
define zeroext i1 @fcmp_olt3(float %x) {
; SDAG-LABEL: fcmp_olt3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: seta %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_olt3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: seta %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_olt3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: seta %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_olt3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: seta %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp olt float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ole2(float %x) {
; SDAG-LABEL: fcmp_ole2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setnp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ole2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setnp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ole2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setnp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ole2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ole float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_ole3(float %x) {
; SDAG-LABEL: fcmp_ole3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: setae %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ole3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: setae %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ole3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: setae %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ole3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: setae %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ole float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_one2(float %x) {
; SDAG-LABEL: fcmp_one2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: fcmp_one2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -651,104 +840,145 @@ define zeroext i1 @fcmp_one2(float %x) {
define zeroext i1 @fcmp_one3(float %x) {
; SDAG-LABEL: fcmp_one3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setne %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_one3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_one3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setne %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_one3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setne %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp one float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ord2(float %x) {
; SDAG-LABEL: fcmp_ord2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setnp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ord2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setnp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ord2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setnp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ord2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ord float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_ord3(float %x) {
; SDAG-LABEL: fcmp_ord3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setnp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ord3:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setnp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ord3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setnp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ord3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ord float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_uno2(float %x) {
; SDAG-LABEL: fcmp_uno2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_uno2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_uno2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_uno2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp uno float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_uno3(float %x) {
; SDAG-LABEL: fcmp_uno3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_uno3:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_uno3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_uno3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp uno float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ueq2(float %x) {
; SDAG-LABEL: fcmp_ueq2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: fcmp_ueq2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -759,70 +989,96 @@ define zeroext i1 @fcmp_ueq2(float %x) {
define zeroext i1 @fcmp_ueq3(float %x) {
; SDAG-LABEL: fcmp_ueq3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: sete %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ueq3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: sete %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ueq3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: sete %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ueq3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: sete %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ueq float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ugt2(float %x) {
; SDAG-LABEL: fcmp_ugt2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ugt2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ugt2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ugt2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ugt float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_ugt3(float %x) {
; SDAG-LABEL: fcmp_ugt3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: setb %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ugt3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: setb %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ugt3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: setb %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ugt3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: setb %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ugt float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_uge2(float %x) {
; SDAG-LABEL: fcmp_uge2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: fcmp_uge2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -833,70 +1089,96 @@ define zeroext i1 @fcmp_uge2(float %x) {
define zeroext i1 @fcmp_uge3(float %x) {
; SDAG-LABEL: fcmp_uge3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm0, %xmm1
; SDAG-NEXT: setbe %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_uge3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm0, %xmm1
-; FAST-NEXT: setbe %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_uge3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm1
+; FAST_NOAVX-NEXT: setbe %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_uge3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm1
+; FAST_AVX-NEXT: setbe %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp uge float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ult2(float %x) {
; SDAG-LABEL: fcmp_ult2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ult2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ult2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ult2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ult float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_ult3(float %x) {
; SDAG-LABEL: fcmp_ult3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setb %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ult3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setb %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ult3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setb %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ult3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setb %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ult float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_ule2(float %x) {
; SDAG-LABEL: fcmp_ule2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: fcmp_ule2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -907,74 +1189,102 @@ define zeroext i1 @fcmp_ule2(float %x) {
define zeroext i1 @fcmp_ule3(float %x) {
; SDAG-LABEL: fcmp_ule3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: ucomiss %xmm1, %xmm0
; SDAG-NEXT: setbe %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_ule3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setbe %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_ule3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setbe %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_ule3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setbe %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp ule float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @fcmp_une2(float %x) {
; SDAG-LABEL: fcmp_une2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: ucomiss %xmm0, %xmm0
; SDAG-NEXT: setp %al
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_une2:
-; FAST: ## BB#0:
-; FAST-NEXT: ucomiss %xmm0, %xmm0
-; FAST-NEXT: setp %al
-; FAST-NEXT: andb $1, %al
-; FAST-NEXT: movzbl %al, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_une2:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: ucomiss %xmm0, %xmm0
+; FAST_NOAVX-NEXT: setp %al
+; FAST_NOAVX-NEXT: andb $1, %al
+; FAST_NOAVX-NEXT: movzbl %al, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_une2:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomiss %xmm0, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: andb $1, %al
+; FAST_AVX-NEXT: movzbl %al, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp une float %x, %x
ret i1 %1
}
define zeroext i1 @fcmp_une3(float %x) {
; SDAG-LABEL: fcmp_une3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorps %xmm1, %xmm1
; SDAG-NEXT: cmpneqss %xmm0, %xmm1
; SDAG-NEXT: movd %xmm1, %eax
; SDAG-NEXT: andl $1, %eax
-; SDAG-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT: ## kill: def %al killed %al killed %eax
; SDAG-NEXT: retq
;
-; FAST-LABEL: fcmp_une3:
-; FAST: ## BB#0:
-; FAST-NEXT: xorps %xmm1, %xmm1
-; FAST-NEXT: ucomiss %xmm1, %xmm0
-; FAST-NEXT: setne %al
-; FAST-NEXT: setp %cl
-; FAST-NEXT: orb %al, %cl
-; FAST-NEXT: andb $1, %cl
-; FAST-NEXT: movzbl %cl, %eax
-; FAST-NEXT: retq
+; FAST_NOAVX-LABEL: fcmp_une3:
+; FAST_NOAVX: ## %bb.0:
+; FAST_NOAVX-NEXT: xorps %xmm1, %xmm1
+; FAST_NOAVX-NEXT: ucomiss %xmm1, %xmm0
+; FAST_NOAVX-NEXT: setne %al
+; FAST_NOAVX-NEXT: setp %cl
+; FAST_NOAVX-NEXT: orb %al, %cl
+; FAST_NOAVX-NEXT: andb $1, %cl
+; FAST_NOAVX-NEXT: movzbl %cl, %eax
+; FAST_NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: fcmp_une3:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; FAST_AVX-NEXT: vucomiss %xmm1, %xmm0
+; FAST_AVX-NEXT: setne %al
+; FAST_AVX-NEXT: setp %cl
+; FAST_AVX-NEXT: orb %al, %cl
+; FAST_AVX-NEXT: andb $1, %cl
+; FAST_AVX-NEXT: movzbl %cl, %eax
+; FAST_AVX-NEXT: retq
%1 = fcmp une float %x, 0.000000e+00
ret i1 %1
}
define zeroext i1 @icmp_eq2(i32 %x) {
; SDAG-LABEL: icmp_eq2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_eq2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -985,12 +1295,12 @@ define zeroext i1 @icmp_eq2(i32 %x) {
define zeroext i1 @icmp_ne2(i32 %x) {
; SDAG-LABEL: icmp_ne2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ne2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1001,12 +1311,12 @@ define zeroext i1 @icmp_ne2(i32 %x) {
define zeroext i1 @icmp_ugt2(i32 %x) {
; SDAG-LABEL: icmp_ugt2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ugt2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1017,12 +1327,12 @@ define zeroext i1 @icmp_ugt2(i32 %x) {
define zeroext i1 @icmp_uge2(i32 %x) {
; SDAG-LABEL: icmp_uge2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_uge2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1033,12 +1343,12 @@ define zeroext i1 @icmp_uge2(i32 %x) {
define zeroext i1 @icmp_ult2(i32 %x) {
; SDAG-LABEL: icmp_ult2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ult2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1049,12 +1359,12 @@ define zeroext i1 @icmp_ult2(i32 %x) {
define zeroext i1 @icmp_ule2(i32 %x) {
; SDAG-LABEL: icmp_ule2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_ule2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1065,12 +1375,12 @@ define zeroext i1 @icmp_ule2(i32 %x) {
define zeroext i1 @icmp_sgt2(i32 %x) {
; SDAG-LABEL: icmp_sgt2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_sgt2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1081,12 +1391,12 @@ define zeroext i1 @icmp_sgt2(i32 %x) {
define zeroext i1 @icmp_sge2(i32 %x) {
; SDAG-LABEL: icmp_sge2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_sge2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1097,12 +1407,12 @@ define zeroext i1 @icmp_sge2(i32 %x) {
define zeroext i1 @icmp_slt2(i32 %x) {
; SDAG-LABEL: icmp_slt2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %eax, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_slt2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %eax, %eax
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1113,12 +1423,12 @@ define zeroext i1 @icmp_slt2(i32 %x) {
define zeroext i1 @icmp_sle2(i32 %x) {
; SDAG-LABEL: icmp_sle2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
;
; FAST-LABEL: icmp_sle2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
diff --git a/test/CodeGen/X86/fast-isel-constant.ll b/test/CodeGen/X86/fast-isel-constant.ll
index 6f9240ac4700..2c7cbc64da01 100644
--- a/test/CodeGen/X86/fast-isel-constant.ll
+++ b/test/CodeGen/X86/fast-isel-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -O0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -O0 | FileCheck %s
; Make sure fast-isel doesn't reset the materialised constant map
; across an intrinsic call.
diff --git a/test/CodeGen/X86/fast-isel-constpool.ll b/test/CodeGen/X86/fast-isel-constpool.ll
index 4e6f7c0f9e8e..4b8f387571e9 100644
--- a/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/test/CodeGen/X86/fast-isel-constpool.ll
@@ -1,23 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small < %s | FileCheck %s
; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large < %s | FileCheck %s --check-prefix=LARGE
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small -mattr=avx < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large -mattr=avx < %s | FileCheck %s --check-prefix=LARGE_AVX
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=small -mattr=avx512f < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-apple-darwin -fast-isel -code-model=large -mattr=avx512f < %s | FileCheck %s --check-prefix=LARGE_AVX
; Make sure fast isel uses rip-relative addressing for the small code model.
define float @constpool_float(float %x) {
-; CHECK-LABEL: constpool_float
-; CHECK: LCPI0_0(%rip)
+; CHECK-LABEL: constpool_float:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: addss %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; LARGE-LABEL: constpool_float:
+; LARGE: ## %bb.0:
+; LARGE-NEXT: movabsq $LCPI0_0, %rax
+; LARGE-NEXT: addss (%rax), %xmm0
+; LARGE-NEXT: retq
+;
+; AVX-LABEL: constpool_float:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; LARGE_AVX-LABEL: constpool_float:
+; LARGE_AVX: ## %bb.0:
+; LARGE_AVX-NEXT: movabsq $LCPI0_0, %rax
+; LARGE_AVX-NEXT: vaddss (%rax), %xmm0, %xmm0
+; LARGE_AVX-NEXT: retq
-; LARGE-LABEL: constpool_float
-; LARGE: movabsq $LCPI0_0, %rax
%1 = fadd float %x, 16.50e+01
ret float %1
}
define double @constpool_double(double %x) nounwind {
-; CHECK-LABEL: constpool_double
-; CHECK: LCPI1_0(%rip)
+; CHECK-LABEL: constpool_double:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: addsd %xmm1, %xmm0
+; CHECK-NEXT: retq
+;
+; LARGE-LABEL: constpool_double:
+; LARGE: ## %bb.0:
+; LARGE-NEXT: movabsq $LCPI1_0, %rax
+; LARGE-NEXT: addsd (%rax), %xmm0
+; LARGE-NEXT: retq
+;
+; AVX-LABEL: constpool_double:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; LARGE_AVX-LABEL: constpool_double:
+; LARGE_AVX: ## %bb.0:
+; LARGE_AVX-NEXT: movabsq $LCPI1_0, %rax
+; LARGE_AVX-NEXT: vaddsd (%rax), %xmm0, %xmm0
+; LARGE_AVX-NEXT: retq
-; LARGE-LABEL: constpool_double
-; LARGE: movabsq $LCPI1_0, %rax
%1 = fadd double %x, 8.500000e-01
ret double %1
}
diff --git a/test/CodeGen/X86/fast-isel-emutls.ll b/test/CodeGen/X86/fast-isel-emutls.ll
index cb8012c0fa39..0a7f5d451ee7 100644
--- a/test/CodeGen/X86/fast-isel-emutls.ll
+++ b/test/CodeGen/X86/fast-isel-emutls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -emulated-tls -march=x86 -relocation-model=pic -mtriple=i686-unknown-linux-gnu -fast-isel | FileCheck %s
+; RUN: llc < %s -emulated-tls -relocation-model=pic -mtriple=i686-unknown-linux-gnu -fast-isel | FileCheck %s
; PR3654
@v = thread_local global i32 0
diff --git a/test/CodeGen/X86/fast-isel-expect.ll b/test/CodeGen/X86/fast-isel-expect.ll
index c4be7f364f30..ce86041cecbd 100644
--- a/test/CodeGen/X86/fast-isel-expect.ll
+++ b/test/CodeGen/X86/fast-isel-expect.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -O0 -march=x86 | FileCheck %s
+; RUN: llc < %s -O0 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "i686-unknown-linux-gnu"
@glbl = extern_weak constant i8
diff --git a/test/CodeGen/X86/fast-isel-fneg.ll b/test/CodeGen/X86/fast-isel-fneg.ll
index e3bc7faae3ce..99c728a7366c 100644
--- a/test/CodeGen/X86/fast-isel-fneg.ll
+++ b/test/CodeGen/X86/fast-isel-fneg.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -fast-isel -march=x86 -mattr=+sse2 | FileCheck --check-prefix=SSE2 %s
+; RUN: llc < %s -fast-isel -mtriple=i686-- -mattr=+sse2 | FileCheck --check-prefix=SSE2 %s
; SSE2: xor
; SSE2: xor
diff --git a/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll b/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
index e4e9aeaa262e..af4a9da9c2aa 100644
--- a/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
+++ b/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll
@@ -1,5 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=ALL --check-prefix=AVX
;
; Verify that fast-isel doesn't select legacy SSE instructions on targets that
; feature AVX.
@@ -21,31 +23,64 @@
; ///
define double @single_to_double_rr(float %x) {
-; ALL-LABEL: single_to_double_rr:
-; SSE-NOT: vcvtss2sd
-; AVX: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL: ret
+; SSE-LABEL: single_to_double_rr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtss2sd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: single_to_double_rr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%conv = fpext float %x to double
ret double %conv
}
define float @double_to_single_rr(double %x) {
-; ALL-LABEL: double_to_single_rr:
-; SSE-NOT: vcvtsd2ss
-; AVX: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL: ret
+; SSE-LABEL: double_to_single_rr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtsd2ss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: double_to_single_rr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%conv = fptrunc double %x to float
ret float %conv
}
define double @single_to_double_rm(float* %x) {
-; ALL-LABEL: single_to_double_rm:
-; SSE: cvtss2sd (%rdi), %xmm0
-; AVX: vmovss (%rdi), %xmm0
-; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: ret
+; SSE-LABEL: single_to_double_rm:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: cvtss2sd %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: single_to_double_rm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = load float, float* %x, align 4
+ %conv = fpext float %0 to double
+ ret double %conv
+}
+
+define double @single_to_double_rm_optsize(float* %x) optsize {
+; SSE-LABEL: single_to_double_rm_optsize:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtss2sd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: single_to_double_rm_optsize:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%0 = load float, float* %x, align 4
%conv = fpext float %0 to double
@@ -53,11 +88,34 @@ entry:
}
define float @double_to_single_rm(double* %x) {
-; ALL-LABEL: double_to_single_rm:
-; SSE: cvtsd2ss (%rdi), %xmm0
-; AVX: vmovsd (%rdi), %xmm0
-; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT: ret
+; SSE-LABEL: double_to_single_rm:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: cvtsd2ss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: double_to_single_rm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = load double, double* %x, align 8
+ %conv = fptrunc double %0 to float
+ ret float %conv
+}
+
+define float @double_to_single_rm_optsize(double* %x) optsize {
+; SSE-LABEL: double_to_single_rm_optsize:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtsd2ss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: double_to_single_rm_optsize:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
entry:
%0 = load double, double* %x, align 8
%conv = fptrunc double %0 to float
diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll
index 1886d3379aad..88a22ca899d7 100644
--- a/test/CodeGen/X86/fast-isel-gep.ll
+++ b/test/CodeGen/X86/fast-isel-gep.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-linux -O0 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-windows-itanium -O0 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -march=x86 -O0 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i686-- -O0 | FileCheck %s --check-prefix=X32
; GEP indices are interpreted as signed integers, so they
; should be sign-extended to 64 bits on 64-bit targets.
diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll b/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
new file mode 100644
index 000000000000..509a5cfe9316
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+sse2 -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+avx -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+
+define double @long_to_double_rr(i64 %a) {
+; SSE2-LABEL: long_to_double_rr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2sdq %rdi, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: long_to_double_rr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = sitofp i64 %a to double
+ ret double %0
+}
+
+define double @long_to_double_rm(i64* %a) {
+; SSE2-LABEL: long_to_double_rm:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: long_to_double_rm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = load i64, i64* %a
+ %1 = sitofp i64 %0 to double
+ ret double %1
+}
+
+define double @long_to_double_rm_optsize(i64* %a) optsize {
+; SSE2-LABEL: long_to_double_rm_optsize:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2sdq (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: long_to_double_rm_optsize:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = load i64, i64* %a
+ %1 = sitofp i64 %0 to double
+ ret double %1
+}
+
+define float @long_to_float_rr(i64 %a) {
+; SSE2-LABEL: long_to_float_rr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2ssq %rdi, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: long_to_float_rr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = sitofp i64 %a to float
+ ret float %0
+}
+
+define float @long_to_float_rm(i64* %a) {
+; SSE2-LABEL: long_to_float_rm:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: long_to_float_rm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = load i64, i64* %a
+ %1 = sitofp i64 %0 to float
+ ret float %1
+}
+
+define float @long_to_float_rm_optsize(i64* %a) optsize {
+; SSE2-LABEL: long_to_float_rm_optsize:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2ssq (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: long_to_float_rm_optsize:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+entry:
+ %0 = load i64, i64* %a
+ %1 = sitofp i64 %0 to float
+ ret float %1
+}
diff --git a/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
index afa6ee9aa7a2..4465d3463cca 100644
--- a/test/CodeGen/X86/fast-isel-int-float-conversion.ll
+++ b/test/CodeGen/X86/fast-isel-int-float-conversion.ll
@@ -1,22 +1,153 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+sse2 -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE2
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+avx -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+sse2 -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=SSE2
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+avx -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse2 -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=SSE2_X86
+; RUN: llc -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+avx -fast-isel --fast-isel-abort=1 < %s | FileCheck %s --check-prefix=AVX_X86
define double @int_to_double_rr(i32 %a) {
-; ALL-LABEL: int_to_double_rr:
-; SSE2: cvtsi2sdl %edi, %xmm0
-; AVX: vcvtsi2sdl %edi, %xmm0, %xmm0
-; ALL-NEXT: ret
+; SSE2-LABEL: int_to_double_rr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2sdl %edi, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: int_to_double_rr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; SSE2_X86-LABEL: int_to_double_rr:
+; SSE2_X86: # %bb.0: # %entry
+; SSE2_X86-NEXT: pushl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 8
+; SSE2_X86-NEXT: .cfi_offset %ebp, -8
+; SSE2_X86-NEXT: movl %esp, %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa_register %ebp
+; SSE2_X86-NEXT: andl $-8, %esp
+; SSE2_X86-NEXT: subl $8, %esp
+; SSE2_X86-NEXT: movl 8(%ebp), %eax
+; SSE2_X86-NEXT: cvtsi2sdl %eax, %xmm0
+; SSE2_X86-NEXT: movsd %xmm0, (%esp)
+; SSE2_X86-NEXT: fldl (%esp)
+; SSE2_X86-NEXT: movl %ebp, %esp
+; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: retl
+;
+; AVX_X86-LABEL: int_to_double_rr:
+; AVX_X86: # %bb.0: # %entry
+; AVX_X86-NEXT: pushl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa_offset 8
+; AVX_X86-NEXT: .cfi_offset %ebp, -8
+; AVX_X86-NEXT: movl %esp, %ebp
+; AVX_X86-NEXT: .cfi_def_cfa_register %ebp
+; AVX_X86-NEXT: andl $-8, %esp
+; AVX_X86-NEXT: subl $8, %esp
+; AVX_X86-NEXT: vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX_X86-NEXT: vmovsd %xmm0, (%esp)
+; AVX_X86-NEXT: fldl (%esp)
+; AVX_X86-NEXT: movl %ebp, %esp
+; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to double
ret double %0
}
define double @int_to_double_rm(i32* %a) {
-; ALL-LABEL: int_to_double_rm:
-; SSE2: cvtsi2sdl (%rdi), %xmm0
-; AVX: vcvtsi2sdl (%rdi), %xmm0, %xmm0
-; ALL-NEXT: ret
+; SSE2-LABEL: int_to_double_rm:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movl (%rdi), %eax
+; SSE2-NEXT: cvtsi2sdl %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: int_to_double_rm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; SSE2_X86-LABEL: int_to_double_rm:
+; SSE2_X86: # %bb.0: # %entry
+; SSE2_X86-NEXT: pushl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 8
+; SSE2_X86-NEXT: .cfi_offset %ebp, -8
+; SSE2_X86-NEXT: movl %esp, %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa_register %ebp
+; SSE2_X86-NEXT: andl $-8, %esp
+; SSE2_X86-NEXT: subl $8, %esp
+; SSE2_X86-NEXT: movl 8(%ebp), %eax
+; SSE2_X86-NEXT: cvtsi2sdl (%eax), %xmm0
+; SSE2_X86-NEXT: movsd %xmm0, (%esp)
+; SSE2_X86-NEXT: fldl (%esp)
+; SSE2_X86-NEXT: movl %ebp, %esp
+; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: retl
+;
+; AVX_X86-LABEL: int_to_double_rm:
+; AVX_X86: # %bb.0: # %entry
+; AVX_X86-NEXT: pushl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa_offset 8
+; AVX_X86-NEXT: .cfi_offset %ebp, -8
+; AVX_X86-NEXT: movl %esp, %ebp
+; AVX_X86-NEXT: .cfi_def_cfa_register %ebp
+; AVX_X86-NEXT: andl $-8, %esp
+; AVX_X86-NEXT: subl $8, %esp
+; AVX_X86-NEXT: movl 8(%ebp), %eax
+; AVX_X86-NEXT: vcvtsi2sdl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT: vmovsd %xmm0, (%esp)
+; AVX_X86-NEXT: fldl (%esp)
+; AVX_X86-NEXT: movl %ebp, %esp
+; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: retl
+entry:
+ %0 = load i32, i32* %a
+ %1 = sitofp i32 %0 to double
+ ret double %1
+}
+
+define double @int_to_double_rm_optsize(i32* %a) optsize {
+; SSE2-LABEL: int_to_double_rm_optsize:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2sdl (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: int_to_double_rm_optsize:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; SSE2_X86-LABEL: int_to_double_rm_optsize:
+; SSE2_X86: # %bb.0: # %entry
+; SSE2_X86-NEXT: pushl %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 8
+; SSE2_X86-NEXT: .cfi_offset %ebp, -8
+; SSE2_X86-NEXT: movl %esp, %ebp
+; SSE2_X86-NEXT: .cfi_def_cfa_register %ebp
+; SSE2_X86-NEXT: andl $-8, %esp
+; SSE2_X86-NEXT: subl $8, %esp
+; SSE2_X86-NEXT: movl 8(%ebp), %eax
+; SSE2_X86-NEXT: cvtsi2sdl (%eax), %xmm0
+; SSE2_X86-NEXT: movsd %xmm0, (%esp)
+; SSE2_X86-NEXT: fldl (%esp)
+; SSE2_X86-NEXT: movl %ebp, %esp
+; SSE2_X86-NEXT: popl %ebp
+; SSE2_X86-NEXT: retl
+;
+; AVX_X86-LABEL: int_to_double_rm_optsize:
+; AVX_X86: # %bb.0: # %entry
+; AVX_X86-NEXT: pushl %ebp
+; AVX_X86-NEXT: .cfi_def_cfa_offset 8
+; AVX_X86-NEXT: .cfi_offset %ebp, -8
+; AVX_X86-NEXT: movl %esp, %ebp
+; AVX_X86-NEXT: .cfi_def_cfa_register %ebp
+; AVX_X86-NEXT: andl $-8, %esp
+; AVX_X86-NEXT: subl $8, %esp
+; AVX_X86-NEXT: movl 8(%ebp), %eax
+; AVX_X86-NEXT: vcvtsi2sdl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT: vmovsd %xmm0, (%esp)
+; AVX_X86-NEXT: fldl (%esp)
+; AVX_X86-NEXT: movl %ebp, %esp
+; AVX_X86-NEXT: popl %ebp
+; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
%1 = sitofp i32 %0 to double
@@ -24,20 +155,112 @@ entry:
}
define float @int_to_float_rr(i32 %a) {
-; ALL-LABEL: int_to_float_rr:
-; SSE2: cvtsi2ssl %edi, %xmm0
-; AVX: vcvtsi2ssl %edi, %xmm0, %xmm0
-; ALL-NEXT: ret
+; SSE2-LABEL: int_to_float_rr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2ssl %edi, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: int_to_float_rr:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; SSE2_X86-LABEL: int_to_float_rr:
+; SSE2_X86: # %bb.0: # %entry
+; SSE2_X86-NEXT: pushl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 8
+; SSE2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE2_X86-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE2_X86-NEXT: movss %xmm0, (%esp)
+; SSE2_X86-NEXT: flds (%esp)
+; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: retl
+;
+; AVX_X86-LABEL: int_to_float_rr:
+; AVX_X86: # %bb.0: # %entry
+; AVX_X86-NEXT: pushl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 8
+; AVX_X86-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX_X86-NEXT: vmovss %xmm0, (%esp)
+; AVX_X86-NEXT: flds (%esp)
+; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: retl
entry:
%0 = sitofp i32 %a to float
ret float %0
}
define float @int_to_float_rm(i32* %a) {
-; ALL-LABEL: int_to_float_rm:
-; SSE2: cvtsi2ssl (%rdi), %xmm0
-; AVX: vcvtsi2ssl (%rdi), %xmm0, %xmm0
-; ALL-NEXT: ret
+; SSE2-LABEL: int_to_float_rm:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movl (%rdi), %eax
+; SSE2-NEXT: cvtsi2ssl %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: int_to_float_rm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; SSE2_X86-LABEL: int_to_float_rm:
+; SSE2_X86: # %bb.0: # %entry
+; SSE2_X86-NEXT: pushl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 8
+; SSE2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE2_X86-NEXT: cvtsi2ssl (%eax), %xmm0
+; SSE2_X86-NEXT: movss %xmm0, (%esp)
+; SSE2_X86-NEXT: flds (%esp)
+; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: retl
+;
+; AVX_X86-LABEL: int_to_float_rm:
+; AVX_X86: # %bb.0: # %entry
+; AVX_X86-NEXT: pushl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 8
+; AVX_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX_X86-NEXT: vcvtsi2ssl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT: vmovss %xmm0, (%esp)
+; AVX_X86-NEXT: flds (%esp)
+; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: retl
+entry:
+ %0 = load i32, i32* %a
+ %1 = sitofp i32 %0 to float
+ ret float %1
+}
+
+define float @int_to_float_rm_optsize(i32* %a) optsize {
+; SSE2-LABEL: int_to_float_rm_optsize:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: cvtsi2ssl (%rdi), %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: int_to_float_rm_optsize:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; SSE2_X86-LABEL: int_to_float_rm_optsize:
+; SSE2_X86: # %bb.0: # %entry
+; SSE2_X86-NEXT: pushl %eax
+; SSE2_X86-NEXT: .cfi_def_cfa_offset 8
+; SSE2_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE2_X86-NEXT: cvtsi2ssl (%eax), %xmm0
+; SSE2_X86-NEXT: movss %xmm0, (%esp)
+; SSE2_X86-NEXT: flds (%esp)
+; SSE2_X86-NEXT: popl %eax
+; SSE2_X86-NEXT: retl
+;
+; AVX_X86-LABEL: int_to_float_rm_optsize:
+; AVX_X86: # %bb.0: # %entry
+; AVX_X86-NEXT: pushl %eax
+; AVX_X86-NEXT: .cfi_def_cfa_offset 8
+; AVX_X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX_X86-NEXT: vcvtsi2ssl (%eax), %xmm0, %xmm0
+; AVX_X86-NEXT: vmovss %xmm0, (%esp)
+; AVX_X86-NEXT: flds (%esp)
+; AVX_X86-NEXT: popl %eax
+; AVX_X86-NEXT: retl
entry:
%0 = load i32, i32* %a
%1 = sitofp i32 %0 to float
diff --git a/test/CodeGen/X86/fast-isel-load-i1.ll b/test/CodeGen/X86/fast-isel-load-i1.ll
index f515d38cbb95..814c8649ca90 100644
--- a/test/CodeGen/X86/fast-isel-load-i1.ll
+++ b/test/CodeGen/X86/fast-isel-load-i1.ll
@@ -3,10 +3,10 @@
define i1 @test_i1(i1* %b) {
; CHECK-LABEL: test_i1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testb $1, (%rdi)
; CHECK-NEXT: je .LBB0_2
-; CHECK-NEXT: # BB#1: # %in
+; CHECK-NEXT: # %bb.1: # %in
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB0_2: # %out
diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll
index 33d001cdc216..79e96308a299 100644
--- a/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse4.1 -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512vl -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512bw -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -13,7 +14,7 @@
define void @test_nti32(i32* nocapture %ptr, i32 %X) {
; ALL-LABEL: test_nti32:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: movntil %esi, (%rdi)
; ALL-NEXT: retq
entry:
@@ -23,7 +24,7 @@ entry:
define void @test_nti64(i64* nocapture %ptr, i64 %X) {
; ALL-LABEL: test_nti64:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: movntiq %rsi, (%rdi)
; ALL-NEXT: retq
entry:
@@ -33,27 +34,27 @@ entry:
define void @test_ntfloat(float* nocapture %ptr, float %X) {
; SSE2-LABEL: test_ntfloat:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_ntfloat:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_ntfloat:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movss %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_ntfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_ntfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovss %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -63,27 +64,27 @@ entry:
define void @test_ntdouble(double* nocapture %ptr, double %X) {
; SSE2-LABEL: test_ntdouble:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_ntdouble:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_ntdouble:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movsd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_ntdouble:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_ntdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -97,7 +98,7 @@ entry:
define void @test_mmx(x86_mmx* nocapture %a0, x86_mmx* nocapture %a1) {
; ALL-LABEL: test_mmx:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: movq (%rdi), %mm0
; ALL-NEXT: psrlq $3, %mm0
; ALL-NEXT: movntq %mm0, (%rsi)
@@ -116,17 +117,17 @@ declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
define void @test_nt4xfloat(<4 x float>* nocapture %ptr, <4 x float> %X) {
; SSE-LABEL: test_nt4xfloat:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt4xfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt4xfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -136,17 +137,17 @@ entry:
define void @test_nt2xdouble(<2 x double>* nocapture %ptr, <2 x double> %X) {
; SSE-LABEL: test_nt2xdouble:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt2xdouble:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt2xdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntpd %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -156,17 +157,17 @@ entry:
define void @test_nt16xi8(<16 x i8>* nocapture %ptr, <16 x i8> %X) {
; SSE-LABEL: test_nt16xi8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt16xi8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt16xi8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -176,17 +177,17 @@ entry:
define void @test_nt8xi16(<8 x i16>* nocapture %ptr, <8 x i16> %X) {
; SSE-LABEL: test_nt8xi16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt8xi16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt8xi16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -196,17 +197,17 @@ entry:
define void @test_nt4xi32(<4 x i32>* nocapture %ptr, <4 x i32> %X) {
; SSE-LABEL: test_nt4xi32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt4xi32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt4xi32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -216,17 +217,17 @@ entry:
define void @test_nt2xi64(<2 x i64>* nocapture %ptr, <2 x i64> %X) {
; SSE-LABEL: test_nt2xi64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt2xi64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt2xi64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %xmm0, (%rdi)
; AVX512-NEXT: retq
entry:
@@ -240,27 +241,27 @@ entry:
define <4 x float> @test_load_nt4xfloat(<4 x float>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt4xfloat:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt4xfloat:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt4xfloat:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_load_nt4xfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_load_nt4xfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -270,27 +271,27 @@ entry:
define <2 x double> @test_load_nt2xdouble(<2 x double>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt2xdouble:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movapd (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt2xdouble:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movapd (%rdi), %xmm0
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt2xdouble:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_load_nt2xdouble:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_load_nt2xdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -300,17 +301,17 @@ entry:
define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) {
; SSE-LABEL: test_load_nt16xi8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_load_nt16xi8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_load_nt16xi8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -320,17 +321,17 @@ entry:
define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) {
; SSE-LABEL: test_load_nt8xi16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_load_nt8xi16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_load_nt8xi16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -340,17 +341,17 @@ entry:
define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) {
; SSE-LABEL: test_load_nt4xi32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_load_nt4xi32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_load_nt4xi32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -360,17 +361,17 @@ entry:
define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) {
; SSE-LABEL: test_load_nt2xi64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_load_nt2xi64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_load_nt2xi64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -384,19 +385,19 @@ entry:
define void @test_nt8xfloat(<8 x float>* nocapture %ptr, <8 x float> %X) {
; SSE-LABEL: test_nt8xfloat:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt8xfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt8xfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -407,19 +408,19 @@ entry:
define void @test_nt4xdouble(<4 x double>* nocapture %ptr, <4 x double> %X) {
; SSE-LABEL: test_nt4xdouble:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: movntpd %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt4xdouble:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntpd %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt4xdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntpd %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -430,19 +431,19 @@ entry:
define void @test_nt32xi8(<32 x i8>* nocapture %ptr, <32 x i8> %X) {
; SSE-LABEL: test_nt32xi8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt32xi8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt32xi8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -453,19 +454,19 @@ entry:
define void @test_nt16xi16(<16 x i16>* nocapture %ptr, <16 x i16> %X) {
; SSE-LABEL: test_nt16xi16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt16xi16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt16xi16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -476,19 +477,19 @@ entry:
define void @test_nt8xi32(<8 x i32>* nocapture %ptr, <8 x i32> %X) {
; SSE-LABEL: test_nt8xi32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt8xi32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt8xi32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -499,19 +500,19 @@ entry:
define void @test_nt4xi64(<4 x i64>* nocapture %ptr, <4 x i64> %X) {
; SSE-LABEL: test_nt4xi64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt4xi64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt4xi64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -526,39 +527,39 @@ entry:
define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt8xfloat:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt8xfloat:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt8xfloat:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt8xfloat:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xfloat:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt8xfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -568,39 +569,39 @@ entry:
define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt4xdouble:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movapd (%rdi), %xmm0
; SSE2-NEXT: movapd 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt4xdouble:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movapd (%rdi), %xmm0
; SSE4A-NEXT: movapd 16(%rdi), %xmm1
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt4xdouble:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt4xdouble:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt4xdouble:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt4xdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -610,39 +611,39 @@ entry:
define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt32xi8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt32xi8:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt32xi8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt32xi8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt32xi8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt32xi8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -652,39 +653,39 @@ entry:
define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt16xi16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt16xi16:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt16xi16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt16xi16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt16xi16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt16xi16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -694,39 +695,39 @@ entry:
define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt8xi32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt8xi32:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt8xi32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt8xi32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xi32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt8xi32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -736,39 +737,39 @@ entry:
define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt4xi64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt4xi64:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt4xi64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt4xi64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt4xi64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt4xi64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -782,7 +783,7 @@ entry:
define void @test_nt16xfloat(<16 x float>* nocapture %ptr, <16 x float> %X) {
; SSE-LABEL: test_nt16xfloat:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm2, 32(%rdi)
@@ -790,14 +791,14 @@ define void @test_nt16xfloat(<16 x float>* nocapture %ptr, <16 x float> %X) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt16xfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vmovntps %ymm1, 32(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt16xfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntps %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -808,7 +809,7 @@ entry:
define void @test_nt8xdouble(<8 x double>* nocapture %ptr, <8 x double> %X) {
; SSE-LABEL: test_nt8xdouble:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: movntpd %xmm1, 16(%rdi)
; SSE-NEXT: movntpd %xmm2, 32(%rdi)
@@ -816,14 +817,14 @@ define void @test_nt8xdouble(<8 x double>* nocapture %ptr, <8 x double> %X) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt8xdouble:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntpd %ymm0, (%rdi)
; AVX-NEXT: vmovntpd %ymm1, 32(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt8xdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntpd %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -834,7 +835,7 @@ entry:
define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) {
; SSE-LABEL: test_nt64xi8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm2, 32(%rdi)
@@ -842,21 +843,28 @@ define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt64xi8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
+; AVX512VL-LABEL: test_nt64xi8:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; AVX512F-LABEL: test_nt64xi8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_nt64xi8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -867,7 +875,7 @@ entry:
define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) {
; SSE-LABEL: test_nt32xi16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm2, 32(%rdi)
@@ -875,21 +883,28 @@ define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt32xi16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
+; AVX512VL-LABEL: test_nt32xi16:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovntdq %ymm0, (%rdi)
+; AVX512VL-NEXT: vmovntdq %ymm1, 32(%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; AVX512F-LABEL: test_nt32xi16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovntdq %ymm0, (%rdi)
; AVX512F-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_nt32xi16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovntdq %zmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -900,7 +915,7 @@ entry:
define void @test_nt16xi32(<16 x i32>* nocapture %ptr, <16 x i32> %X) {
; SSE-LABEL: test_nt16xi32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm2, 32(%rdi)
@@ -908,14 +923,14 @@ define void @test_nt16xi32(<16 x i32>* nocapture %ptr, <16 x i32> %X) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt16xi32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt16xi32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -926,7 +941,7 @@ entry:
define void @test_nt8xi64(<8 x i64>* nocapture %ptr, <8 x i64> %X) {
; SSE-LABEL: test_nt8xi64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
; SSE-NEXT: movntdq %xmm2, 32(%rdi)
@@ -934,14 +949,14 @@ define void @test_nt8xi64(<8 x i64>* nocapture %ptr, <8 x i64> %X) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_nt8xi64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovntdq %ymm0, (%rdi)
; AVX-NEXT: vmovntdq %ymm1, 32(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: test_nt8xi64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdq %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -956,7 +971,7 @@ entry:
define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt16xfloat:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -964,7 +979,7 @@ define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt16xfloat:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: movaps 32(%rdi), %xmm2
@@ -972,7 +987,7 @@ define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt16xfloat:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -980,27 +995,27 @@ define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt16xfloat:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt16xfloat:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt16xfloat:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
@@ -1010,7 +1025,7 @@ entry:
define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt8xdouble:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movapd (%rdi), %xmm0
; SSE2-NEXT: movapd 16(%rdi), %xmm1
; SSE2-NEXT: movapd 32(%rdi), %xmm2
@@ -1018,7 +1033,7 @@ define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt8xdouble:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movapd (%rdi), %xmm0
; SSE4A-NEXT: movapd 16(%rdi), %xmm1
; SSE4A-NEXT: movapd 32(%rdi), %xmm2
@@ -1026,7 +1041,7 @@ define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt8xdouble:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -1034,27 +1049,27 @@ define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt8xdouble:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xdouble:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt8xdouble:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
@@ -1064,7 +1079,7 @@ entry:
define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt64xi8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -1072,7 +1087,7 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt64xi8:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: movaps 32(%rdi), %xmm2
@@ -1080,7 +1095,7 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt64xi8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -1088,33 +1103,39 @@ define <64 x i8> @test_load_nt64xi8(<64 x i8>* nocapture %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt64xi8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt64xi8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: test_load_nt64xi8:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+;
; AVX512F-LABEL: test_load_nt64xi8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_load_nt64xi8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -1124,7 +1145,7 @@ entry:
define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt32xi16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -1132,7 +1153,7 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt32xi16:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: movaps 32(%rdi), %xmm2
@@ -1140,7 +1161,7 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt32xi16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -1148,33 +1169,39 @@ define <32 x i16> @test_load_nt32xi16(<32 x i16>* nocapture %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt32xi16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt32xi16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
+; AVX512VL-LABEL: test_load_nt32xi16:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: retq
+;
; AVX512F-LABEL: test_load_nt32xi16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_load_nt32xi16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -1184,7 +1211,7 @@ entry:
define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt16xi32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -1192,7 +1219,7 @@ define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt16xi32:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: movaps 32(%rdi), %xmm2
@@ -1200,7 +1227,7 @@ define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt16xi32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -1208,27 +1235,27 @@ define <16 x i32> @test_load_nt16xi32(<16 x i32>* nocapture %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt16xi32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt16xi32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt16xi32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
@@ -1238,7 +1265,7 @@ entry:
define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
; SSE2-LABEL: test_load_nt8xi64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -1246,7 +1273,7 @@ define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_load_nt8xi64:
-; SSE4A: # BB#0: # %entry
+; SSE4A: # %bb.0: # %entry
; SSE4A-NEXT: movaps (%rdi), %xmm0
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
; SSE4A-NEXT: movaps 32(%rdi), %xmm2
@@ -1254,7 +1281,7 @@ define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_load_nt8xi64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -1262,27 +1289,27 @@ define <8 x i64> @test_load_nt8xi64(<8 x i64>* nocapture %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_load_nt8xi64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
-; AVX1-NEXT: # implicit-def: %YMM1
+; AVX1-NEXT: # implicit-def: %ymm1
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_load_nt8xi64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_load_nt8xi64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/fast-isel-noplt-pic.ll b/test/CodeGen/X86/fast-isel-noplt-pic.ll
new file mode 100644
index 000000000000..575ed365d656
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-noplt-pic.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple x86_64-unknown-linux-gnu -O0 -fast-isel=true -relocation-model=pic -filetype asm -o - %s | FileCheck %s
+
+declare void @f() local_unnamed_addr #0
+
+define void @g() local_unnamed_addr {
+entry:
+ call void @f()
+ ret void
+}
+
+attributes #0 = { nonlazybind }
+
+; CHECK-LABEL: g:
+; CHECK: callq *f@GOTPCREL(%rip)
+; CHECK: retq
+
diff --git a/test/CodeGen/X86/fast-isel-select-cmov.ll b/test/CodeGen/X86/fast-isel-select-cmov.ll
index e40e917e11e9..3e9b99f4c539 100644
--- a/test/CodeGen/X86/fast-isel-select-cmov.ll
+++ b/test/CodeGen/X86/fast-isel-select-cmov.ll
@@ -7,7 +7,7 @@
define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroext %b) {
; CHECK-LABEL: select_cmov_i16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovew %dx, %si
; CHECK-NEXT: movzwl %si, %eax
@@ -18,7 +18,7 @@ define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroex
define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
; CHECK-LABEL: select_cmp_cmov_i16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpw %si, %di
; CHECK-NEXT: cmovbw %di, %si
; CHECK-NEXT: movzwl %si, %eax
@@ -30,7 +30,7 @@ define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
; CHECK-LABEL: select_cmov_i32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovel %edx, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -41,7 +41,7 @@ define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
; CHECK-LABEL: select_cmp_cmov_i32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: cmovbl %edi, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -53,7 +53,7 @@ define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
; CHECK-LABEL: select_cmov_i64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmoveq %rdx, %rsi
; CHECK-NEXT: movq %rsi, %rax
@@ -64,7 +64,7 @@ define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
define i64 @select_cmp_cmov_i64(i64 %a, i64 %b) {
; CHECK-LABEL: select_cmp_cmov_i64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpq %rsi, %rdi
; CHECK-NEXT: cmovbq %rdi, %rsi
; CHECK-NEXT: movq %rsi, %rax
diff --git a/test/CodeGen/X86/fast-isel-select-cmov2.ll b/test/CodeGen/X86/fast-isel-select-cmov2.ll
index 8556ff21021a..3dd4d2b3433d 100644
--- a/test/CodeGen/X86/fast-isel-select-cmov2.ll
+++ b/test/CodeGen/X86/fast-isel-select-cmov2.ll
@@ -1,188 +1,356 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK --check-prefix=NOAVX --check-prefix=SDAG
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK --check-prefix=NOAVX --check-prefix=FAST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 -mattr=avx | FileCheck %s --check-prefix=CHECK --check-prefix=FAST_AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 -mattr=avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=FAST_AVX
; Test all the cmp predicates that can feed an integer conditional move.
define i64 @select_fcmp_false_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_false_cmov
-; CHECK: movq %rsi, %rax
-; CHECK-NEXT: retq
+; CHECK-LABEL: select_fcmp_false_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
%1 = fcmp false double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_oeq_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_oeq_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; SDAG-NEXT: cmovneq %rsi, %rdi
-; SDAG-NEXT: cmovpq %rsi, %rdi
-; SDAG-NEXT: movq %rdi, %rax
-; FAST-NEXT: setnp %al
-; FAST-NEXT: sete %cl
-; FAST-NEXT: testb %al, %cl
-; FAST-NEXT: cmoveq %rsi, %rdi
+; SDAG-LABEL: select_fcmp_oeq_cmov:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: ucomisd %xmm1, %xmm0
+; SDAG-NEXT: cmovneq %rsi, %rdi
+; SDAG-NEXT: cmovpq %rsi, %rdi
+; SDAG-NEXT: movq %rdi, %rax
+; SDAG-NEXT: retq
+;
+; FAST-LABEL: select_fcmp_oeq_cmov:
+; FAST: ## %bb.0:
+; FAST-NEXT: ucomisd %xmm1, %xmm0
+; FAST-NEXT: setnp %al
+; FAST-NEXT: sete %cl
+; FAST-NEXT: testb %al, %cl
+; FAST-NEXT: cmoveq %rsi, %rdi
+; FAST-NEXT: movq %rdi, %rax
+; FAST-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_oeq_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: setnp %al
+; FAST_AVX-NEXT: sete %cl
+; FAST_AVX-NEXT: testb %al, %cl
+; FAST_AVX-NEXT: cmoveq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp oeq double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ogt_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ogt_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovbeq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ogt_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovbeq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ogt_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovbeq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ogt double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_oge_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_oge_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovbq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_oge_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovbq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_oge_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovbq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp oge double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_olt_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_olt_cmov
-; CHECK: ucomisd %xmm0, %xmm1
-; CHECK-NEXT: cmovbeq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_olt_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm0, %xmm1
+; NOAVX-NEXT: cmovbeq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_olt_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT: cmovbeq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp olt double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ole_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ole_cmov
-; CHECK: ucomisd %xmm0, %xmm1
-; CHECK-NEXT: cmovbq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ole_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm0, %xmm1
+; NOAVX-NEXT: cmovbq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ole_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT: cmovbq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ole double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_one_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_one_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmoveq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_one_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmoveq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_one_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmoveq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp one double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ord_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ord_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovpq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ord_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovpq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ord_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovpq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ord double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_uno_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_uno_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovnpq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_uno_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovnpq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_uno_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovnpq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp uno double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ueq_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ueq_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovneq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ueq_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovneq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ueq_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovneq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ueq double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ugt_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ugt_cmov
-; CHECK: ucomisd %xmm0, %xmm1
-; CHECK-NEXT: cmovaeq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ugt_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm0, %xmm1
+; NOAVX-NEXT: cmovaeq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ugt_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT: cmovaeq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ugt double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_uge_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_uge_cmov
-; CHECK: ucomisd %xmm0, %xmm1
-; CHECK-NEXT: cmovaq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_uge_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm0, %xmm1
+; NOAVX-NEXT: cmovaq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_uge_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm0, %xmm1
+; FAST_AVX-NEXT: cmovaq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp uge double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ult_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ult_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovaeq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ult_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovaeq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ult_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovaeq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ult double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_ule_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_ule_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: cmovaq %rsi, %rdi
+; NOAVX-LABEL: select_fcmp_ule_cmov:
+; NOAVX: ## %bb.0:
+; NOAVX-NEXT: ucomisd %xmm1, %xmm0
+; NOAVX-NEXT: cmovaq %rsi, %rdi
+; NOAVX-NEXT: movq %rdi, %rax
+; NOAVX-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_ule_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: cmovaq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp ule double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_une_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_une_cmov
-; CHECK: ucomisd %xmm1, %xmm0
-; SDAG-NEXT: cmovneq %rdi, %rsi
-; SDAG-NEXT: cmovpq %rdi, %rsi
-; SDAG-NEXT: movq %rsi, %rax
-; FAST-NEXT: setp %al
-; FAST-NEXT: setne %cl
-; FAST-NEXT: orb %al, %cl
-; FAST-NEXT: cmoveq %rsi, %rdi
+; SDAG-LABEL: select_fcmp_une_cmov:
+; SDAG: ## %bb.0:
+; SDAG-NEXT: ucomisd %xmm1, %xmm0
+; SDAG-NEXT: cmovneq %rdi, %rsi
+; SDAG-NEXT: cmovpq %rdi, %rsi
+; SDAG-NEXT: movq %rsi, %rax
+; SDAG-NEXT: retq
+;
+; FAST-LABEL: select_fcmp_une_cmov:
+; FAST: ## %bb.0:
+; FAST-NEXT: ucomisd %xmm1, %xmm0
+; FAST-NEXT: setp %al
+; FAST-NEXT: setne %cl
+; FAST-NEXT: orb %al, %cl
+; FAST-NEXT: cmoveq %rsi, %rdi
+; FAST-NEXT: movq %rdi, %rax
+; FAST-NEXT: retq
+;
+; FAST_AVX-LABEL: select_fcmp_une_cmov:
+; FAST_AVX: ## %bb.0:
+; FAST_AVX-NEXT: vucomisd %xmm1, %xmm0
+; FAST_AVX-NEXT: setp %al
+; FAST_AVX-NEXT: setne %cl
+; FAST_AVX-NEXT: orb %al, %cl
+; FAST_AVX-NEXT: cmoveq %rsi, %rdi
+; FAST_AVX-NEXT: movq %rdi, %rax
+; FAST_AVX-NEXT: retq
%1 = fcmp une double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_fcmp_true_cmov(double %a, double %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_fcmp_true_cmov
-; CHECK: movq %rdi, %rax
+; CHECK-LABEL: select_fcmp_true_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%1 = fcmp true double %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_eq_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_eq_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovneq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_eq_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovneq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp eq i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_ne_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_ne_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmoveq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_ne_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmoveq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp ne i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_ugt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_ugt_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovbeq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_ugt_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbeq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp ugt i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
@@ -190,70 +358,84 @@ define i64 @select_icmp_ugt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
define i64 @select_icmp_uge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_uge_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovbq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_uge_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovbq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp uge i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_ult_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_ult_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovaeq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_ult_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovaeq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp ult i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_ule_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_ule_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovaq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_ule_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovaq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp ule i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_sgt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_sgt_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovleq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_sgt_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovleq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp sgt i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_sge_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_sge_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovlq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_sge_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovlq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp sge i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_slt_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_slt_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovgeq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_slt_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovgeq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp slt i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
}
define i64 @select_icmp_sle_cmov(i64 %a, i64 %b, i64 %c, i64 %d) {
-; CHECK-LABEL: select_icmp_sle_cmov
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: cmovgq %rcx, %rdx
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-LABEL: select_icmp_sle_cmov:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: cmovgq %rcx, %rdx
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: retq
%1 = icmp sle i64 %a, %b
%2 = select i1 %1, i64 %c, i64 %d
ret i64 %2
diff --git a/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll b/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
index 8147035b4385..3ab040758fa0 100644
--- a/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
+++ b/test/CodeGen/X86/fast-isel-select-pseudo-cmov.ll
@@ -1,138 +1,293 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7-avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 -mcpu=corei7-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) {
-; CHECK-LABEL: select_fcmp_one_f32
-; CHECK: ucomiss %xmm1, %xmm0
-; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: movaps %xmm2, %xmm0
+; SSE-LABEL: select_fcmp_one_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: ucomiss %xmm1, %xmm0
+; SSE-NEXT: jne LBB0_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: LBB0_2:
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_fcmp_one_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; AVX-NEXT: retq
%1 = fcmp one float %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define double @select_fcmp_one_f64(double %a, double %b, double %c, double %d) {
-; CHECK-LABEL: select_fcmp_one_f64
-; CHECK: ucomisd %xmm1, %xmm0
-; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: movaps %xmm2, %xmm0
+; SSE-LABEL: select_fcmp_one_f64:
+; SSE: ## %bb.0:
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: jne LBB1_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm3, %xmm2
+; SSE-NEXT: LBB1_2:
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_fcmp_one_f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vcmpneq_oqsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
+; AVX-NEXT: retq
%1 = fcmp one double %a, %b
%2 = select i1 %1, double %c, double %d
ret double %2
}
define float @select_icmp_eq_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_eq_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: je [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_eq_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: je LBB2_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB2_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_eq_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: je LBB2_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB2_2:
+; AVX-NEXT: retq
%1 = icmp eq i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_ne_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_ne_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jne [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_ne_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jne LBB3_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB3_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_ne_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jne LBB3_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB3_2:
+; AVX-NEXT: retq
%1 = icmp ne i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_ugt_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_ugt_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: ja [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_ugt_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: ja LBB4_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB4_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_ugt_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: ja LBB4_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB4_2:
+; AVX-NEXT: retq
%1 = icmp ugt i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_uge_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_uge_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jae [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_uge_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jae LBB5_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB5_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_uge_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jae LBB5_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB5_2:
+; AVX-NEXT: retq
%1 = icmp uge i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_ult_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_ult_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jb [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_ult_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jb LBB6_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB6_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_ult_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jb LBB6_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB6_2:
+; AVX-NEXT: retq
%1 = icmp ult i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_ule_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_ule_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jbe [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_ule_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jbe LBB7_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB7_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_ule_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jbe LBB7_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB7_2:
+; AVX-NEXT: retq
%1 = icmp ule i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_sgt_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_sgt_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jg [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_sgt_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jg LBB8_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB8_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_sgt_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jg LBB8_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB8_2:
+; AVX-NEXT: retq
%1 = icmp sgt i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_sge_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_sge_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jge [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_sge_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jge LBB9_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB9_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_sge_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jge LBB9_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB9_2:
+; AVX-NEXT: retq
%1 = icmp sge i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_slt_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_slt_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jl [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_slt_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jl LBB10_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB10_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_slt_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jl LBB10_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB10_2:
+; AVX-NEXT: retq
%1 = icmp slt i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
define float @select_icmp_sle_f32(i64 %a, i64 %b, float %c, float %d) {
-; CHECK-LABEL: select_icmp_sle_f32
-; CHECK: cmpq %rsi, %rdi
-; CHECK-NEXT: jle [[BB:LBB[0-9]+_2]]
-; CHECK: [[BB]]
-; CHECK-NEXT: retq
+; SSE-LABEL: select_icmp_sle_f32:
+; SSE: ## %bb.0:
+; SSE-NEXT: cmpq %rsi, %rdi
+; SSE-NEXT: jle LBB11_2
+; SSE-NEXT: ## %bb.1:
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: LBB11_2:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: select_icmp_sle_f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: cmpq %rsi, %rdi
+; AVX-NEXT: jle LBB11_2
+; AVX-NEXT: ## %bb.1:
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: LBB11_2:
+; AVX-NEXT: retq
%1 = icmp sle i64 %a, %b
%2 = select i1 %1, float %c, float %d
ret float %2
}
+define i8 @select_icmp_sle_i8(i64 %a, i64 %b, i8 %c, i8 %d) {
+; CHECK-LABEL: select_icmp_sle_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: jle LBB12_2
+; CHECK-NEXT: ## %bb.1:
+; CHECK-NEXT: movl %ecx, %edx
+; CHECK-NEXT: LBB12_2:
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+ %1 = icmp sle i64 %a, %b
+ %2 = select i1 %1, i8 %c, i8 %d
+ ret i8 %2
+}
diff --git a/test/CodeGen/X86/fast-isel-select-sse.ll b/test/CodeGen/X86/fast-isel-select-sse.ll
index 1b6bb36b77c8..e91b925a38e6 100644
--- a/test/CodeGen/X86/fast-isel-select-sse.ll
+++ b/test/CodeGen/X86/fast-isel-select-sse.ll
@@ -10,7 +10,7 @@
define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_oeq_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpeqss %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -18,13 +18,13 @@ define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_oeq_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_oeq_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpeqss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -36,7 +36,7 @@ define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_oeq_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpeqsd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -44,13 +44,13 @@ define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_oeq_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_oeq_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -62,7 +62,7 @@ define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_ogt_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltss %xmm0, %xmm1
; SSE-NEXT: andps %xmm1, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm1
@@ -71,13 +71,13 @@ define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ogt_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ogt_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltss %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -89,7 +89,7 @@ define float @select_fcmp_ogt_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_ogt_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltsd %xmm0, %xmm1
; SSE-NEXT: andpd %xmm1, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm1
@@ -98,13 +98,13 @@ define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ogt_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ogt_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltsd %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -116,7 +116,7 @@ define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_oge_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpless %xmm0, %xmm1
; SSE-NEXT: andps %xmm1, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm1
@@ -125,13 +125,13 @@ define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_oge_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_oge_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpless %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -143,7 +143,7 @@ define float @select_fcmp_oge_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_oge_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmplesd %xmm0, %xmm1
; SSE-NEXT: andpd %xmm1, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm1
@@ -152,13 +152,13 @@ define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_oge_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_oge_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmplesd %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -170,7 +170,7 @@ define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_olt_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltss %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -178,13 +178,13 @@ define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_olt_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_olt_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -196,7 +196,7 @@ define float @select_fcmp_olt_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_olt_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltsd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -204,13 +204,13 @@ define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_olt_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_olt_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -222,7 +222,7 @@ define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_ole_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpless %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -230,13 +230,13 @@ define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ole_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpless %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ole_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpless %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -248,7 +248,7 @@ define float @select_fcmp_ole_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_ole_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmplesd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -256,13 +256,13 @@ define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ole_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmplesd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ole_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmplesd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -274,7 +274,7 @@ define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_ord_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpordss %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -282,13 +282,13 @@ define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ord_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpordss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ord_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpordss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -300,7 +300,7 @@ define float @select_fcmp_ord_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_ord_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpordsd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -308,13 +308,13 @@ define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ord_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ord_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpordsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -326,7 +326,7 @@ define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_uno_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpunordss %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -334,13 +334,13 @@ define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_uno_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpunordss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_uno_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpunordss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -352,7 +352,7 @@ define float @select_fcmp_uno_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_uno_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpunordsd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -360,13 +360,13 @@ define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_uno_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpunordsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_uno_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpunordsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -378,7 +378,7 @@ define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_ugt_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnless %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -386,13 +386,13 @@ define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ugt_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnless %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ugt_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnless %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -404,7 +404,7 @@ define float @select_fcmp_ugt_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_ugt_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnlesd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -412,13 +412,13 @@ define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ugt_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnlesd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ugt_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnlesd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -430,7 +430,7 @@ define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_uge_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnltss %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -438,13 +438,13 @@ define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_uge_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnltss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_uge_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnltss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -456,7 +456,7 @@ define float @select_fcmp_uge_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_uge_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnltsd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -464,13 +464,13 @@ define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_uge_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnltsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_uge_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnltsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -482,7 +482,7 @@ define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_ult_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnless %xmm0, %xmm1
; SSE-NEXT: andps %xmm1, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm1
@@ -491,13 +491,13 @@ define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ult_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnless %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ult_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnless %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -509,7 +509,7 @@ define float @select_fcmp_ult_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_ult_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnlesd %xmm0, %xmm1
; SSE-NEXT: andpd %xmm1, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm1
@@ -518,13 +518,13 @@ define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ult_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnlesd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ult_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnlesd %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -536,7 +536,7 @@ define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_ule_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_ule_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnltss %xmm0, %xmm1
; SSE-NEXT: andps %xmm1, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm1
@@ -545,13 +545,13 @@ define float @select_fcmp_ule_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ule_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnltss %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ule_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnltss %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -563,7 +563,7 @@ define float @select_fcmp_ule_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_ule_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpnltsd %xmm0, %xmm1
; SSE-NEXT: andpd %xmm1, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm1
@@ -572,13 +572,13 @@ define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_ule_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_ule_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpnltsd %xmm0, %xmm1, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
@@ -590,7 +590,7 @@ define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) {
; SSE-LABEL: select_fcmp_une_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpneqss %xmm1, %xmm0
; SSE-NEXT: andps %xmm0, %xmm2
; SSE-NEXT: andnps %xmm3, %xmm0
@@ -598,13 +598,13 @@ define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_une_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpneqss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_une_f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpneqss %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovaps %xmm3, %xmm0
@@ -616,7 +616,7 @@ define float @select_fcmp_une_f32(float %a, float %b, float %c, float %d) {
define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) {
; SSE-LABEL: select_fcmp_une_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpneqsd %xmm1, %xmm0
; SSE-NEXT: andpd %xmm0, %xmm2
; SSE-NEXT: andnpd %xmm3, %xmm0
@@ -624,13 +624,13 @@ define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_fcmp_une_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpneqsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: select_fcmp_une_f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpneqsd %xmm1, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm3 {%k1}
; AVX512-NEXT: vmovapd %xmm3, %xmm0
diff --git a/test/CodeGen/X86/fast-isel-sext-zext.ll b/test/CodeGen/X86/fast-isel-sext-zext.ll
new file mode 100644
index 000000000000..92344a5c1a30
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-sext-zext.ll
@@ -0,0 +1,378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=X64
+
+define i8 @test1(i8 %x) nounwind {
+; X32-LABEL: test1:
+; X32: ## %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $1, %al
+; X32-NEXT: negb %al
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test1:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negb %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i8 %x to i1
+ %u = sext i1 %z to i8
+ ret i8 %u
+}
+
+define i16 @test2(i16 %x) nounwind {
+; X32-LABEL: test2:
+; X32: ## %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: negb %al
+; X32-NEXT: movsbl %al, %eax
+; X32-NEXT: ## kill: def %ax killed %ax killed %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test2:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negb %dil
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i16 %x to i1
+ %u = sext i1 %z to i16
+ ret i16 %u
+}
+
+define i32 @test3(i32 %x) nounwind {
+; X32-LABEL: test3:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: negb %al
+; X32-NEXT: movsbl %al, %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test3:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negb %dil
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i32 %x to i1
+ %u = sext i1 %z to i32
+ ret i32 %u
+}
+
+define i32 @test4(i32 %x) nounwind {
+; X32-LABEL: test4:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: negb %al
+; X32-NEXT: movsbl %al, %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test4:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negb %dil
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i32 %x to i1
+ %u = sext i1 %z to i32
+ ret i32 %u
+}
+
+define i8 @test5(i8 %x) nounwind {
+; X32-LABEL: test5:
+; X32: ## %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: andb $1, %al
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test5:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i8 %x to i1
+ %u = zext i1 %z to i8
+ ret i8 %u
+}
+
+define i16 @test6(i16 %x) nounwind {
+; X32-LABEL: test6:
+; X32: ## %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: ## kill: def %ax killed %ax killed %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test6:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i16 %x to i1
+ %u = zext i1 %z to i16
+ ret i16 %u
+}
+
+define i32 @test7(i32 %x) nounwind {
+; X32-LABEL: test7:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test7:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i32 %x to i1
+ %u = zext i1 %z to i32
+ ret i32 %u
+}
+
+define i32 @test8(i32 %x) nounwind {
+; X32-LABEL: test8:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andb $1, %al
+; X32-NEXT: movzbl %al, %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test8:
+; X64: ## %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %z = trunc i32 %x to i1
+ %u = zext i1 %z to i32
+ ret i32 %u
+}
+
+define i16 @test9(i8 %x) nounwind {
+; X32-LABEL: test9:
+; X32: ## %bb.0:
+; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: ## kill: def %ax killed %ax killed %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test9:
+; X64: ## %bb.0:
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = sext i8 %x to i16
+ ret i16 %u
+}
+
+define i32 @test10(i8 %x) nounwind {
+; X32-LABEL: test10:
+; X32: ## %bb.0:
+; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test10:
+; X64: ## %bb.0:
+; X64-NEXT: movsbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = sext i8 %x to i32
+ ret i32 %u
+}
+
+define i64 @test11(i8 %x) nounwind {
+; X32-LABEL: test11:
+; X32: ## %bb.0:
+; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: sarl $31, %edx
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test11:
+; X64: ## %bb.0:
+; X64-NEXT: movsbq %dil, %rax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = sext i8 %x to i64
+ ret i64 %u
+}
+
+define i16 @test12(i8 %x) nounwind {
+; X32-LABEL: test12:
+; X32: ## %bb.0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: ## kill: def %ax killed %ax killed %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test12:
+; X64: ## %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = zext i8 %x to i16
+ ret i16 %u
+}
+
+define i32 @test13(i8 %x) nounwind {
+; X32-LABEL: test13:
+; X32: ## %bb.0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test13:
+; X64: ## %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = zext i8 %x to i32
+ ret i32 %u
+}
+
+define i64 @test14(i8 %x) nounwind {
+; X32-LABEL: test14:
+; X32: ## %bb.0:
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test14:
+; X64: ## %bb.0:
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = zext i8 %x to i64
+ ret i64 %u
+}
+
+define i32 @test15(i16 %x) nounwind {
+; X32-LABEL: test15:
+; X32: ## %bb.0:
+; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test15:
+; X64: ## %bb.0:
+; X64-NEXT: movswl %di, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = sext i16 %x to i32
+ ret i32 %u
+}
+
+define i64 @test16(i16 %x) nounwind {
+; X32-LABEL: test16:
+; X32: ## %bb.0:
+; X32-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: sarl $31, %edx
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test16:
+; X64: ## %bb.0:
+; X64-NEXT: movswq %di, %rax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = sext i16 %x to i64
+ ret i64 %u
+}
+
+define i32 @test17(i16 %x) nounwind {
+; X32-LABEL: test17:
+; X32: ## %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test17:
+; X64: ## %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = zext i16 %x to i32
+ ret i32 %u
+}
+
+define i64 @test18(i16 %x) nounwind {
+; X32-LABEL: test18:
+; X32: ## %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test18:
+; X64: ## %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = zext i16 %x to i64
+ ret i64 %u
+}
+
+define i64 @test19(i32 %x) nounwind {
+; X32-LABEL: test19:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: sarl $31, %edx
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test19:
+; X64: ## %bb.0:
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = sext i32 %x to i64
+ ret i64 %u
+}
+
+define i64 @test20(i32 %x) nounwind {
+; X32-LABEL: test20:
+; X32: ## %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+; X32-NEXT: ## -- End function
+;
+; X64-LABEL: test20:
+; X64: ## %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+; X64-NEXT: ## -- End function
+ %u = zext i32 %x to i64
+ ret i64 %u
+}
diff --git a/test/CodeGen/X86/fast-isel-shift.ll b/test/CodeGen/X86/fast-isel-shift.ll
new file mode 100644
index 000000000000..5d416e18260c
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-shift.ll
@@ -0,0 +1,383 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s
+
+define i8 @shl_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: shl_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shlb %cl, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i8 %a, %b
+ ret i8 %c
+}
+
+define i16 @shl_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: shl_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: ## kill: def %cl killed %cx
+; CHECK-NEXT: shlw %cl, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i16 %a, %b
+ ret i16 %c
+}
+
+define i32 @shl_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: shl_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: ## kill: def %cl killed %ecx
+; CHECK-NEXT: shll %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i32 %a, %b
+ ret i32 %c
+}
+
+define i64 @shl_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: shl_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: ## kill: def %cl killed %rcx
+; CHECK-NEXT: shlq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = shl i64 %a, %b
+ ret i64 %c
+}
+
+define i8 @lshr_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: lshr_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: shrb %cl, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i8 %a, %b
+ ret i8 %c
+}
+
+define i16 @lshr_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: lshr_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: ## kill: def %cl killed %cx
+; CHECK-NEXT: shrw %cl, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i16 %a, %b
+ ret i16 %c
+}
+
+define i32 @lshr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: lshr_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: ## kill: def %cl killed %ecx
+; CHECK-NEXT: shrl %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i32 %a, %b
+ ret i32 %c
+}
+
+define i64 @lshr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: lshr_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: ## kill: def %cl killed %rcx
+; CHECK-NEXT: shrq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = lshr i64 %a, %b
+ ret i64 %c
+}
+
+define i8 @ashr_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: ashr_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: sarb %cl, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i8 %a, %b
+ ret i8 %c
+}
+
+define i16 @ashr_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: ashr_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: ## kill: def %cl killed %cx
+; CHECK-NEXT: sarw %cl, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i16 %a, %b
+ ret i16 %c
+}
+
+define i32 @ashr_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ashr_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: ## kill: def %cl killed %ecx
+; CHECK-NEXT: sarl %cl, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i32 %a, %b
+ ret i32 %c
+}
+
+define i64 @ashr_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ashr_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: movq %rsi, %rcx
+; CHECK-NEXT: ## kill: def %cl killed %rcx
+; CHECK-NEXT: sarq %cl, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = ashr i64 %a, %b
+ ret i64 %c
+}
+
+define i8 @shl_imm1_i8(i8 %a) {
+; CHECK-LABEL: shl_imm1_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shlb $1, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i8 %a, 1
+ ret i8 %c
+}
+
+define i16 @shl_imm1_i16(i16 %a) {
+; CHECK-LABEL: shl_imm1_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal (,%rdi,2), %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %c = shl i16 %a, 1
+ ret i16 %c
+}
+
+define i32 @shl_imm1_i32(i32 %a) {
+; CHECK-LABEL: shl_imm1_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal (,%rdi,2), %eax
+; CHECK-NEXT: retq
+ %c = shl i32 %a, 1
+ ret i32 %c
+}
+
+define i64 @shl_imm1_i64(i64 %a) {
+; CHECK-LABEL: shl_imm1_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: leaq (,%rdi,2), %rax
+; CHECK-NEXT: retq
+ %c = shl i64 %a, 1
+ ret i64 %c
+}
+
+define i8 @lshr_imm1_i8(i8 %a) {
+; CHECK-LABEL: lshr_imm1_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrb $1, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i8 %a, 1
+ ret i8 %c
+}
+
+define i16 @lshr_imm1_i16(i16 %a) {
+; CHECK-LABEL: lshr_imm1_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrw $1, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i16 %a, 1
+ ret i16 %c
+}
+
+define i32 @lshr_imm1_i32(i32 %a) {
+; CHECK-LABEL: lshr_imm1_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrl $1, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i32 %a, 1
+ ret i32 %c
+}
+
+define i64 @lshr_imm1_i64(i64 %a) {
+; CHECK-LABEL: lshr_imm1_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrq $1, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = lshr i64 %a, 1
+ ret i64 %c
+}
+
+define i8 @ashr_imm1_i8(i8 %a) {
+; CHECK-LABEL: ashr_imm1_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarb $1, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i8 %a, 1
+ ret i8 %c
+}
+
+define i16 @ashr_imm1_i16(i16 %a) {
+; CHECK-LABEL: ashr_imm1_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarw $1, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i16 %a, 1
+ ret i16 %c
+}
+
+define i32 @ashr_imm1_i32(i32 %a) {
+; CHECK-LABEL: ashr_imm1_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarl $1, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i32 %a, 1
+ ret i32 %c
+}
+
+define i64 @ashr_imm1_i64(i64 %a) {
+; CHECK-LABEL: ashr_imm1_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarq $1, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = ashr i64 %a, 1
+ ret i64 %c
+}
+
+define i8 @shl_imm4_i8(i8 %a) {
+; CHECK-LABEL: shl_imm4_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shlb $4, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i8 %a, 4
+ ret i8 %c
+}
+
+define i16 @shl_imm4_i16(i16 %a) {
+; CHECK-LABEL: shl_imm4_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shlw $4, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i16 %a, 4
+ ret i16 %c
+}
+
+define i32 @shl_imm4_i32(i32 %a) {
+; CHECK-LABEL: shl_imm4_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shll $4, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = shl i32 %a, 4
+ ret i32 %c
+}
+
+define i64 @shl_imm4_i64(i64 %a) {
+; CHECK-LABEL: shl_imm4_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shlq $4, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = shl i64 %a, 4
+ ret i64 %c
+}
+
+define i8 @lshr_imm4_i8(i8 %a) {
+; CHECK-LABEL: lshr_imm4_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrb $4, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i8 %a, 4
+ ret i8 %c
+}
+
+define i16 @lshr_imm4_i16(i16 %a) {
+; CHECK-LABEL: lshr_imm4_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrw $4, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i16 %a, 4
+ ret i16 %c
+}
+
+define i32 @lshr_imm4_i32(i32 %a) {
+; CHECK-LABEL: lshr_imm4_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrl $4, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = lshr i32 %a, 4
+ ret i32 %c
+}
+
+define i64 @lshr_imm4_i64(i64 %a) {
+; CHECK-LABEL: lshr_imm4_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: shrq $4, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = lshr i64 %a, 4
+ ret i64 %c
+}
+
+define i8 @ashr_imm4_i8(i8 %a) {
+; CHECK-LABEL: ashr_imm4_i8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarb $4, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i8 %a, 4
+ ret i8 %c
+}
+
+define i16 @ashr_imm4_i16(i16 %a) {
+; CHECK-LABEL: ashr_imm4_i16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarw $4, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i16 %a, 4
+ ret i16 %c
+}
+
+define i32 @ashr_imm4_i32(i32 %a) {
+; CHECK-LABEL: ashr_imm4_i32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarl $4, %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %c = ashr i32 %a, 4
+ ret i32 %c
+}
+
+define i64 @ashr_imm4_i64(i64 %a) {
+; CHECK-LABEL: ashr_imm4_i64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sarq $4, %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+ %c = ashr i64 %a, 4
+ ret i64 %c
+}
diff --git a/test/CodeGen/X86/fast-isel-store.ll b/test/CodeGen/X86/fast-isel-store.ll
index 528682bf70b5..6468186d4cab 100644
--- a/test/CodeGen/X86/fast-isel-store.ll
+++ b/test/CodeGen/X86/fast-isel-store.ll
@@ -10,13 +10,13 @@
define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
-; ALL32: # BB#0: # %entry
+; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movl %esi, (%rdi)
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_32:
-; ALL64: # BB#0: # %entry
+; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movl %eax, (%ecx)
@@ -28,13 +28,13 @@ entry:
define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
-; ALL32: # BB#0: # %entry
+; ALL32: # %bb.0: # %entry
; ALL32-NEXT: movw %si, (%rdi)
; ALL32-NEXT: movl %esi, %eax
; ALL32-NEXT: retq
;
; ALL64-LABEL: test_store_16:
-; ALL64: # BB#0: # %entry
+; ALL64: # %bb.0: # %entry
; ALL64-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT: movw %ax, (%ecx)
@@ -46,39 +46,39 @@ entry:
define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqu %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqu %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_4xi32:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXONLY32-NEXT: vmovdqu %xmm0, (%rdi)
; AVXONLY32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqu %xmm0, (%eax)
; AVX64-NEXT: retl
;
; KNL32-LABEL: test_store_4xi32:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL32-NEXT: vmovdqu %xmm0, (%rdi)
; KNL32-NEXT: retq
;
; SKX32-LABEL: test_store_4xi32:
-; SKX32: # BB#0:
+; SKX32: # %bb.0:
; SKX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX32-NEXT: vmovdqu %xmm0, (%rdi)
; SKX32-NEXT: retq
@@ -89,39 +89,39 @@ define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value,
define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: paddd %xmm1, %xmm0
; SSE32-NEXT: movdqa %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: paddd %xmm1, %xmm0
; SSE64-NEXT: movdqa %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_4xi32_aligned:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXONLY32-NEXT: vmovdqa %xmm0, (%rdi)
; AVXONLY32-NEXT: retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovdqa %xmm0, (%eax)
; AVX64-NEXT: retl
;
; KNL32-LABEL: test_store_4xi32_aligned:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL32-NEXT: vmovdqa %xmm0, (%rdi)
; KNL32-NEXT: retq
;
; SKX32-LABEL: test_store_4xi32_aligned:
-; SKX32: # BB#0:
+; SKX32: # %bb.0:
; SKX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; SKX32-NEXT: vmovdqa %xmm0, (%rdi)
; SKX32-NEXT: retq
@@ -132,23 +132,23 @@ define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32>
define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %xmm0, (%eax)
; AVX64-NEXT: retl
@@ -158,23 +158,23 @@ define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %
define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %xmm0, (%eax)
; AVX64-NEXT: retl
@@ -184,26 +184,26 @@ define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x
define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movupd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovupd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovupd %xmm0, (%eax)
@@ -215,26 +215,26 @@ define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double
define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm1, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd %xmm1, %xmm0
; SSE64-NEXT: movapd %xmm0, (%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT: vmovapd %xmm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT: vmovapd %xmm0, (%eax)
@@ -246,25 +246,25 @@ define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2
define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
@@ -274,25 +274,25 @@ define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value)
define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
@@ -302,25 +302,25 @@ define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32>
define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movups %xmm0, (%eax)
; SSE64-NEXT: movups %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vmovups %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovups %ymm0, (%eax)
; AVX64-NEXT: retl
@@ -330,25 +330,25 @@ define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %
define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: movaps %xmm0, (%eax)
; SSE64-NEXT: movaps %xmm1, 16(%eax)
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vmovaps %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vmovaps %ymm0, (%eax)
; AVX64-NEXT: retl
@@ -358,7 +358,7 @@ define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x
define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movupd %xmm0, (%rdi)
@@ -366,9 +366,8 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi0:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
@@ -379,13 +378,13 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovupd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovupd %ymm0, (%eax)
@@ -397,7 +396,7 @@ define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double
define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm3, %xmm1
; SSE32-NEXT: addpd %xmm2, %xmm0
; SSE32-NEXT: movapd %xmm0, (%rdi)
@@ -405,9 +404,8 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi1:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm1
@@ -418,13 +416,13 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4
; SSE64-NEXT: retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
-; AVX32: # BB#0:
+; AVX32: # %bb.0:
; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT: vmovapd %ymm0, (%rdi)
; AVX32-NEXT: retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
-; AVX64: # BB#0:
+; AVX64: # %bb.0:
; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT: vmovapd %ymm0, (%eax)
@@ -436,7 +434,7 @@ define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4
define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
@@ -444,9 +442,8 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi2:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -458,25 +455,25 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32:
-; AVXONLY64: # BB#0:
+; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32:
-; AVX51232: # BB#0:
+; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32:
-; AVX51264: # BB#0:
+; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
@@ -486,7 +483,7 @@ define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %va
define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
@@ -494,9 +491,8 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi3:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -508,25 +504,25 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
-; AVXONLY64: # BB#0:
+; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
-; AVX51232: # BB#0:
+; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
-; AVX51264: # BB#0:
+; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
@@ -536,7 +532,7 @@ define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x
define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movups %xmm0, (%rdi)
; SSE32-NEXT: movups %xmm1, 16(%rdi)
; SSE32-NEXT: movups %xmm2, 32(%rdi)
@@ -544,9 +540,8 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi4:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -558,25 +553,25 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32:
-; AVXONLY64: # BB#0:
+; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovups %ymm0, (%eax)
; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32:
-; AVX51232: # BB#0:
+; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovups %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32:
-; AVX51264: # BB#0:
+; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovups %zmm0, (%eax)
; AVX51264-NEXT: retl
@@ -586,7 +581,7 @@ define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x floa
define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: movaps %xmm0, (%rdi)
; SSE32-NEXT: movaps %xmm1, 16(%rdi)
; SSE32-NEXT: movaps %xmm2, 32(%rdi)
@@ -594,9 +589,8 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi5:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -608,25 +602,25 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
-; AVXONLY64: # BB#0:
+; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT: vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
-; AVX51232: # BB#0:
+; AVX51232: # %bb.0:
; AVX51232-NEXT: vmovaps %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
-; AVX51264: # BB#0:
+; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vmovaps %zmm0, (%eax)
; AVX51264-NEXT: retl
@@ -636,7 +630,7 @@ define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <1
define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: addpd %xmm5, %xmm1
@@ -648,9 +642,8 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi6:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -666,7 +659,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovupd %ymm0, (%rdi)
@@ -674,14 +667,11 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64:
-; AVXONLY64: # BB#0:
+; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
-; AVXONLY64-NEXT: .Lcfi0:
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
-; AVXONLY64-NEXT: .Lcfi1:
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
-; AVXONLY64-NEXT: .Lcfi2:
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
@@ -695,13 +685,13 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64:
-; AVX51232: # BB#0:
+; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovupd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64:
-; AVX51264: # BB#0:
+; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovupd %zmm0, (%eax)
@@ -713,7 +703,7 @@ define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double
define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
-; SSE32: # BB#0:
+; SSE32: # %bb.0:
; SSE32-NEXT: addpd %xmm7, %xmm3
; SSE32-NEXT: addpd %xmm6, %xmm2
; SSE32-NEXT: addpd %xmm5, %xmm1
@@ -725,9 +715,8 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; SSE32-NEXT: retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
-; SSE64: # BB#0:
+; SSE64: # %bb.0:
; SSE64-NEXT: subl $12, %esp
-; SSE64-NEXT: .Lcfi7:
; SSE64-NEXT: .cfi_def_cfa_offset 16
; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -743,7 +732,7 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; SSE64-NEXT: retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
-; AVXONLY32: # BB#0:
+; AVXONLY32: # %bb.0:
; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT: vmovapd %ymm0, (%rdi)
@@ -751,14 +740,11 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; AVXONLY32-NEXT: retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
-; AVXONLY64: # BB#0:
+; AVXONLY64: # %bb.0:
; AVXONLY64-NEXT: pushl %ebp
-; AVXONLY64-NEXT: .Lcfi3:
; AVXONLY64-NEXT: .cfi_def_cfa_offset 8
-; AVXONLY64-NEXT: .Lcfi4:
; AVXONLY64-NEXT: .cfi_offset %ebp, -8
; AVXONLY64-NEXT: movl %esp, %ebp
-; AVXONLY64-NEXT: .Lcfi5:
; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT: andl $-32, %esp
; AVXONLY64-NEXT: subl $32, %esp
@@ -772,13 +758,13 @@ define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8
; AVXONLY64-NEXT: retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
-; AVX51232: # BB#0:
+; AVX51232: # %bb.0:
; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT: vmovapd %zmm0, (%rdi)
; AVX51232-NEXT: retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
-; AVX51264: # BB#0:
+; AVX51264: # %bb.0:
; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT: vmovapd %zmm0, (%eax)
diff --git a/test/CodeGen/X86/fast-isel-tailcall.ll b/test/CodeGen/X86/fast-isel-tailcall.ll
index 88ad05e8e1a7..6f3e060ceca2 100644
--- a/test/CodeGen/X86/fast-isel-tailcall.ll
+++ b/test/CodeGen/X86/fast-isel-tailcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel -tailcallopt -march=x86 | FileCheck %s
+; RUN: llc < %s -fast-isel -tailcallopt -mtriple=i686-- | FileCheck %s
; CHECK-NOT: add
; PR4154
diff --git a/test/CodeGen/X86/fast-isel-tls.ll b/test/CodeGen/X86/fast-isel-tls.ll
index 0b7a5d9759d2..95cc7842f6f6 100644
--- a/test/CodeGen/X86/fast-isel-tls.ll
+++ b/test/CodeGen/X86/fast-isel-tls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=pic -mtriple=i686-unknown-linux-gnu -fast-isel | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -mtriple=i686-unknown-linux-gnu -fast-isel | FileCheck %s
; PR3654
@v = thread_local global i32 0
diff --git a/test/CodeGen/X86/fast-isel-vecload.ll b/test/CodeGen/X86/fast-isel-vecload.ll
index f7051b8c8e91..31730493fb50 100644
--- a/test/CodeGen/X86/fast-isel-vecload.ll
+++ b/test/CodeGen/X86/fast-isel-vecload.ll
@@ -9,22 +9,22 @@
define <16 x i8> @test_v16i8(<16 x i8>* %V) {
; SSE-LABEL: test_v16i8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i8:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v16i8:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i8:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -34,22 +34,22 @@ entry:
define <8 x i16> @test_v8i16(<8 x i16>* %V) {
; SSE-LABEL: test_v8i16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i16:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v8i16:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i16:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -59,22 +59,22 @@ entry:
define <4 x i32> @test_v4i32(<4 x i32>* %V) {
; SSE-LABEL: test_v4i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v4i32:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v4i32:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v4i32:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -84,22 +84,22 @@ entry:
define <2 x i64> @test_v2i64(<2 x i64>* %V) {
; SSE-LABEL: test_v2i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v2i64:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v2i64:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v2i64:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -109,22 +109,22 @@ entry:
define <16 x i8> @test_v16i8_unaligned(<16 x i8>* %V) {
; SSE-LABEL: test_v16i8_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i8_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v16i8_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i8_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -134,22 +134,22 @@ entry:
define <8 x i16> @test_v8i16_unaligned(<8 x i16>* %V) {
; SSE-LABEL: test_v8i16_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i16_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v8i16_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i16_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -159,22 +159,22 @@ entry:
define <4 x i32> @test_v4i32_unaligned(<4 x i32>* %V) {
; SSE-LABEL: test_v4i32_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v4i32_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v4i32_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v4i32_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -184,22 +184,22 @@ entry:
define <2 x i64> @test_v2i64_unaligned(<2 x i64>* %V) {
; SSE-LABEL: test_v2i64_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v2i64_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v2i64_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v2i64_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -209,12 +209,12 @@ entry:
define <4 x float> @test_v4f32(<4 x float>* %V) {
; SSE-LABEL: test_v4f32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -224,12 +224,12 @@ entry:
define <2 x double> @test_v2f64(<2 x double>* %V) {
; SSE-LABEL: test_v2f64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movapd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -239,12 +239,12 @@ entry:
define <4 x float> @test_v4f32_unaligned(<4 x float>* %V) {
; SSE-LABEL: test_v4f32_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f32_unaligned:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -254,12 +254,12 @@ entry:
define <2 x double> @test_v2f64_unaligned(<2 x double>* %V) {
; SSE-LABEL: test_v2f64_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movupd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_unaligned:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovupd (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -269,22 +269,22 @@ entry:
define <16 x i8> @test_v16i8_abi_alignment(<16 x i8>* %V) {
; SSE-LABEL: test_v16i8_abi_alignment:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i8_abi_alignment:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v16i8_abi_alignment:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i8_abi_alignment:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -294,22 +294,22 @@ entry:
define <8 x i16> @test_v8i16_abi_alignment(<8 x i16>* %V) {
; SSE-LABEL: test_v8i16_abi_alignment:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i16_abi_alignment:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v8i16_abi_alignment:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i16_abi_alignment:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -319,22 +319,22 @@ entry:
define <4 x i32> @test_v4i32_abi_alignment(<4 x i32>* %V) {
; SSE-LABEL: test_v4i32_abi_alignment:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v4i32_abi_alignment:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v4i32_abi_alignment:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v4i32_abi_alignment:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -344,22 +344,22 @@ entry:
define <2 x i64> @test_v2i64_abi_alignment(<2 x i64>* %V) {
; SSE-LABEL: test_v2i64_abi_alignment:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v2i64_abi_alignment:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %xmm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v2i64_abi_alignment:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v2i64_abi_alignment:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %xmm0
; SKX-NEXT: retq
entry:
@@ -369,12 +369,12 @@ entry:
define <4 x float> @test_v4f32_abi_alignment(<4 x float>* %V) {
; SSE-LABEL: test_v4f32_abi_alignment:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f32_abi_alignment:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -384,12 +384,12 @@ entry:
define <2 x double> @test_v2f64_abi_alignment(<2 x double>* %V) {
; SSE-LABEL: test_v2f64_abi_alignment:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movapd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_abi_alignment:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -399,23 +399,23 @@ entry:
define <32 x i8> @test_v32i8(<32 x i8>* %V) {
; SSE-LABEL: test_v32i8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v32i8:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v32i8:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v32i8:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -425,23 +425,23 @@ entry:
define <16 x i16> @test_v16i16(<16 x i16>* %V) {
; SSE-LABEL: test_v16i16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i16:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v16i16:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i16:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -451,49 +451,49 @@ entry:
define <8 x i32> @test_v8i32(<8 x i32>* %V) {
; SSE-LABEL: test_v8i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i32:
-; AVXONLY: # BB#0: # %entry
-; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
+; AVXONLY: # %bb.0: # %entry
+; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v8i32:
-; KNL: # BB#0: # %entry
-; KNL-NEXT: vmovdqu (%rdi), %ymm0
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: vmovdqa (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i32:
-; SKX: # BB#0: # %entry
-; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
- %0 = load <8 x i32>, <8 x i32>* %V, align 16
+ %0 = load <8 x i32>, <8 x i32>* %V, align 32
ret <8 x i32> %0
}
define <4 x i64> @test_v4i64(<4 x i64>* %V) {
; SSE-LABEL: test_v4i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v4i64:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v4i64:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqa (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v4i64:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqa64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -503,23 +503,23 @@ entry:
define <32 x i8> @test_v32i8_unaligned(<32 x i8>* %V) {
; SSE-LABEL: test_v32i8_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v32i8_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v32i8_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v32i8_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -529,23 +529,23 @@ entry:
define <16 x i16> @test_v16i16_unaligned(<16 x i16>* %V) {
; SSE-LABEL: test_v16i16_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i16_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v16i16_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v16i16_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -555,23 +555,23 @@ entry:
define <8 x i32> @test_v8i32_unaligned(<8 x i32>* %V) {
; SSE-LABEL: test_v8i32_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i32_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v8i32_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v8i32_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -581,23 +581,23 @@ entry:
define <4 x i64> @test_v4i64_unaligned(<4 x i64>* %V) {
; SSE-LABEL: test_v4i64_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v4i64_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovdqu (%rdi), %ymm0
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v4i64_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovdqu (%rdi), %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_v4i64_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0
; SKX-NEXT: retq
entry:
@@ -607,45 +607,45 @@ entry:
define <8 x float> @test_v8f32(<8 x float>* %V) {
; SSE-LABEL: test_v8f32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: retq
entry:
- %0 = load <8 x float>, <8 x float>* %V, align 16
+ %0 = load <8 x float>, <8 x float>* %V, align 32
ret <8 x float> %0
}
define <4 x double> @test_v4f64(<4 x double>* %V) {
; SSE-LABEL: test_v4f64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movapd (%rdi), %xmm0
; SSE-NEXT: movapd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovupd (%rdi), %ymm0
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovapd (%rdi), %ymm0
; AVX-NEXT: retq
entry:
- %0 = load <4 x double>, <4 x double>* %V, align 16
+ %0 = load <4 x double>, <4 x double>* %V, align 32
ret <4 x double> %0
}
define <8 x float> @test_v8f32_unaligned(<8 x float>* %V) {
; SSE-LABEL: test_v8f32_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32_unaligned:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
entry:
@@ -655,13 +655,13 @@ entry:
define <4 x double> @test_v4f64_unaligned(<4 x double>* %V) {
; SSE-LABEL: test_v4f64_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movupd (%rdi), %xmm0
; SSE-NEXT: movupd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_unaligned:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: retq
entry:
@@ -671,7 +671,7 @@ entry:
define <64 x i8> @test_v64i8(<64 x i8>* %V) {
; SSE-LABEL: test_v64i8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
@@ -679,29 +679,29 @@ define <64 x i8> @test_v64i8(<64 x i8>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v64i8:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovaps (%rdi), %ymm0
; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v64i8:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovaps (%rdi), %ymm0
; KNL-NEXT: vmovaps 32(%rdi), %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_v64i8:
-; SKX: # BB#0: # %entry
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm0
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm0
; SKX-NEXT: retq
entry:
- %0 = load <64 x i8>, <64 x i8>* %V, align 32
+ %0 = load <64 x i8>, <64 x i8>* %V, align 64
ret <64 x i8> %0
}
define <32 x i16> @test_v32i16(<32 x i16>* %V) {
; SSE-LABEL: test_v32i16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
@@ -709,29 +709,29 @@ define <32 x i16> @test_v32i16(<32 x i16>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v32i16:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovaps (%rdi), %ymm0
; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v32i16:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovaps (%rdi), %ymm0
; KNL-NEXT: vmovaps 32(%rdi), %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_v32i16:
-; SKX: # BB#0: # %entry
-; SKX-NEXT: vmovdqu64 (%rdi), %zmm0
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: vmovdqa64 (%rdi), %zmm0
; SKX-NEXT: retq
entry:
- %0 = load <32 x i16>, <32 x i16>* %V, align 32
+ %0 = load <32 x i16>, <32 x i16>* %V, align 64
ret <32 x i16> %0
}
define <16 x i32> @test_v16i32(<16 x i32>* %V) {
; SSE-LABEL: test_v16i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
@@ -739,23 +739,23 @@ define <16 x i32> @test_v16i32(<16 x i32>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i32:
-; AVXONLY: # BB#0: # %entry
-; AVXONLY-NEXT: vmovups (%rdi), %ymm0
-; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
+; AVXONLY: # %bb.0: # %entry
+; AVXONLY-NEXT: vmovaps (%rdi), %ymm0
+; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
- %0 = load <16 x i32>, <16 x i32>* %V, align 16
+ %0 = load <16 x i32>, <16 x i32>* %V, align 64
ret <16 x i32> %0
}
define <8 x i64> @test_v8i64(<8 x i64>* %V) {
; SSE-LABEL: test_v8i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
@@ -763,23 +763,23 @@ define <8 x i64> @test_v8i64(<8 x i64>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i64:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovaps (%rdi), %ymm0
; AVXONLY-NEXT: vmovaps 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
- %0 = load <8 x i64>, <8 x i64>* %V, align 32
+ %0 = load <8 x i64>, <8 x i64>* %V, align 64
ret <8 x i64> %0
}
define <64 x i8> @test_v64i8_unaligned(<64 x i8>* %V) {
; SSE-LABEL: test_v64i8_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -787,19 +787,19 @@ define <64 x i8> @test_v64i8_unaligned(<64 x i8>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v64i8_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovups (%rdi), %ymm0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v64i8_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovups (%rdi), %ymm0
; KNL-NEXT: vmovups 32(%rdi), %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_v64i8_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %zmm0
; SKX-NEXT: retq
entry:
@@ -809,7 +809,7 @@ entry:
define <32 x i16> @test_v32i16_unaligned(<32 x i16>* %V) {
; SSE-LABEL: test_v32i16_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -817,19 +817,19 @@ define <32 x i16> @test_v32i16_unaligned(<32 x i16>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v32i16_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovups (%rdi), %ymm0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; KNL-LABEL: test_v32i16_unaligned:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vmovups (%rdi), %ymm0
; KNL-NEXT: vmovups 32(%rdi), %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: test_v32i16_unaligned:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vmovdqu64 (%rdi), %zmm0
; SKX-NEXT: retq
entry:
@@ -839,7 +839,7 @@ entry:
define <16 x i32> @test_v16i32_unaligned(<16 x i32>* %V) {
; SSE-LABEL: test_v16i32_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -847,13 +847,13 @@ define <16 x i32> @test_v16i32_unaligned(<16 x i32>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16i32_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovups (%rdi), %ymm0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v16i32_unaligned:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
@@ -863,7 +863,7 @@ entry:
define <8 x i64> @test_v8i64_unaligned(<8 x i64>* %V) {
; SSE-LABEL: test_v8i64_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -871,13 +871,13 @@ define <8 x i64> @test_v8i64_unaligned(<8 x i64>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8i64_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovups (%rdi), %ymm0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v8i64_unaligned:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
@@ -887,23 +887,23 @@ entry:
define <8 x float> @test_v16f32(<8 x float>* %V) {
; SSE-LABEL: test_v16f32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16f32:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vmovups (%rdi), %ymm0
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: retq
entry:
- %0 = load <8 x float>, <8 x float>* %V, align 16
+ %0 = load <8 x float>, <8 x float>* %V, align 64
ret <8 x float> %0
}
define <8 x double> @test_v8f64(<8 x double>* %V) {
; SSE-LABEL: test_v8f64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movapd (%rdi), %xmm0
; SSE-NEXT: movapd 16(%rdi), %xmm1
; SSE-NEXT: movapd 32(%rdi), %xmm2
@@ -911,23 +911,23 @@ define <8 x double> @test_v8f64(<8 x double>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8f64:
-; AVXONLY: # BB#0: # %entry
-; AVXONLY-NEXT: vmovupd (%rdi), %ymm0
-; AVXONLY-NEXT: vmovupd 32(%rdi), %ymm1
+; AVXONLY: # %bb.0: # %entry
+; AVXONLY-NEXT: vmovapd (%rdi), %ymm0
+; AVXONLY-NEXT: vmovapd 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovupd (%rdi), %zmm0
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovapd (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
- %0 = load <8 x double>, <8 x double>* %V, align 16
+ %0 = load <8 x double>, <8 x double>* %V, align 64
ret <8 x double> %0
}
define <16 x float> @test_v16f32_unaligned(<16 x float>* %V) {
; SSE-LABEL: test_v16f32_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -935,13 +935,13 @@ define <16 x float> @test_v16f32_unaligned(<16 x float>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v16f32_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovups (%rdi), %ymm0
; AVXONLY-NEXT: vmovups 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v16f32_unaligned:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
@@ -951,7 +951,7 @@ entry:
define <8 x double> @test_v8f64_unaligned(<8 x double>* %V) {
; SSE-LABEL: test_v8f64_unaligned:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movupd (%rdi), %xmm0
; SSE-NEXT: movupd 16(%rdi), %xmm1
; SSE-NEXT: movupd 32(%rdi), %xmm2
@@ -959,13 +959,13 @@ define <8 x double> @test_v8f64_unaligned(<8 x double>* %V) {
; SSE-NEXT: retq
;
; AVXONLY-LABEL: test_v8f64_unaligned:
-; AVXONLY: # BB#0: # %entry
+; AVXONLY: # %bb.0: # %entry
; AVXONLY-NEXT: vmovupd (%rdi), %ymm0
; AVXONLY-NEXT: vmovupd 32(%rdi), %ymm1
; AVXONLY-NEXT: retq
;
; AVX512-LABEL: test_v8f64_unaligned:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovupd (%rdi), %zmm0
; AVX512-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index aa6d9b7cf056..3923cf9bb922 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -88,22 +88,3 @@ entry:
; CHECK: addl $28
}
declare fastcc void @test4fastccsret(%struct.a* sret)
-
-
-; Check that fast-isel cleans up when it fails to lower a call instruction.
-define void @test5() {
-entry:
- %call = call i32 @test5dllimport(i32 42)
- ret void
-; CHECK-LABEL: test5:
-; Local value area is still there:
-; CHECK: movl $42, {{%[a-z]+}}
-; Fast-ISel's arg push is not here:
-; CHECK-NOT: movl $42, (%esp)
-; SDag-ISel's arg push:
-; CHECK: movl %esp, [[REGISTER:%[a-z]+]]
-; CHECK: movl $42, ([[REGISTER]])
-; CHECK: movl L_test5dllimport$non_lazy_ptr-L8$pb(%eax), %eax
-
-}
-declare dllimport i32 @test5dllimport(i32)
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index 375814c8afcd..dbc13ba7ed78 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs -march=x86 -mattr=sse2 -no-integrated-as
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=i686-- -mattr=sse2 -no-integrated-as
; RUN: llc < %s -fast-isel -fast-isel-abort=1 -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -no-integrated-as
; This tests very minimal fast-isel functionality.
diff --git a/test/CodeGen/X86/fastcc-sret.ll b/test/CodeGen/X86/fastcc-sret.ll
index 499aadda44fa..2962f8ec1ffe 100644
--- a/test/CodeGen/X86/fastcc-sret.ll
+++ b/test/CodeGen/X86/fastcc-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt=false | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -tailcallopt=false | FileCheck %s
%struct.foo = type { [4 x i32] }
diff --git a/test/CodeGen/X86/fastcc3struct.ll b/test/CodeGen/X86/fastcc3struct.ll
index 98dc2f5a1c78..b8e2631248ab 100644
--- a/test/CodeGen/X86/fastcc3struct.ll
+++ b/test/CodeGen/X86/fastcc3struct.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; CHECK: movl {{.}}12, %eax
; CHECK: movl {{.}}24, %edx
diff --git a/test/CodeGen/X86/fastisel-softfloat.ll b/test/CodeGen/X86/fastisel-softfloat.ll
index e4330db81e1a..579637e83446 100644
--- a/test/CodeGen/X86/fastisel-softfloat.ll
+++ b/test/CodeGen/X86/fastisel-softfloat.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
define float @pr26522(float %pat) #0 {
; CHECK-LABEL: pr26522:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
ret float %pat
diff --git a/test/CodeGen/X86/fcmove.ll b/test/CodeGen/X86/fcmove.ll
index 21cc683f734f..35dbb68117ba 100644
--- a/test/CodeGen/X86/fcmove.ll
+++ b/test/CodeGen/X86/fcmove.ll
@@ -12,4 +12,4 @@ define x86_fp80 @cmove_f(x86_fp80 %a, x86_fp80 %b, i32 %c) {
%add = fadd x86_fp80 %a, %b
%ret = select i1 %test, x86_fp80 %add, x86_fp80 %b
ret x86_fp80 %ret
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/fdiv-combine.ll b/test/CodeGen/X86/fdiv-combine.ll
index d9d9ac401fb5..912110e75d27 100644
--- a/test/CodeGen/X86/fdiv-combine.ll
+++ b/test/CodeGen/X86/fdiv-combine.ll
@@ -7,7 +7,7 @@
define float @div1_arcp(float %x, float %y, float %z) {
; CHECK-LABEL: div1_arcp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
; CHECK-NEXT: retq
%div1 = fdiv arcp float %x, %y
@@ -18,7 +18,7 @@ define float @div1_arcp(float %x, float %y, float %z) {
define float @div2_arcp_all(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_all:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm2, %xmm3
; CHECK-NEXT: mulss %xmm3, %xmm0
@@ -35,7 +35,7 @@ define float @div2_arcp_all(float %x, float %y, float %z) {
define float @div2_arcp_partial1(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_partial1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: divss %xmm2, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: divss %xmm2, %xmm0
@@ -50,7 +50,7 @@ define float @div2_arcp_partial1(float %x, float %y, float %z) {
define float @div2_arcp_partial2(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_partial2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: divss %xmm2, %xmm0
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: divss %xmm2, %xmm0
@@ -65,7 +65,7 @@ define float @div2_arcp_partial2(float %x, float %y, float %z) {
define float @div2_arcp_partial3(float %x, float %y, float %z) {
; CHECK-LABEL: div2_arcp_partial3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: divss %xmm2, %xmm3
; CHECK-NEXT: mulss %xmm3, %xmm0
@@ -83,7 +83,7 @@ define float @div2_arcp_partial3(float %x, float %y, float %z) {
define double @div3_arcp(double %x, double %y, double %z) {
; CHECK-LABEL: div3_arcp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; CHECK-NEXT: divsd %xmm1, %xmm2
; CHECK-NEXT: mulsd %xmm2, %xmm0
diff --git a/test/CodeGen/X86/fdiv.ll b/test/CodeGen/X86/fdiv.ll
index 226e6d269c3b..f3956ecc0ea3 100644
--- a/test/CodeGen/X86/fdiv.ll
+++ b/test/CodeGen/X86/fdiv.ll
@@ -4,7 +4,7 @@
define double @exact(double %x) {
; Exact division by a constant converted to multiplication.
; CHECK-LABEL: exact:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulsd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%div = fdiv double %x, 2.0
@@ -14,7 +14,7 @@ define double @exact(double %x) {
define double @inexact(double %x) {
; Inexact division by a constant converted to multiplication.
; CHECK-LABEL: inexact:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: mulsd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%div = fdiv double %x, 0x41DFFFFFFFC00000
@@ -24,7 +24,7 @@ define double @inexact(double %x) {
define double @funky(double %x) {
; No conversion to multiplication if too funky.
; CHECK-LABEL: funky:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: divsd %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -35,7 +35,7 @@ define double @funky(double %x) {
define double @denormal1(double %x) {
; Don't generate multiplication by a denormal.
; CHECK-LABEL: denormal1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: divsd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%div = fdiv double %x, 0x7FD0000000000001
@@ -45,7 +45,7 @@ define double @denormal1(double %x) {
define double @denormal2(double %x) {
; Don't generate multiplication by a denormal.
; CHECK-LABEL: denormal2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: divsd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%div = fdiv double %x, 0x7FEFFFFFFFFFFFFF
@@ -56,7 +56,7 @@ define double @denormal2(double %x) {
define float @double_negative(float %x, float %y) #0 {
; CHECK-LABEL: double_negative:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
; CHECK-NEXT: retq
%neg1 = fsub float -0.0, %x
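; Editor's note (not part of the patch): the fdiv.ll checks above exercise the
; DAG combine that rewrites floating-point division by a constant into
; multiplication by its reciprocal. Per the CHECK lines, the "exact" and
; "inexact" cases are expected to compile to mulsd, while the "funky" and
; denormal-reciprocal cases must keep divsd. As a minimal illustrative sketch
; with made-up function names, the exact case corresponds at the IR level to
; the following, where 0.5 is exactly representable so the rewrite is
; value-preserving:
;
;   define double @half_sketch(double %x) {
;     %r = fdiv double %x, 2.0
;     ret double %r
;   }
;
;   ; the backend may emit this as if it had been written:
;   define double @half_sketch_mul(double %x) {
;     %r = fmul double %x, 5.000000e-01
;     ret double %r
;   }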
diff --git a/test/CodeGen/X86/fentry-insertion.ll b/test/CodeGen/X86/fentry-insertion.ll
index a585d96b209c..c5fb3b254b26 100644
--- a/test/CodeGen/X86/fentry-insertion.ll
+++ b/test/CodeGen/X86/fentry-insertion.ll
@@ -12,5 +12,19 @@ entry:
; CHECK: retq
}
-attributes #0 = { "fentry-call"="true" }
+define void @test2() #1 {
+entry:
+ br label %bb1
+bb1:
+ call void @address_taken(i64 ptrtoint (i8* blockaddress(@test2, %bb1) to i64), i32 512)
+ ret void
+; CHECK-LABEL: @test2
+; CHECK: callq __fentry__
+; CHECK-NOT: mcount
+; CHECK: retq
+}
+
+declare void @address_taken(i64, i32) local_unnamed_addr
+attributes #0 = { "fentry-call"="true" }
+attributes #1 = { inlinehint minsize noredzone nounwind optsize sspstrong "fentry-call"="true" }
diff --git a/test/CodeGen/X86/field-extract-use-trunc.ll b/test/CodeGen/X86/field-extract-use-trunc.ll
index 735e1341f65b..e7d2f41f0c2c 100644
--- a/test/CodeGen/X86/field-extract-use-trunc.ll
+++ b/test/CodeGen/X86/field-extract-use-trunc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep sar | count 1
-; RUN: llc < %s -march=x86-64 | not grep sar
+; RUN: llc < %s -mtriple=i686-- | grep sar | count 1
+; RUN: llc < %s -mtriple=x86_64-- | not grep sar
define i32 @test(i32 %f12) nounwind {
%tmp7.25 = lshr i32 %f12, 16
diff --git a/test/CodeGen/X86/fildll.ll b/test/CodeGen/X86/fildll.ll
index c5a3765c717b..aeb753c062bf 100644
--- a/test/CodeGen/X86/fildll.ll
+++ b/test/CodeGen/X86/fildll.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=att -mattr=-sse2 | grep fildll | count 2
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=att -mattr=-sse2 | grep fildll | count 2
define fastcc double @sint64_to_fp(i64 %X) {
%R = sitofp i64 %X to double ; <double> [#uses=1]
diff --git a/test/CodeGen/X86/file-directive.ll b/test/CodeGen/X86/file-directive.ll
new file mode 100644
index 000000000000..4b25a0909ea1
--- /dev/null
+++ b/test/CodeGen/X86/file-directive.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple=x86_64-linux-gnu -filetype=asm < %s | FileCheck %s --check-prefix=DIRECTIVE
+; RUN: llc -mtriple=x86_64-linux-gnu -filetype=obj < %s | llvm-readobj -symbols | FileCheck %s --check-prefix=STT-FILE
+
+; DIRECTIVE: .file "foobar"
+; STT-FILE: Name: foobar
+; STT-FILE-NEXT: Value: 0x0
+; STT-FILE-NEXT: Size: 0
+; STT-FILE-NEXT: Binding: Local
+; STT-FILE-NEXT: Type: File
+; STT-FILE-NEXT: Other: 0
+; STT-FILE-NEXT: Section: Absolute
+
+source_filename = "/path/to/foobar"
diff --git a/test/CodeGen/X86/finite-libcalls.ll b/test/CodeGen/X86/finite-libcalls.ll
new file mode 100644
index 000000000000..e77017b0064b
--- /dev/null
+++ b/test/CodeGen/X86/finite-libcalls.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=GNU
+; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefix=CHECK --check-prefix=WIN
+
+; PR35672 - https://bugs.llvm.org/show_bug.cgi?id=35672
+; FIXME: We would not need the function-level attributes if FMF were propagated to DAG nodes for this case.
+
+define float @exp_f32(float %x) #0 {
+; CHECK-LABEL: exp_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: jmp expf # TAILCALL
+ %exp = tail call nnan ninf float @llvm.exp.f32(float %x)
+ ret float %exp
+}
+
+define double @exp_f64(double %x) #0 {
+; CHECK-LABEL: exp_f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: jmp exp # TAILCALL
+ %exp = tail call nnan ninf double @llvm.exp.f64(double %x)
+ ret double %exp
+}
+
+define x86_fp80 @exp_f80(x86_fp80 %x) #0 {
+; GNU-LABEL: exp_f80:
+; GNU: # %bb.0:
+; GNU-NEXT: subq $24, %rsp
+; GNU-NEXT: fldt {{[0-9]+}}(%rsp)
+; GNU-NEXT: fstpt (%rsp)
+; GNU-NEXT: callq expl
+; GNU-NEXT: addq $24, %rsp
+; GNU-NEXT: retq
+;
+; WIN-LABEL: exp_f80:
+; WIN: # %bb.0:
+; WIN-NEXT: subq $56, %rsp
+; WIN-NEXT: fldt {{[0-9]+}}(%rsp)
+; WIN-NEXT: fstpt {{[0-9]+}}(%rsp)
+; WIN-NEXT: callq expl
+; WIN-NEXT: addq $56, %rsp
+; WIN-NEXT: retq
+ %exp = tail call nnan ninf x86_fp80 @llvm.exp.f80(x86_fp80 %x)
+ ret x86_fp80 %exp
+}
+
+declare float @llvm.exp.f32(float) #1
+declare double @llvm.exp.f64(double) #1
+declare x86_fp80 @llvm.exp.f80(x86_fp80) #1
+
+attributes #0 = { nounwind "no-infs-fp-math"="true" "no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone speculatable }
+
diff --git a/test/CodeGen/X86/fixup-bw-copy.ll b/test/CodeGen/X86/fixup-bw-copy.ll
index 9067dfd29c17..dead278bb0d3 100644
--- a/test/CodeGen/X86/fixup-bw-copy.ll
+++ b/test/CodeGen/X86/fixup-bw-copy.ll
@@ -8,17 +8,17 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
define i8 @test_movb(i8 %a0) {
; BWON64-LABEL: test_movb:
-; BWON64: # BB#0:
+; BWON64: # %bb.0:
; BWON64-NEXT: movl %edi, %eax
; BWON64-NEXT: retq
;
; BWOFF64-LABEL: test_movb:
-; BWOFF64: # BB#0:
+; BWOFF64: # %bb.0:
; BWOFF64-NEXT: movb %dil, %al
; BWOFF64-NEXT: retq
;
; X32-LABEL: test_movb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: retl
ret i8 %a0
@@ -26,22 +26,22 @@ define i8 @test_movb(i8 %a0) {
define i16 @test_movw(i16 %a0) {
; BWON64-LABEL: test_movw:
-; BWON64: # BB#0:
+; BWON64: # %bb.0:
; BWON64-NEXT: movl %edi, %eax
; BWON64-NEXT: retq
;
; BWOFF64-LABEL: test_movw:
-; BWOFF64: # BB#0:
+; BWOFF64: # %bb.0:
; BWOFF64-NEXT: movw %di, %ax
; BWOFF64-NEXT: retq
;
; BWON32-LABEL: test_movw:
-; BWON32: # BB#0:
+; BWON32: # %bb.0:
; BWON32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; BWON32-NEXT: retl
;
; BWOFF32-LABEL: test_movw:
-; BWOFF32: # BB#0:
+; BWOFF32: # %bb.0:
; BWOFF32-NEXT: movw {{[0-9]+}}(%esp), %ax
; BWOFF32-NEXT: retl
ret i16 %a0
@@ -50,15 +50,15 @@ define i16 @test_movw(i16 %a0) {
; Verify we don't mess with H-reg copies (only generated in 32-bit mode).
define i8 @test_movb_hreg(i16 %a0) {
; X64-LABEL: test_movb_hreg:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl $8, %eax
; X64-NEXT: addb %dil, %al
-; X64-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-NEXT: # kill: def %al killed %al killed %eax
; X64-NEXT: retq
;
; X32-LABEL: test_movb_hreg:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addb %al, %ah
; X32-NEXT: movb %ah, %al
diff --git a/test/CodeGen/X86/fixup-bw-inst.ll b/test/CodeGen/X86/fixup-bw-inst.ll
index 6f83e6362d56..0e90921227be 100644
--- a/test/CodeGen/X86/fixup-bw-inst.ll
+++ b/test/CodeGen/X86/fixup-bw-inst.ll
@@ -1,6 +1,6 @@
-; RUN: llc -fixup-byte-word-insts=1 -march=x86-64 < %s | \
+; RUN: llc -fixup-byte-word-insts=1 < %s | \
; RUN: FileCheck -check-prefix CHECK -check-prefix BWON %s
-; RUN: llc -fixup-byte-word-insts=0 -march=x86-64 < %s | \
+; RUN: llc -fixup-byte-word-insts=0 < %s | \
; RUN: FileCheck -check-prefix CHECK -check-prefix BWOFF %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
diff --git a/test/CodeGen/X86/fixup-bw-inst.mir b/test/CodeGen/X86/fixup-bw-inst.mir
new file mode 100644
index 000000000000..cea483e1b9bc
--- /dev/null
+++ b/test/CodeGen/X86/fixup-bw-inst.mir
@@ -0,0 +1,151 @@
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass x86-fixup-bw-insts %s -o - | FileCheck %s
+
+--- |
+ define void @test1() { ret void }
+ define void @test2() { ret void }
+
+ define i16 @test3(i16* readonly %p) {
+    ; Keep the original IR to show how a situation like this can arise
+    ; due to preceding CodeGen passes.
+    ;
+    ; %0 is used in the %if.end BB (before tail-duplication), so its
+    ; corresponding super-register (EAX) is live-in to that BB (%if.end)
+    ; and the load carries an implicit-def of EAX. Make sure that we still
+    ; change the movw into movzwl, because EAX is not live before the load
+    ; (which can be seen from the missing implicit use of EAX).
+ entry:
+ %tobool = icmp eq i16* %p, null
+ br i1 %tobool, label %if.end, label %if.then
+
+ if.then: ; preds = %entry
+ %0 = load i16, i16* %p, align 2
+ br label %if.end
+
+ if.end: ; preds = %if.then, %entry
+ %i.0 = phi i16 [ %0, %if.then ], [ 0, %entry ]
+ ret i16 %i.0
+ }
+
+...
+---
+# CHECK-LABEL: name: test1
+name: test1
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%rax' }
+frameInfo:
+ stackSize: 0
+fixedStack:
+stack:
+constants:
+# Verify that "movw (%rax), %ax" is changed to "movzwl (%rax), %eax".
+#
+# For that to happen, the liveness information after the MOV16rm
+# instruction should be used, not before it, because %rax is live
+# before the MOV and is killed by it.
+body: |
+ bb.0:
+ liveins: %rax
+
+ %ax = MOV16rm killed %rax, 1, %noreg, 0, %noreg
+ ; CHECK: %eax = MOVZX32rm16 killed %rax
+
+ RETQ %ax
+
+...
+---
+# CHECK-LABEL: name: test2
+name: test2
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%rax' }
+frameInfo:
+ stackSize: 0
+fixedStack:
+stack:
+constants:
+# An implicit use of any super-register means the register is live before the MOV.
+body: |
+ bb.0:
+ liveins: %dl, %rbx, %rcx, %r14
+
+ %cl = MOV8rr killed %dl, implicit killed %rcx, implicit-def %rcx
+ ; CHECK: %cl = MOV8rr killed %dl, implicit killed %rcx, implicit-def %rcx
+ JMP_1 %bb.1
+ bb.1:
+ liveins: %rcx
+
+ RETQ %cl
+
+...
+---
+# CHECK-LABEL: name: test3
+name: test3
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+liveins:
+ - { reg: '%rdi', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+# After the MOV16rm the whole of %eax is not *really* live, as can be seen
+# from the missing implicit use of it in that MOV. Make sure that MOV is
+# transformed into MOVZX.
+# See the comment near the original IR for how the preceding codegen
+# decisions can lead to this.
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x30000000), %bb.2.if.then(0x50000000)
+ liveins: %rdi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.1, implicit %eflags
+
+ bb.2.if.then:
+ liveins: %rdi
+
+ %ax = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax :: (load 2 from %ir.p)
+ ; CHECK: %eax = MOVZX32rm16 killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax :: (load 2 from %ir.p)
+ %ax = KILL %ax, implicit killed %eax
+ RETQ %ax
+
+ bb.1:
+ %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+ %ax = KILL %ax, implicit killed %eax
+ RETQ %ax
+
+...
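# Editor's note (not part of the patch): the x86-fixup-bw-insts pass exercised
# above widens a byte/word load into a zero-extending 32-bit load when the rest
# of the destination's super-register is not live, so clobbering its upper bits
# is safe (the usual motivation being to avoid a partial-register write and the
# false dependency it creates). A minimal before/after sketch of what test1
# verifies, mirroring its CHECK line and illustrative only:
#
#   ; before the pass: word load into %ax; %rax is killed by the load
#   %ax = MOV16rm killed %rax, 1, %noreg, 0, %noreg
#   RETQ %ax
#
#   ; after the pass: the same load widened to a zero-extending 32-bit load
#   %eax = MOVZX32rm16 killed %rax, 1, %noreg, 0, %noreg
#   RETQ %ax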
diff --git a/test/CodeGen/X86/fixup-lea.ll b/test/CodeGen/X86/fixup-lea.ll
index 1ddc099ffd62..2d58d866b290 100644
--- a/test/CodeGen/X86/fixup-lea.ll
+++ b/test/CodeGen/X86/fixup-lea.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 | FileCheck %s
+;RUN: llc < %s -mtriple=i686-- | FileCheck %s
define void @foo(i32 inreg %dns) minsize {
entry:
diff --git a/test/CodeGen/X86/float-conv-elim.ll b/test/CodeGen/X86/float-conv-elim.ll
index 7ccad2b80c8b..9ec189b05e20 100644
--- a/test/CodeGen/X86/float-conv-elim.ll
+++ b/test/CodeGen/X86/float-conv-elim.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-unknown-linux-gnu -march=x86-64 -mcpu=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64 < %s | FileCheck %s
; Make sure the float conversion is folded away as it should be.
; CHECK-LABEL: foo
diff --git a/test/CodeGen/X86/floor-soft-float.ll b/test/CodeGen/X86/floor-soft-float.ll
index 3b28ecc6379d..ad98c34e464e 100644
--- a/test/CodeGen/X86/floor-soft-float.ll
+++ b/test/CodeGen/X86/floor-soft-float.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx | FileCheck %s --check-prefix=CHECK-HARD-FLOAT
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx,+soft-float | FileCheck %s --check-prefix=CHECK-SOFT-FLOAT
+; RUN: llc < %s -mattr=+sse4.1,-avx | FileCheck %s --check-prefix=CHECK-HARD-FLOAT
+; RUN: llc < %s -mattr=+sse4.1,-avx,+soft-float | FileCheck %s --check-prefix=CHECK-SOFT-FLOAT
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/fma-commute-x86.ll b/test/CodeGen/X86/fma-commute-x86.ll
index 162a97ac025c..f8ae88d68e09 100644
--- a/test/CodeGen/X86/fma-commute-x86.ll
+++ b/test/CodeGen/X86/fma-commute-x86.ll
@@ -1,193 +1,194 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma | FileCheck %s
-; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s --check-prefix=FMA
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma | FileCheck %s --check-prefix=FMA
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s --check-prefix=FMA
attributes #0 = { nounwind }
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_baa_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfmadd213ss %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_baa_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm1
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfmadd213ss %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_aba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfmadd132ss (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_aba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfmadd132ss (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_bba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfmadd213ss (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_bba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfmadd213ss (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_baa_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfmadd132ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_baa_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfmadd132ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_aba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfmadd231ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_aba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfmadd231ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_bba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfmadd213ps (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_bba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfmadd213ps (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_baa_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfmadd132ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_baa_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfmadd132ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_aba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfmadd231ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_aba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfmadd231ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_bba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-NEXT: vfmadd213ps (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_bba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %ymm0
+; FMA-NEXT: vfmadd213ps (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_baa_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfmadd213sd %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_baa_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm1
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfmadd213sd %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_aba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfmadd132sd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_aba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfmadd132sd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_bba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfmadd213sd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_bba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfmadd213sd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_baa_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfmadd132pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_baa_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfmadd132pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_aba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfmadd231pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_aba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfmadd231pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_bba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfmadd213pd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_bba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfmadd213pd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_baa_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfmadd132pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_baa_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfmadd132pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_aba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfmadd231pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_aba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfmadd231pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmadd_bba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-NEXT: vfmadd213pd (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmadd_bba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %ymm0
+; FMA-NEXT: vfmadd213pd (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
@@ -195,377 +196,376 @@ define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #
declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_baa_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfnmadd213ss %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_baa_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm1
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfnmadd213ss %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_aba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfnmadd132ss (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_aba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfnmadd132ss (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_bba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfnmadd213ss (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_bba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfnmadd213ss (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_baa_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfnmadd132ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_baa_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfnmadd132ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_aba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfnmadd231ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_aba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfnmadd231ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_bba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfnmadd213ps (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_bba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfnmadd213ps (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_baa_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfnmadd132ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_baa_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfnmadd132ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_aba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfnmadd231ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_aba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfnmadd231ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_bba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-NEXT: vfnmadd213ps (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_bba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %ymm0
+; FMA-NEXT: vfnmadd213ps (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_baa_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfnmadd213sd %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_baa_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm1
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfnmadd213sd %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_aba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfnmadd132sd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_aba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfnmadd132sd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_bba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfnmadd213sd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_bba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfnmadd213sd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_baa_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfnmadd132pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_baa_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfnmadd132pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_aba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfnmadd231pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_aba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfnmadd231pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_bba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfnmadd213pd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_bba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfnmadd213pd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_baa_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfnmadd132pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_baa_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfnmadd132pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_aba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfnmadd231pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_aba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfnmadd231pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmadd_bba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-NEXT: vfnmadd213pd (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmadd_bba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %ymm0
+; FMA-NEXT: vfnmadd213pd (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
-
declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmsub_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_baa_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfmsub213ss %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_baa_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm1
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfmsub213ss %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmsub_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_aba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfmsub132ss (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_aba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfmsub132ss (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmsub_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_bba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfmsub213ss (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_bba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfmsub213ss (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_baa_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfmsub132ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_baa_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfmsub132ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_aba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfmsub231ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_aba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfmsub231ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_bba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfmsub213ps (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_bba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfmsub213ps (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_baa_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfmsub132ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_baa_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfmsub132ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_aba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfmsub231ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_aba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfmsub231ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_bba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-NEXT: vfmsub213ps (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_bba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %ymm0
+; FMA-NEXT: vfmsub213ps (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmsub_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_baa_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfmsub213sd %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_baa_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm1
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfmsub213sd %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmsub_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_aba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfmsub132sd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_aba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfmsub132sd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmsub_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_bba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfmsub213sd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_bba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfmsub213sd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_baa_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfmsub132pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_baa_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfmsub132pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_aba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfmsub231pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_aba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfmsub231pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_bba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfmsub213pd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_bba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfmsub213pd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_baa_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfmsub132pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_baa_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfmsub132pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_aba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfmsub231pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_aba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfmsub231pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fmsub_bba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-NEXT: vfmsub213pd (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fmsub_bba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %ymm0
+; FMA-NEXT: vfmsub213pd (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
@@ -573,188 +573,188 @@ define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #
declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmsub_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_baa_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfnmsub213ss %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_baa_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm1
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfnmsub213ss %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmsub_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_aba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfnmsub132ss (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_aba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfnmsub132ss (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmsub_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_bba_ss:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfnmsub213ss (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_bba_ss:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfnmsub213ss (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_baa_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfnmsub132ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_baa_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfnmsub132ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_aba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %xmm0
-; CHECK-NEXT: vfnmsub231ps (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_aba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %xmm0
+; FMA-NEXT: vfnmsub231ps (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_bba_ps:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-NEXT: vfnmsub213ps (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_bba_ps:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %xmm0
+; FMA-NEXT: vfnmsub213ps (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
ret <4 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_baa_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfnmsub132ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_baa_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfnmsub132ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_aba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rcx), %ymm0
-; CHECK-NEXT: vfnmsub231ps (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_aba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rcx), %ymm0
+; FMA-NEXT: vfnmsub231ps (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_bba_ps_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdx), %ymm0
-; CHECK-NEXT: vfnmsub213ps (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_bba_ps_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovaps (%rdx), %ymm0
+; FMA-NEXT: vfnmsub213ps (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
ret <8 x float> %res
}
declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmsub_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_baa_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%rcx\), %xmm1}}
-; CHECK-NEXT: vfnmsub213sd %xmm1, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_baa_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm1
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfnmsub213sd %xmm1, %xmm1, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmsub_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_aba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfnmsub132sd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_aba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfnmsub132sd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmsub_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_bba_sd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfnmsub213sd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_bba_sd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfnmsub213sd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_baa_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfnmsub132pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_baa_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfnmsub132pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_aba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %xmm0
-; CHECK-NEXT: vfnmsub231pd (%rdx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_aba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %xmm0
+; FMA-NEXT: vfnmsub231pd (%rdx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_bba_pd:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %xmm0
-; CHECK-NEXT: vfnmsub213pd (%rcx), %xmm0, %xmm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_bba_pd:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %xmm0
+; FMA-NEXT: vfnmsub213pd (%rcx), %xmm0, %xmm0
+; FMA-NEXT: retq
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
ret <2 x double> %res
}
declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_baa_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfnmsub132pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_baa_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfnmsub132pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_aba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rcx), %ymm0
-; CHECK-NEXT: vfnmsub231pd (%rdx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_aba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rcx), %ymm0
+; FMA-NEXT: vfnmsub231pd (%rdx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
-; CHECK-LABEL: test_x86_fnmsub_bba_pd_y:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovapd (%rdx), %ymm0
-; CHECK-NEXT: vfnmsub213pd (%rcx), %ymm0, %ymm0
-; CHECK-NEXT: retq
+; FMA-LABEL: test_x86_fnmsub_bba_pd_y:
+; FMA: # %bb.0:
+; FMA-NEXT: vmovapd (%rdx), %ymm0
+; FMA-NEXT: vfnmsub213pd (%rcx), %ymm0, %ymm0
+; FMA-NEXT: retq
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
ret <4 x double> %res
}
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index d1d69c68af7b..8dacf2dcf971 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -7,7 +7,7 @@
define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -23,7 +23,7 @@ declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x fl
define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -34,7 +34,7 @@ entry:
define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -45,7 +45,7 @@ entry:
define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -56,7 +56,7 @@ entry:
define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -67,7 +67,7 @@ entry:
define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -79,7 +79,7 @@ entry:
define <8 x float> @test7(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
@@ -90,14 +90,14 @@ entry:
define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
; SKX-LABEL: test8:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm2, %ymm2
; SKX-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; SKX-NEXT: retq
;
; KNL-LABEL: test8:
-; KNL: # BB#0: # %entry
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0,-0,-0,-0,-0,-0,-0,-0]
; KNL-NEXT: vxorps %ymm3, %ymm2, %ymm2
; KNL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; KNL-NEXT: retq
@@ -112,7 +112,7 @@ declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x f
define <8 x double> @test9(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
@@ -125,7 +125,7 @@ declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a, <8 x d
define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test10:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
; CHECK-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -139,15 +139,15 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x doubl
define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
; SKX-LABEL: test11:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test11:
-; KNL: # BB#0: # %entry
-; KNL-NEXT: vbroadcastss {{.*}}(%rip), %xmm0
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0,-0,-0,-0]
; KNL-NEXT: vxorps %xmm0, %xmm2, %xmm0
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
@@ -160,16 +160,38 @@ entry:
declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test11b:
+; SKX: # %bb.0: # %entry
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test11b:
+; KNL: # %bb.0: # %entry
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
+; KNL-NEXT: vmovaps %xmm1, %xmm0
+; KNL-NEXT: retq
+entry:
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+ ret <4 x float> %0
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
define <8 x double> @test12(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
; SKX-LABEL: test12:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
; SKX-NEXT: vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; SKX-NEXT: retq
;
; KNL-LABEL: test12:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
; KNL-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0
@@ -182,14 +204,14 @@ entry:
define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; SKX-LABEL: test13:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
;
; KNL-LABEL: test13:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
@@ -203,14 +225,14 @@ entry:
define <16 x float> @test14(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
; SKX-LABEL: test14:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; SKX-NEXT: retq
;
; KNL-LABEL: test14:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0
@@ -223,7 +245,7 @@ entry:
define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
; SKX-LABEL: test15:
-; SKX: # BB#0: # %entry
+; SKX: # %bb.0: # %entry
; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm3
; SKX-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
@@ -233,7 +255,7 @@ define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i
; SKX-NEXT: retq
;
; KNL-LABEL: test15:
-; KNL: # BB#0: # %entry
+; KNL: # %bb.0: # %entry
; KNL-NEXT: kmovw %edi, %k1
; KNL-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm3
; KNL-NEXT: vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
@@ -248,3 +270,38 @@ entry:
ret <16 x float> %1
}
+define <16 x float> @test16(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
+; SKX-LABEL: test16:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfmsubadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test16:
+; KNL: # %bb.0:
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfmsubadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
+; KNL-NEXT: retq
+ %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i16 %mask, i32 1)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
+; SKX-LABEL: test17:
+; SKX: # %bb.0:
+; SKX-NEXT: kmovd %edi, %k1
+; SKX-NEXT: vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test17:
+; KNL: # %bb.0:
+; KNL-NEXT: kmovw %edi, %k1
+; KNL-NEXT: vfmsubadd132pd %zmm1, %zmm2, %zmm0 {%k1}
+; KNL-NEXT: retq
+ %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %c
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %sub.i, i8 %mask, i32 4)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
diff --git a/test/CodeGen/X86/fma-intrinsics-x86.ll b/test/CodeGen/X86/fma-intrinsics-x86.ll
index cf4c8933fcab..db1e382ed6b6 100644
--- a/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -1,149 +1,182 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
-; RUN: llc < %s -mtriple=x86_64-pc-windows -march=x86-64 -mcpu=core-avx2 -mattr=+fma,+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA4
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX512VL
+; RUN: llc < %s -mtriple=x86_64-pc-windows -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-WIN
; VFMADD
define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_bac_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_bac_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa9,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_ps:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_pd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_ps_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmadd_pd_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa8,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -151,144 +184,178 @@ declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4
; VFMSUB
define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_bac_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xab,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_bac_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xab,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_ps:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_pd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_ps_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsub_pd_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xaa,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xaa,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -296,144 +363,178 @@ declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4
; VFNMADD
define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_bac_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xad,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_bac_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xad,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_ps:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_pd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xac,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xac,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -441,144 +542,178 @@ declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4
; VFNMSUB
define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_bac_ss:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovaps {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ss (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rcx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xaf,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x0a]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_bac_sd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vmovapd {{\(%rdx\), %xmm0|\(%r8\), %xmm1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1
-; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-FMA-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
+; CHECK-AVX512VL-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213sd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xaf,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>)
define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_ps:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_pd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xae,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfnmsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xae,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -586,72 +721,88 @@ declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4
; VFMADDSUB
define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmaddsub_ps:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmaddsub_pd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa6,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmaddsub213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa6,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
@@ -659,72 +810,88 @@ declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>,
; VFMSUBADD
define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsubadd_ps:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0x71,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsubadd_pd:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %xmm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0
-;
-; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0
-;
-; CHECK-FMA4-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm1 # encoding: [0xc5,0xf9,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %xmm1, %xmm0 # encoding: [0xc4,0xc2,0xf1,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovaps (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %ymm1 # encoding: [0xc5,0xfc,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %ymm0 # encoding: [0xc5,0xfc,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213ps (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0x75,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
-; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256:
-; CHECK-NEXT: # BB#0:
-;
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vmovapd (%{{(rcx|rdx)}}), %ymm{{0|1}}
-; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0
-;
-; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0
-;
-; CHECK-FMA4-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
-;
-; CHECK-NEXT: retq
+; CHECK-FMA-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-FMA: # %bb.0:
+; CHECK-FMA-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-FMA-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-AVX512VL-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-AVX512VL: # %bb.0:
+; CHECK-AVX512VL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa7,0xc2]
+; CHECK-AVX512VL-NEXT: retq # encoding: [0xc3]
+;
+; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK-FMA-WIN: # %bb.0:
+; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %ymm1 # encoding: [0xc5,0xfd,0x28,0x09]
+; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %ymm0 # encoding: [0xc5,0xfd,0x28,0x02]
+; CHECK-FMA-WIN-NEXT: vfmsubadd213pd (%r8), %ymm1, %ymm0 # encoding: [0xc4,0xc2,0xf5,0xa7,0x00]
+; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
%res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
ret <4 x double> %res
}
diff --git a/test/CodeGen/X86/fma-phi-213-to-231.ll b/test/CodeGen/X86/fma-phi-213-to-231.ll
index 34acdfe830f0..78cf1a5a3b70 100644
--- a/test/CodeGen/X86/fma-phi-213-to-231.ll
+++ b/test/CodeGen/X86/fma-phi-213-to-231.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=bdver2 -mattr=-fma4 | FileCheck %s
; Test FMA3 variant selection
diff --git a/test/CodeGen/X86/fma-scalar-memfold.ll b/test/CodeGen/X86/fma-scalar-memfold.ll
index 4b400da3206b..7822139c3e14 100644
--- a/test/CodeGen/X86/fma-scalar-memfold.ll
+++ b/test/CodeGen/X86/fma-scalar-memfold.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
-; RUN: llc < %s -mattr=fma4 | FileCheck %s --check-prefix=FMA4
target triple = "x86_64-unknown-unknown"
@@ -17,18 +16,11 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x d
define void @fmadd_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fmadd_aab_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfmadd213ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmadd_aab_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfmaddss (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -50,18 +42,11 @@ define void @fmadd_aab_ss(float* %a, float* %b) {
define void @fmadd_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fmadd_aba_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmadd_aba_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfmaddss %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -83,18 +68,11 @@ define void @fmadd_aba_ss(float* %a, float* %b) {
define void @fmsub_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fmsub_aab_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfmsub213ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmsub_aab_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfmsubss (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -116,18 +94,11 @@ define void @fmsub_aab_ss(float* %a, float* %b) {
define void @fmsub_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fmsub_aba_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfmsub132ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmsub_aba_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfmsubss %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -149,18 +120,11 @@ define void @fmsub_aba_ss(float* %a, float* %b) {
define void @fnmadd_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmadd_aab_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfnmadd213ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmadd_aab_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfnmaddss (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -182,18 +146,11 @@ define void @fnmadd_aab_ss(float* %a, float* %b) {
define void @fnmadd_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmadd_aba_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfnmadd132ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmadd_aba_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfnmaddss %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -215,18 +172,11 @@ define void @fnmadd_aba_ss(float* %a, float* %b) {
define void @fnmsub_aab_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmsub_aab_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfnmsub213ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmsub_aab_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfnmsubss (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -248,18 +198,11 @@ define void @fnmsub_aab_ss(float* %a, float* %b) {
define void @fnmsub_aba_ss(float* %a, float* %b) {
; CHECK-LABEL: fnmsub_aba_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: vfnmsub132ss (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovss %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmsub_aba_ss:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; FMA4-NEXT: vfnmsubss %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovss %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load float, float* %a
%av0 = insertelement <4 x float> undef, float %a.val, i32 0
%av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
@@ -281,18 +224,11 @@ define void @fnmsub_aba_ss(float* %a, float* %b) {
define void @fmadd_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fmadd_aab_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfmadd213sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmadd_aab_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfmaddsd (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -310,18 +246,11 @@ define void @fmadd_aab_sd(double* %a, double* %b) {
define void @fmadd_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fmadd_aba_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfmadd132sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmadd_aba_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfmaddsd %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -339,18 +268,11 @@ define void @fmadd_aba_sd(double* %a, double* %b) {
define void @fmsub_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fmsub_aab_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfmsub213sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmsub_aab_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfmsubsd (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -368,18 +290,11 @@ define void @fmsub_aab_sd(double* %a, double* %b) {
define void @fmsub_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fmsub_aba_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfmsub132sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fmsub_aba_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfmsubsd %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -397,18 +312,11 @@ define void @fmsub_aba_sd(double* %a, double* %b) {
define void @fnmadd_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmadd_aab_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfnmadd213sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmadd_aab_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfnmaddsd (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -426,18 +334,11 @@ define void @fnmadd_aab_sd(double* %a, double* %b) {
define void @fnmadd_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmadd_aba_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfnmadd132sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmadd_aba_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfnmaddsd %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -455,18 +356,11 @@ define void @fnmadd_aba_sd(double* %a, double* %b) {
define void @fnmsub_aab_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmsub_aab_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfnmsub213sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmsub_aab_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfnmsubsd (%rsi), %xmm0, %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
@@ -484,18 +378,11 @@ define void @fnmsub_aab_sd(double* %a, double* %b) {
define void @fnmsub_aba_sd(double* %a, double* %b) {
; CHECK-LABEL: fnmsub_aba_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vfnmsub132sd (%rsi), %xmm0, %xmm0
; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
; CHECK-NEXT: retq
-;
-; FMA4-LABEL: fnmsub_aba_sd:
-; FMA4: # BB#0:
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FMA4-NEXT: vfnmsubsd %xmm0, (%rsi), %xmm0, %xmm0
-; FMA4-NEXT: vmovlpd %xmm0, (%rdi)
-; FMA4-NEXT: retq
%a.val = load double, double* %a
%av0 = insertelement <2 x double> undef, double %a.val, i32 0
%av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
diff --git a/test/CodeGen/X86/fma-schedule.ll b/test/CodeGen/X86/fma-schedule.ll
new file mode 100644
index 000000000000..6c8a94e651d1
--- /dev/null
+++ b/test/CodeGen/X86/fma-schedule.ll
@@ -0,0 +1,2920 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+;
+; VFMADD
+;
+
+define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddpd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddpd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddpd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddpd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddpd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddpd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmadd132pd $2, $1, $0 \0A\09 vfmadd213pd $2, $1, $0 \0A\09 vfmadd231pd $2, $1, $0 \0A\09 vfmadd132pd $3, $1, $0 \0A\09 vfmadd213pd $3, $1, $0 \0A\09 vfmadd231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddpd_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddpd_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddpd_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddpd_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddpd_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddpd_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmadd132pd $2, $1, $0 \0A\09 vfmadd213pd $2, $1, $0 \0A\09 vfmadd231pd $2, $1, $0 \0A\09 vfmadd132pd $3, $1, $0 \0A\09 vfmadd213pd $3, $1, $0 \0A\09 vfmadd231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddps_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddps_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmadd132ps $2, $1, $0 \0A\09 vfmadd213ps $2, $1, $0 \0A\09 vfmadd231ps $2, $1, $0 \0A\09 vfmadd132ps $3, $1, $0 \0A\09 vfmadd213ps $3, $1, $0 \0A\09 vfmadd231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddps_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddps_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmadd132ps $2, $1, $0 \0A\09 vfmadd213ps $2, $1, $0 \0A\09 vfmadd231ps $2, $1, $0 \0A\09 vfmadd132ps $3, $1, $0 \0A\09 vfmadd213ps $3, $1, $0 \0A\09 vfmadd231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddsd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddsd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddsd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmadd132sd $2, $1, $0 \0A\09 vfmadd213sd $2, $1, $0 \0A\09 vfmadd231sd $2, $1, $0 \0A\09 vfmadd132sd $3, $1, $0 \0A\09 vfmadd213sd $3, $1, $0 \0A\09 vfmadd231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddss_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddss_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddss_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddss_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddss_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddss_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmadd132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmadd231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmadd132ss $2, $1, $0 \0A\09 vfmadd213ss $2, $1, $0 \0A\09 vfmadd231ss $2, $1, $0 \0A\09 vfmadd132ss $3, $1, $0 \0A\09 vfmadd213ss $3, $1, $0 \0A\09 vfmadd231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFMADDSUB
+;
+
+define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubpd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddsubpd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubpd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddsubpd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddsubpd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubpd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmaddsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmaddsub132pd $2, $1, $0 \0A\09 vfmaddsub213pd $2, $1, $0 \0A\09 vfmaddsub231pd $2, $1, $0 \0A\09 vfmaddsub132pd $3, $1, $0 \0A\09 vfmaddsub213pd $3, $1, $0 \0A\09 vfmaddsub231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubpd_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddsubpd_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubpd_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddsubpd_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddsubpd_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubpd_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmaddsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmaddsub132pd $2, $1, $0 \0A\09 vfmaddsub213pd $2, $1, $0 \0A\09 vfmaddsub231pd $2, $1, $0 \0A\09 vfmaddsub132pd $3, $1, $0 \0A\09 vfmaddsub213pd $3, $1, $0 \0A\09 vfmaddsub231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddsubps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddsubps_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddsubps_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmaddsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmaddsub132ps $2, $1, $0 \0A\09 vfmaddsub213ps $2, $1, $0 \0A\09 vfmaddsub231ps $2, $1, $0 \0A\09 vfmaddsub132ps $3, $1, $0 \0A\09 vfmaddsub213ps $3, $1, $0 \0A\09 vfmaddsub231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmaddsubps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmaddsubps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmaddsubps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmaddsubps_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmaddsubps_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmaddsubps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmaddsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmaddsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmaddsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmaddsub132ps $2, $1, $0 \0A\09 vfmaddsub213ps $2, $1, $0 \0A\09 vfmaddsub231ps $2, $1, $0 \0A\09 vfmaddsub132ps $3, $1, $0 \0A\09 vfmaddsub213ps $3, $1, $0 \0A\09 vfmaddsub231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFMSUBADD
+;
+
+define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddpd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubaddpd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddpd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubaddpd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubaddpd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddpd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsubadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsubadd132pd $2, $1, $0 \0A\09 vfmsubadd213pd $2, $1, $0 \0A\09 vfmsubadd231pd $2, $1, $0 \0A\09 vfmsubadd132pd $3, $1, $0 \0A\09 vfmsubadd213pd $3, $1, $0 \0A\09 vfmsubadd231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddpd_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubaddpd_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddpd_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubaddpd_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubaddpd_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddpd_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsubadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsubadd132pd $2, $1, $0 \0A\09 vfmsubadd213pd $2, $1, $0 \0A\09 vfmsubadd231pd $2, $1, $0 \0A\09 vfmsubadd132pd $3, $1, $0 \0A\09 vfmsubadd213pd $3, $1, $0 \0A\09 vfmsubadd231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubaddps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubaddps_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubaddps_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsubadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsubadd132ps $2, $1, $0 \0A\09 vfmsubadd213ps $2, $1, $0 \0A\09 vfmsubadd231ps $2, $1, $0 \0A\09 vfmsubadd132ps $3, $1, $0 \0A\09 vfmsubadd213ps $3, $1, $0 \0A\09 vfmsubadd231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubaddps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubaddps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubaddps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubaddps_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubaddps_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubaddps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsubadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsubadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsubadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsubadd132ps $2, $1, $0 \0A\09 vfmsubadd213ps $2, $1, $0 \0A\09 vfmsubadd231ps $2, $1, $0 \0A\09 vfmsubadd132ps $3, $1, $0 \0A\09 vfmsubadd213ps $3, $1, $0 \0A\09 vfmsubadd231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFMSUB
+;
+
+define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubpd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubpd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubpd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubpd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubpd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubpd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsub132pd $2, $1, $0 \0A\09 vfmsub213pd $2, $1, $0 \0A\09 vfmsub231pd $2, $1, $0 \0A\09 vfmsub132pd $3, $1, $0 \0A\09 vfmsub213pd $3, $1, $0 \0A\09 vfmsub231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubpd_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubpd_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubpd_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubpd_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubpd_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubpd_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsub132pd $2, $1, $0 \0A\09 vfmsub213pd $2, $1, $0 \0A\09 vfmsub231pd $2, $1, $0 \0A\09 vfmsub132pd $3, $1, $0 \0A\09 vfmsub213pd $3, $1, $0 \0A\09 vfmsub231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubps_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubps_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsub132ps $2, $1, $0 \0A\09 vfmsub213ps $2, $1, $0 \0A\09 vfmsub231ps $2, $1, $0 \0A\09 vfmsub132ps $3, $1, $0 \0A\09 vfmsub213ps $3, $1, $0 \0A\09 vfmsub231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubps_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubps_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsub132ps $2, $1, $0 \0A\09 vfmsub213ps $2, $1, $0 \0A\09 vfmsub231ps $2, $1, $0 \0A\09 vfmsub132ps $3, $1, $0 \0A\09 vfmsub213ps $3, $1, $0 \0A\09 vfmsub231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubsd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubsd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubsd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubsd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubsd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubsd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsub132sd $2, $1, $0 \0A\09 vfmsub213sd $2, $1, $0 \0A\09 vfmsub231sd $2, $1, $0 \0A\09 vfmsub132sd $3, $1, $0 \0A\09 vfmsub213sd $3, $1, $0 \0A\09 vfmsub231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfmsubss_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfmsubss_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfmsubss_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfmsubss_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfmsubss_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfmsubss_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfmsub132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfmsub231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfmsub132ss $2, $1, $0 \0A\09 vfmsub213ss $2, $1, $0 \0A\09 vfmsub231ss $2, $1, $0 \0A\09 vfmsub132ss $3, $1, $0 \0A\09 vfmsub213ss $3, $1, $0 \0A\09 vfmsub231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFNMADD
+;
+
+define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmaddpd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmaddpd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmaddpd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmaddpd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmaddpd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmaddpd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmadd132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmadd132pd $2, $1, $0 \0A\09 vfnmadd213pd $2, $1, $0 \0A\09 vfnmadd231pd $2, $1, $0 \0A\09 vfnmadd132pd $3, $1, $0 \0A\09 vfnmadd213pd $3, $1, $0 \0A\09 vfnmadd231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmaddpd_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmaddpd_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmaddpd_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmaddpd_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmaddpd_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmaddpd_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmadd132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmadd132pd $2, $1, $0 \0A\09 vfnmadd213pd $2, $1, $0 \0A\09 vfnmadd231pd $2, $1, $0 \0A\09 vfnmadd132pd $3, $1, $0 \0A\09 vfnmadd213pd $3, $1, $0 \0A\09 vfnmadd231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmaddps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmaddps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmaddps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmaddps_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmaddps_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmaddps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmadd132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmadd132ps $2, $1, $0 \0A\09 vfnmadd213ps $2, $1, $0 \0A\09 vfnmadd231ps $2, $1, $0 \0A\09 vfnmadd132ps $3, $1, $0 \0A\09 vfnmadd213ps $3, $1, $0 \0A\09 vfnmadd231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmaddps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmaddps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmaddps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmaddps_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmaddps_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmaddps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmadd132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmadd132ps $2, $1, $0 \0A\09 vfnmadd213ps $2, $1, $0 \0A\09 vfnmadd231ps $2, $1, $0 \0A\09 vfnmadd132ps $3, $1, $0 \0A\09 vfnmadd213ps $3, $1, $0 \0A\09 vfnmadd231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmaddsd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmaddsd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmaddsd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmaddsd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmaddsd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmaddsd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmadd132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmadd132sd $2, $1, $0 \0A\09 vfnmadd213sd $2, $1, $0 \0A\09 vfnmadd231sd $2, $1, $0 \0A\09 vfnmadd132sd $3, $1, $0 \0A\09 vfnmadd213sd $3, $1, $0 \0A\09 vfnmadd231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmaddss_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmaddss_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmaddss_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmaddss_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmaddss_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmaddss_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmadd132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmadd132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmadd231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmadd132ss $2, $1, $0 \0A\09 vfnmadd213ss $2, $1, $0 \0A\09 vfnmadd231ss $2, $1, $0 \0A\09 vfnmadd132ss $3, $1, $0 \0A\09 vfnmadd213ss $3, $1, $0 \0A\09 vfnmadd231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFNMSUB
+;
+
+define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsubpd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmsubpd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsubpd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmsubpd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmsubpd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsubpd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmsub132pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub231pd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub132pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub231pd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmsub132pd $2, $1, $0 \0A\09 vfnmsub213pd $2, $1, $0 \0A\09 vfnmsub231pd $2, $1, $0 \0A\09 vfnmsub132pd $3, $1, $0 \0A\09 vfnmsub213pd $3, $1, $0 \0A\09 vfnmsub231pd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsubpd_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmsubpd_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsubpd_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmsubpd_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmsubpd_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsubpd_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmsub132pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub231pd %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub132pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub231pd (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmsub132pd $2, $1, $0 \0A\09 vfnmsub213pd $2, $1, $0 \0A\09 vfnmsub231pd $2, $1, $0 \0A\09 vfnmsub132pd $3, $1, $0 \0A\09 vfnmsub213pd $3, $1, $0 \0A\09 vfnmsub231pd $3, $1, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsubps_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmsubps_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsubps_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmsubps_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmsubps_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsubps_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmsub132ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub231ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub132ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub231ps (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmsub132ps $2, $1, $0 \0A\09 vfnmsub213ps $2, $1, $0 \0A\09 vfnmsub231ps $2, $1, $0 \0A\09 vfnmsub132ps $3, $1, $0 \0A\09 vfnmsub213ps $3, $1, $0 \0A\09 vfnmsub231ps $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsubps_256:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: vzeroupper # sched: [4:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmsubps_256:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: vzeroupper # sched: [4:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsubps_256:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: vzeroupper # sched: [4:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmsubps_256:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmsubps_256:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: vzeroupper # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsubps_256:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmsub132ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub231ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub132ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub231ps (%rdi), %ymm1, %ymm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: vzeroupper # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmsub132ps $2, $1, $0 \0A\09 vfnmsub213ps $2, $1, $0 \0A\09 vfnmsub231ps $2, $1, $0 \0A\09 vfnmsub132ps $3, $1, $0 \0A\09 vfnmsub213ps $3, $1, $0 \0A\09 vfnmsub231ps $3, $1, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsubsd_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmsubsd_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsubsd_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmsubsd_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmsubsd_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsubsd_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmsub132sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub231sd %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub132sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub231sd (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmsub132sd $2, $1, $0 \0A\09 vfnmsub213sd $2, $1, $0 \0A\09 vfnmsub231sd $2, $1, $0 \0A\09 vfnmsub132sd $3, $1, $0 \0A\09 vfnmsub213sd $3, $1, $0 \0A\09 vfnmsub231sd $3, $1, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_vfnmsubss_128:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_vfnmsubss_128:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_vfnmsubss_128:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_vfnmsubss_128:
+; KNL: # %bb.0:
+; KNL-NEXT: #APP
+; KNL-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: #NO_APP
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_vfnmsubss_128:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_vfnmsubss_128:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: vfnmsub132ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub231ss %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vfnmsub132ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: vfnmsub231ss (%rdi), %xmm1, %xmm0 # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "vfnmsub132ss $2, $1, $0 \0A\09 vfnmsub213ss $2, $1, $0 \0A\09 vfnmsub231ss $2, $1, $0 \0A\09 vfnmsub132ss $3, $1, $0 \0A\09 vfnmsub213ss $3, $1, $0 \0A\09 vfnmsub231ss $3, $1, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 2c942347d54c..611f707d64c9 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -10,7 +10,7 @@
define float @test_f32(float %a, float %b, float %c) #0 {
; FMA32-LABEL: test_f32:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: pushl %eax ## encoding: [0x50]
; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
; FMA32-NEXT: ## xmm0 = mem[0],zero,zero,zero
@@ -23,29 +23,29 @@ define float @test_f32(float %a, float %b, float %c) #0 {
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMACALL32-LABEL: test_f32:
-; FMACALL32: ## BB#0: ## %entry
+; FMACALL32: ## %bb.0: ## %entry
; FMACALL32-NEXT: jmp _fmaf ## TAILCALL
; FMACALL32-NEXT: ## encoding: [0xeb,A]
; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fmaf-1, kind: FK_PCRel_1
;
; FMA64-LABEL: test_f32:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; FMACALL64-LABEL: test_f32:
-; FMACALL64: ## BB#0: ## %entry
+; FMACALL64: ## %bb.0: ## %entry
; FMACALL64-NEXT: jmp _fmaf ## TAILCALL
; FMACALL64-NEXT: ## encoding: [0xeb,A]
; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fmaf-1, kind: FK_PCRel_1
;
; AVX512-LABEL: test_f32:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_f32:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -55,7 +55,7 @@ entry:
define double @test_f64(double %a, double %b, double %c) #0 {
; FMA32-LABEL: test_f64:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: subl $12, %esp ## encoding: [0x83,0xec,0x0c]
; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x10]
; FMA32-NEXT: ## xmm0 = mem[0],zero
@@ -68,29 +68,29 @@ define double @test_f64(double %a, double %b, double %c) #0 {
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMACALL32-LABEL: test_f64:
-; FMACALL32: ## BB#0: ## %entry
+; FMACALL32: ## %bb.0: ## %entry
; FMACALL32-NEXT: jmp _fma ## TAILCALL
; FMACALL32-NEXT: ## encoding: [0xeb,A]
; FMACALL32-NEXT: ## fixup A - offset: 1, value: _fma-1, kind: FK_PCRel_1
;
; FMA64-LABEL: test_f64:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; FMACALL64-LABEL: test_f64:
-; FMACALL64: ## BB#0: ## %entry
+; FMACALL64: ## %bb.0: ## %entry
; FMACALL64-NEXT: jmp _fma ## TAILCALL
; FMACALL64-NEXT: ## encoding: [0xeb,A]
; FMACALL64-NEXT: ## fixup A - offset: 1, value: _fma-1, kind: FK_PCRel_1
;
; AVX512-LABEL: test_f64:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_f64:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -100,7 +100,7 @@ entry:
define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
; FMA32-LABEL: test_f80:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: subl $60, %esp ## encoding: [0x83,0xec,0x3c]
; FMA32-NEXT: fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
; FMA32-NEXT: fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
@@ -114,7 +114,7 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMACALL32-LABEL: test_f80:
-; FMACALL32: ## BB#0: ## %entry
+; FMACALL32: ## %bb.0: ## %entry
; FMACALL32-NEXT: subl $60, %esp ## encoding: [0x83,0xec,0x3c]
; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
; FMACALL32-NEXT: fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
@@ -128,7 +128,7 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
; FMACALL32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_f80:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
; FMA64-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
; FMA64-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
@@ -142,7 +142,7 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; FMACALL64-LABEL: test_f80:
-; FMACALL64: ## BB#0: ## %entry
+; FMACALL64: ## %bb.0: ## %entry
; FMACALL64-NEXT: subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
; FMACALL64-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
; FMACALL64-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
@@ -156,7 +156,7 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
; FMACALL64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_f80:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
; AVX512-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
; AVX512-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
@@ -170,7 +170,7 @@ define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_f80:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
; AVX512VL-NEXT: fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
@@ -189,40 +189,40 @@ entry:
define float @test_f32_cst() #0 {
; FMA32-LABEL: test_f32_cst:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: flds LCPI3_0 ## encoding: [0xd9,0x05,A,A,A,A]
; FMA32-NEXT: ## fixup A - offset: 2, value: LCPI3_0, kind: FK_Data_4
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMACALL32-LABEL: test_f32_cst:
-; FMACALL32: ## BB#0: ## %entry
+; FMACALL32: ## %bb.0: ## %entry
; FMACALL32-NEXT: flds LCPI3_0 ## encoding: [0xd9,0x05,A,A,A,A]
; FMACALL32-NEXT: ## fixup A - offset: 2, value: LCPI3_0, kind: FK_Data_4
; FMACALL32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_f32_cst:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vmovss {{.*}}(%rip), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; FMA64-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
; FMA64-NEXT: ## xmm0 = mem[0],zero,zero,zero
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; FMACALL64-LABEL: test_f32_cst:
-; FMACALL64: ## BB#0: ## %entry
+; FMACALL64: ## %bb.0: ## %entry
; FMACALL64-NEXT: movss {{.*}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
; FMACALL64-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
; FMACALL64-NEXT: ## xmm0 = mem[0],zero,zero,zero
; FMACALL64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_f32_cst:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; AVX512-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
; AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_f32_cst:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
; AVX512VL-NEXT: ## xmm0 = mem[0],zero,zero,zero
@@ -234,22 +234,22 @@ entry:
define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
; FMA32-LABEL: test_v4f32:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_v4f32:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_v4f32:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_v4f32:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -259,22 +259,22 @@ entry:
define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #0 {
; FMA32-LABEL: test_v8f32:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_v8f32:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_v8f32:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_v8f32:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -284,7 +284,7 @@ entry:
define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c) #0 {
; FMA32-LABEL: test_v16f32:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: pushl %ebp ## encoding: [0x55]
; FMA32-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5]
; FMA32-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
@@ -296,18 +296,18 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float>
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_v16f32:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213ps %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0xc4]
; FMA64-NEXT: vfmadd213ps %ymm5, %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0x65,0xa8,0xcd]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_v16f32:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_v16f32:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -317,22 +317,22 @@ entry:
define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
; FMA32-LABEL: test_v2f64:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_v2f64:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_v2f64:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_v2f64:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -342,22 +342,22 @@ entry:
define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 {
; FMA32-LABEL: test_v4f64:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_v4f64:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_v4f64:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_v4f64:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
@@ -367,7 +367,7 @@ entry:
define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c) #0 {
; FMA32-LABEL: test_v8f64:
-; FMA32: ## BB#0: ## %entry
+; FMA32: ## %bb.0: ## %entry
; FMA32-NEXT: pushl %ebp ## encoding: [0x55]
; FMA32-NEXT: movl %esp, %ebp ## encoding: [0x89,0xe5]
; FMA32-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
@@ -379,18 +379,18 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMA32-NEXT: retl ## encoding: [0xc3]
;
; FMA64-LABEL: test_v8f64:
-; FMA64: ## BB#0: ## %entry
+; FMA64: ## %bb.0: ## %entry
; FMA64-NEXT: vfmadd213pd %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0xc4]
; FMA64-NEXT: vfmadd213pd %ymm5, %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0xe5,0xa8,0xcd]
; FMA64-NEXT: retq ## encoding: [0xc3]
;
; AVX512-LABEL: test_v8f64:
-; AVX512: ## BB#0: ## %entry
+; AVX512: ## %bb.0: ## %entry
; AVX512-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
; AVX512-NEXT: retq ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_v8f64:
-; AVX512VL: ## BB#0: ## %entry
+; AVX512VL: ## %bb.0: ## %entry
; AVX512VL-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
; AVX512VL-NEXT: retq ## encoding: [0xc3]
entry:
diff --git a/test/CodeGen/X86/fma4-commute-x86.ll b/test/CodeGen/X86/fma4-commute-x86.ll
new file mode 100644
index 000000000000..cfc6837e453c
--- /dev/null
+++ b/test/CodeGen/X86/fma4-commute-x86.ll
@@ -0,0 +1,563 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA
+
+attributes #0 = { nounwind }
+
+declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fmadd_baa_ss(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_baa_ss:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfmaddss %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmadd_aba_ss(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_aba_ss:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfmaddss %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmadd_bba_ss(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_bba_ss:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %xmm0
+; FMA4-NEXT: vfmaddss (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_baa_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfmaddps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_aba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfmaddps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_bba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %xmm0
+; FMA4-NEXT: vfmaddps (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_baa_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfmaddps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_aba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfmaddps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_bba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %ymm0
+; FMA4-NEXT: vfmaddps (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fmadd_baa_sd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_baa_sd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfmaddsd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmadd_aba_sd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_aba_sd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfmaddsd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmadd_bba_sd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_bba_sd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %xmm0
+; FMA4-NEXT: vfmaddsd (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_baa_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfmaddpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_aba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfmaddpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_bba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %xmm0
+; FMA4-NEXT: vfmaddpd (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_baa_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfmaddpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_aba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfmaddpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmadd_bba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %ymm0
+; FMA4-NEXT: vfmaddpd (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fnmadd_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_baa_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfnmaddps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmadd_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_aba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfnmaddps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmadd_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_bba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %xmm0
+; FMA4-NEXT: vfnmaddps (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fnmadd_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_baa_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfnmaddps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmadd_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_aba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfnmaddps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmadd_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_bba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %ymm0
+; FMA4-NEXT: vfnmaddps (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fnmadd_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_baa_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfnmaddpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmadd_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_aba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfnmaddpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmadd_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_bba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %xmm0
+; FMA4-NEXT: vfnmaddpd (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fnmadd_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_baa_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfnmaddpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmadd_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_aba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfnmaddpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmadd_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmadd_bba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %ymm0
+; FMA4-NEXT: vfnmaddpd (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_baa_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfmsubps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_aba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfmsubps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_bba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %xmm0
+; FMA4-NEXT: vfmsubps (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_baa_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfmsubps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_aba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfmsubps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_bba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %ymm0
+; FMA4-NEXT: vfmsubps (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_baa_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfmsubpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_aba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfmsubpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_bba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %xmm0
+; FMA4-NEXT: vfmsubpd (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_baa_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfmsubpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_aba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfmsubpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fmsub_bba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %ymm0
+; FMA4-NEXT: vfmsubpd (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+define <4 x float> @test_x86_fnmsub_baa_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_baa_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfnmsubps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %a, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmsub_aba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_aba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %xmm0
+; FMA4-NEXT: vfnmsubps %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fnmsub_bba_ps(<4 x float> %a, <4 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_bba_ps:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %xmm0
+; FMA4-NEXT: vfnmsubps (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %b, <4 x float> %b, <4 x float> %a) nounwind
+ ret <4 x float> %res
+}
+
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
+define <8 x float> @test_x86_fnmsub_baa_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_baa_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfnmsubps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %a, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmsub_aba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_aba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rcx), %ymm0
+; FMA4-NEXT: vfnmsubps %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_x86_fnmsub_bba_ps_y(<8 x float> %a, <8 x float> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_bba_ps_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovaps (%rdx), %ymm0
+; FMA4-NEXT: vfnmsubps (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %b, <8 x float> %b, <8 x float> %a) nounwind
+ ret <8 x float> %res
+}
+
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+define <2 x double> @test_x86_fnmsub_baa_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_baa_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfnmsubpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %a, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmsub_aba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_aba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %xmm0
+; FMA4-NEXT: vfnmsubpd %xmm0, (%rdx), %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fnmsub_bba_pd(<2 x double> %a, <2 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_bba_pd:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %xmm0
+; FMA4-NEXT: vfnmsubpd (%rcx), %xmm0, %xmm0, %xmm0
+; FMA4-NEXT: retq
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %b, <2 x double> %b, <2 x double> %a) nounwind
+ ret <2 x double> %res
+}
+
+declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
+define <4 x double> @test_x86_fnmsub_baa_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_baa_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfnmsubpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %a, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmsub_aba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_aba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rcx), %ymm0
+; FMA4-NEXT: vfnmsubpd %ymm0, (%rdx), %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
+define <4 x double> @test_x86_fnmsub_bba_pd_y(<4 x double> %a, <4 x double> %b) #0 {
+; FMA4-LABEL: test_x86_fnmsub_bba_pd_y:
+; FMA4: # %bb.0:
+; FMA4-NEXT: vmovapd (%rdx), %ymm0
+; FMA4-NEXT: vfnmsubpd (%rcx), %ymm0, %ymm0, %ymm0
+; FMA4-NEXT: retq
+ %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %b, <4 x double> %b, <4 x double> %a) nounwind
+ ret <4 x double> %res
+}
+
diff --git a/test/CodeGen/X86/fma4-fneg-combine.ll b/test/CodeGen/X86/fma4-fneg-combine.ll
new file mode 100644
index 000000000000..771162a2c993
--- /dev/null
+++ b/test/CodeGen/X86/fma4-fneg-combine.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4,-fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4,+fma | FileCheck %s
+
+declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+
+; TODO this can be negated
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %res
+ ret <4 x float> %sub.i
+}
+
+define <4 x float> @test2(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test3(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+ %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %sub.i, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test5(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+ %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+ %res = tail call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %sub.i, <4 x float> %b, <4 x float> %sub.i.2)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test6(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %res
+ ret <2 x double> %sub.i
+}
+
+define <2 x double> @test7(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: test7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+ %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %sub.i)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test8(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: test8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+ %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a, <2 x double> %sub.i, <2 x double> %c)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test9(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: test9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+ %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; CHECK-LABEL: test10:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+ %sub.i.2 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %c
+ %res = tail call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %sub.i.2)
+ ret <2 x double> %res
+}
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86.ll b/test/CodeGen/X86/fma4-intrinsics-x86.ll
new file mode 100644
index 000000000000..ee6a7ec1b554
--- /dev/null
+++ b/test/CodeGen/X86/fma4-intrinsics-x86.ll
@@ -0,0 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,-fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma4,+fma -show-mc-encoding | FileCheck %s --check-prefix=CHECK
+
+; VFMADD
+define <4 x float> @test_x86_fma4_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma4_vfmadd_ss:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_fma4_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma4_vfmadd_bac_ss:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6a,0xc2,0x00]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma4_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma4_vfmadd_sd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_x86_fma4_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma4_vfmadd_bac_sd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0xf1,0x6b,0xc2,0x00]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmadd_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmadd_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmadd_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmadd_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMSUB
+define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsub_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsub_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsub_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsub_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFNMADD
+define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfnmadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfnmadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmadd_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFNMSUB
+define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfnmsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfnmsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfnmsub_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMADDSUB
+define <4 x float> @test_x86_fma_vfmaddsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmaddsub_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmaddsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmaddsub_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmaddsub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmaddsub_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmaddsub_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmaddsub_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+; VFMSUBADD
+define <4 x float> @test_x86_fma_vfmsubadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsubadd_ps:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x double> @test_x86_fma_vfmsubadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsubadd_pd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>)
+
+define <8 x float> @test_x86_fma_vfmsubadd_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsubadd_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
+define <4 x double> @test_x86_fma_vfmsubadd_pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
+; CHECK-LABEL: test_x86_fma_vfmsubadd_pd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
+ ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>)
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
index 85de1ef5c9dc..236f3ff19dac 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64-folded-load.ll
@@ -1,47 +1,66 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
-define < 4 x float > @test_x86_fma_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
- ; CHECK: vfmaddss (%{{.*}})
+define < 4 x float > @test_x86_fma4_vfmadd_ss_load(< 4 x float > %a0, < 4 x float > %a1, float* %a2) {
+; CHECK-LABEL: test_x86_fma4_vfmadd_ss_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load float , float *%a2
%y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y)
+ %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %y)
ret < 4 x float > %res
}
-define < 4 x float > @test_x86_fma_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddss %{{.*}}, (%{{.*}})
+define < 4 x float > @test_x86_fma4_vfmadd_ss_load2(< 4 x float > %a0, float* %a1, < 4 x float > %a2) {
+; CHECK-LABEL: test_x86_fma4_vfmadd_ss_load2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load float , float *%a1
%y = insertelement <4 x float> undef, float %x, i32 0
- %res = call < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2)
+ %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %y, < 4 x float > %a2)
ret < 4 x float > %res
}
-declare < 4 x float > @llvm.x86.fma.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+declare < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
-define < 2 x double > @test_x86_fma_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
- ; CHECK: vfmaddsd (%{{.*}})
+define < 2 x double > @test_x86_fma4_vfmadd_sd_load(< 2 x double > %a0, < 2 x double > %a1, double* %a2) {
+; CHECK-LABEL: test_x86_fma4_vfmadd_sd_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load double , double *%a2
%y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y)
+ %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %y)
ret < 2 x double > %res
}
-define < 2 x double > @test_x86_fma_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddsd %{{.*}}, (%{{.*}})
+define < 2 x double > @test_x86_fma4_vfmadd_sd_load2(< 2 x double > %a0, double* %a1, < 2 x double > %a2) {
+; CHECK-LABEL: test_x86_fma4_vfmadd_sd_load2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load double , double *%a1
%y = insertelement <2 x double> undef, double %x, i32 0
- %res = call < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2)
+ %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %y, < 2 x double > %a2)
ret < 2 x double > %res
}
-declare < 2 x double > @llvm.x86.fma.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
define < 4 x float > @test_x86_fma_vfmadd_ps_load(< 4 x float > %a0, < 4 x float > %a1, < 4 x float >* %a2) {
- ; CHECK: vfmaddps (%{{.*}})
+; CHECK-LABEL: test_x86_fma_vfmadd_ps_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load <4 x float>, <4 x float>* %a2
%res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %x)
ret < 4 x float > %res
}
define < 4 x float > @test_x86_fma_vfmadd_ps_load2(< 4 x float > %a0, < 4 x float >* %a1, < 4 x float > %a2) {
- ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+; CHECK-LABEL: test_x86_fma_vfmadd_ps_load2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load <4 x float>, <4 x float>* %a1
%res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %a0, < 4 x float > %x, < 4 x float > %a2)
ret < 4 x float > %res
@@ -50,8 +69,11 @@ declare < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float >, < 4 x float >, < 4
; To test execution dependency
define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x float >* %a1, < 4 x float > %a2) {
- ; CHECK: vmovaps
- ; CHECK: vfmaddps %{{.*}}, (%{{.*}})
+; CHECK-LABEL: test_x86_fma_vfmadd_ps_load3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps (%rdi), %xmm1
+; CHECK-NEXT: vfmaddps %xmm0, (%rsi), %xmm1, %xmm0
+; CHECK-NEXT: retq
%x = load <4 x float>, <4 x float>* %a0
%y = load <4 x float>, <4 x float>* %a1
%res = call < 4 x float > @llvm.x86.fma.vfmadd.ps(< 4 x float > %x, < 4 x float > %y, < 4 x float > %a2)
@@ -59,13 +81,19 @@ define < 4 x float > @test_x86_fma_vfmadd_ps_load3(< 4 x float >* %a0, < 4 x flo
}
define < 2 x double > @test_x86_fma_vfmadd_pd_load(< 2 x double > %a0, < 2 x double > %a1, < 2 x double >* %a2) {
- ; CHECK: vfmaddpd (%{{.*}})
+; CHECK-LABEL: test_x86_fma_vfmadd_pd_load:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %a2
%res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %x)
ret < 2 x double > %res
}
define < 2 x double > @test_x86_fma_vfmadd_pd_load2(< 2 x double > %a0, < 2 x double >* %a1, < 2 x double > %a2) {
- ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+; CHECK-LABEL: test_x86_fma_vfmadd_pd_load2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %a1
%res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %a0, < 2 x double > %x, < 2 x double > %a2)
ret < 2 x double > %res
@@ -74,8 +102,11 @@ declare < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double >, < 2 x double >, <
; To test execution dependency
define < 2 x double > @test_x86_fma_vfmadd_pd_load3(< 2 x double >* %a0, < 2 x double >* %a1, < 2 x double > %a2) {
- ; CHECK: vmovapd
- ; CHECK: vfmaddpd %{{.*}}, (%{{.*}})
+; CHECK-LABEL: test_x86_fma_vfmadd_pd_load3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rdi), %xmm1
+; CHECK-NEXT: vfmaddpd %xmm0, (%rsi), %xmm1, %xmm0
+; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %a0
%y = load <2 x double>, <2 x double>* %a1
%res = call < 2 x double > @llvm.x86.fma.vfmadd.pd(< 2 x double > %x, < 2 x double > %y, < 2 x double > %a2)
diff --git a/test/CodeGen/X86/fma4-scalar-memfold.ll b/test/CodeGen/X86/fma4-scalar-memfold.ll
new file mode 100644
index 000000000000..204f6f99b167
--- /dev/null
+++ b/test/CodeGen/X86/fma4-scalar-memfold.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mattr=fma4 | FileCheck %s
+
+target triple = "x86_64-unknown-unknown"
+
+declare <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>)
+
+define void @fmadd_aab_ss(float* %a, float* %b) {
+; CHECK-LABEL: fmadd_aab_ss:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vfmaddss (%rsi), %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovss %xmm0, (%rdi)
+; CHECK-NEXT: retq
+ %a.val = load float, float* %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, float* %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+ %vr = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %av, <4 x float> %av, <4 x float> %bv)
+
+ %sr = extractelement <4 x float> %vr, i32 0
+ store float %sr, float* %a
+ ret void
+}
+
+define void @fmadd_aba_ss(float* %a, float* %b) {
+; CHECK-LABEL: fmadd_aba_ss:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vfmaddss %xmm0, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vmovss %xmm0, (%rdi)
+; CHECK-NEXT: retq
+ %a.val = load float, float* %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, float* %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+ %vr = call <4 x float> @llvm.x86.fma4.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av)
+
+ %sr = extractelement <4 x float> %vr, i32 0
+ store float %sr, float* %a
+ ret void
+}
+
+define void @fmadd_aab_sd(double* %a, double* %b) {
+; CHECK-LABEL: fmadd_aab_sd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vfmaddsd (%rsi), %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
+; CHECK-NEXT: retq
+ %a.val = load double, double* %a
+ %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+ %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+ %b.val = load double, double* %b
+ %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+ %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+
+ %vr = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %av, <2 x double> %av, <2 x double> %bv)
+
+ %sr = extractelement <2 x double> %vr, i32 0
+ store double %sr, double* %a
+ ret void
+}
+
+define void @fmadd_aba_sd(double* %a, double* %b) {
+; CHECK-LABEL: fmadd_aba_sd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vfmaddsd %xmm0, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vmovlpd %xmm0, (%rdi)
+; CHECK-NEXT: retq
+ %a.val = load double, double* %a
+ %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+ %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+ %b.val = load double, double* %b
+ %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+ %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+
+ %vr = call <2 x double> @llvm.x86.fma4.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av)
+
+ %sr = extractelement <2 x double> %vr, i32 0
+ store double %sr, double* %a
+ ret void
+}
+
diff --git a/test/CodeGen/X86/fma4-schedule.ll b/test/CodeGen/X86/fma4-schedule.ll
new file mode 100644
index 000000000000..468b7088d608
--- /dev/null
+++ b/test/CodeGen/X86/fma4-schedule.ll
@@ -0,0 +1,758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER --check-prefix=BDVER1
+
+;
+; VFMADD
+;
+
+define void @test_vfmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddpd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddpd $2, $1, $0, $0 \0A\09 vfmaddpd $3, $1, $0, $0 \0A\09 vfmaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddpd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddpd $2, $1, $0, $0 \0A\09 vfmaddpd $3, $1, $0, $0 \0A\09 vfmaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddps $2, $1, $0, $0 \0A\09 vfmaddps $3, $1, $0, $0 \0A\09 vfmaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddps $2, $1, $0, $0 \0A\09 vfmaddps $3, $1, $0, $0 \0A\09 vfmaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddsd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddsd $2, $1, $0, $0 \0A\09 vfmaddsd $3, $1, $0, $0 \0A\09 vfmaddsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddss_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddss $2, $1, $0, $0 \0A\09 vfmaddss $3, $1, $0, $0 \0A\09 vfmaddss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFMADDSUB
+;
+
+define void @test_vfmaddsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddsubpd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddsubpd $2, $1, $0, $0 \0A\09 vfmaddsubpd $3, $1, $0, $0 \0A\09 vfmaddsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddsubpd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddsubpd $2, $1, $0, $0 \0A\09 vfmaddsubpd $3, $1, $0, $0 \0A\09 vfmaddsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddsubps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmaddsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddsubps $2, $1, $0, $0 \0A\09 vfmaddsubps $3, $1, $0, $0 \0A\09 vfmaddsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmaddsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmaddsubps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmaddsubps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmaddsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmaddsubps $2, $1, $0, $0 \0A\09 vfmaddsubps $3, $1, $0, $0 \0A\09 vfmaddsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFMSUBADD
+;
+
+define void @test_vfmsubaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubaddpd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubaddpd $2, $1, $0, $0 \0A\09 vfmsubaddpd $3, $1, $0, $0 \0A\09 vfmsubaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubaddpd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubaddpd $2, $1, $0, $0 \0A\09 vfmsubaddpd $3, $1, $0, $0 \0A\09 vfmsubaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubaddps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubaddps $2, $1, $0, $0 \0A\09 vfmsubaddps $3, $1, $0, $0 \0A\09 vfmsubaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubaddps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubaddps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubaddps $2, $1, $0, $0 \0A\09 vfmsubaddps $3, $1, $0, $0 \0A\09 vfmsubaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFMSUB
+;
+
+define void @test_vfmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubpd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubpd $2, $1, $0, $0 \0A\09 vfmsubpd $3, $1, $0, $0 \0A\09 vfmsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubpd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubpd $2, $1, $0, $0 \0A\09 vfmsubpd $3, $1, $0, $0 \0A\09 vfmsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubps $2, $1, $0, $0 \0A\09 vfmsubps $3, $1, $0, $0 \0A\09 vfmsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfmsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubps $2, $1, $0, $0 \0A\09 vfmsubps $3, $1, $0, $0 \0A\09 vfmsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubsd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubsd $2, $1, $0, $0 \0A\09 vfmsubsd $3, $1, $0, $0 \0A\09 vfmsubsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfmsubss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfmsubss_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfmsubss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfmsubss $2, $1, $0, $0 \0A\09 vfmsubss $3, $1, $0, $0 \0A\09 vfmsubss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFNMADD
+;
+
+define void @test_vfnmaddpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmaddpd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmaddpd $2, $1, $0, $0 \0A\09 vfnmaddpd $3, $1, $0, $0 \0A\09 vfnmaddpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmaddpd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmaddpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmaddpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfnmaddpd $2, $1, $0, $0 \0A\09 vfnmaddpd $3, $1, $0, $0 \0A\09 vfnmaddpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmaddps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmaddps $2, $1, $0, $0 \0A\09 vfnmaddps $3, $1, $0, $0 \0A\09 vfnmaddps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmaddps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmaddps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmaddps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfnmaddps $2, $1, $0, $0 \0A\09 vfnmaddps $3, $1, $0, $0 \0A\09 vfnmaddps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmaddsd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmaddsd $2, $1, $0, $0 \0A\09 vfnmaddsd $3, $1, $0, $0 \0A\09 vfnmaddsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmaddss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmaddss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmaddss_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmaddss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmaddss $2, $1, $0, $0 \0A\09 vfnmaddss $3, $1, $0, $0 \0A\09 vfnmaddss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+;
+; VFNMSUB
+;
+
+define void @test_vfnmsubpd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubpd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmsubpd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubpd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubpd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmsubpd $2, $1, $0, $0 \0A\09 vfnmsubpd $3, $1, $0, $0 \0A\09 vfnmsubpd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubpd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubpd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmsubpd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmsubpd (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmsubpd %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfnmsubpd $2, $1, $0, $0 \0A\09 vfnmsubpd $3, $1, $0, $0 \0A\09 vfnmsubpd $1, $3, $0, $0", "x,x,x,*m"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmsubps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubps (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmsubps $2, $1, $0, $0 \0A\09 vfnmsubps $3, $1, $0, $0 \0A\09 vfnmsubps $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmsubps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmsubps (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vfnmsubps %ymm1, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ tail call void asm "vfnmsubps $2, $1, $0, $0 \0A\09 vfnmsubps $3, $1, $0, $0 \0A\09 vfnmsubps $1, $3, $0, $0", "x,x,x,*m"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubsd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubsd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmsubsd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubsd (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubsd %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmsubsd $2, $1, $0, $0 \0A\09 vfnmsubsd $3, $1, $0, $0 \0A\09 vfnmsubsd $1, $3, $0, $0", "x,x,x,*m"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) nounwind
+ ret void
+}
+
+define void @test_vfnmsubss_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) optsize {
+; GENERIC-LABEL: test_vfnmsubss_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfnmsubss_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubss (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vfnmsubss %xmm1, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ tail call void asm "vfnmsubss $2, $1, $0, $0 \0A\09 vfnmsubss $3, $1, $0, $0 \0A\09 vfnmsubss $1, $3, $0, $0", "x,x,x,*m"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 002b0746d3c3..2b4a686b0dea 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -14,17 +14,17 @@
define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
; FMA-LABEL: test_f32_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f32_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f32_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
@@ -34,17 +34,17 @@ define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
define <4 x float> @test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_4f32_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f32_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f32_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <4 x float> %a0, %a1
@@ -54,17 +54,17 @@ define <4 x float> @test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float
define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; FMA-LABEL: test_8f32_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f32_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f32_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <8 x float> %a0, %a1
@@ -74,17 +74,17 @@ define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float
define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
; FMA-LABEL: test_f64_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f64_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f64_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
@@ -94,17 +94,17 @@ define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; FMA-LABEL: test_2f64_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_2f64_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_2f64_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <2 x double> %a0, %a1
@@ -114,17 +114,17 @@ define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x do
define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; FMA-LABEL: test_4f64_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f64_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f64_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <4 x double> %a0, %a1
@@ -138,17 +138,17 @@ define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x do
define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
; FMA-LABEL: test_f32_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f32_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f32_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
@@ -158,17 +158,17 @@ define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_4f32_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f32_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f32_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <4 x float> %a0, %a1
@@ -178,17 +178,17 @@ define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float
define <8 x float> @test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; FMA-LABEL: test_8f32_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f32_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f32_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ps %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <8 x float> %a0, %a1
@@ -198,17 +198,17 @@ define <8 x float> @test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float
define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
; FMA-LABEL: test_f64_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f64_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f64_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
@@ -218,17 +218,17 @@ define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; FMA-LABEL: test_2f64_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_2f64_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_2f64_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213pd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <2 x double> %a0, %a1
@@ -238,17 +238,17 @@ define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x do
define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; FMA-LABEL: test_4f64_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f64_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f64_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <4 x double> %a0, %a1
@@ -262,17 +262,17 @@ define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x do
define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
; FMA-LABEL: test_f32_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f32_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f32_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
@@ -282,17 +282,17 @@ define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_4f32_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f32_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f32_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <4 x float> %a0, %a1
@@ -302,17 +302,17 @@ define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x floa
define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; FMA-LABEL: test_8f32_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f32_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f32_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <8 x float> %a0, %a1
@@ -322,17 +322,17 @@ define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x floa
define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
; FMA-LABEL: test_f64_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f64_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f64_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
@@ -342,17 +342,17 @@ define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; FMA-LABEL: test_2f64_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_2f64_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_2f64_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213pd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <2 x double> %a0, %a1
@@ -362,17 +362,17 @@ define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x d
define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; FMA-LABEL: test_4f64_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f64_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f64_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <4 x double> %a0, %a1
@@ -386,17 +386,17 @@ define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x d
define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
; FMA-LABEL: test_f32_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f32_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f32_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul float %a0, %a1
@@ -407,17 +407,17 @@ define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_4f32_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f32_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f32_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <4 x float> %a0, %a1
@@ -428,17 +428,17 @@ define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x floa
define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; FMA-LABEL: test_8f32_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f32_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f32_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ps %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <8 x float> %a0, %a1
@@ -449,17 +449,17 @@ define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x floa
define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
; FMA-LABEL: test_f64_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f64_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f64_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul double %a0, %a1
@@ -470,17 +470,17 @@ define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; FMA-LABEL: test_2f64_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_2f64_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_2f64_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213pd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%x = fmul <2 x double> %a0, %a1
@@ -491,17 +491,17 @@ define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x d
define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; FMA-LABEL: test_4f64_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f64_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f64_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%x = fmul <4 x double> %a0, %a1
@@ -516,17 +516,17 @@ define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x d
define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
; FMA-LABEL: test_4f32_fmadd_load:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_4f32_fmadd_load:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_4f32_fmadd_load:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd132ps (%rdi), %xmm1, %xmm0
; AVX512-NEXT: retq
%x = load <4 x float>, <4 x float>* %a0
@@ -537,17 +537,17 @@ define <4 x float> @test_4f32_fmadd_load(<4 x float>* %a0, <4 x float> %a1, <4 x
define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, <2 x double> %a2) {
; FMA-LABEL: test_2f64_fmsub_load:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_2f64_fmsub_load:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubpd %xmm1, (%rdi), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_2f64_fmsub_load:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub132pd (%rdi), %xmm1, %xmm0
; AVX512-NEXT: retq
%x = load <2 x double>, <2 x double>* %a0
@@ -562,35 +562,35 @@ define <2 x double> @test_2f64_fmsub_load(<2 x double>* %a0, <2 x double> %a1, <
define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_add_x_one_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_add_x_one_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_add_x_one_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -600,35 +600,35 @@ define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -638,35 +638,35 @@ define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
@@ -676,35 +676,35 @@ define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
@@ -714,38 +714,38 @@ define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -755,38 +755,38 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -796,38 +796,38 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
@@ -837,38 +837,38 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1,-1,-1,-1]
; AVX512-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmsubps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmsub213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
@@ -878,35 +878,35 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -916,35 +916,35 @@ define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -954,35 +954,35 @@ define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
@@ -992,35 +992,35 @@ define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y
define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vsubps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm1, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm1, %xmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
@@ -1034,7 +1034,7 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y
define float @test_f32_interp(float %x, float %y, float %t) {
; FMA-INFS-LABEL: test_f32_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; FMA-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1
@@ -1042,7 +1042,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_f32_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; FMA4-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1
@@ -1050,7 +1050,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_f32_interp:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; AVX512-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3
; AVX512-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1
@@ -1058,19 +1058,19 @@ define float @test_f32_interp(float %x, float %y, float %t) {
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_f32_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
; FMA-NOINFS-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_f32_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddss %xmm1, %xmm1, %xmm2, %xmm1
; FMA4-NOINFS-NEXT: vfmaddss %xmm1, %xmm2, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_f32_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ss %xmm1, %xmm2, %xmm1
; AVX512-NOINFS-NEXT: vfmadd213ss %xmm1, %xmm2, %xmm0
; AVX512-NOINFS-NEXT: retq
@@ -1083,7 +1083,7 @@ define float @test_f32_interp(float %x, float %y, float %t) {
define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
; FMA-INFS-LABEL: test_v4f32_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
@@ -1091,7 +1091,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f32_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
@@ -1099,27 +1099,27 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f32_interp:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %xmm3
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
; AVX512-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3
; AVX512-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-INFS-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f32_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ps %xmm1, %xmm2, %xmm1
; FMA-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f32_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddps %xmm1, %xmm1, %xmm2, %xmm1
; FMA4-NOINFS-NEXT: vfmaddps %xmm1, %xmm2, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f32_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ps %xmm1, %xmm2, %xmm1
; AVX512-NOINFS-NEXT: vfmadd213ps %xmm1, %xmm2, %xmm0
; AVX512-NOINFS-NEXT: retq
@@ -1132,7 +1132,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float
define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
; FMA-INFS-LABEL: test_v8f32_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
@@ -1140,7 +1140,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f32_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
@@ -1148,27 +1148,27 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f32_interp:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %ymm3
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
; AVX512-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3
; AVX512-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1
; AVX512-INFS-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f32_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ps %ymm1, %ymm2, %ymm1
; FMA-NOINFS-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f32_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddps %ymm1, %ymm1, %ymm2, %ymm1
; FMA4-NOINFS-NEXT: vfmaddps %ymm1, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f32_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ps %ymm1, %ymm2, %ymm1
; AVX512-NOINFS-NEXT: vfmadd213ps %ymm1, %ymm2, %ymm0
; AVX512-NOINFS-NEXT: retq
@@ -1181,7 +1181,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float
define double @test_f64_interp(double %x, double %y, double %t) {
; FMA-INFS-LABEL: test_f64_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; FMA-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1
@@ -1189,7 +1189,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_f64_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; FMA4-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1
@@ -1197,7 +1197,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_f64_interp:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3
; AVX512-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1
@@ -1205,19 +1205,19 @@ define double @test_f64_interp(double %x, double %y, double %t) {
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_f64_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
; FMA-NOINFS-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_f64_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddsd %xmm1, %xmm1, %xmm2, %xmm1
; FMA4-NOINFS-NEXT: vfmaddsd %xmm1, %xmm2, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_f64_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213sd %xmm1, %xmm2, %xmm1
; AVX512-NOINFS-NEXT: vfmadd213sd %xmm1, %xmm2, %xmm0
; AVX512-NOINFS-NEXT: retq
@@ -1230,7 +1230,7 @@ define double @test_f64_interp(double %x, double %y, double %t) {
define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
; FMA-INFS-LABEL: test_v2f64_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; FMA-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
@@ -1238,7 +1238,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v2f64_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; FMA4-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
@@ -1246,7 +1246,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v2f64_interp:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00]
; AVX512-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3
; AVX512-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1
@@ -1254,19 +1254,19 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v2f64_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213pd %xmm1, %xmm2, %xmm1
; FMA-NOINFS-NEXT: vfmadd213pd %xmm1, %xmm2, %xmm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v2f64_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddpd %xmm1, %xmm1, %xmm2, %xmm1
; FMA4-NOINFS-NEXT: vfmaddpd %xmm1, %xmm2, %xmm0, %xmm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v2f64_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213pd %xmm1, %xmm2, %xmm1
; AVX512-NOINFS-NEXT: vfmadd213pd %xmm1, %xmm2, %xmm0
; AVX512-NOINFS-NEXT: retq
@@ -1279,7 +1279,7 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x do
define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
; FMA-INFS-LABEL: test_v4f64_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
@@ -1287,7 +1287,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v4f64_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
@@ -1295,27 +1295,27 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v4f64_interp:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastsd {{.*}}(%rip), %ymm3
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1,1,1,1]
; AVX512-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3
; AVX512-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; AVX512-INFS-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v4f64_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213pd %ymm1, %ymm2, %ymm1
; FMA-NOINFS-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v4f64_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddpd %ymm1, %ymm1, %ymm2, %ymm1
; FMA4-NOINFS-NEXT: vfmaddpd %ymm1, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v4f64_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213pd %ymm1, %ymm2, %ymm1
; AVX512-NOINFS-NEXT: vfmadd213pd %ymm1, %ymm2, %ymm0
; AVX512-NOINFS-NEXT: retq
@@ -1332,17 +1332,17 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x do
define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; FMA-LABEL: test_v4f32_fneg_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f32_fneg_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f32_fneg_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%mul = fmul <4 x float> %a0, %a1
@@ -1353,17 +1353,17 @@ define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x
define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; FMA-LABEL: test_v4f64_fneg_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f64_fneg_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f64_fneg_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%mul = fmul <4 x double> %a0, %a1
@@ -1374,17 +1374,17 @@ define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <
define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
; FMA-LABEL: test_v4f32_fneg_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f32_fneg_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f32_fneg_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
%mul = fmul <4 x float> %a0, %a1
@@ -1396,17 +1396,17 @@ define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4
define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
; FMA-LABEL: test_v4f64_fneg_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f64_fneg_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f64_fneg_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%mul = fmul <4 x double> %a0, %a1
@@ -1422,17 +1422,17 @@ define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1,
define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
; FMA-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmulps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT: retq
%m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
@@ -1447,17 +1447,17 @@ define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) #0 {
; FMA-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd132ps {{.*}}(%rip), %xmm1, %xmm0
; AVX512-NEXT: retq
%m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
@@ -1470,19 +1470,19 @@ define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y
define double @test_f64_fneg_fmul(double %x, double %y) #0 {
; FMA-LABEL: test_f64_fneg_fmul:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; FMA-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_f64_fneg_fmul:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_f64_fneg_fmul:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
@@ -1493,19 +1493,19 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 {
define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
; FMA-LABEL: test_v4f32_fneg_fmul:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vxorps %xmm2, %xmm2, %xmm2
; FMA-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f32_fneg_fmul:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vxorps %xmm2, %xmm2, %xmm2
; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f32_fneg_fmul:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213ps %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
@@ -1516,20 +1516,20 @@ define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
; FMA-LABEL: test_v4f64_fneg_fmul:
-; FMA: # BB#0:
-; FMA-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; FMA: # %bb.0:
+; FMA-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; FMA-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f64_fneg_fmul:
-; FMA4: # BB#0:
-; FMA4-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; FMA4: # %bb.0:
+; FMA4-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; FMA4-NEXT: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f64_fneg_fmul:
-; AVX512: # BB#0:
-; AVX512-NEXT: vxorpd %ymm2, %ymm2, %ymm2
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213pd %ymm2, %ymm1, %ymm0
; AVX512-NEXT: retq
%m = fmul nsz <4 x double> %x, %y
@@ -1539,19 +1539,19 @@ define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) #0 {
; FMA-LABEL: test_v4f64_fneg_fmul_no_nsz:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; FMA-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v4f64_fneg_fmul_no_nsz:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; FMA4-NEXT: vxorpd {{.*}}(%rip), %ymm0, %ymm0
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v4f64_fneg_fmul_no_nsz:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vxorpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512-NEXT: retq
diff --git a/test/CodeGen/X86/fma_patterns_wide.ll b/test/CodeGen/X86/fma_patterns_wide.ll
index ab1bf4dbe4f9..9b2d7ff2bb9d 100644
--- a/test/CodeGen/X86/fma_patterns_wide.ll
+++ b/test/CodeGen/X86/fma_patterns_wide.ll
@@ -14,19 +14,19 @@
define <16 x float> @test_16f32_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfmadd213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_16f32_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfmaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_16f32_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -36,19 +36,19 @@ define <16 x float> @test_16f32_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x
define <8 x double> @test_8f64_fmadd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfmadd213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f64_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfmaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f64_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <8 x double> %a0, %a1
@@ -62,19 +62,19 @@ define <8 x double> @test_8f64_fmadd(<8 x double> %a0, <8 x double> %a1, <8 x do
define <16 x float> @test_16f32_fmsub(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfmsub213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_16f32_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfmsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_16f32_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -84,19 +84,19 @@ define <16 x float> @test_16f32_fmsub(<16 x float> %a0, <16 x float> %a1, <16 x
define <8 x double> @test_8f64_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfmsub213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f64_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfmsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f64_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <8 x double> %a0, %a1
@@ -110,19 +110,19 @@ define <8 x double> @test_8f64_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x do
define <16 x float> @test_16f32_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmadd213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_16f32_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmaddps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_16f32_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -132,19 +132,19 @@ define <16 x float> @test_16f32_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x
define <8 x double> @test_8f64_fnmadd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmadd213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f64_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f64_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <8 x double> %a0, %a1
@@ -158,19 +158,19 @@ define <8 x double> @test_8f64_fnmadd(<8 x double> %a0, <8 x double> %a1, <8 x d
define <16 x float> @test_16f32_fnmsub(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmsub213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_16f32_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_16f32_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <16 x float> %a0, %a1
@@ -181,19 +181,19 @@ define <16 x float> @test_16f32_fnmsub(<16 x float> %a0, <16 x float> %a1, <16 x
define <8 x double> @test_8f64_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmsub213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f64_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmsubpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f64_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%x = fmul <8 x double> %a0, %a1
@@ -208,19 +208,19 @@ define <8 x double> @test_8f64_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x d
define <16 x float> @test_16f32_fmadd_load(<16 x float>* %a0, <16 x float> %a1, <16 x float> %a2) {
; FMA-LABEL: test_16f32_fmadd_load:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps (%rdi), %ymm2, %ymm0
; FMA-NEXT: vfmadd132ps 32(%rdi), %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_16f32_fmadd_load:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %ymm2, (%rdi), %ymm0, %ymm0
; FMA4-NEXT: vfmaddps %ymm3, 32(%rdi), %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_16f32_fmadd_load:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0
; AVX512-NEXT: retq
%x = load <16 x float>, <16 x float>* %a0
@@ -231,19 +231,19 @@ define <16 x float> @test_16f32_fmadd_load(<16 x float>* %a0, <16 x float> %a1,
define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, <8 x double> %a2) {
; FMA-LABEL: test_8f64_fmsub_load:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub132pd (%rdi), %ymm2, %ymm0
; FMA-NEXT: vfmsub132pd 32(%rdi), %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_8f64_fmsub_load:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubpd %ymm2, (%rdi), %ymm0, %ymm0
; FMA4-NEXT: vfmsubpd %ymm3, 32(%rdi), %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_8f64_fmsub_load:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub132pd (%rdi), %zmm1, %zmm0
; AVX512-NEXT: retq
%x = load <8 x double>, <8 x double>* %a0
@@ -258,7 +258,7 @@ define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, <
define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
@@ -267,7 +267,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
@@ -276,25 +276,25 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_mul_add_x_one_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213ps %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmadd213ps %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddps %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmaddps %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213ps %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
@@ -304,7 +304,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %
define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
@@ -313,7 +313,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
@@ -322,25 +322,25 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_one:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213pd %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmadd213pd %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddpd %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmaddpd %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213pd %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <8 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>
@@ -350,7 +350,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y
define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
@@ -359,7 +359,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0
@@ -368,25 +368,25 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213ps %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmsub213ps %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubps %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmsubps %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213ps %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <16 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>
@@ -396,7 +396,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float
define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
@@ -405,7 +405,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0
@@ -414,25 +414,25 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vaddpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213pd %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmsub213pd %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubpd %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmsubpd %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213pd %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%a = fadd <8 x double> %x, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
@@ -442,7 +442,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double>
define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
@@ -451,7 +451,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
@@ -460,26 +460,26 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %zmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512-INFS-NEXT: vsubps %zmm0, %zmm2, %zmm0
; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ps %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfnmadd213ps %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddps %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfnmaddps %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ps %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
@@ -489,7 +489,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %
define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
@@ -498,7 +498,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
@@ -507,26 +507,26 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastsd {{.*}}(%rip), %zmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} zmm2 = [1,1,1,1,1,1,1,1]
; AVX512-INFS-NEXT: vsubpd %zmm0, %zmm2, %zmm0
; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213pd %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfnmadd213pd %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddpd %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfnmaddpd %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213pd %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %x
@@ -536,7 +536,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y
define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
@@ -545,7 +545,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0
@@ -554,26 +554,26 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %zmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} zmm2 = [-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1]
; AVX512-INFS-NEXT: vsubps %zmm0, %zmm2, %zmm0
; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmsub213ps %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfnmsub213ps %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmsubps %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfnmsubps %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmsub213ps %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <16 x float> <float -1.0, float -1.0, float -1.0, float -1.0,float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>, %x
@@ -583,7 +583,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float
define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
@@ -592,7 +592,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0
@@ -601,26 +601,26 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastsd {{.*}}(%rip), %zmm2
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} zmm2 = [-1,-1,-1,-1,-1,-1,-1,-1]
; AVX512-INFS-NEXT: vsubpd %zmm0, %zmm2, %zmm0
; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmsub213pd %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfnmsub213pd %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmsubpd %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfnmsubpd %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmsub213pd %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <8 x double> <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>, %x
@@ -630,7 +630,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double>
define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vsubps %ymm4, %ymm0, %ymm0
@@ -639,7 +639,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm0, %ymm0
@@ -648,25 +648,25 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213ps %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmsub213ps %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubps %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmsubps %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213ps %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
@@ -676,7 +676,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %
define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm0, %ymm0
@@ -685,7 +685,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm0, %ymm0
@@ -694,25 +694,25 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmsub213pd %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmsub213pd %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmsubpd %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmsubpd %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmsub213pd %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <8 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>
@@ -722,7 +722,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y
define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vsubps %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vsubps %ymm4, %ymm0, %ymm0
@@ -731,7 +731,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm0, %ymm0
@@ -740,25 +740,25 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213ps %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmadd213ps %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddps %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmaddps %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213ps %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <16 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>
@@ -768,7 +768,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float
define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm1, %ymm1
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm0, %ymm0
@@ -777,7 +777,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm1, %ymm1
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm0, %ymm0
@@ -786,25 +786,25 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
-; AVX512-INFS: # BB#0:
+; AVX512-INFS: # %bb.0:
; AVX512-INFS-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-INFS-NEXT: vmulpd %zmm0, %zmm1, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfmadd213pd %ymm2, %ymm2, %ymm0
; FMA-NOINFS-NEXT: vfmadd213pd %ymm3, %ymm3, %ymm1
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfmaddpd %ymm2, %ymm2, %ymm0, %ymm0
; FMA4-NOINFS-NEXT: vfmaddpd %ymm3, %ymm3, %ymm1, %ymm1
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfmadd213pd %zmm1, %zmm1, %zmm0
; AVX512-NOINFS-NEXT: retq
%s = fsub <8 x double> %x, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
@@ -818,7 +818,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double>
define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
; FMA-INFS-LABEL: test_v16f32_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7
; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6
@@ -829,7 +829,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v16f32_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7
; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6
@@ -840,15 +840,15 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v16f32_interp:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastss {{.*}}(%rip), %zmm3
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastss {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512-INFS-NEXT: vsubps %zmm2, %zmm3, %zmm3
; AVX512-INFS-NEXT: vmulps %zmm3, %zmm1, %zmm1
; AVX512-INFS-NEXT: vfmadd213ps %zmm1, %zmm2, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v16f32_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213ps %ymm3, %ymm5, %ymm3
; FMA-NOINFS-NEXT: vfnmadd213ps %ymm2, %ymm4, %ymm2
; FMA-NOINFS-NEXT: vfmadd213ps %ymm2, %ymm4, %ymm0
@@ -856,7 +856,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v16f32_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddps %ymm3, %ymm3, %ymm5, %ymm3
; FMA4-NOINFS-NEXT: vfnmaddps %ymm2, %ymm2, %ymm4, %ymm2
; FMA4-NOINFS-NEXT: vfmaddps %ymm2, %ymm4, %ymm0, %ymm0
@@ -864,7 +864,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v16f32_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213ps %zmm1, %zmm2, %zmm1
; AVX512-NOINFS-NEXT: vfmadd213ps %zmm1, %zmm2, %zmm0
; AVX512-NOINFS-NEXT: retq
@@ -877,7 +877,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x
define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
; FMA-INFS-LABEL: test_v8f64_interp:
-; FMA-INFS: # BB#0:
+; FMA-INFS: # %bb.0:
; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7
; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6
@@ -888,7 +888,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
; FMA-INFS-NEXT: retq
;
; FMA4-INFS-LABEL: test_v8f64_interp:
-; FMA4-INFS: # BB#0:
+; FMA4-INFS: # %bb.0:
; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7
; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6
@@ -899,15 +899,15 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
; FMA4-INFS-NEXT: retq
;
; AVX512-INFS-LABEL: test_v8f64_interp:
-; AVX512-INFS: # BB#0:
-; AVX512-INFS-NEXT: vbroadcastsd {{.*}}(%rip), %zmm3
+; AVX512-INFS: # %bb.0:
+; AVX512-INFS-NEXT: vbroadcastsd {{.*#+}} zmm3 = [1,1,1,1,1,1,1,1]
; AVX512-INFS-NEXT: vsubpd %zmm2, %zmm3, %zmm3
; AVX512-INFS-NEXT: vmulpd %zmm3, %zmm1, %zmm1
; AVX512-INFS-NEXT: vfmadd213pd %zmm1, %zmm2, %zmm0
; AVX512-INFS-NEXT: retq
;
; FMA-NOINFS-LABEL: test_v8f64_interp:
-; FMA-NOINFS: # BB#0:
+; FMA-NOINFS: # %bb.0:
; FMA-NOINFS-NEXT: vfnmadd213pd %ymm3, %ymm5, %ymm3
; FMA-NOINFS-NEXT: vfnmadd213pd %ymm2, %ymm4, %ymm2
; FMA-NOINFS-NEXT: vfmadd213pd %ymm2, %ymm4, %ymm0
@@ -915,7 +915,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
; FMA-NOINFS-NEXT: retq
;
; FMA4-NOINFS-LABEL: test_v8f64_interp:
-; FMA4-NOINFS: # BB#0:
+; FMA4-NOINFS: # %bb.0:
; FMA4-NOINFS-NEXT: vfnmaddpd %ymm3, %ymm3, %ymm5, %ymm3
; FMA4-NOINFS-NEXT: vfnmaddpd %ymm2, %ymm2, %ymm4, %ymm2
; FMA4-NOINFS-NEXT: vfmaddpd %ymm2, %ymm4, %ymm0, %ymm0
@@ -923,7 +923,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
; FMA4-NOINFS-NEXT: retq
;
; AVX512-NOINFS-LABEL: test_v8f64_interp:
-; AVX512-NOINFS: # BB#0:
+; AVX512-NOINFS: # %bb.0:
; AVX512-NOINFS-NEXT: vfnmadd213pd %zmm1, %zmm2, %zmm1
; AVX512-NOINFS-NEXT: vfmadd213pd %zmm1, %zmm2, %zmm0
; AVX512-NOINFS-NEXT: retq
@@ -940,19 +940,19 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do
define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
; FMA-LABEL: test_v16f32_fneg_fmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmsub213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v16f32_fneg_fmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v16f32_fneg_fmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%mul = fmul <16 x float> %a0, %a1
@@ -963,19 +963,19 @@ define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1,
define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
; FMA-LABEL: test_v8f64_fneg_fmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfnmadd213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmadd213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v8f64_fneg_fmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfnmaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v8f64_fneg_fmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%mul = fmul <8 x double> %a0, %a1
@@ -986,19 +986,19 @@ define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <
define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
; FMA-LABEL: test_v16f32_fneg_fnmadd:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfmsub213ps %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v16f32_fneg_fnmadd:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfmsubps %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v16f32_fneg_fnmadd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%mul = fmul <16 x float> %a0, %a1
@@ -1010,19 +1010,19 @@ define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1,
define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
; FMA-LABEL: test_v8f64_fneg_fnmsub:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfmadd213pd %ymm5, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v8f64_fneg_fnmsub:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfmaddpd %ymm5, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v8f64_fneg_fnmsub:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%mul = fmul <8 x double> %a0, %a1
@@ -1038,19 +1038,19 @@ define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1,
define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
; FMA-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA4-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
%m0 = fmul <16 x float> %x, <float 17.0, float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0>
@@ -1065,19 +1065,19 @@ define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) #0 {
; FMA-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vfmadd132ps {{.*}}(%rip), %ymm2, %ymm0
; FMA-NEXT: vfmadd132ps {{.*}}(%rip), %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vfmaddps %ymm2, {{.*}}(%rip), %ymm0, %ymm0
; FMA4-NEXT: vfmaddps %ymm3, {{.*}}(%rip), %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vfmadd132ps {{.*}}(%rip), %zmm1, %zmm0
; AVX512-NEXT: retq
%m0 = fmul <16 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
@@ -1090,22 +1090,22 @@ define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float
define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 {
; FMA-LABEL: test_v16f32_fneg_fmul:
-; FMA: # BB#0:
-; FMA-NEXT: vxorps %ymm4, %ymm4, %ymm4
+; FMA: # %bb.0:
+; FMA-NEXT: vxorps %xmm4, %xmm4, %xmm4
; FMA-NEXT: vfnmsub213ps %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmsub213ps %ymm4, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v16f32_fneg_fmul:
-; FMA4: # BB#0:
-; FMA4-NEXT: vxorps %ymm4, %ymm4, %ymm4
+; FMA4: # %bb.0:
+; FMA4-NEXT: vxorps %xmm4, %xmm4, %xmm4
; FMA4-NEXT: vfnmsubps %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmsubps %ymm4, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v16f32_fneg_fmul:
-; AVX512: # BB#0:
-; AVX512-NEXT: vxorps %zmm2, %zmm2, %zmm2
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213ps %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%m = fmul nsz <16 x float> %x, %y
@@ -1115,22 +1115,22 @@ define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0
define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
; FMA-LABEL: test_v8f64_fneg_fmul:
-; FMA: # BB#0:
-; FMA-NEXT: vxorpd %ymm4, %ymm4, %ymm4
+; FMA: # %bb.0:
+; FMA-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; FMA-NEXT: vfnmsub213pd %ymm4, %ymm2, %ymm0
; FMA-NEXT: vfnmsub213pd %ymm4, %ymm3, %ymm1
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v8f64_fneg_fmul:
-; FMA4: # BB#0:
-; FMA4-NEXT: vxorpd %ymm4, %ymm4, %ymm4
+; FMA4: # %bb.0:
+; FMA4-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; FMA4-NEXT: vfnmsubpd %ymm4, %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vfnmsubpd %ymm4, %ymm3, %ymm1, %ymm1
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v8f64_fneg_fmul:
-; AVX512: # BB#0:
-; AVX512-NEXT: vxorpd %zmm2, %zmm2, %zmm2
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX512-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm0
; AVX512-NEXT: retq
%m = fmul nsz <8 x double> %x, %y
@@ -1140,7 +1140,7 @@ define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) #0 {
; FMA-LABEL: test_v8f64_fneg_fmul_no_nsz:
-; FMA: # BB#0:
+; FMA: # %bb.0:
; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; FMA-NEXT: vmovapd {{.*#+}} ymm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
@@ -1149,7 +1149,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
; FMA-NEXT: retq
;
; FMA4-LABEL: test_v8f64_fneg_fmul_no_nsz:
-; FMA4: # BB#0:
+; FMA4: # %bb.0:
; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
@@ -1158,7 +1158,7 @@ define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %
; FMA4-NEXT: retq
;
; AVX512-LABEL: test_v8f64_fneg_fmul_no_nsz:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512-NEXT: retq
diff --git a/test/CodeGen/X86/fmaddsub-combine.ll b/test/CodeGen/X86/fmaddsub-combine.ll
index f3b13cd053b4..bb427a4f3a26 100644
--- a/test/CodeGen/X86/fmaddsub-combine.ll
+++ b/test/CodeGen/X86/fmaddsub-combine.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_256 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma,+avx512f | FileCheck -check-prefix=FMA3 -check-prefix=FMA3_512 %s
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+fma4 | FileCheck -check-prefix=FMA4 %s
@@ -6,14 +7,14 @@
define <2 x double> @mul_addsub_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd128:
-; FMA3: # BB#0: # %entry
-; FMA3-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
-; FMA3-NEXT: retq
+; FMA3: # %bb.0: # %entry
+; FMA3-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd128:
-; FMA4: # BB#0: # %entry
-; FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
-; FMA4-NEXT: retq
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
entry:
%AB = fmul <2 x double> %A, %B
%Sub = fsub <2 x double> %AB, %C
@@ -24,14 +25,14 @@ entry:
define <4 x float> @mul_addsub_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps128:
-; FMA3: # BB#0: # %entry
-; FMA3-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
-; FMA3-NEXT: retq
+; FMA3: # %bb.0: # %entry
+; FMA3-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps128:
-; FMA4: # BB#0: # %entry
-; FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
-; FMA4-NEXT: retq
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
entry:
%AB = fmul <4 x float> %A, %B
%Sub = fsub <4 x float> %AB, %C
@@ -42,14 +43,14 @@ entry:
define <4 x double> @mul_addsub_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_addsub_pd256:
-; FMA3: # BB#0: # %entry
-; FMA3-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
-; FMA3-NEXT: retq
+; FMA3: # %bb.0: # %entry
+; FMA3-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd256:
-; FMA4: # BB#0: # %entry
-; FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
-; FMA4-NEXT: retq
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
entry:
%AB = fmul <4 x double> %A, %B
%Sub = fsub <4 x double> %AB, %C
@@ -60,14 +61,14 @@ entry:
define <8 x float> @mul_addsub_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_addsub_ps256:
-; FMA3: # BB#0: # %entry
-; FMA3-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
-; FMA3-NEXT: retq
+; FMA3: # %bb.0: # %entry
+; FMA3-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps256:
-; FMA4: # BB#0: # %entry
-; FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
-; FMA4-NEXT: retq
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
entry:
%AB = fmul <8 x float> %A, %B
%Sub = fsub <8 x float> %AB, %C
@@ -78,21 +79,21 @@ entry:
define <8 x double> @mul_addsub_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_addsub_pd512:
-; FMA3_256: # BB#0: # %entry
-; FMA3_256-NEXT: vfmaddsub213pd %ymm4, %ymm2, %ymm0
-; FMA3_256-NEXT: vfmaddsub213pd %ymm5, %ymm3, %ymm1
-; FMA3_256-NEXT: retq
+; FMA3_256: # %bb.0: # %entry
+; FMA3_256-NEXT: vfmaddsub213pd %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmaddsub213pd %ymm5, %ymm3, %ymm1
+; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_addsub_pd512:
-; FMA3_512: # BB#0: # %entry
-; FMA3_512-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
-; FMA3_512-NEXT: retq
+; FMA3_512: # %bb.0: # %entry
+; FMA3_512-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_addsub_pd512:
-; FMA4: # BB#0: # %entry
-; FMA4-NEXT: vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
-; FMA4-NEXT: retq
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
entry:
%AB = fmul <8 x double> %A, %B
%Sub = fsub <8 x double> %AB, %C
@@ -103,21 +104,21 @@ entry:
define <16 x float> @mul_addsub_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_addsub_ps512:
-; FMA3_256: # BB#0: # %entry
-; FMA3_256-NEXT: vfmaddsub213ps %ymm4, %ymm2, %ymm0
-; FMA3_256-NEXT: vfmaddsub213ps %ymm5, %ymm3, %ymm1
-; FMA3_256-NEXT: retq
+; FMA3_256: # %bb.0: # %entry
+; FMA3_256-NEXT: vfmaddsub213ps %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmaddsub213ps %ymm5, %ymm3, %ymm1
+; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_addsub_ps512:
-; FMA3_512: # BB#0: # %entry
-; FMA3_512-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
-; FMA3_512-NEXT: retq
+; FMA3_512: # %bb.0: # %entry
+; FMA3_512-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_addsub_ps512:
-; FMA4: # BB#0: # %entry
-; FMA4-NEXT: vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
-; FMA4-NEXT: vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
-; FMA4-NEXT: retq
+; FMA4: # %bb.0: # %entry
+; FMA4-NEXT: vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
entry:
%AB = fmul <16 x float> %A, %B
%Sub = fsub <16 x float> %AB, %C
@@ -126,4 +127,949 @@ entry:
ret <16 x float> %Addsub
}
+define <4 x float> @buildvector_mul_addsub_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+; FMA3-LABEL: buildvector_mul_addsub_ps128:
+; FMA3: # %bb.0: # %bb
+; FMA3-NEXT: vfmaddsub213ps %xmm2, %xmm1, %xmm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_addsub_ps128:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <4 x float> %C, %D
+ %A0 = extractelement <4 x float> %A, i32 0
+ %B0 = extractelement <4 x float> %B, i32 0
+ %sub0 = fsub float %A0, %B0
+ %A2 = extractelement <4 x float> %A, i32 2
+ %B2 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %A2, %B2
+ %A1 = extractelement <4 x float> %A, i32 1
+ %B1 = extractelement <4 x float> %B, i32 1
+ %add1 = fadd float %A1, %B1
+ %A3 = extractelement <4 x float> %A, i32 3
+ %B3 = extractelement <4 x float> %B, i32 3
+ %add3 = fadd float %A3, %B3
+ %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
+ ret <4 x float> %vecinsert4
+}
+
+define <2 x double> @buildvector_mul_addsub_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+; FMA3-LABEL: buildvector_mul_addsub_pd128:
+; FMA3: # %bb.0: # %bb
+; FMA3-NEXT: vfmaddsub213pd %xmm2, %xmm1, %xmm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_addsub_pd128:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <2 x double> %C, %D
+ %A0 = extractelement <2 x double> %A, i32 0
+ %B0 = extractelement <2 x double> %B, i32 0
+ %sub0 = fsub double %A0, %B0
+ %A1 = extractelement <2 x double> %A, i32 1
+ %B1 = extractelement <2 x double> %B, i32 1
+ %add1 = fadd double %A1, %B1
+ %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
+ %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
+ ret <2 x double> %vecinsert2
+}
+
+define <8 x float> @buildvector_mul_addsub_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+; FMA3-LABEL: buildvector_mul_addsub_ps256:
+; FMA3: # %bb.0: # %bb
+; FMA3-NEXT: vfmaddsub213ps %ymm2, %ymm1, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_addsub_ps256:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <8 x float> %C, %D
+ %A0 = extractelement <8 x float> %A, i32 0
+ %B0 = extractelement <8 x float> %B, i32 0
+ %sub0 = fsub float %A0, %B0
+ %A2 = extractelement <8 x float> %A, i32 2
+ %B2 = extractelement <8 x float> %B, i32 2
+ %sub2 = fsub float %A2, %B2
+ %A4 = extractelement <8 x float> %A, i32 4
+ %B4 = extractelement <8 x float> %B, i32 4
+ %sub4 = fsub float %A4, %B4
+ %A6 = extractelement <8 x float> %A, i32 6
+ %B6 = extractelement <8 x float> %B, i32 6
+ %sub6 = fsub float %A6, %B6
+ %A1 = extractelement <8 x float> %A, i32 1
+ %B1 = extractelement <8 x float> %B, i32 1
+ %add1 = fadd float %A1, %B1
+ %A3 = extractelement <8 x float> %A, i32 3
+ %B3 = extractelement <8 x float> %B, i32 3
+ %add3 = fadd float %A3, %B3
+ %A5 = extractelement <8 x float> %A, i32 5
+ %B5 = extractelement <8 x float> %B, i32 5
+ %add5 = fadd float %A5, %B5
+ %A7 = extractelement <8 x float> %A, i32 7
+ %B7 = extractelement <8 x float> %B, i32 7
+ %add7 = fadd float %A7, %B7
+ %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
+ %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
+ %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
+ %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
+ %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
+ %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
+ %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
+ %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
+ ret <8 x float> %vecinsert8
+}
+
+define <4 x double> @buildvector_mul_addsub_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+; FMA3-LABEL: buildvector_mul_addsub_pd256:
+; FMA3: # %bb.0: # %bb
+; FMA3-NEXT: vfmaddsub213pd %ymm2, %ymm1, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_addsub_pd256:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <4 x double> %C, %D
+ %A0 = extractelement <4 x double> %A, i32 0
+ %B0 = extractelement <4 x double> %B, i32 0
+ %sub0 = fsub double %A0, %B0
+ %A2 = extractelement <4 x double> %A, i32 2
+ %B2 = extractelement <4 x double> %B, i32 2
+ %sub2 = fsub double %A2, %B2
+ %A1 = extractelement <4 x double> %A, i32 1
+ %B1 = extractelement <4 x double> %B, i32 1
+ %add1 = fadd double %A1, %B1
+ %A3 = extractelement <4 x double> %A, i32 3
+ %B3 = extractelement <4 x double> %B, i32 3
+ %add3 = fadd double %A3, %B3
+ %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
+ %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
+ %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
+ %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
+ ret <4 x double> %vecinsert4
+}
+
+define <16 x float> @buildvector_mul_addsub_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+; FMA3_256-LABEL: buildvector_mul_addsub_ps512:
+; FMA3_256: # %bb.0: # %bb
+; FMA3_256-NEXT: vfmaddsub213ps %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmaddsub213ps %ymm5, %ymm3, %ymm1
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: buildvector_mul_addsub_ps512:
+; FMA3_512: # %bb.0: # %bb
+; FMA3_512-NEXT: vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_addsub_ps512:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vfmaddsubps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddsubps %ymm5, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <16 x float> %C, %D
+ %A0 = extractelement <16 x float> %A, i32 0
+ %B0 = extractelement <16 x float> %B, i32 0
+ %sub0 = fsub float %A0, %B0
+ %A2 = extractelement <16 x float> %A, i32 2
+ %B2 = extractelement <16 x float> %B, i32 2
+ %sub2 = fsub float %A2, %B2
+ %A4 = extractelement <16 x float> %A, i32 4
+ %B4 = extractelement <16 x float> %B, i32 4
+ %sub4 = fsub float %A4, %B4
+ %A6 = extractelement <16 x float> %A, i32 6
+ %B6 = extractelement <16 x float> %B, i32 6
+ %sub6 = fsub float %A6, %B6
+ %A8 = extractelement <16 x float> %A, i32 8
+ %B8 = extractelement <16 x float> %B, i32 8
+ %sub8 = fsub float %A8, %B8
+ %A10 = extractelement <16 x float> %A, i32 10
+ %B10 = extractelement <16 x float> %B, i32 10
+ %sub10 = fsub float %A10, %B10
+ %A12 = extractelement <16 x float> %A, i32 12
+ %B12 = extractelement <16 x float> %B, i32 12
+ %sub12 = fsub float %A12, %B12
+ %A14 = extractelement <16 x float> %A, i32 14
+ %B14 = extractelement <16 x float> %B, i32 14
+ %sub14 = fsub float %A14, %B14
+ %A1 = extractelement <16 x float> %A, i32 1
+ %B1 = extractelement <16 x float> %B, i32 1
+ %add1 = fadd float %A1, %B1
+ %A3 = extractelement <16 x float> %A, i32 3
+ %B3 = extractelement <16 x float> %B, i32 3
+ %add3 = fadd float %A3, %B3
+ %A5 = extractelement <16 x float> %A, i32 5
+ %B5 = extractelement <16 x float> %B, i32 5
+ %add5 = fadd float %A5, %B5
+ %A7 = extractelement <16 x float> %A, i32 7
+ %B7 = extractelement <16 x float> %B, i32 7
+ %add7 = fadd float %A7, %B7
+ %A9 = extractelement <16 x float> %A, i32 9
+ %B9 = extractelement <16 x float> %B, i32 9
+ %add9 = fadd float %A9, %B9
+ %A11 = extractelement <16 x float> %A, i32 11
+ %B11 = extractelement <16 x float> %B, i32 11
+ %add11 = fadd float %A11, %B11
+ %A13 = extractelement <16 x float> %A, i32 13
+ %B13 = extractelement <16 x float> %B, i32 13
+ %add13 = fadd float %A13, %B13
+ %A15 = extractelement <16 x float> %A, i32 15
+ %B15 = extractelement <16 x float> %B, i32 15
+ %add15 = fadd float %A15, %B15
+ %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
+ %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
+ %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
+ %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
+ %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
+ ; element 5 is undef
+ %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
+ %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
+ %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
+ %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
+ %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
+ %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
+ ; element 12 is undef
+ %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
+ %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
+ %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
+ ret <16 x float> %vecinsert16
+}
+
+define <8 x double> @buildvector_mul_addsub_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+; FMA3_256-LABEL: buildvector_mul_addsub_pd512:
+; FMA3_256: # %bb.0: # %bb
+; FMA3_256-NEXT: vfmaddsub213pd %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT: vfmaddsub213pd %ymm5, %ymm3, %ymm1
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: buildvector_mul_addsub_pd512:
+; FMA3_512: # %bb.0: # %bb
+; FMA3_512-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_addsub_pd512:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vfmaddsubpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vfmaddsubpd %ymm5, %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <8 x double> %C, %D
+ %A0 = extractelement <8 x double> %A, i32 0
+ %B0 = extractelement <8 x double> %B, i32 0
+ %sub0 = fsub double %A0, %B0
+ %A2 = extractelement <8 x double> %A, i32 2
+ %B2 = extractelement <8 x double> %B, i32 2
+ %sub2 = fsub double %A2, %B2
+ %A4 = extractelement <8 x double> %A, i32 4
+ %B4 = extractelement <8 x double> %B, i32 4
+ %sub4 = fsub double %A4, %B4
+ %A6 = extractelement <8 x double> %A, i32 6
+ %B6 = extractelement <8 x double> %B, i32 6
+ %sub6 = fsub double %A6, %B6
+ %A1 = extractelement <8 x double> %A, i32 1
+ %B1 = extractelement <8 x double> %B, i32 1
+ %add1 = fadd double %A1, %B1
+ %A3 = extractelement <8 x double> %A, i32 3
+ %B3 = extractelement <8 x double> %B, i32 3
+ %add3 = fadd double %A3, %B3
+ %A7 = extractelement <8 x double> %A, i32 7
+ %B7 = extractelement <8 x double> %B, i32 7
+ %add7 = fadd double %A7, %B7
+ %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
+ %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
+ %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
+ %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
+ %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
+ ; element 5 is undef
+ %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
+ %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
+ ret <8 x double> %vecinsert8
+}
+
+define <4 x float> @buildvector_mul_subadd_ps128(<4 x float> %C, <4 x float> %D, <4 x float> %B) #0 {
+; FMA3_256-LABEL: buildvector_mul_subadd_ps128:
+; FMA3_256: # %bb.0: # %bb
+; FMA3_256-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA3_256-NEXT: vaddss %xmm2, %xmm0, %xmm1
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA3_256-NEXT: vaddss %xmm4, %xmm3, %xmm3
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; FMA3_256-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: buildvector_mul_subadd_ps128:
+; FMA3_512: # %bb.0: # %bb
+; FMA3_512-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA3_512-NEXT: vaddss %xmm2, %xmm0, %xmm1
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA3_512-NEXT: vaddss %xmm4, %xmm3, %xmm3
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; FMA3_512-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_subadd_ps128:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vaddss %xmm2, %xmm0, %xmm1
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA4-NEXT: vaddss %xmm4, %xmm3, %xmm3
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; FMA4-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; FMA4-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <4 x float> %C, %D
+ %A0 = extractelement <4 x float> %A, i32 0
+ %B0 = extractelement <4 x float> %B, i32 0
+ %sub0 = fadd float %A0, %B0
+ %A2 = extractelement <4 x float> %A, i32 2
+ %B2 = extractelement <4 x float> %B, i32 2
+ %sub2 = fadd float %A2, %B2
+ %A1 = extractelement <4 x float> %A, i32 1
+ %B1 = extractelement <4 x float> %B, i32 1
+ %add1 = fsub float %A1, %B1
+ %A3 = extractelement <4 x float> %A, i32 3
+ %B3 = extractelement <4 x float> %B, i32 3
+ %add3 = fsub float %A3, %B3
+ %vecinsert1 = insertelement <4 x float> undef, float %sub0, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add1, i32 1
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub2, i32 2
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %add3, i32 3
+ ret <4 x float> %vecinsert4
+}
+
+define <2 x double> @buildvector_mul_subadd_pd128(<2 x double> %C, <2 x double> %D, <2 x double> %B) #0 {
+; FMA3-LABEL: buildvector_mul_subadd_pd128:
+; FMA3: # %bb.0: # %bb
+; FMA3-NEXT: vmulpd %xmm1, %xmm0, %xmm0
+; FMA3-NEXT: vaddsd %xmm2, %xmm0, %xmm1
+; FMA3-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA3-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; FMA3-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; FMA3-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_subadd_pd128:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vaddsd %xmm2, %xmm0, %xmm1
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; FMA4-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; FMA4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <2 x double> %C, %D
+ %A0 = extractelement <2 x double> %A, i32 0
+ %B0 = extractelement <2 x double> %B, i32 0
+ %sub0 = fadd double %A0, %B0
+ %A1 = extractelement <2 x double> %A, i32 1
+ %B1 = extractelement <2 x double> %B, i32 1
+ %add1 = fsub double %A1, %B1
+ %vecinsert1 = insertelement <2 x double> undef, double %sub0, i32 0
+ %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add1, i32 1
+ ret <2 x double> %vecinsert2
+}
+
+define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D, <8 x float> %B) #0 {
+; FMA3_256-LABEL: buildvector_mul_subadd_ps256:
+; FMA3_256: # %bb.0: # %bb
+; FMA3_256-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; FMA3_256-NEXT: vaddss %xmm2, %xmm0, %xmm8
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA3_256-NEXT: vaddss %xmm4, %xmm3, %xmm9
+; FMA3_256-NEXT: vextractf128 $1, %ymm0, %xmm4
+; FMA3_256-NEXT: vextractf128 $1, %ymm2, %xmm5
+; FMA3_256-NEXT: vaddss %xmm5, %xmm4, %xmm6
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm1 = xmm5[1,0]
+; FMA3_256-NEXT: vaddss %xmm1, %xmm7, %xmm1
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; FMA3_256-NEXT: vsubss %xmm3, %xmm7, %xmm3
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; FMA3_256-NEXT: vsubss %xmm7, %xmm2, %xmm2
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[2,3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[2,3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; FMA3_256-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: buildvector_mul_subadd_ps256:
+; FMA3_512: # %bb.0: # %bb
+; FMA3_512-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; FMA3_512-NEXT: vaddss %xmm2, %xmm0, %xmm1
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA3_512-NEXT: vaddss %xmm4, %xmm3, %xmm3
+; FMA3_512-NEXT: vextractf128 $1, %ymm0, %xmm4
+; FMA3_512-NEXT: vextractf128 $1, %ymm2, %xmm5
+; FMA3_512-NEXT: vaddss %xmm5, %xmm4, %xmm8
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; FMA3_512-NEXT: vaddss %xmm6, %xmm7, %xmm9
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm6 = xmm2[1,1,3,3]
+; FMA3_512-NEXT: vsubss %xmm6, %xmm7, %xmm6
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; FMA3_512-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[2,3]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; FMA3_512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_subadd_ps256:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: vaddss %xmm2, %xmm0, %xmm8
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA4-NEXT: vaddss %xmm4, %xmm3, %xmm9
+; FMA4-NEXT: vextractf128 $1, %ymm0, %xmm4
+; FMA4-NEXT: vextractf128 $1, %ymm2, %xmm5
+; FMA4-NEXT: vaddss %xmm5, %xmm4, %xmm6
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm1 = xmm5[1,0]
+; FMA4-NEXT: vaddss %xmm1, %xmm7, %xmm1
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3]
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; FMA4-NEXT: vsubss %xmm3, %xmm7, %xmm3
+; FMA4-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; FMA4-NEXT: vsubss %xmm7, %xmm2, %xmm2
+; FMA4-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; FMA4-NEXT: vinsertps {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[2,3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[2,3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; FMA4-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <8 x float> %C, %D
+ %A0 = extractelement <8 x float> %A, i32 0
+ %B0 = extractelement <8 x float> %B, i32 0
+ %sub0 = fadd float %A0, %B0
+ %A2 = extractelement <8 x float> %A, i32 2
+ %B2 = extractelement <8 x float> %B, i32 2
+ %sub2 = fadd float %A2, %B2
+ %A4 = extractelement <8 x float> %A, i32 4
+ %B4 = extractelement <8 x float> %B, i32 4
+ %sub4 = fadd float %A4, %B4
+ %A6 = extractelement <8 x float> %A, i32 6
+ %B6 = extractelement <8 x float> %B, i32 6
+ %sub6 = fadd float %A6, %B6
+ %A1 = extractelement <8 x float> %A, i32 1
+ %B1 = extractelement <8 x float> %B, i32 1
+ %add1 = fsub float %A1, %B1
+ %A3 = extractelement <8 x float> %A, i32 3
+ %B3 = extractelement <8 x float> %B, i32 3
+ %add3 = fsub float %A3, %B3
+ %A5 = extractelement <8 x float> %A, i32 5
+ %B5 = extractelement <8 x float> %B, i32 5
+ %add5 = fsub float %A5, %B5
+ %A7 = extractelement <8 x float> %A, i32 7
+ %B7 = extractelement <8 x float> %B, i32 7
+ %add7 = fsub float %A7, %B7
+ %vecinsert1 = insertelement <8 x float> undef, float %sub0, i32 0
+ %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add1, i32 1
+ %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub2, i32 2
+ %vecinsert4 = insertelement <8 x float> %vecinsert3, float %add3, i32 3
+ %vecinsert5 = insertelement <8 x float> %vecinsert4, float %sub4, i32 4
+ %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add5, i32 5
+ %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub6, i32 6
+ %vecinsert8 = insertelement <8 x float> %vecinsert7, float %add7, i32 7
+ ret <8 x float> %vecinsert8
+}
+
+define <4 x double> @buildvector_mul_subadd_pd256(<4 x double> %C, <4 x double> %D, <4 x double> %B) #0 {
+; FMA3-LABEL: buildvector_mul_subadd_pd256:
+; FMA3: # %bb.0: # %bb
+; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; FMA3-NEXT: vaddsd %xmm2, %xmm0, %xmm1
+; FMA3-NEXT: vextractf128 $1, %ymm0, %xmm3
+; FMA3-NEXT: vextractf128 $1, %ymm2, %xmm4
+; FMA3-NEXT: vaddsd %xmm4, %xmm3, %xmm5
+; FMA3-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA3-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; FMA3-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; FMA3-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; FMA3-NEXT: vpermilpd {{.*#+}} xmm3 = xmm4[1,0]
+; FMA3-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; FMA3-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; FMA3-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; FMA3-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; FMA3-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_subadd_pd256:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; FMA4-NEXT: vaddsd %xmm2, %xmm0, %xmm1
+; FMA4-NEXT: vextractf128 $1, %ymm0, %xmm3
+; FMA4-NEXT: vextractf128 $1, %ymm2, %xmm4
+; FMA4-NEXT: vaddsd %xmm4, %xmm3, %xmm5
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; FMA4-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm3 = xmm4[1,0]
+; FMA4-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; FMA4-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; FMA4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; FMA4-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <4 x double> %C, %D
+ %A0 = extractelement <4 x double> %A, i32 0
+ %B0 = extractelement <4 x double> %B, i32 0
+ %sub0 = fadd double %A0, %B0
+ %A2 = extractelement <4 x double> %A, i32 2
+ %B2 = extractelement <4 x double> %B, i32 2
+ %sub2 = fadd double %A2, %B2
+ %A1 = extractelement <4 x double> %A, i32 1
+ %B1 = extractelement <4 x double> %B, i32 1
+ %add1 = fsub double %A1, %B1
+ %A3 = extractelement <4 x double> %A, i32 3
+ %B3 = extractelement <4 x double> %B, i32 3
+ %add3 = fsub double %A3, %B3
+ %vecinsert1 = insertelement <4 x double> undef, double %sub0, i32 0
+ %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add1, i32 1
+ %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub2, i32 2
+ %vecinsert4 = insertelement <4 x double> %vecinsert3, double %add3, i32 3
+ ret <4 x double> %vecinsert4
+}
+
+define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> %D, <16 x float> %B) #0 {
+; FMA3_256-LABEL: buildvector_mul_subadd_ps512:
+; FMA3_256: # %bb.0: # %bb
+; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm3
+; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm8
+; FMA3_256-NEXT: vaddss %xmm4, %xmm8, %xmm0
+; FMA3_256-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm1 = xmm8[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm2 = xmm4[1,0]
+; FMA3_256-NEXT: vaddss %xmm2, %xmm1, %xmm10
+; FMA3_256-NEXT: vextractf128 $1, %ymm8, %xmm0
+; FMA3_256-NEXT: vextractf128 $1, %ymm4, %xmm1
+; FMA3_256-NEXT: vaddss %xmm1, %xmm0, %xmm11
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm7 = xmm1[1,0]
+; FMA3_256-NEXT: vaddss %xmm7, %xmm2, %xmm13
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; FMA3_256-NEXT: vaddss %xmm6, %xmm2, %xmm12
+; FMA3_256-NEXT: vextractf128 $1, %ymm3, %xmm6
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm14 = xmm6[1,0]
+; FMA3_256-NEXT: vextractf128 $1, %ymm5, %xmm7
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm2 = xmm7[1,0]
+; FMA3_256-NEXT: vaddss %xmm2, %xmm14, %xmm14
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm15 = xmm8[1,1,3,3]
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; FMA3_256-NEXT: vsubss %xmm2, %xmm15, %xmm9
+; FMA3_256-NEXT: vaddss %xmm5, %xmm3, %xmm15
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm4, %xmm8, %xmm4
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; FMA3_256-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; FMA3_256-NEXT: vmovshdup {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; FMA3_256-NEXT: vsubss %xmm5, %xmm3, %xmm3
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; FMA3_256-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; FMA3_256-NEXT: vsubss %xmm6, %xmm5, %xmm5
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1],xmm13[0],xmm11[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[0]
+; FMA3_256-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[2,3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],xmm10[0],xmm6[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[0]
+; FMA3_256-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; FMA3_256-NEXT: vmovsldup {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[0]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[2,3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0],xmm1[3]
+; FMA3_256-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; FMA3_256-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: buildvector_mul_subadd_ps512:
+; FMA3_512: # %bb.0: # %bb
+; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm0
+; FMA3_512-NEXT: vaddss %xmm2, %xmm0, %xmm8
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0]
+; FMA3_512-NEXT: vaddss %xmm4, %xmm3, %xmm10
+; FMA3_512-NEXT: vextractf128 $1, %ymm0, %xmm4
+; FMA3_512-NEXT: vextractf128 $1, %ymm2, %xmm5
+; FMA3_512-NEXT: vaddss %xmm5, %xmm4, %xmm6
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm5[1,0]
+; FMA3_512-NEXT: vaddss %xmm1, %xmm7, %xmm1
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm1[0],xmm6[3]
+; FMA3_512-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; FMA3_512-NEXT: vextractf32x4 $2, %zmm2, %xmm7
+; FMA3_512-NEXT: vaddss %xmm7, %xmm6, %xmm9
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm6[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm7[1,0]
+; FMA3_512-NEXT: vaddss %xmm1, %xmm3, %xmm12
+; FMA3_512-NEXT: vextractf32x4 $3, %zmm0, %xmm14
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm13 = xmm14[1,0]
+; FMA3_512-NEXT: vextractf32x4 $3, %zmm2, %xmm15
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm15[1,0]
+; FMA3_512-NEXT: vaddss %xmm3, %xmm13, %xmm13
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; FMA3_512-NEXT: vsubss %xmm1, %xmm3, %xmm1
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[2,3]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm2, %xmm0, %xmm0
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[0]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm7[1,1,3,3]
+; FMA3_512-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[2,3]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0],xmm2[3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm4 = xmm7[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm4, %xmm3, %xmm3
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm14[1,1,3,3]
+; FMA3_512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm15[1,1,3,3]
+; FMA3_512-NEXT: vsubss %xmm4, %xmm3, %xmm3
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm4 = xmm14[3,1,2,3]
+; FMA3_512-NEXT: vpermilps {{.*#+}} xmm5 = xmm15[3,1,2,3]
+; FMA3_512-NEXT: vsubss %xmm5, %xmm4, %xmm4
+; FMA3_512-NEXT: vmovsldup {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm13[0],xmm3[3]
+; FMA3_512-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; FMA3_512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; FMA3_512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; FMA3_512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_subadd_ps512:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm3
+; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm8
+; FMA4-NEXT: vaddss %xmm4, %xmm8, %xmm0
+; FMA4-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm1 = xmm8[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm4[1,0]
+; FMA4-NEXT: vaddss %xmm2, %xmm1, %xmm10
+; FMA4-NEXT: vextractf128 $1, %ymm8, %xmm0
+; FMA4-NEXT: vextractf128 $1, %ymm4, %xmm1
+; FMA4-NEXT: vaddss %xmm1, %xmm0, %xmm11
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm7 = xmm1[1,0]
+; FMA4-NEXT: vaddss %xmm7, %xmm2, %xmm13
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
+; FMA4-NEXT: vaddss %xmm6, %xmm2, %xmm12
+; FMA4-NEXT: vextractf128 $1, %ymm3, %xmm6
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm14 = xmm6[1,0]
+; FMA4-NEXT: vextractf128 $1, %ymm5, %xmm7
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm2 = xmm7[1,0]
+; FMA4-NEXT: vaddss %xmm2, %xmm14, %xmm14
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm15 = xmm8[1,1,3,3]
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; FMA4-NEXT: vsubss %xmm2, %xmm15, %xmm9
+; FMA4-NEXT: vaddss %xmm5, %xmm3, %xmm15
+; FMA4-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm4, %xmm8, %xmm4
+; FMA4-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; FMA4-NEXT: vsubss %xmm2, %xmm1, %xmm1
+; FMA4-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; FMA4-NEXT: vmovshdup {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; FMA4-NEXT: vsubss %xmm5, %xmm3, %xmm3
+; FMA4-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; FMA4-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
+; FMA4-NEXT: vsubss %xmm6, %xmm5, %xmm5
+; FMA4-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1],xmm13[0],xmm11[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[0]
+; FMA4-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; FMA4-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[2,3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1],xmm10[0],xmm6[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[0]
+; FMA4-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; FMA4-NEXT: vmovsldup {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm14[0],xmm3[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[0]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[2,3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0],xmm1[3]
+; FMA4-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; FMA4-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <16 x float> %C, %D
+ %A0 = extractelement <16 x float> %A, i32 0
+ %B0 = extractelement <16 x float> %B, i32 0
+ %sub0 = fadd float %A0, %B0
+ %A2 = extractelement <16 x float> %A, i32 2
+ %B2 = extractelement <16 x float> %B, i32 2
+ %sub2 = fadd float %A2, %B2
+ %A4 = extractelement <16 x float> %A, i32 4
+ %B4 = extractelement <16 x float> %B, i32 4
+ %sub4 = fadd float %A4, %B4
+ %A6 = extractelement <16 x float> %A, i32 6
+ %B6 = extractelement <16 x float> %B, i32 6
+ %sub6 = fadd float %A6, %B6
+ %A8 = extractelement <16 x float> %A, i32 8
+ %B8 = extractelement <16 x float> %B, i32 8
+ %sub8 = fadd float %A8, %B8
+ %A10 = extractelement <16 x float> %A, i32 10
+ %B10 = extractelement <16 x float> %B, i32 10
+ %sub10 = fadd float %A10, %B10
+ %A12 = extractelement <16 x float> %A, i32 12
+ %B12 = extractelement <16 x float> %B, i32 12
+ %sub12 = fadd float %A12, %B12
+ %A14 = extractelement <16 x float> %A, i32 14
+ %B14 = extractelement <16 x float> %B, i32 14
+ %sub14 = fadd float %A14, %B14
+ %A1 = extractelement <16 x float> %A, i32 1
+ %B1 = extractelement <16 x float> %B, i32 1
+ %add1 = fsub float %A1, %B1
+ %A3 = extractelement <16 x float> %A, i32 3
+ %B3 = extractelement <16 x float> %B, i32 3
+ %add3 = fsub float %A3, %B3
+ %A5 = extractelement <16 x float> %A, i32 5
+ %B5 = extractelement <16 x float> %B, i32 5
+ %add5 = fsub float %A5, %B5
+ %A7 = extractelement <16 x float> %A, i32 7
+ %B7 = extractelement <16 x float> %B, i32 7
+ %add7 = fsub float %A7, %B7
+ %A9 = extractelement <16 x float> %A, i32 9
+ %B9 = extractelement <16 x float> %B, i32 9
+ %add9 = fsub float %A9, %B9
+ %A11 = extractelement <16 x float> %A, i32 11
+ %B11 = extractelement <16 x float> %B, i32 11
+ %add11 = fsub float %A11, %B11
+ %A13 = extractelement <16 x float> %A, i32 13
+ %B13 = extractelement <16 x float> %B, i32 13
+ %add13 = fsub float %A13, %B13
+ %A15 = extractelement <16 x float> %A, i32 15
+ %B15 = extractelement <16 x float> %B, i32 15
+ %add15 = fsub float %A15, %B15
+ %vecinsert1 = insertelement <16 x float> undef, float %sub0, i32 0
+ %vecinsert2 = insertelement <16 x float> %vecinsert1, float %add1, i32 1
+ %vecinsert3 = insertelement <16 x float> %vecinsert2, float %sub2, i32 2
+ %vecinsert4 = insertelement <16 x float> %vecinsert3, float %add3, i32 3
+ %vecinsert5 = insertelement <16 x float> %vecinsert4, float %sub4, i32 4
+ ; element 5 is undef
+ %vecinsert7 = insertelement <16 x float> %vecinsert5, float %sub6, i32 6
+ %vecinsert8 = insertelement <16 x float> %vecinsert7, float %add7, i32 7
+ %vecinsert9 = insertelement <16 x float> %vecinsert8, float %sub8, i32 8
+ %vecinsert10 = insertelement <16 x float> %vecinsert9, float %add9, i32 9
+ %vecinsert11 = insertelement <16 x float> %vecinsert10, float %sub10, i32 10
+ %vecinsert12 = insertelement <16 x float> %vecinsert11, float %add11, i32 11
+ ; element 12 is undef
+ %vecinsert14 = insertelement <16 x float> %vecinsert12, float %add13, i32 13
+ %vecinsert15 = insertelement <16 x float> %vecinsert14, float %sub14, i32 14
+ %vecinsert16 = insertelement <16 x float> %vecinsert15, float %add15, i32 15
+ ret <16 x float> %vecinsert16
+}
+
+define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> %D, <8 x double> %B) #0 {
+; FMA3_256-LABEL: buildvector_mul_subadd_pd512:
+; FMA3_256: # %bb.0: # %bb
+; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1
+; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0
+; FMA3_256-NEXT: vaddsd %xmm4, %xmm0, %xmm9
+; FMA3_256-NEXT: vextractf128 $1, %ymm0, %xmm3
+; FMA3_256-NEXT: vextractf128 $1, %ymm4, %xmm6
+; FMA3_256-NEXT: vaddsd %xmm6, %xmm3, %xmm7
+; FMA3_256-NEXT: vaddsd %xmm5, %xmm1, %xmm8
+; FMA3_256-NEXT: vextractf128 $1, %ymm1, %xmm1
+; FMA3_256-NEXT: vextractf128 $1, %ymm5, %xmm5
+; FMA3_256-NEXT: vaddsd %xmm5, %xmm1, %xmm2
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; FMA3_256-NEXT: vsubsd %xmm4, %xmm0, %xmm0
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm4 = xmm6[1,0]
+; FMA3_256-NEXT: vsubsd %xmm4, %xmm3, %xmm3
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; FMA3_256-NEXT: vpermilpd {{.*#+}} xmm4 = xmm5[1,0]
+; FMA3_256-NEXT: vsubsd %xmm4, %xmm1, %xmm1
+; FMA3_256-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm7[0],xmm3[0]
+; FMA3_256-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm9[0],xmm0[0]
+; FMA3_256-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; FMA3_256-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; FMA3_256-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
+; FMA3_256-NEXT: retq
+;
+; FMA3_512-LABEL: buildvector_mul_subadd_pd512:
+; FMA3_512: # %bb.0: # %bb
+; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
+; FMA3_512-NEXT: vaddsd %xmm2, %xmm0, %xmm8
+; FMA3_512-NEXT: vextractf128 $1, %ymm0, %xmm3
+; FMA3_512-NEXT: vextractf128 $1, %ymm2, %xmm4
+; FMA3_512-NEXT: vaddsd %xmm4, %xmm3, %xmm5
+; FMA3_512-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; FMA3_512-NEXT: vextractf32x4 $2, %zmm2, %xmm7
+; FMA3_512-NEXT: vaddsd %xmm7, %xmm6, %xmm9
+; FMA3_512-NEXT: vextractf32x4 $3, %zmm0, %xmm7
+; FMA3_512-NEXT: vextractf32x4 $3, %zmm2, %xmm1
+; FMA3_512-NEXT: vaddsd %xmm1, %xmm7, %xmm6
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; FMA3_512-NEXT: vsubsd %xmm2, %xmm0, %xmm0
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm4[1,0]
+; FMA3_512-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm7[1,0]
+; FMA3_512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; FMA3_512-NEXT: vsubsd %xmm1, %xmm3, %xmm1
+; FMA3_512-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; FMA3_512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm8[0],xmm0[0]
+; FMA3_512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; FMA3_512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm6[0],xmm1[0]
+; FMA3_512-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
+; FMA3_512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; FMA3_512-NEXT: retq
+;
+; FMA4-LABEL: buildvector_mul_subadd_pd512:
+; FMA4: # %bb.0: # %bb
+; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
+; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
+; FMA4-NEXT: vaddsd %xmm4, %xmm0, %xmm9
+; FMA4-NEXT: vextractf128 $1, %ymm0, %xmm3
+; FMA4-NEXT: vextractf128 $1, %ymm4, %xmm6
+; FMA4-NEXT: vaddsd %xmm6, %xmm3, %xmm7
+; FMA4-NEXT: vaddsd %xmm5, %xmm1, %xmm8
+; FMA4-NEXT: vextractf128 $1, %ymm1, %xmm1
+; FMA4-NEXT: vextractf128 $1, %ymm5, %xmm5
+; FMA4-NEXT: vaddsd %xmm5, %xmm1, %xmm2
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; FMA4-NEXT: vsubsd %xmm4, %xmm0, %xmm0
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm4 = xmm6[1,0]
+; FMA4-NEXT: vsubsd %xmm4, %xmm3, %xmm3
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; FMA4-NEXT: vpermilpd {{.*#+}} xmm4 = xmm5[1,0]
+; FMA4-NEXT: vsubsd %xmm4, %xmm1, %xmm1
+; FMA4-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm7[0],xmm3[0]
+; FMA4-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm9[0],xmm0[0]
+; FMA4-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; FMA4-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; FMA4-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
+; FMA4-NEXT: retq
+bb:
+ %A = fmul <8 x double> %C, %D
+ %A0 = extractelement <8 x double> %A, i32 0
+ %B0 = extractelement <8 x double> %B, i32 0
+ %sub0 = fadd double %A0, %B0
+ %A2 = extractelement <8 x double> %A, i32 2
+ %B2 = extractelement <8 x double> %B, i32 2
+ %sub2 = fadd double %A2, %B2
+ %A4 = extractelement <8 x double> %A, i32 4
+ %B4 = extractelement <8 x double> %B, i32 4
+ %sub4 = fadd double %A4, %B4
+ %A6 = extractelement <8 x double> %A, i32 6
+ %B6 = extractelement <8 x double> %B, i32 6
+ %sub6 = fadd double %A6, %B6
+ %A1 = extractelement <8 x double> %A, i32 1
+ %B1 = extractelement <8 x double> %B, i32 1
+ %add1 = fsub double %A1, %B1
+ %A3 = extractelement <8 x double> %A, i32 3
+ %B3 = extractelement <8 x double> %B, i32 3
+ %add3 = fsub double %A3, %B3
+ %A7 = extractelement <8 x double> %A, i32 7
+ %B7 = extractelement <8 x double> %B, i32 7
+ %add7 = fsub double %A7, %B7
+ %vecinsert1 = insertelement <8 x double> undef, double %sub0, i32 0
+ %vecinsert2 = insertelement <8 x double> %vecinsert1, double %add1, i32 1
+ %vecinsert3 = insertelement <8 x double> %vecinsert2, double %sub2, i32 2
+ %vecinsert4 = insertelement <8 x double> %vecinsert3, double %add3, i32 3
+ %vecinsert5 = insertelement <8 x double> %vecinsert4, double %sub4, i32 4
+ ; element 5 is undef
+ %vecinsert7 = insertelement <8 x double> %vecinsert5, double %sub6, i32 6
+ %vecinsert8 = insertelement <8 x double> %vecinsert7, double %add7, i32 7
+ ret <8 x double> %vecinsert8
+}
+
attributes #0 = { nounwind "unsafe-fp-math"="true" }
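
The buildvector tests above lean on the #0 attribute ("unsafe-fp-math"="true") rather than per-instruction fast-math flags to license FMADDSUB/FMSUBADD selection; the shape being matched is a vector multiply whose even lanes then subtract (addsub) or add (subadd) a third operand while the odd lanes do the opposite, with the lanes reassembled via insertelement. A minimal two-lane sketch of the addsub form, written here purely for illustration and not part of the committed test file, looks like this; on an FMA-capable target with the same unsafe-fp-math setting it is expected to collapse to a single vfmaddsub:

; illustrative sketch, not from the test file:
;   even lane = (x*y) - z, odd lane = (x*y) + z
define <2 x double> @addsub_sketch(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
  %m  = fmul <2 x double> %x, %y
  %m0 = extractelement <2 x double> %m, i32 0
  %z0 = extractelement <2 x double> %z, i32 0
  %e0 = fsub double %m0, %z0                      ; even lane: subtract
  %m1 = extractelement <2 x double> %m, i32 1
  %z1 = extractelement <2 x double> %z, i32 1
  %e1 = fadd double %m1, %z1                      ; odd lane: add
  %r0 = insertelement <2 x double> undef, double %e0, i32 0
  %r1 = insertelement <2 x double> %r0, double %e1, i32 1
  ret <2 x double> %r1
}
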
diff --git a/test/CodeGen/X86/fmf-flags.ll b/test/CodeGen/X86/fmf-flags.ll
new file mode 100644
index 000000000000..00c73c1ffb86
--- /dev/null
+++ b/test/CodeGen/X86/fmf-flags.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+declare float @llvm.sqrt.f32(float %x)
+
+define float @fast_recip_sqrt(float %x) {
+; X64-LABEL: fast_recip_sqrt:
+; X64: # %bb.0:
+; X64-NEXT: sqrtss %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: divss %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: fast_recip_sqrt:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fsqrt
+; X86-NEXT: fld1
+; X86-NEXT: fdivp %st(1)
+; X86-NEXT: retl
+ %y = call fast float @llvm.sqrt.f32(float %x)
+ %z = fdiv fast float 1.0, %y
+ ret float %z
+}
+
+declare float @llvm.fmuladd.f32(float %a, float %b, float %c)
+
+define float @fast_fmuladd_opts(float %a, float %b, float %c) {
+; X64-LABEL: fast_fmuladd_opts:
+; X64: # %bb.0:
+; X64-NEXT: movaps %xmm0, %xmm1
+; X64-NEXT: addss %xmm1, %xmm1
+; X64-NEXT: addss %xmm0, %xmm1
+; X64-NEXT: movaps %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X86-LABEL: fast_fmuladd_opts:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fld %st(0)
+; X86-NEXT: fadd %st(1)
+; X86-NEXT: faddp %st(1)
+; X86-NEXT: retl
+ %res = call fast float @llvm.fmuladd.f32(float %a, float 2.0, float %a)
+ ret float %res
+}
+
+; The multiply is strict.
+
+@mul1 = common global double 0.000000e+00, align 4
+
+define double @not_so_fast_mul_add(double %x) {
+; X64-LABEL: not_so_fast_mul_add:
+; X64: # %bb.0:
+; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: mulsd %xmm0, %xmm1
+; X64-NEXT: addsd %xmm1, %xmm0
+; X64-NEXT: movsd %xmm1, {{.*}}(%rip)
+; X64-NEXT: retq
+;
+; X86-LABEL: not_so_fast_mul_add:
+; X86: # %bb.0:
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
+; X86-NEXT: fld %st(0)
+; X86-NEXT: fmull {{\.LCPI.*}}
+; X86-NEXT: fadd %st(0), %st(1)
+; X86-NEXT: fstpl mul1
+; X86-NEXT: retl
+ %m = fmul double %x, 4.2
+ %a = fadd fast double %m, %x
+ store double %m, double* @mul1, align 4
+ ret double %a
+}
+
+; The sqrt is strict.
+
+@sqrt1 = common global float 0.000000e+00, align 4
+
+define float @not_so_fast_recip_sqrt(float %x) {
+; X64-LABEL: not_so_fast_recip_sqrt:
+; X64: # %bb.0:
+; X64-NEXT: sqrtss %xmm0, %xmm1
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: divss %xmm1, %xmm0
+; X64-NEXT: movss %xmm1, {{.*}}(%rip)
+; X64-NEXT: retq
+;
+; X86-LABEL: not_so_fast_recip_sqrt:
+; X86: # %bb.0:
+; X86-NEXT: flds {{[0-9]+}}(%esp)
+; X86-NEXT: fsqrt
+; X86-NEXT: fld1
+; X86-NEXT: fdiv %st(1)
+; X86-NEXT: fxch %st(1)
+; X86-NEXT: fstps sqrt1
+; X86-NEXT: retl
+ %y = call float @llvm.sqrt.f32(float %x)
+ %z = fdiv fast float 1.0, %y
+ store float %y, float* @sqrt1, align 4
+ %ret = fadd float %z, 14.5
+ ret float %z
+}
+
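
The new fmf-flags.ll file exercises per-instruction fast-math flags: an operation marked fast may be relaxed, but a strict neighbor whose result is still observable (the %m stored to @mul1, the %y stored to @sqrt1) must be computed exactly. As a purely illustrative sketch of that contract, assuming nothing beyond what the tests above already rely on:

; illustrative sketch, not part of the committed test: the strict fmul feeds a
; store, so it must remain a real multiply; only the fast fadd may be
; reassociated or contracted by the backend.
@acc = global double 0.000000e+00

define double @strict_mul_fast_add(double %x, double %y) {
  %p = fmul double %x, %y        ; no flags: strict
  store double %p, double* @acc
  %s = fadd fast double %p, %x   ; fast: relaxed
  ret double %s
}
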
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
index 338a95f6a80c..814d61e22382 100644
--- a/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -7,7 +7,7 @@
define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd128:
-; FMA3_256: # BB#0: # %entry
+; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; FMA3_256-NEXT: vsubpd %xmm2, %xmm0, %xmm1
; FMA3_256-NEXT: vaddpd %xmm2, %xmm0, %xmm0
@@ -15,7 +15,7 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x dou
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd128:
-; FMA3_512: # BB#0: # %entry
+; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; FMA3_512-NEXT: vsubpd %xmm2, %xmm0, %xmm1
; FMA3_512-NEXT: vaddpd %xmm2, %xmm0, %xmm0
@@ -23,7 +23,7 @@ define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x dou
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd128:
-; FMA4: # BB#0: # %entry
+; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vsubpd %xmm2, %xmm0, %xmm1
; FMA4-NEXT: vaddpd %xmm2, %xmm0, %xmm0
@@ -39,7 +39,7 @@ entry:
define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps128:
-; FMA3: # BB#0: # %entry
+; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA3-NEXT: vsubps %xmm2, %xmm0, %xmm1
; FMA3-NEXT: vaddps %xmm2, %xmm0, %xmm0
@@ -47,7 +47,7 @@ define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float>
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps128:
-; FMA4: # BB#0: # %entry
+; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulps %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vsubps %xmm2, %xmm0, %xmm1
; FMA4-NEXT: vaddps %xmm2, %xmm0, %xmm0
@@ -63,7 +63,7 @@ entry:
define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
; FMA3-LABEL: mul_subadd_pd256:
-; FMA3: # BB#0: # %entry
+; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; FMA3-NEXT: vsubpd %ymm2, %ymm0, %ymm1
; FMA3-NEXT: vaddpd %ymm2, %ymm0, %ymm0
@@ -71,7 +71,7 @@ define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x dou
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd256:
-; FMA4: # BB#0: # %entry
+; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; FMA4-NEXT: vsubpd %ymm2, %ymm0, %ymm1
; FMA4-NEXT: vaddpd %ymm2, %ymm0, %ymm0
@@ -87,7 +87,7 @@ entry:
define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
; FMA3-LABEL: mul_subadd_ps256:
-; FMA3: # BB#0: # %entry
+; FMA3: # %bb.0: # %entry
; FMA3-NEXT: vmulps %ymm1, %ymm0, %ymm0
; FMA3-NEXT: vsubps %ymm2, %ymm0, %ymm1
; FMA3-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -95,7 +95,7 @@ define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float>
; FMA3-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps256:
-; FMA4: # BB#0: # %entry
+; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulps %ymm1, %ymm0, %ymm0
; FMA4-NEXT: vsubps %ymm2, %ymm0, %ymm1
; FMA4-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -111,7 +111,7 @@ entry:
define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
; FMA3_256-LABEL: mul_subadd_pd512:
-; FMA3_256: # BB#0: # %entry
+; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; FMA3_256-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA3_256-NEXT: vsubpd %ymm5, %ymm1, %ymm2
@@ -123,7 +123,7 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_pd512:
-; FMA3_512: # BB#0: # %entry
+; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vmulpd %zmm1, %zmm0, %zmm0
; FMA3_512-NEXT: vsubpd %zmm2, %zmm0, %zmm1
; FMA3_512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
@@ -131,7 +131,7 @@ define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x dou
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_pd512:
-; FMA4: # BB#0: # %entry
+; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1
; FMA4-NEXT: vsubpd %ymm5, %ymm1, %ymm2
@@ -151,7 +151,7 @@ entry:
define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
; FMA3_256-LABEL: mul_subadd_ps512:
-; FMA3_256: # BB#0: # %entry
+; FMA3_256: # %bb.0: # %entry
; FMA3_256-NEXT: vmulps %ymm2, %ymm0, %ymm0
; FMA3_256-NEXT: vmulps %ymm3, %ymm1, %ymm1
; FMA3_256-NEXT: vsubps %ymm5, %ymm1, %ymm2
@@ -163,7 +163,7 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl
; FMA3_256-NEXT: retq
;
; FMA3_512-LABEL: mul_subadd_ps512:
-; FMA3_512: # BB#0: # %entry
+; FMA3_512: # %bb.0: # %entry
; FMA3_512-NEXT: vmulps %zmm1, %zmm0, %zmm1
; FMA3_512-NEXT: vaddps %zmm2, %zmm1, %zmm0
; FMA3_512-NEXT: movw $-21846, %ax # imm = 0xAAAA
@@ -172,7 +172,7 @@ define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x fl
; FMA3_512-NEXT: retq
;
; FMA4-LABEL: mul_subadd_ps512:
-; FMA4: # BB#0: # %entry
+; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmulps %ymm2, %ymm0, %ymm0
; FMA4-NEXT: vmulps %ymm3, %ymm1, %ymm1
; FMA4-NEXT: vsubps %ymm5, %ymm1, %ymm2
diff --git a/test/CodeGen/X86/fmul-combines.ll b/test/CodeGen/X86/fmul-combines.ll
index 564ce42fdb75..ff701aba43bc 100644
--- a/test/CodeGen/X86/fmul-combines.ll
+++ b/test/CodeGen/X86/fmul-combines.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
; CHECK-LABEL: fmul2_f32:
; CHECK: addss %xmm0, %xmm0
diff --git a/test/CodeGen/X86/fmul-zero.ll b/test/CodeGen/X86/fmul-zero.ll
index bc139f88534f..32fe1b0d91e4 100644
--- a/test/CodeGen/X86/fmul-zero.ll
+++ b/test/CodeGen/X86/fmul-zero.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -enable-unsafe-fp-math | not grep mulps
-; RUN: llc < %s -march=x86-64 | grep mulps
+; RUN: llc < %s -mtriple=x86_64-- -enable-unsafe-fp-math | not grep mulps
+; RUN: llc < %s -mtriple=x86_64-- | grep mulps
define void @test14(<4 x float>*) nounwind {
load <4 x float>, <4 x float>* %0, align 1
diff --git a/test/CodeGen/X86/fold-add.ll b/test/CodeGen/X86/fold-add.ll
index 7d2740074082..e2d3b0588fe2 100644
--- a/test/CodeGen/X86/fold-add.ll
+++ b/test/CodeGen/X86/fold-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9.6"
diff --git a/test/CodeGen/X86/fold-and-shift.ll b/test/CodeGen/X86/fold-and-shift.ll
index 00173efff69b..5de4acf10190 100644
--- a/test/CodeGen/X86/fold-and-shift.ll
+++ b/test/CodeGen/X86/fold-and-shift.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i32 @t1(i8* %X, i32 %i) {
; CHECK-LABEL: t1:
diff --git a/test/CodeGen/X86/fold-call.ll b/test/CodeGen/X86/fold-call.ll
index 00839943f678..a50b47481684 100644
--- a/test/CodeGen/X86/fold-call.ll
+++ b/test/CodeGen/X86/fold-call.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK: test1
; CHECK-NOT: mov
diff --git a/test/CodeGen/X86/fold-imm.ll b/test/CodeGen/X86/fold-imm.ll
index 16e4786979b9..024c016c6c2c 100644
--- a/test/CodeGen/X86/fold-imm.ll
+++ b/test/CodeGen/X86/fold-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i32 @test(i32 %X) nounwind {
entry:
diff --git a/test/CodeGen/X86/fold-load-binops.ll b/test/CodeGen/X86/fold-load-binops.ll
index 4662a1521a38..2d4fc723baa3 100644
--- a/test/CodeGen/X86/fold-load-binops.ll
+++ b/test/CodeGen/X86/fold-load-binops.ll
@@ -9,12 +9,12 @@
define <4 x float> @addss(<4 x float> %va, float* %pb) {
; SSE-LABEL: addss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: addss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <4 x float> %va, i32 0
@@ -26,12 +26,12 @@ define <4 x float> @addss(<4 x float> %va, float* %pb) {
define <2 x double> @addsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: addsd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: addsd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <2 x double> %va, i32 0
@@ -43,12 +43,12 @@ define <2 x double> @addsd(<2 x double> %va, double* %pb) {
define <4 x float> @subss(<4 x float> %va, float* %pb) {
; SSE-LABEL: subss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: subss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <4 x float> %va, i32 0
@@ -60,12 +60,12 @@ define <4 x float> @subss(<4 x float> %va, float* %pb) {
define <2 x double> @subsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: subsd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: subsd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <2 x double> %va, i32 0
@@ -77,12 +77,12 @@ define <2 x double> @subsd(<2 x double> %va, double* %pb) {
define <4 x float> @mulss(<4 x float> %va, float* %pb) {
; SSE-LABEL: mulss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <4 x float> %va, i32 0
@@ -94,12 +94,12 @@ define <4 x float> @mulss(<4 x float> %va, float* %pb) {
define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: mulsd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mulsd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <2 x double> %va, i32 0
@@ -111,12 +111,12 @@ define <2 x double> @mulsd(<2 x double> %va, double* %pb) {
define <4 x float> @divss(<4 x float> %va, float* %pb) {
; SSE-LABEL: divss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: divss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <4 x float> %va, i32 0
@@ -128,12 +128,12 @@ define <4 x float> @divss(<4 x float> %va, float* %pb) {
define <2 x double> @divsd(<2 x double> %va, double* %pb) {
; SSE-LABEL: divsd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: divsd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = extractelement <2 x double> %va, i32 0
diff --git a/test/CodeGen/X86/fold-load-unops.ll b/test/CodeGen/X86/fold-load-unops.ll
index d2b03dde8319..7feb66525e29 100644
--- a/test/CodeGen/X86/fold-load-unops.ll
+++ b/test/CodeGen/X86/fold-load-unops.ll
@@ -1,19 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX
; Verify we fold loads into unary sse intrinsics only when optimizing for size
define float @rcpss(float* %a) {
; SSE-LABEL: rcpss:
-; SSE: # BB#0:
-; SSE-NEXT: movss (%rdi), %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rcpss:
-; AVX: # BB#0:
-; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
@@ -25,14 +26,14 @@ define float @rcpss(float* %a) {
define float @rsqrtss(float* %a) {
; SSE-LABEL: rsqrtss:
-; SSE: # BB#0:
-; SSE-NEXT: movss (%rdi), %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: rsqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rsqrtss:
-; AVX: # BB#0:
-; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
@@ -44,14 +45,14 @@ define float @rsqrtss(float* %a) {
define float @sqrtss(float* %a) {
; SSE-LABEL: sqrtss:
-; SSE: # BB#0:
-; SSE-NEXT: movss (%rdi), %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtss:
-; AVX: # BB#0:
-; AVX-NEXT: vmovss (%rdi), %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
@@ -63,14 +64,14 @@ define float @sqrtss(float* %a) {
define double @sqrtsd(double* %a) {
; SSE-LABEL: sqrtsd:
-; SSE: # BB#0:
-; SSE-NEXT: movsd (%rdi), %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtsd:
-; AVX: # BB#0:
-; AVX-NEXT: vmovsd (%rdi), %xmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load double, double* %a
@@ -82,12 +83,12 @@ define double @sqrtsd(double* %a) {
define float @rcpss_size(float* %a) optsize {
; SSE-LABEL: rcpss_size:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rcpss_size:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
@@ -97,14 +98,29 @@ define float @rcpss_size(float* %a) optsize {
ret float %ext
}
+define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
+; SSE-LABEL: rcpss_full_size:
+; SSE: # %bb.0:
+; SSE-NEXT: rcpss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: rcpss_full_size:
+; AVX: # %bb.0:
+; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x float>, <4 x float>* %a
+ %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
+ ret <4 x float> %res
+}
+
define float @rsqrtss_size(float* %a) optsize {
; SSE-LABEL: rsqrtss_size:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rsqrtss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rsqrtss_size:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
@@ -114,14 +130,29 @@ define float @rsqrtss_size(float* %a) optsize {
ret float %ext
}
+define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
+; SSE-LABEL: rsqrtss_full_size:
+; SSE: # %bb.0:
+; SSE-NEXT: rsqrtss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: rsqrtss_full_size:
+; AVX: # %bb.0:
+; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x float>, <4 x float>* %a
+ %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
+ ret <4 x float> %res
+}
+
define float @sqrtss_size(float* %a) optsize{
; SSE-LABEL: sqrtss_size:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtss_size:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load float, float* %a
@@ -131,14 +162,29 @@ define float @sqrtss_size(float* %a) optsize{
ret float %ext
}
+define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
+; SSE-LABEL: sqrtss_full_size:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtss (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtss_full_size:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <4 x float>, <4 x float>* %a
+ %res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
+ ret <4 x float> %res
+}
+
define double @sqrtsd_size(double* %a) optsize {
; SSE-LABEL: sqrtsd_size:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtsd_size:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load double, double* %a
@@ -148,6 +194,21 @@ define double @sqrtsd_size(double* %a) optsize {
ret double %ext
}
+define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
+; SSE-LABEL: sqrtsd_full_size:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtsd (%rdi), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtsd_full_size:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %ld = load <2 x double>, <2 x double>* %a
+ %res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)
+ ret <2 x double> %res
+}
+
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index 657b7bdd24ff..db28156ab946 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
; rdar://12721174
; We should not fold movss into pshufd since pshufd expects m128 while movss
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index 8cdc58bb75e0..5ae46e2c2e2a 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- | FileCheck %s
%struct._obstack_chunk = type { i8*, %struct._obstack_chunk*, [4 x i8] }
%struct.obstack = type { i32, %struct._obstack_chunk*, i8*, i8*, i8*, i32, i32, %struct._obstack_chunk* (...)*, void (...)*, i8*, i8 }
@stmt_obstack = external global %struct.obstack ; <%struct.obstack*> [#uses=1]
diff --git a/test/CodeGen/X86/fold-mul-lohi.ll b/test/CodeGen/X86/fold-mul-lohi.ll
index 8d4c5ef9eb22..f0f6849a32fd 100644
--- a/test/CodeGen/X86/fold-mul-lohi.ll
+++ b/test/CodeGen/X86/fold-mul-lohi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; CHECK-NOT: lea
diff --git a/test/CodeGen/X86/fold-pcmpeqd-1.ll b/test/CodeGen/X86/fold-pcmpeqd-1.ll
index 663e2afe22c7..0666d8909564 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-1.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2,-avx | FileCheck %s
define <2 x double> @foo() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 -1, i64 -1> to <2 x double>)
diff --git a/test/CodeGen/X86/fold-push.ll b/test/CodeGen/X86/fold-push.ll
index 9d3afd1c449b..c887b835aabd 100644
--- a/test/CodeGen/X86/fold-push.ll
+++ b/test/CodeGen/X86/fold-push.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=CHECK -check-prefix=NORMAL
-; RUN: llc < %s -mtriple=i686-windows -mattr=call-reg-indirect | FileCheck %s -check-prefix=CHECK -check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-windows -mattr=slow-two-mem-ops | FileCheck %s -check-prefix=CHECK -check-prefix=SLM
declare void @foo(i32 %r)
diff --git a/test/CodeGen/X86/fold-rmw-ops.ll b/test/CodeGen/X86/fold-rmw-ops.ll
new file mode 100644
index 000000000000..bb89d4b54ea4
--- /dev/null
+++ b/test/CodeGen/X86/fold-rmw-ops.ll
@@ -0,0 +1,2439 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -show-mc-encoding | FileCheck %s
+
+target triple = "x86_64-unknown-unknown"
+
+@g64 = external global i64, align 8
+@g32 = external global i32, align 4
+@g16 = external global i16, align 2
+@g8 = external global i8, align 1
+
+declare void @a()
+declare void @b()
+
+define void @add64_imm32_br() nounwind {
+; CHECK-LABEL: add64_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq $16777214, {{.*}}(%rip) # encoding: [0x48,0x81,0x05,A,A,A,A,0xfe,0xff,0xff,0x00]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0xFFFFFE
+; CHECK-NEXT: js .LBB0_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB0_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB0_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Add 0x00FFFFFE, a positive immediate requiring 24 bits.
+ %add = add i64 %load1, 16777214
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_sext_imm32_br() nounwind {
+; CHECK-LABEL: add64_sext_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq $-2147483648, {{.*}}(%rip) # encoding: [0x48,0x81,0x05,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: js .LBB1_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB1_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB1_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Add -0x80000000, which requires sign-extended 32 bits.
+ %add = add i64 %load1, -2147483648
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_imm32_via_sub_br() nounwind {
+; CHECK-LABEL: add64_imm32_via_sub_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $-2147483648, {{.*}}(%rip) # encoding: [0x48,0x81,0x2d,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: js .LBB2_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB2_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB2_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Add 0x80000000, which cannot fit in a sign extended 32-bit immediate. This
+ ; gets folded because we can instead subtract -0x80000000.
+ %add = add i64 %load1, 2147483648
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_no_imm32_via_sub_due_to_cf_br() nounwind {
+; CHECK-LABEL: add64_no_imm32_via_sub_due_to_cf_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: addq %rax, {{.*}}(%rip) # encoding: [0x48,0x01,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: jae .LBB3_2 # encoding: [0x73,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB3_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB3_2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Add 0x80000000, which cannot fit in a sign extended 32-bit immediate, but
+ ; could in theory be folded into an immediate operand of a sub. However, we
+ ; use the CF flag here and so shouldn't make that transformation.
+ %add = add i64 %load1, 2147483648
+ store i64 %add, i64* @g64
+ %cond = icmp ult i64 %add, 2147483648
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_too_large_imm32_br() nounwind {
+; CHECK-LABEL: add64_too_large_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $2147483649, %eax # encoding: [0xb8,0x01,0x00,0x00,0x80]
+; CHECK-NEXT: # imm = 0x80000001
+; CHECK-NEXT: addq %rax, {{.*}}(%rip) # encoding: [0x48,0x01,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB4_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB4_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB4_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Add 0x80000001, which cannot fit in a sign extended 32-bit immediate. This
+ ; should not get folded into an immediate.
+ %add = add i64 %load1, 2147483649
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_imm8_via_sub_br() nounwind {
+; CHECK-LABEL: add64_imm8_via_sub_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $-128, {{.*}}(%rip) # encoding: [0x48,0x83,0x2d,A,A,A,A,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB5_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB5_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB5_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Add 0x80, which doesn't fit in a sign-extended imm8, but which does fit if we
+ ; convert to a sub and negate the value.
+ %add = add i64 %load1, 128
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_imm8_br() nounwind {
+; CHECK-LABEL: add64_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq $42, {{.*}}(%rip) # encoding: [0x48,0x83,0x05,A,A,A,A,0x2a]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB6_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB6_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB6_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
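+ ; Add 42, which fits in a sign-extended 8-bit immediate.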
+ %add = add i64 %load1, 42
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_imm8_neg_br() nounwind {
+; CHECK-LABEL: add64_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq $-42, {{.*}}(%rip) # encoding: [0x48,0x83,0x05,A,A,A,A,0xd6]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB7_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB7_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB7_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
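+ ; Add -42, which fits in a sign-extended 8-bit immediate.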
+ %add = add i64 %load1, -42
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add32_imm_br() nounwind {
+; CHECK-LABEL: add32_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl $-2147483648, {{.*}}(%rip) # encoding: [0x81,0x05,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: js .LBB8_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB8_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB8_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ ; Add 0x80000000, a positive number requiring 32 bits of immediate.
+ %add = add i32 %load1, 2147483648
+ store i32 %add, i32* @g32
+ %cond = icmp slt i32 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add32_imm8_br() nounwind {
+; CHECK-LABEL: add32_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl $42, {{.*}}(%rip) # encoding: [0x83,0x05,A,A,A,A,0x2a]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB9_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB9_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB9_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
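+ ; Add 42, which fits in a sign-extended 8-bit immediate.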
+ %add = add i32 %load1, 42
+ store i32 %add, i32* @g32
+ %cond = icmp slt i32 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add32_imm8_neg_br() nounwind {
+; CHECK-LABEL: add32_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl $-42, {{.*}}(%rip) # encoding: [0x83,0x05,A,A,A,A,0xd6]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB10_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB10_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB10_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
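+ ; Add -42, which fits in a sign-extended 8-bit immediate.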
+ %add = add i32 %load1, -42
+ store i32 %add, i32* @g32
+ %cond = icmp slt i32 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add16_imm_br() nounwind {
+; CHECK-LABEL: add16_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addw $-32768, {{.*}}(%rip) # encoding: [0x66,0x81,0x05,A,A,A,A,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x8000
+; CHECK-NEXT: js .LBB11_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB11_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB11_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ ; Add 0x8000, a positive number requiring 16 bits of immediate.
+ %add = add i16 %load1, 32768
+ store i16 %add, i16* @g16
+ %cond = icmp slt i16 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add16_imm8_br() nounwind {
+; CHECK-LABEL: add16_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addw $42, {{.*}}(%rip) # encoding: [0x66,0x83,0x05,A,A,A,A,0x2a]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB12_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB12_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB12_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
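+ ; Add 42, which fits in a sign-extended 8-bit immediate.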
+ %add = add i16 %load1, 42
+ store i16 %add, i16* @g16
+ %cond = icmp slt i16 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add16_imm8_neg_br() nounwind {
+; CHECK-LABEL: add16_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addw $-42, {{.*}}(%rip) # encoding: [0x66,0x83,0x05,A,A,A,A,0xd6]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB13_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB13_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB13_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
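+ ; Add -42, which fits in a sign-extended 8-bit immediate.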
+ %add = add i16 %load1, -42
+ store i16 %add, i16* @g16
+ %cond = icmp slt i16 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add8_imm_br() nounwind {
+; CHECK-LABEL: add8_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addb $-2, {{.*}}(%rip) # encoding: [0x80,0x05,A,A,A,A,0xfe]
+; CHECK-NEXT: # fixup A - offset: 2, value: g8-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB14_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB14_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB14_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
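+ ; Add -2, which fits in an 8-bit immediate.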
+ %add = add i8 %load1, -2
+ store i8 %add, i8* @g8
+ %cond = icmp slt i8 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: add64_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addq %rdi, {{.*}}(%rip) # encoding: [0x48,0x01,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB15_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB15_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB15_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %add = add i64 %load1, %arg
+ store i64 %add, i64* @g64
+ %cond = icmp slt i64 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: add32_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl %edi, {{.*}}(%rip) # encoding: [0x01,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB16_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB16_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB16_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %add = add i32 %load1, %arg
+ store i32 %add, i32* @g32
+ %cond = icmp slt i32 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: add16_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addw %di, {{.*}}(%rip) # encoding: [0x66,0x01,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB17_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB17_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB17_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %add = add i16 %load1, %arg
+ store i16 %add, i16* @g16
+ %cond = icmp slt i16 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @add8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: add8_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addb %dil, {{.*}}(%rip) # encoding: [0x40,0x00,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g8-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB18_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB18_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB18_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %add = add i8 %load1, %arg
+ store i8 %add, i8* @g8
+ %cond = icmp slt i8 %add, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub64_imm32_br() nounwind {
+; CHECK-LABEL: sub64_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $-2147483648, {{.*}}(%rip) # encoding: [0x48,0x81,0x2d,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: js .LBB19_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB19_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB19_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Subtract -0x80000000, which can't be negated into a sign-extended 32-bit
+ ; immediate, so we have to select a sub here.
+ %sub = sub i64 %load1, -2147483648
+ store i64 %sub, i64* @g64
+ %cond = icmp slt i64 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub64_too_large_imm32_br() nounwind {
+; CHECK-LABEL: sub64_too_large_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movabsq $-4294967295, %rax # encoding: [0x48,0xb8,0x01,0x00,0x00,0x00,0xff,0xff,0xff,0xff]
+; CHECK-NEXT: # imm = 0xFFFFFFFF00000001
+; CHECK-NEXT: addq %rax, {{.*}}(%rip) # encoding: [0x48,0x01,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB20_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB20_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB20_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Subtract 0xFFFFFFFF, which cannot fit in a sign-extended 32-bit immediate,
+ ; even when negated and sign-extended as an add.
+ %sub = sub i64 %load1, 4294967295
+ store i64 %sub, i64* @g64
+ %cond = icmp slt i64 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub64_imm8_br() nounwind {
+; CHECK-LABEL: sub64_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq $-128, {{.*}}(%rip) # encoding: [0x48,0x83,0x2d,A,A,A,A,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB21_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB21_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB21_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Subtract -0x80, which can be done with an 8-bit immediate but only as
+ ; a subtract where that immediate can be negative.
+ %sub = sub i64 %load1, -128
+ store i64 %sub, i64* @g64
+ %cond = icmp slt i64 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub32_imm_br() nounwind {
+; CHECK-LABEL: sub32_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addl $-2147483648, {{.*}}(%rip) # encoding: [0x81,0x05,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: js .LBB22_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB22_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB22_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ ; Subtract -0x80000000, which requires 32 bits of immediate but still gets
+ ; lowered as an add.
+ %sub = sub i32 %load1, -2147483648
+ store i32 %sub, i32* @g32
+ %cond = icmp slt i32 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub32_imm8_br() nounwind {
+; CHECK-LABEL: sub32_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subl $-128, {{.*}}(%rip) # encoding: [0x83,0x2d,A,A,A,A,0x80]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB23_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB23_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB23_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ ; Subtract -0x80, which can be done with an 8-bit immediate but only as
+ ; a subtract where that immediate can be negative.
+ %sub = sub i32 %load1, -128
+ store i32 %sub, i32* @g32
+ %cond = icmp slt i32 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub16_imm_br() nounwind {
+; CHECK-LABEL: sub16_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addw $-32768, {{.*}}(%rip) # encoding: [0x66,0x81,0x05,A,A,A,A,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x8000
+; CHECK-NEXT: js .LBB24_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB24_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB24_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ ; Subtract -0x8000, which requires 16 bits of immediate but still gets
+ ; lowered as an add.
+ %sub = sub i16 %load1, -32768
+ store i16 %sub, i16* @g16
+ %cond = icmp slt i16 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub16_imm8_br() nounwind {
+; CHECK-LABEL: sub16_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subw $-128, {{.*}}(%rip) # encoding: [0x66,0x83,0x2d,A,A,A,A,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB25_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB25_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB25_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ ; Subtract -0x80, which can be done with an 8-bit immediate but only as
+ ; a subtract where that immediate can be negative.
+ %sub = sub i16 %load1, -128
+ store i16 %sub, i16* @g16
+ %cond = icmp slt i16 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub8_imm_br() nounwind {
+; CHECK-LABEL: sub8_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addb $-128, {{.*}}(%rip) # encoding: [0x80,0x05,A,A,A,A,0x80]
+; CHECK-NEXT: # fixup A - offset: 2, value: g8-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB26_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB26_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB26_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ ; Subtract -0x80, which requires an 8-bit immediate but still gets lowered as
+ ; an add.
+ %sub = sub i8 %load1, -128
+ store i8 %sub, i8* @g8
+ %cond = icmp slt i8 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: sub64_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subq %rdi, {{.*}}(%rip) # encoding: [0x48,0x29,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB27_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB27_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB27_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %sub = sub i64 %load1, %arg
+ store i64 %sub, i64* @g64
+ %cond = icmp slt i64 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: sub32_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subl %edi, {{.*}}(%rip) # encoding: [0x29,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB28_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB28_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB28_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %sub = sub i32 %load1, %arg
+ store i32 %sub, i32* @g32
+ %cond = icmp slt i32 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: sub16_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subw %di, {{.*}}(%rip) # encoding: [0x66,0x29,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB29_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB29_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB29_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %sub = sub i16 %load1, %arg
+ store i16 %sub, i16* @g16
+ %cond = icmp slt i16 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @sub8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: sub8_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: subb %dil, {{.*}}(%rip) # encoding: [0x40,0x28,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g8-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: js .LBB30_1 # encoding: [0x78,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB30_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB30_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %sub = sub i8 %load1, %arg
+ store i8 %sub, i8* @g8
+ %cond = icmp slt i8 %sub, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and64_imm32_br() nounwind {
+; CHECK-LABEL: and64_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andq $16777215, {{.*}}(%rip) # encoding: [0x48,0x81,0x25,A,A,A,A,0xff,0xff,0xff,0x00]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0xFFFFFF
+; CHECK-NEXT: je .LBB31_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB31_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB31_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; And 0x00FFFFFF, a positive immediate requiring 24 bits.
+ %and = and i64 %load1, 16777215
+ store i64 %and, i64* @g64
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and64_sext_imm32_br() nounwind {
+; CHECK-LABEL: and64_sext_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andq $-2147483648, {{.*}}(%rip) # encoding: [0x48,0x81,0x25,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: je .LBB32_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB32_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB32_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; And -0x80000000, which requires sign-extended 32 bits.
+ %and = and i64 %load1, -2147483648
+ store i64 %and, i64* @g64
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and64_imm8_br() nounwind {
+; CHECK-LABEL: and64_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andq $15, {{.*}}(%rip) # encoding: [0x48,0x83,0x25,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB33_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB33_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB33_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
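+ ; And 15, which fits in a sign-extended 8-bit immediate.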
+ %and = and i64 %load1, 15
+ store i64 %and, i64* @g64
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and64_imm8_neg_br() nounwind {
+; CHECK-LABEL: and64_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andq $-4, {{.*}}(%rip) # encoding: [0x48,0x83,0x25,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB34_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB34_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB34_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
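+ ; And -4, which fits in a sign-extended 8-bit immediate.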
+ %and = and i64 %load1, -4
+ store i64 %and, i64* @g64
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and32_imm_br() nounwind {
+; CHECK-LABEL: and32_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $-2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: andl {{.*}}(%rip), %eax # encoding: [0x23,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: movl %eax, {{.*}}(%rip) # encoding: [0x89,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: jne .LBB35_2 # encoding: [0x75,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB35_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB35_2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ ; And 0x80000000, a positive number requiring 32 bits of immediate.
+ %and = and i32 %load1, 2147483648
+ store i32 %and, i32* @g32
+ %cond = icmp eq i32 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and32_imm8_br() nounwind {
+; CHECK-LABEL: and32_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andl $15, {{.*}}(%rip) # encoding: [0x83,0x25,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB36_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB36_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB36_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %and = and i32 %load1, 15
+ store i32 %and, i32* @g32
+ %cond = icmp eq i32 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and32_imm8_neg_br() nounwind {
+; CHECK-LABEL: and32_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andl $-4, {{.*}}(%rip) # encoding: [0x83,0x25,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB37_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB37_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB37_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %and = and i32 %load1, -4
+ store i32 %and, i32* @g32
+ %cond = icmp eq i32 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and16_imm_br() nounwind {
+; CHECK-LABEL: and16_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzwl {{.*}}(%rip), %eax # encoding: [0x0f,0xb7,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
+; CHECK-NEXT: # imm = 0x8000
+; CHECK-NEXT: movw %ax, {{.*}}(%rip) # encoding: [0x66,0x89,0x05,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: testw %ax, %ax # encoding: [0x66,0x85,0xc0]
+; CHECK-NEXT: jne .LBB38_2 # encoding: [0x75,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB38_2-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB38_2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
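+ ; And 0x8000, a positive number requiring 16 bits of immediate.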
+ %and = and i16 %load1, 32768
+ store i16 %and, i16* @g16
+ %cond = icmp eq i16 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and16_imm8_br() nounwind {
+; CHECK-LABEL: and16_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andw $15, {{.*}}(%rip) # encoding: [0x66,0x83,0x25,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB39_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB39_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB39_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %and = and i16 %load1, 15
+ store i16 %and, i16* @g16
+ %cond = icmp eq i16 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and16_imm8_neg_br() nounwind {
+; CHECK-LABEL: and16_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andw $-4, {{.*}}(%rip) # encoding: [0x66,0x83,0x25,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB40_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB40_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB40_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %and = and i16 %load1, -4
+ store i16 %and, i16* @g16
+ %cond = icmp eq i16 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and8_imm_br() nounwind {
+; CHECK-LABEL: and8_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andb $-4, {{.*}}(%rip) # encoding: [0x80,0x25,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 2, value: g8-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB41_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB41_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB41_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
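+ ; And -4, which fits in an 8-bit immediate.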
+ %and = and i8 %load1, -4
+ store i8 %and, i8* @g8
+ %cond = icmp eq i8 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: and64_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andq %rdi, {{.*}}(%rip) # encoding: [0x48,0x21,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB42_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB42_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB42_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %and = and i64 %load1, %arg
+ store i64 %and, i64* @g64
+ %cond = icmp eq i64 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: and32_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andl %edi, {{.*}}(%rip) # encoding: [0x21,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB43_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB43_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB43_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %and = and i32 %load1, %arg
+ store i32 %and, i32* @g32
+ %cond = icmp eq i32 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: and16_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andw %di, {{.*}}(%rip) # encoding: [0x66,0x21,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB44_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB44_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB44_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %and = and i16 %load1, %arg
+ store i16 %and, i16* @g16
+ %cond = icmp eq i16 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @and8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: and8_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andb %dil, {{.*}}(%rip) # encoding: [0x40,0x20,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g8-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB45_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB45_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB45_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %and = and i8 %load1, %arg
+ store i8 %and, i8* @g8
+ %cond = icmp eq i8 %and, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or64_imm32_br() nounwind {
+; CHECK-LABEL: or64_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orq $16777215, {{.*}}(%rip) # encoding: [0x48,0x81,0x0d,A,A,A,A,0xff,0xff,0xff,0x00]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0xFFFFFF
+; CHECK-NEXT: je .LBB46_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB46_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB46_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Or 0x00FFFFFF, a positive immediate requiring 24 bits.
+ %or = or i64 %load1, 16777215
+ store i64 %or, i64* @g64
+ %cond = icmp eq i64 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or64_sext_imm32_br() nounwind {
+; CHECK-LABEL: or64_sext_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orq $-2147483648, {{.*}}(%rip) # encoding: [0x48,0x81,0x0d,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: je .LBB47_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB47_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB47_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Or -0x80000000, which requires sign-extended 32 bits.
+ %or = or i64 %load1, -2147483648
+ store i64 %or, i64* @g64
+ %cond = icmp eq i64 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or64_imm8_br() nounwind {
+; CHECK-LABEL: or64_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orq $15, {{.*}}(%rip) # encoding: [0x48,0x83,0x0d,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB48_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB48_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB48_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
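+ ; Or 15, which fits in a sign-extended 8-bit immediate.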
+ %or = or i64 %load1, 15
+ store i64 %or, i64* @g64
+ %cond = icmp eq i64 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or64_imm8_neg_br() nounwind {
+; CHECK-LABEL: or64_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orq $-4, {{.*}}(%rip) # encoding: [0x48,0x83,0x0d,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB49_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB49_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB49_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %or = or i64 %load1, -4
+ store i64 %or, i64* @g64
+ %cond = icmp eq i64 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or32_imm_br() nounwind {
+; CHECK-LABEL: or32_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orl $-2147483648, {{.*}}(%rip) # encoding: [0x81,0x0d,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: je .LBB50_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB50_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB50_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ ; Or 0x80000000, a positive number requiring 32 bits of immediate.
+ %or = or i32 %load1, 2147483648
+ store i32 %or, i32* @g32
+ %cond = icmp eq i32 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or32_imm8_br() nounwind {
+; CHECK-LABEL: or32_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orl $15, {{.*}}(%rip) # encoding: [0x83,0x0d,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB51_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB51_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB51_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %or = or i32 %load1, 15
+ store i32 %or, i32* @g32
+ %cond = icmp eq i32 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or32_imm8_neg_br() nounwind {
+; CHECK-LABEL: or32_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orl $-4, {{.*}}(%rip) # encoding: [0x83,0x0d,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB52_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB52_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB52_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %or = or i32 %load1, -4
+ store i32 %or, i32* @g32
+ %cond = icmp eq i32 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or16_imm_br() nounwind {
+; CHECK-LABEL: or16_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orw $-32768, {{.*}}(%rip) # encoding: [0x66,0x81,0x0d,A,A,A,A,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x8000
+; CHECK-NEXT: je .LBB53_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB53_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB53_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %or = or i16 %load1, 32768
+ store i16 %or, i16* @g16
+ %cond = icmp eq i16 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or16_imm8_br() nounwind {
+; CHECK-LABEL: or16_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orw $15, {{.*}}(%rip) # encoding: [0x66,0x83,0x0d,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB54_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB54_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB54_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %or = or i16 %load1, 15
+ store i16 %or, i16* @g16
+ %cond = icmp eq i16 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or16_imm8_neg_br() nounwind {
+; CHECK-LABEL: or16_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orw $-4, {{.*}}(%rip) # encoding: [0x66,0x83,0x0d,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB55_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB55_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB55_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %or = or i16 %load1, -4
+ store i16 %or, i16* @g16
+ %cond = icmp eq i16 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or8_imm_br() nounwind {
+; CHECK-LABEL: or8_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orb $-4, {{.*}}(%rip) # encoding: [0x80,0x0d,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 2, value: g8-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB56_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB56_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB56_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %or = or i8 %load1, -4
+ store i8 %or, i8* @g8
+ %cond = icmp eq i8 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: or64_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orq %rdi, {{.*}}(%rip) # encoding: [0x48,0x09,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB57_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB57_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB57_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %or = or i64 %load1, %arg
+ store i64 %or, i64* @g64
+ %cond = icmp eq i64 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: or32_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orl %edi, {{.*}}(%rip) # encoding: [0x09,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB58_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB58_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB58_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %or = or i32 %load1, %arg
+ store i32 %or, i32* @g32
+ %cond = icmp eq i32 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: or16_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orw %di, {{.*}}(%rip) # encoding: [0x66,0x09,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB59_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB59_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB59_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %or = or i16 %load1, %arg
+ store i16 %or, i16* @g16
+ %cond = icmp eq i16 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @or8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: or8_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: orb %dil, {{.*}}(%rip) # encoding: [0x40,0x08,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g8-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB60_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB60_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB60_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %or = or i8 %load1, %arg
+ store i8 %or, i8* @g8
+ %cond = icmp eq i8 %or, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor64_imm32_br() nounwind {
+; CHECK-LABEL: xor64_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorq $16777215, {{.*}}(%rip) # encoding: [0x48,0x81,0x35,A,A,A,A,0xff,0xff,0xff,0x00]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0xFFFFFF
+; CHECK-NEXT: je .LBB61_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB61_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB61_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Xor 0x00FFFFFF, a positive immediate requiring 24 bits.
+ %xor = xor i64 %load1, 16777215
+ store i64 %xor, i64* @g64
+ %cond = icmp eq i64 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor64_sext_imm32_br() nounwind {
+; CHECK-LABEL: xor64_sext_imm32_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorq $-2147483648, {{.*}}(%rip) # encoding: [0x48,0x81,0x35,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: je .LBB62_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB62_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB62_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ ; Xor -0x80000000, which requires a sign-extended 32-bit immediate.
+ %xor = xor i64 %load1, -2147483648
+ store i64 %xor, i64* @g64
+ %cond = icmp eq i64 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor64_imm8_br() nounwind {
+; CHECK-LABEL: xor64_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorq $15, {{.*}}(%rip) # encoding: [0x48,0x83,0x35,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB63_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB63_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB63_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %xor = xor i64 %load1, 15
+ store i64 %xor, i64* @g64
+ %cond = icmp eq i64 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor64_imm8_neg_br() nounwind {
+; CHECK-LABEL: xor64_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorq $-4, {{.*}}(%rip) # encoding: [0x48,0x83,0x35,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB64_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB64_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB64_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %xor = xor i64 %load1, -4
+ store i64 %xor, i64* @g64
+ %cond = icmp eq i64 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor32_imm_br() nounwind {
+; CHECK-LABEL: xor32_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl $-2147483648, {{.*}}(%rip) # encoding: [0x81,0x35,A,A,A,A,0x00,0x00,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-8, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x80000000
+; CHECK-NEXT: je .LBB65_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB65_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB65_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ ; Xor 0x80000000, a positive number requiring a full 32-bit immediate.
+ %xor = xor i32 %load1, 2147483648
+ store i32 %xor, i32* @g32
+ %cond = icmp eq i32 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor32_imm8_br() nounwind {
+; CHECK-LABEL: xor32_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl $15, {{.*}}(%rip) # encoding: [0x83,0x35,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB66_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB66_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB66_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %xor = xor i32 %load1, 15
+ store i32 %xor, i32* @g32
+ %cond = icmp eq i32 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor32_imm8_neg_br() nounwind {
+; CHECK-LABEL: xor32_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl $-4, {{.*}}(%rip) # encoding: [0x83,0x35,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB67_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB67_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB67_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %xor = xor i32 %load1, -4
+ store i32 %xor, i32* @g32
+ %cond = icmp eq i32 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor16_imm_br() nounwind {
+; CHECK-LABEL: xor16_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorw $-32768, {{.*}}(%rip) # encoding: [0x66,0x81,0x35,A,A,A,A,0x00,0x80]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-6, kind: reloc_riprel_4byte
+; CHECK-NEXT: # imm = 0x8000
+; CHECK-NEXT: je .LBB68_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB68_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB68_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %xor = xor i16 %load1, 32768
+ store i16 %xor, i16* @g16
+ %cond = icmp eq i16 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor16_imm8_br() nounwind {
+; CHECK-LABEL: xor16_imm8_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorw $15, {{.*}}(%rip) # encoding: [0x66,0x83,0x35,A,A,A,A,0x0f]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB69_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB69_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB69_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %xor = xor i16 %load1, 15
+ store i16 %xor, i16* @g16
+ %cond = icmp eq i16 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor16_imm8_neg_br() nounwind {
+; CHECK-LABEL: xor16_imm8_neg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorw $-4, {{.*}}(%rip) # encoding: [0x66,0x83,0x35,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB70_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB70_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB70_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %xor = xor i16 %load1, -4
+ store i16 %xor, i16* @g16
+ %cond = icmp eq i16 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor8_imm_br() nounwind {
+; CHECK-LABEL: xor8_imm_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorb $-4, {{.*}}(%rip) # encoding: [0x80,0x35,A,A,A,A,0xfc]
+; CHECK-NEXT: # fixup A - offset: 2, value: g8-5, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB71_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB71_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB71_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %xor = xor i8 %load1, -4
+ store i8 %xor, i8* @g8
+ %cond = icmp eq i8 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor64_reg_br(i64 %arg) nounwind {
+; CHECK-LABEL: xor64_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorq %rdi, {{.*}}(%rip) # encoding: [0x48,0x31,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g64-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB72_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB72_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB72_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i64, i64* @g64
+ %xor = xor i64 %load1, %arg
+ store i64 %xor, i64* @g64
+ %cond = icmp eq i64 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor32_reg_br(i32 %arg) nounwind {
+; CHECK-LABEL: xor32_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %edi, {{.*}}(%rip) # encoding: [0x31,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 2, value: g32-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB73_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB73_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB73_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i32, i32* @g32
+ %xor = xor i32 %load1, %arg
+ store i32 %xor, i32* @g32
+ %cond = icmp eq i32 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor16_reg_br(i16 %arg) nounwind {
+; CHECK-LABEL: xor16_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorw %di, {{.*}}(%rip) # encoding: [0x66,0x31,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g16-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB74_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB74_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB74_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i16, i16* @g16
+ %xor = xor i16 %load1, %arg
+ store i16 %xor, i16* @g16
+ %cond = icmp eq i16 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
+
+define void @xor8_reg_br(i8 %arg) nounwind {
+; CHECK-LABEL: xor8_reg_br:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorb %dil, {{.*}}(%rip) # encoding: [0x40,0x30,0x3d,A,A,A,A]
+; CHECK-NEXT: # fixup A - offset: 3, value: g8-4, kind: reloc_riprel_4byte
+; CHECK-NEXT: je .LBB75_1 # encoding: [0x74,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: .LBB75_1-1, kind: FK_PCRel_1
+; CHECK-NEXT: # %bb.2: # %b
+; CHECK-NEXT: jmp b # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: b-1, kind: FK_PCRel_1
+; CHECK-NEXT: .LBB75_1: # %a
+; CHECK-NEXT: jmp a # TAILCALL
+; CHECK-NEXT: # encoding: [0xeb,A]
+; CHECK-NEXT: # fixup A - offset: 1, value: a-1, kind: FK_PCRel_1
+entry:
+ %load1 = load i8, i8* @g8
+ %xor = xor i8 %load1, %arg
+ store i8 %xor, i8* @g8
+ %cond = icmp eq i8 %xor, 0
+ br i1 %cond, label %a, label %b
+
+a:
+ tail call void @a()
+ ret void
+
+b:
+ tail call void @b()
+ ret void
+}
diff --git a/test/CodeGen/X86/fold-sext-trunc.ll b/test/CodeGen/X86/fold-sext-trunc.ll
index df06e70b0c24..11863ffdbfba 100644
--- a/test/CodeGen/X86/fold-sext-trunc.ll
+++ b/test/CodeGen/X86/fold-sext-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movslq | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep movslq | count 1
; PR4050
%0 = type { i64 } ; type %0
diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll
index 52ea7a912b9f..481f55e9e10d 100644
--- a/test/CodeGen/X86/fold-vector-sext-crash.ll
+++ b/test/CodeGen/X86/fold-vector-sext-crash.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-unknown-linux-gnu -mattr=+avx,+popcnt,+cmov
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-unknown-linux-gnu -mattr=+avx,+popcnt,+cmov | FileCheck %s
; Make sure that we don't introduce illegal build_vector dag nodes
; when trying to fold a sign_extend of a constant build_vector.
@@ -6,6 +7,12 @@
; due to an illegal build_vector of type MVT::v4i64.
define <4 x i64> @foo(<4 x i64> %A) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vmovdqa %xmm1, %xmm1
+; CHECK-NEXT: vandps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retl
%1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0>
ret <4 x i64> %1
}
diff --git a/test/CodeGen/X86/fold-vector-sext-crash2.ll b/test/CodeGen/X86/fold-vector-sext-crash2.ll
index 44c836195abc..ca1a1c1949e5 100644
--- a/test/CodeGen/X86/fold-vector-sext-crash2.ll
+++ b/test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -1,92 +1,155 @@
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s -check-prefix=X64
; DAGCombiner crashes during sext folding
define <2 x i256> @test_sext1() {
+; X32-LABEL: test_sext1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl $-1, 60(%eax)
+; X32-NEXT: movl $-1, 56(%eax)
+; X32-NEXT: movl $-1, 52(%eax)
+; X32-NEXT: movl $-1, 48(%eax)
+; X32-NEXT: movl $-1, 44(%eax)
+; X32-NEXT: movl $-1, 40(%eax)
+; X32-NEXT: movl $-1, 36(%eax)
+; X32-NEXT: movl $-99, 32(%eax)
+; X32-NEXT: movl $0, 28(%eax)
+; X32-NEXT: movl $0, 24(%eax)
+; X32-NEXT: movl $0, 20(%eax)
+; X32-NEXT: movl $0, 16(%eax)
+; X32-NEXT: movl $0, 12(%eax)
+; X32-NEXT: movl $0, 8(%eax)
+; X32-NEXT: movl $0, 4(%eax)
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: retl $4
+;
+; X64-LABEL: test_sext1:
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: movq $-1, 56(%rdi)
+; X64-NEXT: movq $-1, 48(%rdi)
+; X64-NEXT: movq $-1, 40(%rdi)
+; X64-NEXT: movq $-99, 32(%rdi)
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%Se = sext <2 x i8> <i8 -100, i8 -99> to <2 x i256>
%Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
ret <2 x i256> %Shuff
-
- ; X64-LABEL: test_sext1
- ; X64: movq $-1
- ; X64-NEXT: movq $-1
- ; X64-NEXT: movq $-1
- ; X64-NEXT: movq $-99
-
- ; X32-LABEL: test_sext1
- ; X32: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-99
}
define <2 x i256> @test_sext2() {
+; X32-LABEL: test_sext2:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl $-1, 60(%eax)
+; X32-NEXT: movl $-1, 56(%eax)
+; X32-NEXT: movl $-1, 52(%eax)
+; X32-NEXT: movl $-1, 48(%eax)
+; X32-NEXT: movl $-1, 44(%eax)
+; X32-NEXT: movl $-1, 40(%eax)
+; X32-NEXT: movl $-1, 36(%eax)
+; X32-NEXT: movl $-1999, 32(%eax) # imm = 0xF831
+; X32-NEXT: movl $0, 28(%eax)
+; X32-NEXT: movl $0, 24(%eax)
+; X32-NEXT: movl $0, 20(%eax)
+; X32-NEXT: movl $0, 16(%eax)
+; X32-NEXT: movl $0, 12(%eax)
+; X32-NEXT: movl $0, 8(%eax)
+; X32-NEXT: movl $0, 4(%eax)
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: retl $4
+;
+; X64-LABEL: test_sext2:
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: movq $-1, 56(%rdi)
+; X64-NEXT: movq $-1, 48(%rdi)
+; X64-NEXT: movq $-1, 40(%rdi)
+; X64-NEXT: movq $-1999, 32(%rdi) # imm = 0xF831
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%Se = sext <2 x i128> <i128 -2000, i128 -1999> to <2 x i256>
%Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
ret <2 x i256> %Shuff
-
- ; X64-LABEL: test_sext2
- ; X64: movq $-1
- ; X64-NEXT: movq $-1
- ; X64-NEXT: movq $-1
- ; X64-NEXT: movq $-1999
-
- ; X32-LABEL: test_sext2
- ; X32: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1999
}
define <2 x i256> @test_zext1() {
+; X32-LABEL: test_zext1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl $0, 60(%eax)
+; X32-NEXT: movl $0, 56(%eax)
+; X32-NEXT: movl $0, 52(%eax)
+; X32-NEXT: movl $0, 48(%eax)
+; X32-NEXT: movl $0, 44(%eax)
+; X32-NEXT: movl $0, 40(%eax)
+; X32-NEXT: movl $0, 36(%eax)
+; X32-NEXT: movl $254, 32(%eax)
+; X32-NEXT: movl $0, 28(%eax)
+; X32-NEXT: movl $0, 24(%eax)
+; X32-NEXT: movl $0, 20(%eax)
+; X32-NEXT: movl $0, 16(%eax)
+; X32-NEXT: movl $0, 12(%eax)
+; X32-NEXT: movl $0, 8(%eax)
+; X32-NEXT: movl $0, 4(%eax)
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: retl $4
+;
+; X64-LABEL: test_zext1:
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: movq $0, 40(%rdi)
+; X64-NEXT: movq $254, 32(%rdi)
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%Se = zext <2 x i8> <i8 -1, i8 -2> to <2 x i256>
%Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
ret <2 x i256> %Shuff
-
- ; X64-LABEL: test_zext1
- ; X64: movq $0
- ; X64-NEXT: movq $0
- ; X64-NEXT: movq $0
- ; X64-NEXT: movq $254
-
- ; X32-LABEL: test_zext1
- ; X32: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $254
}
define <2 x i256> @test_zext2() {
+; X32-LABEL: test_zext2:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl $0, 60(%eax)
+; X32-NEXT: movl $0, 56(%eax)
+; X32-NEXT: movl $0, 52(%eax)
+; X32-NEXT: movl $0, 48(%eax)
+; X32-NEXT: movl $-1, 44(%eax)
+; X32-NEXT: movl $-1, 40(%eax)
+; X32-NEXT: movl $-1, 36(%eax)
+; X32-NEXT: movl $-2, 32(%eax)
+; X32-NEXT: movl $0, 28(%eax)
+; X32-NEXT: movl $0, 24(%eax)
+; X32-NEXT: movl $0, 20(%eax)
+; X32-NEXT: movl $0, 16(%eax)
+; X32-NEXT: movl $0, 12(%eax)
+; X32-NEXT: movl $0, 8(%eax)
+; X32-NEXT: movl $0, 4(%eax)
+; X32-NEXT: movl $0, (%eax)
+; X32-NEXT: retl $4
+;
+; X64-LABEL: test_zext2:
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
+; X64-NEXT: movq $-1, 40(%rdi)
+; X64-NEXT: movq $-2, 32(%rdi)
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%Se = zext <2 x i128> <i128 -1, i128 -2> to <2 x i256>
%Shuff = shufflevector <2 x i256> zeroinitializer, <2 x i256> %Se, <2 x i32> <i32 1, i32 3>
ret <2 x i256> %Shuff
-
- ; X64-LABEL: test_zext2
- ; X64: movq $0
- ; X64-NEXT: movq $0
- ; X64-NEXT: movq $-1
- ; X64-NEXT: movq $-2
-
- ; X32-LABEL: test_zext2
- ; X32: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $0
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-1
- ; X32-NEXT: movl $-2
}
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
index 575bd5897e47..16274a0d8191 100644
--- a/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -10,12 +10,12 @@
define <4 x i16> @test_sext_4i8_4i16() {
; X32-LABEL: test_sext_4i8_4i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -28,12 +28,12 @@ define <4 x i16> @test_sext_4i8_4i16() {
define <4 x i16> @test_sext_4i8_4i16_undef() {
; X32-LABEL: test_sext_4i8_4i16_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i16_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
@@ -46,12 +46,12 @@ define <4 x i16> @test_sext_4i8_4i16_undef() {
define <4 x i32> @test_sext_4i8_4i32() {
; X32-LABEL: test_sext_4i8_4i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,2,4294967293]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -64,12 +64,12 @@ define <4 x i32> @test_sext_4i8_4i32() {
define <4 x i32> @test_sext_4i8_4i32_undef() {
; X32-LABEL: test_sext_4i8_4i32_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i32_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,4294967295,u,4294967293>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
@@ -82,13 +82,12 @@ define <4 x i32> @test_sext_4i8_4i32_undef() {
define <4 x i64> @test_sext_4i8_4i64() {
; X32-LABEL: test_sext_4i8_4i64:
-; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,4294967295,4294967295]
-; X32-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,4294967295,4294967295,2,0,4294967293,4294967295]
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,2,18446744073709551613]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -101,13 +100,12 @@ define <4 x i64> @test_sext_4i8_4i64() {
define <4 x i64> @test_sext_4i8_4i64_undef() {
; X32-LABEL: test_sext_4i8_4i64_undef:
-; X32: # BB#0:
-; X32-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = <u,u,4294967295,4294967295,u,u,4294967293,4294967295>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_4i8_4i64_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <u,18446744073709551615,u,18446744073709551613>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
@@ -120,12 +118,12 @@ define <4 x i64> @test_sext_4i8_4i64_undef() {
define <8 x i16> @test_sext_8i8_8i16() {
; X32-LABEL: test_sext_8i8_8i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_8i8_8i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,65535,2,65533,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
@@ -142,12 +140,12 @@ define <8 x i16> @test_sext_8i8_8i16() {
define <8 x i32> @test_sext_8i8_8i32() {
; X32-LABEL: test_sext_8i8_8i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = <0,4294967295,2,4294967293,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_8i8_8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <0,4294967295,2,4294967293,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
@@ -164,12 +162,12 @@ define <8 x i32> @test_sext_8i8_8i32() {
define <8 x i16> @test_sext_8i8_8i16_undef() {
; X32-LABEL: test_sext_8i8_8i16_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_8i8_8i16_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,65535,u,65533,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 undef, i32 0
@@ -186,12 +184,12 @@ define <8 x i16> @test_sext_8i8_8i16_undef() {
define <8 x i32> @test_sext_8i8_8i32_undef() {
; X32-LABEL: test_sext_8i8_8i32_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,u,u,u,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_sext_8i8_8i32_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,u,u,u,u,u>
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
@@ -208,12 +206,12 @@ define <8 x i32> @test_sext_8i8_8i32_undef() {
define <4 x i16> @test_zext_4i8_4i16() {
; X32-LABEL: test_zext_4i8_4i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -226,12 +224,12 @@ define <4 x i16> @test_zext_4i8_4i16() {
define <4 x i32> @test_zext_4i8_4i32() {
; X32-LABEL: test_zext_4i8_4i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -244,13 +242,12 @@ define <4 x i32> @test_zext_4i8_4i32() {
define <4 x i64> @test_zext_4i8_4i64() {
; X32-LABEL: test_zext_4i8_4i64:
-; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,255,0]
-; X32-NEXT: vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,255,0,2,0,253,0]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253]
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -263,12 +260,12 @@ define <4 x i64> @test_zext_4i8_4i64() {
define <4 x i16> @test_zext_4i8_4i16_undef() {
; X32-LABEL: test_zext_4i8_4i16_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i16_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
@@ -281,12 +278,12 @@ define <4 x i16> @test_zext_4i8_4i16_undef() {
define <4 x i32> @test_zext_4i8_4i32_undef() {
; X32-LABEL: test_zext_4i8_4i32_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <0,u,2,u>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i32_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <0,u,2,u>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 0, i32 0
@@ -299,15 +296,12 @@ define <4 x i32> @test_zext_4i8_4i32_undef() {
define <4 x i64> @test_zext_4i8_4i64_undef() {
; X32-LABEL: test_zext_4i8_4i64_undef:
-; X32: # BB#0:
-; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,255,0>
-; X32-NEXT: movl $2, %eax
-; X32-NEXT: vmovd %eax, %xmm1
-; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = <u,u,255,0,2,0,u,u>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_4i8_4i64_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <u,255,2,u>
; X64-NEXT: retq
%1 = insertelement <4 x i8> undef, i8 undef, i32 0
@@ -320,12 +314,12 @@ define <4 x i64> @test_zext_4i8_4i64_undef() {
define <8 x i16> @test_zext_8i8_8i16() {
; X32-LABEL: test_zext_8i8_8i16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_8i8_8i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,2,253,4,251,6,249]
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
@@ -342,12 +336,12 @@ define <8 x i16> @test_zext_8i8_8i16() {
define <8 x i32> @test_zext_8i8_8i32() {
; X32-LABEL: test_zext_8i8_8i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249]
; X32-NEXT: retl
;
; X64-LABEL: test_zext_8i8_8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,253,4,251,6,249]
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
@@ -364,12 +358,12 @@ define <8 x i32> @test_zext_8i8_8i32() {
define <8 x i16> @test_zext_8i8_8i16_undef() {
; X32-LABEL: test_zext_8i8_8i16_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253,u,251,u,249>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_8i8_8i16_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = <u,255,u,253,u,251,u,249>
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 undef, i32 0
@@ -386,12 +380,12 @@ define <8 x i16> @test_zext_8i8_8i16_undef() {
define <8 x i32> @test_zext_8i8_8i32_undef() {
; X32-LABEL: test_zext_8i8_8i32_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,253,4,u,6,u>
; X32-NEXT: retl
;
; X64-LABEL: test_zext_8i8_8i32_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <0,u,2,253,4,u,6,u>
; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 0, i32 0
diff --git a/test/CodeGen/X86/fold-vector-shl-crash.ll b/test/CodeGen/X86/fold-vector-shl-crash.ll
index 9f81e44074f1..7837f2552e22 100644
--- a/test/CodeGen/X86/fold-vector-shl-crash.ll
+++ b/test/CodeGen/X86/fold-vector-shl-crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
;CHECK-LABEL: test
define <2 x i256> @test() {
diff --git a/test/CodeGen/X86/fp-elim.ll b/test/CodeGen/X86/fp-elim.ll
index 2c50bd1be75a..625c16ef7034 100644
--- a/test/CodeGen/X86/fp-elim.ll
+++ b/test/CodeGen/X86/fp-elim.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -asm-verbose=false | FileCheck %s -check-prefix=FP-ELIM
-; RUN: llc < %s -march=x86 -asm-verbose=false -disable-fp-elim | FileCheck %s -check-prefix=NO-ELIM
+; RUN: llc < %s -mtriple=i686-- -asm-verbose=false | FileCheck %s -check-prefix=FP-ELIM
+; RUN: llc < %s -mtriple=i686-- -asm-verbose=false -disable-fp-elim | FileCheck %s -check-prefix=NO-ELIM
; Implement -momit-leaf-frame-pointer
; rdar://7886181
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
index fa31b9c9e128..c2b07ed10232 100644
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -3,7 +3,7 @@
define float @test1(float %a) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fadd float %a, %a
@@ -13,7 +13,7 @@ define float @test1(float %a) {
define float @test2(float %a) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fmul float 4.0, %a
@@ -24,7 +24,7 @@ define float @test2(float %a) {
define float @test3(float %a) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fmul float %a, 4.0
@@ -35,7 +35,7 @@ define float @test3(float %a) {
define float @test4(float %a) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fadd float %a, %a
@@ -46,7 +46,7 @@ define float @test4(float %a) {
define float @test5(float %a) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fadd float %a, %a
@@ -57,7 +57,7 @@ define float @test5(float %a) {
define float @test6(float %a) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fmul float 2.0, %a
@@ -68,7 +68,7 @@ define float @test6(float %a) {
define float @test7(float %a) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fmul float %a, 2.0
@@ -79,7 +79,7 @@ define float @test7(float %a) {
define float @test8(float %a) {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%t1 = fmul float %a, 0.0
%t2 = fadd float %a, %t1
@@ -88,7 +88,7 @@ define float @test8(float %a) {
define float @test9(float %a) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%t1 = fmul float 0.0, %a
%t2 = fadd float %t1, %a
@@ -97,7 +97,7 @@ define float @test9(float %a) {
define float @test10(float %a) {
; CHECK-LABEL: test10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fsub float -0.0, %a
@@ -107,7 +107,7 @@ define float @test10(float %a) {
define float @test11(float %a) {
; CHECK-LABEL: test11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
%t1 = fsub float -0.0, %a
diff --git a/test/CodeGen/X86/fp-immediate-shorten.ll b/test/CodeGen/X86/fp-immediate-shorten.ll
index dc59c5a44b4e..49e3b1014be4 100644
--- a/test/CodeGen/X86/fp-immediate-shorten.ll
+++ b/test/CodeGen/X86/fp-immediate-shorten.ll
@@ -1,6 +1,6 @@
;; Test that this FP immediate is stored in the constant pool as a float.
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse2,-sse3 | FileCheck %s
; CHECK: {{.long.1123418112}}
diff --git a/test/CodeGen/X86/fp-intrinsics.ll b/test/CodeGen/X86/fp-intrinsics.ll
index 0f8d730d7535..eae3955adc31 100644
--- a/test/CodeGen/X86/fp-intrinsics.ll
+++ b/test/CodeGen/X86/fp-intrinsics.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s
+; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck -check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s
; Verify that constants aren't folded to inexact results when the rounding mode
; is unknown.
@@ -9,7 +10,7 @@
; }
;
; CHECK-LABEL: f1
-; CHECK: divsd
+; COMMON: divsd
define double @f1() {
entry:
%div = call double @llvm.experimental.constrained.fdiv.f64(
@@ -29,7 +30,7 @@ entry:
; }
;
; CHECK-LABEL: f2
-; CHECK: subsd
+; COMMON: subsd
define double @f2(double %a) {
entry:
%div = call double @llvm.experimental.constrained.fsub.f64(
@@ -50,9 +51,9 @@ entry:
; }
;
; CHECK-LABEL: f3:
-; CHECK: subsd
-; CHECK: mulsd
-; CHECK: subsd
+; COMMON: subsd
+; COMMON: mulsd
+; COMMON: subsd
define double @f3(double %a, double %b) {
entry:
%sub = call double @llvm.experimental.constrained.fsub.f64(
@@ -81,11 +82,11 @@ entry:
; return a;
; }
;
-;
+;
; CHECK-LABEL: f4:
-; CHECK: testl
-; CHECK: jle
-; CHECK: addsd
+; COMMON: testl
+; COMMON: jle
+; COMMON: addsd
define double @f4(i32 %n, double %a) {
entry:
%cmp = icmp sgt i32 %n, 0
@@ -105,7 +106,7 @@ if.end:
; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f5
-; CHECK: sqrtsd
+; COMMON: sqrtsd
define double @f5() {
entry:
%result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0,
@@ -116,7 +117,7 @@ entry:
; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f6
-; CHECK: pow
+; COMMON: pow
define double @f6() {
entry:
%result = call double @llvm.experimental.constrained.pow.f64(double 42.1,
@@ -128,7 +129,7 @@ entry:
; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f7
-; CHECK: powi
+; COMMON: powi
define double @f7() {
entry:
%result = call double @llvm.experimental.constrained.powi.f64(double 42.1,
@@ -140,7 +141,7 @@ entry:
; Verify that sin(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f8
-; CHECK: sin
+; COMMON: sin
define double @f8() {
entry:
%result = call double @llvm.experimental.constrained.sin.f64(double 42.0,
@@ -151,7 +152,7 @@ entry:
; Verify that cos(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f9
-; CHECK: cos
+; COMMON: cos
define double @f9() {
entry:
%result = call double @llvm.experimental.constrained.cos.f64(double 42.0,
@@ -162,7 +163,7 @@ entry:
; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f10
-; CHECK: exp
+; COMMON: exp
define double @f10() {
entry:
%result = call double @llvm.experimental.constrained.exp.f64(double 42.0,
@@ -173,7 +174,7 @@ entry:
; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f11
-; CHECK: exp2
+; COMMON: exp2
define double @f11() {
entry:
%result = call double @llvm.experimental.constrained.exp2.f64(double 42.1,
@@ -184,7 +185,7 @@ entry:
; Verify that log(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f12
-; CHECK: log
+; COMMON: log
define double @f12() {
entry:
%result = call double @llvm.experimental.constrained.log.f64(double 42.0,
@@ -195,7 +196,7 @@ entry:
; Verify that log10(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f13
-; CHECK: log10
+; COMMON: log10
define double @f13() {
entry:
%result = call double @llvm.experimental.constrained.log10.f64(double 42.0,
@@ -206,7 +207,7 @@ entry:
; Verify that log2(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f14
-; CHECK: log2
+; COMMON: log2
define double @f14() {
entry:
%result = call double @llvm.experimental.constrained.log2.f64(double 42.0,
@@ -217,7 +218,8 @@ entry:
; Verify that rint(42.1) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: f15
-; CHECK: rint
+; NO-FMA: rint
+; HAS-FMA: vroundsd
define double @f15() {
entry:
%result = call double @llvm.experimental.constrained.rint.f64(double 42.1,
@@ -229,7 +231,8 @@ entry:
; Verify that nearbyint(42.1) isn't simplified when the rounding mode is
; unknown.
; CHECK-LABEL: f16
-; CHECK: nearbyint
+; NO-FMA: nearbyint
+; HAS-FMA: vroundsd
define double @f16() {
entry:
%result = call double @llvm.experimental.constrained.nearbyint.f64(
@@ -239,6 +242,38 @@ entry:
ret double %result
}
+; Verify that fma(3.5, 3.5, 3.5) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f17
+; FMACALL32: jmp fmaf # TAILCALL
+; FMA32: vfmadd213ss
+define float @f17() {
+entry:
+ %result = call float @llvm.experimental.constrained.fma.f32(
+ float 3.5,
+ float 3.5,
+ float 3.5,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ ret float %result
+}
+
+; Verify that fma(42.1, 42.1, 42.1) isn't simplified when the rounding mode is
+; unknown.
+; CHECK-LABEL: f18
+; FMACALL64: jmp fma # TAILCALL
+; FMA64: vfmadd213sd
+define double @f18() {
+entry:
+ %result = call double @llvm.experimental.constrained.fma.f64(
+ double 42.1,
+ double 42.1,
+ double 42.1,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict")
+ ret double %result
+}
+
@llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata)
declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata)
@@ -256,3 +291,5 @@ declare double @llvm.experimental.constrained.log10.f64(double, metadata, metada
declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata)
declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata)
declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata)
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
index 4ef4903914bc..582b648fdecf 100644
--- a/test/CodeGen/X86/fp-load-trunc.ll
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -4,7 +4,7 @@
define <1 x float> @test1(<1 x double>* %p) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
@@ -15,7 +15,7 @@ define <1 x float> @test1(<1 x double>* %p) nounwind {
; CHECK-NEXT: retl
;
; AVX-LABEL: test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushl %eax
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
@@ -31,13 +31,13 @@ define <1 x float> @test1(<1 x double>* %p) nounwind {
define <2 x float> @test2(<2 x double>* %p) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cvtpd2ps (%eax), %xmm0
; CHECK-NEXT: retl
;
; AVX-LABEL: test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vcvtpd2psx (%eax), %xmm0
; AVX-NEXT: retl
@@ -48,7 +48,7 @@ define <2 x float> @test2(<2 x double>* %p) nounwind {
define <4 x float> @test3(<4 x double>* %p) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1
; CHECK-NEXT: cvtpd2ps (%eax), %xmm0
@@ -56,7 +56,7 @@ define <4 x float> @test3(<4 x double>* %p) nounwind {
; CHECK-NEXT: retl
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vcvtpd2psy (%eax), %xmm0
; AVX-NEXT: retl
@@ -67,7 +67,7 @@ define <4 x float> @test3(<4 x double>* %p) nounwind {
define <8 x float> @test4(<8 x double>* %p) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cvtpd2ps 16(%eax), %xmm1
; CHECK-NEXT: cvtpd2ps (%eax), %xmm0
@@ -78,7 +78,7 @@ define <8 x float> @test4(<8 x double>* %p) nounwind {
; CHECK-NEXT: retl
;
; AVX-LABEL: test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vcvtpd2psy (%eax), %xmm0
; AVX-NEXT: vcvtpd2psy 32(%eax), %xmm1
diff --git a/test/CodeGen/X86/fp-logic-replace.ll b/test/CodeGen/X86/fp-logic-replace.ll
index e62b2f3db237..c1660ea696f4 100644
--- a/test/CodeGen/X86/fp-logic-replace.ll
+++ b/test/CodeGen/X86/fp-logic-replace.ll
@@ -11,17 +11,17 @@
define double @FsANDPSrr(double %x, double %y) {
; SSE-LABEL: FsANDPSrr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps %xmm1, %xmm0 # encoding: [0x0f,0x54,0xc1]
; SSE-NEXT: retq # encoding: [0xc3]
;
; AVX-LABEL: FsANDPSrr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1]
; AVX-NEXT: retq # encoding: [0xc3]
;
; AVX512DQ-LABEL: FsANDPSrr:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
; AVX512DQ-NEXT: retq # encoding: [0xc3]
%bc1 = bitcast double %x to i64
@@ -33,18 +33,18 @@ define double @FsANDPSrr(double %x, double %y) {
define double @FsANDNPSrr(double %x, double %y) {
; SSE-LABEL: FsANDNPSrr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andnps %xmm0, %xmm1 # encoding: [0x0f,0x55,0xc8]
; SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; SSE-NEXT: retq # encoding: [0xc3]
;
; AVX-LABEL: FsANDNPSrr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandnps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0x55,0xc0]
; AVX-NEXT: retq # encoding: [0xc3]
;
; AVX512DQ-LABEL: FsANDNPSrr:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vandnps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x55,0xc0]
; AVX512DQ-NEXT: retq # encoding: [0xc3]
%bc1 = bitcast double %x to i64
@@ -57,17 +57,17 @@ define double @FsANDNPSrr(double %x, double %y) {
define double @FsORPSrr(double %x, double %y) {
; SSE-LABEL: FsORPSrr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: orps %xmm1, %xmm0 # encoding: [0x0f,0x56,0xc1]
; SSE-NEXT: retq # encoding: [0xc3]
;
; AVX-LABEL: FsORPSrr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1]
; AVX-NEXT: retq # encoding: [0xc3]
;
; AVX512DQ-LABEL: FsORPSrr:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
; AVX512DQ-NEXT: retq # encoding: [0xc3]
%bc1 = bitcast double %x to i64
@@ -79,17 +79,17 @@ define double @FsORPSrr(double %x, double %y) {
define double @FsXORPSrr(double %x, double %y) {
; SSE-LABEL: FsXORPSrr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm0 # encoding: [0x0f,0x57,0xc1]
; SSE-NEXT: retq # encoding: [0xc3]
;
; AVX-LABEL: FsXORPSrr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1]
; AVX-NEXT: retq # encoding: [0xc3]
;
; AVX512DQ-LABEL: FsXORPSrr:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
; AVX512DQ-NEXT: retq # encoding: [0xc3]
%bc1 = bitcast double %x to i64
diff --git a/test/CodeGen/X86/fp-logic.ll b/test/CodeGen/X86/fp-logic.ll
index 976470a83030..4402daceac73 100644
--- a/test/CodeGen/X86/fp-logic.ll
+++ b/test/CodeGen/X86/fp-logic.ll
@@ -18,7 +18,7 @@
define i32 @f1(float %x, i32 %y) {
; CHECK-LABEL: f1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
@@ -31,7 +31,7 @@ define i32 @f1(float %x, i32 %y) {
define i32 @f2(float %x, i32 %y) {
; CHECK-LABEL: f2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
@@ -44,7 +44,7 @@ define i32 @f2(float %x, i32 %y) {
define i32 @f3(float %x) {
; CHECK-LABEL: f3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: retq
@@ -57,7 +57,7 @@ define i32 @f3(float %x) {
define i32 @f4(float %x) {
; CHECK-LABEL: f4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: andl $2, %eax
; CHECK-NEXT: retq
@@ -70,7 +70,7 @@ define i32 @f4(float %x) {
define float @f5(float %x, i32 %y) {
; CHECK-LABEL: f5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -84,7 +84,7 @@ define float @f5(float %x, i32 %y) {
define float @f6(float %x, i32 %y) {
; CHECK-LABEL: f6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -98,7 +98,7 @@ define float @f6(float %x, i32 %y) {
define float @f7(float %x) {
; CHECK-LABEL: f7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -112,7 +112,7 @@ define float @f7(float %x) {
define float @f8(float %x) {
; CHECK-LABEL: f8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -126,7 +126,7 @@ define float @f8(float %x) {
define i32 @f9(float %x, float %y) {
; CHECK-LABEL: f9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: retq
@@ -140,7 +140,7 @@ define i32 @f9(float %x, float %y) {
define float @f10(float %x, float %y) {
; CHECK-LABEL: f10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast float %x to i32
@@ -152,7 +152,7 @@ define float @f10(float %x, float %y) {
define float @or(float %x, float %y) {
; CHECK-LABEL: or:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: orps %xmm1, %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast float %x to i32
@@ -164,7 +164,7 @@ define float @or(float %x, float %y) {
define float @xor(float %x, float %y) {
; CHECK-LABEL: xor:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm1, %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast float %x to i32
@@ -176,7 +176,7 @@ define float @xor(float %x, float %y) {
define float @f7_or(float %x) {
; CHECK-LABEL: f7_or:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: orps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -188,7 +188,7 @@ define float @f7_or(float %x) {
define float @f7_xor(float %x) {
; CHECK-LABEL: f7_xor:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: xorps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -202,7 +202,7 @@ define float @f7_xor(float %x) {
define double @doubles(double %x, double %y) {
; CHECK-LABEL: doubles:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast double %x to i64
@@ -214,7 +214,7 @@ define double @doubles(double %x, double %y) {
define double @f7_double(double %x) {
; CHECK-LABEL: f7_double:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -230,7 +230,7 @@ define double @f7_double(double %x) {
define float @movmsk(float %x) {
; CHECK-LABEL: movmsk:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -242,7 +242,7 @@ define float @movmsk(float %x) {
define double @bitcast_fabs(double %x) {
; CHECK-LABEL: bitcast_fabs:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast double %x to i64
@@ -253,7 +253,7 @@ define double @bitcast_fabs(double %x) {
define float @bitcast_fneg(float %x) {
; CHECK-LABEL: bitcast_fneg:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast float %x to i32
@@ -264,7 +264,7 @@ define float @bitcast_fneg(float %x) {
define <2 x double> @bitcast_fabs_vec(<2 x double> %x) {
; CHECK-LABEL: bitcast_fabs_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast <2 x double> %x to <2 x i64>
@@ -275,7 +275,7 @@ define <2 x double> @bitcast_fabs_vec(<2 x double> %x) {
define <4 x float> @bitcast_fneg_vec(<4 x float> %x) {
; CHECK-LABEL: bitcast_fneg_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%bc1 = bitcast <4 x float> %x to <4 x i32>
diff --git a/test/CodeGen/X86/fp-select-cmp-and.ll b/test/CodeGen/X86/fp-select-cmp-and.ll
index 651d7a3351c6..0f6159d36ea8 100644
--- a/test/CodeGen/X86/fp-select-cmp-and.ll
+++ b/test/CodeGen/X86/fp-select-cmp-and.ll
@@ -3,7 +3,7 @@
define double @test1(double %a, double %b, double %eps) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltsd %xmm2, %xmm0
; CHECK-NEXT: andpd %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -14,7 +14,7 @@ define double @test1(double %a, double %b, double %eps) {
define double @test2(double %a, double %b, double %eps) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmplesd %xmm2, %xmm0
; CHECK-NEXT: andpd %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -25,7 +25,7 @@ define double @test2(double %a, double %b, double %eps) {
define double @test3(double %a, double %b, double %eps) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltsd %xmm0, %xmm2
; CHECK-NEXT: andpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
@@ -37,7 +37,7 @@ define double @test3(double %a, double %b, double %eps) {
define double @test4(double %a, double %b, double %eps) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmplesd %xmm0, %xmm2
; CHECK-NEXT: andpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
@@ -49,7 +49,7 @@ define double @test4(double %a, double %b, double %eps) {
define double @test5(double %a, double %b, double %eps) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltsd %xmm2, %xmm0
; CHECK-NEXT: andnpd %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -60,7 +60,7 @@ define double @test5(double %a, double %b, double %eps) {
define double @test6(double %a, double %b, double %eps) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmplesd %xmm2, %xmm0
; CHECK-NEXT: andnpd %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -71,7 +71,7 @@ define double @test6(double %a, double %b, double %eps) {
define double @test7(double %a, double %b, double %eps) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltsd %xmm0, %xmm2
; CHECK-NEXT: andnpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
@@ -83,7 +83,7 @@ define double @test7(double %a, double %b, double %eps) {
define double @test8(double %a, double %b, double %eps) {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmplesd %xmm0, %xmm2
; CHECK-NEXT: andnpd %xmm1, %xmm2
; CHECK-NEXT: movapd %xmm2, %xmm0
@@ -95,7 +95,7 @@ define double @test8(double %a, double %b, double %eps) {
define float @test9(float %a, float %b, float %eps) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltss %xmm2, %xmm0
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -106,7 +106,7 @@ define float @test9(float %a, float %b, float %eps) {
define float @test10(float %a, float %b, float %eps) {
; CHECK-LABEL: test10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpless %xmm2, %xmm0
; CHECK-NEXT: andps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -117,7 +117,7 @@ define float @test10(float %a, float %b, float %eps) {
define float @test11(float %a, float %b, float %eps) {
; CHECK-LABEL: test11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltss %xmm0, %xmm2
; CHECK-NEXT: andps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
@@ -129,7 +129,7 @@ define float @test11(float %a, float %b, float %eps) {
define float @test12(float %a, float %b, float %eps) {
; CHECK-LABEL: test12:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpless %xmm0, %xmm2
; CHECK-NEXT: andps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
@@ -141,7 +141,7 @@ define float @test12(float %a, float %b, float %eps) {
define float @test13(float %a, float %b, float %eps) {
; CHECK-LABEL: test13:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltss %xmm2, %xmm0
; CHECK-NEXT: andnps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -152,7 +152,7 @@ define float @test13(float %a, float %b, float %eps) {
define float @test14(float %a, float %b, float %eps) {
; CHECK-LABEL: test14:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpless %xmm2, %xmm0
; CHECK-NEXT: andnps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -163,7 +163,7 @@ define float @test14(float %a, float %b, float %eps) {
define float @test15(float %a, float %b, float %eps) {
; CHECK-LABEL: test15:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpltss %xmm0, %xmm2
; CHECK-NEXT: andnps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
@@ -175,7 +175,7 @@ define float @test15(float %a, float %b, float %eps) {
define float @test16(float %a, float %b, float %eps) {
; CHECK-LABEL: test16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpless %xmm0, %xmm2
; CHECK-NEXT: andnps %xmm1, %xmm2
; CHECK-NEXT: movaps %xmm2, %xmm0
@@ -187,7 +187,7 @@ define float @test16(float %a, float %b, float %eps) {
define float @test17(float %a, float %b, float %c, float %eps) {
; CHECK-LABEL: test17:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpless %xmm0, %xmm3
; CHECK-NEXT: andps %xmm3, %xmm2
; CHECK-NEXT: andnps %xmm1, %xmm3
@@ -201,7 +201,7 @@ define float @test17(float %a, float %b, float %c, float %eps) {
define double @test18(double %a, double %b, double %c, double %eps) {
; CHECK-LABEL: test18:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmplesd %xmm0, %xmm3
; CHECK-NEXT: andpd %xmm3, %xmm2
; CHECK-NEXT: andnpd %xmm1, %xmm3
diff --git a/test/CodeGen/X86/fp-stack-2results.ll b/test/CodeGen/X86/fp-stack-2results.ll
index c8da9ea02518..0e8dbbf9b3bc 100644
--- a/test/CodeGen/X86/fp-stack-2results.ll
+++ b/test/CodeGen/X86/fp-stack-2results.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep fldz
-; RUN: llc < %s -march=x86-64 | grep fld1
+; RUN: llc < %s -mtriple=i686-- | grep fldz
+; RUN: llc < %s -mtriple=x86_64-- | grep fld1
%0 = type { x86_fp80, x86_fp80 }
diff --git a/test/CodeGen/X86/fp-stack-compare-cmov.ll b/test/CodeGen/X86/fp-stack-compare-cmov.ll
index 1d3548816b72..d0e816db3b69 100644
--- a/test/CodeGen/X86/fp-stack-compare-cmov.ll
+++ b/test/CodeGen/X86/fp-stack-compare-cmov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=pentiumpro | FileCheck %s
; PR1012
define float @foo(float* %col.2.0) {
diff --git a/test/CodeGen/X86/fp-stack-compare.ll b/test/CodeGen/X86/fp-stack-compare.ll
index 96088d759234..8ff0dd442f9d 100644
--- a/test/CodeGen/X86/fp-stack-compare.ll
+++ b/test/CodeGen/X86/fp-stack-compare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=i386 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=i386 | FileCheck %s
; PR6679
define float @foo(float* %col.2.0) {
diff --git a/test/CodeGen/X86/fp-stack-direct-ret.ll b/test/CodeGen/X86/fp-stack-direct-ret.ll
index 5a28bb50a343..b8bd22308c0d 100644
--- a/test/CodeGen/X86/fp-stack-direct-ret.ll
+++ b/test/CodeGen/X86/fp-stack-direct-ret.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | not grep fstp
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep movsd
+; RUN: llc < %s -mtriple=i686-- | not grep fstp
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | not grep movsd
declare double @foo()
diff --git a/test/CodeGen/X86/fp-stack-ret.ll b/test/CodeGen/X86/fp-stack-ret.ll
index 9635e2d2511a..db54acc754c7 100644
--- a/test/CodeGen/X86/fp-stack-ret.ll
+++ b/test/CodeGen/X86/fp-stack-ret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin8 -mcpu=yonah -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin8 -mcpu=yonah | FileCheck %s
; These testcases shouldn't require loading into an XMM register then storing
; to memory, then reloading into an FPStack reg.
diff --git a/test/CodeGen/X86/fp-stack-retcopy.ll b/test/CodeGen/X86/fp-stack-retcopy.ll
index 67dcb1871df4..bc77f79638e4 100644
--- a/test/CodeGen/X86/fp-stack-retcopy.ll
+++ b/test/CodeGen/X86/fp-stack-retcopy.ll
@@ -1,5 +1,5 @@
; This should not copy the result of foo into an xmm register.
-; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i686-apple-darwin9 | not grep xmm
+; RUN: llc < %s -mcpu=yonah -mtriple=i686-apple-darwin9 | not grep xmm
; rdar://5689903
declare double @foo()
diff --git a/test/CodeGen/X86/fp-stack-set-st1.ll b/test/CodeGen/X86/fp-stack-set-st1.ll
index 894897a2a5f0..45597bb78f04 100644
--- a/test/CodeGen/X86/fp-stack-set-st1.ll
+++ b/test/CodeGen/X86/fp-stack-set-st1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep fxch | count 2
+; RUN: llc < %s -mtriple=i686-- | grep fxch | count 2
define i32 @main() nounwind {
entry:
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 2f700cd4cc70..105db93749e6 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -4,7 +4,7 @@
define <1 x float> @test1(<1 x double> %x) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0
@@ -14,7 +14,7 @@ define <1 x float> @test1(<1 x double> %x) nounwind {
; CHECK-NEXT: retl
;
; AVX-LABEL: test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushl %eax
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
@@ -28,12 +28,12 @@ define <1 x float> @test1(<1 x double> %x) nounwind {
define <2 x float> @test2(<2 x double> %x) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
; AVX-LABEL: test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtpd2ps %xmm0, %xmm0
; AVX-NEXT: retl
%y = fptrunc <2 x double> %x to <2 x float>
@@ -42,14 +42,14 @@ define <2 x float> @test2(<2 x double> %x) nounwind {
define <4 x float> @test3(<4 x double> %x) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retl
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retl
@@ -59,7 +59,7 @@ define <4 x float> @test3(<4 x double> %x) nounwind {
define <8 x float> @test4(<8 x double> %x) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: cvtpd2ps %xmm1, %xmm1
; CHECK-NEXT: cvtpd2ps %xmm0, %xmm0
@@ -71,7 +71,7 @@ define <8 x float> @test4(<8 x double> %x) nounwind {
; CHECK-NEXT: retl
;
; AVX-LABEL: test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtpd2ps %ymm0, %xmm0
; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/fp-une-cmp.ll b/test/CodeGen/X86/fp-une-cmp.ll
index 1b5af5aba366..9d208dc97e8a 100644
--- a/test/CodeGen/X86/fp-une-cmp.ll
+++ b/test/CodeGen/X86/fp-une-cmp.ll
@@ -23,13 +23,13 @@
define double @rdar_7859988(double %x, double %y) nounwind readnone optsize ssp {
; CHECK-LABEL: rdar_7859988:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mulsd %xmm1, %xmm0
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: ucomisd %xmm1, %xmm0
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: jp .LBB0_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0
; CHECK-NEXT: .LBB0_2: # %bb2
; CHECK-NEXT: retq
@@ -50,7 +50,7 @@ bb2:
define double @profile_metadata(double %x, double %y) {
; CHECK-LABEL: profile_metadata:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mulsd %xmm1, %xmm0
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: ucomisd %xmm1, %xmm0
@@ -81,7 +81,7 @@ bb2:
define void @foo(float %f) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: ucomiss %xmm1, %xmm0
; CHECK-NEXT: jne .LBB2_2
diff --git a/test/CodeGen/X86/fp128-cast.ll b/test/CodeGen/X86/fp128-cast.ll
index 560892485d89..3e49f6715081 100644
--- a/test/CodeGen/X86/fp128-cast.ll
+++ b/test/CodeGen/X86/fp128-cast.ll
@@ -363,7 +363,7 @@ cleanup: ; preds = %entry, %if.then
define i1 @PR34866(i128 %x) {
; X64-LABEL: PR34866:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*}}(%rip), %xmm0
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi
@@ -373,13 +373,13 @@ define i1 @PR34866(i128 %x) {
; X64-NEXT: retq
;
; X64_NO_MMX-LABEL: PR34866:
-; X64_NO_MMX: # BB#0:
+; X64_NO_MMX: # %bb.0:
; X64_NO_MMX-NEXT: orq %rsi, %rdi
; X64_NO_MMX-NEXT: sete %al
; X64_NO_MMX-NEXT: retq
;
; X32-LABEL: PR34866:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
@@ -394,7 +394,7 @@ define i1 @PR34866(i128 %x) {
define i1 @PR34866_commute(i128 %x) {
; X64-LABEL: PR34866_commute:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*}}(%rip), %xmm0
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: xorq -{{[0-9]+}}(%rsp), %rsi
@@ -404,13 +404,13 @@ define i1 @PR34866_commute(i128 %x) {
; X64-NEXT: retq
;
; X64_NO_MMX-LABEL: PR34866_commute:
-; X64_NO_MMX: # BB#0:
+; X64_NO_MMX: # %bb.0:
; X64_NO_MMX-NEXT: orq %rsi, %rdi
; X64_NO_MMX-NEXT: sete %al
; X64_NO_MMX-NEXT: retq
;
; X32-LABEL: PR34866_commute:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
diff --git a/test/CodeGen/X86/fp128-g.ll b/test/CodeGen/X86/fp128-g.ll
index 5eeef0cb77c4..bc9b6b29d172 100644
--- a/test/CodeGen/X86/fp128-g.ll
+++ b/test/CodeGen/X86/fp128-g.ll
@@ -118,7 +118,7 @@ attributes #2 = { nounwind readnone }
!llvm.module.flags = !{!8, !9, !10}
!llvm.ident = !{!11}
-!0 = !DIGlobalVariableExpression(var: !1)
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = !DIGlobalVariable(name: "ld_ptr", scope: !2, file: !3, line: 17, type: !6, isLocal: false, isDefinition: true)
!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 4.0.0 (trunk 281495)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
!3 = !DIFile(filename: "fp128-g.c", directory: "/disk5/chh/Debug/ld.loop")
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 6c6bc8bdc1d1..54e2aab37ecb 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -43,15 +43,15 @@
; }
define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
; CHECK-LABEL: TestUnionLD1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
; CHECK-NEXT: andq %rdi, %rcx
; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: jmp foo # TAILCALL
@@ -78,7 +78,7 @@ entry:
; }
define fp128 @TestUnionLD2(fp128 %s) #0 {
; CHECK-LABEL: TestUnionLD2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
@@ -102,7 +102,7 @@ entry:
; }
define fp128 @TestI128_1(fp128 %x) #0 {
; CHECK-LABEL: TestI128_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -140,11 +140,11 @@ entry:
; }
define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 {
; CHECK-LABEL: TestI128_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jns .LBB3_2
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: .LBB3_2: # %entry
; CHECK-NEXT: retq
@@ -168,14 +168,14 @@ entry:
; }
define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 {
; CHECK-LABEL: TestI128_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000
; CHECK-NEXT: testq %rcx, %rax
; CHECK-NEXT: je .LBB4_2
-; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: jmp .LBB4_3
; CHECK-NEXT: .LBB4_2: # %if.then
@@ -224,7 +224,7 @@ if.end: ; preds = %if.then, %entry
; }
define fp128 @TestI128_4(fp128 %x) #0 {
; CHECK-LABEL: TestI128_4:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
@@ -253,7 +253,7 @@ entry:
; }
define void @TestShift128_2() #2 {
; CHECK-LABEL: TestShift128_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq {{.*}}(%rip), %rax
; CHECK-NEXT: shlq $32, %rax
; CHECK-NEXT: movq {{.*}}(%rip), %rcx
@@ -272,7 +272,7 @@ entry:
define fp128 @acosl(fp128 %x) #0 {
; CHECK-LABEL: acosl:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subq $40, %rsp
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
@@ -294,11 +294,11 @@ entry:
; Compare i128 values and check i128 constants.
define fp128 @TestComp(fp128 %x, fp128 %y) #0 {
; CHECK-LABEL: TestComp:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpq $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: jns .LBB8_2
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: .LBB8_2: # %entry
; CHECK-NEXT: retq
@@ -314,7 +314,7 @@ declare void @foo(fp128) #1
; Test logical operations on fp128 values.
define fp128 @TestFABS_LD(fp128 %x) #0 {
; CHECK-LABEL: TestFABS_LD:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
entry:
@@ -329,7 +329,7 @@ declare fp128 @copysignl(fp128, fp128) #1
; Test more complicated logical operations generated from copysignl.
define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 {
; CHECK-LABEL: TestCopySign:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $40, %rsp
@@ -345,7 +345,7 @@ define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result,
; CHECK-NEXT: callq __subtf3
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: jle .LBB10_1
-; CHECK-NEXT: # BB#2: # %if.then
+; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: movaps %xmm0, %xmm1
; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
diff --git a/test/CodeGen/X86/fp128-select.ll b/test/CodeGen/X86/fp128-select.ll
index c02db1fcdde8..85f7d97c985e 100644
--- a/test/CodeGen/X86/fp128-select.ll
+++ b/test/CodeGen/X86/fp128-select.ll
@@ -10,10 +10,10 @@
define void @test_select(fp128* %p, fp128* %q, i1 zeroext %c) {
; MMX-LABEL: test_select:
-; MMX: # BB#0:
-; MMX-NEXT: testb %dl, %dl
+; MMX: # %bb.0:
+; MMX-NEXT: testl %edx, %edx
; MMX-NEXT: jne .LBB0_1
-; MMX-NEXT: # BB#2:
+; MMX-NEXT: # %bb.2:
; MMX-NEXT: movaps {{.*}}(%rip), %xmm0
; MMX-NEXT: movaps %xmm0, (%rsi)
; MMX-NEXT: retq
@@ -23,9 +23,9 @@ define void @test_select(fp128* %p, fp128* %q, i1 zeroext %c) {
; MMX-NEXT: retq
;
; CHECK-LABEL: test_select:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: cmovneq (%rdi), %rax
; CHECK-NEXT: movabsq $9223231299366420480, %rcx # imm = 0x7FFF800000000000
; CHECK-NEXT: cmovneq 8(%rdi), %rcx
diff --git a/test/CodeGen/X86/fp2sint.ll b/test/CodeGen/X86/fp2sint.ll
index b41f56f9f41e..de5fe4d83746 100644
--- a/test/CodeGen/X86/fp2sint.ll
+++ b/test/CodeGen/X86/fp2sint.ll
@@ -1,6 +1,6 @@
;; LowerFP_TO_SINT should not create a stack object if it's not needed.
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep add
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | not grep add
define i32 @main(i32 %argc, i8** %argv) {
cond_false.i.i.i: ; preds = %bb.i5
diff --git a/test/CodeGen/X86/fp_constant_op.ll b/test/CodeGen/X86/fp_constant_op.ll
index 9a1337ab6cdb..1c015dccb09c 100644
--- a/test/CodeGen/X86/fp_constant_op.ll
+++ b/test/CodeGen/X86/fp_constant_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel -mcpu=i486 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel -mcpu=i486 | FileCheck %s
; Test that the load of the constant is folded into the operation.
diff --git a/test/CodeGen/X86/fp_load_cast_fold.ll b/test/CodeGen/X86/fp_load_cast_fold.ll
index 5fd22e3fa6e5..5ef9d479df54 100644
--- a/test/CodeGen/X86/fp_load_cast_fold.ll
+++ b/test/CodeGen/X86/fp_load_cast_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define double @short(i16* %P) {
%V = load i16, i16* %P ; <i16> [#uses=1]
@@ -20,7 +20,7 @@ define double @long(i64* %P) {
; CHECK: long
; CHECK: fild
-; CHECK-NOT: ESP
+; CHECK-NOT: esp
; CHECK-NOT: esp
; CHECK: {{$}}
; CHECK: ret
diff --git a/test/CodeGen/X86/fp_load_fold.ll b/test/CodeGen/X86/fp_load_fold.ll
index 57497454792b..4600c4ebe9c2 100644
--- a/test/CodeGen/X86/fp_load_fold.ll
+++ b/test/CodeGen/X86/fp_load_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | \
; RUN: grep -i ST | not grep "fadd\|fsub\|fdiv\|fmul"
; Test that the load of the memory location is folded into the operation.
diff --git a/test/CodeGen/X86/fpcmp-soft-fp.ll b/test/CodeGen/X86/fpcmp-soft-fp.ll
index dac468e5cbf0..f96bf65e44b6 100644
--- a/test/CodeGen/X86/fpcmp-soft-fp.ll
+++ b/test/CodeGen/X86/fpcmp-soft-fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium -mtriple=x86-linux-gnu -float-abi=soft | FileCheck %s
+; RUN: llc < %s -mcpu=pentium -mtriple=i686-linux-gnu -float-abi=soft | FileCheck %s
define i1 @test1(double %d) #0 {
entry:
diff --git a/test/CodeGen/X86/fpstack-debuginstr-kill.ll b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
index ca055da551ba..7ff7a759e5d9 100644
--- a/test/CodeGen/X86/fpstack-debuginstr-kill.ll
+++ b/test/CodeGen/X86/fpstack-debuginstr-kill.ll
@@ -49,14 +49,14 @@ attributes #0 = { nounwind readnone }
!llvm.dbg.cu = !{!10}
!llvm.module.flags = !{!14, !15}
-!0 = !DIGlobalVariableExpression(var: !1)
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = !DIGlobalVariable(name: "g1", scope: null, file: !2, line: 5, type: !3, isLocal: false, isDefinition: true)
!2 = !DIFile(filename: "f1.cpp", directory: "x87stackifier")
!3 = !DIDerivedType(tag: DW_TAG_typedef, name: "fpu_extended", file: !2, line: 3, baseType: !4)
!4 = !DIDerivedType(tag: DW_TAG_typedef, name: "fpu_register", file: !2, line: 2, baseType: !5)
!5 = !DIDerivedType(tag: DW_TAG_typedef, name: "uae_f64", file: !2, line: 1, baseType: !6)
!6 = !DIBasicType(name: "long double", size: 128, align: 128, encoding: DW_ATE_float)
-!7 = !DIGlobalVariableExpression(var: !8)
+!7 = !DIGlobalVariableExpression(var: !8, expr: !DIExpression())
!8 = !DIGlobalVariable(name: "g2", scope: null, file: !2, line: 6, type: !9, isLocal: false, isDefinition: true)
!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
!10 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !11, producer: "clang version 3.6.0 (http://llvm.org/git/clang 8444ae7cfeaefae031f8fedf0d1435ca3b14d90b) (http://llvm.org/git/llvm 886f0101a7d176543b831f5efb74c03427244a55)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !12, retainedTypes: !12, globals: !13, imports: !12)
diff --git a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
index ab797e04b400..ba80c839fdda 100644
--- a/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
+++ b/test/CodeGen/X86/frame-lowering-debug-intrinsic-2.ll
@@ -40,7 +40,8 @@ entry:
}
; CHECK-LABEL: withDebug
-; CHECK: #DEBUG_VALUE: test:j <- %RBX
+; CHECK: callq printf
+; CHECK: callq printf
; CHECK-NEXT: addq $24, %rsp
; CHECK: popq %rbx
; CHECK-NEXT: popq %r14
diff --git a/test/CodeGen/X86/frameaddr.ll b/test/CodeGen/X86/frameaddr.ll
index 1ed2b5d0af51..71dcf1ffe165 100644
--- a/test/CodeGen/X86/frameaddr.ll
+++ b/test/CodeGen/X86/frameaddr.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s --check-prefix=CHECK-32
-; RUN: llc < %s -march=x86 -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -mtriple=i686-- -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK-32
; RUN: llc < %s -mtriple=x86_64-pc-win32 -fast-isel | FileCheck %s --check-prefix=CHECK-W64
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK-64
; RUN: llc < %s -mtriple=x86_64-unknown -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefix=CHECK-64
diff --git a/test/CodeGen/X86/fsgsbase-schedule.ll b/test/CodeGen/X86/fsgsbase-schedule.ll
new file mode 100644
index 000000000000..8b016496edb3
--- /dev/null
+++ b/test/CodeGen/X86/fsgsbase-schedule.ll
@@ -0,0 +1,411 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=fsgsbase | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=GLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=IVY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
+
+define i32 @test_x86_rdfsbase_32() {
+; GENERIC-LABEL: test_x86_rdfsbase_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdfsbasel %eax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_rdfsbase_32:
+; GLM: # %bb.0:
+; GLM-NEXT: rdfsbasel %eax # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_rdfsbase_32:
+; IVY: # %bb.0:
+; IVY-NEXT: rdfsbasel %eax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_rdfsbase_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdfsbasel %eax # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_rdfsbase_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdfsbasel %eax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_rdfsbase_32:
+; SKX: # %bb.0:
+; SKX-NEXT: rdfsbasel %eax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_rdfsbase_32:
+; KNL: # %bb.0:
+; KNL-NEXT: rdfsbasel %eax # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_rdfsbase_32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: rdfsbasel %eax
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_rdfsbase_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdfsbasel %eax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %res = call i32 @llvm.x86.rdfsbase.32()
+ ret i32 %res
+}
+declare i32 @llvm.x86.rdfsbase.32() nounwind readnone
+
+define i32 @test_x86_rdgsbase_32() {
+; GENERIC-LABEL: test_x86_rdgsbase_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdgsbasel %eax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_rdgsbase_32:
+; GLM: # %bb.0:
+; GLM-NEXT: rdgsbasel %eax # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_rdgsbase_32:
+; IVY: # %bb.0:
+; IVY-NEXT: rdgsbasel %eax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_rdgsbase_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdgsbasel %eax # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_rdgsbase_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdgsbasel %eax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_rdgsbase_32:
+; SKX: # %bb.0:
+; SKX-NEXT: rdgsbasel %eax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_rdgsbase_32:
+; KNL: # %bb.0:
+; KNL-NEXT: rdgsbasel %eax # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_rdgsbase_32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: rdgsbasel %eax
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_rdgsbase_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdgsbasel %eax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %res = call i32 @llvm.x86.rdgsbase.32()
+ ret i32 %res
+}
+declare i32 @llvm.x86.rdgsbase.32() nounwind readnone
+
+define i64 @test_x86_rdfsbase_64() {
+; GENERIC-LABEL: test_x86_rdfsbase_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdfsbaseq %rax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_rdfsbase_64:
+; GLM: # %bb.0:
+; GLM-NEXT: rdfsbaseq %rax # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_rdfsbase_64:
+; IVY: # %bb.0:
+; IVY-NEXT: rdfsbaseq %rax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_rdfsbase_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdfsbaseq %rax # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_rdfsbase_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdfsbaseq %rax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_rdfsbase_64:
+; SKX: # %bb.0:
+; SKX-NEXT: rdfsbaseq %rax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_rdfsbase_64:
+; KNL: # %bb.0:
+; KNL-NEXT: rdfsbaseq %rax # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_rdfsbase_64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: rdfsbaseq %rax
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_rdfsbase_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdfsbaseq %rax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %res = call i64 @llvm.x86.rdfsbase.64()
+ ret i64 %res
+}
+declare i64 @llvm.x86.rdfsbase.64() nounwind readnone
+
+define i64 @test_x86_rdgsbase_64() {
+; GENERIC-LABEL: test_x86_rdgsbase_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdgsbaseq %rax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_rdgsbase_64:
+; GLM: # %bb.0:
+; GLM-NEXT: rdgsbaseq %rax # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_rdgsbase_64:
+; IVY: # %bb.0:
+; IVY-NEXT: rdgsbaseq %rax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_rdgsbase_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdgsbaseq %rax # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_rdgsbase_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdgsbaseq %rax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_rdgsbase_64:
+; SKX: # %bb.0:
+; SKX-NEXT: rdgsbaseq %rax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_rdgsbase_64:
+; KNL: # %bb.0:
+; KNL-NEXT: rdgsbaseq %rax # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_rdgsbase_64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: rdgsbaseq %rax
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_rdgsbase_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdgsbaseq %rax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %res = call i64 @llvm.x86.rdgsbase.64()
+ ret i64 %res
+}
+declare i64 @llvm.x86.rdgsbase.64() nounwind readnone
+
+define void @test_x86_wrfsbase_32(i32 %x) {
+; GENERIC-LABEL: test_x86_wrfsbase_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: wrfsbasel %edi # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_wrfsbase_32:
+; GLM: # %bb.0:
+; GLM-NEXT: wrfsbasel %edi # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_wrfsbase_32:
+; IVY: # %bb.0:
+; IVY-NEXT: wrfsbasel %edi # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_wrfsbase_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: wrfsbasel %edi # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_wrfsbase_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: wrfsbasel %edi # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_wrfsbase_32:
+; SKX: # %bb.0:
+; SKX-NEXT: wrfsbasel %edi # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_wrfsbase_32:
+; KNL: # %bb.0:
+; KNL-NEXT: wrfsbasel %edi # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_wrfsbase_32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: wrfsbasel %edi
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_wrfsbase_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: wrfsbasel %edi # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.wrfsbase.32(i32 %x)
+ ret void
+}
+declare void @llvm.x86.wrfsbase.32(i32) nounwind readnone
+
+define void @test_x86_wrgsbase_32(i32 %x) {
+; GENERIC-LABEL: test_x86_wrgsbase_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: wrgsbasel %edi # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_wrgsbase_32:
+; GLM: # %bb.0:
+; GLM-NEXT: wrgsbasel %edi # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_wrgsbase_32:
+; IVY: # %bb.0:
+; IVY-NEXT: wrgsbasel %edi # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_wrgsbase_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: wrgsbasel %edi # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_wrgsbase_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: wrgsbasel %edi # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_wrgsbase_32:
+; SKX: # %bb.0:
+; SKX-NEXT: wrgsbasel %edi # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_wrgsbase_32:
+; KNL: # %bb.0:
+; KNL-NEXT: wrgsbasel %edi # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_wrgsbase_32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: wrgsbasel %edi
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_wrgsbase_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: wrgsbasel %edi # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.wrgsbase.32(i32 %x)
+ ret void
+}
+declare void @llvm.x86.wrgsbase.32(i32) nounwind readnone
+
+define void @test_x86_wrfsbase_64(i64 %x) {
+; GENERIC-LABEL: test_x86_wrfsbase_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: wrfsbaseq %rdi # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_wrfsbase_64:
+; GLM: # %bb.0:
+; GLM-NEXT: wrfsbaseq %rdi # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_wrfsbase_64:
+; IVY: # %bb.0:
+; IVY-NEXT: wrfsbaseq %rdi # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_wrfsbase_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: wrfsbaseq %rdi # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_wrfsbase_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: wrfsbaseq %rdi # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_wrfsbase_64:
+; SKX: # %bb.0:
+; SKX-NEXT: wrfsbaseq %rdi # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_wrfsbase_64:
+; KNL: # %bb.0:
+; KNL-NEXT: wrfsbaseq %rdi # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_wrfsbase_64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: wrfsbaseq %rdi
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_wrfsbase_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: wrfsbaseq %rdi # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.wrfsbase.64(i64 %x)
+ ret void
+}
+declare void @llvm.x86.wrfsbase.64(i64) nounwind readnone
+
+define void @test_x86_wrgsbase_64(i64 %x) {
+; GENERIC-LABEL: test_x86_wrgsbase_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: wrgsbaseq %rdi # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GLM-LABEL: test_x86_wrgsbase_64:
+; GLM: # %bb.0:
+; GLM-NEXT: wrgsbaseq %rdi # sched: [100:1.00]
+; GLM-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_x86_wrgsbase_64:
+; IVY: # %bb.0:
+; IVY-NEXT: wrgsbaseq %rdi # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_x86_wrgsbase_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: wrgsbaseq %rdi # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_x86_wrgsbase_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: wrgsbaseq %rdi # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_x86_wrgsbase_64:
+; SKX: # %bb.0:
+; SKX-NEXT: wrgsbaseq %rdi # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: test_x86_wrgsbase_64:
+; KNL: # %bb.0:
+; KNL-NEXT: wrgsbaseq %rdi # sched: [100:0.25]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; BDVER-LABEL: test_x86_wrgsbase_64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: wrgsbaseq %rdi
+; BDVER-NEXT: retq
+;
+; ZNVER1-LABEL: test_x86_wrgsbase_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: wrgsbaseq %rdi # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.wrgsbase.64(i64 %x)
+ ret void
+}
+declare void @llvm.x86.wrgsbase.64(i64) nounwind readnone
diff --git a/test/CodeGen/X86/fsgsbase.ll b/test/CodeGen/X86/fsgsbase.ll
index 0c22e3c7db29..98434ae42c81 100644
--- a/test/CodeGen/X86/fsgsbase.ll
+++ b/test/CodeGen/X86/fsgsbase.ll
@@ -1,56 +1,81 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=core-avx-i -mattr=fsgsbase | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fsgsbase | FileCheck %s
define i32 @test_x86_rdfsbase_32() {
- ; CHECK: rdfsbasel
+; CHECK-LABEL: test_x86_rdfsbase_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rdfsbasel %eax
+; CHECK-NEXT: retq
%res = call i32 @llvm.x86.rdfsbase.32()
ret i32 %res
}
declare i32 @llvm.x86.rdfsbase.32() nounwind readnone
define i32 @test_x86_rdgsbase_32() {
- ; CHECK: rdgsbasel
+; CHECK-LABEL: test_x86_rdgsbase_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rdgsbasel %eax
+; CHECK-NEXT: retq
%res = call i32 @llvm.x86.rdgsbase.32()
ret i32 %res
}
declare i32 @llvm.x86.rdgsbase.32() nounwind readnone
define i64 @test_x86_rdfsbase_64() {
- ; CHECK: rdfsbaseq
+; CHECK-LABEL: test_x86_rdfsbase_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rdfsbaseq %rax
+; CHECK-NEXT: retq
%res = call i64 @llvm.x86.rdfsbase.64()
ret i64 %res
}
declare i64 @llvm.x86.rdfsbase.64() nounwind readnone
define i64 @test_x86_rdgsbase_64() {
- ; CHECK: rdgsbaseq
+; CHECK-LABEL: test_x86_rdgsbase_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: rdgsbaseq %rax
+; CHECK-NEXT: retq
%res = call i64 @llvm.x86.rdgsbase.64()
ret i64 %res
}
declare i64 @llvm.x86.rdgsbase.64() nounwind readnone
define void @test_x86_wrfsbase_32(i32 %x) {
- ; CHECK: wrfsbasel
+; CHECK-LABEL: test_x86_wrfsbase_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: wrfsbasel %edi
+; CHECK-NEXT: retq
call void @llvm.x86.wrfsbase.32(i32 %x)
ret void
}
declare void @llvm.x86.wrfsbase.32(i32) nounwind readnone
define void @test_x86_wrgsbase_32(i32 %x) {
- ; CHECK: wrgsbasel
+; CHECK-LABEL: test_x86_wrgsbase_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: wrgsbasel %edi
+; CHECK-NEXT: retq
call void @llvm.x86.wrgsbase.32(i32 %x)
ret void
}
declare void @llvm.x86.wrgsbase.32(i32) nounwind readnone
define void @test_x86_wrfsbase_64(i64 %x) {
- ; CHECK: wrfsbaseq
+; CHECK-LABEL: test_x86_wrfsbase_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: wrfsbaseq %rdi
+; CHECK-NEXT: retq
call void @llvm.x86.wrfsbase.64(i64 %x)
ret void
}
declare void @llvm.x86.wrfsbase.64(i64) nounwind readnone
define void @test_x86_wrgsbase_64(i64 %x) {
- ; CHECK: wrgsbaseq
+; CHECK-LABEL: test_x86_wrgsbase_64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: wrgsbaseq %rdi
+; CHECK-NEXT: retq
call void @llvm.x86.wrgsbase.64(i64 %x)
ret void
}
diff --git a/test/CodeGen/X86/fsxor-alignment.ll b/test/CodeGen/X86/fsxor-alignment.ll
index 6a8dbcfaa7c3..6f9738f50fb4 100644
--- a/test/CodeGen/X86/fsxor-alignment.ll
+++ b/test/CodeGen/X86/fsxor-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -enable-unsafe-fp-math | \
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -enable-unsafe-fp-math | \
; RUN: grep -v sp | grep xorps | count 2
; Don't fold the incoming stack arguments into the xorps instructions used
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index 85b2b41fa191..36c98c8b494e 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,8 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=generic | FileCheck %s
define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
-; ATOM: foo
-; ATOM: addl
-; ATOM: addl
-; ATOM: leal
-
; CHECK: foo
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: incl
entry:
%0 = icmp sgt i32 %N, 0 ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/function-subtarget-features-2.ll b/test/CodeGen/X86/function-subtarget-features-2.ll
index d7c7c2fdb6fe..df1efab6edf3 100644
--- a/test/CodeGen/X86/function-subtarget-features-2.ll
+++ b/test/CodeGen/X86/function-subtarget-features-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -filetype=obj -o - | llvm-objdump -d - | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -filetype=obj -o - | llvm-objdump -d - | FileCheck %s
; This test verifies that we assemble code for different architectures
; based on target-cpu and target-features attributes.
diff --git a/test/CodeGen/X86/function-subtarget-features.ll b/test/CodeGen/X86/function-subtarget-features.ll
index b1e2585be004..d15988eb0be7 100644
--- a/test/CodeGen/X86/function-subtarget-features.ll
+++ b/test/CodeGen/X86/function-subtarget-features.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -o - | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -o - | FileCheck %s
; This test verifies that we produce different code for different architectures
; based on target-cpu and target-features attributes.
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index c3109673468e..e09ad3e4e0b8 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -16,10 +16,10 @@
; LIN: sarq $32, %r[[REG2]]
; LIN: movslq %e[[REG4]], %r[[REG3:.+]]
; LIN: sarq $32, %r[[REG4]]
-; LIN: movsd (%rdi,%r[[REG1]],8), %xmm0
-; LIN: movhpd (%rdi,%r[[REG2]],8), %xmm0
-; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
-; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
+; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1
+; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1
+; LIN: movq %rdi, %xmm1
+; LIN: movq %r[[REG3]], %xmm0
; WIN: movdqa (%rdx), %xmm0
; WIN: pand (%r8), %xmm0
@@ -29,10 +29,10 @@
; WIN: sarq $32, %r[[REG2]]
; WIN: movslq %e[[REG4]], %r[[REG3:.+]]
; WIN: sarq $32, %r[[REG4]]
-; WIN: movsd (%rcx,%r[[REG1]],8), %xmm0
-; WIN: movhpd (%rcx,%r[[REG2]],8), %xmm0
-; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
-; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1
+; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1
+; WIN: movdqa (%r[[REG2]]), %xmm0
+; WIN: movq %r[[REG2]], %xmm1
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
%a = load <4 x i32>, <4 x i32>* %i
diff --git a/test/CodeGen/X86/getelementptr.ll b/test/CodeGen/X86/getelementptr.ll
index e260e7d7b74a..68caf7a65f8f 100644
--- a/test/CodeGen/X86/getelementptr.ll
+++ b/test/CodeGen/X86/getelementptr.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -O0 -march=x86
-; RUN: llc < %s -O0 -march=x86-64
-; RUN: llc < %s -O2 -march=x86
-; RUN: llc < %s -O2 -march=x86-64
+; RUN: llc < %s -O0 -mtriple=i686--
+; RUN: llc < %s -O0 -mtriple=x86_64--
+; RUN: llc < %s -O2 -mtriple=i686--
+; RUN: llc < %s -O2 -mtriple=x86_64--
; Test big index trunc to pointer size:
diff --git a/test/CodeGen/X86/gfni-intrinsics.ll b/test/CodeGen/X86/gfni-intrinsics.ll
new file mode 100644
index 000000000000..76e201e3a417
--- /dev/null
+++ b/test/CodeGen/X86/gfni-intrinsics.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+gfni -show-mc-encoding | FileCheck %s
+
+declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8)
+define <16 x i8> @test_gf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: test_gf2p8affineinvqb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: gf2p8affineinvqb $11, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0xcf,0xc1,0x0b]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 11)
+ ret <16 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8)
+define <16 x i8> @test_gf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: test_gf2p8affineqb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: gf2p8affineqb $11, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0xce,0xc1,0x0b]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 11)
+ ret <16 x i8> %1
+}
+
+declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>)
+define <16 x i8> @test_gf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: test_gf2p8mulb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: gf2p8mulb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xcf,0xc1]
+; CHECK-NEXT: retl ## encoding: [0xc3]
+ %1 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2)
+ ret <16 x i8> %1
+}
+
diff --git a/test/CodeGen/X86/ghc-cc.ll b/test/CodeGen/X86/ghc-cc.ll
index 16e4db60502d..1a03c6ae7060 100644
--- a/test/CodeGen/X86/ghc-cc.ll
+++ b/test/CodeGen/X86/ghc-cc.ll
@@ -2,10 +2,10 @@
; Test the GHC call convention works (x86-32)
-@base = external global i32 ; assigned to register: EBX
-@sp = external global i32 ; assigned to register: EBP
-@hp = external global i32 ; assigned to register: EDI
-@r1 = external global i32 ; assigned to register: ESI
+@base = external global i32 ; assigned to register: ebx
+@sp = external global i32 ; assigned to register: ebp
+@hp = external global i32 ; assigned to register: edi
+@r1 = external global i32 ; assigned to register: esi
define void @zap(i32 %a, i32 %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/ghc-cc64.ll b/test/CodeGen/X86/ghc-cc64.ll
index c4ce8cfdef13..e8b0f06fe24e 100644
--- a/test/CodeGen/X86/ghc-cc64.ll
+++ b/test/CodeGen/X86/ghc-cc64.ll
@@ -3,22 +3,22 @@
; Check the GHC call convention works (x86-64)
@base = external global i64 ; assigned to register: R13
-@sp = external global i64 ; assigned to register: RBP
+@sp = external global i64 ; assigned to register: rbp
@hp = external global i64 ; assigned to register: R12
-@r1 = external global i64 ; assigned to register: RBX
+@r1 = external global i64 ; assigned to register: rbx
@r2 = external global i64 ; assigned to register: R14
-@r3 = external global i64 ; assigned to register: RSI
-@r4 = external global i64 ; assigned to register: RDI
+@r3 = external global i64 ; assigned to register: rsi
+@r4 = external global i64 ; assigned to register: rdi
@r5 = external global i64 ; assigned to register: R8
@r6 = external global i64 ; assigned to register: R9
@splim = external global i64 ; assigned to register: R15
-@f1 = external global float ; assigned to register: XMM1
-@f2 = external global float ; assigned to register: XMM2
-@f3 = external global float ; assigned to register: XMM3
-@f4 = external global float ; assigned to register: XMM4
-@d1 = external global double ; assigned to register: XMM5
-@d2 = external global double ; assigned to register: XMM6
+@f1 = external global float ; assigned to register: xmm1
+@f2 = external global float ; assigned to register: xmm2
+@f3 = external global float ; assigned to register: xmm3
+@f4 = external global float ; assigned to register: xmm4
+@d1 = external global double ; assigned to register: xmm5
+@d2 = external global double ; assigned to register: xmm6
define void @zap(i64 %a, i64 %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/global-access-pie-copyrelocs.ll b/test/CodeGen/X86/global-access-pie-copyrelocs.ll
index 5c4a87c969ce..0918793a4d20 100644
--- a/test/CodeGen/X86/global-access-pie-copyrelocs.ll
+++ b/test/CodeGen/X86/global-access-pie-copyrelocs.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -pie-copy-relocations \
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic -pie-copy-relocations \
; RUN: | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -pie-copy-relocations \
+; RUN: llc < %s -emulated-tls -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic -pie-copy-relocations \
; RUN: | FileCheck -check-prefix=X32 %s
; External Linkage
@@ -63,6 +63,33 @@ entry:
ret i32 %0
}
+; ExternalWeak Linkage
+@e = extern_weak global i32, align 4
+
+define i32* @my_access_global_d() #0 {
+; X32-LABEL: my_access_global_d:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32: movl e@GOT(%eax), %eax
+; X64-LABEL: my_access_global_d:
+; X64: movq e@GOTPCREL(%rip), %rax
+
+entry:
+ ret i32* @e
+}
+
+; ExternalWeak hidden Linkage
+@he = extern_weak hidden global i32, align 4
+
+define i32* @my_access_global_he() #0 {
+; X32-LABEL: my_access_global_he:
+; X32: addl $_GLOBAL_OFFSET_TABLE_{{.*}}, %eax
+; X32: movl he@GOT(%eax), %eax
+; X64-LABEL: my_access_global_he:
+; X64: movq he@GOTPCREL(%rip), %rax
+ ret i32* @he
+}
+
+
; External Linkage, only declaration, store a value.
define i32 @my_access_global_store_d() #0 {
diff --git a/test/CodeGen/X86/global-access-pie.ll b/test/CodeGen/X86/global-access-pie.ll
index 0e29d605476d..98dbcce9b77b 100644
--- a/test/CodeGen/X86/global-access-pie.ll
+++ b/test/CodeGen/X86/global-access-pie.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -emulated-tls -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
+; RUN: llc < %s -emulated-tls -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic \
; RUN: | FileCheck -check-prefix=X32 %s
; External Linkage
diff --git a/test/CodeGen/X86/gpr-to-mask.ll b/test/CodeGen/X86/gpr-to-mask.ll
new file mode 100644
index 000000000000..ead07adb3877
--- /dev/null
+++ b/test/CodeGen/X86/gpr-to-mask.ll
@@ -0,0 +1,558 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s --check-prefix=X86-64
+; RUN: llc -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq < %s | FileCheck %s --check-prefix=X86-32
+
+define void @test_fcmp_storefloat(i1 %cond, float* %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
+; X86-64-LABEL: test_fcmp_storefloat:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB0_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k1
+; X86-64-NEXT: jmp .LBB0_3
+; X86-64-NEXT: .LBB0_2: # %else
+; X86-64-NEXT: vcmpeqss %xmm5, %xmm4, %k1
+; X86-64-NEXT: .LBB0_3: # %exit
+; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1}
+; X86-64-NEXT: vmovss %xmm1, (%rsi)
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_fcmp_storefloat:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB0_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm2, %k1
+; X86-32-NEXT: jmp .LBB0_3
+; X86-32-NEXT: .LBB0_2: # %else
+; X86-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm2, %k1
+; X86-32-NEXT: .LBB0_3: # %exit
+; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X86-32-NEXT: vmovss %xmm0, (%eax)
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %cmp1 = fcmp oeq float %f3, %f4
+ br label %exit
+
+else:
+ %cmp2 = fcmp oeq float %f5, %f6
+ br label %exit
+
+exit:
+ %val = phi i1 [%cmp1, %if], [%cmp2, %else]
+ %selected = select i1 %val, float %f1, float %f2
+ store float %selected, float* %fptr
+ ret void
+}
+
+define void @test_fcmp_storei1(i1 %cond, float* %fptr, i1* %iptr, float %f1, float %f2, float %f3, float %f4) {
+; X86-64-LABEL: test_fcmp_storei1:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB1_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: vcmpeqss %xmm1, %xmm0, %k0
+; X86-64-NEXT: jmp .LBB1_3
+; X86-64-NEXT: .LBB1_2: # %else
+; X86-64-NEXT: vcmpeqss %xmm3, %xmm2, %k0
+; X86-64-NEXT: .LBB1_3: # %exit
+; X86-64-NEXT: kmovd %k0, %eax
+; X86-64-NEXT: andb $1, %al
+; X86-64-NEXT: movb %al, (%rdx)
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_fcmp_storei1:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB1_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0
+; X86-32-NEXT: jmp .LBB1_3
+; X86-32-NEXT: .LBB1_2: # %else
+; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-32-NEXT: vcmpeqss {{[0-9]+}}(%esp), %xmm0, %k0
+; X86-32-NEXT: .LBB1_3: # %exit
+; X86-32-NEXT: kmovd %k0, %ecx
+; X86-32-NEXT: andb $1, %cl
+; X86-32-NEXT: movb %cl, (%eax)
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %cmp1 = fcmp oeq float %f1, %f2
+ br label %exit
+
+else:
+ %cmp2 = fcmp oeq float %f3, %f4
+ br label %exit
+
+exit:
+ %val = phi i1 [%cmp1, %if], [%cmp2, %else]
+ store i1 %val, i1* %iptr
+ ret void
+}
+
+define void @test_load_add(i1 %cond, float* %fptr, i1* %iptr1, i1* %iptr2, float %f1, float %f2) {
+; X86-64-LABEL: test_load_add:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB2_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: kmovb (%rdx), %k0
+; X86-64-NEXT: kmovb (%rcx), %k1
+; X86-64-NEXT: kaddb %k1, %k0, %k1
+; X86-64-NEXT: jmp .LBB2_3
+; X86-64-NEXT: .LBB2_2: # %else
+; X86-64-NEXT: kmovb (%rcx), %k1
+; X86-64-NEXT: .LBB2_3: # %exit
+; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1}
+; X86-64-NEXT: vmovss %xmm1, (%rsi)
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_load_add:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB2_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-32-NEXT: kmovb (%edx), %k0
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: kaddb %k1, %k0, %k1
+; X86-32-NEXT: jmp .LBB2_3
+; X86-32-NEXT: .LBB2_2: # %else
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: .LBB2_3: # %exit
+; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X86-32-NEXT: vmovss %xmm0, (%eax)
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i1, i1* %iptr1
+ %loaded2if = load i1, i1* %iptr2
+ %added = add i1 %loaded1, %loaded2if
+ br label %exit
+
+else:
+ %loaded2else = load i1, i1* %iptr2
+ br label %exit
+
+exit:
+ %val = phi i1 [%added, %if], [%loaded2else, %else]
+ %selected = select i1 %val, float %f1, float %f2
+ store float %selected, float* %fptr
+ ret void
+}
+
+define void @test_load_i1(i1 %cond, float* %fptr, i1* %iptr1, i1* %iptr2, float %f1, float %f2) {
+; X86-64-LABEL: test_load_i1:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB3_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: kmovb (%rdx), %k1
+; X86-64-NEXT: jmp .LBB3_3
+; X86-64-NEXT: .LBB3_2: # %else
+; X86-64-NEXT: kmovb (%rcx), %k1
+; X86-64-NEXT: .LBB3_3: # %exit
+; X86-64-NEXT: vmovss %xmm0, %xmm0, %xmm1 {%k1}
+; X86-64-NEXT: vmovss %xmm1, (%rsi)
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_load_i1:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB3_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: jmp .LBB3_3
+; X86-32-NEXT: .LBB3_2: # %else
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: .LBB3_3: # %exit
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X86-32-NEXT: vmovss %xmm0, (%eax)
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i1, i1* %iptr1
+ br label %exit
+
+else:
+ %loaded2 = load i1, i1* %iptr2
+ br label %exit
+
+exit:
+ %val = phi i1 [%loaded1, %if], [%loaded2, %else]
+ %selected = select i1 %val, float %f1, float %f2
+ store float %selected, float* %fptr
+ ret void
+}
+
+define void @test_loadi1_storei1(i1 %cond, i1* %iptr1, i1* %iptr2, i1* %iptr3) {
+; X86-64-LABEL: test_loadi1_storei1:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB4_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: movb (%rsi), %al
+; X86-64-NEXT: jmp .LBB4_3
+; X86-64-NEXT: .LBB4_2: # %else
+; X86-64-NEXT: movb (%rdx), %al
+; X86-64-NEXT: .LBB4_3: # %exit
+; X86-64-NEXT: andb $1, %al
+; X86-64-NEXT: movb %al, (%rcx)
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_loadi1_storei1:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB4_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: jmp .LBB4_3
+; X86-32-NEXT: .LBB4_2: # %else
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: .LBB4_3: # %exit
+; X86-32-NEXT: movb (%ecx), %cl
+; X86-32-NEXT: andb $1, %cl
+; X86-32-NEXT: movb %cl, (%eax)
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i1, i1* %iptr1
+ br label %exit
+
+else:
+ %loaded2 = load i1, i1* %iptr2
+ br label %exit
+
+exit:
+ %val = phi i1 [%loaded1, %if], [%loaded2, %else]
+ store i1 %val, i1* %iptr3
+ ret void
+}
+
+define void @test_shl1(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) {
+; X86-64-LABEL: test_shl1:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB5_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: kmovb (%rsi), %k0
+; X86-64-NEXT: kaddb %k0, %k0, %k1
+; X86-64-NEXT: jmp .LBB5_3
+; X86-64-NEXT: .LBB5_2: # %else
+; X86-64-NEXT: kmovb (%rdx), %k1
+; X86-64-NEXT: .LBB5_3: # %exit
+; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-64-NEXT: vmovaps %ymm1, (%rcx)
+; X86-64-NEXT: vzeroupper
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_shl1:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB5_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: kmovb (%ecx), %k0
+; X86-32-NEXT: kaddb %k0, %k0, %k1
+; X86-32-NEXT: jmp .LBB5_3
+; X86-32-NEXT: .LBB5_2: # %else
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: .LBB5_3: # %exit
+; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-32-NEXT: vmovaps %ymm1, (%eax)
+; X86-32-NEXT: vzeroupper
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i8, i8* %ptr1
+ %shifted = shl i8 %loaded1, 1
+ br label %exit
+
+else:
+ %loaded2 = load i8, i8* %ptr2
+ br label %exit
+
+exit:
+ %val = phi i8 [%shifted, %if], [%loaded2, %else]
+ %mask = bitcast i8 %val to <8 x i1>
+ %selected = select <8 x i1> %mask, <8 x float> %fvec1, <8 x float> %fvec2
+ store <8 x float> %selected, <8 x float>* %fptrvec
+ ret void
+}
+
+define void @test_shr1(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) {
+; X86-64-LABEL: test_shr1:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB6_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: movb (%rsi), %al
+; X86-64-NEXT: shrb %al
+; X86-64-NEXT: jmp .LBB6_3
+; X86-64-NEXT: .LBB6_2: # %else
+; X86-64-NEXT: movb (%rdx), %al
+; X86-64-NEXT: .LBB6_3: # %exit
+; X86-64-NEXT: kmovd %eax, %k1
+; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-64-NEXT: vmovaps %ymm1, (%rcx)
+; X86-64-NEXT: vzeroupper
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_shr1:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB6_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: movb (%ecx), %cl
+; X86-32-NEXT: shrb %cl
+; X86-32-NEXT: jmp .LBB6_3
+; X86-32-NEXT: .LBB6_2: # %else
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: movb (%ecx), %cl
+; X86-32-NEXT: .LBB6_3: # %exit
+; X86-32-NEXT: kmovd %ecx, %k1
+; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-32-NEXT: vmovaps %ymm1, (%eax)
+; X86-32-NEXT: vzeroupper
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i8, i8* %ptr1
+ %shifted = lshr i8 %loaded1, 1
+ br label %exit
+
+else:
+ %loaded2 = load i8, i8* %ptr2
+ br label %exit
+
+exit:
+ %val = phi i8 [%shifted, %if], [%loaded2, %else]
+ %mask = bitcast i8 %val to <8 x i1>
+ %selected = select <8 x i1> %mask, <8 x float> %fvec1, <8 x float> %fvec2
+ store <8 x float> %selected, <8 x float>* %fptrvec
+ ret void
+}
+
+define void @test_shr2(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) {
+; X86-64-LABEL: test_shr2:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB7_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: kmovb (%rsi), %k0
+; X86-64-NEXT: kshiftrb $2, %k0, %k1
+; X86-64-NEXT: jmp .LBB7_3
+; X86-64-NEXT: .LBB7_2: # %else
+; X86-64-NEXT: kmovb (%rdx), %k1
+; X86-64-NEXT: .LBB7_3: # %exit
+; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-64-NEXT: vmovaps %ymm1, (%rcx)
+; X86-64-NEXT: vzeroupper
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_shr2:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB7_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: kmovb (%ecx), %k0
+; X86-32-NEXT: kshiftrb $2, %k0, %k1
+; X86-32-NEXT: jmp .LBB7_3
+; X86-32-NEXT: .LBB7_2: # %else
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: .LBB7_3: # %exit
+; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-32-NEXT: vmovaps %ymm1, (%eax)
+; X86-32-NEXT: vzeroupper
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i8, i8* %ptr1
+ %shifted = lshr i8 %loaded1, 2
+ br label %exit
+
+else:
+ %loaded2 = load i8, i8* %ptr2
+ br label %exit
+
+exit:
+ %val = phi i8 [%shifted, %if], [%loaded2, %else]
+ %mask = bitcast i8 %val to <8 x i1>
+ %selected = select <8 x i1> %mask, <8 x float> %fvec1, <8 x float> %fvec2
+ store <8 x float> %selected, <8 x float>* %fptrvec
+ ret void
+}
+
+define void @test_shl(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) {
+; X86-64-LABEL: test_shl:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB8_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: kmovb (%rsi), %k0
+; X86-64-NEXT: kshiftlb $6, %k0, %k1
+; X86-64-NEXT: jmp .LBB8_3
+; X86-64-NEXT: .LBB8_2: # %else
+; X86-64-NEXT: kmovb (%rdx), %k1
+; X86-64-NEXT: .LBB8_3: # %exit
+; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-64-NEXT: vmovaps %ymm1, (%rcx)
+; X86-64-NEXT: vzeroupper
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_shl:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB8_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: kmovb (%ecx), %k0
+; X86-32-NEXT: kshiftlb $6, %k0, %k1
+; X86-32-NEXT: jmp .LBB8_3
+; X86-32-NEXT: .LBB8_2: # %else
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: .LBB8_3: # %exit
+; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-32-NEXT: vmovaps %ymm1, (%eax)
+; X86-32-NEXT: vzeroupper
+; X86-32-NEXT: retl
+entry:
+ br i1 %cond, label %if, label %else
+
+if:
+ %loaded1 = load i8, i8* %ptr1
+ %shifted = shl i8 %loaded1, 6
+ br label %exit
+
+else:
+ %loaded2 = load i8, i8* %ptr2
+ br label %exit
+
+exit:
+ %val = phi i8 [%shifted, %if], [%loaded2, %else]
+ %mask = bitcast i8 %val to <8 x i1>
+ %selected = select <8 x i1> %mask, <8 x float> %fvec1, <8 x float> %fvec2
+ store <8 x float> %selected, <8 x float>* %fptrvec
+ ret void
+}
+
+define void @test_add(i1 %cond, i8* %ptr1, i8* %ptr2, <8 x float> %fvec1, <8 x float> %fvec2, <8 x float>* %fptrvec) {
+; X86-64-LABEL: test_add:
+; X86-64: # %bb.0: # %entry
+; X86-64-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-64-NEXT: kmovb (%rsi), %k0
+; X86-64-NEXT: kmovb (%rdx), %k1
+; X86-64-NEXT: testb $1, %dil
+; X86-64-NEXT: je .LBB9_2
+; X86-64-NEXT: # %bb.1: # %if
+; X86-64-NEXT: kandb %k1, %k0, %k1
+; X86-64-NEXT: jmp .LBB9_3
+; X86-64-NEXT: .LBB9_2: # %else
+; X86-64-NEXT: kaddb %k1, %k0, %k1
+; X86-64-NEXT: .LBB9_3: # %exit
+; X86-64-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-64-NEXT: vmovaps %ymm1, (%rcx)
+; X86-64-NEXT: vzeroupper
+; X86-64-NEXT: retq
+;
+; X86-32-LABEL: test_add:
+; X86-32: # %bb.0: # %entry
+; X86-32-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; X86-32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-32-NEXT: kmovb (%edx), %k0
+; X86-32-NEXT: kmovb (%ecx), %k1
+; X86-32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-32-NEXT: je .LBB9_2
+; X86-32-NEXT: # %bb.1: # %if
+; X86-32-NEXT: kandb %k1, %k0, %k1
+; X86-32-NEXT: jmp .LBB9_3
+; X86-32-NEXT: .LBB9_2: # %else
+; X86-32-NEXT: kaddb %k1, %k0, %k1
+; X86-32-NEXT: .LBB9_3: # %exit
+; X86-32-NEXT: vmovaps %zmm0, %zmm1 {%k1}
+; X86-32-NEXT: vmovaps %ymm1, (%eax)
+; X86-32-NEXT: vzeroupper
+; X86-32-NEXT: retl
+entry:
+ %loaded1 = load i8, i8* %ptr1
+ %loaded2 = load i8, i8* %ptr2
+ br i1 %cond, label %if, label %else
+
+if:
+ %and = and i8 %loaded1, %loaded2
+ br label %exit
+
+else:
+ %add = add i8 %loaded1, %loaded2
+ br label %exit
+
+exit:
+ %val = phi i8 [%and, %if], [%add, %else]
+ %mask = bitcast i8 %val to <8 x i1>
+ %selected = select <8 x i1> %mask, <8 x float> %fvec1, <8 x float> %fvec2
+ store <8 x float> %selected, <8 x float>* %fptrvec
+ ret void
+}
diff --git a/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll b/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll
new file mode 100644
index 000000000000..4e39db59f8f9
--- /dev/null
+++ b/test/CodeGen/X86/greedy_regalloc_bad_eviction_sequence.ll
@@ -0,0 +1,116 @@
+; RUN: llc < %s -march=x86 -regalloc=greedy -stop-after=greedy | FileCheck %s
+; Make sure a bad eviction sequence doesn't occur
+
+; Part of the fix for bugzilla 26810.
+; This test is meant to make sure a bad eviction sequence like the one described
+; below does not occur.
+;
+; movl %ebp, 8(%esp) # 4-byte Spill
+; movl %ecx, %ebp
+; movl %ebx, %ecx
+; movl %edi, %ebx
+; movl %edx, %edi
+; cltd
+; idivl %esi
+; movl %edi, %edx
+; movl %ebx, %edi
+; movl %ecx, %ebx
+; movl %ebp, %ecx
+; movl 16(%esp), %ebp # 4-byte Reload
+
+; Make sure we have no redundant copies in the problematic code section
+; CHECK-LABEL: name: bar
+; CHECK: bb.3.for.body:
+; CHECK: %eax = COPY
+; CHECK-NEXT: CDQ
+; CHECK-NEXT: IDIV32r
+; CHECK-NEXT: ADD32rr
+
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i386-pc-linux-gnu"
+
+
+; Function Attrs: norecurse nounwind readonly
+define i32 @bar(i32 %size, i32* nocapture readonly %arr, i32* nocapture readnone %tmp) local_unnamed_addr #1 {
+entry:
+ %0 = load i32, i32* %arr, align 4, !tbaa !3
+ %arrayidx3 = getelementptr inbounds i32, i32* %arr, i32 1
+ %1 = load i32, i32* %arrayidx3, align 4, !tbaa !3
+ %arrayidx5 = getelementptr inbounds i32, i32* %arr, i32 2
+ %2 = load i32, i32* %arrayidx5, align 4, !tbaa !3
+ %arrayidx7 = getelementptr inbounds i32, i32* %arr, i32 3
+ %3 = load i32, i32* %arrayidx7, align 4, !tbaa !3
+ %arrayidx9 = getelementptr inbounds i32, i32* %arr, i32 4
+ %4 = load i32, i32* %arrayidx9, align 4, !tbaa !3
+ %arrayidx11 = getelementptr inbounds i32, i32* %arr, i32 5
+ %5 = load i32, i32* %arrayidx11, align 4, !tbaa !3
+ %arrayidx13 = getelementptr inbounds i32, i32* %arr, i32 6
+ %6 = load i32, i32* %arrayidx13, align 4, !tbaa !3
+ %arrayidx15 = getelementptr inbounds i32, i32* %arr, i32 7
+ %7 = load i32, i32* %arrayidx15, align 4, !tbaa !3
+ %arrayidx17 = getelementptr inbounds i32, i32* %arr, i32 8
+ %8 = load i32, i32* %arrayidx17, align 4, !tbaa !3
+ %cmp69 = icmp sgt i32 %size, 1
+ br i1 %cmp69, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %x0.0.lcssa = phi i32 [ %0, %entry ], [ %add, %for.body ]
+ %x1.0.lcssa = phi i32 [ %1, %entry ], [ %sub, %for.body ]
+ %x2.0.lcssa = phi i32 [ %2, %entry ], [ %mul, %for.body ]
+ %x3.0.lcssa = phi i32 [ %3, %entry ], [ %div, %for.body ]
+ %x4.0.lcssa = phi i32 [ %4, %entry ], [ %add19, %for.body ]
+ %x5.0.lcssa = phi i32 [ %5, %entry ], [ %sub20, %for.body ]
+ %x6.0.lcssa = phi i32 [ %6, %entry ], [ %add21, %for.body ]
+ %x7.0.lcssa = phi i32 [ %7, %entry ], [ %mul22, %for.body ]
+ %x8.0.lcssa = phi i32 [ %8, %entry ], [ %sub23, %for.body ]
+ %mul24 = mul nsw i32 %x1.0.lcssa, %x0.0.lcssa
+ %mul25 = mul nsw i32 %mul24, %x2.0.lcssa
+ %mul26 = mul nsw i32 %mul25, %x3.0.lcssa
+ %mul27 = mul nsw i32 %mul26, %x4.0.lcssa
+ %mul28 = mul nsw i32 %mul27, %x5.0.lcssa
+ %mul29 = mul nsw i32 %mul28, %x6.0.lcssa
+ %mul30 = mul nsw i32 %mul29, %x7.0.lcssa
+ %mul31 = mul nsw i32 %mul30, %x8.0.lcssa
+ ret i32 %mul31
+
+for.body: ; preds = %entry, %for.body
+ %i.079 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
+ %x8.078 = phi i32 [ %sub23, %for.body ], [ %8, %entry ]
+ %x7.077 = phi i32 [ %mul22, %for.body ], [ %7, %entry ]
+ %x6.076 = phi i32 [ %add21, %for.body ], [ %6, %entry ]
+ %x5.075 = phi i32 [ %sub20, %for.body ], [ %5, %entry ]
+ %x4.074 = phi i32 [ %add19, %for.body ], [ %4, %entry ]
+ %x3.073 = phi i32 [ %div, %for.body ], [ %3, %entry ]
+ %x2.072 = phi i32 [ %mul, %for.body ], [ %2, %entry ]
+ %x1.071 = phi i32 [ %sub, %for.body ], [ %1, %entry ]
+ %x0.070 = phi i32 [ %add, %for.body ], [ %0, %entry ]
+ %add = add nsw i32 %x1.071, %x0.070
+ %sub = sub nsw i32 %x1.071, %x2.072
+ %mul = mul nsw i32 %x3.073, %x2.072
+ %div = sdiv i32 %x3.073, %x4.074
+ %add19 = add nsw i32 %x5.075, %x4.074
+ %sub20 = sub nsw i32 %x5.075, %x6.076
+ %add21 = add nsw i32 %x7.077, %x6.076
+ %mul22 = mul nsw i32 %x8.078, %x7.077
+ %sub23 = sub nsw i32 %x8.078, %add
+ %inc = add nuw nsw i32 %i.079, 1
+ %exitcond = icmp eq i32 %inc, %size
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7
+}
+
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 2}
+!2 = !{!"clang version 5.0.0 (cfe/trunk 305640)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = distinct !{!7, !8}
+!8 = !{!"llvm.loop.unroll.disable"}
diff --git a/test/CodeGen/X86/h-register-addressing-32.ll b/test/CodeGen/X86/h-register-addressing-32.ll
index d0214137b0e4..8a86a421737b 100644
--- a/test/CodeGen/X86/h-register-addressing-32.ll
+++ b/test/CodeGen/X86/h-register-addressing-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=-bmi | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi | FileCheck %s
; Use h-register extract and zero-extend.
diff --git a/test/CodeGen/X86/h-register-addressing-64.ll b/test/CodeGen/X86/h-register-addressing-64.ll
index b3159f4896a8..8d8b44ce2826 100644
--- a/test/CodeGen/X86/h-register-addressing-64.ll
+++ b/test/CodeGen/X86/h-register-addressing-64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=-bmi | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-bmi | FileCheck %s
; Use h-register extract and zero-extend.
diff --git a/test/CodeGen/X86/h-register-store.ll b/test/CodeGen/X86/h-register-store.ll
index 0e6a0236d2c3..dbf978276126 100644
--- a/test/CodeGen/X86/h-register-store.ll
+++ b/test/CodeGen/X86/h-register-store.ll
@@ -25,7 +25,7 @@
; W64: movb %ch, (%rdx)
; W64-NOT: mov
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X86
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s -check-prefix=X86
; X86-NOT: mov
; X86: movb %ah, (%e
; X86-NOT: mov
diff --git a/test/CodeGen/X86/h-registers-0.ll b/test/CodeGen/X86/h-registers-0.ll
index 9b72916ea743..5f459c3c4598 100644
--- a/test/CodeGen/X86/h-registers-0.ll
+++ b/test/CodeGen/X86/h-registers-0.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-linux-gnux32 | FileCheck %s -check-prefix=X86-64
; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
-; RUN: llc < %s -mattr=-bmi -march=x86 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -mattr=-bmi -mtriple=i686-- | FileCheck %s -check-prefix=X86-32
; Use h registers. On x86-64, codegen doesn't support general allocation
; of h registers yet, due to x86 encoding complications.
diff --git a/test/CodeGen/X86/h-registers-1.ll b/test/CodeGen/X86/h-registers-1.ll
index 469d5517b40b..9daf563455d7 100644
--- a/test/CodeGen/X86/h-registers-1.ll
+++ b/test/CodeGen/X86/h-registers-1.ll
@@ -1,24 +1,84 @@
-; RUN: llc -mattr=-bmi < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc -mattr=-bmi < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-bmi | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -mattr=-bmi | FileCheck %s --check-prefix=GNUX32
; LLVM creates virtual registers for values live across blocks
; based on the type of the value. Make sure that the extracts
; here use the GR64_NOREX register class for their result,
; instead of plain GR64.
-; CHECK: foo:
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: movzbl %{{[abcd]}}h, %e
-; CHECK: ret
-
-define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d,
- i64 %e, i64 %f, i64 %g, i64 %h) {
+define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: .cfi_offset %rbx, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movzbl %bh, %esi # NOREX
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: movq %rax, %r10
+; CHECK-NEXT: movzbl %dh, %edx # NOREX
+; CHECK-NEXT: movzbl %ch, %eax # NOREX
+; CHECK-NEXT: movq %rax, %r11
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: movzbl %ah, %ecx # NOREX
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: movzbl %ah, %ebp # NOREX
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT: movzbl %bh, %edi # NOREX
+; CHECK-NEXT: movq %r10, %r8
+; CHECK-NEXT: addq %r8, %rsi
+; CHECK-NEXT: addq %r11, %rdx
+; CHECK-NEXT: addq %rsi, %rdx
+; CHECK-NEXT: addq %rbp, %rcx
+; CHECK-NEXT: addq %rdi, %rax
+; CHECK-NEXT: addq %rcx, %rax
+; CHECK-NEXT: addq %rdx, %rax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+;
+; GNUX32-LABEL: foo:
+; GNUX32: # %bb.0:
+; GNUX32-NEXT: pushq %rbp
+; GNUX32-NEXT: .cfi_def_cfa_offset 16
+; GNUX32-NEXT: pushq %rbx
+; GNUX32-NEXT: .cfi_def_cfa_offset 24
+; GNUX32-NEXT: .cfi_offset %rbx, -24
+; GNUX32-NEXT: .cfi_offset %rbp, -16
+; GNUX32-NEXT: movq %rsi, %rax
+; GNUX32-NEXT: movq %rdi, %rbx
+; GNUX32-NEXT: movzbl %bh, %esi # NOREX
+; GNUX32-NEXT: movzbl %ah, %eax # NOREX
+; GNUX32-NEXT: movq %rax, %r10
+; GNUX32-NEXT: movzbl %dh, %edx # NOREX
+; GNUX32-NEXT: movzbl %ch, %eax # NOREX
+; GNUX32-NEXT: movq %rax, %r11
+; GNUX32-NEXT: movq %r8, %rax
+; GNUX32-NEXT: movzbl %ah, %ecx # NOREX
+; GNUX32-NEXT: movq %r9, %rax
+; GNUX32-NEXT: movzbl %ah, %ebp # NOREX
+; GNUX32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GNUX32-NEXT: movzbl %ah, %eax # NOREX
+; GNUX32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; GNUX32-NEXT: movzbl %bh, %edi # NOREX
+; GNUX32-NEXT: movq %r10, %r8
+; GNUX32-NEXT: addq %r8, %rsi
+; GNUX32-NEXT: addq %r11, %rdx
+; GNUX32-NEXT: addq %rsi, %rdx
+; GNUX32-NEXT: addq %rbp, %rcx
+; GNUX32-NEXT: addq %rdi, %rax
+; GNUX32-NEXT: addq %rcx, %rax
+; GNUX32-NEXT: addq %rdx, %rax
+; GNUX32-NEXT: popq %rbx
+; GNUX32-NEXT: popq %rbp
+; GNUX32-NEXT: retq
%sa = lshr i64 %a, 8
%A = and i64 %sa, 255
%sb = lshr i64 %b, 8
diff --git a/test/CodeGen/X86/h-registers-2.ll b/test/CodeGen/X86/h-registers-2.ll
index d244ab48a2cd..e7aeb4adf2b4 100644
--- a/test/CodeGen/X86/h-registers-2.ll
+++ b/test/CodeGen/X86/h-registers-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; Use an h register, but don't omit the explicit shift for
; non-address use(s).
diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll
index fd023d018031..2b8b8c909d17 100644
--- a/test/CodeGen/X86/haddsub-2.ll
+++ b/test/CodeGen/X86/haddsub-2.ll
@@ -6,12 +6,12 @@
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -35,12 +35,12 @@ define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
@@ -64,12 +64,12 @@ define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -93,12 +93,12 @@ define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 2
@@ -122,7 +122,7 @@ define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
@@ -151,12 +151,12 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
@@ -180,7 +180,7 @@ define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
@@ -209,12 +209,12 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 2
@@ -238,7 +238,7 @@ define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
@@ -267,12 +267,12 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
@@ -296,7 +296,7 @@ define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
@@ -325,12 +325,12 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 2
@@ -354,12 +354,12 @@ define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
@@ -375,12 +375,12 @@ define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 1
@@ -396,12 +396,12 @@ define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
@@ -417,12 +417,12 @@ define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %B, i32 0
@@ -438,14 +438,14 @@ define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_pd_test:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -473,14 +473,14 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_pd_test:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -508,7 +508,7 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
@@ -562,14 +562,14 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -578,7 +578,7 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -622,36 +622,24 @@ define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
-; SSE3-NEXT: .Lcfi0:
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
-; SSE3-NEXT: .Lcfi1:
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
-; SSE3-NEXT: .Lcfi2:
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
-; SSE3-NEXT: .Lcfi3:
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
-; SSE3-NEXT: .Lcfi4:
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
-; SSE3-NEXT: .Lcfi5:
; SSE3-NEXT: .cfi_def_cfa_offset 56
-; SSE3-NEXT: .Lcfi6:
; SSE3-NEXT: .cfi_offset %rbx, -56
-; SSE3-NEXT: .Lcfi7:
; SSE3-NEXT: .cfi_offset %r12, -48
-; SSE3-NEXT: .Lcfi8:
; SSE3-NEXT: .cfi_offset %r13, -40
-; SSE3-NEXT: .Lcfi9:
; SSE3-NEXT: .cfi_offset %r14, -32
-; SSE3-NEXT: .Lcfi10:
; SSE3-NEXT: .cfi_offset %r15, -24
-; SSE3-NEXT: .Lcfi11:
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
@@ -744,14 +732,14 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -760,7 +748,7 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -838,7 +826,7 @@ define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm2, %ecx
@@ -867,7 +855,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vpextrd $1, %xmm0, %ecx
; AVX-NEXT: subl %ecx, %eax
@@ -906,7 +894,7 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: movaps %xmm0, %xmm3
@@ -923,11 +911,11 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: subss %xmm4, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
@@ -963,7 +951,7 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: subsd %xmm2, %xmm1
@@ -975,7 +963,7 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
@@ -998,13 +986,13 @@ define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_ps:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
@@ -1044,13 +1032,13 @@ define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm2, %xmm0
; SSE-NEXT: hsubps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_ps:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
@@ -1090,13 +1078,13 @@ define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm2, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hadd_pd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
@@ -1120,13 +1108,13 @@ define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm2, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hsub_pd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
@@ -1152,7 +1140,7 @@ define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
@@ -1206,13 +1194,13 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_d:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm2, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_d:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
@@ -1221,7 +1209,7 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_d:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
@@ -1261,36 +1249,24 @@ define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
-; SSE3-NEXT: .Lcfi12:
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
-; SSE3-NEXT: .Lcfi13:
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
-; SSE3-NEXT: .Lcfi14:
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
-; SSE3-NEXT: .Lcfi15:
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
-; SSE3-NEXT: .Lcfi16:
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
-; SSE3-NEXT: .Lcfi17:
; SSE3-NEXT: .cfi_def_cfa_offset 56
-; SSE3-NEXT: .Lcfi18:
; SSE3-NEXT: .cfi_offset %rbx, -56
-; SSE3-NEXT: .Lcfi19:
; SSE3-NEXT: .cfi_offset %r12, -48
-; SSE3-NEXT: .Lcfi20:
; SSE3-NEXT: .cfi_offset %r13, -40
-; SSE3-NEXT: .Lcfi21:
; SSE3-NEXT: .cfi_offset %r14, -32
-; SSE3-NEXT: .Lcfi22:
; SSE3-NEXT: .cfi_offset %r15, -24
-; SSE3-NEXT: .Lcfi23:
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %r10d
@@ -1383,13 +1359,13 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm2, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_w:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
@@ -1398,7 +1374,7 @@ define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_w:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
diff --git a/test/CodeGen/X86/haddsub-shuf.ll b/test/CodeGen/X86/haddsub-shuf.ll
new file mode 100644
index 000000000000..3b126b7b6dfc
--- /dev/null
+++ b/test/CodeGen/X86/haddsub-shuf.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
+
+; The next 8 tests check for matching the horizontal op and eliminating the shuffle.
+; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
+
+define <4 x float> @hadd_v4f32(<4 x float> %a) {
+; SSSE3-LABEL: hadd_v4f32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: haddps %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hadd_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
+ %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
+ %hop = fadd <2 x float> %a02, %a13
+ %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
+ ret <4 x float> %shuf
+}
+
+define <4 x float> @hsub_v4f32(<4 x float> %a) {
+; SSSE3-LABEL: hsub_v4f32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: hsubps %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hsub_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a02 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 2>
+ %a13 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 1, i32 3>
+ %hop = fsub <2 x float> %a02, %a13
+ %shuf = shufflevector <2 x float> %hop, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x float> %shuf
+}
+
+define <2 x double> @hadd_v2f64(<2 x double> %a) {
+; SSSE3-LABEL: hadd_v2f64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: haddpd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hadd_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+ %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+ %hop = fadd <2 x double> %a0, %a1
+ %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %shuf
+}
+
+define <2 x double> @hsub_v2f64(<2 x double> %a) {
+; SSSE3-LABEL: hsub_v2f64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: hsubpd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hsub_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a0 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
+ %a1 = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+ %hop = fsub <2 x double> %a0, %a1
+ %shuf = shufflevector <2 x double> %hop, <2 x double> undef, <2 x i32> <i32 undef, i32 0>
+ ret <2 x double> %shuf
+}
+
+define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
+; SSSE3-LABEL: hadd_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hadd_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+ %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %hop = add <4 x i32> %a02, %a13
+ %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
+ ret <4 x i32> %shuf
+}
+
+define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
+; SSSE3-LABEL: hsub_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubd %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hsub_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a02 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+ %a13 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %hop = sub <4 x i32> %a02, %a13
+ %shuf = shufflevector <4 x i32> %hop, <4 x i32> undef, <4 x i32> <i32 undef, i32 1, i32 0, i32 undef>
+ ret <4 x i32> %shuf
+}
+
+define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
+; SSSE3-LABEL: hadd_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hadd_v8i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vphaddw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
+ %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %hop = add <8 x i16> %a0246, %a1357
+ %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i16> %shuf
+}
+
+define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
+; SSSE3-LABEL: hsub_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubw %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: hsub_v8i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vphsubw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %a0246 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
+ %a1357 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %hop = sub <8 x i16> %a0246, %a1357
+ %shuf = shufflevector <8 x i16> %hop, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 1, i32 undef, i32 3>
+ ret <8 x i16> %shuf
+}
+
diff --git a/test/CodeGen/X86/haddsub-undef.ll b/test/CodeGen/X86/haddsub-undef.ll
index 091d1a22dbcd..d34f8985cff3 100644
--- a/test/CodeGen/X86/haddsub-undef.ll
+++ b/test/CodeGen/X86/haddsub-undef.ll
@@ -7,12 +7,12 @@
define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test1_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
@@ -32,12 +32,12 @@ define <4 x float> @test1_undef(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
@@ -57,12 +57,12 @@ define <4 x float> @test2_undef(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test3_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test3_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
@@ -82,13 +82,13 @@ define <4 x float> @test3_undef(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test4_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -101,7 +101,7 @@ define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test5_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm0, %xmm1
@@ -109,7 +109,7 @@ define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-NEXT: retq
;
; AVX-LABEL: test5_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -122,12 +122,12 @@ define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test6_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test6_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
@@ -143,12 +143,12 @@ define <4 x float> @test6_undef(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test7_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test7_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %b, i32 0
@@ -164,19 +164,19 @@ define <4 x float> @test7_undef(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test8_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: addss %xmm2, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test8_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
@@ -197,12 +197,12 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test9_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %a, i32 0
@@ -218,12 +218,12 @@ define <4 x float> @test9_undef(<4 x float> %a, <4 x float> %b) {
define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test10_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test10_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
@@ -239,7 +239,7 @@ define <8 x float> @test10_undef(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test11_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
@@ -248,7 +248,7 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
; SSE-NEXT: retq
;
; AVX-LABEL: test11_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
@@ -264,12 +264,12 @@ define <8 x float> @test11_undef(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test12_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
@@ -285,12 +285,12 @@ define <8 x float> @test12_undef(<8 x float> %a, <8 x float> %b) {
define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: test13_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test13_undef:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -315,17 +315,17 @@ define <8 x float> @test13_undef(<8 x float> %a, <8 x float> %b) {
define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test14_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test14_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
@@ -344,7 +344,7 @@ define <8 x i32> @test14_undef(<8 x i32> %a, <8 x i32> %b) {
; integer horizontal adds instead of two scalar adds followed by vector inserts.
define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test15_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm0, %ecx
@@ -359,7 +359,7 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test15_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vpextrd $1, %xmm0, %ecx
; AVX1-NEXT: addl %eax, %ecx
@@ -374,7 +374,7 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
@@ -390,17 +390,17 @@ define <8 x i32> @test15_undef(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test16_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test16_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
@@ -416,18 +416,18 @@ define <8 x i32> @test16_undef(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @test17_undef(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: test17_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test17_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/haddsub.ll b/test/CodeGen/X86/haddsub.ll
index 8e28433d2ac2..030de9c7f14d 100644
--- a/test/CodeGen/X86/haddsub.ll
+++ b/test/CodeGen/X86/haddsub.ll
@@ -4,12 +4,12 @@
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
@@ -20,12 +20,12 @@ define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: haddpd2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 1, i32 2>
@@ -36,12 +36,12 @@ define <2 x double> @haddpd2(<2 x double> %x, <2 x double> %y) {
define <2 x double> @haddpd3(<2 x double> %x) {
; SSE3-LABEL: haddpd3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddpd3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
@@ -52,12 +52,12 @@ define <2 x double> @haddpd3(<2 x double> %x) {
define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -68,12 +68,12 @@ define <4 x float> @haddps1(<4 x float> %x, <4 x float> %y) {
define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: haddps2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
@@ -84,12 +84,12 @@ define <4 x float> @haddps2(<4 x float> %x, <4 x float> %y) {
define <4 x float> @haddps3(<4 x float> %x) {
; SSE3-LABEL: haddps3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
@@ -100,12 +100,12 @@ define <4 x float> @haddps3(<4 x float> %x) {
define <4 x float> @haddps4(<4 x float> %x) {
; SSE3-LABEL: haddps4:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -116,12 +116,12 @@ define <4 x float> @haddps4(<4 x float> %x) {
define <4 x float> @haddps5(<4 x float> %x) {
; SSE3-LABEL: haddps5:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
@@ -132,12 +132,12 @@ define <4 x float> @haddps5(<4 x float> %x) {
define <4 x float> @haddps6(<4 x float> %x) {
; SSE3-LABEL: haddps6:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -148,12 +148,12 @@ define <4 x float> @haddps6(<4 x float> %x) {
define <4 x float> @haddps7(<4 x float> %x) {
; SSE3-LABEL: haddps7:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
@@ -164,12 +164,12 @@ define <4 x float> @haddps7(<4 x float> %x) {
define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
; SSE3-LABEL: hsubpd1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> %y, <2 x i32> <i32 0, i32 2>
@@ -180,12 +180,12 @@ define <2 x double> @hsubpd1(<2 x double> %x, <2 x double> %y) {
define <2 x double> @hsubpd2(<2 x double> %x) {
; SSE3-LABEL: hsubpd2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubpd2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <2 x double> %x, <2 x double> undef, <2 x i32> <i32 0, i32 undef>
@@ -196,12 +196,12 @@ define <2 x double> @hsubpd2(<2 x double> %x) {
define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
; SSE3-LABEL: hsubps1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -212,12 +212,12 @@ define <4 x float> @hsubps1(<4 x float> %x, <4 x float> %y) {
define <4 x float> @hsubps2(<4 x float> %x) {
; SSE3-LABEL: hsubps2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
@@ -228,12 +228,12 @@ define <4 x float> @hsubps2(<4 x float> %x) {
define <4 x float> @hsubps3(<4 x float> %x) {
; SSE3-LABEL: hsubps3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -244,12 +244,12 @@ define <4 x float> @hsubps3(<4 x float> %x) {
define <4 x float> @hsubps4(<4 x float> %x) {
; SSE3-LABEL: hsubps4:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: hsubps4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -260,13 +260,13 @@ define <4 x float> @hsubps4(<4 x float> %x) {
define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -277,13 +277,13 @@ define <8 x float> @vhaddps1(<8 x float> %x, <8 x float> %y) {
define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhaddps2:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm2, %xmm0
; SSE3-NEXT: haddps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 1, i32 2, i32 9, i32 10, i32 5, i32 6, i32 13, i32 14>
@@ -294,13 +294,13 @@ define <8 x float> @vhaddps2(<8 x float> %x, <8 x float> %y) {
define <8 x float> @vhaddps3(<8 x float> %x) {
; SSE3-LABEL: vhaddps3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: haddps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddps3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
@@ -311,13 +311,13 @@ define <8 x float> @vhaddps3(<8 x float> %x) {
define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
; SSE3-LABEL: vhsubps1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm2, %xmm0
; SSE3-NEXT: hsubps %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
@@ -328,13 +328,13 @@ define <8 x float> @vhsubps1(<8 x float> %x, <8 x float> %y) {
define <8 x float> @vhsubps3(<8 x float> %x) {
; SSE3-LABEL: vhsubps3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubps %xmm0, %xmm0
; SSE3-NEXT: hsubps %xmm1, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubps3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm0, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 undef, i32 2, i32 8, i32 10, i32 4, i32 6, i32 undef, i32 14>
@@ -345,13 +345,13 @@ define <8 x float> @vhsubps3(<8 x float> %x) {
define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhaddpd1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddpd %xmm2, %xmm0
; SSE3-NEXT: haddpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhaddpd1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -362,13 +362,13 @@ define <4 x double> @vhaddpd1(<4 x double> %x, <4 x double> %y) {
define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
; SSE3-LABEL: vhsubpd1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: hsubpd %xmm2, %xmm0
; SSE3-NEXT: hsubpd %xmm3, %xmm1
; SSE3-NEXT: retq
;
; AVX-LABEL: vhsubpd1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%a = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -379,12 +379,12 @@ define <4 x double> @vhsubpd1(<4 x double> %x, <4 x double> %y) {
define <2 x float> @haddps_v2f32(<4 x float> %v0) {
; SSE3-LABEL: haddps_v2f32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: haddps %xmm0, %xmm0
; SSE3-NEXT: retq
;
; AVX-LABEL: haddps_v2f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%v0.0 = extractelement <4 x float> %v0, i32 0
@@ -397,3 +397,4 @@ define <2 x float> @haddps_v2f32(<4 x float> %v0) {
%res1 = insertelement <2 x float> %res0, float %op1, i32 1
ret <2 x float> %res1
}
+
diff --git a/test/CodeGen/X86/half.ll b/test/CodeGen/X86/half.ll
index b7c43d3b2e3e..20db4a5e3889 100644
--- a/test/CodeGen/X86/half.ll
+++ b/test/CodeGen/X86/half.ll
@@ -10,19 +10,19 @@
define void @test_load_store(half* %in, half* %out) #0 {
; BWON-LABEL: test_load_store:
-; BWON: # BB#0:
+; BWON: # %bb.0:
; BWON-NEXT: movzwl (%rdi), %eax
; BWON-NEXT: movw %ax, (%rsi)
; BWON-NEXT: retq
;
; BWOFF-LABEL: test_load_store:
-; BWOFF: # BB#0:
+; BWOFF: # %bb.0:
; BWOFF-NEXT: movw (%rdi), %ax
; BWOFF-NEXT: movw %ax, (%rsi)
; BWOFF-NEXT: retq
;
; CHECK-I686-LABEL: test_load_store:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-I686-NEXT: movw (%ecx), %cx
@@ -35,17 +35,17 @@ define void @test_load_store(half* %in, half* %out) #0 {
define i16 @test_bitcast_from_half(half* %addr) #0 {
; BWON-LABEL: test_bitcast_from_half:
-; BWON: # BB#0:
+; BWON: # %bb.0:
; BWON-NEXT: movzwl (%rdi), %eax
; BWON-NEXT: retq
;
; BWOFF-LABEL: test_bitcast_from_half:
-; BWOFF: # BB#0:
+; BWOFF: # %bb.0:
; BWOFF-NEXT: movw (%rdi), %ax
; BWOFF-NEXT: retq
;
; CHECK-I686-LABEL: test_bitcast_from_half:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movw (%eax), %ax
; CHECK-I686-NEXT: retl
@@ -56,12 +56,12 @@ define i16 @test_bitcast_from_half(half* %addr) #0 {
define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
; CHECK-LABEL: test_bitcast_to_half:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movw %si, (%rdi)
; CHECK-NEXT: retq
;
; CHECK-I686-LABEL: test_bitcast_to_half:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: movw {{[0-9]+}}(%esp), %ax
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-I686-NEXT: movw %ax, (%ecx)
@@ -73,19 +73,19 @@ define void @test_bitcast_to_half(half* %addr, i16 %in) #0 {
define float @test_extend32(half* %addr) #0 {
; CHECK-LIBCALL-LABEL: test_extend32:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee # TAILCALL
;
; BWON-F16C-LABEL: test_extend32:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl (%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_extend32:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $12, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movzwl (%eax), %eax
@@ -100,7 +100,7 @@ define float @test_extend32(half* %addr) #0 {
define double @test_extend64(half* %addr) #0 {
; CHECK-LIBCALL-LABEL: test_extend64:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
@@ -109,7 +109,7 @@ define double @test_extend64(half* %addr) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_extend64:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl (%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -117,7 +117,7 @@ define double @test_extend64(half* %addr) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_extend64:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $12, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movzwl (%eax), %eax
@@ -132,7 +132,7 @@ define double @test_extend64(half* %addr) #0 {
define void @test_trunc32(float %in, half* %addr) #0 {
; CHECK-LIBCALL-LABEL: test_trunc32:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
@@ -141,14 +141,14 @@ define void @test_trunc32(float %in, half* %addr) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_trunc32:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; BWON-F16C-NEXT: vmovd %xmm0, %eax
; BWON-F16C-NEXT: movw %ax, (%rdi)
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc32:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %esi
; CHECK-I686-NEXT: subl $8, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -166,7 +166,7 @@ define void @test_trunc32(float %in, half* %addr) #0 {
define void @test_trunc64(double %in, half* %addr) #0 {
; CHECK-LABEL: test_trunc64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: callq __truncdfhf2
@@ -175,7 +175,7 @@ define void @test_trunc64(double %in, half* %addr) #0 {
; CHECK-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc64:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %esi
; CHECK-I686-NEXT: subl $8, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -193,7 +193,7 @@ define void @test_trunc64(double %in, half* %addr) #0 {
define i64 @test_fptosi_i64(half* %p) #0 {
; CHECK-LIBCALL-LABEL: test_fptosi_i64:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
@@ -202,7 +202,7 @@ define i64 @test_fptosi_i64(half* %p) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_fptosi_i64:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl (%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -210,7 +210,7 @@ define i64 @test_fptosi_i64(half* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_fptosi_i64:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $12, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movzwl (%eax), %eax
@@ -227,7 +227,7 @@ define i64 @test_fptosi_i64(half* %p) #0 {
define void @test_sitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-LABEL: test_sitofp_i64:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
@@ -237,7 +237,7 @@ define void @test_sitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_sitofp_i64:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; BWON-F16C-NEXT: vmovd %xmm0, %eax
@@ -245,7 +245,7 @@ define void @test_sitofp_i64(i64 %a, half* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_sitofp_i64:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %esi
; CHECK-I686-NEXT: subl $24, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -267,7 +267,7 @@ define void @test_sitofp_i64(i64 %a, half* %p) #0 {
define i64 @test_fptoui_i64(half* %p) #0 {
; CHECK-LIBCALL-LABEL: test_fptoui_i64:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
@@ -284,7 +284,7 @@ define i64 @test_fptoui_i64(half* %p) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_fptoui_i64:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl (%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -299,7 +299,7 @@ define i64 @test_fptoui_i64(half* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_fptoui_i64:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $12, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movzwl (%eax), %eax
@@ -316,12 +316,12 @@ define i64 @test_fptoui_i64(half* %p) #0 {
define void @test_uitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-LABEL: test_uitofp_i64:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: movq %rsi, %rbx
; CHECK-LIBCALL-NEXT: testq %rdi, %rdi
; CHECK-LIBCALL-NEXT: js .LBB10_1
-; CHECK-LIBCALL-NEXT: # BB#2:
+; CHECK-LIBCALL-NEXT: # %bb.2:
; CHECK-LIBCALL-NEXT: cvtsi2ssq %rdi, %xmm0
; CHECK-LIBCALL-NEXT: jmp .LBB10_3
; CHECK-LIBCALL-NEXT: .LBB10_1:
@@ -338,10 +338,10 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_uitofp_i64:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: testq %rdi, %rdi
; BWON-F16C-NEXT: js .LBB10_1
-; BWON-F16C-NEXT: # BB#2:
+; BWON-F16C-NEXT: # %bb.2:
; BWON-F16C-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; BWON-F16C-NEXT: jmp .LBB10_3
; BWON-F16C-NEXT: .LBB10_1:
@@ -358,7 +358,7 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_uitofp_i64:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %esi
; CHECK-I686-NEXT: subl $24, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -382,7 +382,7 @@ define void @test_uitofp_i64(i64 %a, half* %p) #0 {
define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-LABEL: test_extend32_vec4:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: subq $48, %rsp
; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
@@ -402,13 +402,13 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; CHECK-LIBCALL-NEXT: unpcklps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-LIBCALL-NEXT: addq $48, %rsp
; CHECK-LIBCALL-NEXT: popq %rbx
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_extend32_vec4:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl 6(%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -427,7 +427,7 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_extend32_vec4:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %esi
; CHECK-I686-NEXT: subl $56, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -457,7 +457,7 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-I686-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; CHECK-I686-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-I686-NEXT: addl $56, %esp
; CHECK-I686-NEXT: popl %esi
; CHECK-I686-NEXT: retl
@@ -468,7 +468,7 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-LABEL: test_extend64_vec4:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: subq $16, %rsp
; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
@@ -487,20 +487,20 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm0 # 4-byte Reload
; CHECK-LIBCALL-NEXT: # xmm0 = mem[0],zero,zero,zero
; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm2
; CHECK-LIBCALL-NEXT: movss {{[0-9]+}}(%rsp), %xmm1 # 4-byte Reload
; CHECK-LIBCALL-NEXT: # xmm1 = mem[0],zero,zero,zero
; CHECK-LIBCALL-NEXT: cvtss2sd %xmm1, %xmm1
-; CHECK-LIBCALL-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-LIBCALL-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; CHECK-LIBCALL-NEXT: addq $16, %rsp
; CHECK-LIBCALL-NEXT: popq %rbx
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_extend64_vec4:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl (%rdi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -515,15 +515,15 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
; BWON-F16C-NEXT: vcvtph2ps %xmm3, %xmm3
; BWON-F16C-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; BWON-F16C-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_extend64_vec4:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %esi
; CHECK-I686-NEXT: subl $88, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -563,7 +563,7 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 {
define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
; BWON-NOF16C-LABEL: test_trunc32_vec4:
-; BWON-NOF16C: # BB#0:
+; BWON-NOF16C: # %bb.0:
; BWON-NOF16C-NEXT: pushq %rbp
; BWON-NOF16C-NEXT: pushq %r15
; BWON-NOF16C-NEXT: pushq %r14
@@ -596,7 +596,7 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
; BWON-NOF16C-NEXT: retq
;
; BWOFF-LABEL: test_trunc32_vec4:
-; BWOFF: # BB#0:
+; BWOFF: # %bb.0:
; BWOFF-NEXT: pushq %rbp
; BWOFF-NEXT: pushq %r15
; BWOFF-NEXT: pushq %r14
@@ -629,7 +629,7 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
; BWOFF-NEXT: retq
;
; BWON-F16C-LABEL: test_trunc32_vec4:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; BWON-F16C-NEXT: vmovd %xmm1, %eax
@@ -648,7 +648,7 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc32_vec4:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %ebp
; CHECK-I686-NEXT: pushl %ebx
; CHECK-I686-NEXT: pushl %edi
@@ -691,7 +691,7 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
; BWON-NOF16C-LABEL: test_trunc64_vec4:
-; BWON-NOF16C: # BB#0:
+; BWON-NOF16C: # %bb.0:
; BWON-NOF16C-NEXT: pushq %rbp
; BWON-NOF16C-NEXT: pushq %r15
; BWON-NOF16C-NEXT: pushq %r14
@@ -724,7 +724,7 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
; BWON-NOF16C-NEXT: retq
;
; BWOFF-LABEL: test_trunc64_vec4:
-; BWOFF: # BB#0:
+; BWOFF: # %bb.0:
; BWOFF-NEXT: pushq %rbp
; BWOFF-NEXT: pushq %r15
; BWOFF-NEXT: pushq %r14
@@ -757,7 +757,7 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
; BWOFF-NEXT: retq
;
; BWON-F16C-LABEL: test_trunc64_vec4:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: pushq %rbp
; BWON-F16C-NEXT: pushq %r15
; BWON-F16C-NEXT: pushq %r14
@@ -777,7 +777,7 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
; BWON-F16C-NEXT: callq __truncdfhf2
; BWON-F16C-NEXT: movl %eax, %r15d
; BWON-F16C-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; BWON-F16C-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; BWON-F16C-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; BWON-F16C-NEXT: vzeroupper
; BWON-F16C-NEXT: callq __truncdfhf2
; BWON-F16C-NEXT: movl %eax, %ebp
@@ -795,7 +795,7 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_trunc64_vec4:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: pushl %ebp
; CHECK-I686-NEXT: pushl %ebx
; CHECK-I686-NEXT: pushl %edi
@@ -840,7 +840,7 @@ declare float @test_floatret();
; fp_round and the subsequent fptrunc from float to half.
define half @test_f80trunc_nodagcombine() #0 {
; CHECK-LIBCALL-LABEL: test_f80trunc_nodagcombine:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rax
; CHECK-LIBCALL-NEXT: callq test_floatret
; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee
@@ -850,7 +850,7 @@ define half @test_f80trunc_nodagcombine() #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_f80trunc_nodagcombine:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: pushq %rax
; BWON-F16C-NEXT: callq test_floatret
; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -859,7 +859,7 @@ define half @test_f80trunc_nodagcombine() #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_f80trunc_nodagcombine:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $12, %esp
; CHECK-I686-NEXT: calll test_floatret
; CHECK-I686-NEXT: fstps (%esp)
@@ -879,7 +879,7 @@ define half @test_f80trunc_nodagcombine() #0 {
define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32:
-; CHECK-LIBCALL: # BB#0:
+; CHECK-LIBCALL: # %bb.0:
; CHECK-LIBCALL-NEXT: pushq %rbx
; CHECK-LIBCALL-NEXT: subq $16, %rsp
; CHECK-LIBCALL-NEXT: movl %edi, %ebx
@@ -896,7 +896,7 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
; CHECK-LIBCALL-NEXT: retq
;
; BWON-F16C-LABEL: test_sitofp_fadd_i32:
-; BWON-F16C: # BB#0:
+; BWON-F16C: # %bb.0:
; BWON-F16C-NEXT: movswl (%rsi), %eax
; BWON-F16C-NEXT: vmovd %eax, %xmm0
; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
@@ -907,7 +907,7 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
; BWON-F16C-NEXT: retq
;
; CHECK-I686-LABEL: test_sitofp_fadd_i32:
-; CHECK-I686: # BB#0:
+; CHECK-I686: # %bb.0:
; CHECK-I686-NEXT: subl $28, %esp
; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-I686-NEXT: movzwl (%eax), %eax
diff --git a/test/CodeGen/X86/handle-move.ll b/test/CodeGen/X86/handle-move.ll
index ba96275569b3..0a43ef3fc22d 100644
--- a/test/CodeGen/X86/handle-move.ll
+++ b/test/CodeGen/X86/handle-move.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s
-; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s
+; RUN: llc -mtriple=x86_64-- -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s
+; RUN: llc -mtriple=x86_64-- -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s
; REQUIRES: asserts
;
; Test the LiveIntervals::handleMove() function.
;
; Moving the DIV32r instruction exercises the regunit update code because
-; %EDX has a live range into the function and is used by the DIV32r.
+; %edx has a live range into the function and is used by the DIV32r.
;
; Here sinking a kill + dead def:
-; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def>, %EDX<imp-def,dead>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
-; %vreg4: [48r,144r:0) 0@48r
+; 144B -> 180B: DIV32r %4, implicit-def %eax, implicit dead %edx, implicit dead %EFLAGS, implicit killed %eax, implicit %edx
+; %4: [48r,144r:0) 0@48r
; --> [48r,180r:0) 0@48r
; DH: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r
; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r
@@ -25,8 +25,8 @@ entry:
}
; Same as above, but moving a kill + live def:
-; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def,dead>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
-; %vreg4: [48r,144r:0) 0@48r
+; 144B -> 180B: DIV32r %4, implicit dead %eax, implicit-def %edx, implicit dead %EFLAGS, implicit killed %eax, implicit %edx
+; %4: [48r,144r:0) 0@48r
; --> [48r,180r:0) 0@48r
; DH: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r
; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r
@@ -41,13 +41,13 @@ entry:
ret i32 %add
}
-; Moving a use below the existing kill (%vreg5):
-; Moving a tied virtual register def (%vreg11):
+; Moving a use below the existing kill (%5):
+; Moving a tied virtual register def (%11):
;
-; 96B -> 120B: %vreg11<def,tied1> = SUB32rr %vreg11<tied0>, %vreg5
-; %vreg11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r
+; 96B -> 120B: %11<def,tied1> = SUB32rr %11<tied0>, %5
+; %11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r
; --> [80r,120r:1)[120r,144r:0) 0@120r 1@80r
-; %vreg5: [16r,112r:0) 0@16r
+; %5: [16r,112r:0) 0@16r
; --> [16r,120r:0) 0@16r
;
define i32 @f3(i32 %a, i32 %b) nounwind uwtable readnone ssp {
@@ -59,7 +59,7 @@ entry:
}
; Move EFLAGS dead def across another def:
-; handleMove 208B -> 36B: %EDX<def> = MOV32r0 %EFLAGS<imp-def,dead>
+; handleMove 208B -> 36B: %edx = MOV32r0 implicit dead %EFLAGS
; EFLAGS: [20r,20d:4)[160r,160d:3)[208r,208d:0)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@208r 1@224r 2@272r 3@160r 4@20r 5@304r
; --> [20r,20d:4)[36r,36d:0)[160r,160d:3)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@36r 1@224r 2@272r 3@160r 4@20r 5@304r
;
diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll
index e7929c9cecdc..a8b4e0a25f9e 100644
--- a/test/CodeGen/X86/hoist-invariant-load.ll
+++ b/test/CodeGen/X86/hoist-invariant-load.ll
@@ -1,10 +1,12 @@
; REQUIRES: asserts
-; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machinelicm.*hoisted"
+; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "7 machinelicm.*hoisted"
; For test:
; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_
; and 1 for objc_msgSend from the GOT
; For test_multi_def:
; 2 invariant loads (full multiply, both loads should be hoisted).
+; For test_div_def:
+; 2 invariant loads (full divide, both loads should be hoisted), plus 1 additional instruction for zeroing %edx that gets hoisted and then rematerialized.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7.2"
@@ -60,4 +62,30 @@ exit:
ret void
}
+define void @test_div_def(i32* dereferenceable(8) %x1,
+ i32* dereferenceable(8) %x2,
+ i32* %y, i32 %count) nounwind {
+entry:
+ br label %for.body
+
+for.check:
+ %inc = add nsw i32 %i, 1
+ %done = icmp sge i32 %inc, %count
+ br i1 %done, label %exit, label %for.body
+
+for.body:
+ %i = phi i32 [ 0, %entry ], [ %inc, %for.check ]
+ %x1_load = load i32, i32* %x1, align 8, !invariant.load !0
+ %x2_load = load i32, i32* %x2, align 8, !invariant.load !0
+ %x_quot = udiv i32 %x1_load, %x2_load
+ %y_elem = getelementptr inbounds i32, i32* %y, i32 %i
+ %y_load = load i32, i32* %y_elem, align 8
+ %y_plus = add i32 %x_quot, %y_load
+ store i32 %y_plus, i32* %y_elem, align 8
+ br label %for.check
+
+exit:
+ ret void
+}
+
!0 = !{}
diff --git a/test/CodeGen/X86/hoist-spill.ll b/test/CodeGen/X86/hoist-spill.ll
index afabf96b12a3..03f558fc3ae2 100644
--- a/test/CodeGen/X86/hoist-spill.ll
+++ b/test/CodeGen/X86/hoist-spill.ll
@@ -3,10 +3,8 @@
; Check no spills to the same stack slot after hoisting.
; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
-; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
-; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
new file mode 100644
index 000000000000..a54e01d9af67
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -0,0 +1,1940 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp sgt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
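+; <4 x i64> signed-max reduction: each step shuffles the upper half down and keeps the
+; larger elements via icmp sgt + select, halving the width until lane 0 holds the max.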
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp sgt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
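+; <8 x i32> signed-max reduction: three halve-and-max steps (element offsets 4, 2, 1).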
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
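+; <16 x i16> signed-max reduction: four halve-and-max steps; SSE4.1+ targets instead XOR
+; with 32767 and use PHMINPOSUW, which maps the signed max onto an unsigned min.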
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
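+; <32 x i8> signed-max reduction: the upper 128-bit lane is folded in first, followed by
+; four more halve-and-max steps down to a single byte.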
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
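+; <8 x i64> signed-max reduction: three halve-and-max steps (element offsets 4, 2, 1);
+; targets without PCMPGTQ synthesize the 64-bit compare from PCMPGTD/PCMPEQD.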
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
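+; <16 x i32> signed-max reduction: the two 256-bit halves are combined first, then the
+; usual halve-and-max steps reduce down to a single i32.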
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
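+; <32 x i16> signed-max reduction: combine the 256-bit halves, then halve-and-max down to
+; one i16 (or XOR with 32767 and use PHMINPOSUW on SSE4.1+).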
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxsw %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxsw %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
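+; <64 x i8> signed-max reduction: the same halve-and-max pattern applied to a 512-bit input.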
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm5
+; X86-SSE2-NEXT: por %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm4
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm4, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm3, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm1, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm4
+; X64-SSE2-NEXT: pandn %xmm5, %xmm0
+; X64-SSE2-NEXT: por %xmm4, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp sgt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp sgt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp sgt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp sgt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp sgt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp sgt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll
new file mode 100644
index 000000000000..f03e745598e6
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -0,0 +1,1942 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp slt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp slt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
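+; The 512-bit inputs arrive as four xmm registers (SSE), two ymm registers
+; (AVX/AVX2) or a single zmm register (AVX512); each test combines the upper
+; and lower halves first and then reduces the surviving 128-bit vector lane
+; by lane.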
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminsd %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsd %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
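+; i16 reductions can use PHMINPOSUW. There is no signed horizontal-minimum
+; instruction, so the SSE4.2 and AVX runs are expected to flip the sign bit
+; (pxor with the 0x8000 splat), take the unsigned horizontal minimum, and
+; flip the result back before returning it.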
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminsw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminsw %xmm0, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminsw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsw %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm4
+; X86-SSE2-NEXT: por %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm4, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: pand %xmm4, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm4
+; X64-SSE2-NEXT: por %xmm1, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT: pand %xmm0, %xmm5
+; X64-SSE2-NEXT: pandn %xmm4, %xmm0
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm2
+; X64-SSE2-NEXT: por %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminsb %xmm3, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp slt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp slt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp slt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp slt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp slt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp slt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll
new file mode 100644
index 000000000000..52e623b82718
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -0,0 +1,2161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
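+; Each test below reduces a vector to its maximum unsigned element: the upper
+; half is shuffled down with a shufflevector, compared with icmp ugt and
+; merged with a select, halving the problem until the result sits in element
+; 0, where it is read out with extractelement.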
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ugt <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
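+; Horizontal unsigned-max reduction of <16 x i8>: four shuffle + icmp ugt + select
+; halving steps leave the maximum in lane 0, which is extracted as the i8 result.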
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
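+; Horizontal unsigned-max reduction of <4 x i64>: two shuffle + icmp ugt + select
+; halving steps, then lane 0 is extracted as the i64 result.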
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ugt <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
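+; Horizontal unsigned-max reduction of <8 x i32>: three shuffle + icmp ugt + select
+; halving steps, then lane 0 is extracted as the i32 result.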
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movd %xmm3, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
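+; Horizontal unsigned-max reduction of <16 x i16>: four shuffle + icmp ugt + select
+; halving steps, then lane 0 is extracted as the i16 result.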
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm4, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
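+; Horizontal unsigned-max reduction of <32 x i8>: five shuffle + icmp ugt + select
+; halving steps, then lane 0 is extracted as the i8 result.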
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
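+; Horizontal unsigned-max reduction of <8 x i64>: three shuffle + icmp ugt + select
+; halving steps, then lane 0 is extracted as the i64 result.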
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm6
+; X86-SSE2-NEXT: pandn %xmm5, %xmm0
+; X86-SSE2-NEXT: por %xmm6, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X86-SSE42-NEXT: pxor %xmm6, %xmm5
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X86-SSE42-NEXT: pxor %xmm6, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movapd %xmm3, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X86-SSE42-NEXT: movapd %xmm2, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm6, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm9, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm6
+; X64-SSE2-NEXT: por %xmm0, %xmm6
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm8
+; X64-SSE2-NEXT: por %xmm1, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm6
+; X64-SSE2-NEXT: pandn %xmm8, %xmm1
+; X64-SSE2-NEXT: por %xmm6, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE42-NEXT: pxor %xmm6, %xmm5
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE42-NEXT: pxor %xmm6, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movapd %xmm3, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm1
+; X64-SSE42-NEXT: movapd %xmm2, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm6, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm6, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
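+; Horizontal unsigned-max reduction of <16 x i32>: four shuffle + icmp ugt + select
+; halving steps, then lane 0 is extracted as the i32 result.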
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm4, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm7
+; X86-SSE2-NEXT: por %xmm0, %xmm7
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm7
+; X86-SSE2-NEXT: pandn %xmm6, %xmm1
+; X86-SSE2-NEXT: por %xmm7, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm5, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm7
+; X64-SSE2-NEXT: por %xmm0, %xmm7
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm7
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm7, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ugt <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ugt <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ugt <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ugt <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ugt <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ugt <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll
new file mode 100644
index 000000000000..505663656a3a
--- /dev/null
+++ b/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -0,0 +1,2111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm3
+; X86-SSE2-NEXT: por %xmm0, %xmm3
+; X86-SSE2-NEXT: movd %xmm3, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X86-SSE42-NEXT: pxor %xmm0, %xmm3
+; X86-SSE42-NEXT: pxor %xmm2, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm3
+; X64-SSE2-NEXT: por %xmm0, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm3
+; X64-SSE42-NEXT: pxor %xmm0, %xmm3
+; X64-SSE42-NEXT: pxor %xmm2, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+ %2 = icmp ult <2 x i64> %a0, %1
+ %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+ %4 = extractelement <2 x i64> %3, i32 0
+ ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i32> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i32> %3, %4
+ %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+ %7 = extractelement <4 x i32> %6, i32 0
+ ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm0, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm1
+; X86-SSE2-NEXT: por %xmm3, %xmm1
+; X86-SSE2-NEXT: movd %xmm1, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm1, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm1, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm1
+; X64-SSE2-NEXT: por %xmm3, %xmm1
+; X64-SSE2-NEXT: movd %xmm1, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT: vmovd %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i16> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+ %4 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i16> %3, %4
+ %6 = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+ %7 = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i16> %6, %7
+ %9 = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+ %10 = extractelement <8 x i16> %9, i32 0
+ ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX: ## %bb.0:
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX: ## %bb.0:
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i8> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+ %4 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i8> %3, %4
+ %6 = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+ %7 = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i8> %6, %7
+ %9 = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+ %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i8> %9, %10
+ %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+ %13 = extractelement <16 x i8> %12, i32 0
+ ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm6, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X86-SSE42-NEXT: pxor %xmm3, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: pxor %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm2, %xmm3
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT: movd %xmm2, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X86-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm6, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movq %xmm2, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm4
+; X64-SSE42-NEXT: pxor %xmm3, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: pxor %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm2, %xmm3
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT: movq %xmm2, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X64-AVX1-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm2
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %2 = icmp ult <4 x i64> %a0, %1
+ %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+ %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <4 x i64> %3, %4
+ %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+ %7 = extractelement <4 x i64> %6, i32 0
+ ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i32> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+ %4 = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i32> %3, %4
+ %6 = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+ %7 = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i32> %6, %7
+ %9 = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+ %10 = extractelement <8 x i32> %9, i32 0
+ ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm4, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm0
+; X64-SSE2-NEXT: pandn %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm0, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm4, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT: pxor %xmm2, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm4, %xmm2
+; X64-SSE2-NEXT: movd %xmm2, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i16> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+ %4 = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i16> %3, %4
+ %6 = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+ %7 = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i16> %6, %7
+ %9 = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+ %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i16> %9, %10
+ %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+ %13 = extractelement <16 x i16> %12, i32 0
+ ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i8> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+ %4 = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i8> %3, %4
+ %6 = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+ %7 = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i8> %6, %7
+ %9 = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+ %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i8> %9, %10
+ %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+ %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i8> %12, %13
+ %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+ %16 = extractelement <32 x i8> %15, i32 0
+ ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
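+; NOTE: Each test below reduces a 512-bit vector to its unsigned minimum by
+; repeatedly halving the vector with a shufflevector of the upper lanes,
+; keeping the smaller lanes via 'icmp ult' + 'select', and extracting
+; element 0 at the end. Targets lacking a native unsigned min (e.g. SSE2
+; for i16/i32/i64) synthesize the comparison by flipping the sign bit with
+; PXOR before a signed PCMPGT, then blending with PAND/PANDN/POR.
+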
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: subl $28, %esp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm5, %xmm6
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT: pand %xmm6, %xmm7
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm7, %xmm6
+; X86-SSE2-NEXT: pand %xmm6, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm6
+; X86-SSE2-NEXT: por %xmm1, %xmm6
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT: por %xmm2, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: pand %xmm0, %xmm5
+; X86-SSE2-NEXT: pandn %xmm6, %xmm0
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm1, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT: pand %xmm2, %xmm4
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm0
+; X86-SSE2-NEXT: pandn %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm0, %xmm2
+; X86-SSE2-NEXT: movd %xmm2, %eax
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %edx
+; X86-SSE2-NEXT: addl $28, %esp
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT: pxor %xmm4, %xmm6
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X86-SSE42-NEXT: pxor %xmm4, %xmm7
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT: movapd %xmm2, %xmm1
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X86-SSE42-NEXT: movapd %xmm3, %xmm0
+; X86-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT: pxor %xmm4, %xmm0
+; X86-SSE42-NEXT: pxor %xmm1, %xmm4
+; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X86-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT: movd %xmm1, %eax
+; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm9, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm6
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm9, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm7, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm6
+; X64-SSE2-NEXT: por %xmm1, %xmm6
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm9, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT: pxor %xmm9, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm6, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm9, %xmm2
+; X64-SSE2-NEXT: pxor %xmm0, %xmm9
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT: pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT: por %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: movq %xmm3, %rax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm2, %xmm6
+; X64-SSE42-NEXT: pxor %xmm4, %xmm6
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm7
+; X64-SSE42-NEXT: pxor %xmm4, %xmm7
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT: movdqa %xmm6, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2
+; X64-SSE42-NEXT: movapd %xmm2, %xmm1
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm1
+; X64-SSE42-NEXT: movapd %xmm3, %xmm0
+; X64-SSE42-NEXT: xorpd %xmm4, %xmm0
+; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT: movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT: pxor %xmm4, %xmm0
+; X64-SSE42-NEXT: pxor %xmm1, %xmm4
+; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4
+; X64-SSE42-NEXT: movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT: vmovq %xmm0, %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vmovq %xmm0, %rax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <8 x i64> %a0, %1
+ %3 = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+ %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <8 x i64> %3, %4
+ %6 = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+ %7 = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <8 x i64> %6, %7
+ %9 = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+ %10 = extractelement <8 x i64> %9, i32 0
+ ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm3, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminud %xmm3, %xmm1
+; X86-SSE42-NEXT: pminud %xmm2, %xmm0
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminud %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminud %xmm1, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminud %xmm3, %xmm1
+; X64-SSE42-NEXT: pminud %xmm2, %xmm0
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminud %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminud %xmm1, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <16 x i32> %a0, %1
+ %3 = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+ %4 = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <16 x i32> %3, %4
+ %6 = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+ %7 = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <16 x i32> %6, %7
+ %9 = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+ %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <16 x i32> %9, %10
+ %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+ %13 = extractelement <16 x i32> %12, i32 0
+ ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm4, %xmm6
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pxor %xmm4, %xmm7
+; X86-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm1
+; X86-SSE2-NEXT: pandn %xmm3, %xmm7
+; X86-SSE2-NEXT: por %xmm1, %xmm7
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pandn %xmm2, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm5
+; X86-SSE2-NEXT: pandn %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT: pxor %xmm4, %xmm3
+; X86-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT: pand %xmm3, %xmm1
+; X86-SSE2-NEXT: pandn %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm1, %xmm3
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm4, %xmm2
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: pandn %xmm0, %xmm2
+; X86-SSE2-NEXT: por %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: psrld $16, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT: pxor %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm4
+; X86-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pandn %xmm0, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movd %xmm4, %eax
+; X86-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X86-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X86-SSE42-NEXT: movd %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovd %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT: vmovd %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT: pxor %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm4, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pcmpgtw %xmm6, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm1
+; X64-SSE2-NEXT: pandn %xmm3, %xmm7
+; X64-SSE2-NEXT: por %xmm1, %xmm7
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pandn %xmm2, %xmm5
+; X64-SSE2-NEXT: por %xmm0, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT: pand %xmm1, %xmm5
+; X64-SSE2-NEXT: pandn %xmm7, %xmm1
+; X64-SSE2-NEXT: por %xmm5, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT: pand %xmm3, %xmm1
+; X64-SSE2-NEXT: pandn %xmm0, %xmm3
+; X64-SSE2-NEXT: por %xmm1, %xmm3
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT: pxor %xmm4, %xmm2
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT: pand %xmm2, %xmm3
+; X64-SSE2-NEXT: pandn %xmm0, %xmm2
+; X64-SSE2-NEXT: por %xmm3, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT: psrld $16, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT: pxor %xmm4, %xmm1
+; X64-SSE2-NEXT: pxor %xmm0, %xmm4
+; X64-SSE2-NEXT: pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT: pand %xmm4, %xmm2
+; X64-SSE2-NEXT: pandn %xmm0, %xmm4
+; X64-SSE2-NEXT: por %xmm2, %xmm4
+; X64-SSE2-NEXT: movd %xmm4, %eax
+; X64-SSE2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminuw %xmm3, %xmm1
+; X64-SSE42-NEXT: pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT: pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0
+; X64-SSE42-NEXT: movd %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT: vmovd %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT: vmovd %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovd %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <32 x i16> %a0, %1
+ %3 = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+ %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <32 x i16> %3, %4
+ %6 = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+ %7 = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <32 x i16> %6, %7
+ %9 = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+ %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <32 x i16> %9, %10
+ %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+ %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <32 x i16> %12, %13
+ %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+ %16 = extractelement <32 x i16> %15, i32 0
+ ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2: ## %bb.0:
+; X86-SSE2-NEXT: pminub %xmm3, %xmm1
+; X86-SSE2-NEXT: pminub %xmm2, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: psrld $16, %xmm1
+; X86-SSE2-NEXT: pminub %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: psrlw $8, %xmm0
+; X86-SSE2-NEXT: pminub %xmm1, %xmm0
+; X86-SSE2-NEXT: movd %xmm0, %eax
+; X86-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42: ## %bb.0:
+; X86-SSE42-NEXT: pminub %xmm3, %xmm1
+; X86-SSE42-NEXT: pminub %xmm2, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT: psrld $16, %xmm1
+; X86-SSE42-NEXT: pminub %xmm0, %xmm1
+; X86-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT: psrlw $8, %xmm0
+; X86-SSE42-NEXT: pminub %xmm1, %xmm0
+; X86-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X86-SSE42-NEXT: retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1: ## %bb.0:
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2: ## %bb.0:
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X86-AVX2-NEXT: vzeroupper
+; X86-AVX2-NEXT: retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2: ## %bb.0:
+; X64-SSE2-NEXT: pminub %xmm3, %xmm1
+; X64-SSE2-NEXT: pminub %xmm2, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT: psrld $16, %xmm1
+; X64-SSE2-NEXT: pminub %xmm0, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT: psrlw $8, %xmm0
+; X64-SSE2-NEXT: pminub %xmm1, %xmm0
+; X64-SSE2-NEXT: movd %xmm0, %eax
+; X64-SSE2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42: ## %bb.0:
+; X64-SSE42-NEXT: pminub %xmm3, %xmm1
+; X64-SSE42-NEXT: pminub %xmm2, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT: psrld $16, %xmm1
+; X64-SSE42-NEXT: pminub %xmm0, %xmm1
+; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT: psrlw $8, %xmm0
+; X64-SSE42-NEXT: pminub %xmm1, %xmm0
+; X64-SSE42-NEXT: pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT: ## kill: def %al killed %al killed %eax
+; X64-SSE42-NEXT: retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1: ## %bb.0:
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2: ## %bb.0:
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512: ## %bb.0:
+; X64-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT: ## kill: def %al killed %al killed %eax
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %2 = icmp ult <64 x i8> %a0, %1
+ %3 = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+ %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %5 = icmp ult <64 x i8> %3, %4
+ %6 = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+ %7 = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %8 = icmp ult <64 x i8> %6, %7
+ %9 = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+ %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %11 = icmp ult <64 x i8> %9, %10
+ %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+ %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %14 = icmp ult <64 x i8> %12, %13
+ %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+ %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %17 = icmp ult <64 x i8> %15, %16
+ %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+ %19 = extractelement <64 x i8> %18, i32 0
+ ret i8 %19
+}
diff --git a/test/CodeGen/X86/horizontal-shuffle.ll b/test/CodeGen/X86/horizontal-shuffle.ll
index def614150cd4..70fc7fa4a1d7 100644
--- a/test/CodeGen/X86/horizontal-shuffle.ll
+++ b/test/CodeGen/X86/horizontal-shuffle.ll
@@ -8,17 +8,13 @@
define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_128:
-; X32: ## BB#0:
-; X32-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT: vhaddps %xmm3, %xmm2, %xmm1
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: ## %bb.0:
+; X32-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhadd_128:
-; X64: ## BB#0:
-; X64-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddps %xmm3, %xmm2, %xmm1
-; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: ## %bb.0:
+; X64-NEXT: vhaddps %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
%2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a2, <4 x float> %a3)
@@ -28,17 +24,13 @@ define <4 x float> @test_unpackl_fhadd_128(<4 x float> %a0, <4 x float> %a1, <4
define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_128:
-; X32: ## BB#0:
-; X32-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; X32-NEXT: vhaddpd %xmm3, %xmm2, %xmm1
-; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32: ## %bb.0:
+; X32-NEXT: vhaddpd %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhadd_128:
-; X64: ## BB#0:
-; X64-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm3, %xmm2, %xmm1
-; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64: ## %bb.0:
+; X64-NEXT: vhaddpd %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
%2 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a2, <2 x double> %a3)
@@ -48,17 +40,13 @@ define <2 x double> @test_unpackh_fhadd_128(<2 x double> %a0, <2 x double> %a1,
define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_128:
-; X32: ## BB#0:
-; X32-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
-; X32-NEXT: vhsubpd %xmm3, %xmm2, %xmm1
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: ## %bb.0:
+; X32-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhsub_128:
-; X64: ## BB#0:
-; X64-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhsubpd %xmm3, %xmm2, %xmm1
-; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: ## %bb.0:
+; X64-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
%2 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a2, <2 x double> %a3)
@@ -68,17 +56,13 @@ define <2 x double> @test_unpackl_fhsub_128(<2 x double> %a0, <2 x double> %a1,
define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_128:
-; X32: ## BB#0:
-; X32-NEXT: vhsubps %xmm1, %xmm0, %xmm0
-; X32-NEXT: vhsubps %xmm3, %xmm2, %xmm1
-; X32-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32: ## %bb.0:
+; X32-NEXT: vhsubps %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhsub_128:
-; X64: ## BB#0:
-; X64-NEXT: vhsubps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhsubps %xmm3, %xmm2, %xmm1
-; X64-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64: ## %bb.0:
+; X64-NEXT: vhsubps %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
%2 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a2, <4 x float> %a3)
@@ -88,17 +72,13 @@ define <4 x float> @test_unpackh_fhsub_128(<4 x float> %a0, <4 x float> %a1, <4
define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_128:
-; X32: ## BB#0:
-; X32-NEXT: vphaddw %xmm1, %xmm0, %xmm0
-; X32-NEXT: vphaddw %xmm3, %xmm2, %xmm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: ## %bb.0:
+; X32-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hadd_128:
-; X64: ## BB#0:
-; X64-NEXT: vphaddw %xmm1, %xmm0, %xmm0
-; X64-NEXT: vphaddw %xmm3, %xmm2, %xmm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: ## %bb.0:
+; X64-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a2, <8 x i16> %a3)
@@ -108,17 +88,13 @@ define <8 x i16> @test_unpackl_hadd_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>
define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_128:
-; X32: ## BB#0:
-; X32-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; X32-NEXT: vphaddd %xmm3, %xmm2, %xmm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32: ## %bb.0:
+; X32-NEXT: vphaddd %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hadd_128:
-; X64: ## BB#0:
-; X64-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vphaddd %xmm3, %xmm2, %xmm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64: ## %bb.0:
+; X64-NEXT: vphaddd %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
%1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a2, <4 x i32> %a3)
@@ -128,17 +104,13 @@ define <4 x i32> @test_unpackh_hadd_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>
define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_128:
-; X32: ## BB#0:
-; X32-NEXT: vphsubd %xmm1, %xmm0, %xmm0
-; X32-NEXT: vphsubd %xmm3, %xmm2, %xmm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: ## %bb.0:
+; X32-NEXT: vphsubd %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hsub_128:
-; X64: ## BB#0:
-; X64-NEXT: vphsubd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vphsubd %xmm3, %xmm2, %xmm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: ## %bb.0:
+; X64-NEXT: vphsubd %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a2, <4 x i32> %a3)
@@ -148,17 +120,13 @@ define <4 x i32> @test_unpackl_hsub_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>
define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_128:
-; X32: ## BB#0:
-; X32-NEXT: vphsubw %xmm1, %xmm0, %xmm0
-; X32-NEXT: vphsubw %xmm3, %xmm2, %xmm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32: ## %bb.0:
+; X32-NEXT: vphsubw %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hsub_128:
-; X64: ## BB#0:
-; X64-NEXT: vphsubw %xmm1, %xmm0, %xmm0
-; X64-NEXT: vphsubw %xmm3, %xmm2, %xmm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64: ## %bb.0:
+; X64-NEXT: vphsubw %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a2, <8 x i16> %a3)
@@ -168,17 +136,13 @@ define <8 x i16> @test_unpackh_hsub_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>
define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_128:
-; X32: ## BB#0:
-; X32-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X32-NEXT: vpacksswb %xmm3, %xmm2, %xmm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: ## %bb.0:
+; X32-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packss_128:
-; X64: ## BB#0:
-; X64-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpacksswb %xmm3, %xmm2, %xmm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: ## %bb.0:
+; X64-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a2, <8 x i16> %a3)
@@ -188,17 +152,13 @@ define <16 x i8> @test_unpackl_packss_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_128:
-; X32: ## BB#0:
-; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32: ## %bb.0:
+; X32-NEXT: vpackssdw %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packss_128:
-; X64: ## BB#0:
-; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64: ## %bb.0:
+; X64-NEXT: vpackssdw %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a2, <4 x i32> %a3)
@@ -208,12 +168,12 @@ define <8 x i16> @test_unpackh_packss_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32
define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_128:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packus_128:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
@@ -224,12 +184,12 @@ define <8 x i16> @test_unpackl_packus_128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32
define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_128:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: vpackuswb %xmm3, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packus_128:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vpackuswb %xmm3, %xmm1, %xmm0
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
@@ -244,17 +204,13 @@ define <16 x i8> @test_unpackh_packus_128(<8 x i16> %a0, <8 x i16> %a1, <8 x i16
define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackl_fhadd_256:
-; X32: ## BB#0:
-; X32-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; X32-NEXT: vhaddps %ymm3, %ymm2, %ymm1
-; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32: ## %bb.0:
+; X32-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhadd_256:
-; X64: ## BB#0:
-; X64-NEXT: vhaddps %ymm1, %ymm0, %ymm0
-; X64-NEXT: vhaddps %ymm3, %ymm2, %ymm1
-; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64: ## %bb.0:
+; X64-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3)
@@ -264,17 +220,13 @@ define <8 x float> @test_unpackl_fhadd_256(<8 x float> %a0, <8 x float> %a1, <8
define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackh_fhadd_256:
-; X32: ## BB#0:
-; X32-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vhaddpd %ymm3, %ymm2, %ymm1
-; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32: ## %bb.0:
+; X32-NEXT: vhaddpd %ymm3, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhadd_256:
-; X64: ## BB#0:
-; X64-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vhaddpd %ymm3, %ymm2, %ymm1
-; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64: ## %bb.0:
+; X64-NEXT: vhaddpd %ymm3, %ymm1, %ymm0
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a2, <4 x double> %a3)
@@ -284,17 +236,13 @@ define <4 x double> @test_unpackh_fhadd_256(<4 x double> %a0, <4 x double> %a1,
define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> %a3) {
; X32-LABEL: test_unpackl_fhsub_256:
-; X32: ## BB#0:
-; X32-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vhsubpd %ymm3, %ymm2, %ymm1
-; X32-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32: ## %bb.0:
+; X32-NEXT: vhsubpd %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_fhsub_256:
-; X64: ## BB#0:
-; X64-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vhsubpd %ymm3, %ymm2, %ymm1
-; X64-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64: ## %bb.0:
+; X64-NEXT: vhsubpd %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
%2 = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a2, <4 x double> %a3)
@@ -304,17 +252,13 @@ define <4 x double> @test_unpackl_fhsub_256(<4 x double> %a0, <4 x double> %a1,
define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> %a3) {
; X32-LABEL: test_unpackh_fhsub_256:
-; X32: ## BB#0:
-; X32-NEXT: vhsubps %ymm1, %ymm0, %ymm0
-; X32-NEXT: vhsubps %ymm3, %ymm2, %ymm1
-; X32-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32: ## %bb.0:
+; X32-NEXT: vhsubps %ymm3, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_fhsub_256:
-; X64: ## BB#0:
-; X64-NEXT: vhsubps %ymm1, %ymm0, %ymm0
-; X64-NEXT: vhsubps %ymm3, %ymm2, %ymm1
-; X64-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64: ## %bb.0:
+; X64-NEXT: vhsubps %ymm3, %ymm1, %ymm0
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
%2 = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a2, <8 x float> %a3)
@@ -324,17 +268,13 @@ define <8 x float> @test_unpackh_fhsub_256(<8 x float> %a0, <8 x float> %a1, <8
define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_hadd_256:
-; X32: ## BB#0:
-; X32-NEXT: vphaddw %ymm1, %ymm0, %ymm0
-; X32-NEXT: vphaddw %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32: ## %bb.0:
+; X32-NEXT: vphaddw %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hadd_256:
-; X64: ## BB#0:
-; X64-NEXT: vphaddw %ymm1, %ymm0, %ymm0
-; X64-NEXT: vphaddw %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64: ## %bb.0:
+; X64-NEXT: vphaddw %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1)
%2 = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a2, <16 x i16> %a3)
@@ -344,17 +284,13 @@ define <16 x i16> @test_unpackl_hadd_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i
define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_hadd_256:
-; X32: ## BB#0:
-; X32-NEXT: vphaddd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vphaddd %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32: ## %bb.0:
+; X32-NEXT: vphaddd %ymm3, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hadd_256:
-; X64: ## BB#0:
-; X64-NEXT: vphaddd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vphaddd %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64: ## %bb.0:
+; X64-NEXT: vphaddd %ymm3, %ymm1, %ymm0
; X64-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1)
%2 = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a2, <8 x i32> %a3)
@@ -364,17 +300,13 @@ define <8 x i32> @test_unpackh_hadd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>
define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_hsub_256:
-; X32: ## BB#0:
-; X32-NEXT: vphsubd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vphsubd %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32: ## %bb.0:
+; X32-NEXT: vphsubd %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_hsub_256:
-; X64: ## BB#0:
-; X64-NEXT: vphsubd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vphsubd %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64: ## %bb.0:
+; X64-NEXT: vphsubd %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1)
%2 = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a2, <8 x i32> %a3)
@@ -384,17 +316,13 @@ define <8 x i32> @test_unpackl_hsub_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>
define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_hsub_256:
-; X32: ## BB#0:
-; X32-NEXT: vphsubw %ymm1, %ymm0, %ymm0
-; X32-NEXT: vphsubw %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32: ## %bb.0:
+; X32-NEXT: vphsubw %ymm3, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_hsub_256:
-; X64: ## BB#0:
-; X64-NEXT: vphsubw %ymm1, %ymm0, %ymm0
-; X64-NEXT: vphsubw %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64: ## %bb.0:
+; X64-NEXT: vphsubw %ymm3, %ymm1, %ymm0
; X64-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1)
%2 = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a2, <16 x i16> %a3)
@@ -404,17 +332,13 @@ define <16 x i16> @test_unpackh_hsub_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i
define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackl_packss_256:
-; X32: ## BB#0:
-; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpacksswb %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32: ## %bb.0:
+; X32-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packss_256:
-; X64: ## BB#0:
-; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpacksswb %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64: ## %bb.0:
+; X64-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
%2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
@@ -424,17 +348,13 @@ define <32 x i8> @test_unpackl_packss_256(<16 x i16> %a0, <16 x i16> %a1, <16 x
define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackh_packss_256:
-; X32: ## BB#0:
-; X32-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32: ## %bb.0:
+; X32-NEXT: vpackssdw %ymm3, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packss_256:
-; X64: ## BB#0:
-; X64-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpackssdw %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64: ## %bb.0:
+; X64-NEXT: vpackssdw %ymm3, %ymm1, %ymm0
; X64-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
%2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a2, <8 x i32> %a3)
@@ -444,17 +364,13 @@ define <16 x i16> @test_unpackh_packss_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3) {
; X32-LABEL: test_unpackl_packus_256:
-; X32: ## BB#0:
-; X32-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X32: ## %bb.0:
+; X32-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackl_packus_256:
-; X64: ## BB#0:
-; X64-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpackusdw %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X64: ## %bb.0:
+; X64-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; X64-NEXT: retq
%1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
%2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a2, <8 x i32> %a3)
@@ -464,17 +380,13 @@ define <16 x i16> @test_unpackl_packus_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
define <32 x i8> @test_unpackh_packus_256(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2, <16 x i16> %a3) {
; X32-LABEL: test_unpackh_packus_256:
-; X32: ## BB#0:
-; X32-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpacksswb %ymm3, %ymm2, %ymm1
-; X32-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X32: ## %bb.0:
+; X32-NEXT: vpacksswb %ymm3, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_unpackh_packus_256:
-; X64: ## BB#0:
-; X64-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpacksswb %ymm3, %ymm2, %ymm1
-; X64-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; X64: ## %bb.0:
+; X64-NEXT: vpacksswb %ymm3, %ymm1, %ymm0
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
%2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a2, <16 x i16> %a3)
diff --git a/test/CodeGen/X86/i128-and-beyond.ll b/test/CodeGen/X86/i128-and-beyond.ll
index b741681ac17e..5018900d0c73 100644
--- a/test/CodeGen/X86/i128-and-beyond.ll
+++ b/test/CodeGen/X86/i128-and-beyond.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-pc-linux-gnu | grep -- -1 | count 14
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu | grep -- -1 | count 14
; These static initializers are too big to hand off to assemblers
; as monolithic blobs.
diff --git a/test/CodeGen/X86/i128-immediate.ll b/test/CodeGen/X86/i128-immediate.ll
index c47569e700f5..999076cddd69 100644
--- a/test/CodeGen/X86/i128-immediate.ll
+++ b/test/CodeGen/X86/i128-immediate.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movq | count 2
+; RUN: llc < %s -mtriple=x86_64-- | grep movq | count 2
define i128 @__addvti3() {
ret i128 -1
diff --git a/test/CodeGen/X86/i128-mul.ll b/test/CodeGen/X86/i128-mul.ll
index 21bca028888a..585d65b678fa 100644
--- a/test/CodeGen/X86/i128-mul.ll
+++ b/test/CodeGen/X86/i128-mul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 | FileCheck %s
; PR1198
define i64 @foo(i64 %x, i64 %y) {
diff --git a/test/CodeGen/X86/i128-sdiv.ll b/test/CodeGen/X86/i128-sdiv.ll
index 89cd495aa8b6..82b75b334b3a 100644
--- a/test/CodeGen/X86/i128-sdiv.ll
+++ b/test/CodeGen/X86/i128-sdiv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; Make sure none of these crash, and that the power-of-two transformations
; trigger correctly.
diff --git a/test/CodeGen/X86/i16lshr8pat.ll b/test/CodeGen/X86/i16lshr8pat.ll
index 7f2da8e29538..425680f9d05c 100644
--- a/test/CodeGen/X86/i16lshr8pat.ll
+++ b/test/CodeGen/X86/i16lshr8pat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s
+; RUN: llc -stop-after expand-isel-pseudos <%s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll
index 7b2656897e0e..36d838a68cb1 100644
--- a/test/CodeGen/X86/i256-add.ll
+++ b/test/CodeGen/X86/i256-add.ll
@@ -4,7 +4,7 @@
define void @add(i256* %p, i256* %q) nounwind {
; X32-LABEL: add:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
@@ -50,7 +50,7 @@ define void @add(i256* %p, i256* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: add:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq 16(%rdi), %rax
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq 8(%rdi), %rdx
@@ -71,7 +71,7 @@ define void @add(i256* %p, i256* %q) nounwind {
}
define void @sub(i256* %p, i256* %q) nounwind {
; X32-LABEL: sub:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
@@ -114,7 +114,7 @@ define void @sub(i256* %p, i256* %q) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sub:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq 16(%rdi), %rax
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq 8(%rdi), %rdx
diff --git a/test/CodeGen/X86/i2k.ll b/test/CodeGen/X86/i2k.ll
index 83c10a58a3a8..ec5178a541aa 100644
--- a/test/CodeGen/X86/i2k.ll
+++ b/test/CodeGen/X86/i2k.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define void @foo(i2011* %x, i2011* %y, i2011* %p) nounwind {
%a = load i2011, i2011* %x
diff --git a/test/CodeGen/X86/i486-fence-loop.ll b/test/CodeGen/X86/i486-fence-loop.ll
index 936e54eddafa..18556f261c33 100644
--- a/test/CodeGen/X86/i486-fence-loop.ll
+++ b/test/CodeGen/X86/i486-fence-loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -mcpu=i486 -o - %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -mcpu=i486 -o - %s | FileCheck %s
; Main test here was that ISelDAG could cope with a MachineNode in the chain
; from the first load to the "X86ISD::SUB". Previously it thought that meant no
@@ -23,4 +23,4 @@ while.body:
if.then:
ret void
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
index 7b1926da245c..e14293797e86 100644
--- a/test/CodeGen/X86/i64-mem-copy.ll
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -7,13 +7,13 @@
define void @foo(i64* %x, i64* %y) {
; X64-LABEL: foo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rsi), %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
;
; X32-LABEL: foo:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
@@ -29,13 +29,13 @@ define void @foo(i64* %x, i64* %y) {
define void @store_i64_from_vector(<8 x i16> %x, <8 x i16> %y, i64* %i) {
; X64-LABEL: store_i64_from_vector:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddw %xmm1, %xmm0
; X64-NEXT: movq %xmm0, (%rdi)
; X64-NEXT: retq
;
; X32-LABEL: store_i64_from_vector:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: paddw %xmm1, %xmm0
; X32-NEXT: movq %xmm0, (%eax)
@@ -49,7 +49,7 @@ define void @store_i64_from_vector(<8 x i16> %x, <8 x i16> %y, i64* %i) {
define void @store_i64_from_vector256(<16 x i16> %x, <16 x i16> %y, i64* %i) {
; X32AVX-LABEL: store_i64_from_vector256:
-; X32AVX: # BB#0:
+; X32AVX: # %bb.0:
; X32AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; X32AVX-NEXT: vextracti128 $1, %ymm0, %xmm0
diff --git a/test/CodeGen/X86/i64-to-float.ll b/test/CodeGen/X86/i64-to-float.ll
index f2fbff143121..0440b3d9575e 100644
--- a/test/CodeGen/X86/i64-to-float.ll
+++ b/test/CodeGen/X86/i64-to-float.ll
@@ -8,27 +8,27 @@
define <2 x double> @mask_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_sitofp_2i64_2f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_sitofp_2i64_2f64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_sitofp_2i64_2f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_sitofp_2i64_2f64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-AVX-NEXT: retq
@@ -39,27 +39,27 @@ define <2 x double> @mask_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_uitofp_2i64_2f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_uitofp_2i64_2f64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; X32-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_uitofp_2i64_2f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_uitofp_2i64_2f64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8,9],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; X64-AVX-NEXT: retq
@@ -70,14 +70,14 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
@@ -86,14 +86,14 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
@@ -107,14 +107,14 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
@@ -123,14 +123,14 @@ define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
@@ -144,7 +144,7 @@ define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X32-SSE-LABEL: clamp_sitofp_2i64_2f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
; X32-SSE-NEXT: andl $-8, %esp
@@ -194,7 +194,7 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: clamp_sitofp_2i64_2f64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
; X32-AVX-NEXT: andl $-8, %esp
@@ -207,8 +207,8 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X32-AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; X32-AVX-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero
; X32-AVX-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
-; X32-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-AVX-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-AVX-NEXT: vmovlpd %xmm0, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
; X32-AVX-NEXT: fstpl {{[0-9]+}}(%esp)
; X32-AVX-NEXT: fildll {{[0-9]+}}(%esp)
@@ -220,7 +220,7 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: clamp_sitofp_2i64_2f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0]
; X64-SSE-NEXT: movdqa %xmm0, %xmm2
; X64-SSE-NEXT: pxor %xmm1, %xmm2
@@ -258,11 +258,11 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X64-SSE-NEXT: movq %xmm1, %rax
; X64-SSE-NEXT: xorps %xmm1, %xmm1
; X64-SSE-NEXT: cvtsi2sdq %rax, %xmm1
-; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: clamp_sitofp_2i64_2f64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361]
; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
@@ -273,7 +273,7 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; X64-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-AVX-NEXT: retq
%clo = icmp slt <2 x i64> %a, <i64 -255, i64 -255>
%lo = select <2 x i1> %clo, <2 x i64> <i64 -255, i64 -255>, <2 x i64> %a
diff --git a/test/CodeGen/X86/iabs.ll b/test/CodeGen/X86/iabs.ll
index f47bd7b2defb..95b0328ee730 100644
--- a/test/CodeGen/X86/iabs.ll
+++ b/test/CodeGen/X86/iabs.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86 --check-prefix=X86-NO-CMOV
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-CMOV
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
;; Integer absolute value, should produce something at least as good as:
;; movl %edi, %eax
@@ -6,15 +9,113 @@
;; cmovll %edi, %eax
;; ret
; rdar://10695237
-define i32 @test(i32 %a) nounwind {
-; CHECK-LABEL: test:
-; CHECK: mov
-; CHECK-NEXT: neg
-; CHECK-NEXT: cmov
-; CHECK-NEXT: ret
- %tmp1neg = sub i32 0, %a
- %b = icmp sgt i32 %a, -1
- %abs = select i1 %b, i32 %a, i32 %tmp1neg
- ret i32 %abs
+define i8 @test_i8(i8 %a) nounwind {
+; X86-LABEL: test_i8:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: sarb $7, %cl
+; X86-NEXT: addb %cl, %al
+; X86-NEXT: xorb %cl, %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_i8:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: sarb $7, %al
+; X64-NEXT: addb %al, %dil
+; X64-NEXT: xorb %al, %dil
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %tmp1neg = sub i8 0, %a
+ %b = icmp sgt i8 %a, -1
+ %abs = select i1 %b, i8 %a, i8 %tmp1neg
+ ret i8 %abs
+}
+
+define i16 @test_i16(i16 %a) nounwind {
+; X86-NO-CMOV-LABEL: test_i16:
+; X86-NO-CMOV: # %bb.0:
+; X86-NO-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NO-CMOV-NEXT: movl %eax, %ecx
+; X86-NO-CMOV-NEXT: sarw $15, %cx
+; X86-NO-CMOV-NEXT: addl %ecx, %eax
+; X86-NO-CMOV-NEXT: xorl %ecx, %eax
+; X86-NO-CMOV-NEXT: # kill: def %ax killed %ax killed %eax
+; X86-NO-CMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: test_i16:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT: movl %ecx, %eax
+; X86-CMOV-NEXT: negw %ax
+; X86-CMOV-NEXT: cmovlw %cx, %ax
+; X86-CMOV-NEXT: retl
+;
+; X64-LABEL: test_i16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: negw %ax
+; X64-NEXT: cmovlw %di, %ax
+; X64-NEXT: retq
+ %tmp1neg = sub i16 0, %a
+ %b = icmp sgt i16 %a, -1
+ %abs = select i1 %b, i16 %a, i16 %tmp1neg
+ ret i16 %abs
+}
+
+define i32 @test_i32(i32 %a) nounwind {
+; X86-NO-CMOV-LABEL: test_i32:
+; X86-NO-CMOV: # %bb.0:
+; X86-NO-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-CMOV-NEXT: movl %eax, %ecx
+; X86-NO-CMOV-NEXT: sarl $31, %ecx
+; X86-NO-CMOV-NEXT: addl %ecx, %eax
+; X86-NO-CMOV-NEXT: xorl %ecx, %eax
+; X86-NO-CMOV-NEXT: retl
+;
+; X86-CMOV-LABEL: test_i32:
+; X86-CMOV: # %bb.0:
+; X86-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-CMOV-NEXT: movl %ecx, %eax
+; X86-CMOV-NEXT: negl %eax
+; X86-CMOV-NEXT: cmovll %ecx, %eax
+; X86-CMOV-NEXT: retl
+;
+; X64-LABEL: test_i32:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: negl %eax
+; X64-NEXT: cmovll %edi, %eax
+; X64-NEXT: retq
+ %tmp1neg = sub i32 0, %a
+ %b = icmp sgt i32 %a, -1
+ %abs = select i1 %b, i32 %a, i32 %tmp1neg
+ ret i32 %abs
+}
+
+define i64 @test_i64(i64 %a) nounwind {
+; X86-LABEL: test_i64:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: sarl $31, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: xorl %ecx, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test_i64:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: negq %rax
+; X64-NEXT: cmovlq %rdi, %rax
+; X64-NEXT: retq
+ %tmp1neg = sub i64 0, %a
+ %b = icmp sgt i64 %a, -1
+ %abs = select i1 %b, i64 %a, i64 %tmp1neg
+ ret i64 %abs
}
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
index 5425670fbb1e..0bd84bbcad18 100644
--- a/test/CodeGen/X86/illegal-bitfield-loadstore.ll
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -4,7 +4,7 @@
define void @i24_or(i24* %a) {
; X86-LABEL: i24_or:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %edx
; X86-NEXT: movzbl 2(%ecx), %eax
@@ -16,7 +16,7 @@ define void @i24_or(i24* %a) {
; X86-NEXT: retl
;
; X64-LABEL: i24_or:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzbl 2(%rdi), %ecx
; X64-NEXT: movb %cl, 2(%rdi)
@@ -33,7 +33,7 @@ define void @i24_or(i24* %a) {
define void @i24_and_or(i24* %a) {
; X86-LABEL: i24_and_or:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %edx
; X86-NEXT: movzbl 2(%ecx), %eax
@@ -46,7 +46,7 @@ define void @i24_and_or(i24* %a) {
; X86-NEXT: retl
;
; X64-LABEL: i24_and_or:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzbl 2(%rdi), %ecx
; X64-NEXT: movb %cl, 2(%rdi)
@@ -65,11 +65,9 @@ define void @i24_and_or(i24* %a) {
define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; X86-LABEL: i24_insert_bit:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi0:
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .Lcfi1:
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
@@ -86,17 +84,16 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
; X86-NEXT: retl
;
; X64-LABEL: i24_insert_bit:
-; X64: # BB#0:
-; X64-NEXT: movzbl %sil, %eax
-; X64-NEXT: movzwl (%rdi), %ecx
-; X64-NEXT: movzbl 2(%rdi), %edx
-; X64-NEXT: movb %dl, 2(%rdi)
-; X64-NEXT: shll $16, %edx
-; X64-NEXT: orl %ecx, %edx
-; X64-NEXT: shll $13, %eax
-; X64-NEXT: andl $16769023, %edx # imm = 0xFFDFFF
-; X64-NEXT: orl %eax, %edx
-; X64-NEXT: movw %dx, (%rdi)
+; X64: # %bb.0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movzbl 2(%rdi), %ecx
+; X64-NEXT: movb %cl, 2(%rdi)
+; X64-NEXT: shll $16, %ecx
+; X64-NEXT: orl %eax, %ecx
+; X64-NEXT: shll $13, %esi
+; X64-NEXT: andl $16769023, %ecx # imm = 0xFFDFFF
+; X64-NEXT: orl %esi, %ecx
+; X64-NEXT: movw %cx, (%rdi)
; X64-NEXT: retq
%extbit = zext i1 %bit to i24
%b = load i24, i24* %a, align 1
@@ -109,26 +106,26 @@ define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
define void @i56_or(i56* %a) {
; X86-LABEL: i56_or:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: orl $384, (%eax) # imm = 0x180
; X86-NEXT: retl
;
; X64-LABEL: i56_or:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl 4(%rdi), %eax
; X64-NEXT: movzbl 6(%rdi), %ecx
-; X64-NEXT: movl (%rdi), %edx
; X64-NEXT: movb %cl, 6(%rdi)
-; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; X64-NEXT: # kill: def %ecx killed %ecx killed %rcx def %rcx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: shlq $32, %rcx
-; X64-NEXT: orq %rcx, %rdx
-; X64-NEXT: orq $384, %rdx # imm = 0x180
-; X64-NEXT: movl %edx, (%rdi)
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: movw %dx, 4(%rdi)
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: orq $384, %rax # imm = 0x180
+; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: movw %ax, 4(%rdi)
; X64-NEXT: retq
%aa = load i56, i56* %a, align 1
%b = or i56 %aa, 384
@@ -138,7 +135,7 @@ define void @i56_or(i56* %a) {
define void @i56_and_or(i56* %a) {
; X86-LABEL: i56_and_or:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $384, %ecx # imm = 0x180
; X86-NEXT: orl (%eax), %ecx
@@ -147,22 +144,22 @@ define void @i56_and_or(i56* %a) {
; X86-NEXT: retl
;
; X64-LABEL: i56_and_or:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl 4(%rdi), %eax
; X64-NEXT: movzbl 6(%rdi), %ecx
-; X64-NEXT: movl (%rdi), %edx
; X64-NEXT: movb %cl, 6(%rdi)
-; X64-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; X64-NEXT: # kill: def %ecx killed %ecx killed %rcx def %rcx
; X64-NEXT: shll $16, %ecx
; X64-NEXT: orl %eax, %ecx
; X64-NEXT: shlq $32, %rcx
-; X64-NEXT: orq %rcx, %rdx
-; X64-NEXT: orq $384, %rdx # imm = 0x180
-; X64-NEXT: movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
-; X64-NEXT: andq %rdx, %rax
-; X64-NEXT: movl %eax, (%rdi)
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movw %ax, 4(%rdi)
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: orq $384, %rax # imm = 0x180
+; X64-NEXT: movabsq $72057594037927808, %rcx # imm = 0xFFFFFFFFFFFF80
+; X64-NEXT: andq %rax, %rcx
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: shrq $32, %rcx
+; X64-NEXT: movw %cx, 4(%rdi)
; X64-NEXT: retq
%b = load i56, i56* %a, align 1
%c = and i56 %b, -128
@@ -173,7 +170,7 @@ define void @i56_and_or(i56* %a) {
define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
; X86-LABEL: i56_insert_bit:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll $13, %ecx
@@ -184,24 +181,24 @@ define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
; X86-NEXT: retl
;
; X64-LABEL: i56_insert_bit:
-; X64: # BB#0:
-; X64-NEXT: movzbl %sil, %eax
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
; X64-NEXT: movzwl 4(%rdi), %ecx
; X64-NEXT: movzbl 6(%rdi), %edx
-; X64-NEXT: movl (%rdi), %esi
; X64-NEXT: movb %dl, 6(%rdi)
-; X64-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
+; X64-NEXT: # kill: def %edx killed %edx killed %rdx def %rdx
; X64-NEXT: shll $16, %edx
; X64-NEXT: orl %ecx, %edx
; X64-NEXT: shlq $32, %rdx
-; X64-NEXT: orq %rdx, %rsi
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: orq %rdx, %rcx
; X64-NEXT: shlq $13, %rax
-; X64-NEXT: movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
-; X64-NEXT: andq %rsi, %rcx
-; X64-NEXT: orq %rax, %rcx
-; X64-NEXT: movl %ecx, (%rdi)
-; X64-NEXT: shrq $32, %rcx
-; X64-NEXT: movw %cx, 4(%rdi)
+; X64-NEXT: movabsq $72057594037919743, %rdx # imm = 0xFFFFFFFFFFDFFF
+; X64-NEXT: andq %rcx, %rdx
+; X64-NEXT: orq %rax, %rdx
+; X64-NEXT: movl %edx, (%rdi)
+; X64-NEXT: shrq $32, %rdx
+; X64-NEXT: movw %dx, 4(%rdi)
; X64-NEXT: retq
%extbit = zext i1 %bit to i56
%b = load i56, i56* %a, align 1
diff --git a/test/CodeGen/X86/illegal-insert.ll b/test/CodeGen/X86/illegal-insert.ll
index dbf1b14684c2..9898a41da11e 100644
--- a/test/CodeGen/X86/illegal-insert.ll
+++ b/test/CodeGen/X86/illegal-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
define <4 x double> @foo0(<4 x double> %t) {
%r = insertelement <4 x double> %t, double 2.3, i32 0
diff --git a/test/CodeGen/X86/illegal-vector-args-return.ll b/test/CodeGen/X86/illegal-vector-args-return.ll
index d783d4fa1b49..ec7547f5f715 100644
--- a/test/CodeGen/X86/illegal-vector-args-return.ll
+++ b/test/CodeGen/X86/illegal-vector-args-return.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm3, %xmm1"
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm2, %xmm0"
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm3, %xmm1"
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm2, %xmm0"
+; RUN: llc < %s -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm3, %xmm1"
+; RUN: llc < %s -mattr=+sse2 -mcpu=nehalem | grep "mulpd %xmm2, %xmm0"
+; RUN: llc < %s -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm3, %xmm1"
+; RUN: llc < %s -mattr=+sse2 -mcpu=nehalem | grep "addps %xmm2, %xmm0"
target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/immediate_merging.ll b/test/CodeGen/X86/immediate_merging.ll
index 8aef9c279b31..e1c291914988 100644
--- a/test/CodeGen/X86/immediate_merging.ll
+++ b/test/CodeGen/X86/immediate_merging.ll
@@ -1,5 +1,6 @@
-; RUN: llc -o - -mtriple=i386-unknown-linux-gnu < %s | FileCheck %s
-; RUN: llc -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
@a = common global i32 0, align 4
@b = common global i32 0, align 4
@@ -13,15 +14,43 @@
; Test -Os to make sure immediates with multiple users don't get pulled in to
; instructions.
define i32 @foo() optsize {
-; CHECK-LABEL: foo:
-; CHECK: movl $1234, [[R1:%[a-z]+]]
-; CHECK-NOT: movl $1234, a
-; CHECK-NOT: movl $1234, b
-; CHECK-NOT: movl $12, c
-; CHECK-NOT: cmpl $12, e
-; CHECK: movl [[R1]], a
-; CHECK: movl [[R1]], b
-
+; X86-LABEL: foo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $1234, %eax # imm = 0x4D2
+; X86-NEXT: movl %eax, a
+; X86-NEXT: movl %eax, b
+; X86-NEXT: movl $12, %eax
+; X86-NEXT: movl %eax, c
+; X86-NEXT: cmpl %eax, e
+; X86-NEXT: jne .LBB0_2
+; X86-NEXT: # %bb.1: # %if.then
+; X86-NEXT: movl $1, x
+; X86-NEXT: .LBB0_2: # %if.end
+; X86-NEXT: movl $1234, f # imm = 0x4D2
+; X86-NEXT: movl $555, %eax # imm = 0x22B
+; X86-NEXT: movl %eax, h
+; X86-NEXT: addl %eax, i
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl $1234, %eax # imm = 0x4D2
+; X64-NEXT: movl %eax, {{.*}}(%rip)
+; X64-NEXT: movl %eax, {{.*}}(%rip)
+; X64-NEXT: movl $12, %eax
+; X64-NEXT: movl %eax, {{.*}}(%rip)
+; X64-NEXT: cmpl %eax, {{.*}}(%rip)
+; X64-NEXT: jne .LBB0_2
+; X64-NEXT: # %bb.1: # %if.then
+; X64-NEXT: movl $1, {{.*}}(%rip)
+; X64-NEXT: .LBB0_2: # %if.end
+; X64-NEXT: movl $1234, {{.*}}(%rip) # imm = 0x4D2
+; X64-NEXT: movl $555, %eax # imm = 0x22B
+; X64-NEXT: movl %eax, {{.*}}(%rip)
+; X64-NEXT: addl %eax, {{.*}}(%rip)
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
entry:
store i32 1234, i32* @a
store i32 1234, i32* @b
@@ -35,13 +64,6 @@ if.then: ; preds = %entry
br label %if.end
; New block.. Make sure 1234 isn't live across basic blocks from before.
-; CHECK: movl $1234, f
-; CHECK: movl $555, [[R3:%[a-z]+]]
-; CHECK-NOT: movl $555, h
-; CHECK-NOT: addl $555, i
-; CHECK: movl [[R3]], h
-; CHECK: addl [[R3]], i
-
if.end: ; preds = %if.then, %entry
store i32 1234, i32* @f
store i32 555, i32* @h
@@ -53,14 +75,22 @@ if.end: ; preds = %if.then, %entry
; Test -O2 to make sure that all immediates get pulled in to their users.
define i32 @foo2() {
-; CHECK-LABEL: foo2:
-; CHECK: movl $1234, a
-; CHECK: movl $1234, b
-
+; X86-LABEL: foo2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $1234, a # imm = 0x4D2
+; X86-NEXT: movl $1234, b # imm = 0x4D2
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: foo2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl $1234, {{.*}}(%rip) # imm = 0x4D2
+; X64-NEXT: movl $1234, {{.*}}(%rip) # imm = 0x4D2
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
entry:
store i32 1234, i32* @a
store i32 1234, i32* @b
-
ret i32 0
}
@@ -72,10 +102,24 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #1
; immediates used to store to the individual memory locations. Make
; sure we don't directly store the immediates.
define void @foomemset() optsize {
-; CHECK-LABEL: foomemset:
-; CHECK-NOT: movl ${{.*}}, AA
-; CHECK: mov{{l|q}} %{{e|r}}ax, AA
-
+; X86-LABEL: foomemset:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $555819297, %eax # imm = 0x21212121
+; X86-NEXT: movl %eax, AA+20
+; X86-NEXT: movl %eax, AA+16
+; X86-NEXT: movl %eax, AA+12
+; X86-NEXT: movl %eax, AA+8
+; X86-NEXT: movl %eax, AA+4
+; X86-NEXT: movl %eax, AA
+; X86-NEXT: retl
+;
+; X64-LABEL: foomemset:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movabsq $2387225703656530209, %rax # imm = 0x2121212121212121
+; X64-NEXT: movq %rax, AA+{{.*}}(%rip)
+; X64-NEXT: movq %rax, AA+{{.*}}(%rip)
+; X64-NEXT: movq %rax, {{.*}}(%rip)
+; X64-NEXT: retq
entry:
call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([100 x i8], [100 x i8]* @AA, i32 0, i32 0), i8 33, i32 24, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/X86/immediate_merging64.ll b/test/CodeGen/X86/immediate_merging64.ll
index 4bc9d4af6440..57f5b3b79d9a 100644
--- a/test/CodeGen/X86/immediate_merging64.ll
+++ b/test/CodeGen/X86/immediate_merging64.ll
@@ -8,7 +8,7 @@
; optimizing for code size.
define i1 @imm_multiple_users(i64 %a, i64* %b) optsize {
; CHECK-LABEL: imm_multiple_users:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq $-1, %rax
; CHECK-NEXT: movq %rax, (%rsi)
; CHECK-NEXT: cmpq %rax, %rdi
@@ -26,7 +26,7 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
; code size.
define void @memset_zero(i8* noalias nocapture %D) optsize {
; CHECK-LABEL: memset_zero:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movq %rax, 7(%rdi)
; CHECK-NEXT: movq %rax, (%rdi)
diff --git a/test/CodeGen/X86/implicit-null-check-negative.ll b/test/CodeGen/X86/implicit-null-check-negative.ll
index c8d425c3889f..c05b4a072adf 100644
--- a/test/CodeGen/X86/implicit-null-check-negative.ll
+++ b/test/CodeGen/X86/implicit-null-check-negative.ll
@@ -37,6 +37,22 @@ define i32 @imp_null_check_gep_load(i32* %x) {
ret i32 %t
}
+define i32 @imp_null_check_neg_gep_load(i32* %x) {
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+; null - 5000 * sizeof(i32) lies outside the null page, so the load
+; into %t cannot be assumed to fault reliably.
+ %x.gep = getelementptr i32, i32* %x, i32 -5000
+ %t = load i32, i32* %x.gep
+ ret i32 %t
+}
+
define i32 @imp_null_check_load_no_md(i32* %x) {
; This is fine, except it is missing the !make.implicit metadata.
entry:
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index ee795667cdb1..8cfc9c669ad0 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -182,6 +182,28 @@ define void @imp_null_check_store(i32* %x) {
ret void
}
+define i32 @imp_null_check_neg_gep_load(i32* %x) {
+; CHECK-LABEL: _imp_null_check_neg_gep_load:
+; CHECK: [[BB0_imp_null_check_neg_gep_load:L[^:]+]]:
+; CHECK: movl -128(%rdi), %eax
+; CHECK: retq
+; CHECK: [[BB1_imp_null_check_neg_gep_load:LBB7_[0-9]+]]:
+; CHECK: movl $42, %eax
+; CHECK: retq
+
+ entry:
+ %c = icmp eq i32* %x, null
+ br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+ ret i32 42
+
+ not_null:
+ %x.gep = getelementptr i32, i32* %x, i32 -32
+ %t = load i32, i32* %x.gep
+ ret i32 %t
+}
+
!0 = !{}
; CHECK-LABEL: __LLVM_FaultMaps:
@@ -194,7 +216,7 @@ define void @imp_null_check_store(i32* %x) {
; CHECK-NEXT: .short 0
; # functions:
-; CHECK-NEXT: .long 7
+; CHECK-NEXT: .long 8
; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_add_result
@@ -262,6 +284,19 @@ define void @imp_null_check_store(i32* %x) {
; CHECK-NEXT: .long [[BB1_imp_null_check_load]]-_imp_null_check_load
; FunctionAddr:
+; CHECK-NEXT: .quad _imp_null_check_neg_gep_load
+; NumFaultingPCs
+; CHECK-NEXT: .long 1
+; Reserved:
+; CHECK-NEXT: .long 0
+; Fault[0].Type:
+; CHECK-NEXT: .long 1
+; Fault[0].FaultOffset:
+; CHECK-NEXT: .long [[BB0_imp_null_check_neg_gep_load]]-_imp_null_check_neg_gep_load
+; Fault[0].HandlerOffset:
+; CHECK-NEXT: .long [[BB1_imp_null_check_neg_gep_load]]-_imp_null_check_neg_gep_load
+
+; FunctionAddr:
; CHECK-NEXT: .quad _imp_null_check_store
; NumFaultingPCs
; CHECK-NEXT: .long 1
@@ -289,7 +324,7 @@ define void @imp_null_check_store(i32* %x) {
; OBJDUMP: FaultMap table:
; OBJDUMP-NEXT: Version: 0x1
-; OBJDUMP-NEXT: NumFunctions: 7
+; OBJDUMP-NEXT: NumFunctions: 8
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
@@ -301,6 +336,8 @@ define void @imp_null_check_store(i32* %x) {
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 4
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingStore, faulting PC offset: 0, handling PC offset: 7
; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 11
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
index 6efc965a6947..31361ac27e3f 100644
--- a/test/CodeGen/X86/implicit-null-checks.mir
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -365,6 +365,18 @@
ret i32 undef
}
+ define i32 @inc_spill_dep(i32* %ptr, i32 %val) {
+ entry:
+ %ptr_is_null = icmp eq i32* %ptr, null
+ br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+ not_null:
+ ret i32 undef
+
+ is_null:
+ ret i32 undef
+ }
+
attributes #0 = { "target-features"="+bmi,+bmi2" }
!0 = !{}
@@ -379,23 +391,23 @@ liveins:
- { reg: '%esi' }
# CHECK: bb.0.entry:
# CHECK: %eax = MOV32ri 2200000
-# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %eax, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x)
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK-NEXT: %eax = FAULTING_OP 1, %bb.3, {{[0-9]+}}, %eax, %rdi, 1, %noreg, 0, %noreg, implicit-def %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: JMP_1 %bb.1
body: |
bb.0.entry:
liveins: %esi, %rdi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.3.is_null, implicit %eflags
+ JE_1 %bb.3, implicit %eflags
bb.1.not_null:
liveins: %esi, %rdi
%eax = MOV32ri 2200000
- %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ %eax = AND32rm killed %eax, killed %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags :: (load 4 from %ir.x)
CMP32rr killed %eax, killed %esi, implicit-def %eflags
- JE_1 %bb.4.ret_100, implicit %eflags
+ JE_1 %bb.4, implicit %eflags
bb.2.ret_200:
%eax = MOV32ri 200
@@ -419,25 +431,25 @@ liveins:
- { reg: '%esi' }
- { reg: '%rdx' }
# CHECK: bb.0.entry:
-# CHECK: %eax = MOV32rm killed %rdx, 1, _, 0, _ :: (volatile load 4 from %ir.ptr)
+# CHECK: %eax = MOV32rm killed %rdx, 1, %noreg, 0, %noreg :: (volatile load 4 from %ir.ptr)
# CHECK-NEXT: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+# CHECK-NEXT: JE_1 %bb.3, implicit %eflags
body: |
bb.0.entry:
liveins: %esi, %rdi, %rdx
- %eax = MOV32rm killed %rdx, 1, _, 0, _ :: (volatile load 4 from %ir.ptr)
+ %eax = MOV32rm killed %rdx, 1, %noreg, 0, %noreg :: (volatile load 4 from %ir.ptr)
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.3.is_null, implicit %eflags
+ JE_1 %bb.3, implicit %eflags
bb.1.not_null:
liveins: %esi, %rdi
%eax = MOV32ri 2200000
- %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ %eax = AND32rm killed %eax, killed %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags :: (load 4 from %ir.x)
CMP32rr killed %eax, killed %esi, implicit-def %eflags
- JE_1 %bb.4.ret_100, implicit %eflags
+ JE_1 %bb.4, implicit %eflags
bb.2.ret_200:
@@ -463,23 +475,23 @@ liveins:
- { reg: '%esi' }
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+# CHECK-NEXT: JE_1 %bb.3, implicit %eflags
body: |
bb.0.entry:
liveins: %esi, %rdi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.3.is_null, implicit %eflags
+ JE_1 %bb.3, implicit %eflags
bb.1.not_null:
liveins: %esi, %rdi
%eax = MOV32ri 2200000
%eax = ADD32ri killed %eax, 100, implicit-def dead %eflags
- %eax = AND32rm killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ %eax = AND32rm killed %eax, killed %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags :: (load 4 from %ir.x)
CMP32rr killed %eax, killed %esi, implicit-def %eflags
- JE_1 %bb.4.ret_100, implicit %eflags
+ JE_1 %bb.4, implicit %eflags
bb.2.ret_200:
%eax = MOV32ri 200
@@ -504,22 +516,22 @@ liveins:
- { reg: '%rsi' }
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.3.is_null, implicit %eflags
+# CHECK-NEXT: JE_1 %bb.3, implicit %eflags
body: |
bb.0.entry:
liveins: %rsi, %rdi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.3.is_null, implicit %eflags
+ JE_1 %bb.3, implicit %eflags
bb.1.not_null:
liveins: %rsi, %rdi
%rdi = MOV64ri 5000
- %rdi = AND64rm killed %rdi, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ %rdi = AND64rm killed %rdi, killed %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags :: (load 4 from %ir.x)
CMP64rr killed %rdi, killed %rsi, implicit-def %eflags
- JE_1 %bb.4.ret_100, implicit %eflags
+ JE_1 %bb.4, implicit %eflags
bb.2.ret_200:
%eax = MOV32ri 200
@@ -544,23 +556,23 @@ liveins:
- { reg: '%rsi' }
# CHECK: bb.0.entry:
# CHECK: %rbx = MOV64rr %rdx
-# CHECK-NEXT: %rbx = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, %rbx, %rdi, 1, _, 0, _, implicit-def %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT: %rbx = FAULTING_OP 1, %bb.3, {{[0-9]+}}, %rbx, %rdi, 1, %noreg, 0, %noreg, implicit-def %eflags :: (load 4 from %ir.x)
body: |
bb.0.entry:
liveins: %rsi, %rdi, %rdx
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.3.is_null, implicit %eflags
+ JE_1 %bb.3, implicit %eflags
bb.1.not_null:
liveins: %rsi, %rdi, %rdx
%rbx = MOV64rr %rdx
- %rbx = AND64rm killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+ %rbx = AND64rm killed %rbx, killed %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags :: (load 4 from %ir.x)
%rdx = MOV64ri 0
CMP64rr killed %rbx, killed %rsi, implicit-def %eflags
- JE_1 %bb.4.ret_100, implicit %eflags
+ JE_1 %bb.4, implicit %eflags
bb.2.ret_200:
%eax = MOV32ri 200
@@ -599,13 +611,13 @@ body: |
CFI_INSTRUCTION offset %rbx, -16
%rbx = MOV64rr %rdi
TEST64rr %rbx, %rbx, implicit-def %eflags
- JE_1 %bb.2.leave, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.stay:
liveins: %rbx
CALL64pcrel32 @f, csr_64, implicit %rsp, implicit-def %rsp
- %eax = MOV32rm killed %rbx, 1, _, 0, _ :: (load 4 from %ir.ptr)
+ %eax = MOV32rm killed %rbx, 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr)
%rbx = POP64r implicit-def %rsp, implicit %rsp
RETQ %eax
@@ -636,15 +648,15 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- %rcx = MOV64rm killed %rsi, 1, _, 0, _ :: (load 8 from %ir.ptr2)
+ %rcx = MOV64rm killed %rsi, 1, %noreg, 0, %noreg :: (load 8 from %ir.ptr2)
%esi = MOV32ri 3076
- %eax = BEXTR32rm killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags :: (load 4 from %ir.ptr)
- %eax = ADD32rm killed %eax, killed %rcx, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.val)
+ %eax = BEXTR32rm killed %rdi, 1, %noreg, 0, %noreg, killed %esi, implicit-def dead %eflags :: (load 4 from %ir.ptr)
+ %eax = ADD32rm killed %eax, killed %rcx, 1, %noreg, 0, %noreg, implicit-def dead %eflags :: (load 4 from %ir.val)
RETQ %eax
bb.2.is_null:
@@ -656,8 +668,8 @@ body: |
name: use_alternate_load_op
# CHECK-LABEL: name: use_alternate_load_op
# CHECK: bb.0.entry:
-# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %rax = FAULTING_OP 1, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 0, %noreg
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -670,14 +682,14 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- %rcx = MOV64rm killed %rsi, 1, _, 0, _
- %rcx = AND64rm killed %rcx, %rdi, 1, _, 0, _, implicit-def dead %eflags
- %rax = MOV64rm killed %rdi, 1, _, 0, _
+ %rcx = MOV64rm killed %rsi, 1, %noreg, 0, %noreg
+ %rcx = AND64rm killed %rcx, %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags
+ %rax = MOV64rm killed %rdi, 1, %noreg, 0, %noreg
RETQ %eax
bb.2.is_null:
@@ -689,8 +701,8 @@ body: |
name: imp_null_check_gep_load_with_use_dep
# CHECK-LABEL: name: imp_null_check_gep_load_with_use_dep
# CHECK: bb.0.entry:
-# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x)
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %eax = FAULTING_OP 1, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 0, %noreg, implicit-def %rax :: (load 4 from %ir.x)
+# CHECK-NEXT: JMP_1 %bb.1
alignment: 4
tracksRegLiveness: true
liveins:
@@ -701,14 +713,14 @@ body: |
liveins: %rsi, %rdi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.1.is_null, implicit %eflags
+ JE_1 %bb.1, implicit %eflags
bb.2.not_null:
liveins: %rdi, %rsi
%rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
- %eax = MOV32rm killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x)
- %eax = LEA64_32r killed %rax, 1, killed %rsi, 4, _
+ %eax = MOV32rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %rax :: (load 4 from %ir.x)
+ %eax = LEA64_32r killed %rax, 1, killed %rsi, 4, %noreg
RETQ %eax
bb.1.is_null:
@@ -721,8 +733,8 @@ name: imp_null_check_load_with_base_sep
# CHECK-LABEL: name: imp_null_check_load_with_base_sep
# CHECK: bb.0.entry:
# CHECK: %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
-# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %esi, %rdi, 1, _, 0, _, implicit-def %eflags
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK-NEXT: %esi = FAULTING_OP 1, %bb.2, {{[0-9]+}}, %esi, %rdi, 1, %noreg, 0, %noreg, implicit-def %eflags
+# CHECK-NEXT: JMP_1 %bb.1
alignment: 4
tracksRegLiveness: true
liveins:
@@ -733,13 +745,13 @@ body: |
liveins: %rsi, %rdi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.1.is_null, implicit %eflags
+ JE_1 %bb.1, implicit %eflags
bb.2.not_null:
liveins: %rdi, %rsi
%rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
- %esi = AND32rm killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags
+ %esi = AND32rm killed %esi, %rdi, 1, %noreg, 0, %noreg, implicit-def dead %eflags
%eax = MOV32rr %esi
RETQ %eax
@@ -752,8 +764,8 @@ body: |
name: inc_store
# CHECK-LABEL: name: inc_store
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %rsi
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %noreg = FAULTING_OP 3, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 0, %noreg, %rsi
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -766,12 +778,12 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV64mr killed %rdi, 1, _, 0, _, killed %rsi
+ MOV64mr killed %rdi, 1, %noreg, 0, %noreg, killed %rsi
RETQ
bb.2.is_null:
@@ -782,8 +794,8 @@ body: |
name: inc_store_plus_offset
# CHECK-LABEL: inc_store_plus_offset
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %rsi
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %noreg = FAULTING_OP 3, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 16, %noreg, %rsi
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -796,12 +808,12 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV64mr killed %rdi, 1, _, 16, _, killed %rsi
+ MOV64mr killed %rdi, 1, %noreg, 16, %noreg, killed %rsi
RETQ
bb.2.is_null:
@@ -813,8 +825,8 @@ name: inc_store_with_dep
# CHECK-LABEL: inc_store_with_dep
# CHECK: bb.0.entry:
# CHECK: %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
-# CHECK-NEXT: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK-NEXT: %noreg = FAULTING_OP 3, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 16, %noreg, %esi
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -827,13 +839,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
- MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+ MOV32mr killed %rdi, 1, %noreg, 16, %noreg, killed %esi
RETQ
bb.2.is_null:
@@ -845,7 +857,7 @@ name: inc_store_with_dep_in_null
# CHECK-LABEL: inc_store_with_dep_in_null
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -858,13 +870,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%esi = ADD32rr %esi, %esi, implicit-def dead %eflags
- MOV32mr killed %rdi, 1, _, 0, _, %esi
+ MOV32mr killed %rdi, 1, %noreg, 0, %noreg, %esi
%eax = MOV32rr killed %esi
RETQ %eax
@@ -880,7 +892,7 @@ name: inc_store_with_volatile
# CHECK-LABEL: inc_store_with_volatile
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -893,12 +905,12 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV32mr killed %rdi, 1, _, 0, _, killed %esi :: (volatile store 4 into %ir.ptr)
+ MOV32mr killed %rdi, 1, %noreg, 0, %noreg, killed %esi :: (volatile store 4 into %ir.ptr)
RETQ
bb.2.is_null:
@@ -910,7 +922,7 @@ name: inc_store_with_two_dep
# CHECK-LABEL: inc_store_with_two_dep
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -923,14 +935,14 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
%esi = ADD32ri killed %esi, 15, implicit-def dead %eflags
- MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+ MOV32mr killed %rdi, 1, %noreg, 16, %noreg, killed %esi
RETQ
bb.2.is_null:
@@ -942,7 +954,7 @@ name: inc_store_with_redefined_base
# CHECK-LABEL: inc_store_with_redefined_base
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -955,13 +967,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%rdi = ADD64rr killed %rdi, killed %rdi, implicit-def dead %eflags
- MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+ MOV32mr killed %rdi, 1, %noreg, 16, %noreg, killed %esi
RETQ
bb.2.is_null:
@@ -972,8 +984,8 @@ body: |
name: inc_store_with_reused_base
# CHECK-LABEL: inc_store_with_reused_base
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 16, _, %esi
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %noreg = FAULTING_OP 3, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 16, %noreg, %esi
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -986,13 +998,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%rax = MOV64rr %rdi
- MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+ MOV32mr killed %rdi, 1, %noreg, 16, %noreg, killed %esi
RETQ %eax
bb.2.is_null:
@@ -1005,7 +1017,7 @@ name: inc_store_across_call
# CHECK-LABEL: inc_store_across_call
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rbx, %rbx, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -1025,13 +1037,13 @@ body: |
CFI_INSTRUCTION offset %rbx, -16
%rbx = MOV64rr killed %rdi
TEST64rr %rbx, %rbx, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rbx
CALL64pcrel32 @f, csr_64, implicit %rsp, implicit-def %rsp
- MOV32mi %rbx, 1, _, 0, _, 20
+ MOV32mi %rbx, 1, %noreg, 0, %noreg, 20
%rax = MOV64rr killed %rbx
%rbx = POP64r implicit-def %rsp, implicit %rsp
RETQ %eax
@@ -1047,7 +1059,7 @@ name: inc_store_with_dep_in_dep
# CHECK-LABEL: inc_store_with_dep_in_dep
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -1060,14 +1072,14 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%eax = MOV32rr %esi
%esi = ADD32ri killed %esi, 15, implicit-def dead %eflags
- MOV32mr killed %rdi, 1, _, 0, _, killed %esi
+ MOV32mr killed %rdi, 1, %noreg, 0, %noreg, killed %esi
RETQ %eax
bb.2.is_null:
@@ -1080,7 +1092,7 @@ name: inc_store_with_load_over_store
# CHECK-LABEL: inc_store_with_load_over_store
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -1093,13 +1105,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV32mi killed %rsi, 1, _, 0, _, 2
- %eax = MOV32rm killed %rdi, 1, _, 0, _
+ MOV32mi killed %rsi, 1, %noreg, 0, %noreg, 2
+ %eax = MOV32rm killed %rdi, 1, %noreg, 0, %noreg
RETQ %eax
bb.2.is_null:
@@ -1112,7 +1124,7 @@ name: inc_store_with_store_over_load
# CHECK-LABEL: inc_store_with_store_over_load
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -1125,13 +1137,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- %eax = MOV32rm killed %rsi, 1, _, 0, _
- MOV32mi killed %rdi, 1, _, 0, _, 2
+ %eax = MOV32rm killed %rsi, 1, %noreg, 0, %noreg
+ MOV32mi killed %rdi, 1, %noreg, 0, %noreg, 2
RETQ %eax
bb.2.is_null:
@@ -1144,7 +1156,7 @@ name: inc_store_with_store_over_store
# CHECK-LABEL: inc_store_with_store_over_store
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
@@ -1157,13 +1169,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV32mi killed %rsi, 1, _, 0, _, 3
- MOV32mi killed %rdi, 1, _, 0, _, 2
+ MOV32mi killed %rsi, 1, %noreg, 0, %noreg, 3
+ MOV32mi killed %rdi, 1, %noreg, 0, %noreg, 2
RETQ
bb.2.is_null:
@@ -1174,8 +1186,8 @@ body: |
name: inc_store_with_load_and_store
# CHECK-LABEL: inc_store_with_load_and_store
# CHECK: bb.0.entry:
-# CHECK: _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _, %esi, implicit-def %eflags
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %noreg = FAULTING_OP 2, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 0, %noreg, %esi, implicit-def %eflags
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -1188,13 +1200,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
%esi = ADD32rr %esi, %esi, implicit-def dead %eflags
- ADD32mr killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags
+ ADD32mr killed %rdi, 1, %noreg, 0, %noreg, killed %esi, implicit-def dead %eflags
RETQ
bb.2.is_null:
@@ -1205,8 +1217,8 @@ body: |
name: inc_store_and_load_no_alias
# CHECK-LABEL: inc_store_and_load_no_alias
# CHECK: bb.0.entry:
-# CHECK: %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
-# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: %eax = FAULTING_OP 1, %bb.2, {{[0-9]+}}, %rdi, 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr)
+# CHECK-NEXT: JMP_1 %bb.1
# CHECK: bb.1.not_null
alignment: 4
@@ -1219,13 +1231,13 @@ body: |
liveins: %rdi, %rsi
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV32mi killed %rsi, 1, _, 0, _, 3 :: (store 4 into %ir.ptr2)
- %eax = MOV32rm killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+ MOV32mi killed %rsi, 1, %noreg, 0, %noreg, 3 :: (store 4 into %ir.ptr2)
+ %eax = MOV32rm killed %rdi, 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr)
RETQ %eax
bb.2.is_null:
@@ -1238,11 +1250,45 @@ name: inc_store_and_load_alias
# CHECK-LABEL: inc_store_and_load_alias
# CHECK: bb.0.entry:
# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment: 4
+tracksRegLiveness: true
+liveins:
+ - { reg: '%rdi' }
+ - { reg: '%rsi' }
+body: |
+ bb.0.entry:
+ liveins: %rdi, %rsi
+
+ TEST64rr %rdi, %rdi, implicit-def %eflags
+ JE_1 %bb.2, implicit killed %eflags
+
+ bb.1.not_null:
+ liveins: %rdi, %rsi
+
+ MOV32mi killed %rsi, 1, %noreg, 0, %noreg, 3 :: (store 4 into %ir.ptr2)
+ %eax = MOV32rm killed %rdi, 1, %noreg, 0, %noreg :: (load 4 from %ir.ptr)
+ RETQ %eax
+
+ bb.2.is_null:
+ %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+ RETQ %eax
+
+...
+---
+name: inc_spill_dep
+# CHECK-LABEL: inc_spill_dep
+# CHECK: bb.0.entry:
+# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT: JE_1 %bb.2, implicit killed %eflags
# CHECK: bb.1.not_null
alignment: 4
tracksRegLiveness: true
+stack:
+ - { id: 0, type: spill-slot, offset: -8, size: 8, alignment: 8}
liveins:
- { reg: '%rdi' }
- { reg: '%rsi' }
@@ -1250,14 +1296,18 @@ body: |
bb.0.entry:
liveins: %rdi, %rsi
+ %rsp = frame-setup SUB64ri8 %rsp, 8, implicit-def dead %eflags
+ MOV32mr %rsp, 1, %noreg, 0, %noreg, %esi :: (store 4 into %stack.0)
TEST64rr %rdi, %rdi, implicit-def %eflags
- JE_1 %bb.2.is_null, implicit killed %eflags
+ JE_1 %bb.2, implicit killed %eflags
bb.1.not_null:
liveins: %rdi, %rsi
- MOV32mi killed %rsi, 1, _, 0, _, 3 :: (store 4 into %ir.ptr2)
- %eax = MOV32rm killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+ %r14d = MOV32rm %rsp, 1, %noreg, 0, %noreg :: (load 4 from %stack.0)
+ MOV64mr %rsp, 1, %noreg, 0, %noreg, %rdi :: (store 8 into %stack.0)
+ %edi = MOV32rm %rdi, 1, %noreg, 8, %noreg :: (load 4 from %ir.ptr)
+ %eax = MOV32rr %edi
RETQ %eax
bb.2.is_null:
diff --git a/test/CodeGen/X86/implicit-use-spill.mir b/test/CodeGen/X86/implicit-use-spill.mir
index 94bdd47b4470..25f245e9c4fb 100644
--- a/test/CodeGen/X86/implicit-use-spill.mir
+++ b/test/CodeGen/X86/implicit-use-spill.mir
@@ -11,10 +11,10 @@ body: |
bb.0:
; CHECK: NOOP implicit-def [[VAL:%[0-9]+]]
; VAL should be spilled before csr_noregs, i.e., before we clobber all the registers
- ; CHECK-NEXT: MOV64mr [[SLOT:%stack.[0-9]+]], 1, _, 0, _, [[VAL]]
+ ; CHECK-NEXT: MOV64mr [[SLOT:%stack.[0-9]+]], 1, %noreg, 0, %noreg, [[VAL]]
; CHECK-NEXT: NOOP csr_noregs
; We need to reload before the (implicit) use.
- ; CHECK-NEXT: [[RELOADED_VAL:%[0-9]+]] = MOV64rm [[SLOT]], 1, _, 0, _
+ ; CHECK-NEXT: [[RELOADED_VAL:%[0-9]+]]:gr64 = MOV64rm [[SLOT]], 1, %noreg, 0, %noreg
; CHECK-NEXT: NOOP implicit [[RELOADED_VAL]]
NOOP implicit-def %0
NOOP csr_noregs
diff --git a/test/CodeGen/X86/imul-lea-2.ll b/test/CodeGen/X86/imul-lea-2.ll
index 7b79d0678bee..d1de25d02ef1 100644
--- a/test/CodeGen/X86/imul-lea-2.ll
+++ b/test/CodeGen/X86/imul-lea-2.ll
@@ -1,19 +1,26 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
-; CHECK-NOT: imul
define i64 @t1(i64 %a) nounwind readnone {
+; CHECK-LABEL: t1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: leaq (%rdi,%rdi,8), %rax
+; CHECK-NEXT: leaq (%rax,%rax,8), %rax
+; CHECK-NEXT: retq
entry:
%0 = mul i64 %a, 81
-; CHECK: lea
-; CHECK: lea
ret i64 %0
}
define i64 @t2(i64 %a) nounwind readnone {
+; CHECK-LABEL: t2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: shlq $3, %rdi
+; CHECK-NEXT: leaq (%rdi,%rdi,4), %rax
+; CHECK-NEXT: retq
entry:
%0 = mul i64 %a, 40
-; CHECK: shl
-; CHECK: lea
ret i64 %0
}
+
diff --git a/test/CodeGen/X86/imul-lea.ll b/test/CodeGen/X86/imul-lea.ll
index d55ece7996ed..777222ec0bf2 100644
--- a/test/CodeGen/X86/imul-lea.ll
+++ b/test/CodeGen/X86/imul-lea.ll
@@ -1,12 +1,16 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s
declare i32 @foo()
define i32 @test() {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: calll foo
+; CHECK-NEXT: leal (%eax,%eax,8), %eax
+; CHECK-NEXT: retl
%tmp.0 = tail call i32 @foo( )
%tmp.1 = mul i32 %tmp.0, 9
-; CHECK-NOT: mul
-; CHECK: lea
ret i32 %tmp.1
}
diff --git a/test/CodeGen/X86/imul.ll b/test/CodeGen/X86/imul.ll
index 45a83cc5dfd9..ff7df4f6b130 100644
--- a/test/CodeGen/X86/imul.ll
+++ b/test/CodeGen/X86/imul.ll
@@ -174,14 +174,14 @@ define i64 @mul18446744073709551615_64(i64 %A) {
define i32 @test(i32 %a) {
; X64-LABEL: test:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: subl %edi, %eax
; X64-NEXT: retq
;
; X86-LABEL: test:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
@@ -194,7 +194,7 @@ entry:
define i32 @test1(i32 %a) {
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: subl %edi, %eax
@@ -202,7 +202,7 @@ define i32 @test1(i32 %a) {
; X64-NEXT: retq
;
; X86-LABEL: test1:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
@@ -217,15 +217,15 @@ entry:
define i32 @test2(i32 %a) {
; X64-LABEL: test2:
-; X64: # BB#0: # %entry
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: leal (%rax,%rdi), %eax
; X64-NEXT: retq
;
; X86-LABEL: test2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
@@ -238,8 +238,8 @@ entry:
define i32 @test3(i32 %a) {
; X64-LABEL: test3:
-; X64: # BB#0: # %entry
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: leal (%rax,%rdi), %eax
@@ -247,7 +247,7 @@ define i32 @test3(i32 %a) {
; X64-NEXT: retq
;
; X86-LABEL: test3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
@@ -261,14 +261,14 @@ entry:
define i64 @test4(i64 %a) {
; X64-LABEL: test4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shlq $5, %rax
; X64-NEXT: subq %rdi, %rax
; X64-NEXT: retq
;
; X86-LABEL: test4:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
@@ -284,7 +284,7 @@ entry:
define i64 @test5(i64 %a) {
; X64-LABEL: test5:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shlq $5, %rax
; X64-NEXT: subq %rdi, %rax
@@ -292,11 +292,9 @@ define i64 @test5(i64 %a) {
; X64-NEXT: retq
;
; X86-LABEL: test5:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi0:
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .Lcfi1:
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -318,14 +316,14 @@ entry:
define i64 @test6(i64 %a) {
; X64-LABEL: test6:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shlq $5, %rax
; X64-NEXT: leaq (%rax,%rdi), %rax
; X64-NEXT: retq
;
; X86-LABEL: test6:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
@@ -341,7 +339,7 @@ entry:
define i64 @test7(i64 %a) {
; X64-LABEL: test7:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shlq $5, %rax
; X64-NEXT: leaq (%rax,%rdi), %rax
@@ -349,11 +347,9 @@ define i64 @test7(i64 %a) {
; X64-NEXT: retq
;
; X86-LABEL: test7:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi2:
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .Lcfi3:
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -374,17 +370,15 @@ entry:
define i64 @testOverflow(i64 %a) {
; X64-LABEL: testOverflow:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
; X64-NEXT: imulq %rdi, %rax
; X64-NEXT: retq
;
; X86-LABEL: testOverflow:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi4:
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .Lcfi5:
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $-1, %edx
diff --git a/test/CodeGen/X86/inline-0bh.ll b/test/CodeGen/X86/inline-0bh.ll
index ceef395aa147..b1e7e57e0b20 100644
--- a/test/CodeGen/X86/inline-0bh.ll
+++ b/test/CodeGen/X86/inline-0bh.ll
@@ -4,7 +4,7 @@
; Function Attrs: noinline nounwind
define i32 @PR31007() {
; CHECK-LABEL: PR31007:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: #APP
; CHECK: addb $11, %al
; CHECK: #NO_APP
diff --git a/test/CodeGen/X86/inline-asm-A-constraint.ll b/test/CodeGen/X86/inline-asm-A-constraint.ll
index 2ad011e88e0d..7975b318eff5 100644
--- a/test/CodeGen/X86/inline-asm-A-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-A-constraint.ll
@@ -19,8 +19,7 @@ entry:
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.2.0.extract.trunc, 1
ret { i64, i64 } %.fca.1.insert
}
-; CHECK: lock
-; CHECK-NEXT: cmpxchg16b
+; CHECK: lock cmpxchg16b
attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/inline-asm-R-constraint.ll b/test/CodeGen/X86/inline-asm-R-constraint.ll
index d17e04dd7949..218638c0e653 100644
--- a/test/CodeGen/X86/inline-asm-R-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-R-constraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; 7282062
; ModuleID = '<stdin>'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/CodeGen/X86/inline-asm-avx-v-constraint-32bit.ll b/test/CodeGen/X86/inline-asm-avx-v-constraint-32bit.ll
index fa04530e5cf7..c4bdfb6a1038 100644
--- a/test/CodeGen/X86/inline-asm-avx-v-constraint-32bit.ll
+++ b/test/CodeGen/X86/inline-asm-avx-v-constraint-32bit.ll
@@ -1,133 +1,133 @@
; RUN: not llc < %s -mtriple i386-unknown-linux-gnu -mattr +avx -o /dev/null 2> %t
; RUN: FileCheck %s --input-file %t
-define <4 x float> @testXMM_1(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_1(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_2(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_2(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "movapd $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_3(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_3(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vmovapd $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_4(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_4(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vmpsadbw $$0, $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_5(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_5(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l, i32 %_l)
ret <4 x float> %0
}
-define i32 @testXMM_6(i32 returned %_l) {
+define i32 @testxmm_6(i32 returned %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
tail call void asm sideeffect "vmovd $0, %eax", "v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l)
ret i32 %_l
}
-define <4 x float> @testXMM_7(<4 x float> returned %_xmm0) {
+define <4 x float> @testxmm_7(<4 x float> returned %_xmm0) {
; CHECK: error: inline assembly requires more registers than available
entry:
tail call void asm sideeffect "vmovmskps $0, %eax", "v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(<4 x float> %_xmm0)
ret <4 x float> %_xmm0
}
-define i32 @testXMM_8(<4 x float> %_xmm0, i32 %_l) {
+define i32 @testxmm_8(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call i32 asm "vmulsd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l, <4 x float> %_xmm0)
ret i32 %0
}
-define <4 x float> @testXMM_9(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_9(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vorpd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_10(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_10(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "pabsb $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_11(<4 x float> %_xmm0, i32 %_l) {
+define <4 x float> @testxmm_11(<4 x float> %_xmm0, i32 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vpabsd $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i32 %_l)
ret <4 x float> %0
}
-define <8 x float> @testYMM_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovapd $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vorpd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmulps $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmulpd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovups $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovupd $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
diff --git a/test/CodeGen/X86/inline-asm-avx-v-constraint.ll b/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
index 140c2544f19c..2c8de16fd372 100644
--- a/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-avx-v-constraint.ll
@@ -1,133 +1,133 @@
-; RUN: llc < %s -march x86-64 -mtriple x86_64-unknown-linux-gnu -mattr +avx | FileCheck %s
-; RUN: llc < %s -march x86-64 -mtriple x86_64-unknown-linux-gnu -mattr +avx512f | FileCheck %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu -mattr +avx | FileCheck %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu -mattr +avx512f | FileCheck %s
-define <4 x float> @testXMM_1(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_1(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vmovhlps %xmm1, %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_2(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_2(<4 x float> %_xmm0, i64 %_l) {
; CHECK: movapd %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "movapd $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_3(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_3(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vmovapd %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "vmovapd $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_4(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_4(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vmpsadbw $0, %xmm1, %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "vmpsadbw $$0, $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_5(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_5(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vminpd %xmm0, %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l, i64 %_l)
ret <4 x float> %0
}
-define i64 @testXMM_6(i64 returned %_l) {
+define i64 @testxmm_6(i64 returned %_l) {
; CHECK: vmovd %xmm0, %eax
entry:
tail call void asm sideeffect "vmovd $0, %eax", "v,~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret i64 %_l
}
-define <4 x float> @testXMM_7(<4 x float> returned %_xmm0) {
+define <4 x float> @testxmm_7(<4 x float> returned %_xmm0) {
; CHECK: vmovmskps %xmm0, %eax
entry:
tail call void asm sideeffect "vmovmskps $0, %rax", "v,~{dirflag},~{fpsr},~{flags}"(<4 x float> %_xmm0)
ret <4 x float> %_xmm0
}
-define i64 @testXMM_8(<4 x float> %_xmm0, i64 %_l) {
+define i64 @testxmm_8(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vmulsd %xmm1, %xmm0, %xmm0
entry:
%0 = tail call i64 asm "vmulsd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
ret i64 %0
}
-define <4 x float> @testXMM_9(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_9(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vorpd %xmm1, %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "vorpd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_10(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_10(<4 x float> %_xmm0, i64 %_l) {
; CHECK: pabsb %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "pabsb $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_11(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_11(<4 x float> %_xmm0, i64 %_l) {
; CHECK: vpabsd %xmm0, %xmm0
entry:
%0 = tail call <4 x float> asm "vpabsd $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret <4 x float> %0
}
-define <8 x float> @testYMM_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vmovsldup %ymm0, %ymm0
entry:
%0 = tail call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vmovapd %ymm1, %ymm0
entry:
%0 = tail call <8 x float> asm "vmovapd $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vminpd %ymm1, %ymm0, %ymm0
entry:
%0 = tail call <8 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vorpd %ymm1, %ymm0, %ymm0
entry:
%0 = tail call <8 x float> asm "vorpd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vmulps %ymm1, %ymm0, %ymm0
entry:
%0 = tail call <8 x float> asm "vmulps $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vmulpd %ymm1, %ymm0, %ymm0
entry:
%0 = tail call <8 x float> asm "vmulpd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vmovups %ymm1, %ymm0
entry:
%0 = tail call <8 x float> asm "vmovups $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: vmovupd %ymm1, %ymm0
entry:
%0 = tail call <8 x float> asm "vmovupd $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
diff --git a/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll b/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
index 4600d4e5b99d..019973ba935b 100644
--- a/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-avx512f-v-constraint.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -march x86-64 -mtriple x86_64-unknown-linux-gnu -mattr +avx512f | FileCheck %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu -mattr +avx512f | FileCheck %s
-define <16 x float> @testZMM_1(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_1(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpternlogd $0, %zmm1, %zmm0, %zmm0
%0 = tail call <16 x float> asm "vpternlogd $$0, $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1, <16 x float> %_zmm0)
ret <16 x float> %0
}
-define <16 x float> @testZMM_2(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_2(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpabsq %zmm1, %zmm0
%0 = tail call <16 x float> asm "vpabsq $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1)
@@ -15,7 +15,7 @@ entry:
}
-define <16 x float> @testZMM_3(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_3(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpaddd %zmm1, %zmm1, %zmm0
%0 = tail call <16 x float> asm "vpaddd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1, <16 x float> %_zmm1)
@@ -23,7 +23,7 @@ entry:
}
-define <16 x float> @testZMM_4(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_4(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpaddq %zmm1, %zmm1, %zmm0
%0 = tail call <16 x float> asm "vpaddq $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1, <16 x float> %_zmm1)
@@ -31,7 +31,7 @@ entry:
}
-define <16 x float> @testZMM_5(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_5(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpandd %zmm1, %zmm1, %zmm0
%0 = tail call <16 x float> asm "vpandd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1, <16 x float> %_zmm1)
@@ -39,7 +39,7 @@ entry:
}
-define <16 x float> @testZMM_6(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_6(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpandnd %zmm1, %zmm1, %zmm0
%0 = tail call <16 x float> asm "vpandnd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1, <16 x float> %_zmm1)
@@ -47,7 +47,7 @@ entry:
}
-define <16 x float> @testZMM_7(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_7(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vpmaxsd %zmm1, %zmm1, %zmm0
%0 = tail call <16 x float> asm "vpmaxsd $1, $2, $0", "=v,v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1, <16 x float> %_zmm1)
@@ -55,7 +55,7 @@ entry:
}
-define <16 x float> @testZMM_8(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_8(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vmovups %zmm1, %zmm0
%0 = tail call <16 x float> asm "vmovups $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1)
@@ -63,7 +63,7 @@ entry:
}
-define <16 x float> @testZMM_9(<16 x float> %_zmm0, <16 x float> %_zmm1) {
+define <16 x float> @testzmm_9(<16 x float> %_zmm0, <16 x float> %_zmm1) {
entry:
; CHECK: vmovupd %zmm1, %zmm0
%0 = tail call <16 x float> asm "vmovupd $1, $0", "=v,v,~{dirflag},~{fpsr},~{flags}"(<16 x float> %_zmm1)
diff --git a/test/CodeGen/X86/inline-asm-avx512vl-v-constraint-32bit.ll b/test/CodeGen/X86/inline-asm-avx512vl-v-constraint-32bit.ll
index 81d17d3ac9a2..7278089348e2 100644
--- a/test/CodeGen/X86/inline-asm-avx512vl-v-constraint-32bit.ll
+++ b/test/CodeGen/X86/inline-asm-avx512vl-v-constraint-32bit.ll
@@ -1,7 +1,7 @@
; RUN: not llc < %s -mtriple i386-unknown-linux-gnu -mattr +avx512vl -o /dev/null 2> %t
; RUN: FileCheck %s --input-file %t
-define <4 x float> @testXMM_1(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_1(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
@@ -9,7 +9,7 @@ entry:
}
-define <4 x float> @testXMM_2(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_2(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vmovapd $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i64 %_l)
@@ -17,7 +17,7 @@ entry:
}
-define <4 x float> @testXMM_3(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_3(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i64 %_l, i64 %_l)
@@ -25,7 +25,7 @@ entry:
}
-define i64 @testXMM_4(<4 x float> %_xmm0, i64 %_l) {
+define i64 @testxmm_4(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call i64 asm "vmulsd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
@@ -33,7 +33,7 @@ entry:
}
-define <4 x float> @testXMM_5(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_5(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vpabsq $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(i64 %_l)
@@ -41,7 +41,7 @@ entry:
}
-define <4 x float> @testXMM_6(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_6(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vpandd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(<4 x float> %_xmm0, i64 %_l)
@@ -49,7 +49,7 @@ entry:
}
-define <4 x float> @testXMM_7(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_7(<4 x float> %_xmm0, i64 %_l) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <4 x float> asm "vpandnd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{dirflag},~{fpsr},~{flags}"(<4 x float> %_xmm0, i64 %_l)
@@ -57,7 +57,7 @@ entry:
}
-define <8 x float> @testYMM_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
@@ -65,7 +65,7 @@ entry:
}
-define <8 x float> @testYMM_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovapd $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
@@ -73,7 +73,7 @@ entry:
}
-define <8 x float> @testYMM_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm1)
@@ -81,7 +81,7 @@ entry:
}
-define <8 x float> @testYMM_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vpabsq $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
@@ -89,7 +89,7 @@ entry:
}
-define <8 x float> @testYMM_5(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_5(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vpandd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
@@ -97,7 +97,7 @@ entry:
}
-define <8 x float> @testYMM_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vpandnd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
@@ -105,7 +105,7 @@ entry:
}
-define <8 x float> @testYMM_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vpminud $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
@@ -113,7 +113,7 @@ entry:
}
-define <8 x float> @testYMM_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vpmaxsd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
@@ -121,7 +121,7 @@ entry:
}
-define <8 x float> @testYMM_9(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_9(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovups $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
@@ -129,7 +129,7 @@ entry:
}
-define <8 x float> @testYMM_10(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_10(<8 x float> %_ymm0, <8 x float> %_ymm1) {
; CHECK: error: inline assembly requires more registers than available
entry:
%0 = tail call <8 x float> asm "vmovupd $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
diff --git a/test/CodeGen/X86/inline-asm-avx512vl-v-constraint.ll b/test/CodeGen/X86/inline-asm-avx512vl-v-constraint.ll
index 3453fb1be5dc..4b01814b2e20 100644
--- a/test/CodeGen/X86/inline-asm-avx512vl-v-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-avx512vl-v-constraint.ll
@@ -1,118 +1,118 @@
-; RUN: llc < %s -march x86-64 -mtriple x86_64-unknown-linux-gnu -mattr +avx512vl | FileCheck %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux-gnu -mattr +avx512vl | FileCheck %s
-define <4 x float> @testXMM_1(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_1(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vmovhlps %xmm17, %xmm16, %xmm16
%0 = tail call <4 x float> asm "vmovhlps $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
ret <4 x float> %0
}
-define <4 x float> @testXMM_2(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_2(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vmovapd %xmm16, %xmm16
%0 = tail call <4 x float> asm "vmovapd $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_3(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_3(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vminpd %xmm16, %xmm16, %xmm16
%0 = tail call <4 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(i64 %_l, i64 %_l)
ret <4 x float> %0
}
-define i64 @testXMM_4(<4 x float> %_xmm0, i64 %_l) {
+define i64 @testxmm_4(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vmulsd %xmm17, %xmm16, %xmm16
%0 = tail call i64 asm "vmulsd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(i64 %_l, <4 x float> %_xmm0)
ret i64 %0
}
-define <4 x float> @testXMM_5(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_5(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vpabsq %xmm16, %xmm16
%0 = tail call <4 x float> asm "vpabsq $1, $0", "=v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(i64 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_6(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_6(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vpandd %xmm16, %xmm17, %xmm16
%0 = tail call <4 x float> asm "vpandd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(<4 x float> %_xmm0, i64 %_l)
ret <4 x float> %0
}
-define <4 x float> @testXMM_7(<4 x float> %_xmm0, i64 %_l) {
+define <4 x float> @testxmm_7(<4 x float> %_xmm0, i64 %_l) {
entry:
; CHECK: vpandnd %xmm16, %xmm17, %xmm16
%0 = tail call <4 x float> asm "vpandnd $1, $2, $0", "=v,v,v,~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"(<4 x float> %_xmm0, i64 %_l)
ret <4 x float> %0
}
-define <8 x float> @testYMM_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_1(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vmovsldup %ymm16, %ymm16
%0 = tail call <8 x float> asm "vmovsldup $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_2(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vmovapd %ymm16, %ymm16
%0 = tail call <8 x float> asm "vmovapd $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_3(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vminpd %ymm16, %ymm16, %ymm16
%0 = tail call <8 x float> asm "vminpd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_4(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vpabsq %ymm16, %ymm16
%0 = tail call <8 x float> asm "vpabsq $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_5(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_5(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vpandd %ymm16, %ymm17, %ymm16
%0 = tail call <8 x float> asm "vpandd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_6(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vpandnd %ymm16, %ymm17, %ymm16
%0 = tail call <8 x float> asm "vpandnd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_7(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vpminud %ymm16, %ymm17, %ymm16
%0 = tail call <8 x float> asm "vpminud $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_8(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vpmaxsd %ymm16, %ymm17, %ymm16
%0 = tail call <8 x float> asm "vpmaxsd $1, $2, $0", "=v,v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1, <8 x float> %_ymm0)
ret <8 x float> %0
}
-define <8 x float> @testYMM_9(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_9(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vmovups %ymm16, %ymm16
%0 = tail call <8 x float> asm "vmovups $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
ret <8 x float> %0
}
-define <8 x float> @testYMM_10(<8 x float> %_ymm0, <8 x float> %_ymm1) {
+define <8 x float> @testymm_10(<8 x float> %_ymm0, <8 x float> %_ymm1) {
entry:
; CHECK: vmovupd %ymm16, %ymm16
%0 = tail call <8 x float> asm "vmovupd $1, $0", "=v,v,~{ymm0},~{ymm1},~{ymm2},~{ymm3},~{ymm4},~{ymm5},~{ymm6},~{ymm7},~{ymm8},~{ymm9},~{ymm10},~{ymm11},~{ymm12},~{ymm13},~{ymm14},~{ymm15},~{dirflag},~{fpsr},~{flags}"(<8 x float> %_ymm1)
diff --git a/test/CodeGen/X86/inline-asm-bad-constraint-n.ll b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
index 91b1ffed4e0f..967477d076d3 100644
--- a/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
+++ b/test/CodeGen/X86/inline-asm-bad-constraint-n.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -march=x86 -no-integrated-as < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=i686-- -no-integrated-as < %s 2>&1 | FileCheck %s
@x = global i32 0, align 4
diff --git a/test/CodeGen/X86/inline-asm-duplicated-constraint.ll b/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
index 2ef54749739f..0228f45ce96c 100644
--- a/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
+++ b/test/CodeGen/X86/inline-asm-duplicated-constraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -no-integrated-as -mtriple=x86_64-linux-gnu | FileCheck %s
+; RUN: llc < %s -no-integrated-as -mtriple=x86_64-linux-gnu | FileCheck %s
; CHECK-LABEL: test1:
; CHECK: movl (%rdi), %eax
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
index 31fb190daf83..a757365e3482 100644
--- a/test/CodeGen/X86/inline-asm-error.ll
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -1,6 +1,6 @@
-; RUN: not llc -march x86 -regalloc=fast -optimize-regalloc=0 < %s 2> %t1
-; RUN: not llc -march x86 -regalloc=basic < %s 2> %t2
-; RUN: not llc -march x86 -regalloc=greedy < %s 2> %t3
+; RUN: not llc -mtriple=i686-- -regalloc=fast -optimize-regalloc=0 < %s 2> %t1
+; RUN: not llc -mtriple=i686-- -regalloc=basic < %s 2> %t2
+; RUN: not llc -mtriple=i686-- -regalloc=greedy < %s 2> %t3
; RUN: FileCheck %s < %t1
; RUN: FileCheck %s < %t2
; RUN: FileCheck %s < %t3
diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll
index 0874b51af6a5..e47e636d9e78 100644
--- a/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -no-integrated-as < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- -no-integrated-as < %s | FileCheck %s
; PR3701
define i64 @t(i64* %arg) nounwind {
diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll
index c28dfc7f1c60..b6ac8a18b40b 100644
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -1,124 +1,149 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -verify-machineinstrs -no-integrated-as | FileCheck %s
; There should be no stack manipulations between the inline asm and ret.
-; CHECK: test1
-; CHECK: InlineAsm End
-; CHECK-NEXT: ret
define x86_fp80 @test1() {
- %tmp85 = call x86_fp80 asm sideeffect "fld0", "={st(0)}"()
- ret x86_fp80 %tmp85
+; CHECK-LABEL: test1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fld0
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+ %tmp85 = call x86_fp80 asm sideeffect "fld0", "={st(0)}"()
+ ret x86_fp80 %tmp85
}
-; CHECK: test2
-; CHECK: InlineAsm End
-; CHECK-NEXT: ret
define double @test2() {
- %tmp85 = call double asm sideeffect "fld0", "={st(0)}"()
- ret double %tmp85
+; CHECK-LABEL: test2:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fld0
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+ %tmp85 = call double asm sideeffect "fld0", "={st(0)}"()
+ ret double %tmp85
}
; Setting up argument in st(0) should be a single fld.
-; CHECK: test3
-; CHECK: fld
-; CHECK-NEXT: InlineAsm Start
; Asm consumes stack, nothing should be popped.
-; CHECK: InlineAsm End
-; CHECK-NOT: fstp
-; CHECK: ret
define void @test3(x86_fp80 %X) {
- call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( x86_fp80 %X)
- ret void
+; CHECK-LABEL: test3:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: frob
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+ call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( x86_fp80 %X)
+ ret void
}
-; CHECK: test4
-; CHECK: fld
-; CHECK-NEXT: InlineAsm Start
-; CHECK: InlineAsm End
-; CHECK-NOT: fstp
-; CHECK: ret
define void @test4(double %X) {
- call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( double %X)
- ret void
+; CHECK-LABEL: test4:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: frob
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+ call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( double %X)
+ ret void
}
; Same as test3/4, but using value from fadd.
; The fadd can be done in xmm or x87 regs - we don't test that.
-; CHECK: test5
-; CHECK: InlineAsm End
-; CHECK-NOT: fstp
-; CHECK: ret
define void @test5(double %X) {
- %Y = fadd double %X, 123.0
- call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( double %Y)
- ret void
+; CHECK-LABEL: test5:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fadds LCPI4_0
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: frob
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+ %Y = fadd double %X, 123.0
+ call void asm sideeffect "frob ", "{st(0)},~{st},~{dirflag},~{fpsr},~{flags}"( double %Y)
+ ret void
}
-; CHECK: test6
-define void @test6(double %A, double %B, double %C,
- double %D, double %E) nounwind {
+define void @test6(double %A, double %B, double %C, double %D, double %E) nounwind {
+; CHECK-LABEL: test6:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: foo %st(0) %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: bar %st(1) %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: baz %st(1) %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: baz %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
; Uses the same value twice, should have one fstp after the asm.
-; CHECK: foo
-; CHECK: InlineAsm End
-; CHECK-NEXT: fstp
-; CHECK-NOT: fstp
- tail call void asm sideeffect "foo $0 $1", "f,f,~{dirflag},~{fpsr},~{flags}"( double %A, double %A ) nounwind
+ tail call void asm sideeffect "foo $0 $1", "f,f,~{dirflag},~{fpsr},~{flags}"( double %A, double %A ) nounwind
; Uses two different values, should be in st(0)/st(1) and both be popped.
-; CHECK: bar
-; CHECK: InlineAsm End
-; CHECK-NEXT: fstp
-; CHECK-NEXT: fstp
- tail call void asm sideeffect "bar $0 $1", "f,f,~{dirflag},~{fpsr},~{flags}"( double %B, double %C ) nounwind
-; Uses two different values, one of which isn't killed in this asm, it
-; should not be popped after the asm.
-; CHECK: baz
-; CHECK: InlineAsm End
-; CHECK-NEXT: fstp
-; CHECK-NOT: fstp
- tail call void asm sideeffect "baz $0 $1", "f,f,~{dirflag},~{fpsr},~{flags}"( double %D, double %E ) nounwind
+ tail call void asm sideeffect "bar $0 $1", "f,f,~{dirflag},~{fpsr},~{flags}"( double %B, double %C ) nounwind
+; Uses two different values, one of which isn't killed in this asm, it should not be popped after the asm.
+ tail call void asm sideeffect "baz $0 $1", "f,f,~{dirflag},~{fpsr},~{flags}"( double %D, double %E ) nounwind
; This is the last use of %D, so it should be popped after.
-; CHECK: baz
-; CHECK: InlineAsm End
-; CHECK-NEXT: fstp
-; CHECK-NOT: fstp
-; CHECK: ret
- tail call void asm sideeffect "baz $0", "f,~{dirflag},~{fpsr},~{flags}"( double %D ) nounwind
- ret void
+ tail call void asm sideeffect "baz $0", "f,~{dirflag},~{fpsr},~{flags}"( double %D ) nounwind
+ ret void
}
; PR4185
; Passing a non-killed value to asm in {st}.
; Make sure it is duped before.
; asm kills st(0), so we shouldn't pop anything
-; CHECK: testPR4185
-; CHECK: fld %st(0)
-; CHECK: fistpl
-; CHECK-NOT: fstp
-; CHECK: fistpl
-; CHECK-NOT: fstp
-; CHECK: ret
; A valid alternative would be to remat the constant pool load before each
; inline asm.
define void @testPR4185() {
+; CHECK-LABEL: testPR4185:
+; CHECK: ## %bb.0: ## %return
+; CHECK-NEXT: flds LCPI6_0
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
return:
- call void asm sideeffect "fistpl $0", "{st},~{st}"(double 1.000000e+06)
- call void asm sideeffect "fistpl $0", "{st},~{st}"(double 1.000000e+06)
- ret void
+ call void asm sideeffect "fistpl $0", "{st},~{st}"(double 1.000000e+06)
+ call void asm sideeffect "fistpl $0", "{st},~{st}"(double 1.000000e+06)
+ ret void
}
; Passing a non-killed value through asm in {st}.
; Make sure it is not duped before.
; Second asm kills st(0), so we shouldn't pop anything
-; CHECK: testPR4185b
-; CHECK-NOT: fld %st(0)
-; CHECK: fistl
-; CHECK-NOT: fstp
-; CHECK: fistpl
-; CHECK-NOT: fstp
-; CHECK: ret
-; A valid alternative would be to remat the constant pool load before each
-; inline asm.
+; A valid alternative would be to remat the constant pool load before each inline asm.
define void @testPR4185b() {
+; CHECK-LABEL: testPR4185b:
+; CHECK: ## %bb.0: ## %return
+; CHECK-NEXT: flds LCPI7_0
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
return:
call void asm sideeffect "fistl $0", "{st}"(double 1.000000e+06)
call void asm sideeffect "fistpl $0", "{st},~{st}"(double 1.000000e+06)
@@ -127,57 +152,88 @@ return:
; PR4459
; The return value from ceil must be duped before being consumed by asm.
-; CHECK: testPR4459
-; CHECK: ceil
-; CHECK: fld %st(0)
-; CHECK-NOT: fxch
-; CHECK: fistpl
-; CHECK-NOT: fxch
-; CHECK: fstpt
-; CHECK: test
define void @testPR4459(x86_fp80 %a) {
+; CHECK-LABEL: testPR4459:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: subl $28, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpt (%esp)
+; CHECK-NEXT: calll _ceil
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstpt (%esp)
+; CHECK-NEXT: calll _test3
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: retl
entry:
- %0 = call x86_fp80 @ceil(x86_fp80 %a)
- call void asm sideeffect "fistpl $0", "{st},~{st}"( x86_fp80 %0)
- call void @test3(x86_fp80 %0 )
- ret void
+ %0 = call x86_fp80 @ceil(x86_fp80 %a)
+ call void asm sideeffect "fistpl $0", "{st},~{st}"( x86_fp80 %0)
+ call void @test3(x86_fp80 %0 )
+ ret void
}
declare x86_fp80 @ceil(x86_fp80)
; PR4484
; test1 leaves a value on the stack that is needed after the asm.
-; CHECK: testPR4484
-; CHECK: calll _test1
-; CHECK-NOT: fstp
; Load %a from stack after ceil
-; CHECK: fldt
-; CHECK-NOT: fxch
-; CHECK: fistpl
-; CHECK-NOT: fstp
; Set up call to test.
-; CHECK: fstpt
-; CHECK: test
define void @testPR4484(x86_fp80 %a) {
+; CHECK-LABEL: testPR4484:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: subl $28, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) ## 10-byte Folded Spill
+; CHECK-NEXT: calll _test1
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp) ## 10-byte Folded Reload
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstpt (%esp)
+; CHECK-NEXT: calll _test3
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: retl
entry:
- %0 = call x86_fp80 @test1()
- call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %a)
- call void @test3(x86_fp80 %0)
- ret void
+ %0 = call x86_fp80 @test1()
+ call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %a)
+ call void @test3(x86_fp80 %0)
+ ret void
}
; PR4485
-; CHECK: testPR4485
define void @testPR4485(x86_fp80* %a) {
+; CHECK-LABEL: testPR4485:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: fldt (%eax)
+; CHECK-NEXT: flds LCPI10_0
+; CHECK-NEXT: fmul %st(0), %st(1)
+; CHECK-NEXT: flds LCPI10_1
+; CHECK-NEXT: fmul %st(0), %st(2)
+; CHECK-NEXT: fxch %st(2)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fldt (%eax)
+; CHECK-NEXT: fmulp %st(1)
+; CHECK-NEXT: fmulp %st(1)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistpl %st(0)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
entry:
- %0 = load x86_fp80, x86_fp80* %a, align 16
- %1 = fmul x86_fp80 %0, 0xK4006B400000000000000
- %2 = fmul x86_fp80 %1, 0xK4012F424000000000000
- tail call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %2)
- %3 = load x86_fp80, x86_fp80* %a, align 16
- %4 = fmul x86_fp80 %3, 0xK4006B400000000000000
- %5 = fmul x86_fp80 %4, 0xK4012F424000000000000
- tail call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %5)
- ret void
+ %0 = load x86_fp80, x86_fp80* %a, align 16
+ %1 = fmul x86_fp80 %0, 0xK4006B400000000000000
+ %2 = fmul x86_fp80 %1, 0xK4012F424000000000000
+ tail call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %2)
+ %3 = load x86_fp80, x86_fp80* %a, align 16
+ %4 = fmul x86_fp80 %3, 0xK4006B400000000000000
+ %5 = fmul x86_fp80 %4, 0xK4012F424000000000000
+ tail call void asm sideeffect "fistpl $0", "{st},~{st}"(x86_fp80 %5)
+ ret void
}
; An input argument in a fixed position is implicitly popped by the asm only if
@@ -189,13 +245,17 @@ entry:
; void fist1(long double x, int *p) {
; asm volatile ("fistl %1" : : "t"(x), "m"(*p));
; }
-;
-; CHECK: fist1
-; CHECK: fldt
-; CHECK: fistl (%e
-; CHECK: fstp
-; CHECK: ret
define void @fist1(x86_fp80 %x, i32* %p) nounwind ssp {
+; CHECK-LABEL: fist1:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistl (%eax)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
tail call void asm sideeffect "fistl $1", "{st},*m,~{memory},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, i32* %p) nounwind
ret void
@@ -209,13 +269,16 @@ entry:
; asm ("fistl %1" : "=&t"(y) : "0"(x), "m"(*p) : "memory");
; return y;
; }
-;
-; CHECK: fist2
-; CHECK: fldt
-; CHECK: fistl (%e
-; CHECK-NOT: fstp
-; CHECK: ret
define x86_fp80 @fist2(x86_fp80 %x, i32* %p) nounwind ssp {
+; CHECK-LABEL: fist2:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fistl (%eax)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = tail call x86_fp80 asm "fistl $2", "=&{st},0,*m,~{memory},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, i32* %p) nounwind
ret x86_fp80 %0
@@ -226,14 +289,18 @@ entry:
; void fucomp1(long double x, long double y) {
; asm volatile ("fucomp %1" : : "t"(x), "f"(y) : "st");
; }
-; CHECK: fucomp1
-; CHECK: fldt
-; CHECK: fldt
-; CHECK: fucomp %st
-; CHECK: fstp
-; CHECK-NOT: fstp
-; CHECK: ret
define void @fucomp1(x86_fp80 %x, x86_fp80 %y) nounwind ssp {
+; CHECK-LABEL: fucomp1:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fxch %st(1)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fucomp %st(1)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
tail call void asm sideeffect "fucomp $1", "{st},f,~{st},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind
ret void
@@ -249,26 +316,34 @@ entry:
; asm volatile ("fucompp %1" : : "t"(x), "u"(y) : "st", "st(1)");
; }
;
-; CHECK: fucomp2
-; CHECK: fldt
-; CHECK: fldt
-; CHECK: fucomp %st(1)
-; CHECK: fstp
-; CHECK-NOT: fstp
-; CHECK: ret
-;
-; CHECK: fucomp3
-; CHECK: fldt
-; CHECK: fldt
-; CHECK: fucompp %st(1)
-; CHECK-NOT: fstp
-; CHECK: ret
define void @fucomp2(x86_fp80 %x, x86_fp80 %y) nounwind ssp {
+; CHECK-LABEL: fucomp2:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fxch %st(1)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fucomp %st(1)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
tail call void asm sideeffect "fucomp $1", "{st},{st(1)},~{st},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind
ret void
}
+
define void @fucomp3(x86_fp80 %x, x86_fp80 %y) nounwind ssp {
+; CHECK-LABEL: fucomp3:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fxch %st(1)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fucompp %st(1)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
tail call void asm sideeffect "fucompp $1", "{st},{st(1)},~{st},~{st(1)},~{dirflag},~{fpsr},~{flags}"(x86_fp80 %x, x86_fp80 %y) nounwind
ret void
@@ -276,15 +351,16 @@ entry:
; One input, two outputs, one dead output.
%complex = type { float, float }
-; CHECK: sincos1
-; CHECK: flds
-; CHECK-NOT: fxch
-; CHECK: sincos
-; CHECK-NOT: fstp
-; CHECK: fstp %st(1)
-; CHECK-NOT: fstp
-; CHECK: ret
define float @sincos1(float %x) nounwind ssp {
+; CHECK-LABEL: sincos1:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: sincos
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = tail call %complex asm "sincos", "={st},={st(1)},0,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%asmresult = extractvalue %complex %0, 0
@@ -292,15 +368,16 @@ entry:
}
; Same thing, swapped output operands.
-; CHECK: sincos2
-; CHECK: flds
-; CHECK-NOT: fxch
-; CHECK: sincos
-; CHECK-NOT: fstp
-; CHECK: fstp %st(1)
-; CHECK-NOT: fstp
-; CHECK: ret
define float @sincos2(float %x) nounwind ssp {
+; CHECK-LABEL: sincos2:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: sincos
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = tail call %complex asm "sincos", "={st(1)},={st},1,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%asmresult = extractvalue %complex %0, 1
@@ -308,21 +385,27 @@ entry:
}
; Clobber st(0) after it was live-out/dead from the previous asm.
-; CHECK: sincos3
; Load x, make a copy for the second asm.
-; CHECK: flds
-; CHECK: fld %st(0)
-; CHECK: sincos
; Discard dead result in st(0), bring x to the top.
-; CHECK: fstp %st(0)
-; CHECK: fxch
; x is now in st(0) for the second asm
-; CHECK: sincos
; Discard both results.
-; CHECK: fstp
-; CHECK: fstp
-; CHECK: ret
define float @sincos3(float %x) nounwind ssp {
+; CHECK-LABEL: sincos3:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: sincos
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: fxch %st(1)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: sincos
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(1)
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = tail call %complex asm sideeffect "sincos", "={st(1)},={st},1,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
%1 = tail call %complex asm sideeffect "sincos", "={st(1)},={st},1,~{dirflag},~{fpsr},~{flags}"(float %x) nounwind
@@ -331,11 +414,19 @@ entry:
}
; Pass the same value in two fixed stack slots.
-; CHECK: PR10602
-; CHECK: flds LCPI
-; CHECK: fld %st(0)
-; CHECK: fcomi %st(1), %st(0)
define i32 @PR10602() nounwind ssp {
+; CHECK-LABEL: PR10602:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: flds LCPI19_0
+; CHECK-NEXT: fld %st(0)
+; CHECK-NEXT: fxch %st(1)
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fcomi %st(1), %st(0); pushf; pop %eax
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: fstp %st(0)
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
entry:
%0 = tail call i32 asm "fcomi $2, $1; pushf; pop $0", "=r,{st},{st(1)},~{dirflag},~{fpsr},~{flags}"(double 2.000000e+00, double 2.000000e+00) nounwind
ret i32 %0
@@ -346,17 +437,9 @@ entry:
; inline-asm instruction and the ST register was live across another
; inline-asm instruction.
;
-; INLINEASM <es:frndint> [sideeffect] [attdialect], $0:[regdef], %ST0<imp-def,tied5>, $1:[reguse tiedto:$0], %ST0<tied3>, $2:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
-; INLINEASM <es:fldcw $0> [sideeffect] [mayload] [attdialect], $0:[mem], %EAX<undef>, 1, %noreg, 0, %noreg, $1:[clobber], %EFLAGS<earlyclobber,imp-def,dead>
-; %FP0<def> = COPY %ST0
-
-; CHECK-LABEL: _test_live_st
-; CHECK: ## InlineAsm Start
-; CHECK: frndint
-; CHECK: ## InlineAsm End
-; CHECK: ## InlineAsm Start
-; CHECK: fldcw
-; CHECK: ## InlineAsm End
+; INLINEASM $frndint [sideeffect] [attdialect], $0:[regdef], %st0<imp-def,tied5>, $1:[reguse tiedto:$0], %st0<tied3>, $2:[clobber], early-clobber implicit dead %eflags
+; INLINEASM $fldcw $0 [sideeffect] [mayload] [attdialect], $0:[mem], undef %eax, 1, %noreg, 0, %noreg, $1:[clobber], early-clobber implicit dead %eflags
+; %fp0 = COPY %st0
%struct.fpu_t = type { [8 x x86_fp80], x86_fp80, %struct.anon1, %struct.anon2, i32, i8, [15 x i8] }
%struct.anon1 = type { i32, i32, i32 }
@@ -366,6 +449,35 @@ entry:
; Function Attrs: ssp
define void @test_live_st(i32 %a1) {
+; CHECK-LABEL: test_live_st:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: fldt (%eax)
+; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: jne LBB20_2
+; CHECK-NEXT: ## %bb.1: ## %sw.bb4.i
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: frndint
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: fldcw (%eax)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: LBB20_2: ## %_Z5tointRKe.exit
+; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp)
+; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movw $3199, {{[0-9]+}}(%esp) ## imm = 0xC7F
+; CHECK-NEXT: fldcw {{[0-9]+}}(%esp)
+; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp)
+; CHECK-NEXT: fistpl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fldcw {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT: fildl {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl L_fpu$non_lazy_ptr, %eax
+; CHECK-NEXT: fstpt 128(%eax)
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
entry:
%0 = load x86_fp80, x86_fp80* undef, align 16
%cond = icmp eq i32 %a1, 1
@@ -388,13 +500,14 @@ return:
}
; Check that x87 stackifier is correctly rewriting FP registers to ST registers.
-;
-; CHECK-LABEL: _test_operand_rewrite
-; CHECK: ## InlineAsm Start
-; CHECK: foo %st(0), %st(1)
-; CHECK: ## InlineAsm End
-
define double @test_operand_rewrite() {
+; CHECK-LABEL: test_operand_rewrite:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: ## InlineAsm Start
+; CHECK-NEXT: foo %st(0), %st(1)
+; CHECK-NEXT: ## InlineAsm End
+; CHECK-NEXT: fsubp %st(1)
+; CHECK-NEXT: retl
entry:
%0 = tail call { double, double } asm sideeffect "foo $0, $1", "={st},={st(1)},~{dirflag},~{fpsr},~{flags}"()
%asmresult = extractvalue { double, double } %0, 0
diff --git a/test/CodeGen/X86/inline-asm-modifier-n.ll b/test/CodeGen/X86/inline-asm-modifier-n.ll
index 072c7c419536..7e48dd2dda5c 100644
--- a/test/CodeGen/X86/inline-asm-modifier-n.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-n.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as | grep " 37"
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as | grep " 37"
; rdar://7008959
define void @bork() nounwind {
diff --git a/test/CodeGen/X86/inline-asm-modifier-q.ll b/test/CodeGen/X86/inline-asm-modifier-q.ll
index 8063d48a2ca6..f375c45b175d 100644
--- a/test/CodeGen/X86/inline-asm-modifier-q.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-q.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as | FileCheck %s
; If the target does not have 64-bit integer registers, emit 32-bit register
; names.
diff --git a/test/CodeGen/X86/inline-asm-mrv.ll b/test/CodeGen/X86/inline-asm-mrv.ll
index a96e7b818072..41a97e3dc1da 100644
--- a/test/CodeGen/X86/inline-asm-mrv.ll
+++ b/test/CodeGen/X86/inline-asm-mrv.ll
@@ -1,8 +1,8 @@
; PR2094
-; RUN: llc < %s -march=x86-64 -no-integrated-as | grep movslq
-; RUN: llc < %s -march=x86-64 -no-integrated-as | grep addps
-; RUN: llc < %s -march=x86-64 -no-integrated-as | grep paddd
-; RUN: llc < %s -march=x86-64 -no-integrated-as | not grep movq
+; RUN: llc < %s -no-integrated-as | grep movslq
+; RUN: llc < %s -no-integrated-as | grep addps
+; RUN: llc < %s -no-integrated-as | grep paddd
+; RUN: llc < %s -no-integrated-as | not grep movq
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/inline-asm-q-regs.ll b/test/CodeGen/X86/inline-asm-q-regs.ll
index 53a56aee2cb3..dd67bd645ee9 100644
--- a/test/CodeGen/X86/inline-asm-q-regs.ll
+++ b/test/CodeGen/X86/inline-asm-q-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx -no-integrated-as
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx -no-integrated-as
; rdar://7066579
%0 = type { i64, i64, i64, i64, i64 } ; type %0
diff --git a/test/CodeGen/X86/inline-asm-stack-realign.ll b/test/CodeGen/X86/inline-asm-stack-realign.ll
index cfbe260a33a0..14ee97022431 100644
--- a/test/CodeGen/X86/inline-asm-stack-realign.ll
+++ b/test/CodeGen/X86/inline-asm-stack-realign.ll
@@ -1,6 +1,6 @@
; RUN: not llc -mtriple=i686-pc-win32 < %s 2>&1 | FileCheck %s
-; FIXME: This is miscompiled due to our unconditional use of ESI as the base
+; FIXME: This is miscompiled due to our unconditional use of esi as the base
; pointer.
; XFAIL: *
diff --git a/test/CodeGen/X86/inline-asm-stack-realign3.ll b/test/CodeGen/X86/inline-asm-stack-realign3.ll
index be0c6f51112d..29034a63ed9b 100644
--- a/test/CodeGen/X86/inline-asm-stack-realign3.ll
+++ b/test/CodeGen/X86/inline-asm-stack-realign3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -no-integrated-as < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -no-integrated-as < %s | FileCheck %s
declare void @bar(i32* %junk)
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index db63a8048836..7363e613a56e 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -14,7 +14,7 @@ entry:
; CHECK-DAG: movl 4(%esp), %eax
; CHECK: ## InlineAsm Start
; CHECK: ## InlineAsm End
-; Everything is set up in EAX:EDX, return immediately.
+; Everything is set up in eax:edx, return immediately.
; CHECK-NEXT: retl
; The tied operands are not necessarily in the same order as the defs.
diff --git a/test/CodeGen/X86/inline-asm-x-scalar.ll b/test/CodeGen/X86/inline-asm-x-scalar.ll
index 64a7fe826472..a33734af93f9 100644
--- a/test/CodeGen/X86/inline-asm-x-scalar.ll
+++ b/test/CodeGen/X86/inline-asm-x-scalar.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -no-integrated-as
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah -no-integrated-as
define void @test1() {
tail call void asm sideeffect "ucomiss $0", "x"( float 0x41E0000000000000)
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index 5ec4f469df89..e4442388b082 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -mtriple=i686-- -no-integrated-as
define i32 @test1() nounwind {
; Dest is AX, dest type = i32.
diff --git a/test/CodeGen/X86/inline-sse.ll b/test/CodeGen/X86/inline-sse.ll
index 08819b858293..ba6d4e9015f2 100644
--- a/test/CodeGen/X86/inline-sse.ll
+++ b/test/CodeGen/X86/inline-sse.ll
@@ -7,7 +7,7 @@
define void @nop() nounwind {
; X32-LABEL: nop:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -20,7 +20,7 @@ define void @nop() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: nop:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: #APP
; X64-NEXT: #NO_APP
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/inlineasm-sched-bug.ll b/test/CodeGen/X86/inlineasm-sched-bug.ll
index 08de0c02d293..3eb5fb0eb466 100644
--- a/test/CodeGen/X86/inlineasm-sched-bug.ll
+++ b/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -1,5 +1,5 @@
; PR13504
-; RUN: llc -march=x86 -mcpu=atom <%s | FileCheck %s
+; RUN: llc -mtriple=i686-- -mcpu=atom <%s | FileCheck %s
; CHECK: bsfl
; CHECK-NOT: movl
diff --git a/test/CodeGen/X86/ins_split_regalloc.ll b/test/CodeGen/X86/ins_split_regalloc.ll
index f04d088ce687..99398b0ccd18 100644
--- a/test/CodeGen/X86/ins_split_regalloc.ll
+++ b/test/CodeGen/X86/ins_split_regalloc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O1 -regalloc=greedy -mtriple=x86_64-apple-macosx -march x86-64 < %s -o - | FileCheck %s
+; RUN: llc -O1 -regalloc=greedy -mtriple=x86_64-apple-macosx < %s -o - | FileCheck %s
; Check that last chance split (RAGreedy::tryInstructonSplit) just split
; when this is beneficial, otherwise we end up with uncoalesced copies.
; <rdar://problem/15570057>
diff --git a/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/test/CodeGen/X86/ins_subreg_coalesce-1.ll
index 4a5d8dfaf688..19ca7cb17a4c 100644
--- a/test/CodeGen/X86/ins_subreg_coalesce-1.ll
+++ b/test/CodeGen/X86/ins_subreg_coalesce-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=-bmi | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=-bmi | FileCheck %s
define fastcc i32 @t() nounwind {
entry:
diff --git a/test/CodeGen/X86/ins_subreg_coalesce-2.ll b/test/CodeGen/X86/ins_subreg_coalesce-2.ll
index f2c9cc72719c..ff8190c45c0a 100644
--- a/test/CodeGen/X86/ins_subreg_coalesce-2.ll
+++ b/test/CodeGen/X86/ins_subreg_coalesce-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | not grep movw
+; RUN: llc < %s -mtriple=x86_64-- | not grep movw
define i16 @test5(i16 %f12) nounwind {
%f11 = shl i16 %f12, 2 ; <i16> [#uses=1]
diff --git a/test/CodeGen/X86/ins_subreg_coalesce-3.ll b/test/CodeGen/X86/ins_subreg_coalesce-3.ll
index 71890bc23b61..31f554d4afd6 100644
--- a/test/CodeGen/X86/ins_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/ins_subreg_coalesce-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mov | count 3
+; RUN: llc < %s -mtriple=x86_64-- | grep mov | count 3
%struct.COMPOSITE = type { i8, i16, i16 }
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
diff --git a/test/CodeGen/X86/insert-into-constant-vector.ll b/test/CodeGen/X86/insert-into-constant-vector.ll
new file mode 100644
index 000000000000..03ce34dace70
--- /dev/null
+++ b/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -0,0 +1,465 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32SSE --check-prefix=X32SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64SSE --check-prefix=X64SSE4
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32AVX --check-prefix=X32AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64AVX --check-prefix=X64AVX512F
+
+define <16 x i8> @elt0_v16i8(i8 %x) {
+; X32SSE2-LABEL: elt0_v16i8:
+; X32SSE2: # %bb.0:
+; X32SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X32SSE2-NEXT: andnps %xmm1, %xmm0
+; X32SSE2-NEXT: orps {{\.LCPI.*}}, %xmm0
+; X32SSE2-NEXT: retl
+;
+; X64SSE2-LABEL: elt0_v16i8:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movd %edi, %xmm1
+; X64SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64SSE2-NEXT: pandn %xmm1, %xmm0
+; X64SSE2-NEXT: por {{.*}}(%rip), %xmm0
+; X64SSE2-NEXT: retq
+;
+; X32SSE4-LABEL: elt0_v16i8:
+; X32SSE4: # %bb.0:
+; X32SSE4-NEXT: movdqa {{.*#+}} xmm0 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+; X32SSE4-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32SSE4-NEXT: retl
+;
+; X64SSE4-LABEL: elt0_v16i8:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movdqa {{.*#+}} xmm0 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+; X64SSE4-NEXT: pinsrb $0, %edi, %xmm0
+; X64SSE4-NEXT: retq
+;
+; X32AVX-LABEL: elt0_v16i8:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+; X32AVX-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt0_v16i8:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>
+; X64AVX-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0
+; X64AVX-NEXT: retq
+ %ins = insertelement <16 x i8> <i8 42, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, i8 %x, i32 0
+ ret <16 x i8> %ins
+}
+
+define <8 x i16> @elt5_v8i16(i16 %x) {
+; X32SSE-LABEL: elt5_v8i16:
+; X32SSE: # %bb.0:
+; X32SSE-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7>
+; X32SSE-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0
+; X32SSE-NEXT: retl
+;
+; X64SSE-LABEL: elt5_v8i16:
+; X64SSE: # %bb.0:
+; X64SSE-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7>
+; X64SSE-NEXT: pinsrw $5, %edi, %xmm0
+; X64SSE-NEXT: retq
+;
+; X32AVX-LABEL: elt5_v8i16:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7>
+; X32AVX-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt5_v8i16:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7>
+; X64AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
+; X64AVX-NEXT: retq
+ %ins = insertelement <8 x i16> <i16 42, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, i16 %x, i32 5
+ ret <8 x i16> %ins
+}
+
+define <4 x i32> @elt3_v4i32(i32 %x) {
+; X32SSE2-LABEL: elt3_v4i32:
+; X32SSE2: # %bb.0:
+; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u>
+; X32SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X32SSE2-NEXT: retl
+;
+; X64SSE2-LABEL: elt3_v4i32:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movd %edi, %xmm1
+; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u>
+; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; X64SSE2-NEXT: retq
+;
+; X32SSE4-LABEL: elt3_v4i32:
+; X32SSE4: # %bb.0:
+; X32SSE4-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,u>
+; X32SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
+; X32SSE4-NEXT: retl
+;
+; X64SSE4-LABEL: elt3_v4i32:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,u>
+; X64SSE4-NEXT: pinsrd $3, %edi, %xmm0
+; X64SSE4-NEXT: retq
+;
+; X32AVX-LABEL: elt3_v4i32:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,u>
+; X32AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt3_v4i32:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,u>
+; X64AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
+; X64AVX-NEXT: retq
+ %ins = insertelement <4 x i32> <i32 42, i32 1, i32 2, i32 3>, i32 %x, i32 3
+ ret <4 x i32> %ins
+}
+
+define <2 x i64> @elt0_v2i64(i64 %x) {
+; X32SSE-LABEL: elt0_v2i64:
+; X32SSE: # %bb.0:
+; X32SSE-NEXT: movl $1, %eax
+; X32SSE-NEXT: movd %eax, %xmm1
+; X32SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32SSE-NEXT: retl
+;
+; X64SSE2-LABEL: elt0_v2i64:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movq %rdi, %xmm1
+; X64SSE2-NEXT: movapd {{.*#+}} xmm0 = <u,1>
+; X64SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X64SSE2-NEXT: retq
+;
+; X64SSE4-LABEL: elt0_v2i64:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movdqa {{.*#+}} xmm0 = <u,1>
+; X64SSE4-NEXT: pinsrq $0, %rdi, %xmm0
+; X64SSE4-NEXT: retq
+;
+; X32AVX-LABEL: elt0_v2i64:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: movl $1, %eax
+; X32AVX-NEXT: vmovd %eax, %xmm0
+; X32AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt0_v2i64:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,1>
+; X64AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; X64AVX-NEXT: retq
+ %ins = insertelement <2 x i64> <i64 42, i64 1>, i64 %x, i32 0
+ ret <2 x i64> %ins
+}
+
+define <4 x float> @elt1_v4f32(float %x) {
+; X32SSE2-LABEL: elt1_v4f32:
+; X32SSE2: # %bb.0:
+; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; X32SSE2-NEXT: retl
+;
+; X64SSE2-LABEL: elt1_v4f32:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; X64SSE2-NEXT: retq
+;
+; X32SSE4-LABEL: elt1_v4f32:
+; X32SSE4: # %bb.0:
+; X32SSE4-NEXT: movaps {{.*#+}} xmm0 = <42,u,2,3>
+; X32SSE4-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X32SSE4-NEXT: retl
+;
+; X64SSE4-LABEL: elt1_v4f32:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64SSE4-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3]
+; X64SSE4-NEXT: movaps %xmm1, %xmm0
+; X64SSE4-NEXT: retq
+;
+; X32AVX-LABEL: elt1_v4f32:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovaps {{.*#+}} xmm0 = <42,u,2,3>
+; X32AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt1_v4f32:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovaps {{.*#+}} xmm1 = <42,u,2,3>
+; X64AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; X64AVX-NEXT: retq
+ %ins = insertelement <4 x float> <float 42.0, float 1.0, float 2.0, float 3.0>, float %x, i32 1
+ ret <4 x float> %ins
+}
+
+define <2 x double> @elt1_v2f64(double %x) {
+; X32SSE-LABEL: elt1_v2f64:
+; X32SSE: # %bb.0:
+; X32SSE-NEXT: movapd {{.*#+}} xmm0 = <42,u>
+; X32SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32SSE-NEXT: retl
+;
+; X64SSE-LABEL: elt1_v2f64:
+; X64SSE: # %bb.0:
+; X64SSE-NEXT: movaps {{.*#+}} xmm1 = <42,u>
+; X64SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64SSE-NEXT: movaps %xmm1, %xmm0
+; X64SSE-NEXT: retq
+;
+; X32AVX-LABEL: elt1_v2f64:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovapd {{.*#+}} xmm0 = <42,u>
+; X32AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt1_v2f64:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovaps {{.*#+}} xmm1 = <42,u>
+; X64AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64AVX-NEXT: retq
+ %ins = insertelement <2 x double> <double 42.0, double 1.0>, double %x, i32 1
+ ret <2 x double> %ins
+}
+
+define <8 x i32> @elt7_v8i32(i32 %x) {
+; X32SSE2-LABEL: elt7_v8i32:
+; X32SSE2: # %bb.0:
+; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u>
+; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X32SSE2-NEXT: retl
+;
+; X64SSE2-LABEL: elt7_v8i32:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movd %edi, %xmm0
+; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u>
+; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X64SSE2-NEXT: retq
+;
+; X32SSE4-LABEL: elt7_v8i32:
+; X32SSE4: # %bb.0:
+; X32SSE4-NEXT: movdqa {{.*#+}} xmm1 = <4,5,6,u>
+; X32SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm1
+; X32SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X32SSE4-NEXT: retl
+;
+; X64SSE4-LABEL: elt7_v8i32:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movdqa {{.*#+}} xmm1 = <4,5,6,u>
+; X64SSE4-NEXT: pinsrd $3, %edi, %xmm1
+; X64SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3]
+; X64SSE4-NEXT: retq
+;
+; X32AVX-LABEL: elt7_v8i32:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
+; X32AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32AVX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt7_v8i32:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovdqa {{.*#+}} ymm0 = <42,1,2,3,4,5,6,u>
+; X64AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm1
+; X64AVX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64AVX-NEXT: retq
+ %ins = insertelement <8 x i32> <i32 42, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, i32 %x, i32 7
+ ret <8 x i32> %ins
+}
+
+define <8 x float> @elt6_v8f32(float %x) {
+; X32SSE2-LABEL: elt6_v8f32:
+; X32SSE2: # %bb.0:
+; X32SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X32SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; X32SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; X32SSE2-NEXT: movaps {{.*#+}} xmm0 = [4.200000e+01,1.000000e+00,2.000000e+00,3.000000e+00]
+; X32SSE2-NEXT: retl
+;
+; X64SSE2-LABEL: elt6_v8f32:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X64SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; X64SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = [4.200000e+01,1.000000e+00,2.000000e+00,3.000000e+00]
+; X64SSE2-NEXT: retq
+;
+; X32SSE4-LABEL: elt6_v8f32:
+; X32SSE4: # %bb.0:
+; X32SSE4-NEXT: movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X32SSE4-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; X32SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.200000e+01,1.000000e+00,2.000000e+00,3.000000e+00]
+; X32SSE4-NEXT: retl
+;
+; X64SSE4-LABEL: elt6_v8f32:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movaps {{.*#+}} xmm1 = <4,5,u,7>
+; X64SSE4-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3]
+; X64SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.200000e+01,1.000000e+00,2.000000e+00,3.000000e+00]
+; X64SSE4-NEXT: retq
+;
+; X32AVX-LABEL: elt6_v8f32:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vmovaps {{.*#+}} ymm0 = <42,1,2,3,4,5,u,7>
+; X32AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; X32AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32AVX-NEXT: retl
+;
+; X64AVX-LABEL: elt6_v8f32:
+; X64AVX: # %bb.0:
+; X64AVX-NEXT: vmovaps {{.*#+}} ymm1 = <42,1,2,3,4,5,u,7>
+; X64AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
+; X64AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64AVX-NEXT: retq
+ %ins = insertelement <8 x float> <float 42.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, float %x, i32 6
+ ret <8 x float> %ins
+}
+
+define <8 x i64> @elt5_v8i64(i64 %x) {
+; X32SSE-LABEL: elt5_v8i64:
+; X32SSE: # %bb.0:
+; X32SSE-NEXT: movl $4, %eax
+; X32SSE-NEXT: movd %eax, %xmm2
+; X32SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X32SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X32SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0]
+; X32SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0]
+; X32SSE-NEXT: movaps {{.*#+}} xmm3 = [6,0,7,0]
+; X32SSE-NEXT: retl
+;
+; X64SSE2-LABEL: elt5_v8i64:
+; X64SSE2: # %bb.0:
+; X64SSE2-NEXT: movq %rdi, %xmm0
+; X64SSE2-NEXT: movdqa {{.*#+}} xmm2 = <4,u>
+; X64SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; X64SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1]
+; X64SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,3]
+; X64SSE2-NEXT: movaps {{.*#+}} xmm3 = [6,7]
+; X64SSE2-NEXT: retq
+;
+; X64SSE4-LABEL: elt5_v8i64:
+; X64SSE4: # %bb.0:
+; X64SSE4-NEXT: movdqa {{.*#+}} xmm2 = <4,u>
+; X64SSE4-NEXT: pinsrq $1, %rdi, %xmm2
+; X64SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1]
+; X64SSE4-NEXT: movaps {{.*#+}} xmm1 = [2,3]
+; X64SSE4-NEXT: movaps {{.*#+}} xmm3 = [6,7]
+; X64SSE4-NEXT: retq
+;
+; X32AVX2-LABEL: elt5_v8i64:
+; X32AVX2: # %bb.0:
+; X32AVX2-NEXT: movl $4, %eax
+; X32AVX2-NEXT: vmovd %eax, %xmm0
+; X32AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32AVX2-NEXT: vinserti128 $1, {{\.LCPI.*}}, %ymm0, %ymm1
+; X32AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
+; X32AVX2-NEXT: retl
+;
+; X64AVX2-LABEL: elt5_v8i64:
+; X64AVX2: # %bb.0:
+; X64AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <4,u,6,7>
+; X64AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
+; X64AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; X64AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3]
+; X64AVX2-NEXT: retq
+;
+; X32AVX512F-LABEL: elt5_v8i64:
+; X32AVX512F: # %bb.0:
+; X32AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
+; X32AVX512F-NEXT: movl $4, %eax
+; X32AVX512F-NEXT: vmovd %eax, %xmm1
+; X32AVX512F-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; X32AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X32AVX512F-NEXT: vinserti128 $1, {{\.LCPI.*}}, %ymm1, %ymm1
+; X32AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32AVX512F-NEXT: retl
+;
+; X64AVX512F-LABEL: elt5_v8i64:
+; X64AVX512F: # %bb.0:
+; X64AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <42,1,2,3,4,u,6,7>
+; X64AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; X64AVX512F-NEXT: vpinsrq $1, %rdi, %xmm1, %xmm1
+; X64AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; X64AVX512F-NEXT: retq
+ %ins = insertelement <8 x i64> <i64 42, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, i64 %x, i32 5
+ ret <8 x i64> %ins
+}
+
+define <8 x double> @elt1_v8f64(double %x) {
+; X32SSE-LABEL: elt1_v8f64:
+; X32SSE: # %bb.0:
+; X32SSE-NEXT: movapd {{.*#+}} xmm0 = <42,u>
+; X32SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; X32SSE-NEXT: movaps {{.*#+}} xmm1 = [2.000000e+00,3.000000e+00]
+; X32SSE-NEXT: movaps {{.*#+}} xmm2 = [4.000000e+00,5.000000e+00]
+; X32SSE-NEXT: movaps {{.*#+}} xmm3 = [6.000000e+00,7.000000e+00]
+; X32SSE-NEXT: retl
+;
+; X64SSE-LABEL: elt1_v8f64:
+; X64SSE: # %bb.0:
+; X64SSE-NEXT: movaps {{.*#+}} xmm4 = <42,u>
+; X64SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; X64SSE-NEXT: movaps {{.*#+}} xmm1 = [2.000000e+00,3.000000e+00]
+; X64SSE-NEXT: movaps {{.*#+}} xmm2 = [4.000000e+00,5.000000e+00]
+; X64SSE-NEXT: movaps {{.*#+}} xmm3 = [6.000000e+00,7.000000e+00]
+; X64SSE-NEXT: movaps %xmm4, %xmm0
+; X64SSE-NEXT: retq
+;
+; X32AVX2-LABEL: elt1_v8f64:
+; X32AVX2: # %bb.0:
+; X32AVX2-NEXT: vmovapd {{.*#+}} ymm0 = <42,u,2,3>
+; X32AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
+; X32AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; X32AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; X32AVX2-NEXT: retl
+;
+; X64AVX2-LABEL: elt1_v8f64:
+; X64AVX2: # %bb.0:
+; X64AVX2-NEXT: vmovapd {{.*#+}} ymm1 = <42,u,2,3>
+; X64AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; X64AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
+; X64AVX2-NEXT: retq
+;
+; X32AVX512F-LABEL: elt1_v8f64:
+; X32AVX512F: # %bb.0:
+; X32AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = <42,u,2,3,4,5,6,7>
+; X32AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm0[0],mem[0]
+; X32AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; X32AVX512F-NEXT: retl
+;
+; X64AVX512F-LABEL: elt1_v8f64:
+; X64AVX512F: # %bb.0:
+; X64AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <42,u,2,3,4,5,6,7>
+; X64AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0
+; X64AVX512F-NEXT: retq
+ %ins = insertelement <8 x double> <double 42.0, double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, double %x, i32 1
+ ret <8 x double> %ins
+}
+
diff --git a/test/CodeGen/X86/insert-positions.ll b/test/CodeGen/X86/insert-positions.ll
index aa68579d22e0..e36d1646abbd 100644
--- a/test/CodeGen/X86/insert-positions.ll
+++ b/test/CodeGen/X86/insert-positions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 >/dev/null
+; RUN: llc < %s -mtriple=x86_64-- >/dev/null
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/insertelement-copytoregs.ll b/test/CodeGen/X86/insertelement-copytoregs.ll
index 88ff4dafad7d..83f0bd2bac6d 100644
--- a/test/CodeGen/X86/insertelement-copytoregs.ll
+++ b/test/CodeGen/X86/insertelement-copytoregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK-NOT: IMPLICIT_DEF
define void @foo(<2 x float>* %p) {
diff --git a/test/CodeGen/X86/insertelement-duplicates.ll b/test/CodeGen/X86/insertelement-duplicates.ll
index b07343362144..2f32c5a2e6b0 100644
--- a/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/test/CodeGen/X86/insertelement-duplicates.ll
@@ -6,7 +6,7 @@
define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %dest) nounwind noinline {
; SSE-32-LABEL: PR15298:
-; SSE-32: # BB#0: # %L.entry
+; SSE-32: # %bb.0: # %L.entry
; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SSE-32-NEXT: movaps 304(%ecx), %xmm0
@@ -18,7 +18,7 @@ define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %des
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: PR15298:
-; SSE-64: # BB#0: # %L.entry
+; SSE-64: # %bb.0: # %L.entry
; SSE-64-NEXT: movaps 304(%rdi), %xmm0
; SSE-64-NEXT: xorps %xmm1, %xmm1
; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
@@ -28,20 +28,20 @@ define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %des
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: PR15298:
-; AVX-32: # BB#0: # %L.entry
+; AVX-32: # %bb.0: # %L.entry
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0
-; AVX-32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
; AVX-32-NEXT: vmovups %ymm0, 608(%eax)
; AVX-32-NEXT: vzeroupper
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: PR15298:
-; AVX-64: # BB#0: # %L.entry
+; AVX-64: # %bb.0: # %L.entry
; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0
-; AVX-64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
; AVX-64-NEXT: vmovups %ymm0, 608(%rsi)
; AVX-64-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/insertelement-legalize.ll b/test/CodeGen/X86/insertelement-legalize.ll
index 3805cbbaaaf8..8adc3f7e2d66 100644
--- a/test/CodeGen/X86/insertelement-legalize.ll
+++ b/test/CodeGen/X86/insertelement-legalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; Test to check that we properly legalize an insert vector element
define void @test(<2 x i64> %val, <2 x i64>* %dst, i64 %x) nounwind {
diff --git a/test/CodeGen/X86/insertelement-ones.ll b/test/CodeGen/X86/insertelement-ones.ll
new file mode 100644
index 000000000000..ceb3217b7cfc
--- /dev/null
+++ b/test/CodeGen/X86/insertelement-ones.ll
@@ -0,0 +1,504 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
+
+define <2 x i64> @insert_v2i64_x1(<2 x i64> %a) {
+; SSE2-LABEL: insert_v2i64_x1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v2i64_x1:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v2i64_x1:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v2i64_x1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v2i64_x1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v2i64_x1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: insert_v2i64_x1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-NEXT: retq
+ %1 = insertelement <2 x i64> %a, i64 -1, i32 0
+ ret <2 x i64> %1
+}
+
+define <4 x i64> @insert_v4i64_01x3(<4 x i64> %a) {
+; SSE2-LABEL: insert_v4i64_01x3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4i64_01x3:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4i64_01x3:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4i64_01x3:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v4i64_01x3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v4i64_01x3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: insert_v4i64_01x3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX512-NEXT: retq
+ %1 = insertelement <4 x i64> %a, i64 -1, i32 2
+ ret <4 x i64> %1
+}
+
+define <4 x i32> @insert_v4i32_01x3(<4 x i32> %a) {
+; SSE2-LABEL: insert_v4i32_01x3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movl $-1, %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v4i32_01x3:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movl $-1, %eax
+; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v4i32_01x3:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movl $-1, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v4i32_01x3:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v4i32_01x3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v4i32_01x3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: insert_v4i32_01x3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX512-NEXT: retq
+ %1 = insertelement <4 x i32> %a, i32 -1, i32 2
+ ret <4 x i32> %1
+}
+
+define <8 x i32> @insert_v8i32_x12345x7(<8 x i32> %a) {
+; SSE2-LABEL: insert_v8i32_x12345x7:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE2-NEXT: movl $-1, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8i32_x12345x7:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSE3-NEXT: movl $-1, %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8i32_x12345x7:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; SSSE3-NEXT: movl $-1, %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8i32_x12345x7:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v8i32_x12345x7:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v8i32_x12345x7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: insert_v8i32_x12345x7:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
+; AVX512-NEXT: retq
+ %1 = insertelement <8 x i32> %a, i32 -1, i32 0
+ %2 = insertelement <8 x i32> %1, i32 -1, i32 6
+ ret <8 x i32> %2
+}
+
+define <8 x i16> @insert_v8i16_x12345x7(<8 x i16> %a) {
+; SSE2-LABEL: insert_v8i16_x12345x7:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT: pinsrw $0, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v8i16_x12345x7:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF
+; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v8i16_x12345x7:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF
+; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v8i16_x12345x7:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v8i16_x12345x7:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX-NEXT: retq
+ %1 = insertelement <8 x i16> %a, i16 -1, i32 0
+ %2 = insertelement <8 x i16> %1, i16 -1, i32 6
+ ret <8 x i16> %2
+}
+
+define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
+; SSE2-LABEL: insert_v16i16_x12345x789ABCDEx:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT: pinsrw $0, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: pinsrw $7, %eax, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v16i16_x12345x789ABCDEx:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF
+; SSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSE3-NEXT: pinsrw $7, %eax, %xmm1
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v16i16_x12345x789ABCDEx:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF
+; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT: pinsrw $7, %eax, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v16i16_x12345x789ABCDEx:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v16i16_x12345x789ABCDEx:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: insert_v16i16_x12345x789ABCDEx:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: insert_v16i16_x12345x789ABCDEx:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: movw $1, %ax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: movw $64, %ax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: retq
+ %1 = insertelement <16 x i16> %a, i16 -1, i32 0
+ %2 = insertelement <16 x i16> %1, i16 -1, i32 6
+ %3 = insertelement <16 x i16> %2, i16 -1, i32 15
+ ret <16 x i16> %3
+}
+
+define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) {
+; SSE2-LABEL: insert_v16i8_x123456789ABCDEx:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: movl $255, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v16i8_x123456789ABCDEx:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT: pand %xmm1, %xmm0
+; SSE3-NEXT: movl $255, %eax
+; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: pandn %xmm2, %xmm1
+; SSE3-NEXT: por %xmm1, %xmm0
+; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: movl $255, %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm2, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero
+; SSSE3-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
+; SSSE3-NEXT: por %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v16i8_x123456789ABCDEx:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movl $255, %eax
+; SSE41-NEXT: pinsrb $0, %eax, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: insert_v16i8_x123456789ABCDEx:
+; AVX: # %bb.0:
+; AVX-NEXT: movl $255, %eax
+; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
+; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = insertelement <16 x i8> %a, i8 -1, i32 0
+ %2 = insertelement <16 x i8> %1, i8 -1, i32 15
+ ret <16 x i8> %2
+}
+
+define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
+; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movl $255, %eax
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: pandn %xmm3, %xmm2
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm5, %xmm1
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: por %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: movl $255, %eax
+; SSE3-NEXT: movd %eax, %xmm3
+; SSE3-NEXT: pandn %xmm3, %xmm2
+; SSE3-NEXT: por %xmm2, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; SSE3-NEXT: pand %xmm2, %xmm0
+; SSE3-NEXT: movdqa %xmm3, %xmm4
+; SSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSE3-NEXT: por %xmm4, %xmm0
+; SSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; SSE3-NEXT: pand %xmm5, %xmm1
+; SSE3-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; SSE3-NEXT: pandn %xmm3, %xmm5
+; SSE3-NEXT: por %xmm5, %xmm1
+; SSE3-NEXT: pand %xmm2, %xmm1
+; SSE3-NEXT: por %xmm4, %xmm1
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSSE3-NEXT: movl $255, %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm3
+; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; SSSE3-NEXT: pshufb %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm4
+; SSSE3-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0]
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],zero,xmm1[15]
+; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0],zero
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm3, %xmm1
+; SSSE3-NEXT: por %xmm4, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movl $255, %eax
+; SSE41-NEXT: pinsrb $0, %eax, %xmm0
+; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: pinsrb $14, %eax, %xmm1
+; SSE41-NEXT: pinsrb $15, %eax, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movl $255, %eax
+; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl $255, %eax
+; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl $255, %eax
+; AVX512-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
+ %1 = insertelement <32 x i8> %a, i8 -1, i32 0
+ %2 = insertelement <32 x i8> %1, i8 -1, i32 15
+ %3 = insertelement <32 x i8> %2, i8 -1, i32 30
+ %4 = insertelement <32 x i8> %3, i8 -1, i32 31
+ ret <32 x i8> %4
+}
diff --git a/test/CodeGen/X86/insertelement-shuffle.ll b/test/CodeGen/X86/insertelement-shuffle.ll
new file mode 100644
index 000000000000..705ceba94871
--- /dev/null
+++ b/test/CodeGen/X86/insertelement-shuffle.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X32_AVX256
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64_AVX256
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=X32_AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=X64_AVX512
+
+define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounwind {
+; X32_AVX256-LABEL: insert_subvector_256:
+; X32_AVX256: # %bb.0:
+; X32_AVX256-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32_AVX256-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; X32_AVX256-NEXT: retl
+;
+; X64_AVX256-LABEL: insert_subvector_256:
+; X64_AVX256: # %bb.0:
+; X64_AVX256-NEXT: vmovd %edi, %xmm1
+; X64_AVX256-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
+; X64_AVX256-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX256-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; X64_AVX256-NEXT: retq
+;
+; X32_AVX512-LABEL: insert_subvector_256:
+; X32_AVX512: # %bb.0:
+; X32_AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32_AVX512-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
+; X32_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X32_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; X32_AVX512-NEXT: retl
+;
+; X64_AVX512-LABEL: insert_subvector_256:
+; X64_AVX512: # %bb.0:
+; X64_AVX512-NEXT: vmovd %edi, %xmm1
+; X64_AVX512-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
+; X64_AVX512-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64_AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; X64_AVX512-NEXT: retq
+ %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
+ %ins2 = insertelement <2 x i16> %ins1, i16 %x1, i32 1
+ %bc = bitcast <2 x i16> %ins2 to float
+ %ins3 = insertelement <8 x float> %v, float %bc, i32 1
+ ret <8 x float> %ins3
+}
+
+define <8 x i64> @insert_subvector_512(i32 %x0, i32 %x1, <8 x i64> %v) nounwind {
+; X32_AVX256-LABEL: insert_subvector_512:
+; X32_AVX256: # %bb.0:
+; X32_AVX256-NEXT: pushl %ebp
+; X32_AVX256-NEXT: movl %esp, %ebp
+; X32_AVX256-NEXT: andl $-8, %esp
+; X32_AVX256-NEXT: subl $8, %esp
+; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X32_AVX256-NEXT: vmovlps %xmm2, (%esp)
+; X32_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2
+; X32_AVX256-NEXT: vpinsrd $0, (%esp), %xmm2, %xmm2
+; X32_AVX256-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; X32_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X32_AVX256-NEXT: movl %ebp, %esp
+; X32_AVX256-NEXT: popl %ebp
+; X32_AVX256-NEXT: retl
+;
+; X64_AVX256-LABEL: insert_subvector_512:
+; X64_AVX256: # %bb.0:
+; X64_AVX256-NEXT: vmovd %edi, %xmm2
+; X64_AVX256-NEXT: vpinsrd $1, %esi, %xmm2, %xmm2
+; X64_AVX256-NEXT: vmovq %xmm2, %rax
+; X64_AVX256-NEXT: vextracti128 $1, %ymm0, %xmm2
+; X64_AVX256-NEXT: vpinsrq $0, %rax, %xmm2, %xmm2
+; X64_AVX256-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; X64_AVX256-NEXT: retq
+;
+; X32_AVX512-LABEL: insert_subvector_512:
+; X32_AVX512: # %bb.0:
+; X32_AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; X32_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,0,8,0,3,0,4,0,5,0,6,0,7,0]
+; X32_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; X32_AVX512-NEXT: retl
+;
+; X64_AVX512-LABEL: insert_subvector_512:
+; X64_AVX512: # %bb.0:
+; X64_AVX512-NEXT: vmovd %edi, %xmm1
+; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1
+; X64_AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,8,3,4,5,6,7]
+; X64_AVX512-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
+; X64_AVX512-NEXT: retq
+ %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
+ %ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
+ %bc = bitcast <2 x i32> %ins2 to i64
+ %ins3 = insertelement <8 x i64> %v, i64 %bc, i32 2
+ ret <8 x i64> %ins3
+}
+
+; PR34716 - https://bugs.llvm.org/show_bug.cgi?id=34716
+; Special case: if we're inserting into an undef vector, we can optimize more.
+
+define <8 x i64> @insert_subvector_into_undef(i32 %x0, i32 %x1) nounwind {
+; X32_AVX256-LABEL: insert_subvector_into_undef:
+; X32_AVX256: # %bb.0:
+; X32_AVX256-NEXT: pushl %ebp
+; X32_AVX256-NEXT: movl %esp, %ebp
+; X32_AVX256-NEXT: andl $-8, %esp
+; X32_AVX256-NEXT: subl $8, %esp
+; X32_AVX256-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32_AVX256-NEXT: vmovlps %xmm0, (%esp)
+; X32_AVX256-NEXT: movl (%esp), %eax
+; X32_AVX256-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32_AVX256-NEXT: vmovd %eax, %xmm0
+; X32_AVX256-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32_AVX256-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
+; X32_AVX256-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32_AVX256-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32_AVX256-NEXT: vmovdqa %ymm0, %ymm1
+; X32_AVX256-NEXT: movl %ebp, %esp
+; X32_AVX256-NEXT: popl %ebp
+; X32_AVX256-NEXT: retl
+;
+; X64_AVX256-LABEL: insert_subvector_into_undef:
+; X64_AVX256: # %bb.0:
+; X64_AVX256-NEXT: vmovd %edi, %xmm0
+; X64_AVX256-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; X64_AVX256-NEXT: vpbroadcastq %xmm0, %ymm0
+; X64_AVX256-NEXT: vmovdqa %ymm0, %ymm1
+; X64_AVX256-NEXT: retq
+;
+; X32_AVX512-LABEL: insert_subvector_into_undef:
+; X32_AVX512: # %bb.0:
+; X32_AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32_AVX512-NEXT: vbroadcastsd %xmm0, %zmm0
+; X32_AVX512-NEXT: retl
+;
+; X64_AVX512-LABEL: insert_subvector_into_undef:
+; X64_AVX512: # %bb.0:
+; X64_AVX512-NEXT: vmovd %edi, %xmm0
+; X64_AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; X64_AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
+; X64_AVX512-NEXT: retq
+ %ins1 = insertelement <2 x i32> undef, i32 %x0, i32 0
+ %ins2 = insertelement <2 x i32> %ins1, i32 %x1, i32 1
+ %bc = bitcast <2 x i32> %ins2 to i64
+ %ins3 = insertelement <8 x i64> undef, i64 %bc, i32 0
+ %splat = shufflevector <8 x i64> %ins3, <8 x i64> undef, <8 x i32> zeroinitializer
+ ret <8 x i64> %splat
+}
+
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index ea7418f4707e..0a26f6cd016e 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -8,31 +8,31 @@
define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
; SSE2-LABEL: insert_v2f64_z1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v2f64_z1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorpd %xmm1, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v2f64_z1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorpd %xmm1, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v2f64_z1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v2f64_z1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
@@ -42,36 +42,36 @@ define <2 x double> @insert_v2f64_z1(<2 x double> %a) {
define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
; SSE2-LABEL: insert_v4f64_0zz3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: xorpd %xmm2, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v4f64_0zz3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE3-NEXT: xorpd %xmm2, %xmm2
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v4f64_0zz3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSSE3-NEXT: xorpd %xmm2, %xmm2
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v4f64_0zz3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f64_0zz3:
-; AVX: # BB#0:
-; AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX: # %bb.0:
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; AVX-NEXT: retq
%1 = insertelement <4 x double> %a, double 0.0, i32 1
@@ -81,39 +81,39 @@ define <4 x double> @insert_v4f64_0zz3(<4 x double> %a) {
define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
; SSE2-LABEL: insert_v2i64_z1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v2i64_z1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorpd %xmm1, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v2i64_z1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorpd %xmm1, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v2i64_z1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v2i64_z1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v2i64_z1:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
%1 = insertelement <2 x i64> %a, i64 0, i32 0
ret <2 x i64> %1
@@ -121,39 +121,39 @@ define <2 x i64> @insert_v2i64_z1(<2 x i64> %a) {
define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
; SSE2-LABEL: insert_v4i64_01z3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm2, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v4i64_01z3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorpd %xmm2, %xmm2
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v4i64_01z3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorpd %xmm2, %xmm2
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v4i64_01z3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v4i64_01z3:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v4i64_01z3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
%1 = insertelement <4 x i64> %a, i64 0, i32 2
ret <4 x i64> %1
@@ -161,34 +161,34 @@ define <4 x i64> @insert_v4i64_01z3(<4 x i64> %a) {
define <4 x float> @insert_v4f32_01z3(<4 x float> %a) {
; SSE2-LABEL: insert_v4f32_01z3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v4f32_01z3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v4f32_01z3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v4f32_01z3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v4f32_01z3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
@@ -198,7 +198,7 @@ define <4 x float> @insert_v4f32_01z3(<4 x float> %a) {
define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
; SSE2-LABEL: insert_v8f32_z12345z7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
@@ -206,7 +206,7 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v8f32_z12345z7:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm2, %xmm2
; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
@@ -214,7 +214,7 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v8f32_z12345z7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm2, %xmm2
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
@@ -222,15 +222,15 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v8f32_z12345z7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm2, %xmm2
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v8f32_z12345z7:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
; AVX-NEXT: retq
%1 = insertelement <8 x float> %a, float 0.0, i32 0
@@ -240,42 +240,42 @@ define <8 x float> @insert_v8f32_z12345z7(<8 x float> %a) {
define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
; SSE2-LABEL: insert_v4i32_01z3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v4i32_01z3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v4i32_01z3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v4i32_01z3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v4i32_01z3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v4i32_01z3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
%1 = insertelement <4 x i32> %a, i32 0, i32 2
ret <4 x i32> %1
@@ -283,7 +283,7 @@ define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
; SSE2-LABEL: insert_v8i32_z12345z7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT: xorps %xmm2, %xmm2
@@ -292,7 +292,7 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v8i32_z12345z7:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm2, %xmm2
; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE3-NEXT: xorps %xmm2, %xmm2
@@ -301,7 +301,7 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v8i32_z12345z7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm2, %xmm2
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSSE3-NEXT: xorps %xmm2, %xmm2
@@ -310,23 +310,17 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v8i32_z12345z7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: insert_v8i32_z12345z7:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_v8i32_z12345z7:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT: retq
+; AVX-LABEL: insert_v8i32_z12345z7:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7]
+; AVX-NEXT: retq
%1 = insertelement <8 x i32> %a, i32 0, i32 0
%2 = insertelement <8 x i32> %1, i32 0, i32 6
ret <8 x i32> %2
@@ -334,34 +328,34 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) {
define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
; SSE2-LABEL: insert_v8i16_z12345z7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v8i16_z12345z7:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorl %eax, %eax
; SSE3-NEXT: pinsrw $0, %eax, %xmm0
; SSE3-NEXT: pinsrw $6, %eax, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v8i16_z12345z7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorl %eax, %eax
; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v8i16_z12345z7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v8i16_z12345z7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
; AVX-NEXT: retq
@@ -372,7 +366,7 @@ define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) {
define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) {
; SSE2-LABEL: insert_v16i16_z12345z789ABCDEz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorl %eax, %eax
; SSE2-NEXT: pinsrw $0, %eax, %xmm0
; SSE2-NEXT: pinsrw $6, %eax, %xmm0
@@ -380,7 +374,7 @@ define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i16_z12345z789ABCDEz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorl %eax, %eax
; SSE3-NEXT: pinsrw $0, %eax, %xmm0
; SSE3-NEXT: pinsrw $6, %eax, %xmm0
@@ -388,7 +382,7 @@ define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v16i16_z12345z789ABCDEz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorl %eax, %eax
; SSSE3-NEXT: pinsrw $0, %eax, %xmm0
; SSSE3-NEXT: pinsrw $6, %eax, %xmm0
@@ -396,14 +390,14 @@ define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v16i16_z12345z789ABCDEz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v16i16_z12345z789ABCDEz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = insertelement <16 x i16> %a, i16 0, i32 0
@@ -414,29 +408,29 @@ define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) {
define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v16i8_z123456789ABCDEz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: pinsrb $0, %eax, %xmm0
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_v16i8_z123456789ABCDEz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -448,25 +442,25 @@ define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: andps {{.*}}(%rip), %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: pinsrb $0, %eax, %xmm0
; SSE41-NEXT: pinsrb $15, %eax, %xmm0
@@ -475,11 +469,10 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: xorl %eax, %eax
; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
@@ -487,11 +480,10 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1
; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
diff --git a/test/CodeGen/X86/insertps-combine.ll b/test/CodeGen/X86/insertps-combine.ll
index 044ad0721539..22a978eca07f 100644
--- a/test/CodeGen/X86/insertps-combine.ll
+++ b/test/CodeGen/X86/insertps-combine.ll
@@ -5,12 +5,12 @@
define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
; SSE-LABEL: shuffle_v4f32_0z27:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z27:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
@@ -23,12 +23,12 @@ define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0zz4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
@@ -41,12 +41,12 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
; SSE-LABEL: shuffle_v4f32_0z24:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %xyzw, i32 0
@@ -59,12 +59,12 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
define <4 x float> @shuffle_v4f32_0zz0(float %a) {
; SSE-LABEL: shuffle_v4f32_0zz0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; AVX-NEXT: retq
%vecinit = insertelement <4 x float> undef, float %a, i32 0
@@ -76,12 +76,12 @@ define <4 x float> @shuffle_v4f32_0zz0(float %a) {
define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: shuffle_v4f32_0z6z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z6z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -95,13 +95,13 @@ define <4 x float> @shuffle_v4f32_0z6z(<4 x float> %A, <4 x float> %B) {
define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_z06z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_z06z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],xmm1[2],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 undef>
@@ -111,12 +111,12 @@ define <4 x float> @shuffle_v4f32_z06z(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_05zz(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_05zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_05zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
@@ -126,12 +126,12 @@ define <4 x float> @shuffle_v4f32_05zz(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insertps_undef_input0(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: insertps_undef_input0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_undef_input0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
; AVX-NEXT: retq
%res0 = fadd <4 x float> %a0, <float 1.0, float 1.0, float 1.0, float 1.0>
@@ -142,13 +142,13 @@ define <4 x float> @insertps_undef_input0(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: insertps_undef_input1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_undef_input1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT: retq
@@ -160,7 +160,7 @@ define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm1
; SSE-NEXT: addpd {{.*}}(%rip), %xmm1
; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
@@ -168,7 +168,7 @@ define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1)
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_zero_from_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovapd (%rdi), %xmm1
; AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
@@ -184,7 +184,7 @@ define <4 x float> @insertps_zero_from_v2f64(<4 x float> %a0, <2 x double>* %a1)
define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
@@ -192,7 +192,7 @@ define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) no
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_zero_from_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
@@ -208,7 +208,7 @@ define <4 x float> @insertps_zero_from_v2i64(<4 x float> %a0, <2 x i64>* %a1) no
define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; SSE-LABEL: insertps_zero_from_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
; SSE-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
@@ -216,7 +216,7 @@ define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) no
; SSE-NEXT: retq
;
; AVX-LABEL: insertps_zero_from_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2,2,3]
@@ -232,12 +232,12 @@ define <4 x float> @insertps_zero_from_v8i16(<4 x float> %a0, <8 x i16>* %a1) no
define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
; SSE-LABEL: consecutive_load_insertps_04zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: consecutive_load_insertps_04zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
%p0 = getelementptr inbounds float, float* %p, i64 1
@@ -252,12 +252,12 @@ define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: extract_zero_insertps_z0z7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: extract_zero_insertps_z0z7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21)
@@ -267,12 +267,12 @@ define float @extract_zero_insertps_z0z7(<4 x float> %a0, <4 x float> %a1) {
define float @extract_lane_insertps_5123(<4 x float> %a0, <4 x float> *%p1) {
; SSE-LABEL: extract_lane_insertps_5123:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract_lane_insertps_5123:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
%a1 = load <4 x float>, <4 x float> *%p1
diff --git a/test/CodeGen/X86/insertps-from-constantpool.ll b/test/CodeGen/X86/insertps-from-constantpool.ll
index cfcfeacad067..e0a371ebe40a 100644
--- a/test/CodeGen/X86/insertps-from-constantpool.ll
+++ b/test/CodeGen/X86/insertps-from-constantpool.ll
@@ -5,12 +5,12 @@
define <4 x float> @fold_from_constantpool(<4 x float> %a) {
; X32-LABEL: fold_from_constantpool:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: fold_from_constantpool:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> <float 0.0, float 1.0, float 0.0, float 0.0>, i8 64)
diff --git a/test/CodeGen/X86/insertps-unfold-load-bug.ll b/test/CodeGen/X86/insertps-unfold-load-bug.ll
index bf7c4bc4d7b9..723b25d598cc 100644
--- a/test/CodeGen/X86/insertps-unfold-load-bug.ll
+++ b/test/CodeGen/X86/insertps-unfold-load-bug.ll
@@ -6,7 +6,7 @@
define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
; X32-LABEL: insertps_unfold:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -16,7 +16,7 @@ define <4 x float> @insertps_unfold(<4 x float>* %v0, <4 x float>* %v1) {
; X32-NEXT: retl
;
; X64-LABEL: insertps_unfold:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
diff --git a/test/CodeGen/X86/int-intrinsic.ll b/test/CodeGen/X86/int-intrinsic.ll
index b253e6c5f3b0..ca7ceb24b424 100644
--- a/test/CodeGen/X86/int-intrinsic.ll
+++ b/test/CodeGen/X86/int-intrinsic.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
declare void @llvm.x86.int(i8) nounwind
diff --git a/test/CodeGen/X86/invalid-liveness.mir b/test/CodeGen/X86/invalid-liveness.mir
index c1da65e0be69..47db8090a92f 100644
--- a/test/CodeGen/X86/invalid-liveness.mir
+++ b/test/CodeGen/X86/invalid-liveness.mir
@@ -1,15 +1,15 @@
-# RUN: not llc -march=x86 -run-pass liveintervals -o - %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple=i686-- -run-pass liveintervals -o - %s 2>&1 | FileCheck %s
# REQUIRES: asserts
--- |
define void @func() { ret void }
...
---
-# Liveness calculation should detect that we do not have a definition for vreg0
-# on all paths; In this example a def for vreg0 is missing when jumping from
+# Liveness calculation should detect that we do not have a definition for %0
+# on all paths; In this example a def for %0 is missing when jumping from
# bb.0 to bb.3.
#
-# CHECK: Use of %vreg0 does not have a corresponding definition on every path
+# CHECK: Use of %0 does not have a corresponding definition on every path
# CHECK: ERROR: Use not jointly dominated by defs.
name: func
registers:
diff --git a/test/CodeGen/X86/invalid-shift-immediate.ll b/test/CodeGen/X86/invalid-shift-immediate.ll
index 1fb80c7dba7f..05802660f060 100644
--- a/test/CodeGen/X86/invalid-shift-immediate.ll
+++ b/test/CodeGen/X86/invalid-shift-immediate.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR2098
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/ipra-inline-asm.ll b/test/CodeGen/X86/ipra-inline-asm.ll
index e70b149e19e1..4b56c3a2fd6c 100644
--- a/test/CodeGen/X86/ipra-inline-asm.ll
+++ b/test/CodeGen/X86/ipra-inline-asm.ll
@@ -11,7 +11,7 @@ define void @bar() #0 {
}
; Verifies that inline assembly is correctly handled by giving a list of clobbered registers
-; CHECK: foo Clobbered Registers: AH AL AX CH CL CX DI DIL EAX ECX EDI RAX RCX RDI
+; CHECK: foo Clobbered Registers: %ah %al %ax %ch %cl %cx %di %dil %eax %ecx %edi %rax %rcx %rdi
define void @foo() #0 {
call void asm sideeffect "", "~{eax},~{ecx},~{edi}"() #0
ret void
diff --git a/test/CodeGen/X86/ipra-reg-alias.ll b/test/CodeGen/X86/ipra-reg-alias.ll
index 36b768e4c4ff..c5c360756526 100644
--- a/test/CodeGen/X86/ipra-reg-alias.ll
+++ b/test/CodeGen/X86/ipra-reg-alias.ll
@@ -6,7 +6,7 @@ define i8 @main(i8 %X) {
%inc2 = mul i8 %inc, 5
; Here only CL is clobbered, so CH should not be clobbered, but CX, ECX and RCX
; should be clobbered.
-; CHECK: main Clobbered Registers: AH AL AX CL CX EAX ECX EFLAGS RAX RCX
+; CHECK: main Clobbered Registers: %ah %al %ax %cl %cx %eax %ecx %eflags %rax %rcx
ret i8 %inc2
}
diff --git a/test/CodeGen/X86/ipra-reg-usage.ll b/test/CodeGen/X86/ipra-reg-usage.ll
index ca97472bb820..50c066de9656 100644
--- a/test/CodeGen/X86/ipra-reg-usage.ll
+++ b/test/CodeGen/X86/ipra-reg-usage.ll
@@ -3,7 +3,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
-; CHECK: foo Clobbered Registers: CS DS EFLAGS EIP EIZ ES FPSW FS GS IP RIP RIZ SS BND0 BND1 BND2 BND3 CR0 CR1 CR2 CR3 CR4 CR5 CR6 CR7 CR8 CR9 CR10 CR11 CR12 CR13 CR14 CR15 DR0 DR1 DR2 DR3 DR4 DR5 DR6 DR7 DR8 DR9 DR10 DR11 DR12 DR13 DR14 DR15 FP0 FP1 FP2 FP3 FP4 FP5 FP6 FP7 K0 K1 K2 K3 K4 K5 K6 K7 MM0 MM1 MM2 MM3 MM4 MM5 MM6 MM7 R11 ST0 ST1 ST2 ST3 ST4 ST5 ST6 ST7 XMM16 XMM17 XMM18 XMM19 XMM20 XMM21 XMM22 XMM23 XMM24 XMM25 XMM26 XMM27 XMM28 XMM29 XMM30 XMM31 YMM0 YMM1 YMM2 YMM3 YMM4 YMM5 YMM6 YMM7 YMM8 YMM9 YMM10 YMM11 YMM12 YMM13 YMM14 YMM15 YMM16 YMM17 YMM18 YMM19 YMM20 YMM21 YMM22 YMM23 YMM24 YMM25 YMM26 YMM27 YMM28 YMM29 YMM30 YMM31 ZMM0 ZMM1 ZMM2 ZMM3 ZMM4 ZMM5 ZMM6 ZMM7 ZMM8 ZMM9 ZMM10 ZMM11 ZMM12 ZMM13 ZMM14 ZMM15 ZMM16 ZMM17 ZMM18 ZMM19 ZMM20 ZMM21 ZMM22 ZMM23 ZMM24 ZMM25 ZMM26 ZMM27 ZMM28 ZMM29 ZMM30 ZMM31 R11B R11D R11W
+; CHECK: foo Clobbered Registers: %cs %ds %eflags %eip %eiz %es %fpsw %fs %gs %ip %rip %riz %ss %ssp %bnd0 %bnd1 %bnd2 %bnd3 %cr0 %cr1 %cr2 %cr3 %cr4 %cr5 %cr6 %cr7 %cr8 %cr9 %cr10 %cr11 %cr12 %cr13 %cr14 %cr15 %dr0 %dr1 %dr2 %dr3 %dr4 %dr5 %dr6 %dr7 %dr8 %dr9 %dr10 %dr11 %dr12 %dr13 %dr14 %dr15 %fp0 %fp1 %fp2 %fp3 %fp4 %fp5 %fp6 %fp7 %k0 %k1 %k2 %k3 %k4 %k5 %k6 %k7 %mm0 %mm1 %mm2 %mm3 %mm4 %mm5 %mm6 %mm7 %r11 %st0 %st1 %st2 %st3 %st4 %st5 %st6 %st7 %xmm16 %xmm17 %xmm18 %xmm19 %xmm20 %xmm21 %xmm22 %xmm23 %xmm24 %xmm25 %xmm26 %xmm27 %xmm28 %xmm29 %xmm30 %xmm31 %ymm0 %ymm1 %ymm2 %ymm3 %ymm4 %ymm5 %ymm6 %ymm7 %ymm8 %ymm9 %ymm10 %ymm11 %ymm12 %ymm13 %ymm14 %ymm15 %ymm16 %ymm17 %ymm18 %ymm19 %ymm20 %ymm21 %ymm22 %ymm23 %ymm24 %ymm25 %ymm26 %ymm27 %ymm28 %ymm29 %ymm30 %ymm31 %zmm0 %zmm1 %zmm2 %zmm3 %zmm4 %zmm5 %zmm6 %zmm7 %zmm8 %zmm9 %zmm10 %zmm11 %zmm12 %zmm13 %zmm14 %zmm15 %zmm16 %zmm17 %zmm18 %zmm19 %zmm20 %zmm21 %zmm22 %zmm23 %zmm24 %zmm25 %zmm26 %zmm27 %zmm28 %zmm29 %zmm30 %zmm31 %r11b %r11d %r11w
call void @bar1()
call void @bar2()
ret void
diff --git a/test/CodeGen/X86/isel-optnone.ll b/test/CodeGen/X86/isel-optnone.ll
index 831ad3837d96..d78b5eb8fe1f 100644
--- a/test/CodeGen/X86/isel-optnone.ll
+++ b/test/CodeGen/X86/isel-optnone.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 -march=x86 < %s | FileCheck %s
+; RUN: llc -O2 -mtriple=i686-- < %s | FileCheck %s
define i32* @fooOptnone(i32* %p, i32* %q, i32** %z) #0 {
entry:
diff --git a/test/CodeGen/X86/isel-sink.ll b/test/CodeGen/X86/isel-sink.ll
index 2f32097a09b2..ead3414d63dd 100644
--- a/test/CodeGen/X86/isel-sink.ll
+++ b/test/CodeGen/X86/isel-sink.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i32 @test(i32* %X, i32 %B) {
; CHECK-LABEL: test:
diff --git a/test/CodeGen/X86/isel-sink2.ll b/test/CodeGen/X86/isel-sink2.ll
index 65f1994b9fe1..e7236336bcd3 100644
--- a/test/CodeGen/X86/isel-sink2.ll
+++ b/test/CodeGen/X86/isel-sink2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 > %t
+; RUN: llc < %s -mtriple=i686-- > %t
; RUN: grep "movb.7(%...)" %t
; RUN: not grep leal %t
diff --git a/test/CodeGen/X86/isnan.ll b/test/CodeGen/X86/isnan.ll
index 4d465c0c7aa8..98766838f14c 100644
--- a/test/CodeGen/X86/isnan.ll
+++ b/test/CodeGen/X86/isnan.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep call
+; RUN: llc < %s -mtriple=i686-- | not grep call
declare i1 @llvm.isunordered.f64(double)
diff --git a/test/CodeGen/X86/isnan2.ll b/test/CodeGen/X86/isnan2.ll
index 7753346fd940..e28f8450a3ce 100644
--- a/test/CodeGen/X86/isnan2.ll
+++ b/test/CodeGen/X86/isnan2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep pxor
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | not grep pxor
; This should not need to materialize 0.0 to evaluate the condition.
diff --git a/test/CodeGen/X86/ispositive.ll b/test/CodeGen/X86/ispositive.ll
index b1d1a20c8eb6..ac9bd4383719 100644
--- a/test/CodeGen/X86/ispositive.ll
+++ b/test/CodeGen/X86/ispositive.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "shrl.*31"
+; RUN: llc < %s -mtriple=i686-- | grep "shrl.*31"
define i32 @test1(i32 %X) {
entry:
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index c767e06948f7..137edece0536 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -3,11 +3,11 @@
define i32 @func_f(i32 %X) {
; CHECK-LABEL: func_f:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: incl %eax
; CHECK-NEXT: jns .LBB0_2
-; CHECK-NEXT: # BB#1: # %cond_true
+; CHECK-NEXT: # %bb.1: # %cond_true
; CHECK-NEXT: calll bar
; CHECK-NEXT: .LBB0_2: # %cond_next
; CHECK-NEXT: jmp baz # TAILCALL
@@ -32,7 +32,7 @@ declare i32 @baz(...)
; rdar://11355268
define i32 @func_g(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_g:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax
@@ -47,7 +47,7 @@ define i32 @func_g(i32 %a, i32 %b) nounwind {
; rdar://10734411
define i32 @func_h(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_h:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %edx, %edx
@@ -62,7 +62,7 @@ define i32 @func_h(i32 %a, i32 %b) nounwind {
define i32 @func_i(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_i:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax
@@ -76,7 +76,7 @@ define i32 @func_i(i32 %a, i32 %b) nounwind {
define i32 @func_j(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_j:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: subl {{[0-9]+}}(%esp), %eax
@@ -90,7 +90,7 @@ define i32 @func_j(i32 %a, i32 %b) nounwind {
define i32 @func_k(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_k:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %edx, %edx
@@ -106,7 +106,7 @@ define i32 @func_k(i32 %a, i32 %b) nounwind {
; redundant cmp instruction
define i32 @func_l(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_l:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl %edx, %eax
@@ -121,7 +121,7 @@ define i32 @func_l(i32 %a, i32 %b) nounwind {
define i32 @func_m(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_m:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: subl %ecx, %eax
@@ -137,14 +137,14 @@ define i32 @func_m(i32 %a, i32 %b) nounwind {
; a swapped sub.
define i32 @func_l2(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_l2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: subl %edx, %ecx
; CHECK-NEXT: cmpl %eax, %edx
; CHECK-NEXT: jne .LBB8_2
-; CHECK-NEXT: # BB#1: # %if.then
+; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: cmovgl %ecx, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB8_2: # %if.else
@@ -165,12 +165,12 @@ if.else:
define i32 @func_l3(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_l3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: subl %ecx, %eax
; CHECK-NEXT: jge .LBB9_2
-; CHECK-NEXT: # BB#1: # %if.then
+; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB9_2: # %if.else
; CHECK-NEXT: incl %eax
@@ -191,7 +191,7 @@ if.else:
; When Movr0 is between sub and cmp, we need to move "Movr0" before sub.
define i32 @func_l4(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_l4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %edx, %edx
@@ -207,7 +207,7 @@ define i32 @func_l4(i32 %a, i32 %b) nounwind {
; rdar://11540023
define i32 @func_n(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: func_n:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cmpl %ecx, %eax
@@ -222,19 +222,19 @@ define i32 @func_n(i32 %x, i32 %y) nounwind {
; PR://13046
define void @func_o() nounwind uwtable {
; CHECK-LABEL: func_o:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB12_1
-; CHECK-NEXT: # BB#2: # %if.end.i
+; CHECK-NEXT: # %bb.2: # %if.end.i
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB12_5
-; CHECK-NEXT: # BB#3: # %sw.bb
+; CHECK-NEXT: # %bb.3: # %sw.bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB12_8
-; CHECK-NEXT: # BB#4: # %if.end29
+; CHECK-NEXT: # %bb.4: # %if.end29
; CHECK-NEXT: movzwl (%eax), %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD
@@ -247,13 +247,13 @@ define void @func_o() nounwind uwtable {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB12_9
-; CHECK-NEXT: # BB#10: # %if.else.i104
+; CHECK-NEXT: # %bb.10: # %if.else.i104
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB12_5: # %sw.default
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB12_7
-; CHECK-NEXT: # BB#6: # %if.then.i96
+; CHECK-NEXT: # %bb.6: # %if.then.i96
; CHECK-NEXT: .LBB12_1: # %if.then.i
; CHECK-NEXT: .LBB12_9: # %if.then.i103
; CHECK-NEXT: .LBB12_7: # %if.else.i97
@@ -299,7 +299,7 @@ if.else.i104: ; preds = %if.then44
; rdar://11855129
define i32 @func_p(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: func_p:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
@@ -316,7 +316,7 @@ define i32 @func_p(i32 %a, i32 %b) nounwind {
; by sbb, we should not optimize cmp away.
define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) {
; CHECK-LABEL: func_q:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl %ecx, %edx
@@ -335,13 +335,13 @@ define i32 @func_q(i32 %a0, i32 %a1, i32 %a2) {
; rdar://11873276
define i8* @func_r(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
; CHECK-LABEL: func_r:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movl (%edx), %ecx
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: subl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: jl .LBB15_2
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %ecx, (%edx)
; CHECK-NEXT: addl %ecx, %eax
@@ -366,7 +366,7 @@ return:
; Test optimizations of dec/inc.
define i32 @func_dec(i32 %a) nounwind {
; CHECK-LABEL: func_dec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: decl %eax
@@ -380,7 +380,7 @@ define i32 @func_dec(i32 %a) nounwind {
define i32 @func_inc(i32 %a) nounwind {
; CHECK-LABEL: func_inc:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: incl %eax
@@ -397,7 +397,7 @@ define i32 @func_inc(i32 %a) nounwind {
@a = common global i32 0, align 4
define i32 @func_test1(i32 %p1) nounwind uwtable {
; CHECK-LABEL: func_test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl b, %eax
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: setb %cl
@@ -405,7 +405,7 @@ define i32 @func_test1(i32 %p1) nounwind uwtable {
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: andb %cl, %dl
; CHECK-NEXT: je .LBB18_2
-; CHECK-NEXT: # BB#1: # %if.then
+; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: decl %eax
; CHECK-NEXT: movl %eax, a
; CHECK-NEXT: .LBB18_2: # %if.end
diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll
index eee466a5a60a..283d1f93dfb6 100644
--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@@ -4,13 +4,13 @@
define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind {
; X32-LABEL: knownbits_mask_extract_sext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpextrw $0, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_extract_sext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpextrw $0, %xmm0, %eax
; X64-NEXT: retq
@@ -22,7 +22,7 @@ define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind {
define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
; X32-LABEL: knownbits_mask_extract_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
@@ -34,7 +34,7 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_extract_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
; X64-NEXT: vmovq %xmm0, %rax
@@ -48,7 +48,7 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
define <4 x float> @knownbits_insert_uitofp(<4 x i32> %a0, i16 %a1, i16 %a2) nounwind {
; X32-LABEL: knownbits_insert_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
@@ -58,7 +58,7 @@ define <4 x float> @knownbits_insert_uitofp(<4 x i32> %a0, i16 %a1, i16 %a2) nou
; X32-NEXT: retl
;
; X64-LABEL: knownbits_insert_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl %di, %eax
; X64-NEXT: movzwl %si, %ecx
; X64-NEXT: vpinsrd $0, %eax, %xmm0, %xmm0
@@ -77,14 +77,14 @@ define <4 x float> @knownbits_insert_uitofp(<4 x i32> %a0, i16 %a1, i16 %a2) nou
define <4 x i32> @knownbits_mask_shuffle_sext(<8 x i16> %a0) nounwind {
; X32-LABEL: knownbits_mask_shuffle_sext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_shuffle_sext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -97,14 +97,14 @@ define <4 x i32> @knownbits_mask_shuffle_sext(<8 x i16> %a0) nounwind {
define <4 x i32> @knownbits_mask_shuffle_shuffle_sext(<8 x i16> %a0) nounwind {
; X32-LABEL: knownbits_mask_shuffle_shuffle_sext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_shuffle_shuffle_sext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -118,14 +118,14 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_sext(<8 x i16> %a0) nounwind {
define <4 x i32> @knownbits_mask_shuffle_shuffle_undef_sext(<8 x i16> %a0) nounwind {
; X32-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-NEXT: vpmovsxwd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_shuffle_shuffle_undef_sext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: vpmovsxwd %xmm0, %xmm0
@@ -139,16 +139,16 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_undef_sext(<8 x i16> %a0) nounw
define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_shuffle_uitofp:
-; X32: # BB#0:
-; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X32: # %bb.0:
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_shuffle_uitofp:
-; X64: # BB#0:
-; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64: # %bb.0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -159,18 +159,18 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {
define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
-; X32: # BB#0:
-; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X32: # %bb.0:
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vorps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
-; X64: # BB#0:
-; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64: # %bb.0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -182,18 +182,18 @@ define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
define <4 x float> @knownbits_mask_xor_shuffle_uitofp(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_xor_shuffle_uitofp:
-; X32: # BB#0:
-; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpxor {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X32: # %bb.0:
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vxorps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_xor_shuffle_uitofp:
-; X64: # BB#0:
-; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; X64: # %bb.0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085>
@@ -205,12 +205,12 @@ define <4 x float> @knownbits_mask_xor_shuffle_uitofp(<4 x i32> %a0) nounwind {
define <4 x i32> @knownbits_mask_shl_shuffle_lshr(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_shl_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_shl_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536>
@@ -222,12 +222,12 @@ define <4 x i32> @knownbits_mask_shl_shuffle_lshr(<4 x i32> %a0) nounwind {
define <4 x i32> @knownbits_mask_ashr_shuffle_lshr(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_ashr_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_ashr_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 131071, i32 -1, i32 -1, i32 131071>
@@ -239,12 +239,12 @@ define <4 x i32> @knownbits_mask_ashr_shuffle_lshr(<4 x i32> %a0) nounwind {
define <4 x i32> @knownbits_mask_mul_shuffle_shl(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X32-LABEL: knownbits_mask_mul_shuffle_shl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_mul_shuffle_shl:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -65536, i32 -7, i32 -7, i32 -65536>
@@ -256,12 +256,12 @@ define <4 x i32> @knownbits_mask_mul_shuffle_shl(<4 x i32> %a0, <4 x i32> %a1) n
define <4 x i32> @knownbits_mask_trunc_shuffle_shl(<4 x i64> %a0) nounwind {
; X32-LABEL: knownbits_mask_trunc_shuffle_shl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_trunc_shuffle_shl:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i64> %a0, <i64 -65536, i64 -7, i64 7, i64 -65536>
@@ -273,12 +273,12 @@ define <4 x i32> @knownbits_mask_trunc_shuffle_shl(<4 x i64> %a0) nounwind {
define <4 x i32> @knownbits_mask_add_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X32-LABEL: knownbits_mask_add_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_add_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 32767, i32 -1, i32 -1, i32 32767>
@@ -291,12 +291,12 @@ define <4 x i32> @knownbits_mask_add_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1)
define <4 x i32> @knownbits_mask_sub_shuffle_lshr(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_sub_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_sub_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 15, i32 -1, i32 -1, i32 15>
@@ -308,12 +308,12 @@ define <4 x i32> @knownbits_mask_sub_shuffle_lshr(<4 x i32> %a0) nounwind {
define <4 x i32> @knownbits_mask_udiv_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X32-LABEL: knownbits_mask_udiv_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_udiv_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 32767, i32 -1, i32 -1, i32 32767>
@@ -325,12 +325,12 @@ define <4 x i32> @knownbits_mask_udiv_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1)
define <4 x i32> @knownbits_urem_lshr(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_urem_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_urem_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = urem <4 x i32> %a0, <i32 16, i32 16, i32 16, i32 16>
@@ -340,12 +340,12 @@ define <4 x i32> @knownbits_urem_lshr(<4 x i32> %a0) nounwind {
define <4 x i32> @knownbits_mask_urem_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X32-LABEL: knownbits_mask_urem_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_urem_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 32767, i32 -1, i32 -1, i32 32767>
@@ -358,12 +358,12 @@ define <4 x i32> @knownbits_mask_urem_shuffle_lshr(<4 x i32> %a0, <4 x i32> %a1)
define <4 x i32> @knownbits_mask_srem_shuffle_lshr(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_srem_shuffle_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_srem_shuffle_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -32768, i32 -1, i32 -1, i32 -32768>
@@ -375,12 +375,12 @@ define <4 x i32> @knownbits_mask_srem_shuffle_lshr(<4 x i32> %a0) nounwind {
define <4 x i32> @knownbits_mask_bswap_shuffle_shl(<4 x i32> %a0) nounwind {
; X32-LABEL: knownbits_mask_bswap_shuffle_shl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_bswap_shuffle_shl:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 32767, i32 -1, i32 -1, i32 32767>
@@ -393,21 +393,21 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; X32-LABEL: knownbits_mask_concat_uitofp:
-; X32: # BB#0:
-; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpand {{\.LCPI.*}}, %xmm1, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; X32: # %bb.0:
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm1, %xmm1
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; X32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-NEXT: vcvtdq2ps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_concat_uitofp:
-; X64: # BB#0:
-; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,3,1,3]
+; X64: # %bb.0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
; X64-NEXT: retq
@@ -420,14 +420,14 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) n
define <4 x float> @knownbits_lshr_bitcast_shuffle_uitofp(<2 x i64> %a0, <4 x i32> %a1) nounwind {
; X32-LABEL: knownbits_lshr_bitcast_shuffle_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlq $1, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_lshr_bitcast_shuffle_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlq $1, %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
@@ -441,7 +441,7 @@ define <4 x float> @knownbits_lshr_bitcast_shuffle_uitofp(<2 x i64> %a0, <4 x i3
define <4 x float> @knownbits_smax_smin_shuffle_uitofp(<4 x i32> %a0) {
; X32-LABEL: knownbits_smax_smin_shuffle_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpminsd {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpmaxsd {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
@@ -453,7 +453,7 @@ define <4 x float> @knownbits_smax_smin_shuffle_uitofp(<4 x i32> %a0) {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_smax_smin_shuffle_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
@@ -474,14 +474,14 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x float> @knownbits_umin_shuffle_uitofp(<4 x i32> %a0) {
; X32-LABEL: knownbits_umin_shuffle_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpminud {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_umin_shuffle_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
@@ -496,13 +496,13 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @knownbits_umax_shuffle_ashr(<4 x i32> %a0) {
; X32-LABEL: knownbits_umax_shuffle_ashr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmaxud {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
; X32-NEXT: retl
;
; X64-LABEL: knownbits_umax_shuffle_ashr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
; X64-NEXT: retq
@@ -514,7 +514,7 @@ define <4 x i32> @knownbits_umax_shuffle_ashr(<4 x i32> %a0) {
define <4 x float> @knownbits_mask_umax_shuffle_uitofp(<4 x i32> %a0) {
; X32-LABEL: knownbits_mask_umax_shuffle_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpmaxud {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
@@ -522,7 +522,7 @@ define <4 x float> @knownbits_mask_umax_shuffle_uitofp(<4 x i32> %a0) {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_umax_shuffle_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,3]
@@ -537,12 +537,12 @@ define <4 x float> @knownbits_mask_umax_shuffle_uitofp(<4 x i32> %a0) {
define <4 x i32> @knownbits_mask_bitreverse_ashr(<4 x i32> %a0) {
; X32-LABEL: knownbits_mask_bitreverse_ashr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_bitreverse_ashr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = and <4 x i32> %a0, <i32 -2, i32 -2, i32 -2, i32 -2>
@@ -555,7 +555,7 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) nounwind readnone
; If we don't know that the input isn't INT_MIN we can't combine to sitofp
define <4 x float> @knownbits_abs_uitofp(<4 x i32> %a0) {
; X32-LABEL: knownbits_abs_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpabsd %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; X32-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -565,7 +565,7 @@ define <4 x float> @knownbits_abs_uitofp(<4 x i32> %a0) {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_abs_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpabsd %xmm0, %xmm0
; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; X64-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -582,7 +582,7 @@ define <4 x float> @knownbits_abs_uitofp(<4 x i32> %a0) {
define <4 x float> @knownbits_or_abs_uitofp(<4 x i32> %a0) {
; X32-LABEL: knownbits_or_abs_uitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X32-NEXT: vpabsd %xmm0, %xmm0
@@ -590,7 +590,7 @@ define <4 x float> @knownbits_or_abs_uitofp(<4 x i32> %a0) {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_or_abs_uitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; X64-NEXT: vpabsd %xmm0, %xmm0
@@ -604,3 +604,79 @@ define <4 x float> @knownbits_or_abs_uitofp(<4 x i32> %a0) {
%6 = uitofp <4 x i32> %5 to <4 x float>
ret <4 x float> %6
}
+
+define <4 x float> @knownbits_and_select_shuffle_uitofp(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) nounwind {
+; X32-LABEL: knownbits_and_select_shuffle_uitofp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: vmovaps 8(%ebp), %xmm3
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm2, %xmm2
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm3, %xmm3
+; X32-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; X32-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: knownbits_and_select_shuffle_uitofp:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{.*}}(%rip), %xmm2, %xmm2
+; X64-NEXT: vandps {{.*}}(%rip), %xmm3, %xmm3
+; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = and <4 x i32> %a2, <i32 65535, i32 -1, i32 255, i32 -1>
+ %2 = and <4 x i32> %a3, <i32 255, i32 -1, i32 65535, i32 -1>
+ %3 = icmp eq <4 x i32> %a0, %a1
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %6 = uitofp <4 x i32> %5 to <4 x float>
+ ret <4 x float> %6
+}
+
+define <4 x float> @knownbits_lshr_and_select_shuffle_uitofp(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) nounwind {
+; X32-LABEL: knownbits_lshr_and_select_shuffle_uitofp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: vmovaps 8(%ebp), %xmm3
+; X32-NEXT: vpsrld $1, %xmm2, %xmm4
+; X32-NEXT: vpsrld $5, %xmm2, %xmm2
+; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; X32-NEXT: vandps {{\.LCPI.*}}, %xmm3, %xmm3
+; X32-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; X32-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: retl
+;
+; X64-LABEL: knownbits_lshr_and_select_shuffle_uitofp:
+; X64: # %bb.0:
+; X64-NEXT: vpsrld $1, %xmm2, %xmm4
+; X64-NEXT: vpsrld $5, %xmm2, %xmm2
+; X64-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; X64-NEXT: vandps {{.*}}(%rip), %xmm3, %xmm3
+; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: retq
+ %1 = lshr <4 x i32> %a2, <i32 5, i32 1, i32 5, i32 1>
+ %2 = and <4 x i32> %a3, <i32 255, i32 -1, i32 65535, i32 -1>
+ %3 = icmp eq <4 x i32> %a0, %a1
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %6 = uitofp <4 x i32> %5 to <4 x float>
+ ret <4 x float> %6
+}
diff --git a/test/CodeGen/X86/known-bits.ll b/test/CodeGen/X86/known-bits.ll
index 90f6e9301389..91cde32d10e9 100644
--- a/test/CodeGen/X86/known-bits.ll
+++ b/test/CodeGen/X86/known-bits.ll
@@ -4,7 +4,7 @@
define void @knownbits_zext_in_reg(i8*) nounwind {
; X32-LABEL: knownbits_zext_in_reg:
-; X32: # BB#0: # %BB
+; X32: # %bb.0: # %BB
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
@@ -12,8 +12,8 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzbl (%eax), %eax
; X32-NEXT: imull $101, %eax, %eax
-; X32-NEXT: andl $16384, %eax # imm = 0x4000
; X32-NEXT: shrl $14, %eax
+; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -47,11 +47,11 @@ define void @knownbits_zext_in_reg(i8*) nounwind {
; X32-NEXT: jmp .LBB0_1
;
; X64-LABEL: knownbits_zext_in_reg:
-; X64: # BB#0: # %BB
+; X64: # %bb.0: # %BB
; X64-NEXT: movzbl (%rdi), %eax
; X64-NEXT: imull $101, %eax, %eax
-; X64-NEXT: andl $16384, %eax # imm = 0x4000
; X64-NEXT: shrl $14, %eax
+; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vmovd %eax, %xmm0
; X64-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -106,12 +106,12 @@ CF246: ; preds = %CF237
define i32 @knownbits_mask_add_lshr(i32 %a0, i32 %a1) nounwind {
; X32-LABEL: knownbits_mask_add_lshr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: retl
;
; X64-LABEL: knownbits_mask_add_lshr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
%1 = and i32 %a0, 32767
@@ -123,7 +123,7 @@ define i32 @knownbits_mask_add_lshr(i32 %a0, i32 %a1) nounwind {
define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
; X32-LABEL: knownbits_mask_addc_shl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -147,7 +147,7 @@ define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
; X32-NEXT: retl $4
;
; X64-LABEL: knownbits_mask_addc_shl:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andq $-1024, %rdi # imm = 0xFC00
; X64-NEXT: andq $-1024, %rsi # imm = 0xFC00
; X64-NEXT: addq %rdi, %rsi
@@ -169,7 +169,7 @@ define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
define {i32, i1} @knownbits_uaddo_saddo(i64 %a0, i64 %a1) nounwind {
; X32-LABEL: knownbits_uaddo_saddo:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -193,7 +193,7 @@ define {i32, i1} @knownbits_uaddo_saddo(i64 %a0, i64 %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_uaddo_saddo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shlq $32, %rdi
; X64-NEXT: shlq $32, %rsi
; X64-NEXT: addq %rdi, %rsi
@@ -220,7 +220,7 @@ define {i32, i1} @knownbits_uaddo_saddo(i64 %a0, i64 %a1) nounwind {
define {i32, i1} @knownbits_usubo_ssubo(i64 %a0, i64 %a1) nounwind {
; X32-LABEL: knownbits_usubo_ssubo:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -244,7 +244,7 @@ define {i32, i1} @knownbits_usubo_ssubo(i64 %a0, i64 %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: knownbits_usubo_ssubo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shlq $32, %rdi
; X64-NEXT: shlq $32, %rsi
; X64-NEXT: cmpq %rsi, %rdi
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
index ec620b8ce877..a003a5520d03 100644
--- a/test/CodeGen/X86/known-signbits-vector.ll
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -4,12 +4,12 @@
define <2 x double> @signbits_sext_v2i64_sitofp_v2f64(i32 %a0, i32 %a1) nounwind {
; X32-LABEL: signbits_sext_v2i64_sitofp_v2f64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vcvtdq2pd {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: signbits_sext_v2i64_sitofp_v2f64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
; X64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
@@ -24,7 +24,7 @@ define <2 x double> @signbits_sext_v2i64_sitofp_v2f64(i32 %a0, i32 %a1) nounwind
define <4 x float> @signbits_sext_v4i64_sitofp_v4f32(i8 signext %a0, i16 signext %a1, i32 %a2, i32 %a3) nounwind {
; X32-LABEL: signbits_sext_v4i64_sitofp_v4f32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movswl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovd %eax, %xmm0
@@ -46,7 +46,7 @@ define <4 x float> @signbits_sext_v4i64_sitofp_v4f32(i8 signext %a0, i16 signext
; X32-NEXT: retl
;
; X64-LABEL: signbits_sext_v4i64_sitofp_v4f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movslq %edi, %rax
; X64-NEXT: movslq %esi, %rsi
; X64-NEXT: movslq %edx, %rdx
@@ -72,19 +72,19 @@ define <4 x float> @signbits_sext_v4i64_sitofp_v4f32(i8 signext %a0, i16 signext
ret <4 x float> %9
}
-define float @signbits_ashr_extract_sitofp(<2 x i64> %a0) nounwind {
-; X32-LABEL: signbits_ashr_extract_sitofp:
-; X32: # BB#0:
+define float @signbits_ashr_extract_sitofp_0(<2 x i64> %a0) nounwind {
+; X32-LABEL: signbits_ashr_extract_sitofp_0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
-; X32-NEXT: vpextrd $1, %xmm0, %eax
+; X32-NEXT: vextractps $1, %xmm0, %eax
; X32-NEXT: vcvtsi2ssl %eax, %xmm1, %xmm0
; X32-NEXT: vmovss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
; X32-NEXT: retl
;
-; X64-LABEL: signbits_ashr_extract_sitofp:
-; X64: # BB#0:
+; X64-LABEL: signbits_ashr_extract_sitofp_0:
+; X64: # %bb.0:
; X64-NEXT: vpsrad $31, %xmm0, %xmm1
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
@@ -97,9 +97,86 @@ define float @signbits_ashr_extract_sitofp(<2 x i64> %a0) nounwind {
ret float %3
}
+define float @signbits_ashr_extract_sitofp_1(<2 x i64> %a0) nounwind {
+; X32-LABEL: signbits_ashr_extract_sitofp_1:
+; X32: # %bb.0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X32-NEXT: vpsrlq $63, %xmm1, %xmm2
+; X32-NEXT: vpsrlq $32, %xmm1, %xmm1
+; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT: vpsrlq $63, %xmm0, %xmm2
+; X32-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; X32-NEXT: vmovd %xmm0, %eax
+; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT: vmovss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_ashr_extract_sitofp_1:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlq $63, %xmm0, %xmm1
+; X64-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1]
+; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X64-NEXT: retq
+ %1 = ashr <2 x i64> %a0, <i64 32, i64 63>
+ %2 = extractelement <2 x i64> %1, i32 0
+ %3 = sitofp i64 %2 to float
+ ret float %3
+}
+
+define float @signbits_ashr_shl_extract_sitofp(<2 x i64> %a0) nounwind {
+; X32-LABEL: signbits_ashr_shl_extract_sitofp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
+; X32-NEXT: vpsrlq $60, %xmm1, %xmm2
+; X32-NEXT: vpsrlq $61, %xmm1, %xmm1
+; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
+; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
+; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X32-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X32-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; X32-NEXT: vpsllq $20, %xmm0, %xmm0
+; X32-NEXT: vmovd %xmm0, %eax
+; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
+; X32-NEXT: vmovss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_ashr_shl_extract_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlq $60, %xmm0, %xmm1
+; X64-NEXT: vpsrlq $61, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-NEXT: vmovdqa {{.*#+}} xmm1 = [4,8]
+; X64-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpsubq %xmm1, %xmm0, %xmm0
+; X64-NEXT: vpsllq $20, %xmm0, %xmm0
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; X64-NEXT: retq
+ %1 = ashr <2 x i64> %a0, <i64 61, i64 60>
+ %2 = shl <2 x i64> %1, <i64 20, i64 16>
+ %3 = extractelement <2 x i64> %2, i32 0
+ %4 = sitofp i64 %3 to float
+ ret float %4
+}
+
define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwind {
; X32-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -118,7 +195,7 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
; X32-NEXT: retl
;
; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sarq $30, %rdi
; X64-NEXT: vmovq %rsi, %xmm0
; X64-NEXT: vmovq %rdi, %xmm1
@@ -140,7 +217,7 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: signbits_sext_shuffle_sitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovsxdq %xmm0, %xmm1
; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -153,7 +230,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
; X32-NEXT: retl
;
; X64-LABEL: signbits_sext_shuffle_sitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovsxdq %xmm0, %xmm1
; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -172,7 +249,7 @@ define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1)
define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrad $16, %xmm0, %xmm1
; X32-NEXT: vpsrlq $16, %xmm0, %xmm0
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
@@ -182,7 +259,7 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4
; X32-NEXT: retl
;
; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrad $16, %xmm0, %xmm1
; X64-NEXT: vpsrlq $16, %xmm0, %xmm0
; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
@@ -198,3 +275,189 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4
%6 = sitofp <2 x i64> %5 to <2 x double>
ret <2 x double> %6
}
+
+define float @signbits_ashr_sext_sextinreg_and_extract_sitofp(<2 x i64> %a0, <2 x i64> %a1, i32 %a2) nounwind {
+; X32-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-NEXT: vpsrlq $60, %xmm2, %xmm3
+; X32-NEXT: vpsrlq $61, %xmm2, %xmm2
+; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT: vpsrlq $60, %xmm0, %xmm3
+; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
+; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
+; X32-NEXT: sarl $31, %eax
+; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT: vpsllq $20, %xmm1, %xmm1
+; X32-NEXT: vpsrad $20, %xmm1, %xmm2
+; X32-NEXT: vpsrlq $20, %xmm1, %xmm1
+; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X32-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X32-NEXT: vmovd %xmm0, %eax
+; X32-NEXT: vcvtsi2ssl %eax, %xmm4, %xmm0
+; X32-NEXT: vmovss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlq $60, %xmm0, %xmm2
+; X64-NEXT: vpsrlq $61, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8]
+; X64-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: vpinsrq $0, %rax, %xmm1, %xmm1
+; X64-NEXT: vpsllq $20, %xmm1, %xmm1
+; X64-NEXT: vpsrad $20, %xmm1, %xmm2
+; X64-NEXT: vpsrlq $20, %xmm1, %xmm1
+; X64-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
+; X64-NEXT: retq
+ %1 = ashr <2 x i64> %a0, <i64 61, i64 60>
+ %2 = sext i32 %a2 to i64
+ %3 = insertelement <2 x i64> %a1, i64 %2, i32 0
+ %4 = shl <2 x i64> %3, <i64 20, i64 20>
+ %5 = ashr <2 x i64> %4, <i64 20, i64 20>
+ %6 = and <2 x i64> %1, %5
+ %7 = extractelement <2 x i64> %6, i32 0
+ %8 = sitofp i64 %7 to float
+ ret float %8
+}
+
+define float @signbits_ashr_sextvecinreg_bitops_extract_sitofp(<2 x i64> %a0, <4 x i32> %a1) nounwind {
+; X32-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %eax
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X32-NEXT: vpsrlq $60, %xmm2, %xmm3
+; X32-NEXT: vpsrlq $61, %xmm2, %xmm2
+; X32-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT: vpsrlq $60, %xmm0, %xmm3
+; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
+; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
+; X32-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; X32-NEXT: vpmovsxdq %xmm1, %xmm1
+; X32-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
+; X32-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; X32-NEXT: vmovd %xmm0, %eax
+; X32-NEXT: vcvtsi2ssl %eax, %xmm4, %xmm0
+; X32-NEXT: vmovss %xmm0, (%esp)
+; X32-NEXT: flds (%esp)
+; X32-NEXT: popl %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_ashr_sextvecinreg_bitops_extract_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vpsrlq $60, %xmm0, %xmm2
+; X64-NEXT: vpsrlq $61, %xmm0, %xmm0
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [4,8]
+; X64-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpsubq %xmm2, %xmm0, %xmm0
+; X64-NEXT: vpmovsxdq %xmm1, %xmm1
+; X64-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X64-NEXT: vpor %xmm1, %xmm2, %xmm1
+; X64-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; X64-NEXT: vmovq %xmm0, %rax
+; X64-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0
+; X64-NEXT: retq
+ %1 = ashr <2 x i64> %a0, <i64 61, i64 60>
+ %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = sext <2 x i32> %2 to <2 x i64>
+ %4 = and <2 x i64> %1, %3
+ %5 = or <2 x i64> %4, %3
+ %6 = xor <2 x i64> %5, %1
+ %7 = extractelement <2 x i64> %6, i32 0
+ %8 = sitofp i64 %7 to float
+ ret float %8
+}
+
+define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i32> %a3) nounwind {
+; X32-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
+; X32: # %bb.0:
+; X32-NEXT: pushl %ebp
+; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: andl $-16, %esp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [33,0,63,0,33,0,63,0]
+; X32-NEXT: vextractf128 $1, %ymm3, %xmm4
+; X32-NEXT: vmovdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648]
+; X32-NEXT: vpsrlq %xmm4, %xmm5, %xmm6
+; X32-NEXT: vextractf128 $1, %ymm2, %xmm7
+; X32-NEXT: vpsrlq %xmm4, %xmm7, %xmm4
+; X32-NEXT: vpxor %xmm6, %xmm4, %xmm4
+; X32-NEXT: vpsubq %xmm6, %xmm4, %xmm4
+; X32-NEXT: vpsrlq %xmm3, %xmm5, %xmm5
+; X32-NEXT: vpsrlq %xmm3, %xmm2, %xmm2
+; X32-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; X32-NEXT: vpsubq %xmm5, %xmm2, %xmm2
+; X32-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X32-NEXT: vpmovsxdq 8(%ebp), %xmm3
+; X32-NEXT: vpmovsxdq 16(%ebp), %xmm4
+; X32-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X32-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm5
+; X32-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4
+; X32-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X32-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
+; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT: movl %ebp, %esp
+; X32-NEXT: popl %ebp
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vextractf128 $1, %ymm2, %xmm4
+; X64-NEXT: vpsrlq $63, %xmm4, %xmm5
+; X64-NEXT: vpsrlq $33, %xmm4, %xmm4
+; X64-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
+; X64-NEXT: vmovdqa {{.*#+}} xmm5 = [1073741824,1]
+; X64-NEXT: vpxor %xmm5, %xmm4, %xmm4
+; X64-NEXT: vpsubq %xmm5, %xmm4, %xmm4
+; X64-NEXT: vpsrlq $63, %xmm2, %xmm6
+; X64-NEXT: vpsrlq $33, %xmm2, %xmm2
+; X64-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; X64-NEXT: vpxor %xmm5, %xmm2, %xmm2
+; X64-NEXT: vpsubq %xmm5, %xmm2, %xmm2
+; X64-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X64-NEXT: vpmovsxdq %xmm3, %xmm4
+; X64-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
+; X64-NEXT: vpmovsxdq %xmm3, %xmm3
+; X64-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-NEXT: vextractf128 $1, %ymm1, %xmm4
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm5
+; X64-NEXT: vpcmpeqq %xmm4, %xmm5, %xmm4
+; X64-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; X64-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
+; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %1 = ashr <4 x i64> %a2, <i64 33, i64 63, i64 33, i64 63>
+ %2 = sext <4 x i32> %a3 to <4 x i64>
+ %3 = icmp eq <4 x i64> %a0, %a1
+ %4 = select <4 x i1> %3, <4 x i64> %1, <4 x i64> %2
+ %5 = shufflevector <4 x i64> %4, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+ %6 = sitofp <4 x i64> %5 to <4 x float>
+ ret <4 x float> %6
+}
diff --git a/test/CodeGen/X86/label-annotation.ll b/test/CodeGen/X86/label-annotation.ll
new file mode 100644
index 000000000000..3f359592f951
--- /dev/null
+++ b/test/CodeGen/X86/label-annotation.ll
@@ -0,0 +1,73 @@
+; RUN: llc < %s | FileCheck %s
+; FIXME: fastisel screws up the order here.
+; RUNX: llc -O0 < %s | FileCheck %s
+
+; Source to regenerate:
+; $ clang --target=x86_64-windows-msvc -S annotation.c -g -gcodeview -o t.ll \
+; -emit-llvm -O1 -Xclang -disable-llvm-passes -fms-extensions
+; void g(void);
+; void f(void) {
+; g();
+; __annotation(L"a1", L"a2");
+; g();
+; }
+
+; ModuleID = 'annotation.c'
+source_filename = "annotation.c"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+; Function Attrs: nounwind uwtable
+define void @f() #0 !dbg !8 {
+entry:
+ call void @g(), !dbg !11
+ call void @llvm.codeview.annotation(metadata !12), !dbg !13
+ call void @g(), !dbg !14
+ ret void, !dbg !15
+}
+
+; CHECK-LABEL: f: # @f
+; CHECK: callq g
+; CHECK: .Lannotation0:
+; CHECK: callq g
+; CHECK: retq
+
+; CHECK-LABEL: .short 4423 # Record kind: S_GPROC32_ID
+; CHECK: .short 4121 # Record kind: S_ANNOTATION
+; CHECK-NEXT: .secrel32 .Lannotation0
+; CHECK-NEXT: .secidx .Lannotation0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .asciz "a1"
+; CHECK-NEXT: .asciz "a2"
+
+; CHECK-LABEL: .short 4431 # Record kind: S_PROC_ID_END
+
+declare void @g() #1
+
+; Function Attrs: nounwind
+declare void @llvm.codeview.annotation(metadata) #2
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6}
+!llvm.ident = !{!7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "annotation.c", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "51164221112d8a5baa55a995027e4ba5")
+!2 = !{}
+!3 = !{i32 2, !"CodeView", i32 1}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 2}
+!6 = !{i32 7, !"PIC Level", i32 2}
+!7 = !{!"clang version 6.0.0 "}
+!8 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !9, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !DILocation(line: 3, column: 3, scope: !8)
+!12 = !{!"a1", !"a2"}
+!13 = !DILocation(line: 4, column: 3, scope: !8)
+!14 = !DILocation(line: 5, column: 3, scope: !8)
+!15 = !DILocation(line: 6, column: 1, scope: !8)
diff --git a/test/CodeGen/X86/lakemont.ll b/test/CodeGen/X86/lakemont.ll
index ddd24525f27a..49946890822f 100644
--- a/test/CodeGen/X86/lakemont.ll
+++ b/test/CodeGen/X86/lakemont.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=lakemont | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=lakemont | FileCheck %s
; Make sure -mcpu=lakemont implies soft floats.
define float @test(float %a, float %b) nounwind readnone {
diff --git a/test/CodeGen/X86/large-code-model-isel.ll b/test/CodeGen/X86/large-code-model-isel.ll
index 9edabcd0520d..086fc8ed0794 100644
--- a/test/CodeGen/X86/large-code-model-isel.ll
+++ b/test/CodeGen/X86/large-code-model-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -code-model=large -mcpu=core2 -march=x86-64 -O0 | FileCheck %s
+; RUN: llc < %s -code-model=large -mcpu=core2 -mtriple=x86_64-- -O0 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/CodeGen/X86/large-gep-chain.ll b/test/CodeGen/X86/large-gep-chain.ll
index 8df282983f56..f0f06c12232e 100644
--- a/test/CodeGen/X86/large-gep-chain.ll
+++ b/test/CodeGen/X86/large-gep-chain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march x86 -o /dev/null
+; RUN: llc < %s -O0 -mtriple=i686-- -o /dev/null
; <rdar://problem/12445434>
%0 = type { i32, float* }
diff --git a/test/CodeGen/X86/large-gep-scale.ll b/test/CodeGen/X86/large-gep-scale.ll
index 8e6e4d23a818..10ef094e4be0 100644
--- a/test/CodeGen/X86/large-gep-scale.ll
+++ b/test/CodeGen/X86/large-gep-scale.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; PR5281
; After scaling, this type doesn't fit in memory. Codegen should generate
diff --git a/test/CodeGen/X86/lea-3.ll b/test/CodeGen/X86/lea-3.ll
index a56403a24b03..f32c782c8d7b 100644
--- a/test/CodeGen/X86/lea-3.ll
+++ b/test/CodeGen/X86/lea-3.ll
@@ -1,24 +1,87 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=LNX1
+; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefix=LNX2
+; RUN: llc < %s -mtriple=x86_64-nacl | FileCheck %s --check-prefix=NACL
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=WIN
-; CHECK: leaq (,[[A0:%rdi|%rcx]],4), %rax
define i64 @test2(i64 %a) {
- %tmp2 = shl i64 %a, 2
+; LNX1-LABEL: test2:
+; LNX1: # %bb.0:
+; LNX1-NEXT: leaq (,%rdi,4), %rax
+; LNX1-NEXT: orq %rdi, %rax
+; LNX1-NEXT: retq
+;
+; LNX2-LABEL: test2:
+; LNX2: # %bb.0:
+; LNX2-NEXT: leaq (,%rdi,4), %rax
+; LNX2-NEXT: orq %rdi, %rax
+; LNX2-NEXT: retq
+;
+; NACL-LABEL: test2:
+; NACL: # %bb.0:
+; NACL-NEXT: leaq (,%rdi,4), %rax
+; NACL-NEXT: orq %rdi, %rax
+; NACL-NEXT: retq
+;
+; WIN-LABEL: test2:
+; WIN: # %bb.0:
+; WIN-NEXT: leaq (,%rcx,4), %rax
+; WIN-NEXT: orq %rcx, %rax
+; WIN-NEXT: retq
+ %tmp2 = shl i64 %a, 2
%tmp3 = or i64 %tmp2, %a
- ret i64 %tmp3
+ ret i64 %tmp3
}
-; CHECK: leal ([[A0]],[[A0]],2), %eax
define i32 @test(i32 %a) {
- %tmp2 = mul i32 %a, 3 ; <i32> [#uses=1]
- ret i32 %tmp2
+; LNX1-LABEL: test:
+; LNX1: # %bb.0:
+; LNX1-NEXT: # kill: def %edi killed %edi def %rdi
+; LNX1-NEXT: leal (%rdi,%rdi,2), %eax
+; LNX1-NEXT: retq
+;
+; LNX2-LABEL: test:
+; LNX2: # %bb.0:
+; LNX2-NEXT: # kill: def %edi killed %edi def %rdi
+; LNX2-NEXT: leal (%rdi,%rdi,2), %eax
+; LNX2-NEXT: retq
+;
+; NACL-LABEL: test:
+; NACL: # %bb.0:
+; NACL-NEXT: # kill: def %edi killed %edi def %rdi
+; NACL-NEXT: leal (%rdi,%rdi,2), %eax
+; NACL-NEXT: retq
+;
+; WIN-LABEL: test:
+; WIN: # %bb.0:
+; WIN-NEXT: # kill: def %ecx killed %ecx def %rcx
+; WIN-NEXT: leal (%rcx,%rcx,2), %eax
+; WIN-NEXT: retq
+ %tmp2 = mul i32 %a, 3
+ ret i32 %tmp2
}
-; CHECK: leaq (,[[A0]],8), %rax
define i64 @test3(i64 %a) {
- %tmp2 = shl i64 %a, 3
- ret i64 %tmp2
+; LNX1-LABEL: test3:
+; LNX1: # %bb.0:
+; LNX1-NEXT: leaq (,%rdi,8), %rax
+; LNX1-NEXT: retq
+;
+; LNX2-LABEL: test3:
+; LNX2: # %bb.0:
+; LNX2-NEXT: leaq (,%rdi,8), %rax
+; LNX2-NEXT: retq
+;
+; NACL-LABEL: test3:
+; NACL: # %bb.0:
+; NACL-NEXT: leaq (,%rdi,8), %rax
+; NACL-NEXT: retq
+;
+; WIN-LABEL: test3:
+; WIN: # %bb.0:
+; WIN-NEXT: leaq (,%rcx,8), %rax
+; WIN-NEXT: retq
+ %tmp2 = shl i64 %a, 3
+ ret i64 %tmp2
}
diff --git a/test/CodeGen/X86/lea-opt-cse1.ll b/test/CodeGen/X86/lea-opt-cse1.ll
new file mode 100644
index 000000000000..08241f6b5b86
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-cse1.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @test_func(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr {
+; X64-LABEL: test_func:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl 16(%rdi), %ecx
+; X64-NEXT: leal (%rax,%rcx), %edx
+; X64-NEXT: leal 1(%rax,%rcx), %eax
+; X64-NEXT: movl %eax, 12(%rdi)
+; X64-NEXT: leal 1(%rcx,%rdx), %eax
+; X64-NEXT: movl %eax, 16(%rdi)
+; X64-NEXT: retq
+;
+; X86-LABEL: test_func:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl 16(%eax), %edx
+; X86-NEXT: leal 1(%ecx,%edx), %esi
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal 1(%edx,%ecx), %ecx
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ entry:
+ %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+ %0 = load i32, i32* %h0, align 8
+ %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+ %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+ %1 = load i32, i32* %h4, align 8
+ %add = add i32 %0, 1
+ %add4 = add i32 %add, %1
+ store i32 %add4, i32* %h3, align 4
+ %add29 = add i32 %add4, %1
+ store i32 %add29, i32* %h4, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/lea-opt-cse2.ll b/test/CodeGen/X86/lea-opt-cse2.ll
new file mode 100644
index 000000000000..429a7a5c0c8e
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-cse2.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB0_1: # %loop
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movl 16(%rdi), %ecx
+; X64-NEXT: leal 1(%rax,%rcx), %edx
+; X64-NEXT: movl %edx, 12(%rdi)
+; X64-NEXT: decl %esi
+; X64-NEXT: jne .LBB0_1
+; X64-NEXT: # %bb.2: # %exit
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: leal 1(%rcx,%rax), %eax
+; X64-NEXT: movl %eax, 16(%rdi)
+; X64-NEXT: retq
+;
+; X86-LABEL: foo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: .cfi_offset %esi, -12
+; X86-NEXT: .cfi_offset %edi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB0_1: # %loop
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl 16(%eax), %esi
+; X86-NEXT: leal 1(%edx,%esi), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: decl %ecx
+; X86-NEXT: jne .LBB0_1
+; X86-NEXT: # %bb.2: # %exit
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: leal 1(%esi,%edx), %ecx
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+ entry:
+ br label %loop
+
+ loop:
+ %iter = phi i32 [ %n, %entry ], [ %iter.ctr, %loop ]
+ %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+ %0 = load i32, i32* %h0, align 8
+ %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+ %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+ %1 = load i32, i32* %h4, align 8
+ %add = add i32 %0, 1
+ %add4 = add i32 %add, %1
+ store i32 %add4, i32* %h3, align 4
+ %add29 = add i32 %add4, %1
+ %iter.ctr = sub i32 %iter, 1
+ %res = icmp ne i32 %iter.ctr, 0
+ br i1 %res, label %loop, label %exit
+
+ exit:
+ store i32 %add29, i32* %h4, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/lea-opt-cse3.ll b/test/CodeGen/X86/lea-opt-cse3.ll
new file mode 100644
index 000000000000..d0b5a281186f
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-cse3.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+define i32 @foo(i32 %a, i32 %b) local_unnamed_addr #0 {
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: leal 4(%rdi,%rsi,2), %ecx
+; X64-NEXT: leal 4(%rdi,%rsi,4), %eax
+; X64-NEXT: imull %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: foo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: leal 4(%ecx,%eax,2), %edx
+; X86-NEXT: leal 4(%ecx,%eax,4), %eax
+; X86-NEXT: imull %edx, %eax
+; X86-NEXT: retl
+entry:
+ %mul = shl i32 %b, 1
+ %add = add i32 %a, 4
+ %add1 = add i32 %add, %mul
+ %mul2 = shl i32 %b, 2
+ %add4 = add i32 %add, %mul2
+ %mul5 = mul nsw i32 %add1, %add4
+ ret i32 %mul5
+}
+
+define i32 @foo1(i32 %a, i32 %b) local_unnamed_addr #0 {
+; X64-LABEL: foo1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: leal 4(%rdi,%rsi,4), %ecx
+; X64-NEXT: leal 4(%rdi,%rsi,8), %eax
+; X64-NEXT: imull %ecx, %eax
+; X64-NEXT: retq
+;
+; X86-LABEL: foo1:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: leal 4(%ecx,%eax,4), %edx
+; X86-NEXT: leal 4(%ecx,%eax,8), %eax
+; X86-NEXT: imull %edx, %eax
+; X86-NEXT: retl
+entry:
+ %mul = shl i32 %b, 2
+ %add = add i32 %a, 4
+ %add1 = add i32 %add, %mul
+ %mul2 = shl i32 %b, 3
+ %add4 = add i32 %add, %mul2
+ %mul5 = mul nsw i32 %add1, %add4
+ ret i32 %mul5
+}
+
+define i32 @foo1_mult_basic_blocks(i32 %a, i32 %b) local_unnamed_addr #0 {
+; X64-LABEL: foo1_mult_basic_blocks:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: leal 4(%rdi,%rsi,4), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl $10, %ecx
+; X64-NEXT: je .LBB2_2
+; X64-NEXT: # %bb.1: # %mid
+; X64-NEXT: leal 4(%rdi,%rsi,8), %eax
+; X64-NEXT: imull %eax, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: .LBB2_2: # %exit
+; X64-NEXT: retq
+;
+; X86-LABEL: foo1_mult_basic_blocks:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: leal 4(%esi,%edx,4), %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl $10, %ecx
+; X86-NEXT: je .LBB2_2
+; X86-NEXT: # %bb.1: # %mid
+; X86-NEXT: leal 4(%esi,%edx,8), %eax
+; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB2_2: # %exit
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+entry:
+ %mul = shl i32 %b, 2
+ %add = add i32 %a, 4
+ %add1 = add i32 %add, %mul
+ %cmp = icmp ne i32 %add1, 10
+ br i1 %cmp, label %mid, label %exit
+mid:
+ %addn = add i32 %a, 4
+ %mul2 = shl i32 %b, 3
+ %add4 = add i32 %addn, %mul2
+ %mul5 = mul nsw i32 %add1, %add4
+ br label %exit
+
+exit:
+ %retmul = phi i32 [ %mul5, %mid ], [ 0, %entry ]
+ ret i32 %retmul
+}
+
+define i32 @foo1_mult_basic_blocks_illegal_scale(i32 %a, i32 %b) local_unnamed_addr #0 {
+; X64-LABEL: foo1_mult_basic_blocks_illegal_scale:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: leal 4(%rdi,%rsi,2), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl $10, %ecx
+; X64-NEXT: je .LBB3_2
+; X64-NEXT: # %bb.1: # %mid
+; X64-NEXT: leal 4(%rdi,%rsi,8), %eax
+; X64-NEXT: imull %eax, %ecx
+; X64-NEXT: movl %ecx, %eax
+; X64-NEXT: .LBB3_2: # %exit
+; X64-NEXT: retq
+;
+; X86-LABEL: foo1_mult_basic_blocks_illegal_scale:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: leal 4(%esi,%edx,2), %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl $10, %ecx
+; X86-NEXT: je .LBB3_2
+; X86-NEXT: # %bb.1: # %mid
+; X86-NEXT: leal 4(%esi,%edx,8), %eax
+; X86-NEXT: imull %eax, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB3_2: # %exit
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+entry:
+ %mul = shl i32 %b, 1
+ %add = add i32 %a, 4
+ %add1 = add i32 %add, %mul
+ %cmp = icmp ne i32 %add1, 10
+ br i1 %cmp, label %mid, label %exit
+mid:
+ %addn = add i32 %a, 4
+ %mul2 = shl i32 %b, 3
+ %add4 = add i32 %addn, %mul2
+ %mul5 = mul nsw i32 %add1, %add4
+ br label %exit
+exit:
+ %retmul = phi i32 [ %mul5, %mid ], [ 0, %entry ]
+ ret i32 %retmul
+}
diff --git a/test/CodeGen/X86/lea-opt-cse4.ll b/test/CodeGen/X86/lea-opt-cse4.ll
new file mode 100644
index 000000000000..a295ac7129c2
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-cse4.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+
+%struct.SA = type { i32, i32, i32, i32, i32 }
+
+define void @foo(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl 16(%rdi), %eax
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: leal (%rcx,%rax), %edx
+; X64-NEXT: leal 1(%rax,%rcx), %ecx
+; X64-NEXT: movl %ecx, 12(%rdi)
+; X64-NEXT: leal 1(%rax,%rdx), %eax
+; X64-NEXT: movl %eax, 16(%rdi)
+; X64-NEXT: retq
+;
+; X86-LABEL: foo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 16(%eax), %ecx
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: leal 1(%ecx,%edx), %esi
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: leal 1(%ecx,%edx), %ecx
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+ entry:
+ %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+ %0 = load i32, i32* %h0, align 8
+ %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+ %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+ %1 = load i32, i32* %h4, align 8
+ %add = add i32 %0, 1
+ %add1 = add i32 %add, %1
+ %add2 = add i32 %add1, %1
+ %add3 = add i32 %add2, %1
+ %add4 = add i32 %add3, %1
+ store i32 %add4, i32* %h3, align 4
+ %add29 = add i32 %add4, %1
+ store i32 %add29, i32* %h4, align 8
+ ret void
+}
+
+
+
+define void @foo_loop(%struct.SA* nocapture %ctx, i32 %n) local_unnamed_addr #0 {
+; X64-LABEL: foo_loop:
+; X64: # %bb.0: # %entry
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB1_1: # %loop
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: movl 16(%rdi), %eax
+; X64-NEXT: leal 1(%rcx,%rax), %edx
+; X64-NEXT: movl %edx, 12(%rdi)
+; X64-NEXT: decl %esi
+; X64-NEXT: jne .LBB1_1
+; X64-NEXT: # %bb.2: # %exit
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: leal 1(%rax,%rcx), %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: movl %ecx, 16(%rdi)
+; X64-NEXT: retq
+;
+; X86-LABEL: foo_loop:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: .cfi_offset %esi, -12
+; X86-NEXT: .cfi_offset %edi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB1_1: # %loop
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl (%eax), %esi
+; X86-NEXT: movl 16(%eax), %ecx
+; X86-NEXT: leal 1(%esi,%ecx), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: decl %edx
+; X86-NEXT: jne .LBB1_1
+; X86-NEXT: # %bb.2: # %exit
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: leal 1(%ecx,%esi), %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %edx, 16(%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+ entry:
+ br label %loop
+
+ loop:
+ %iter = phi i32 [ %n, %entry ], [ %iter.ctr, %loop ]
+ %h0 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 0
+ %0 = load i32, i32* %h0, align 8
+ %h3 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 3
+ %h4 = getelementptr inbounds %struct.SA, %struct.SA* %ctx, i64 0, i32 4
+ %1 = load i32, i32* %h4, align 8
+ %add = add i32 %0, 1
+ %add4 = add i32 %add, %1
+ store i32 %add4, i32* %h3, align 4
+ %add291 = add i32 %add4, %1
+ %add292 = add i32 %add291, %1
+ %add293 = add i32 %add292, %1
+ %add294 = add i32 %add293, %1
+ %add295 = add i32 %add294, %1
+ %add296 = add i32 %add295, %1
+ %add29 = add i32 %add296, %1
+ %iter.ctr = sub i32 %iter, 1
+ %res = icmp ne i32 %iter.ctr, 0
+ br i1 %res, label %loop, label %exit
+
+ exit:
+ store i32 %add29, i32* %h4, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/lea-opt-memop-check-1.ll b/test/CodeGen/X86/lea-opt-memop-check-1.ll
index 630df25d4009..6ad55d42868b 100644
--- a/test/CodeGen/X86/lea-opt-memop-check-1.ll
+++ b/test/CodeGen/X86/lea-opt-memop-check-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
; PR26575
; Assertion `(Disp->isImm() || Disp->isGlobal()) && (Other.Disp->isImm() || Other.Disp->isGlobal()) && "Address displacement operand is always an immediate or a global"' failed.
diff --git a/test/CodeGen/X86/lea-opt-with-debug.mir b/test/CodeGen/X86/lea-opt-with-debug.mir
index 03a745888b5a..61e406985d7c 100644
--- a/test/CodeGen/X86/lea-opt-with-debug.mir
+++ b/test/CodeGen/X86/lea-opt-with-debug.mir
@@ -14,22 +14,22 @@
@d = common local_unnamed_addr global i32 0, align 4
@b = common local_unnamed_addr global i32 0, align 4
- define i32 @fn1() local_unnamed_addr !dbg !9 {
- %1 = load %struct.A*, %struct.A** @c, align 8, !dbg !14
- %2 = load i32, i32* @a, align 4, !dbg !14
- %3 = sext i32 %2 to i64, !dbg !14
- %4 = getelementptr inbounds %struct.A, %struct.A* %1, i64 %3, !dbg !14
- %5 = ptrtoint %struct.A* %4 to i64, !dbg !14
- %6 = trunc i64 %5 to i32, !dbg !14
- store i32 %6, i32* @d, align 4, !dbg !14
- %7 = getelementptr inbounds %struct.A, %struct.A* %1, i64 %3, i32 2, !dbg !15
- tail call void @llvm.dbg.value(metadata i32* %7, i64 0, metadata !12, metadata !16), !dbg !17
- br label %8, !dbg !18
+ define i32 @fn1() local_unnamed_addr !dbg !8 {
+ %1 = load %struct.A*, %struct.A** @c, align 8, !dbg !13
+ %2 = load i32, i32* @a, align 4, !dbg !13
+ %3 = sext i32 %2 to i64, !dbg !13
+ %4 = getelementptr inbounds %struct.A, %struct.A* %1, i64 %3, !dbg !13
+ %5 = ptrtoint %struct.A* %4 to i64, !dbg !13
+ %6 = trunc i64 %5 to i32, !dbg !13
+ store i32 %6, i32* @d, align 4, !dbg !13
+ %7 = getelementptr inbounds %struct.A, %struct.A* %1, i64 %3, i32 2, !dbg !14
+ tail call void @llvm.dbg.value(metadata i32* %7, i64 0, metadata !11, metadata !DIExpression()), !dbg !15
+ br label %8, !dbg !16
; <label>:8: ; preds = %8, %0
- %9 = load i32, i32* %7, align 4, !dbg !19
- store i32 %9, i32* @d, align 4, !dbg !19
- br label %8, !dbg !20
+ %9 = load i32, i32* %7, align 4, !dbg !17
+ store i32 %9, i32* @d, align 4, !dbg !17
+ br label %8, !dbg !18
}
; Function Attrs: nounwind readnone
@@ -39,7 +39,6 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5, !6, !7}
- !misc = !{!8}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2)
!1 = !DIFile(filename: "test.c", directory: "")
@@ -49,19 +48,17 @@
!5 = !{i32 2, !"Dwarf Version", i32 4}
!6 = !{i32 2, !"Debug Info Version", i32 3}
!7 = !{i32 1, !"PIC Level", i32 2}
- !8 = !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value)
- !9 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 7, type: !10, isLocal: false, isDefinition: true, scopeLine: 7, isOptimized: true, unit: !0, variables: !11)
- !10 = !DISubroutineType(types: !3)
- !11 = !{!12}
- !12 = !DILocalVariable(name: "e", scope: !9, file: !1, line: 8, type: !13)
- !13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64)
- !14 = !DILocation(line: 9, scope: !9)
- !15 = !DILocation(line: 10, scope: !9)
- !16 = !DIExpression()
- !17 = !DILocation(line: 8, scope: !9)
- !18 = !DILocation(line: 11, scope: !9)
- !19 = !DILocation(line: 13, scope: !9)
- !20 = !DILocation(line: 14, scope: !9)
+ !8 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 7, type: !9, isLocal: false, isDefinition: true, scopeLine: 7, isOptimized: true, unit: !0, variables: !10)
+ !9 = !DISubroutineType(types: !3)
+ !10 = !{!11}
+ !11 = !DILocalVariable(name: "e", scope: !8, file: !1, line: 8, type: !12)
+ !12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64)
+ !13 = !DILocation(line: 9, scope: !8)
+ !14 = !DILocation(line: 10, scope: !8)
+ !15 = !DILocation(line: 8, scope: !8)
+ !16 = !DILocation(line: 11, scope: !8)
+ !17 = !DILocation(line: 13, scope: !8)
+ !18 = !DILocation(line: 14, scope: !8)
...
---
@@ -98,28 +95,28 @@ body: |
bb.0 (%ir-block.0):
successors: %bb.1(0x80000000)
- ; CHECK: %3 = LEA64r %2, 2, %2, 0, _, debug-location !14
- ; CHECK-NEXT: %4 = LEA64r %1, 4, %3, 0, _, debug-location !14
- ; CHECK-NOT: %0 = LEA64r %1, 4, %3, 8, _, debug-location !15
- ; CHECK: DBG_VALUE debug-use %4, debug-use _, !12, !8, debug-location !17
+ ; CHECK: %3:gr64_nosp = LEA64r %2, 2, %2, 0, %noreg, debug-location !13
+ ; CHECK-NEXT: %4:gr64 = LEA64r %1, 4, %3, 0, %noreg, debug-location !13
+ ; CHECK-NOT: %0:gr64 = LEA64r %1, 4, %3, 8, %noreg, debug-location !14
+ ; CHECK: DBG_VALUE debug-use %4, debug-use %noreg, !11, !DIExpression(DW_OP_plus_uconst, 8, DW_OP_stack_value), debug-location !15
- %1 = MOV64rm %rip, 1, _, @c, _, debug-location !14 :: (dereferenceable load 8 from @c)
- %2 = MOVSX64rm32 %rip, 1, _, @a, _, debug-location !14 :: (dereferenceable load 4 from @a)
- %3 = LEA64r %2, 2, %2, 0, _, debug-location !14
- %4 = LEA64r %1, 4, %3, 0, _, debug-location !14
- %5 = COPY %4.sub_32bit, debug-location !14
- MOV32mr %rip, 1, _, @d, _, killed %5, debug-location !14 :: (store 4 into @d)
- %0 = LEA64r %1, 4, %3, 8, _, debug-location !15
- DBG_VALUE debug-use %0, debug-use _, !12, !16, debug-location !17
+ %1 = MOV64rm %rip, 1, %noreg, @c, %noreg, debug-location !13 :: (dereferenceable load 8 from @c)
+ %2 = MOVSX64rm32 %rip, 1, %noreg, @a, %noreg, debug-location !13 :: (dereferenceable load 4 from @a)
+ %3 = LEA64r %2, 2, %2, 0, %noreg, debug-location !13
+ %4 = LEA64r %1, 4, %3, 0, %noreg, debug-location !13
+ %5 = COPY %4.sub_32bit, debug-location !13
+ MOV32mr %rip, 1, %noreg, @d, %noreg, killed %5, debug-location !13 :: (store 4 into @d)
+ %0 = LEA64r %1, 4, %3, 8, %noreg, debug-location !14
+ DBG_VALUE debug-use %0, debug-use %noreg, !11, !DIExpression(), debug-location !15
; CHECK-LABEL: bb.1 (%ir-block.8):
- ; CHECK: %6 = MOV32rm %4, 1, _, 8, _, debug-location !19 :: (load 4 from %ir.7)
+ ; CHECK: %6:gr32 = MOV32rm %4, 1, %noreg, 8, %noreg, debug-location !17 :: (load 4 from %ir.7)
bb.1 (%ir-block.8):
successors: %bb.1(0x80000000)
- %6 = MOV32rm %0, 1, _, 0, _, debug-location !19 :: (load 4 from %ir.7)
- MOV32mr %rip, 1, _, @d, _, killed %6, debug-location !19 :: (store 4 into @d)
- JMP_1 %bb.1, debug-location !20
+ %6 = MOV32rm %0, 1, %noreg, 0, %noreg, debug-location !17 :: (load 4 from %ir.7)
+ MOV32mr %rip, 1, %noreg, @d, %noreg, killed %6, debug-location !17 :: (store 4 into @d)
+ JMP_1 %bb.1, debug-location !18
...
diff --git a/test/CodeGen/X86/lea-recursion.ll b/test/CodeGen/X86/lea-recursion.ll
index 55bcd7819c37..5bba1141e3a6 100644
--- a/test/CodeGen/X86/lea-recursion.ll
+++ b/test/CodeGen/X86/lea-recursion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep lea | count 13
+; RUN: llc < %s -mtriple=x86_64-- | grep lea | count 13
; This testcase was written to demonstrate an instruction-selection problem,
; however it also happens to expose a limitation in the DAGCombiner's
diff --git a/test/CodeGen/X86/lea32-schedule.ll b/test/CodeGen/X86/lea32-schedule.ll
new file mode 100644
index 000000000000..b89ba4a3d0e7
--- /dev/null
+++ b/test/CodeGen/X86/lea32-schedule.ll
@@ -0,0 +1,825 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i32 @test_lea_offset(i32) {
+; GENERIC-LABEL: test_lea_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal -24(%rdi), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal -24(%rdi), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = add nsw i32 %0, -24
+ ret i32 %2
+}
+
+define i32 @test_lea_offset_big(i32) {
+; GENERIC-LABEL: test_lea_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal 1024(%rdi), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal 1024(%rdi), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = add nsw i32 %0, 1024
+ ret i32 %2
+}
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @test_lea_add(i32, i32) {
+; GENERIC-LABEL: test_lea_add:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %esi killed %esi def %rsi
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %esi killed %esi def %rsi
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal (%rdi,%rsi), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %esi killed %esi def %rsi
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal (%rdi,%rsi), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %esi killed %esi def %rsi
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %esi killed %esi def %rsi
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %esi killed %esi def %rsi
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %esi killed %esi def %rsi
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = add nsw i32 %1, %0
+ ret i32 %3
+}
+
+define i32 @test_lea_add_offset(i32, i32) {
+; GENERIC-LABEL: test_lea_add_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %esi killed %esi def %rsi
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; GENERIC-NEXT: addl $16, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %esi killed %esi def %rsi
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %esi killed %esi def %rsi
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %esi killed %esi def %rsi
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; SANDY-NEXT: addl $16, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: addl $16, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl $16, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %esi killed %esi def %rsi
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl $16, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %esi killed %esi def %rsi
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %esi killed %esi def %rsi
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = add i32 %0, 16
+ %4 = add i32 %3, %1
+ ret i32 %4
+}
+
+define i32 @test_lea_add_offset_big(i32, i32) {
+; GENERIC-LABEL: test_lea_add_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %esi killed %esi def %rsi
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; GENERIC-NEXT: addl $-4096, %eax # imm = 0xF000
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %esi killed %esi def %rsi
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %esi killed %esi def %rsi
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %esi killed %esi def %rsi
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; SANDY-NEXT: addl $-4096, %eax # imm = 0xF000
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: addl $-4096, %eax # imm = 0xF000
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl $-4096, %eax # imm = 0xF000
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %esi killed %esi def %rsi
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl $-4096, %eax # imm = 0xF000
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %esi killed %esi def %rsi
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %esi killed %esi def %rsi
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = add i32 %0, -4096
+ %4 = add i32 %3, %1
+ ret i32 %4
+}
+
+define i32 @test_lea_mul(i32) {
+; GENERIC-LABEL: test_lea_mul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_mul:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_mul:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_mul:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_mul:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_mul:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_mul:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_mul:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_mul:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = mul nsw i32 %0, 3
+ ret i32 %2
+}
+
+define i32 @test_lea_mul_offset(i32) {
+; GENERIC-LABEL: test_lea_mul_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; GENERIC-NEXT: addl $-32, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_mul_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_mul_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_mul_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; SANDY-NEXT: addl $-32, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_mul_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; HASWELL-NEXT: addl $-32, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_mul_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl $-32, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_mul_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl $-32, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_mul_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_mul_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = mul nsw i32 %0, 3
+ %3 = add nsw i32 %2, -32
+ ret i32 %3
+}
+
+define i32 @test_lea_mul_offset_big(i32) {
+; GENERIC-LABEL: test_lea_mul_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; GENERIC-NEXT: addl $10000, %eax # imm = 0x2710
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_mul_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_mul_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_mul_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; SANDY-NEXT: addl $10000, %eax # imm = 0x2710
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_mul_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; HASWELL-NEXT: addl $10000, %eax # imm = 0x2710
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_mul_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl $10000, %eax # imm = 0x2710
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_mul_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl $10000, %eax # imm = 0x2710
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_mul_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_mul_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = mul nsw i32 %0, 9
+ %3 = add nsw i32 %2, 10000
+ ret i32 %3
+}
+
+define i32 @test_lea_add_scale(i32, i32) {
+; GENERIC-LABEL: test_lea_add_scale:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %esi killed %esi def %rsi
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_scale:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %esi killed %esi def %rsi
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_scale:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %esi killed %esi def %rsi
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_scale:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %esi killed %esi def %rsi
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_scale:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_scale:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_scale:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %esi killed %esi def %rsi
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_scale:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %esi killed %esi def %rsi
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_scale:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %esi killed %esi def %rsi
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = shl i32 %1, 1
+ %4 = add nsw i32 %3, %0
+ ret i32 %4
+}
+
+define i32 @test_lea_add_scale_offset(i32, i32) {
+; GENERIC-LABEL: test_lea_add_scale_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %esi killed %esi def %rsi
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
+; GENERIC-NEXT: addl $96, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_scale_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %esi killed %esi def %rsi
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_scale_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %esi killed %esi def %rsi
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_scale_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %esi killed %esi def %rsi
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
+; SANDY-NEXT: addl $96, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_scale_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
+; HASWELL-NEXT: addl $96, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_scale_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl $96, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_scale_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %esi killed %esi def %rsi
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rsi,4), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl $96, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_scale_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %esi killed %esi def %rsi
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_scale_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %esi killed %esi def %rsi
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = shl i32 %1, 2
+ %4 = add i32 %0, 96
+ %5 = add i32 %4, %3
+ ret i32 %5
+}
+
+define i32 @test_lea_add_scale_offset_big(i32, i32) {
+; GENERIC-LABEL: test_lea_add_scale_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: # kill: def %esi killed %esi def %rsi
+; GENERIC-NEXT: # kill: def %edi killed %edi def %rdi
+; GENERIC-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
+; GENERIC-NEXT: addl $-1200, %eax # imm = 0xFB50
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_scale_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: # kill: def %esi killed %esi def %rsi
+; ATOM-NEXT: # kill: def %edi killed %edi def %rdi
+; ATOM-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_scale_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: # kill: def %esi killed %esi def %rsi
+; SLM-NEXT: # kill: def %edi killed %edi def %rdi
+; SLM-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_scale_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: # kill: def %esi killed %esi def %rsi
+; SANDY-NEXT: # kill: def %edi killed %edi def %rdi
+; SANDY-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
+; SANDY-NEXT: addl $-1200, %eax # imm = 0xFB50
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_scale_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; HASWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; HASWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
+; HASWELL-NEXT: addl $-1200, %eax # imm = 0xFB50
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_scale_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: # kill: def %esi killed %esi def %rsi
+; BROADWELL-NEXT: # kill: def %edi killed %edi def %rdi
+; BROADWELL-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: addl $-1200, %eax # imm = 0xFB50
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_scale_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: # kill: def %esi killed %esi def %rsi
+; SKYLAKE-NEXT: # kill: def %edi killed %edi def %rdi
+; SKYLAKE-NEXT: leal (%rdi,%rsi,8), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: addl $-1200, %eax # imm = 0xFB50
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_scale_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: # kill: def %esi killed %esi def %rsi
+; BTVER2-NEXT: # kill: def %edi killed %edi def %rdi
+; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_scale_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: # kill: def %esi killed %esi def %rsi
+; ZNVER1-NEXT: # kill: def %edi killed %edi def %rdi
+; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = shl i32 %1, 3
+ %4 = add i32 %0, -1200
+ %5 = add i32 %4, %3
+ ret i32 %5
+}
diff --git a/test/CodeGen/X86/lea64-schedule.ll b/test/CodeGen/X86/lea64-schedule.ll
new file mode 100644
index 000000000000..549d002ae6cc
--- /dev/null
+++ b/test/CodeGen/X86/lea64-schedule.ll
@@ -0,0 +1,672 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i64 @test_lea_offset(i64) {
+; GENERIC-LABEL: test_lea_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq -24(%rdi), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq -24(%rdi), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = add nsw i64 %0, -24
+ ret i64 %2
+}
+
+define i64 @test_lea_offset_big(i64) {
+; GENERIC-LABEL: test_lea_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq 1024(%rdi), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq 1024(%rdi), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = add nsw i64 %0, 1024
+ ret i64 %2
+}
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i64 @test_lea_add(i64, i64) {
+; GENERIC-LABEL: test_lea_add:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = add nsw i64 %1, %0
+ ret i64 %3
+}
+
+define i64 @test_lea_add_offset(i64, i64) {
+; GENERIC-LABEL: test_lea_add_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: addq $16, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; SANDY-NEXT: addq $16, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: addq $16, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq $16, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq $16, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = add i64 %0, 16
+ %4 = add i64 %3, %1
+ ret i64 %4
+}
+
+define i64 @test_lea_add_offset_big(i64, i64) {
+; GENERIC-LABEL: test_lea_add_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: addq $-4096, %rax # imm = 0xF000
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; SANDY-NEXT: addq $-4096, %rax # imm = 0xF000
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: addq $-4096, %rax # imm = 0xF000
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq $-4096, %rax # imm = 0xF000
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq $-4096, %rax # imm = 0xF000
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = add i64 %0, -4096
+ %4 = add i64 %3, %1
+ ret i64 %4
+}
+
+define i64 @test_lea_mul(i64) {
+; GENERIC-LABEL: test_lea_mul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_mul:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_mul:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_mul:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_mul:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_mul:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_mul:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_mul:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_mul:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = mul nsw i64 %0, 3
+ ret i64 %2
+}
+
+define i64 @test_lea_mul_offset(i64) {
+; GENERIC-LABEL: test_lea_mul_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; GENERIC-NEXT: addq $-32, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_mul_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_mul_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_mul_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; SANDY-NEXT: addq $-32, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_mul_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; HASWELL-NEXT: addq $-32, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_mul_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq $-32, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_mul_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq $-32, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_mul_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_mul_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = mul nsw i64 %0, 3
+ %3 = add nsw i64 %2, -32
+ ret i64 %3
+}
+
+define i64 @test_lea_mul_offset_big(i64) {
+; GENERIC-LABEL: test_lea_mul_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; GENERIC-NEXT: addq $10000, %rax # imm = 0x2710
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_mul_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_mul_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_mul_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; SANDY-NEXT: addq $10000, %rax # imm = 0x2710
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_mul_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; HASWELL-NEXT: addq $10000, %rax # imm = 0x2710
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_mul_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq $10000, %rax # imm = 0x2710
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_mul_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq $10000, %rax # imm = 0x2710
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_mul_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_mul_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %2 = mul nsw i64 %0, 9
+ %3 = add nsw i64 %2, 10000
+ ret i64 %3
+}
+
+define i64 @test_lea_add_scale(i64, i64) {
+; GENERIC-LABEL: test_lea_add_scale:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_scale:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_scale:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_scale:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_scale:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_scale:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_scale:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_scale:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_scale:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = shl i64 %1, 1
+ %4 = add nsw i64 %3, %0
+ ret i64 %4
+}
+
+define i64 @test_lea_add_scale_offset(i64, i64) {
+; GENERIC-LABEL: test_lea_add_scale_offset:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; GENERIC-NEXT: addq $96, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_scale_offset:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_scale_offset:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_scale_offset:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; SANDY-NEXT: addq $96, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_scale_offset:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; HASWELL-NEXT: addq $96, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_scale_offset:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq $96, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_scale_offset:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rsi,4), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq $96, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_scale_offset:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_scale_offset:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = shl i64 %1, 2
+ %4 = add i64 %0, 96
+ %5 = add i64 %4, %3
+ ret i64 %5
+}
+
+define i64 @test_lea_add_scale_offset_big(i64, i64) {
+; GENERIC-LABEL: test_lea_add_scale_offset_big:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
+; GENERIC-NEXT: addq $-1200, %rax # imm = 0xFB50
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lea_add_scale_offset_big:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lea_add_scale_offset_big:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lea_add_scale_offset_big:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
+; SANDY-NEXT: addq $-1200, %rax # imm = 0xFB50
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lea_add_scale_offset_big:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
+; HASWELL-NEXT: addq $-1200, %rax # imm = 0xFB50
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lea_add_scale_offset_big:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: addq $-1200, %rax # imm = 0xFB50
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lea_add_scale_offset_big:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi,%rsi,8), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: addq $-1200, %rax # imm = 0xFB50
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lea_add_scale_offset_big:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lea_add_scale_offset_big:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %3 = shl i64 %1, 3
+ %4 = add i64 %0, -1200
+ %5 = add i64 %4, %3
+ ret i64 %5
+}
diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir
index e3986e47df4d..d8e52802f56e 100644
--- a/test/CodeGen/X86/leaFixup32.mir
+++ b/test/CodeGen/X86/leaFixup32.mir
@@ -107,7 +107,7 @@ body: |
; CHECK: %eax = ADD32rr %eax, killed %ebp
; CHECK: %eax = ADD32ri8 %eax, -5
- %eax = LEA32r killed %eax, 1, killed %ebp, -5, _
+ %eax = LEA32r killed %eax, 1, killed %ebp, -5, %noreg
RETQ %eax
...
@@ -142,7 +142,7 @@ body: |
; CHECK: %ebp = ADD32rr %ebp, killed %eax
; CHECK: %ebp = ADD32ri8 %ebp, -5
- %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _
+ %ebp = LEA32r killed %ebp, 1, killed %eax, -5, %noreg
RETQ %ebp
...
@@ -176,7 +176,7 @@ body: |
liveins: %eax, %ebp
; CHECK: %ebp = ADD32rr %ebp, killed %eax
- %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _
+ %ebp = LEA32r killed %ebp, 1, killed %eax, 0, %noreg
RETQ %ebp
...
@@ -212,7 +212,7 @@ body: |
; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0
; CHECK: %ebx = ADD32ri8 %ebx, -5
- %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _
+ %ebx = LEA32r killed %eax, 1, killed %ebp, -5, %noreg
RETQ %ebx
...
@@ -245,10 +245,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, %noreg
; CHECK: %ebx = ADD32ri8 %ebx, -5
- %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _
+ %ebx = LEA32r killed %ebp, 1, killed %eax, -5, %noreg
RETQ %ebx
...
@@ -281,9 +281,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp
- ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _
+ ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, %noreg
- %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _
+ %ebx = LEA32r killed %ebp, 1, killed %eax, 0, %noreg
RETQ %ebx
...
@@ -318,7 +318,7 @@ body: |
; CHECK: %eax = ADD32rr %eax, killed %ebp
; CHECK: %eax = ADD32ri %eax, 129
- %eax = LEA32r killed %eax, 1, killed %ebp, 129, _
+ %eax = LEA32r killed %eax, 1, killed %ebp, 129, %noreg
RETQ %eax
...
@@ -354,7 +354,7 @@ body: |
; CHECK: %ebx = MOV32rr %ebp
; CHECK: %ebx = ADD32rr %ebx, %ebp
- %ebx = LEA32r %ebp, 1, %ebp, 0, _
+ %ebx = LEA32r %ebp, 1, %ebp, 0, %noreg
RETQ %ebx
...
@@ -386,10 +386,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r _, 1, %ebp, 5, _
+ ; CHECK: %ebx = LEA32r %noreg, 1, %ebp, 5, %noreg
; CHECK: %ebx = ADD32rr %ebx, %ebp
- %ebx = LEA32r %ebp, 1, %ebp, 5, _
+ %ebx = LEA32r %ebp, 1, %ebp, 5, %noreg
RETQ %ebx
...
@@ -421,10 +421,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r _, 4, %ebp, 5, _
+ ; CHECK: %ebx = LEA32r %noreg, 4, %ebp, 5, %noreg
; CHECK: %ebx = ADD32rr %ebx, %ebp
- %ebx = LEA32r %ebp, 4, %ebp, 5, _
+ %ebx = LEA32r %ebp, 4, %ebp, 5, %noreg
RETQ %ebx
...
@@ -456,9 +456,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp, %ebx
- ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
+ ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, %noreg
- %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _
+ %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, %noreg
RETQ %ebp
...
@@ -490,17 +490,17 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
- ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _
+ ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, %noreg
+ ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, %noreg
; CHECK: %ebp = ADD32ri8 %ebp, 5
CMP32rr %eax, killed %ebx, implicit-def %eflags
- %ebx = LEA32r killed %eax, 4, killed %eax, 5, _
+ %ebx = LEA32r killed %eax, 4, killed %eax, 5, %noreg
JE_1 %bb.1, implicit %eflags
RETQ %ebx
bb.1:
liveins: %eax, %ebp, %ebx
- %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _
+ %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, %noreg
RETQ %ebp
...
diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir
index b35dee181a47..ad86d4ba27f6 100644
--- a/test/CodeGen/X86/leaFixup64.mir
+++ b/test/CodeGen/X86/leaFixup64.mir
@@ -180,7 +180,7 @@ body: |
; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
; CHECK: %eax = ADD32ri8 %eax, -5
- %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _
+ %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, %noreg
RETQ %eax
...
@@ -215,7 +215,7 @@ body: |
; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
; CHECK: %ebp = ADD32ri8 %ebp, -5
- %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _
+ %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, %noreg
RETQ %ebp
...
@@ -249,7 +249,7 @@ body: |
liveins: %rax, %rbp
; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0
- %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _
+ %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, %noreg
RETQ %ebp
...
@@ -284,7 +284,7 @@ body: |
; CHECK: %rax = ADD64rr %rax, killed %rbp
; CHECK: %rax = ADD64ri8 %rax, -5
- %rax = LEA64r killed %rax, 1, killed %rbp, -5, _
+ %rax = LEA64r killed %rax, 1, killed %rbp, -5, %noreg
RETQ %eax
...
@@ -319,7 +319,7 @@ body: |
; CHECK: %rbp = ADD64rr %rbp, killed %rax
; CHECK: %rbp = ADD64ri8 %rbp, -5
- %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _
+ %rbp = LEA64r killed %rbp, 1, killed %rax, -5, %noreg
RETQ %ebp
...
@@ -353,7 +353,7 @@ body: |
liveins: %rax, %rbp
; CHECK: %rbp = ADD64rr %rbp, killed %rax
- %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _
+ %rbp = LEA64r killed %rbp, 1, killed %rax, 0, %noreg
RETQ %ebp
...
@@ -386,10 +386,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, %noreg
; CHECK: %ebx = ADD32ri8 %ebx, -5
- %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _
+ %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, %noreg
RETQ %ebx
...
@@ -422,10 +422,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, %noreg
; CHECK: %ebx = ADD32ri8 %ebx, -5
- %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _
+ %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, %noreg
RETQ %ebx
...
@@ -458,9 +458,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp
- ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, %noreg
- %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _
+ %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, %noreg
RETQ %ebx
...
@@ -493,10 +493,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, %noreg
; CHECK: %rbx = ADD64ri8 %rbx, -5
- %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _
+ %rbx = LEA64r killed %rax, 1, killed %rbp, -5, %noreg
RETQ %ebx
...
@@ -529,10 +529,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, %noreg
; CHECK: %rbx = ADD64ri8 %rbx, -5
- %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _
+ %rbx = LEA64r killed %rbp, 1, killed %rax, -5, %noreg
RETQ %ebx
...
@@ -565,9 +565,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp
- ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _
+ ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, %noreg
- %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _
+ %rbx = LEA64r killed %rbp, 1, killed %rax, 0, %noreg
RETQ %ebx
...
@@ -599,11 +599,11 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rdi, %rbp
- ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _
+ ; CHECK: %r12 = LEA64r %noreg, 2, killed %r13, 5, %noreg
; CHECK: %r12 = ADD64rr %r12, killed %rbp
%rbp = KILL %rbp, implicit-def %rbp
%r13 = KILL %rdi, implicit-def %r13
- %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _
+ %r12 = LEA64r killed %rbp, 2, killed %r13, 5, %noreg
RETQ %r12
...
@@ -638,7 +638,7 @@ body: |
; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0
; CHECK: %eax = ADD32ri %eax, 129
- %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _
+ %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, %noreg
RETQ %eax
...
@@ -670,9 +670,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, %noreg
- %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _
+ %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, %noreg
RETQ %ebx
...
@@ -704,9 +704,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, %noreg
- %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _
+ %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, %noreg
RETQ %ebx
...
@@ -738,9 +738,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %eax, %ebp, %ebx
- ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
+ ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, %noreg
- %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _
+ %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, %noreg
RETQ %ebx
...
@@ -775,7 +775,7 @@ body: |
; CHECK: %rax = ADD64rr %rax, killed %rbp
; CHECK: %rax = ADD64ri32 %rax, 129
- %rax = LEA64r killed %rax, 1, killed %rbp, 129, _
+ %rax = LEA64r killed %rax, 1, killed %rbp, 129, %noreg
RETQ %eax
...
@@ -810,7 +810,7 @@ body: |
; CHECK: %rbx = MOV64rr %rbp
; CHECK: %rbx = ADD64rr %rbx, %rbp
- %rbx = LEA64r %rbp, 1, %rbp, 0, _
+ %rbx = LEA64r %rbp, 1, %rbp, 0, %noreg
RETQ %ebx
...
@@ -842,10 +842,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r _, 1, %rbp, 5, _
+ ; CHECK: %rbx = LEA64r %noreg, 1, %rbp, 5, %noreg
; CHECK: %rbx = ADD64rr %rbx, %rbp
- %rbx = LEA64r %rbp, 1, %rbp, 5, _
+ %rbx = LEA64r %rbp, 1, %rbp, 5, %noreg
RETQ %ebx
...
@@ -877,10 +877,10 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r _, 4, %rbp, 5, _
+ ; CHECK: %rbx = LEA64r %noreg, 4, %rbp, 5, %noreg
; CHECK: %rbx = ADD64rr %rbx, %rbp
- %rbx = LEA64r %rbp, 4, %rbp, 5, _
+ %rbx = LEA64r %rbp, 4, %rbp, 5, %noreg
RETQ %ebx
...
@@ -912,9 +912,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
+ ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, %noreg
- %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _
+ %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, %noreg
RETQ %ebp
...
@@ -946,17 +946,17 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
- ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _
+ ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, %noreg
+ ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, %noreg
; CHECK: %rbp = ADD64ri8 %rbp, 5
CMP64rr %rax, killed %rbx, implicit-def %eflags
- %rbx = LEA64r killed %rax, 4, killed %rax, 5, _
+ %rbx = LEA64r killed %rax, 4, killed %rax, 5, %noreg
JE_1 %bb.1, implicit %eflags
RETQ %ebx
bb.1:
liveins: %rax, %rbp, %rbx
- %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _
+ %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, %noreg
RETQ %ebp
...
@@ -988,9 +988,9 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
+ ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, %noreg
- %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _
+ %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, %noreg
RETQ %ebp
...
@@ -1022,17 +1022,17 @@ frameInfo:
body: |
bb.0 (%ir-block.0):
liveins: %rax, %rbp, %rbx
- ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
- ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _
+ ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, %noreg
+ ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, %noreg
; CHECK: %ebp = ADD32ri8 %ebp, 5
CMP64rr %rax, killed %rbx, implicit-def %eflags
- %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _
+ %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, %noreg
JE_1 %bb.1, implicit %eflags
RETQ %ebx
bb.1:
liveins: %rax, %rbp, %rbx
- %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _
+ %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, %noreg
RETQ %ebp
...
diff --git a/test/CodeGen/X86/legalize-fmp-oeq-vector-select.ll b/test/CodeGen/X86/legalize-fmp-oeq-vector-select.ll
index 6a8c154a1bbe..0906773145be 100644
--- a/test/CodeGen/X86/legalize-fmp-oeq-vector-select.ll
+++ b/test/CodeGen/X86/legalize-fmp-oeq-vector-select.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -enable-legalize-types-checking < %s
+; RUN: llc -mtriple=x86_64-- -enable-legalize-types-checking < %s
; PR5092
define <4 x float> @bug(float %a) nounwind {
diff --git a/test/CodeGen/X86/legalize-libcalls.ll b/test/CodeGen/X86/legalize-libcalls.ll
index 879dc98ab20d..f05ab61814a9 100644
--- a/test/CodeGen/X86/legalize-libcalls.ll
+++ b/test/CodeGen/X86/legalize-libcalls.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86 < %s
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -mtriple=i686-- < %s
+; RUN: llc -mtriple=x86_64-- < %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll
index 3ad6cad32d83..05fad9c61326 100644
--- a/test/CodeGen/X86/legalize-shift-64.ll
+++ b/test/CodeGen/X86/legalize-shift-64.ll
@@ -3,7 +3,7 @@
define i64 @test1(i32 %xx, i32 %test) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: andb $7, %cl
@@ -22,7 +22,7 @@ define i64 @test1(i32 %xx, i32 %test) nounwind {
define i64 @test2(i64 %xx, i32 %test) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -41,7 +41,7 @@ define i64 @test2(i64 %xx, i32 %test) nounwind {
define i64 @test3(i64 %xx, i32 %test) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
@@ -57,7 +57,7 @@ define i64 @test3(i64 %xx, i32 %test) nounwind {
define i64 @test4(i64 %xx, i32 %test) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
@@ -74,26 +74,18 @@ define i64 @test4(i64 %xx, i32 %test) nounwind {
; PR14668
define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 12
; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: .Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 20
-; CHECK-NEXT: .Lcfi4:
; CHECK-NEXT: .cfi_offset %esi, -20
-; CHECK-NEXT: .Lcfi5:
; CHECK-NEXT: .cfi_offset %edi, -16
-; CHECK-NEXT: .Lcfi6:
; CHECK-NEXT: .cfi_offset %ebx, -12
-; CHECK-NEXT: .Lcfi7:
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
@@ -105,7 +97,7 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-NEXT: testb $32, %cl
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
; CHECK-NEXT: je .LBB4_2
-; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movl %edi, %esi
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .LBB4_2:
@@ -116,7 +108,7 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK-NEXT: shldl %cl, %edx, %ebp
; CHECK-NEXT: testb $32, %cl
; CHECK-NEXT: je .LBB4_4
-; CHECK-NEXT: # BB#3:
+; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: movl %ebx, %ebp
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: .LBB4_4:
@@ -136,14 +128,11 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; PR16108
define i32 @test6() {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: .Lcfi8:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: .Lcfi9:
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl %esp, %ebp
-; CHECK-NEXT: .Lcfi10:
; CHECK-NEXT: .cfi_def_cfa_register %ebp
; CHECK-NEXT: andl $-8, %esp
; CHECK-NEXT: subl $16, %esp
@@ -155,7 +144,7 @@ define i32 @test6() {
; CHECK-NEXT: movb $32, %dl
; CHECK-NEXT: testb %dl, %dl
; CHECK-NEXT: jne .LBB5_2
-; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: .LBB5_2:
; CHECK-NEXT: sete %cl
@@ -163,7 +152,7 @@ define i32 @test6() {
; CHECK-NEXT: xorl $1, %eax
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: je .LBB5_5
-; CHECK-NEXT: # BB#3: # %if.then
+; CHECK-NEXT: # %bb.3: # %if.then
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: jmp .LBB5_4
; CHECK-NEXT: .LBB5_5: # %if.end
diff --git a/test/CodeGen/X86/legalize-shl-vec.ll b/test/CodeGen/X86/legalize-shl-vec.ll
index 30b89f2a855f..a6238f26cbb6 100644
--- a/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/test/CodeGen/X86/legalize-shl-vec.ll
@@ -4,7 +4,7 @@
define <2 x i256> @test_shl(<2 x i256> %In) {
; X32-LABEL: test_shl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl $0, 60(%eax)
; X32-NEXT: movl $0, 56(%eax)
@@ -25,15 +25,12 @@ define <2 x i256> @test_shl(<2 x i256> %In) {
; X32-NEXT: retl $4
;
; X64-LABEL: test_shl:
-; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
%Amt = insertelement <2 x i256> undef, i256 -1, i32 0
@@ -43,7 +40,7 @@ define <2 x i256> @test_shl(<2 x i256> %In) {
define <2 x i256> @test_srl(<2 x i256> %In) {
; X32-LABEL: test_srl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl $0, 60(%eax)
; X32-NEXT: movl $0, 56(%eax)
@@ -64,15 +61,12 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
; X32-NEXT: retl $4
;
; X64-LABEL: test_srl:
-; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
%Amt = insertelement <2 x i256> undef, i256 -1, i32 0
@@ -82,7 +76,7 @@ define <2 x i256> @test_srl(<2 x i256> %In) {
define <2 x i256> @test_sra(<2 x i256> %In) {
; X32-LABEL: test_sra:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, 60(%eax)
@@ -113,7 +107,7 @@ define <2 x i256> @test_sra(<2 x i256> %In) {
; X32-NEXT: retl $4
;
; X64-LABEL: test_sra:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
diff --git a/test/CodeGen/X86/legalizedag_vec.ll b/test/CodeGen/X86/legalizedag_vec.ll
index dff693120fb6..e15e39c5c0ba 100644
--- a/test/CodeGen/X86/legalizedag_vec.ll
+++ b/test/CodeGen/X86/legalizedag_vec.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=sse2 | FileCheck %s
; Test case for r63760 where we generate a legalization assert that an illegal
diff --git a/test/CodeGen/X86/libcall-sret.ll b/test/CodeGen/X86/libcall-sret.ll
index 4ef0a78ad798..3c484afb0b6b 100644
--- a/test/CodeGen/X86/libcall-sret.ll
+++ b/test/CodeGen/X86/libcall-sret.ll
@@ -22,7 +22,7 @@ define void @test_sret_libcall(i128 %l, i128 %r) {
; CHECK: pushl 72(%esp)
; CHECK: pushl [[SRET_ADDR]]
-; CHECK: calll __multi3
+; CHECK: calll __udivti3
; CHECK: addl $44, %esp
; CHECK-DAG: movl 8(%esp), [[RES0:%[a-z]+]]
@@ -33,7 +33,7 @@ define void @test_sret_libcall(i128 %l, i128 %r) {
; CHECK-DAG: movl [[RES1]], var+4
; CHECK-DAG: movl [[RES2]], var+8
; CHECK-DAG: movl [[RES3]], var+12
- %prod = mul i128 %l, %r
- store i128 %prod, i128* @var
+ %quot = udiv i128 %l, %r
+ store i128 %quot, i128* @var
ret void
}
diff --git a/test/CodeGen/X86/licm-nested.ll b/test/CodeGen/X86/licm-nested.ll
index 63e3c5c3b6b2..c029508bb7a7 100644
--- a/test/CodeGen/X86/licm-nested.ll
+++ b/test/CodeGen/X86/licm-nested.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc -mtriple=x86_64-apple-darwin -march=x86-64 < %s -o /dev/null -stats -info-output-file - | grep "hoisted out of loops" | grep 5
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o /dev/null -stats -info-output-file - | grep "hoisted out of loops" | grep 5
; MachineLICM should be able to hoist the symbolic addresses out of
; the inner loops.
diff --git a/test/CodeGen/X86/limited-prec.ll b/test/CodeGen/X86/limited-prec.ll
index 7bf4ac28fdf9..07291f854660 100644
--- a/test/CodeGen/X86/limited-prec.ll
+++ b/test/CodeGen/X86/limited-prec.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -limit-float-precision=6 -march=x86 | \
+; RUN: llc < %s -limit-float-precision=6 -mtriple=i686-- | \
; RUN: not grep exp | not grep log | not grep pow
-; RUN: llc < %s -limit-float-precision=12 -march=x86 | \
+; RUN: llc < %s -limit-float-precision=12 -mtriple=i686-- | \
; RUN: not grep exp | not grep log | not grep pow
-; RUN: llc < %s -limit-float-precision=18 -march=x86 | \
+; RUN: llc < %s -limit-float-precision=18 -mtriple=i686-- | \
; RUN: not grep exp | not grep log | not grep pow
define float @f1(float %x) nounwind noinline {
diff --git a/test/CodeGen/X86/linux-preemption.ll b/test/CodeGen/X86/linux-preemption.ll
new file mode 100644
index 000000000000..ab1ac2f27bc8
--- /dev/null
+++ b/test/CodeGen/X86/linux-preemption.ll
@@ -0,0 +1,225 @@
+; RUN: llc -mtriple x86_64-pc-linux \
+; RUN: -relocation-model=static < %s | FileCheck --check-prefix=STATIC %s
+; RUN: llc -mtriple x86_64-pc-linux \
+; RUN: -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -mtriple x86_64-pc-linux \
+; RUN: -relocation-model=dynamic-no-pic < %s | FileCheck %s
+
+; 32 bits
+
+; RUN: llc -mtriple i386-pc-linux \
+; RUN: -relocation-model=pic < %s | FileCheck --check-prefix=CHECK32 %s
+
+; globals
+
+@strong_default_global = global i32 42
+define i32* @get_strong_default_global() {
+ ret i32* @strong_default_global
+}
+; CHECK: movq strong_default_global@GOTPCREL(%rip), %rax
+; STATIC: movl $strong_default_global, %eax
+; CHECK32: movl strong_default_global@GOT(%eax), %eax
+
+@weak_default_global = weak global i32 42
+define i32* @get_weak_default_global() {
+ ret i32* @weak_default_global
+}
+; CHECK: movq weak_default_global@GOTPCREL(%rip), %rax
+; STATIC: movl $weak_default_global, %eax
+; CHECK32: movl weak_default_global@GOT(%eax), %eax
+
+@external_default_global = external global i32
+define i32* @get_external_default_global() {
+ ret i32* @external_default_global
+}
+; CHECK: movq external_default_global@GOTPCREL(%rip), %rax
+; STATIC: movl $external_default_global, %eax
+; CHECK32: movl external_default_global@GOT(%eax), %eax
+
+@strong_local_global = dso_local global i32 42
+define i32* @get_strong_local_global() {
+ ret i32* @strong_local_global
+}
+; CHECK: leaq strong_local_global(%rip), %rax
+; STATIC: movl $strong_local_global, %eax
+; CHECK32: leal strong_local_global@GOTOFF(%eax), %eax
+
+@weak_local_global = weak dso_local global i32 42
+define i32* @get_weak_local_global() {
+ ret i32* @weak_local_global
+}
+; CHECK: leaq weak_local_global(%rip), %rax
+; STATIC: movl $weak_local_global, %eax
+; CHECK32: leal weak_local_global@GOTOFF(%eax), %eax
+
+@external_local_global = external dso_local global i32
+define i32* @get_external_local_global() {
+ ret i32* @external_local_global
+}
+; CHECK: leaq external_local_global(%rip), %rax
+; STATIC: movl $external_local_global, %eax
+; CHECK32: leal external_local_global@GOTOFF(%eax), %eax
+
+
+@strong_preemptable_global = dso_preemptable global i32 42
+define i32* @get_strong_preemptable_global() {
+ ret i32* @strong_preemptable_global
+}
+; CHECK: movq strong_preemptable_global@GOTPCREL(%rip), %rax
+; STATIC: movl $strong_preemptable_global, %eax
+; CHECK32: movl strong_preemptable_global@GOT(%eax), %eax
+
+@weak_preemptable_global = weak dso_preemptable global i32 42
+define i32* @get_weak_preemptable_global() {
+ ret i32* @weak_preemptable_global
+}
+; CHECK ;ADD_LABEL_BACK; movq weak_preemptable_global@GOTPCREL(%rip), %rax
+; STATIC ;ADD_LABEL_BACK; movq weak_preemptable_global@GOTPCREL, %rax
+; CHECK32 ;ADD_LABEL_BACK; movl weak_preemptable_global@GOT(%eax), %eax
+
+@external_preemptable_global = external dso_preemptable global i32
+define i32* @get_external_preemptable_global() {
+ ret i32* @external_preemptable_global
+}
+; CHECK: movq external_preemptable_global@GOTPCREL(%rip), %rax
+; STATIC: movl $external_preemptable_global, %eax
+; CHECK32: movl external_preemptable_global@GOT(%eax), %eax
+
+; aliases
+@aliasee = global i32 42
+
+@strong_default_alias = alias i32, i32* @aliasee
+define i32* @get_strong_default_alias() {
+ ret i32* @strong_default_alias
+}
+; CHECK: movq strong_default_alias@GOTPCREL(%rip), %rax
+; STATIC: movl $strong_default_alias, %eax
+; CHECK32: movl strong_default_alias@GOT(%eax), %eax
+
+@weak_default_alias = weak alias i32, i32* @aliasee
+define i32* @get_weak_default_alias() {
+ ret i32* @weak_default_alias
+}
+; CHECK: movq weak_default_alias@GOTPCREL(%rip), %rax
+; STATIC: movl $weak_default_alias, %eax
+; CHECK32: movl weak_default_alias@GOT(%eax), %eax
+
+@strong_local_alias = dso_local alias i32, i32* @aliasee
+define i32* @get_strong_local_alias() {
+ ret i32* @strong_local_alias
+}
+; CHECK: leaq strong_local_alias(%rip), %rax
+; STATIC: movl $strong_local_alias, %eax
+; CHECK32: leal strong_local_alias@GOTOFF(%eax), %eax
+
+@weak_local_alias = weak dso_local alias i32, i32* @aliasee
+define i32* @get_weak_local_alias() {
+ ret i32* @weak_local_alias
+}
+; CHECK: leaq weak_local_alias(%rip), %rax
+; STATIC: movl $weak_local_alias, %eax
+; CHECK32: leal weak_local_alias@GOTOFF(%eax), %eax
+
+
+@strong_preemptable_alias = dso_preemptable alias i32, i32* @aliasee
+define i32* @get_strong_preemptable_alias() {
+ ret i32* @strong_preemptable_alias
+}
+; CHECK: movq strong_preemptable_alias@GOTPCREL(%rip), %rax
+; STATIC: movl $strong_preemptable_alias, %eax
+; CHECK32: movl strong_preemptable_alias@GOT(%eax), %eax
+
+@weak_preemptable_alias = weak dso_preemptable alias i32, i32* @aliasee
+define i32* @get_weak_preemptable_alias() {
+ ret i32* @weak_preemptable_alias
+}
+; CHECK: movq weak_preemptable_alias@GOTPCREL(%rip), %rax
+; STATIC: movl $weak_preemptable_alias, %eax
+; CHECK32: movl weak_preemptable_alias@GOT(%eax), %eax
+
+; functions
+
+define void @strong_default_function() {
+ ret void
+}
+define void()* @get_strong_default_function() {
+ ret void()* @strong_default_function
+}
+; CHECK: movq strong_default_function@GOTPCREL(%rip), %rax
+; STATIC: movl $strong_default_function, %eax
+; CHECK32: movl strong_default_function@GOT(%eax), %eax
+
+define weak void @weak_default_function() {
+ ret void
+}
+define void()* @get_weak_default_function() {
+ ret void()* @weak_default_function
+}
+; CHECK: movq weak_default_function@GOTPCREL(%rip), %rax
+; STATIC: movl $weak_default_function, %eax
+; CHECK32: movl weak_default_function@GOT(%eax), %eax
+
+declare void @external_default_function()
+define void()* @get_external_default_function() {
+ ret void()* @external_default_function
+}
+; CHECK: movq external_default_function@GOTPCREL(%rip), %rax
+; STATIC: movl $external_default_function, %eax
+; CHECK32: movl external_default_function@GOT(%eax), %eax
+
+define dso_local void @strong_local_function() {
+ ret void
+}
+define void()* @get_strong_local_function() {
+ ret void()* @strong_local_function
+}
+; CHECK: leaq strong_local_function(%rip), %rax
+; STATIC: movl $strong_local_function, %eax
+; CHECK32: leal strong_local_function@GOTOFF(%eax), %eax
+
+define weak dso_local void @weak_local_function() {
+ ret void
+}
+define void()* @get_weak_local_function() {
+ ret void()* @weak_local_function
+}
+; CHECK: leaq weak_local_function(%rip), %rax
+; STATIC: movl $weak_local_function, %eax
+; CHECK32: leal weak_local_function@GOTOFF(%eax), %eax
+
+declare dso_local void @external_local_function()
+define void()* @get_external_local_function() {
+ ret void()* @external_local_function
+}
+; CHECK: leaq external_local_function(%rip), %rax
+; STATIC: movl $external_local_function, %eax
+; CHECK32: leal external_local_function@GOTOFF(%eax), %eax
+
+
+define dso_preemptable void @strong_preemptable_function() {
+ ret void
+}
+define void()* @get_strong_preemptable_function() {
+ ret void()* @strong_preemptable_function
+}
+; CHECK: movq strong_preemptable_function@GOTPCREL(%rip), %rax
+; STATIC: movl $strong_preemptable_function, %eax
+; CHECK32: movl strong_preemptable_function@GOT(%eax), %eax
+
+define weak dso_preemptable void @weak_preemptable_function() {
+ ret void
+}
+define void()* @get_weak_preemptable_function() {
+ ret void()* @weak_preemptable_function
+}
+; CHECK: movq weak_preemptable_function@GOTPCREL(%rip), %rax
+; STATIC: movl $weak_preemptable_function, %eax
+; CHECK32: movl weak_preemptable_function@GOT(%eax), %eax
+
+declare dso_preemptable void @external_preemptable_function()
+define void()* @get_external_preemptable_function() {
+ ret void()* @external_preemptable_function
+}
+; CHECK: movq external_preemptable_function@GOTPCREL(%rip), %rax
+; STATIC: movl $external_preemptable_function, %eax
+; CHECK32: movl external_preemptable_function@GOT(%eax), %eax
diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll
index 283ee3ae71a8..e4644665d65f 100644
--- a/test/CodeGen/X86/live-out-reg-info.ll
+++ b/test/CodeGen/X86/live-out-reg-info.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86-64 | grep testb
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; Make sure dagcombine doesn't eliminate the comparison due
; to an off-by-one bug with computeKnownBits information.
@@ -6,6 +7,18 @@
declare void @qux()
define void @foo(i32 %a) {
+; CHECK-LABEL: foo:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: shrl $23, %edi
+; CHECK-NEXT: btl $8, %edi
+; CHECK-NEXT: jb .LBB0_2
+; CHECK-NEXT: # %bb.1: # %true
+; CHECK-NEXT: callq qux
+; CHECK-NEXT: .LBB0_2: # %false
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
%t0 = lshr i32 %a, 23
br label %next
next:
@@ -18,3 +31,4 @@ true:
false:
ret void
}
+
diff --git a/test/CodeGen/X86/live-range-nosubreg.ll b/test/CodeGen/X86/live-range-nosubreg.ll
index 899a375221c4..d5226e67ee47 100644
--- a/test/CodeGen/X86/live-range-nosubreg.ll
+++ b/test/CodeGen/X86/live-range-nosubreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s
+; RUN: llc < %s
; This testcase used to crash. See PR29132.
diff --git a/test/CodeGen/X86/liveness-local-regalloc.ll b/test/CodeGen/X86/liveness-local-regalloc.ll
index 0954f9d5dd47..2eb5cc580daf 100644
--- a/test/CodeGen/X86/liveness-local-regalloc.ll
+++ b/test/CodeGen/X86/liveness-local-regalloc.ll
@@ -61,8 +61,8 @@ infloop1: ; preds = %infloop1, %bb5
}
-; RAFast would forget to add a super-register <imp-def> when rewriting:
-; %vreg10:sub_32bit<def,read-undef> = COPY %R9D<kill>
+; RAFast would forget to add a super-register implicit-def when rewriting:
+; %10:sub_32bit<def,read-undef> = COPY killed %R9D
; This trips up the machine code verifier.
define void @autogen_SD24657(i8*, i32*, i64*, i32, i64, i8) {
BB:
diff --git a/test/CodeGen/X86/llc-override-mcpu-mattr.ll b/test/CodeGen/X86/llc-override-mcpu-mattr.ll
index 19a5ed591867..293ceee3be90 100644
--- a/test/CodeGen/X86/llc-override-mcpu-mattr.ll
+++ b/test/CodeGen/X86/llc-override-mcpu-mattr.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march x86-64 -mcpu=broadwell | FileCheck %s
-; RUN: llc < %s -march x86-64 -mattr=+avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=broadwell | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s
; Check that llc can override function attributes target-cpu and target-features
; using command line options -mcpu and -mattr.
diff --git a/test/CodeGen/X86/load-combine-dbg.ll b/test/CodeGen/X86/load-combine-dbg.ll
new file mode 100644
index 000000000000..59e6e1ac39f5
--- /dev/null
+++ b/test/CodeGen/X86/load-combine-dbg.ll
@@ -0,0 +1,37 @@
+; RUN: llc -O0 < %s -mtriple=x86_64-unknown | FileCheck %s
+; This was extracted from a Swift debugger stepping testcase and checks that the
+; fold (zext (load x)) -> (zext (truncate (zextload x)))
+; rule propagates the SDLoc of the load to the zextload.
+
+; CHECK: .loc {{.*}} main.swift:100
+; CHECK-NOT: .loc
+; CHECK: .loc {{.*}} main.swift:200
+; CHECK-NOT: .loc
+; CHECK: .loc {{.*}} main.swift:300
+; CHECK-NOT: .loc
+declare void @foo(double)
+
+define i32 @zext_load(i32* %arg) !dbg !30 {
+ %1 = bitcast i32* %arg to i8*
+ %2 = getelementptr inbounds i8, i8* %1, i32 1
+ %3 = load i8, i8* %2, align 1, !dbg !100
+ %4 = uitofp i8 %3 to double, !dbg !200
+ call void @foo(double %4), !dbg !200
+ %5 = zext i8 %3 to i32, !dbg !300
+ ret i32 %5
+}
+!llvm.dbg.cu = !{!1}
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_Swift, file: !3, isOptimized: false, emissionKind: FullDebug)
+!2 = !DIModule(scope: null, name: "test", includePath: "", isysroot: "/")
+!3 = !DIFile(filename: "main.swift", directory: "/")
+
+!30 = distinct !DISubprogram(name: "main", scope: !2, file: !3, line: 1, type: !31, isLocal: false, isDefinition: true, isOptimized: false, unit: !1)
+!31 = !DISubroutineType(types: !32)
+!32 = !{}
+
+!100 = !DILocation(line: 100, scope: !30)
+!200 = !DILocation(line: 200, scope: !30)
+!300 = !DILocation(line: 300, scope: !30)
diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll
index e737a51cf405..c943b6d5ed73 100644
--- a/test/CodeGen/X86/load-combine.ll
+++ b/test/CodeGen/X86/load-combine.ll
@@ -8,13 +8,13 @@
; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
define i32 @load_i32_by_i8(i32* %arg) {
; CHECK-LABEL: load_i32_by_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl (%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -42,26 +42,26 @@ define i32 @load_i32_by_i8(i32* %arg) {
; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
define i32 @load_i32_by_i8_bswap(i32* %arg) {
; BSWAP-LABEL: load_i32_by_i8_bswap:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl (%eax), %eax
; BSWAP-NEXT: bswapl %eax
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i32_by_i8_bswap:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; MOVBE-NEXT: movbel (%eax), %eax
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i32_by_i8_bswap:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movl (%rdi), %eax
; BSWAP64-NEXT: bswapl %eax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i32_by_i8_bswap:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movbel (%rdi), %eax
; MOVBE64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -89,13 +89,13 @@ define i32 @load_i32_by_i8_bswap(i32* %arg) {
; (i32) p[0] | ((i32) p[1] << 16)
define i32 @load_i32_by_i16(i32* %arg) {
; CHECK-LABEL: load_i32_by_i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i16:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl (%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i16*
@@ -114,13 +114,13 @@ define i32 @load_i32_by_i16(i32* %arg) {
; (i32) p_16[0] | ((i32) p[2] << 16) | ((i32) p[3] << 24)
define i32 @load_i32_by_i16_i8(i32* %arg) {
; CHECK-LABEL: load_i32_by_i16_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i16_i8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl (%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i16*
@@ -145,13 +145,13 @@ define i32 @load_i32_by_i16_i8(i32* %arg) {
; (i32) ((i16) p[0] | ((i16) p[1] << 8)) | (((i32) ((i16) p[3] | ((i16) p[4] << 8)) << 16)
define i32 @load_i32_by_i16_by_i8(i32* %arg) {
; CHECK-LABEL: load_i32_by_i16_by_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i16_by_i8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl (%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -181,26 +181,26 @@ define i32 @load_i32_by_i16_by_i8(i32* %arg) {
; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4])
define i32 @load_i32_by_i16_by_i8_bswap(i32* %arg) {
; BSWAP-LABEL: load_i32_by_i16_by_i8_bswap:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl (%eax), %eax
; BSWAP-NEXT: bswapl %eax
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i32_by_i16_by_i8_bswap:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; MOVBE-NEXT: movbel (%eax), %eax
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i32_by_i16_by_i8_bswap:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movl (%rdi), %eax
; BSWAP64-NEXT: bswapl %eax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i32_by_i16_by_i8_bswap:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movbel (%rdi), %eax
; MOVBE64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -230,14 +230,14 @@ define i32 @load_i32_by_i16_by_i8_bswap(i32* %arg) {
; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
define i64 @load_i64_by_i8(i64* %arg) {
; CHECK-LABEL: load_i64_by_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl (%ecx), %eax
; CHECK-NEXT: movl 4(%ecx), %edx
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i64_by_i8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movq (%rdi), %rax
; CHECK64-NEXT: retq
%tmp = bitcast i64* %arg to i8*
@@ -285,7 +285,7 @@ define i64 @load_i64_by_i8(i64* %arg) {
; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
define i64 @load_i64_by_i8_bswap(i64* %arg) {
; BSWAP-LABEL: load_i64_by_i8_bswap:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl (%eax), %edx
; BSWAP-NEXT: movl 4(%eax), %eax
@@ -294,20 +294,20 @@ define i64 @load_i64_by_i8_bswap(i64* %arg) {
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i64_by_i8_bswap:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; MOVBE-NEXT: movbel 4(%ecx), %eax
; MOVBE-NEXT: movbel (%ecx), %edx
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i64_by_i8_bswap:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movq (%rdi), %rax
; BSWAP64-NEXT: bswapq %rax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i64_by_i8_bswap:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movbeq (%rdi), %rax
; MOVBE64-NEXT: retq
%tmp = bitcast i64* %arg to i8*
@@ -358,11 +358,9 @@ define i64 @load_i64_by_i8_bswap(i64* %arg) {
; x | res
define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
; CHECK-LABEL: load_i32_by_i8_bswap_uses:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_offset %esi, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl (%eax), %ecx
@@ -381,7 +379,7 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %eax
; CHECK64-NEXT: shll $24, %eax
; CHECK64-NEXT: movzbl 1(%rdi), %ecx
@@ -424,7 +422,7 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
; ((i32) p0 << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
define i32 @load_i32_by_i8_bswap_volatile(i32* %arg) {
; CHECK-LABEL: load_i32_by_i8_bswap_volatile:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl (%eax), %ecx
; CHECK-NEXT: shll $24, %ecx
@@ -439,7 +437,7 @@ define i32 @load_i32_by_i8_bswap_volatile(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_volatile:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %eax
; CHECK64-NEXT: shll $24, %eax
; CHECK64-NEXT: movzbl 1(%rdi), %ecx
@@ -480,11 +478,9 @@ define i32 @load_i32_by_i8_bswap_volatile(i32* %arg) {
; res1 | res2
define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
; CHECK-LABEL: load_i32_by_i8_bswap_store_in_between:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: .Lcfi3:
; CHECK-NEXT: .cfi_offset %esi, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -503,7 +499,7 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %eax
; CHECK64-NEXT: shll $24, %eax
; CHECK64-NEXT: movzbl 1(%rdi), %ecx
@@ -544,7 +540,7 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
; ((i32) p[0] << 24) | ((i32) q[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) {
; CHECK-LABEL: load_i32_by_i8_bswap_unrelated_load:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movzbl (%ecx), %edx
@@ -560,7 +556,7 @@ define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_bswap_unrelated_load:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %eax
; CHECK64-NEXT: shll $24, %eax
; CHECK64-NEXT: movzbl 1(%rsi), %ecx
@@ -599,13 +595,13 @@ define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) {
; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl 1(%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_nonzero_offset:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl 1(%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -634,13 +630,13 @@ define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
; CHECK-LABEL: load_i32_by_i8_neg_offset:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl -4(%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_neg_offset:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl -4(%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -669,26 +665,26 @@ define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
; BSWAP-LABEL: load_i32_by_i8_nonzero_offset_bswap:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl 1(%eax), %eax
; BSWAP-NEXT: bswapl %eax
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i32_by_i8_nonzero_offset_bswap:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; MOVBE-NEXT: movbel 1(%eax), %eax
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i32_by_i8_nonzero_offset_bswap:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movl 1(%rdi), %eax
; BSWAP64-NEXT: bswapl %eax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i32_by_i8_nonzero_offset_bswap:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movbel 1(%rdi), %eax
; MOVBE64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -717,26 +713,26 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
; BSWAP-LABEL: load_i32_by_i8_neg_offset_bswap:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl -4(%eax), %eax
; BSWAP-NEXT: bswapl %eax
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i32_by_i8_neg_offset_bswap:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; MOVBE-NEXT: movbel -4(%eax), %eax
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i32_by_i8_neg_offset_bswap:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movl -4(%rdi), %eax
; BSWAP64-NEXT: bswapl %eax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i32_by_i8_neg_offset_bswap:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movbel -4(%rdi), %eax
; MOVBE64-NEXT: retq
%tmp = bitcast i32* %arg to i8*
@@ -765,7 +761,7 @@ define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
; BSWAP-LABEL: load_i32_by_i8_bswap_base_index_offset:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %ecx
; BSWAP-NEXT: movl (%ecx,%eax), %eax
@@ -773,21 +769,21 @@ define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i32_by_i8_bswap_base_index_offset:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; MOVBE-NEXT: movbel (%ecx,%eax), %eax
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i32_by_i8_bswap_base_index_offset:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movslq %esi, %rax
; BSWAP64-NEXT: movl (%rdi,%rax), %eax
; BSWAP64-NEXT: bswapl %eax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i32_by_i8_bswap_base_index_offset:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movslq %esi, %rax
; MOVBE64-NEXT: movbel (%rdi,%rax), %eax
; MOVBE64-NEXT: retq
@@ -819,14 +815,14 @@ define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
; Verify that we don't crash handling shl i32 %conv57, 32
define void @shift_i32_by_32(i8* %src1, i8* %src2, i64* %dst) {
; CHECK-LABEL: shift_i32_by_32:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl $-1, 4(%eax)
; CHECK-NEXT: movl $-1, (%eax)
; CHECK-NEXT: retl
;
; CHECK64-LABEL: shift_i32_by_32:
-; CHECK64: # BB#0: # %entry
+; CHECK64: # %bb.0: # %entry
; CHECK64-NEXT: movq $-1, (%rdx)
; CHECK64-NEXT: retq
entry:
@@ -850,26 +846,26 @@ declare i16 @llvm.bswap.i16(i16)
; (i32) bswap(p[1]) | (i32) bswap(p[0] << 16)
define i32 @load_i32_by_bswap_i16(i32* %arg) {
; BSWAP-LABEL: load_i32_by_bswap_i16:
-; BSWAP: # BB#0:
+; BSWAP: # %bb.0:
; BSWAP-NEXT: movl {{[0-9]+}}(%esp), %eax
; BSWAP-NEXT: movl (%eax), %eax
; BSWAP-NEXT: bswapl %eax
; BSWAP-NEXT: retl
;
; MOVBE-LABEL: load_i32_by_bswap_i16:
-; MOVBE: # BB#0:
+; MOVBE: # %bb.0:
; MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax
; MOVBE-NEXT: movbel (%eax), %eax
; MOVBE-NEXT: retl
;
; BSWAP64-LABEL: load_i32_by_bswap_i16:
-; BSWAP64: # BB#0:
+; BSWAP64: # %bb.0:
; BSWAP64-NEXT: movl (%rdi), %eax
; BSWAP64-NEXT: bswapl %eax
; BSWAP64-NEXT: retq
;
; MOVBE64-LABEL: load_i32_by_bswap_i16:
-; MOVBE64: # BB#0:
+; MOVBE64: # %bb.0:
; MOVBE64-NEXT: movbel (%rdi), %eax
; MOVBE64-NEXT: retq
%tmp = bitcast i32* %arg to i16*
@@ -889,13 +885,13 @@ define i32 @load_i32_by_bswap_i16(i32* %arg) {
; (i32) p[0] | (sext(p[1] << 16) to i32)
define i32 @load_i32_by_sext_i16(i32* %arg) {
; CHECK-LABEL: load_i32_by_sext_i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_sext_i16:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl (%rdi), %eax
; CHECK64-NEXT: retq
%tmp = bitcast i32* %arg to i16*
@@ -914,14 +910,14 @@ define i32 @load_i32_by_sext_i16(i32* %arg) {
; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24)
define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 12(%eax,%ecx), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_base_offset_index:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %esi, %eax
; CHECK64-NEXT: movl 12(%rdi,%rax), %eax
; CHECK64-NEXT: retq
@@ -959,14 +955,14 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 13(%eax,%ecx), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %esi, %eax
; CHECK64-NEXT: movl 13(%rdi,%rax), %eax
; CHECK64-NEXT: retq
@@ -1015,14 +1011,14 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
; to zext and aext loads.
define i32 @load_i32_by_i8_zaext_loads(i8* %arg, i32 %arg1) {
; CHECK-LABEL: load_i32_by_i8_zaext_loads:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 12(%eax,%ecx), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_zaext_loads:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %esi, %eax
; CHECK64-NEXT: movl 12(%rdi,%rax), %eax
; CHECK64-NEXT: retq
@@ -1071,14 +1067,14 @@ define i32 @load_i32_by_i8_zaext_loads(i8* %arg, i32 %arg1) {
; (i32) p0[12] | ((i32) p1[12] << 8) | ((i32) p2[12] << 16) | ((i32) p3[12] << 24)
define i32 @load_i32_by_i8_zsext_loads(i8* %arg, i32 %arg1) {
; CHECK-LABEL: load_i32_by_i8_zsext_loads:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl 12(%eax,%ecx), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: load_i32_by_i8_zsext_loads:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %esi, %eax
; CHECK64-NEXT: movl 12(%rdi,%rax), %eax
; CHECK64-NEXT: retq
@@ -1119,7 +1115,7 @@ define i32 @load_i32_by_i8_zsext_loads(i8* %arg, i32 %arg1) {
; (i32) p[0] | ((i32) p[1] << 8)
define i32 @zext_load_i32_by_i8(i32* %arg) {
; CHECK-LABEL: zext_load_i32_by_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl (%eax), %ecx
; CHECK-NEXT: movzbl 1(%eax), %eax
@@ -1128,7 +1124,7 @@ define i32 @zext_load_i32_by_i8(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: zext_load_i32_by_i8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %ecx
; CHECK64-NEXT: movzbl 1(%rdi), %eax
; CHECK64-NEXT: shll $8, %eax
@@ -1150,7 +1146,7 @@ define i32 @zext_load_i32_by_i8(i32* %arg) {
; ((i32) p[0] << 8) | ((i32) p[1] << 16)
define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
; CHECK-LABEL: zext_load_i32_by_i8_shl_8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl (%eax), %ecx
; CHECK-NEXT: shll $8, %ecx
@@ -1160,7 +1156,7 @@ define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: zext_load_i32_by_i8_shl_8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %ecx
; CHECK64-NEXT: shll $8, %ecx
; CHECK64-NEXT: movzbl 1(%rdi), %eax
@@ -1184,7 +1180,7 @@ define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
; ((i32) p[0] << 16) | ((i32) p[1] << 24)
define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
; CHECK-LABEL: zext_load_i32_by_i8_shl_16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl (%eax), %ecx
; CHECK-NEXT: shll $16, %ecx
@@ -1194,7 +1190,7 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: zext_load_i32_by_i8_shl_16:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl (%rdi), %ecx
; CHECK64-NEXT: shll $16, %ecx
; CHECK64-NEXT: movzbl 1(%rdi), %eax
@@ -1218,7 +1214,7 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
; (i32) p[1] | ((i32) p[0] << 8)
define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
; CHECK-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl 1(%eax), %ecx
; CHECK-NEXT: movzbl (%eax), %eax
@@ -1227,7 +1223,7 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: zext_load_i32_by_i8_bswap:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl 1(%rdi), %ecx
; CHECK64-NEXT: movzbl (%rdi), %eax
; CHECK64-NEXT: shll $8, %eax
@@ -1249,7 +1245,7 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
; ((i32) p[1] << 8) | ((i32) p[0] << 16)
define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl 1(%eax), %ecx
; CHECK-NEXT: shll $8, %ecx
@@ -1259,7 +1255,7 @@ define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: zext_load_i32_by_i8_bswap_shl_8:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl 1(%rdi), %ecx
; CHECK64-NEXT: shll $8, %ecx
; CHECK64-NEXT: movzbl (%rdi), %eax
@@ -1283,7 +1279,7 @@ define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
; ((i32) p[1] << 16) | ((i32) p[0] << 24)
define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl 1(%eax), %ecx
; CHECK-NEXT: shll $16, %ecx
@@ -1293,7 +1289,7 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
; CHECK-NEXT: retl
;
; CHECK64-LABEL: zext_load_i32_by_i8_bswap_shl_16:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movzbl 1(%rdi), %ecx
; CHECK64-NEXT: shll $16, %ecx
; CHECK64-NEXT: movzbl (%rdi), %eax
diff --git a/test/CodeGen/X86/logical-load-fold.ll b/test/CodeGen/X86/logical-load-fold.ll
index 5f06fce1b7b6..3890c1869419 100644
--- a/test/CodeGen/X86/logical-load-fold.ll
+++ b/test/CodeGen/X86/logical-load-fold.ll
@@ -12,14 +12,14 @@
define double @load_double_no_fold(double %x, double %y) {
; SSE2-LABEL: load_double_no_fold:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: cmplesd %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: andpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: load_double_no_fold:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -33,14 +33,14 @@ define double @load_double_no_fold(double %x, double %y) {
define float @load_float_no_fold(float %x, float %y) {
; SSE2-LABEL: load_float_no_fold:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: cmpless %xmm0, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: andps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: load_float_no_fold:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/long-setcc.ll b/test/CodeGen/X86/long-setcc.ll
index 13046d8b3dec..9436891e9ccb 100644
--- a/test/CodeGen/X86/long-setcc.ll
+++ b/test/CodeGen/X86/long-setcc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i1 @t1(i64 %x) nounwind {
%B = icmp slt i64 %x, 0
diff --git a/test/CodeGen/X86/longlong-deadload.ll b/test/CodeGen/X86/longlong-deadload.ll
index 01888f07306a..4166b0f204ee 100644
--- a/test/CodeGen/X86/longlong-deadload.ll
+++ b/test/CodeGen/X86/longlong-deadload.ll
@@ -4,7 +4,7 @@
define void @test(i64* %P) nounwind {
; CHECK-LABEL: test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %ecx
; CHECK-NEXT: xorl $1, %ecx
diff --git a/test/CodeGen/X86/loop-blocks.ll b/test/CodeGen/X86/loop-blocks.ll
index fc6a357523fe..f39c8a8eab90 100644
--- a/test/CodeGen/X86/loop-blocks.ll
+++ b/test/CodeGen/X86/loop-blocks.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -asm-verbose=false | FileCheck %s
; These tests check for loop branching structure, and that the loop align
; directive is placed in the expected place.
diff --git a/test/CodeGen/X86/loop-search.ll b/test/CodeGen/X86/loop-search.ll
index fda4ecec0e6a..88e9963e77fb 100644
--- a/test/CodeGen/X86/loop-search.ll
+++ b/test/CodeGen/X86/loop-search.ll
@@ -6,10 +6,10 @@
define zeroext i1 @search(i32 %needle, i32* nocapture readonly %haystack, i32 %count) {
; CHECK-LABEL: search:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_1
-; CHECK-NEXT: ## BB#4: ## %for.body.preheader
+; CHECK-NEXT: ## %bb.4: ## %for.body.preheader
; CHECK-NEXT: movslq %edx, %rax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: .p2align 4, 0x90
@@ -17,23 +17,23 @@ define zeroext i1 @search(i32 %needle, i32* nocapture readonly %haystack, i32 %c
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmpl %edi, (%rsi,%rcx,4)
; CHECK-NEXT: je LBB0_6
-; CHECK-NEXT: ## BB#2: ## %for.cond
+; CHECK-NEXT: ## %bb.2: ## %for.cond
; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1
; CHECK-NEXT: incq %rcx
; CHECK-NEXT: cmpq %rax, %rcx
; CHECK-NEXT: jl LBB0_5
-; ### FIXME: BB#3 and LBB0_1 should be merged
-; CHECK-NEXT: ## BB#3:
+; ### FIXME: %bb.3 and LBB0_1 should be merged
+; CHECK-NEXT: ## %bb.3:
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
; CHECK-NEXT: LBB0_1:
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
; CHECK-NEXT: LBB0_6:
; CHECK-NEXT: movb $1, %al
-; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: ## kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
entry:
%cmp5 = icmp sgt i32 %count, 0
diff --git a/test/CodeGen/X86/loop-strength-reduce-2.ll b/test/CodeGen/X86/loop-strength-reduce-2.ll
index 062819021415..6c903a85c437 100644
--- a/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -mtriple=i686-- -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s -check-prefix=STATIC
;
; Make sure the common loop invariant A is hoisted up to the preheader,
; since too many registers are needed to subsume it into the addressing modes.
diff --git a/test/CodeGen/X86/loop-strength-reduce.ll b/test/CodeGen/X86/loop-strength-reduce.ll
index 2f80e0bb78bd..d8222b8c3e59 100644
--- a/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/test/CodeGen/X86/loop-strength-reduce.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s
; CHECK: align
; CHECK: movl $4, -4(%ecx)
diff --git a/test/CodeGen/X86/loop-strength-reduce4.ll b/test/CodeGen/X86/loop-strength-reduce4.ll
index 786534b00d39..56f4161147b4 100644
--- a/test/CodeGen/X86/loop-strength-reduce4.ll
+++ b/test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,16 +4,19 @@
; By starting the IV at -64 instead of 0, a cmp is eliminated,
; as the flags from the add can be used directly.
-; STATIC: movl $-64, [[ECX:%e..]]
+; STATIC: movl $-64, [[EAX:%e..]]
-; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
-; STATIC: addl $16, [[ECX]]
+; STATIC: movl %{{.+}}, _state+76([[EAX]])
+; STATIC: addl $16, [[EAX]]
; STATIC: jne
-; In PIC mode the symbol can't be folded, so the change-compare-stride
-; trick applies.
+; The same applies in PIC mode.
-; PIC: cmpl $64
+; PIC: movl $-64, [[EAX:%e..]]
+
+; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
+; PIC: addl $16, [[EAX]]
+; PIC: jne
@state = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
@S = external global [0 x i32] ; <[0 x i32]*> [#uses=4]
diff --git a/test/CodeGen/X86/loop-strength-reduce5.ll b/test/CodeGen/X86/loop-strength-reduce5.ll
index d50a66805db7..2e4a5838792f 100644
--- a/test/CodeGen/X86/loop-strength-reduce5.ll
+++ b/test/CodeGen/X86/loop-strength-reduce5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep inc | count 1
+; RUN: llc < %s -mtriple=i686-- | grep inc | count 1
@X = weak global i16 0 ; <i16*> [#uses=1]
@Y = weak global i16 0 ; <i16*> [#uses=1]
diff --git a/test/CodeGen/X86/loop-strength-reduce6.ll b/test/CodeGen/X86/loop-strength-reduce6.ll
index 919f836841ff..326a7394979c 100644
--- a/test/CodeGen/X86/loop-strength-reduce6.ll
+++ b/test/CodeGen/X86/loop-strength-reduce6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | not grep inc
+; RUN: llc < %s -mtriple=x86_64-- | not grep inc
define fastcc i32 @decodeMP3(i32 %isize, i32* %done) nounwind {
entry:
diff --git a/test/CodeGen/X86/loop-strength-reduce7.ll b/test/CodeGen/X86/loop-strength-reduce7.ll
index 92ec485e7752..7a467d3118fa 100644
--- a/test/CodeGen/X86/loop-strength-reduce7.ll
+++ b/test/CodeGen/X86/loop-strength-reduce7.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep imul
+; RUN: llc < %s | not grep imul
target triple = "i386-apple-darwin9.6"
%struct.III_psy_xmin = type { [22 x double], [13 x [3 x double]] }
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
index 79f90f49c7c6..11271f155291 100644
--- a/test/CodeGen/X86/lower-bitcast.ll
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -8,14 +8,14 @@
define double @test1(double %A) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test1:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -26,12 +26,12 @@ define double @test1(double %A) {
define double @test2(double %A, double %B) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: paddd %xmm1, %xmm0
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test2:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: paddd %xmm1, %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <2 x i32>
@@ -43,14 +43,14 @@ define double @test2(double %A, double %B) {
define i64 @test3(i64 %A) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test3:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: movq %rdi, %xmm0
; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-WIDE-NEXT: movq %xmm0, %rax
@@ -66,7 +66,7 @@ define i64 @test3(i64 %A) {
define i64 @test4(i64 %A) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
@@ -75,7 +75,7 @@ define i64 @test4(i64 %A) {
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test4:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: movq %rdi, %xmm0
; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-WIDE-NEXT: movq %xmm0, %rax
@@ -88,12 +88,12 @@ define i64 @test4(i64 %A) {
define double @test5(double %A) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test5:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <2 x float>
@@ -107,14 +107,14 @@ define double @test5(double %A) {
define double @test6(double %A) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test6:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: paddw {{.*}}(%rip), %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -125,12 +125,12 @@ define double @test6(double %A) {
define double @test7(double %A, double %B) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: paddw %xmm1, %xmm0
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test7:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: paddw %xmm1, %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <4 x i16>
@@ -146,14 +146,14 @@ define double @test7(double %A, double %B) {
define double @test8(double %A) {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test8:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: paddb {{.*}}(%rip), %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <8 x i8>
@@ -164,12 +164,12 @@ define double @test8(double %A) {
define double @test9(double %A, double %B) {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: paddb %xmm1, %xmm0
; CHECK-NEXT: retq
;
; CHECK-WIDE-LABEL: test9:
-; CHECK-WIDE: # BB#0:
+; CHECK-WIDE: # %bb.0:
; CHECK-WIDE-NEXT: paddb %xmm1, %xmm0
; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <8 x i8>
diff --git a/test/CodeGen/X86/lower-vec-shift-2.ll b/test/CodeGen/X86/lower-vec-shift-2.ll
index a617f44d3f98..aeaac0e0e9de 100644
--- a/test/CodeGen/X86/lower-vec-shift-2.ll
+++ b/test/CodeGen/X86/lower-vec-shift-2.ll
@@ -4,14 +4,14 @@
define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test1:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -23,14 +23,14 @@ entry:
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test2:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test2:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -42,12 +42,12 @@ entry:
define <2 x i64> @test3(<2 x i64> %A, <2 x i64> %B) {
; SSE2-LABEL: test3:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psllq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -58,14 +58,14 @@ entry:
define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test4:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -77,14 +77,14 @@ entry:
define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test5:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test5:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -96,12 +96,12 @@ entry:
define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) {
; SSE2-LABEL: test6:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test6:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -112,14 +112,14 @@ entry:
define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test7:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test7:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -131,14 +131,14 @@ entry:
define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/lower-vec-shift.ll b/test/CodeGen/X86/lower-vec-shift.ll
index 8d64baf5f2a4..8474f7e75301 100644
--- a/test/CodeGen/X86/lower-vec-shift.ll
+++ b/test/CodeGen/X86/lower-vec-shift.ll
@@ -10,23 +10,22 @@
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $2, %xmm1
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: psrlw $3, %xmm1
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test1:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
@@ -37,23 +36,22 @@ define <8 x i16> @test1(<8 x i16> %a) {
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $2, %xmm1
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: psrlw $3, %xmm1
+; SSE-NEXT: psrlw $2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -64,23 +62,22 @@ define <8 x i16> @test2(<8 x i16> %a) {
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $2, %xmm1
-; SSE-NEXT: psrld $3, %xmm0
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: psrld $3, %xmm1
+; SSE-NEXT: psrld $2, %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
@@ -89,23 +86,22 @@ define <4 x i32> @test3(<4 x i32> %a) {
define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $2, %xmm1
-; SSE-NEXT: psrld $3, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: psrld $3, %xmm1
+; SSE-NEXT: psrld $2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test4:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
@@ -114,23 +110,22 @@ define <4 x i32> @test4(<4 x i32> %a) {
define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psraw $2, %xmm1
-; SSE-NEXT: psraw $3, %xmm0
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: psraw $3, %xmm1
+; SSE-NEXT: psraw $2, %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test5:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test5:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
@@ -141,23 +136,22 @@ define <8 x i16> @test5(<8 x i16> %a) {
define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psraw $2, %xmm1
-; SSE-NEXT: psraw $3, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: psraw $3, %xmm1
+; SSE-NEXT: psraw $2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test6:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -168,23 +162,22 @@ define <8 x i16> @test6(<8 x i16> %a) {
define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: psrad $3, %xmm0
-; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: psrad $3, %xmm1
+; SSE-NEXT: psrad $2, %xmm0
+; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
@@ -193,23 +186,22 @@ define <4 x i32> @test7(<4 x i32> %a) {
define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrad $2, %xmm1
-; SSE-NEXT: psrad $3, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: psrad $3, %xmm1
+; SSE-NEXT: psrad $2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
%lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
diff --git a/test/CodeGen/X86/lower-vec-shuffle-bug.ll b/test/CodeGen/X86/lower-vec-shuffle-bug.ll
index 7a081b556867..0ae2fc1faba3 100644
--- a/test/CodeGen/X86/lower-vec-shuffle-bug.ll
+++ b/test/CodeGen/X86/lower-vec-shuffle-bug.ll
@@ -3,7 +3,7 @@
define <4 x double> @test1(<4 x double> %A, <4 x double> %B) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -13,7 +13,7 @@ entry:
define <4 x double> @test2(<4 x double> %A, <4 x double> %B) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -23,7 +23,7 @@ entry:
define <4 x double> @test3(<4 x double> %A, <4 x double> %B) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
@@ -33,7 +33,7 @@ entry:
define <4 x double> @test4(<4 x double> %A, <4 x double> %B) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/lsr-delayed-fold.ll b/test/CodeGen/X86/lsr-delayed-fold.ll
index eaa52dec2835..f580e404d177 100644
--- a/test/CodeGen/X86/lsr-delayed-fold.ll
+++ b/test/CodeGen/X86/lsr-delayed-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s > /dev/null
+; RUN: llc < %s > /dev/null
; ScalarEvolution misses an opportunity to fold ((trunc x) + (trunc -x) + y),
; but LSR should tolerate this.
diff --git a/test/CodeGen/X86/lsr-i386.ll b/test/CodeGen/X86/lsr-i386.ll
index 9338939fafd0..c6fba85c9ec7 100644
--- a/test/CodeGen/X86/lsr-i386.ll
+++ b/test/CodeGen/X86/lsr-i386.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i386-pc-linux-gnu"
; PR7651
diff --git a/test/CodeGen/X86/lsr-interesting-step.ll b/test/CodeGen/X86/lsr-interesting-step.ll
index fe8337e2981a..53f7fb2460c2 100644
--- a/test/CodeGen/X86/lsr-interesting-step.ll
+++ b/test/CodeGen/X86/lsr-interesting-step.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -relocation-model=static -mtriple=x86_64-unknown-linux-gnu -asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -relocation-model=static -mtriple=x86_64-unknown-linux-gnu -asm-verbose=0 | FileCheck %s
; The inner loop should require only one add (and no leas either).
; rdar://8100380
diff --git a/test/CodeGen/X86/lsr-negative-stride.ll b/test/CodeGen/X86/lsr-negative-stride.ll
index b08356c8d309..19d3422f45bc 100644
--- a/test/CodeGen/X86/lsr-negative-stride.ll
+++ b/test/CodeGen/X86/lsr-negative-stride.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 > %t
+; RUN: llc < %s -mtriple=i686-- > %t
; RUN: not grep neg %t
; RUN: not grep sub.*esp %t
; RUN: not grep esi %t
diff --git a/test/CodeGen/X86/lsr-nonaffine.ll b/test/CodeGen/X86/lsr-nonaffine.ll
index d825b5a76c09..6b82b9112eb4 100644
--- a/test/CodeGen/X86/lsr-nonaffine.ll
+++ b/test/CodeGen/X86/lsr-nonaffine.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false -march=x86-64 -mtriple=x86_64-apple-darwin -o - < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -mtriple=x86_64-apple-darwin -o - < %s | FileCheck %s
; LSR should leave non-affine expressions alone because it currently
; doesn't know how to do anything with them, and when it tries, it
diff --git a/test/CodeGen/X86/lsr-normalization.ll b/test/CodeGen/X86/lsr-normalization.ll
index 09c892c9fc88..a8e3ab1ae994 100644
--- a/test/CodeGen/X86/lsr-normalization.ll
+++ b/test/CodeGen/X86/lsr-normalization.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=ASM
-; RUN: llc -debug -o /dev/null < %s -march=x86-64 2>&1 | FileCheck %s --check-prefix=DBG
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=ASM
+; RUN: llc -debug -o /dev/null < %s -mtriple=x86_64-- 2>&1 | FileCheck %s --check-prefix=DBG
; rdar://8168938
; This testcase involves SCEV normalization with the exit value from
diff --git a/test/CodeGen/X86/lsr-quadratic-expand.ll b/test/CodeGen/X86/lsr-quadratic-expand.ll
index 29a8da2ef3aa..874dbd71981b 100644
--- a/test/CodeGen/X86/lsr-quadratic-expand.ll
+++ b/test/CodeGen/X86/lsr-quadratic-expand.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -mtriple=x86_64-- < %s
define void @dw2102_i2c_transfer() nounwind {
entry:
diff --git a/test/CodeGen/X86/lsr-redundant-addressing.ll b/test/CodeGen/X86/lsr-redundant-addressing.ll
index 31a1859e3b27..6aeaa97bdac3 100644
--- a/test/CodeGen/X86/lsr-redundant-addressing.ll
+++ b/test/CodeGen/X86/lsr-redundant-addressing.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
; rdar://9081094
; LSR shouldn't create lots of redundant address computations.
diff --git a/test/CodeGen/X86/lsr-reuse.ll b/test/CodeGen/X86/lsr-reuse.ll
index dd1e40f6a1ec..85e0517978ef 100644
--- a/test/CodeGen/X86/lsr-reuse.ll
+++ b/test/CodeGen/X86/lsr-reuse.ll
@@ -1,6 +1,6 @@
; XFAIL: *
; ...should pass. See PR12324: misched bringup
-; RUN: llc < %s -march=x86-64 -O3 -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -O3 -asm-verbose=false | FileCheck %s
target datalayout = "e-p:64:64:64"
target triple = "x86_64-unknown-unknown"
diff --git a/test/CodeGen/X86/lsr-sort.ll b/test/CodeGen/X86/lsr-sort.ll
index b85ddeb13b8d..5aca606c1e1c 100644
--- a/test/CodeGen/X86/lsr-sort.ll
+++ b/test/CodeGen/X86/lsr-sort.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 > %t
+; RUN: llc < %s -mtriple=x86_64-- > %t
; RUN: grep inc %t | count 1
; RUN: not grep incw %t
diff --git a/test/CodeGen/X86/lsr-static-addr.ll b/test/CodeGen/X86/lsr-static-addr.ll
index 3980bee9a306..1d4cb3c04e9b 100644
--- a/test/CodeGen/X86/lsr-static-addr.ll
+++ b/test/CodeGen/X86/lsr-static-addr.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
-; RUN: llc -march=x86-64 -mcpu=atom -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck -check-prefix=ATOM %s
+; RUN: llc -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -mcpu=atom -mtriple=x86_64-unknown-linux-gnu -relocation-model=static -asm-verbose=false < %s | FileCheck -check-prefix=ATOM %s
; CHECK: xorl %eax, %eax
; CHECK: movsd .LCPI0_0(%rip), %xmm0
diff --git a/test/CodeGen/X86/lsr-wrap.ll b/test/CodeGen/X86/lsr-wrap.ll
index adf954477791..45139278d880 100644
--- a/test/CodeGen/X86/lsr-wrap.ll
+++ b/test/CodeGen/X86/lsr-wrap.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
; LSR would like to use a single IV for both of these, however it's
; not safe due to wraparound.
diff --git a/test/CodeGen/X86/lwp-intrinsics-x86_64.ll b/test/CodeGen/X86/lwp-intrinsics-x86_64.ll
index 9ee95267fc33..32206989d718 100644
--- a/test/CodeGen/X86/lwp-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/lwp-intrinsics-x86_64.ll
@@ -7,7 +7,7 @@
define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
; X64-LABEL: test_lwpins64_rri:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
; X64-NEXT: setb %al
; X64-NEXT: retq
@@ -17,7 +17,7 @@ define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
; X64-LABEL: test_lwpins64_rmi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
; X64-NEXT: setb %al
; X64-NEXT: retq
@@ -28,7 +28,7 @@ define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
; X64-LABEL: test_lwpval64_rri:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
; X64-NEXT: retq
tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 4275878552)
@@ -37,7 +37,7 @@ define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
define void @test_lwpval64_rmi(i64 %a0, i32 *%p1) nounwind {
; X64-LABEL: test_lwpval64_rmi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
; X64-NEXT: retq
%a1 = load i32, i32 *%p1
diff --git a/test/CodeGen/X86/lwp-intrinsics.ll b/test/CodeGen/X86/lwp-intrinsics.ll
index c949bc806083..f693b6106140 100644
--- a/test/CodeGen/X86/lwp-intrinsics.ll
+++ b/test/CodeGen/X86/lwp-intrinsics.ll
@@ -12,13 +12,13 @@
define void @test_llwpcb(i8 *%a0) nounwind {
; X86-LABEL: test_llwpcb:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: llwpcb %eax
; X86-NEXT: retl
;
; X64-LABEL: test_llwpcb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: llwpcb %rdi
; X64-NEXT: retq
tail call void @llvm.x86.llwpcb(i8 *%a0)
@@ -27,12 +27,12 @@ define void @test_llwpcb(i8 *%a0) nounwind {
define i8* @test_slwpcb(i8 *%a0) nounwind {
; X86-LABEL: test_slwpcb:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: slwpcb %eax
; X86-NEXT: retl
;
; X64-LABEL: test_slwpcb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: slwpcb %rax
; X64-NEXT: retq
%1 = tail call i8* @llvm.x86.slwpcb()
@@ -41,7 +41,7 @@ define i8* @test_slwpcb(i8 *%a0) nounwind {
define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
; X86-LABEL: test_lwpins32_rri:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %ecx
@@ -50,7 +50,7 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_lwpins32_rri:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addl %esi, %esi
; X64-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
; X64-NEXT: setb %al
@@ -62,7 +62,7 @@ define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
; X86-LABEL: test_lwpins32_rmi:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: lwpins $1985229328, (%eax), %ecx # imm = 0x76543210
@@ -70,7 +70,7 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_lwpins32_rmi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
; X64-NEXT: setb %al
; X64-NEXT: retq
@@ -81,7 +81,7 @@ define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
; X86-LABEL: test_lwpval32_rri:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %ecx, %ecx
@@ -89,7 +89,7 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_lwpval32_rri:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addl %esi, %esi
; X64-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
; X64-NEXT: retq
@@ -100,14 +100,14 @@ define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
define void @test_lwpval32_rmi(i32 %a0, i32 *%p1) nounwind {
; X86-LABEL: test_lwpval32_rmi:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: lwpval $305419896, (%eax), %ecx # imm = 0x12345678
; X86-NEXT: retl
;
; X64-LABEL: test_lwpval32_rmi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678
; X64-NEXT: retq
%a1 = load i32, i32 *%p1
diff --git a/test/CodeGen/X86/lwp-schedule.ll b/test/CodeGen/X86/lwp-schedule.ll
new file mode 100644
index 000000000000..9e517ac62da9
--- /dev/null
+++ b/test/CodeGen/X86/lwp-schedule.ll
@@ -0,0 +1,179 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=x86-64 -mattr=+lwp | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
+
+define void @test_llwpcb(i8 *%a0) nounwind {
+; GENERIC-LABEL: test_llwpcb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: llwpcb %rdi # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_llwpcb:
+; BDVER: # %bb.0:
+; BDVER-NEXT: llwpcb %rdi
+; BDVER-NEXT: retq
+ tail call void @llvm.x86.llwpcb(i8 *%a0)
+ ret void
+}
+
+define i8* @test_slwpcb(i8 *%a0) nounwind {
+; GENERIC-LABEL: test_slwpcb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: slwpcb %rax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_slwpcb:
+; BDVER: # %bb.0:
+; BDVER-NEXT: slwpcb %rax
+; BDVER-NEXT: retq
+ %1 = tail call i8* @llvm.x86.slwpcb()
+ ret i8 *%1
+}
+
+define i8 @test_lwpins32_rri(i32 %a0, i32 %a1) nounwind {
+; GENERIC-LABEL: test_lwpins32_rri:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addl %esi, %esi # sched: [1:0.33]
+; GENERIC-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpins32_rri:
+; BDVER: # %bb.0:
+; BDVER-NEXT: addl %esi, %esi
+; BDVER-NEXT: lwpins $-1985229329, %esi, %edi # imm = 0x89ABCDEF
+; BDVER-NEXT: setb %al
+; BDVER-NEXT: retq
+ %1 = add i32 %a1, %a1
+ %2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %1, i32 2309737967)
+ ret i8 %2
+}
+
+define i8 @test_lwpins32_rmi(i32 %a0, i32 *%p1) nounwind {
+; GENERIC-LABEL: test_lwpins32_rmi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpins32_rmi:
+; BDVER: # %bb.0:
+; BDVER-NEXT: lwpins $1985229328, (%rsi), %edi # imm = 0x76543210
+; BDVER-NEXT: setb %al
+; BDVER-NEXT: retq
+ %a1 = load i32, i32 *%p1
+ %1 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 1985229328)
+ ret i8 %1
+}
+
+define i8 @test_lwpins64_rri(i64 %a0, i32 %a1) nounwind {
+; GENERIC-LABEL: test_lwpins64_rri:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpins64_rri:
+; BDVER: # %bb.0:
+; BDVER-NEXT: lwpins $-1985229329, %esi, %rdi # imm = 0x89ABCDEF
+; BDVER-NEXT: setb %al
+; BDVER-NEXT: retq
+ %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2309737967)
+ ret i8 %1
+}
+
+define i8 @test_lwpins64_rmi(i64 %a0, i32 *%p1) nounwind {
+; GENERIC-LABEL: test_lwpins64_rmi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpins64_rmi:
+; BDVER: # %bb.0:
+; BDVER-NEXT: lwpins $1985229328, (%rsi), %rdi # imm = 0x76543210
+; BDVER-NEXT: setb %al
+; BDVER-NEXT: retq
+ %a1 = load i32, i32 *%p1
+ %1 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 1985229328)
+ ret i8 %1
+}
+
+define void @test_lwpval32_rri(i32 %a0, i32 %a1) nounwind {
+; GENERIC-LABEL: test_lwpval32_rri:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addl %esi, %esi # sched: [1:0.33]
+; GENERIC-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpval32_rri:
+; BDVER: # %bb.0:
+; BDVER-NEXT: addl %esi, %esi
+; BDVER-NEXT: lwpval $-19088744, %esi, %edi # imm = 0xFEDCBA98
+; BDVER-NEXT: retq
+ %1 = add i32 %a1, %a1
+ tail call void @llvm.x86.lwpval32(i32 %a0, i32 %1, i32 4275878552)
+ ret void
+}
+
+define void @test_lwpval32_rmi(i32 %a0, i32 *%p1) nounwind {
+; GENERIC-LABEL: test_lwpval32_rmi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpval32_rmi:
+; BDVER: # %bb.0:
+; BDVER-NEXT: lwpval $305419896, (%rsi), %edi # imm = 0x12345678
+; BDVER-NEXT: retq
+ %a1 = load i32, i32 *%p1
+ tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 305419896)
+ ret void
+}
+
+define void @test_lwpval64_rri(i64 %a0, i32 %a1) nounwind {
+; GENERIC-LABEL: test_lwpval64_rri:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpval64_rri:
+; BDVER: # %bb.0:
+; BDVER-NEXT: lwpval $-19088744, %esi, %rdi # imm = 0xFEDCBA98
+; BDVER-NEXT: retq
+ tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 4275878552)
+ ret void
+}
+
+define void @test_lwpval64_rmi(i64 %a0, i32 *%p1) nounwind {
+; GENERIC-LABEL: test_lwpval64_rmi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_lwpval64_rmi:
+; BDVER: # %bb.0:
+; BDVER-NEXT: lwpval $305419896, (%rsi), %rdi # imm = 0x12345678
+; BDVER-NEXT: retq
+ %a1 = load i32, i32 *%p1
+ tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 305419896)
+ ret void
+}
+
+declare void @llvm.x86.llwpcb(i8*) nounwind
+declare i8* @llvm.x86.slwpcb() nounwind
+declare i8 @llvm.x86.lwpins32(i32, i32, i32) nounwind
+declare i8 @llvm.x86.lwpins64(i64, i32, i32) nounwind
+declare void @llvm.x86.lwpval32(i32, i32, i32) nounwind
+declare void @llvm.x86.lwpval64(i64, i32, i32) nounwind
diff --git a/test/CodeGen/X86/lzcnt-schedule.ll b/test/CodeGen/X86/lzcnt-schedule.ll
index cd0dcbbd6afb..43cb14626356 100644
--- a/test/CodeGen/X86/lzcnt-schedule.ll
+++ b/test/CodeGen/X86/lzcnt-schedule.ll
@@ -1,43 +1,60 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+lzcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
; GENERIC-LABEL: test_ctlz_i16:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: lzcntw (%rsi), %cx
-; GENERIC-NEXT: lzcntw %di, %ax
-; GENERIC-NEXT: orl %ecx, %eax
-; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lzcntw (%rsi), %cx # sched: [7:1.00]
+; GENERIC-NEXT: lzcntw %di, %ax # sched: [3:1.00]
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ctlz_i16:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: lzcntw (%rsi), %cx
-; HASWELL-NEXT: lzcntw %di, %ax
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: lzcntw (%rsi), %cx # sched: [8:1.00]
+; HASWELL-NEXT: lzcntw %di, %ax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ctlz_i16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: lzcntw (%rsi), %cx # sched: [8:1.00]
+; BROADWELL-NEXT: lzcntw %di, %ax # sched: [3:1.00]
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ctlz_i16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: lzcntw (%rsi), %cx # sched: [8:1.00]
+; SKYLAKE-NEXT: lzcntw %di, %ax # sched: [3:1.00]
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ctlz_i16:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: lzcntw (%rsi), %cx
-; BTVER2-NEXT: lzcntw %di, %ax
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: lzcntw (%rsi), %cx # sched: [6:1.00]
+; BTVER2-NEXT: lzcntw %di, %ax # sched: [3:1.00]
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctlz_i16:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: lzcntw (%rsi), %cx
-; ZNVER1-NEXT: lzcntw %di, %ax
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: lzcntw (%rsi), %cx # sched: [6:0.50]
+; ZNVER1-NEXT: lzcntw %di, %ax # sched: [2:0.25]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i16, i16 *%a1
%2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false )
%3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false )
@@ -48,32 +65,46 @@ declare i16 @llvm.ctlz.i16(i16, i1)
define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_ctlz_i32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: lzcntl (%rsi), %ecx
-; GENERIC-NEXT: lzcntl %edi, %eax
-; GENERIC-NEXT: orl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lzcntl (%rsi), %ecx # sched: [7:1.00]
+; GENERIC-NEXT: lzcntl %edi, %eax # sched: [3:1.00]
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ctlz_i32:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: lzcntl (%rsi), %ecx
-; HASWELL-NEXT: lzcntl %edi, %eax
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: lzcntl (%rsi), %ecx # sched: [8:1.00]
+; HASWELL-NEXT: lzcntl %edi, %eax # sched: [3:1.00]
; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ctlz_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: lzcntl (%rsi), %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: lzcntl %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ctlz_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: lzcntl (%rsi), %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: lzcntl %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ctlz_i32:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: lzcntl (%rsi), %ecx
-; BTVER2-NEXT: lzcntl %edi, %eax
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: lzcntl (%rsi), %ecx # sched: [6:1.00]
+; BTVER2-NEXT: lzcntl %edi, %eax # sched: [3:1.00]
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctlz_i32:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: lzcntl (%rsi), %ecx
-; ZNVER1-NEXT: lzcntl %edi, %eax
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: lzcntl (%rsi), %ecx # sched: [6:0.50]
+; ZNVER1-NEXT: lzcntl %edi, %eax # sched: [2:0.25]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i32, i32 *%a1
%2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false )
%3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false )
@@ -84,32 +115,46 @@ declare i32 @llvm.ctlz.i32(i32, i1)
define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_ctlz_i64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: lzcntq (%rsi), %rcx
-; GENERIC-NEXT: lzcntq %rdi, %rax
-; GENERIC-NEXT: orq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lzcntq (%rsi), %rcx # sched: [7:1.00]
+; GENERIC-NEXT: lzcntq %rdi, %rax # sched: [3:1.00]
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ctlz_i64:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: lzcntq (%rsi), %rcx
-; HASWELL-NEXT: lzcntq %rdi, %rax
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: lzcntq (%rsi), %rcx # sched: [8:1.00]
+; HASWELL-NEXT: lzcntq %rdi, %rax # sched: [3:1.00]
; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ctlz_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: lzcntq (%rsi), %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: lzcntq %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ctlz_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: lzcntq (%rsi), %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: lzcntq %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ctlz_i64:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: lzcntq (%rsi), %rcx
-; BTVER2-NEXT: lzcntq %rdi, %rax
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: lzcntq (%rsi), %rcx # sched: [6:1.00]
+; BTVER2-NEXT: lzcntq %rdi, %rax # sched: [3:1.00]
; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctlz_i64:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: lzcntq (%rsi), %rcx
-; ZNVER1-NEXT: lzcntq %rdi, %rax
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: lzcntq (%rsi), %rcx # sched: [6:0.50]
+; ZNVER1-NEXT: lzcntq %rdi, %rax # sched: [2:0.25]
; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64 *%a1
%2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false )
%3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false )
diff --git a/test/CodeGen/X86/lzcnt-zext-cmp.ll b/test/CodeGen/X86/lzcnt-zext-cmp.ll
index 7c961a98ad55..9a31a8da2dd9 100644
--- a/test/CodeGen/X86/lzcnt-zext-cmp.ll
+++ b/test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -9,7 +9,7 @@
; Test one 32-bit input, output is 32-bit, no transformations expected.
define i32 @test_zext_cmp0(i32 %a) {
; ALL-LABEL: test_zext_cmp0:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: xorl %eax, %eax
; ALL-NEXT: testl %edi, %edi
; ALL-NEXT: sete %al
@@ -23,7 +23,7 @@ entry:
; Test two 32-bit inputs, output is 32-bit.
define i32 @test_zext_cmp1(i32 %a, i32 %b) {
; FASTLZCNT-LABEL: test_zext_cmp1:
-; FASTLZCNT: # BB#0:
+; FASTLZCNT: # %bb.0:
; FASTLZCNT-NEXT: lzcntl %edi, %ecx
; FASTLZCNT-NEXT: lzcntl %esi, %eax
; FASTLZCNT-NEXT: orl %ecx, %eax
@@ -31,7 +31,7 @@ define i32 @test_zext_cmp1(i32 %a, i32 %b) {
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp1:
-; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT: # %bb.0:
; NOFASTLZCNT-NEXT: testl %edi, %edi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testl %esi, %esi
@@ -49,7 +49,7 @@ define i32 @test_zext_cmp1(i32 %a, i32 %b) {
; Test two 64-bit inputs, output is 64-bit.
define i64 @test_zext_cmp2(i64 %a, i64 %b) {
; FASTLZCNT-LABEL: test_zext_cmp2:
-; FASTLZCNT: # BB#0:
+; FASTLZCNT: # %bb.0:
; FASTLZCNT-NEXT: lzcntq %rdi, %rcx
; FASTLZCNT-NEXT: lzcntq %rsi, %rax
; FASTLZCNT-NEXT: orl %ecx, %eax
@@ -57,7 +57,7 @@ define i64 @test_zext_cmp2(i64 %a, i64 %b) {
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp2:
-; NOFASTLZCNT: # BB#0:
+; NOFASTLZCNT: # %bb.0:
; NOFASTLZCNT-NEXT: testq %rdi, %rdi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testq %rsi, %rsi
@@ -77,14 +77,14 @@ define i64 @test_zext_cmp2(i64 %a, i64 %b) {
; upper 16-bits, adding one more instruction.
define i16 @test_zext_cmp3(i16 %a, i16 %b) {
; ALL-LABEL: test_zext_cmp3:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: testw %di, %di
; ALL-NEXT: sete %al
; ALL-NEXT: testw %si, %si
; ALL-NEXT: sete %cl
; ALL-NEXT: orb %al, %cl
; ALL-NEXT: movzbl %cl, %eax
-; ALL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT: # kill: def %ax killed %ax killed %eax
; ALL-NEXT: retq
%cmp = icmp eq i16 %a, 0
%cmp1 = icmp eq i16 %b, 0
@@ -96,7 +96,7 @@ define i16 @test_zext_cmp3(i16 %a, i16 %b) {
; Test two 32-bit inputs, output is 64-bit.
define i64 @test_zext_cmp4(i32 %a, i32 %b) {
; FASTLZCNT-LABEL: test_zext_cmp4:
-; FASTLZCNT: # BB#0: # %entry
+; FASTLZCNT: # %bb.0: # %entry
; FASTLZCNT-NEXT: lzcntl %edi, %ecx
; FASTLZCNT-NEXT: lzcntl %esi, %eax
; FASTLZCNT-NEXT: orl %ecx, %eax
@@ -104,7 +104,7 @@ define i64 @test_zext_cmp4(i32 %a, i32 %b) {
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp4:
-; NOFASTLZCNT: # BB#0: # %entry
+; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testl %edi, %edi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testl %esi, %esi
@@ -123,16 +123,16 @@ entry:
; Test two 64-bit inputs, output is 32-bit.
define i32 @test_zext_cmp5(i64 %a, i64 %b) {
; FASTLZCNT-LABEL: test_zext_cmp5:
-; FASTLZCNT: # BB#0: # %entry
+; FASTLZCNT: # %bb.0: # %entry
; FASTLZCNT-NEXT: lzcntq %rdi, %rcx
; FASTLZCNT-NEXT: lzcntq %rsi, %rax
; FASTLZCNT-NEXT: orl %ecx, %eax
; FASTLZCNT-NEXT: shrl $6, %eax
-; FASTLZCNT-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; FASTLZCNT-NEXT: # kill: def %eax killed %eax killed %rax
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp5:
-; NOFASTLZCNT: # BB#0: # %entry
+; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testq %rdi, %rdi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testq %rsi, %rsi
@@ -151,7 +151,7 @@ entry:
; Test three 32-bit inputs, output is 32-bit.
define i32 @test_zext_cmp6(i32 %a, i32 %b, i32 %c) {
; FASTLZCNT-LABEL: test_zext_cmp6:
-; FASTLZCNT: # BB#0: # %entry
+; FASTLZCNT: # %bb.0: # %entry
; FASTLZCNT-NEXT: lzcntl %edi, %eax
; FASTLZCNT-NEXT: lzcntl %esi, %ecx
; FASTLZCNT-NEXT: orl %eax, %ecx
@@ -161,7 +161,7 @@ define i32 @test_zext_cmp6(i32 %a, i32 %b, i32 %c) {
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp6:
-; NOFASTLZCNT: # BB#0: # %entry
+; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testl %edi, %edi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testl %esi, %esi
@@ -186,7 +186,7 @@ entry:
; %.cmp2 inputs' order is inverted.
define i32 @test_zext_cmp7(i32 %a, i32 %b, i32 %c) {
; FASTLZCNT-LABEL: test_zext_cmp7:
-; FASTLZCNT: # BB#0: # %entry
+; FASTLZCNT: # %bb.0: # %entry
; FASTLZCNT-NEXT: lzcntl %edi, %eax
; FASTLZCNT-NEXT: lzcntl %esi, %ecx
; FASTLZCNT-NEXT: orl %eax, %ecx
@@ -196,7 +196,7 @@ define i32 @test_zext_cmp7(i32 %a, i32 %b, i32 %c) {
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp7:
-; NOFASTLZCNT: # BB#0: # %entry
+; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testl %edi, %edi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testl %esi, %esi
@@ -220,7 +220,7 @@ entry:
; Test four 32-bit inputs, output is 32-bit.
define i32 @test_zext_cmp8(i32 %a, i32 %b, i32 %c, i32 %d) {
; FASTLZCNT-LABEL: test_zext_cmp8:
-; FASTLZCNT: # BB#0: # %entry
+; FASTLZCNT: # %bb.0: # %entry
; FASTLZCNT-NEXT: lzcntl %edi, %eax
; FASTLZCNT-NEXT: lzcntl %esi, %esi
; FASTLZCNT-NEXT: lzcntl %edx, %edx
@@ -232,7 +232,7 @@ define i32 @test_zext_cmp8(i32 %a, i32 %b, i32 %c, i32 %d) {
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp8:
-; NOFASTLZCNT: # BB#0: # %entry
+; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testl %edi, %edi
; NOFASTLZCNT-NEXT: sete %dil
; NOFASTLZCNT-NEXT: testl %esi, %esi
@@ -261,17 +261,17 @@ entry:
; Test one 32-bit input, one 64-bit input, output is 32-bit.
define i32 @test_zext_cmp9(i32 %a, i64 %b) {
; FASTLZCNT-LABEL: test_zext_cmp9:
-; FASTLZCNT: # BB#0: # %entry
+; FASTLZCNT: # %bb.0: # %entry
; FASTLZCNT-NEXT: lzcntq %rsi, %rax
; FASTLZCNT-NEXT: lzcntl %edi, %ecx
; FASTLZCNT-NEXT: shrl $5, %ecx
; FASTLZCNT-NEXT: shrl $6, %eax
; FASTLZCNT-NEXT: orl %ecx, %eax
-; FASTLZCNT-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; FASTLZCNT-NEXT: # kill: def %eax killed %eax killed %rax
; FASTLZCNT-NEXT: retq
;
; NOFASTLZCNT-LABEL: test_zext_cmp9:
-; NOFASTLZCNT: # BB#0: # %entry
+; NOFASTLZCNT: # %bb.0: # %entry
; NOFASTLZCNT-NEXT: testl %edi, %edi
; NOFASTLZCNT-NEXT: sete %al
; NOFASTLZCNT-NEXT: testq %rsi, %rsi
@@ -290,7 +290,7 @@ entry:
; Test 2 128-bit inputs, output is 32-bit, no transformations expected.
define i32 @test_zext_cmp10(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
; ALL-LABEL: test_zext_cmp10:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: orq %rsi, %rdi
; ALL-NEXT: sete %al
; ALL-NEXT: orq %rcx, %rdx
@@ -318,7 +318,7 @@ entry:
define i32 @test_zext_cmp11(double %a, double %b) "no-nans-fp-math"="true" {
;
; ALL-LABEL: test_zext_cmp11:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ALL-NEXT: vucomisd %xmm2, %xmm0
; ALL-NEXT: sete %al
diff --git a/test/CodeGen/X86/lzcnt.ll b/test/CodeGen/X86/lzcnt.ll
index ff83f8540946..1f0c6b3da2bf 100644
--- a/test/CodeGen/X86/lzcnt.ll
+++ b/test/CodeGen/X86/lzcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+lzcnt | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+lzcnt | FileCheck %s
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
diff --git a/test/CodeGen/X86/machine-combiner-int-vec.ll b/test/CodeGen/X86/machine-combiner-int-vec.ll
index dc1ce77e13b7..8aea7cd5f5e9 100644
--- a/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -5,14 +5,14 @@
define <4 x i32> @reassociate_and_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, <4 x i32> %x3) {
; SSE-LABEL: reassociate_and_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pand %xmm3, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_and_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm3, %xmm2, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -26,14 +26,14 @@ define <4 x i32> @reassociate_and_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>
define <4 x i32> @reassociate_or_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, <4 x i32> %x3) {
; SSE-LABEL: reassociate_or_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: por %xmm3, %xmm2
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_or_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm3, %xmm2, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -47,14 +47,14 @@ define <4 x i32> @reassociate_or_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %
define <4 x i32> @reassociate_xor_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, <4 x i32> %x3) {
; SSE-LABEL: reassociate_xor_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm2
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_xor_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -70,7 +70,7 @@ define <4 x i32> @reassociate_xor_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>
define <8 x i32> @reassociate_and_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, <8 x i32> %x3) {
; AVX-LABEL: reassociate_and_v8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpand %ymm3, %ymm2, %ymm1
; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
@@ -84,7 +84,7 @@ define <8 x i32> @reassociate_and_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>
define <8 x i32> @reassociate_or_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, <8 x i32> %x3) {
; AVX-LABEL: reassociate_or_v8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpor %ymm3, %ymm2, %ymm1
; AVX-NEXT: vpor %ymm1, %ymm0, %ymm0
@@ -98,7 +98,7 @@ define <8 x i32> @reassociate_or_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %
define <8 x i32> @reassociate_xor_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, <8 x i32> %x3) {
; AVX-LABEL: reassociate_xor_v8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpxor %ymm3, %ymm2, %ymm1
; AVX-NEXT: vpxor %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/machine-combiner-int.ll b/test/CodeGen/X86/machine-combiner-int.ll
index df35abd9534d..e26b7401941f 100644
--- a/test/CodeGen/X86/machine-combiner-int.ll
+++ b/test/CodeGen/X86/machine-combiner-int.ll
@@ -9,7 +9,7 @@
define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) {
; CHECK-LABEL: reassociate_muls_i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: # kill
; CHECK-NEXT: # kill
; CHECK-NEXT: leal (%rdi,%rsi), %eax
@@ -25,7 +25,7 @@ define i16 @reassociate_muls_i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3) {
define i32 @reassociate_muls_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-LABEL: reassociate_muls_i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: # kill
; CHECK-NEXT: # kill
; CHECK-NEXT: leal (%rdi,%rsi), %eax
@@ -45,7 +45,7 @@ define i32 @reassociate_muls_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
define i64 @reassociate_muls_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-LABEL: reassociate_muls_i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: leaq (%rdi,%rsi), %rax
; CHECK-NEXT: imulq %rcx, %rdx
; CHECK-NEXT: imulq %rdx, %rax
@@ -61,7 +61,7 @@ define i64 @reassociate_muls_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
define i8 @reassociate_ands_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-LABEL: reassociate_ands_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: andb %cl, %dl
; CHECK-NEXT: andb %dil, %dl
@@ -77,7 +77,7 @@ define i8 @reassociate_ands_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
define i32 @reassociate_ands_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-LABEL: reassociate_ands_i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: andl %ecx, %edx
; CHECK-NEXT: andl %edi, %edx
@@ -91,7 +91,7 @@ define i32 @reassociate_ands_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
define i64 @reassociate_ands_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-LABEL: reassociate_ands_i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq %rsi, %rdi
; CHECK-NEXT: andq %rcx, %rdx
; CHECK-NEXT: andq %rdi, %rdx
@@ -108,7 +108,7 @@ define i64 @reassociate_ands_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
define i8 @reassociate_ors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-LABEL: reassociate_ors_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: orb %cl, %dl
; CHECK-NEXT: orb %dil, %dl
@@ -124,7 +124,7 @@ define i8 @reassociate_ors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
define i32 @reassociate_ors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-LABEL: reassociate_ors_i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: orl %ecx, %edx
; CHECK-NEXT: orl %edi, %edx
@@ -138,7 +138,7 @@ define i32 @reassociate_ors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
define i64 @reassociate_ors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-LABEL: reassociate_ors_i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq %rsi, %rdi
; CHECK-NEXT: orq %rcx, %rdx
; CHECK-NEXT: orq %rdi, %rdx
@@ -155,7 +155,7 @@ define i64 @reassociate_ors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
define i8 @reassociate_xors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
; CHECK-LABEL: reassociate_xors_i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subb %sil, %dil
; CHECK-NEXT: xorb %cl, %dl
; CHECK-NEXT: xorb %dil, %dl
@@ -171,7 +171,7 @@ define i8 @reassociate_xors_i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3) {
define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
; CHECK-LABEL: reassociate_xors_i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subl %esi, %edi
; CHECK-NEXT: xorl %ecx, %edx
; CHECK-NEXT: xorl %edi, %edx
@@ -185,7 +185,7 @@ define i32 @reassociate_xors_i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) {
define i64 @reassociate_xors_i64(i64 %x0, i64 %x1, i64 %x2, i64 %x3) {
; CHECK-LABEL: reassociate_xors_i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq %rsi, %rdi
; CHECK-NEXT: xorq %rcx, %rdx
; CHECK-NEXT: xorq %rdi, %rdx
diff --git a/test/CodeGen/X86/machine-combiner.ll b/test/CodeGen/X86/machine-combiner.ll
index 3fbb233696c8..d634dbb65699 100644
--- a/test/CodeGen/X86/machine-combiner.ll
+++ b/test/CodeGen/X86/machine-combiner.ll
@@ -1,19 +1,24 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX
+
; Verify that the first two adds are independent regardless of how the inputs are
; commuted. The destination registers are used as source registers for the third add.
define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -26,14 +31,14 @@ define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -46,14 +51,14 @@ define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -66,14 +71,14 @@ define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -89,7 +94,7 @@ define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
; SSE-LABEL: reassociate_adds5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
@@ -100,7 +105,7 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -126,14 +131,14 @@ define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, floa
define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_adds6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: addss %xmm3, %xmm2
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -148,14 +153,14 @@ define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_muls1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: mulss %xmm3, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
@@ -170,14 +175,14 @@ define float @reassociate_muls1(float %x0, float %x1, float %x2, float %x3) {
define double @reassociate_adds_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_adds_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: addsd %xmm3, %xmm2
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
@@ -192,14 +197,14 @@ define double @reassociate_adds_double(double %x0, double %x1, double %x2, doubl
define double @reassociate_muls_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_muls_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: mulsd %xmm3, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
@@ -214,14 +219,14 @@ define double @reassociate_muls_double(double %x0, double %x1, double %x2, doubl
define <4 x float> @reassociate_adds_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_adds_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: addps %xmm3, %xmm2
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
@@ -236,14 +241,14 @@ define <4 x float> @reassociate_adds_v4f32(<4 x float> %x0, <4 x float> %x1, <4
define <2 x double> @reassociate_adds_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_adds_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulpd %xmm1, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm2
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_adds_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vaddpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
@@ -258,14 +263,14 @@ define <2 x double> @reassociate_adds_v2f64(<2 x double> %x0, <2 x double> %x1,
define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_muls_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
@@ -280,14 +285,14 @@ define <4 x float> @reassociate_muls_v4f32(<4 x float> %x0, <4 x float> %x1, <4
define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_muls_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: mulpd %xmm3, %xmm2
; SSE-NEXT: mulpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_muls_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmulpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
@@ -302,7 +307,7 @@ define <2 x double> @reassociate_muls_v2f64(<2 x double> %x0, <2 x double> %x1,
define <8 x float> @reassociate_adds_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; AVX-LABEL: reassociate_adds_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vaddps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
@@ -317,7 +322,7 @@ define <8 x float> @reassociate_adds_v8f32(<8 x float> %x0, <8 x float> %x1, <8
define <4 x double> @reassociate_adds_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; AVX-LABEL: reassociate_adds_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
@@ -332,7 +337,7 @@ define <4 x double> @reassociate_adds_v4f64(<4 x double> %x0, <4 x double> %x1,
define <8 x float> @reassociate_muls_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; AVX-LABEL: reassociate_muls_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmulps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0
@@ -347,7 +352,7 @@ define <8 x float> @reassociate_muls_v8f32(<8 x float> %x0, <8 x float> %x1, <8
define <4 x double> @reassociate_muls_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; AVX-LABEL: reassociate_muls_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmulpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmulpd %ymm1, %ymm0, %ymm0
@@ -362,14 +367,14 @@ define <4 x double> @reassociate_muls_v4f64(<4 x double> %x0, <4 x double> %x1,
define float @reassociate_mins_single(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_mins_single:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: minss %xmm3, %xmm2
; SSE-NEXT: minss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_single:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminss %xmm1, %xmm0, %xmm0
@@ -386,14 +391,14 @@ define float @reassociate_mins_single(float %x0, float %x1, float %x2, float %x3
define float @reassociate_maxs_single(float %x0, float %x1, float %x2, float %x3) {
; SSE-LABEL: reassociate_maxs_single:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: maxss %xmm3, %xmm2
; SSE-NEXT: maxss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_single:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxss %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
@@ -410,14 +415,14 @@ define float @reassociate_maxs_single(float %x0, float %x1, float %x2, float %x3
define double @reassociate_mins_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_mins_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: minsd %xmm3, %xmm2
; SSE-NEXT: minsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm0
@@ -434,14 +439,14 @@ define double @reassociate_mins_double(double %x0, double %x1, double %x2, doubl
define double @reassociate_maxs_double(double %x0, double %x1, double %x2, double %x3) {
; SSE-LABEL: reassociate_maxs_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: maxsd %xmm3, %xmm2
; SSE-NEXT: maxsd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxsd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
@@ -458,14 +463,14 @@ define double @reassociate_maxs_double(double %x0, double %x1, double %x2, doubl
define <4 x float> @reassociate_mins_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_mins_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: minps %xmm3, %xmm2
; SSE-NEXT: minps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminps %xmm1, %xmm0, %xmm0
@@ -482,14 +487,14 @@ define <4 x float> @reassociate_mins_v4f32(<4 x float> %x0, <4 x float> %x1, <4
define <4 x float> @reassociate_maxs_v4f32(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, <4 x float> %x3) {
; SSE-LABEL: reassociate_maxs_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: maxps %xmm3, %xmm2
; SSE-NEXT: maxps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxps %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
@@ -506,14 +511,14 @@ define <4 x float> @reassociate_maxs_v4f32(<4 x float> %x0, <4 x float> %x1, <4
define <2 x double> @reassociate_mins_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_mins_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: minpd %xmm3, %xmm2
; SSE-NEXT: minpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_mins_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
@@ -530,14 +535,14 @@ define <2 x double> @reassociate_mins_v2f64(<2 x double> %x0, <2 x double> %x1,
define <2 x double> @reassociate_maxs_v2f64(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, <2 x double> %x3) {
; SSE-LABEL: reassociate_maxs_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: maxpd %xmm3, %xmm2
; SSE-NEXT: maxpd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: reassociate_maxs_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmaxpd %xmm3, %xmm2, %xmm1
; AVX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0
@@ -554,7 +559,7 @@ define <2 x double> @reassociate_maxs_v2f64(<2 x double> %x0, <2 x double> %x1,
define <8 x float> @reassociate_mins_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; AVX-LABEL: reassociate_mins_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vminps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vminps %ymm1, %ymm0, %ymm0
@@ -571,7 +576,7 @@ define <8 x float> @reassociate_mins_v8f32(<8 x float> %x0, <8 x float> %x1, <8
define <8 x float> @reassociate_maxs_v8f32(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, <8 x float> %x3) {
; AVX-LABEL: reassociate_maxs_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmaxps %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmaxps %ymm1, %ymm0, %ymm0
@@ -588,7 +593,7 @@ define <8 x float> @reassociate_maxs_v8f32(<8 x float> %x0, <8 x float> %x1, <8
define <4 x double> @reassociate_mins_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; AVX-LABEL: reassociate_mins_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vminpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vminpd %ymm1, %ymm0, %ymm0
@@ -605,7 +610,7 @@ define <4 x double> @reassociate_mins_v4f64(<4 x double> %x0, <4 x double> %x1,
define <4 x double> @reassociate_maxs_v4f64(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, <4 x double> %x3) {
; AVX-LABEL: reassociate_maxs_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmaxpd %ymm3, %ymm2, %ymm1
; AVX-NEXT: vmaxpd %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/machine-copy-prop.mir b/test/CodeGen/X86/machine-copy-prop.mir
index 225a43061c9c..05454584d9a0 100644
--- a/test/CodeGen/X86/machine-copy-prop.mir
+++ b/test/CodeGen/X86/machine-copy-prop.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86 -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=i686-- -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s
--- |
declare void @foo()
diff --git a/test/CodeGen/X86/machine-cp.ll b/test/CodeGen/X86/machine-cp.ll
index 57663a011f10..b8b9b05c3184 100644
--- a/test/CodeGen/X86/machine-cp.ll
+++ b/test/CodeGen/X86/machine-cp.ll
@@ -1,18 +1,33 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -mattr=+sse2 -verify-machineinstrs < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -verify-machineinstrs | FileCheck %s
; After tail duplication, two copies in an early exit BB can be cancelled out.
; rdar://10640363
define i32 @t1(i32 %a, i32 %b) nounwind {
-entry:
; CHECK-LABEL: t1:
-; CHECK: je [[LABEL:.*BB.*]]
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: je LBB0_1
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: LBB0_2: ## %while.body
+; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: testl %edx, %edx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: jne LBB0_2
+; CHECK-NEXT: ## %bb.3: ## %while.end
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: LBB0_1:
+; CHECK-NEXT: retq
+entry:
%cmp1 = icmp eq i32 %b, 0
br i1 %cmp1, label %while.end, label %while.body
-; CHECK: [[LABEL]]:
-; CHECK-NOT: mov
-; CHECK: ret
-
while.body: ; preds = %entry, %while.body
%a.addr.03 = phi i32 [ %b.addr.02, %while.body ], [ %a, %entry ]
%b.addr.02 = phi i32 [ %rem, %while.body ], [ %b, %entry ]
@@ -28,24 +43,42 @@ while.end: ; preds = %while.body, %entry
; Two movdqa (from phi-elimination) in the entry BB cancels out.
; rdar://10428165
define <8 x i16> @t2(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
-entry:
; CHECK-LABEL: t2:
-; CHECK-NOT: movdqa
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: retq
+entry:
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp8
}
define i32 @t3(i64 %a, i64 %b) nounwind {
-entry:
; CHECK-LABEL: t3:
-; CHECK: je [[LABEL:.*BB.*]]
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: je LBB2_1
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: LBB2_2: ## %while.body
+; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %rdx, %rcx
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rcx
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: jne LBB2_2
+; CHECK-NEXT: ## %bb.3: ## %while.end
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: retq
+; CHECK-NEXT: LBB2_1:
+; CHECK-NEXT: retq
+entry:
%cmp1 = icmp eq i64 %b, 0
br i1 %cmp1, label %while.end, label %while.body
-; CHECK: [[LABEL]]:
-; CHECK-NOT: mov
-; CHECK: ret
-
while.body: ; preds = %entry, %while.body
%a.addr.03 = phi i64 [ %b.addr.02, %while.body ], [ %a, %entry ]
%b.addr.02 = phi i64 [ %rem, %while.body ], [ %b, %entry ]
@@ -61,29 +94,91 @@ while.end: ; preds = %while.body, %entry
; Check that copy propagation does not kill things like:
; dst = copy src <-- do not kill that.
-; ... = op1 dst<undef>
+; ... = op1 undef dst
; ... = op2 dst <-- this is used here.
-;
-; CHECK-LABEL: foo:
-; CHECK: psllw $7,
-; CHECK: psllw $7, [[SRC1:%xmm[0-9]+]]
-; CHECK-NEXT: pand {{.*}}(%rip), [[SRC1]]
-; CHECK-NEXT: pcmpgtb [[SRC1]], [[SRC2:%xmm[0-9]+]]
-; CHECK-NEXT: pand %xmm{{[0-9]+}}, [[SRC2]]
-; CHECK-NEXT: movdqa [[SRC2]], [[CPY1:%xmm[0-9]+]]
-; CHECK-NEXT: punpcklbw %xmm{{[0-9]+}}, [[CPY1]]
-; Check that CPY1 is not redefined.
-; CHECK-NOT: , [[CPY1]]
-; CHECK: punpckhwd %xmm{{[0-9]+}}, [[CPY1]]
-; CHECK-NEXT: pslld $31, [[CPY1]]
-; CHECK-NEXT: psrad $31, [[CPY1]]
-; CHECK: punpckhbw %xmm{{[0-9]+}}, [[CPY2:%xmm[0-9]+]]
-; Check that CPY2 is not redefined.
-; CHECK-NOT: , [[CPY2]]
-; CHECK: punpckhwd %xmm{{[0-9]+}}, [[CPY2]]
-; CHECK-NEXT: pslld $31, [[CPY2]]
-; CHECK-NEXT: psrad $31, [[CPY2]]
define <16 x float> @foo(<16 x float> %x) {
+; CHECK-LABEL: foo:
+; CHECK: ## %bb.0: ## %bb
+; CHECK-NEXT: movaps %xmm3, %xmm8
+; CHECK-NEXT: xorps %xmm3, %xmm3
+; CHECK-NEXT: pxor %xmm6, %xmm6
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm6
+; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255]
+; CHECK-NEXT: pand %xmm6, %xmm5
+; CHECK-NEXT: packuswb %xmm5, %xmm5
+; CHECK-NEXT: packuswb %xmm5, %xmm5
+; CHECK-NEXT: cvttps2dq %xmm0, %xmm13
+; CHECK-NEXT: movdqa %xmm0, %xmm10
+; CHECK-NEXT: cmpltps %xmm3, %xmm10
+; CHECK-NEXT: movdqa %xmm6, %xmm9
+; CHECK-NEXT: pxor %xmm10, %xmm9
+; CHECK-NEXT: cvttps2dq %xmm1, %xmm14
+; CHECK-NEXT: movaps %xmm1, %xmm11
+; CHECK-NEXT: cmpltps %xmm3, %xmm11
+; CHECK-NEXT: movdqa %xmm6, %xmm7
+; CHECK-NEXT: pxor %xmm11, %xmm7
+; CHECK-NEXT: cvttps2dq %xmm2, %xmm1
+; CHECK-NEXT: cmpltps %xmm3, %xmm2
+; CHECK-NEXT: movdqa %xmm6, %xmm4
+; CHECK-NEXT: pxor %xmm2, %xmm4
+; CHECK-NEXT: cvttps2dq %xmm8, %xmm12
+; CHECK-NEXT: cmpltps %xmm3, %xmm8
+; CHECK-NEXT: pxor %xmm8, %xmm6
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
+; CHECK-NEXT: pand %xmm0, %xmm6
+; CHECK-NEXT: pand %xmm0, %xmm4
+; CHECK-NEXT: pand %xmm0, %xmm7
+; CHECK-NEXT: pand %xmm0, %xmm9
+; CHECK-NEXT: cvtdq2ps %xmm13, %xmm15
+; CHECK-NEXT: cvtdq2ps %xmm14, %xmm14
+; CHECK-NEXT: cvtdq2ps %xmm1, %xmm13
+; CHECK-NEXT: cvtdq2ps %xmm12, %xmm12
+; CHECK-NEXT: pxor %xmm0, %xmm0
+; CHECK-NEXT: cmpltps %xmm12, %xmm0
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: cmpltps %xmm13, %xmm1
+; CHECK-NEXT: packssdw %xmm0, %xmm1
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cmpltps %xmm14, %xmm0
+; CHECK-NEXT: cmpltps %xmm15, %xmm3
+; CHECK-NEXT: packssdw %xmm0, %xmm3
+; CHECK-NEXT: packsswb %xmm1, %xmm3
+; CHECK-NEXT: pand %xmm5, %xmm3
+; CHECK-NEXT: movdqa %xmm3, %xmm1
+; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT: movdqa %xmm1, %xmm0
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT: pslld $31, %xmm0
+; CHECK-NEXT: psrad $31, %xmm0
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; CHECK-NEXT: pslld $31, %xmm1
+; CHECK-NEXT: psrad $31, %xmm1
+; CHECK-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; CHECK-NEXT: movdqa %xmm3, %xmm5
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; CHECK-NEXT: pslld $31, %xmm5
+; CHECK-NEXT: psrad $31, %xmm5
+; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; CHECK-NEXT: pslld $31, %xmm3
+; CHECK-NEXT: psrad $31, %xmm3
+; CHECK-NEXT: pxor %xmm9, %xmm0
+; CHECK-NEXT: pxor %xmm15, %xmm0
+; CHECK-NEXT: pxor %xmm7, %xmm1
+; CHECK-NEXT: pxor %xmm14, %xmm1
+; CHECK-NEXT: pxor %xmm4, %xmm5
+; CHECK-NEXT: pxor %xmm13, %xmm5
+; CHECK-NEXT: pxor %xmm6, %xmm3
+; CHECK-NEXT: pxor %xmm12, %xmm3
+; CHECK-NEXT: pand %xmm8, %xmm3
+; CHECK-NEXT: pand %xmm2, %xmm5
+; CHECK-NEXT: pand %xmm11, %xmm1
+; CHECK-NEXT: pand %xmm10, %xmm0
+; CHECK-NEXT: pxor %xmm9, %xmm0
+; CHECK-NEXT: pxor %xmm7, %xmm1
+; CHECK-NEXT: pxor %xmm4, %xmm5
+; CHECK-NEXT: pxor %xmm6, %xmm3
+; CHECK-NEXT: movdqa %xmm5, %xmm2
+; CHECK-NEXT: retq
bb:
%v3 = icmp slt <16 x i32> undef, zeroinitializer
%v14 = zext <16 x i1> %v3 to <16 x i32>
diff --git a/test/CodeGen/X86/machine-cse.ll b/test/CodeGen/X86/machine-cse.ll
index abf39c9a058d..0e332382c77b 100644
--- a/test/CodeGen/X86/machine-cse.ll
+++ b/test/CodeGen/X86/machine-cse.ll
@@ -9,7 +9,7 @@
define fastcc i8* @t(i32 %base) nounwind {
; CHECK-LABEL: t:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: shlq $9, %rax
@@ -17,7 +17,7 @@ define fastcc i8* @t(i32 %base) nounwind {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: callq bar
; CHECK-NEXT: .LBB0_2: # %bb2
; CHECK-NEXT: callq foo
@@ -49,22 +49,22 @@ declare void @printf(...) nounwind
define void @commute(i32 %test_case, i32 %scale) nounwind ssp {
; CHECK-LABEL: commute:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: leal -1(%rdi), %eax
; CHECK-NEXT: cmpl $2, %eax
; CHECK-NEXT: ja .LBB1_4
-; CHECK-NEXT: # BB#1: # %sw.bb
+; CHECK-NEXT: # %bb.1: # %sw.bb
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_4
-; CHECK-NEXT: # BB#2: # %if.end34
+; CHECK-NEXT: # %bb.2: # %if.end34
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: imull %edi, %esi
; CHECK-NEXT: leal (%rsi,%rsi,2), %esi
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
+; CHECK-NEXT: # kill: def %edi killed %edi killed %rdi
; CHECK-NEXT: callq printf
; CHECK-NEXT: addq $8, %rsp
; CHECK-NEXT: .p2align 4, 0x90
@@ -107,11 +107,11 @@ sw.bb307:
; rdar://10660865
define i32 @cross_mbb_phys_cse(i32 %a, i32 %b) nounwind ssp {
; CHECK-LABEL: cross_mbb_phys_cse:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: ja .LBB2_2
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: .LBB2_2: # %return
; CHECK-NEXT: retq
@@ -132,17 +132,17 @@ return:
; rdar://11393714
define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp {
; CHECK-LABEL: bsd_memchr:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testq %rcx, %rcx
; CHECK-NEXT: je .LBB3_4
-; CHECK-NEXT: # BB#1: # %preheader
+; CHECK-NEXT: # %bb.1: # %preheader
; CHECK-NEXT: movzbl %dl, %eax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB3_2: # %do.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cmpl %eax, %esi
; CHECK-NEXT: je .LBB3_5
-; CHECK-NEXT: # BB#3: # %do.cond
+; CHECK-NEXT: # %bb.3: # %do.cond
; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: decq %rcx
@@ -184,13 +184,13 @@ declare i1 @t2_func()
define i32 @t2() nounwind {
; CHECK-LABEL: t2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: movl $42, {{.*}}(%rip)
; CHECK-NEXT: callq t2_func
; CHECK-NEXT: testb $1, %al
; CHECK-NEXT: je .LBB4_2
-; CHECK-NEXT: # BB#1: # %a
+; CHECK-NEXT: # %bb.1: # %a
; CHECK-NEXT: movl {{.*}}(%rip), %eax
; CHECK-NEXT: popq %rcx
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/machine-outliner-debuginfo.ll b/test/CodeGen/X86/machine-outliner-debuginfo.ll
index 02d0964e37eb..3f6552ab2f73 100644
--- a/test/CodeGen/X86/machine-outliner-debuginfo.ll
+++ b/test/CodeGen/X86/machine-outliner-debuginfo.ll
@@ -48,7 +48,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="true"
!llvm.module.flags = !{!7, !8, !9}
!llvm.ident = !{!10}
-!0 = !DIGlobalVariableExpression(var: !1)
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true)
!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
!3 = !DIFile(filename: "debug-test.c", directory: "dir")
diff --git a/test/CodeGen/X86/machine-outliner-tailcalls.ll b/test/CodeGen/X86/machine-outliner-tailcalls.ll
index 020f7eeaaff3..b7426a9c30c7 100644
--- a/test/CodeGen/X86/machine-outliner-tailcalls.ll
+++ b/test/CodeGen/X86/machine-outliner-tailcalls.ll
@@ -32,4 +32,4 @@ attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false"
; CHECK-LABEL: l_OUTLINED_FUNCTION_0:
; CHECK: movl $0, (%rax)
; CHECK-NEXT: movl $1, %edi
-; CHECK-NEXT: jmp _ext
\ No newline at end of file
+; CHECK-NEXT: jmp _ext
diff --git a/test/CodeGen/X86/machine-outliner.ll b/test/CodeGen/X86/machine-outliner.ll
index b4a277ec2d82..29cf32f79852 100644
--- a/test/CodeGen/X86/machine-outliner.ll
+++ b/test/CodeGen/X86/machine-outliner.ll
@@ -96,14 +96,14 @@ define i32 @main() #0 {
attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="true" }
-; CHECK-LABEL: l_OUTLINED_FUNCTION_1:
+; CHECK-LABEL: l_OUTLINED_FUNCTION_{{[0-9]+}}:
; CHECK: movl $1, -{{[0-9]+}}(%rbp)
; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rbp)
; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rbp)
; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rbp)
; CHECK-NEXT: retq
-; CHECK-LABEL: l_OUTLINED_FUNCTION_0:
+; CHECK-LABEL: l_OUTLINED_FUNCTION_{{[0-9]+}}:
; CHECK: movl $1, -{{[0-9]+}}(%rbp)
; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rbp)
; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rbp)
diff --git a/test/CodeGen/X86/machine-region-info.mir b/test/CodeGen/X86/machine-region-info.mir
index 78823a3eb006..7704cb285601 100644
--- a/test/CodeGen/X86/machine-region-info.mir
+++ b/test/CodeGen/X86/machine-region-info.mir
@@ -53,12 +53,12 @@ body: |
...
# CHECK: Region tree:
-# CHECK-NEXT: [0] BB#0 => <Function Return>
-# CHECK-NEXT: [1] BB#0 => BB#11
-# CHECK-NEXT: [2] BB#7 => BB#9
-# CHECK-NEXT: [2] BB#9 => BB#11
-# CHECK-NEXT: [2] BB#1 => BB#11
-# CHECK-NEXT: [3] BB#2 => BB#5
-# CHECK-NEXT: [4] BB#3 => BB#5
-# CHECK-NEXT: [3] BB#5 => BB#11
+# CHECK-NEXT: [0] %bb.0 => <Function Return>
+# CHECK-NEXT: [1] %bb.0 => %bb.11
+# CHECK-NEXT: [2] %bb.7 => %bb.9
+# CHECK-NEXT: [2] %bb.9 => %bb.11
+# CHECK-NEXT: [2] %bb.1 => %bb.11
+# CHECK-NEXT: [3] %bb.2 => %bb.5
+# CHECK-NEXT: [4] %bb.3 => %bb.5
+# CHECK-NEXT: [3] %bb.5 => %bb.11
# CHECK-NEXT: End region tree
diff --git a/test/CodeGen/X86/machinesink-merge-debuginfo.ll b/test/CodeGen/X86/machinesink-merge-debuginfo.ll
new file mode 100644
index 000000000000..d8fcea1872e8
--- /dev/null
+++ b/test/CodeGen/X86/machinesink-merge-debuginfo.ll
@@ -0,0 +1,56 @@
+; RUN: llc -simplify-mir -stop-after=machine-sink < %s -o - | FileCheck %s
+
+; ModuleID = 'test-sink-debug.cpp'
+source_filename = "test-sink-debug.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone uwtable
+define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !7 {
+ tail call void @llvm.dbg.value(metadata double %x, metadata !13, metadata !DIExpression()), !dbg !16
+ tail call void @llvm.dbg.value(metadata double %y, metadata !14, metadata !DIExpression()), !dbg !16
+ tail call void @llvm.dbg.value(metadata i1 %c, metadata !15, metadata !DIExpression()), !dbg !16
+ %a = fdiv double %x, 3.000000e+00
+ %b = fdiv double %y, 5.000000e+00, !dbg !17
+ br i1 %c, label %first, label %second
+first:
+ %e = fadd double %a, 1.000000e+00
+ br label %final
+second:
+ %f = fadd double %b, 1.000000e+00, !dbg !17
+; CHECK: debug-location !17
+; CHECK: debug-location !17
+ br label %final
+final:
+ %cond = phi double [%e, %first], [%f, %second]
+ %d = fadd double %cond, 1.000000e+00
+ ret double %d
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 313291)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test-sink-debug.cpp", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 6.0.0 (trunk 313291)"}
+!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooddb", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !12)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !10, !10, !11}
+!10 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
+!12 = !{!13, !14, !15}
+!13 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!14 = !DILocalVariable(name: "y", arg: 2, scope: !7, file: !1, line: 1, type: !10)
+!15 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 1, type: !11)
+!16 = !DILocation(line: 1, column: 19, scope: !7)
+!17 = !DILocation(line: 2, column: 26, scope: !7)
diff --git a/test/CodeGen/X86/machinesink-null-debuginfo.ll b/test/CodeGen/X86/machinesink-null-debuginfo.ll
new file mode 100644
index 000000000000..454e0cd704ff
--- /dev/null
+++ b/test/CodeGen/X86/machinesink-null-debuginfo.ll
@@ -0,0 +1,49 @@
+; RUN: llc -simplify-mir -stop-after=machine-sink < %s -o - | FileCheck %s
+
+; ModuleID = 'test-sink-debug.cpp'
+source_filename = "test-sink-debug.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone uwtable
+define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !7 {
+ tail call void @llvm.dbg.value(metadata double %x, metadata !13, metadata !DIExpression()), !dbg !16
+ tail call void @llvm.dbg.value(metadata double %y, metadata !14, metadata !DIExpression()), !dbg !17
+ tail call void @llvm.dbg.value(metadata i1 %c, metadata !15, metadata !DIExpression()), !dbg !18
+ %a = fdiv double %x, 3.000000e+00
+ %b = fdiv double %y, 5.000000e+00, !dbg !21
+ %cond = select i1 %c, double %a, double %b
+; CHECK-NOT: debug-location !21
+ ret double %cond, !dbg !22
+}
+
+; Function Attrs: nounwind readnone speculatable
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+attributes #1 = { nounwind readnone speculatable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 313291)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test-sink-debug.cpp", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{!"clang version 6.0.0 (trunk 313291)"}
+!7 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooddb", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !12)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !10, !10, !11}
+!10 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!11 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean)
+!12 = !{!13, !14, !15}
+!13 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!14 = !DILocalVariable(name: "y", arg: 2, scope: !7, file: !1, line: 1, type: !10)
+!15 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 1, type: !11)
+!16 = !DILocation(line: 1, column: 19, scope: !7)
+!17 = !DILocation(line: 1, column: 29, scope: !7)
+!18 = !DILocation(line: 1, column: 37, scope: !7)
+!21 = !DILocation(line: 2, column: 26, scope: !7)
+!22 = !DILocation(line: 2, column: 3, scope: !7)
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index af86df510016..44e7b91eef81 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -6,22 +6,22 @@
define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB0_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi), %xmm2
-; SSE2-NEXT: movdqu (%rsi), %xmm3
+; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
+; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: addq $16, %rsi
-; SSE2-NEXT: addq $16, %rdi
-; SSE2-NEXT: addq $-8, %rax
+; SSE2-NEXT: addq $8, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB0_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -31,22 +31,20 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: retq
;
; AVX2-LABEL: _Z10test_shortPsS_i:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqu (%rsi), %xmm2
-; AVX2-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: addq $16, %rsi
-; AVX2-NEXT: addq $16, %rdi
-; AVX2-NEXT: addq $-8, %rax
+; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
+; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: addq $8, %rcx
+; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB0_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -57,22 +55,20 @@ define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly
; AVX2-NEXT: retq
;
; AVX512-LABEL: _Z10test_shortPsS_i:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
-; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB0_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512-NEXT: vmovdqu (%rsi), %xmm2
-; AVX512-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
-; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: addq $16, %rsi
-; AVX512-NEXT: addq $16, %rdi
-; AVX512-NEXT: addq $-8, %rax
+; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm1
+; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: addq $8, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB0_1
-; AVX512-NEXT: # BB#2: # %middle.block
+; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -115,15 +111,16 @@ middle.block:
define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: test_unsigned_short:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi), %xmm2
-; SSE2-NEXT: movdqu (%rsi), %xmm3
+; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
+; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pmulhuw %xmm2, %xmm4
; SSE2-NEXT: pmullw %xmm2, %xmm3
@@ -132,11 +129,10 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: paddd %xmm2, %xmm0
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: addq $16, %rsi
-; SSE2-NEXT: addq $16, %rdi
-; SSE2-NEXT: addq $-8, %rax
+; SSE2-NEXT: addq $8, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB1_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -146,9 +142,10 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; SSE2-NEXT: retq
;
; AVX2-LABEL: test_unsigned_short:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -156,11 +153,10 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: addq $16, %rsi
-; AVX2-NEXT: addq $16, %rdi
-; AVX2-NEXT: addq $-8, %rax
+; AVX2-NEXT: addq $8, %rcx
+; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB1_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -171,9 +167,10 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_unsigned_short:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
-; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB1_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
@@ -181,11 +178,10 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512-NEXT: vpmulld %ymm1, %ymm2, %ymm1
; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: addq $16, %rsi
-; AVX512-NEXT: addq $16, %rdi
-; AVX512-NEXT: addq $-8, %rax
+; AVX512-NEXT: addq $8, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB1_1
-; AVX512-NEXT: # BB#2: # %middle.block
+; AVX512-NEXT: # %bb.2: # %middle.block
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -228,9 +224,10 @@ middle.block:
define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z9test_charPcS_i:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %edx, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -263,11 +260,10 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: addq $16, %rsi
-; SSE2-NEXT: addq $16, %rdi
-; SSE2-NEXT: addq $-16, %rax
+; SSE2-NEXT: addq $16, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB2_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm3, %xmm0
; SSE2-NEXT: paddd %xmm2, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -279,22 +275,22 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; SSE2-NEXT: retq
;
; AVX2-LABEL: _Z9test_charPcS_i:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw (%rdi), %ymm2
-; AVX2-NEXT: vpmovsxbw (%rsi), %ymm3
+; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
+; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: addq $16, %rsi
-; AVX2-NEXT: addq $16, %rdi
-; AVX2-NEXT: addq $-16, %rax
+; AVX2-NEXT: addq $16, %rcx
+; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB2_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -306,30 +302,28 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
; AVX2-NEXT: retq
;
; AVX512-LABEL: _Z9test_charPcS_i:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: movl %edx, %eax
-; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; AVX512-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: xorl %ecx, %ecx
; AVX512-NEXT: .p2align 4, 0x90
; AVX512-NEXT: .LBB2_1: # %vector.body
; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512-NEXT: vpmovsxbw (%rdi), %ymm2
-; AVX512-NEXT: vpmovsxbw (%rsi), %ymm3
-; AVX512-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2
-; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: addq $16, %rsi
-; AVX512-NEXT: addq $16, %rdi
-; AVX512-NEXT: addq $-16, %rax
+; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm1
+; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm2
+; AVX512-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: addq $16, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB2_1
-; AVX512-NEXT: # BB#2: # %middle.block
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512-NEXT: # %bb.2: # %middle.block
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/mask-negated-bool.ll b/test/CodeGen/X86/mask-negated-bool.ll
index 779641cee7d2..b0147c3bb589 100644
--- a/test/CodeGen/X86/mask-negated-bool.ll
+++ b/test/CodeGen/X86/mask-negated-bool.ll
@@ -3,7 +3,7 @@
define i32 @mask_negated_zext_bool1(i1 %x) {
; CHECK-LABEL: mask_negated_zext_bool1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
@@ -15,8 +15,8 @@ define i32 @mask_negated_zext_bool1(i1 %x) {
define i32 @mask_negated_zext_bool2(i1 zeroext %x) {
; CHECK-LABEL: mask_negated_zext_bool2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%ext = zext i1 %x to i32
%neg = sub i32 0, %ext
@@ -26,7 +26,7 @@ define i32 @mask_negated_zext_bool2(i1 zeroext %x) {
define <4 x i32> @mask_negated_zext_bool_vec(<4 x i1> %x) {
; CHECK-LABEL: mask_negated_zext_bool_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%ext = zext <4 x i1> %x to <4 x i32>
@@ -37,7 +37,7 @@ define <4 x i32> @mask_negated_zext_bool_vec(<4 x i1> %x) {
define i32 @mask_negated_sext_bool1(i1 %x) {
; CHECK-LABEL: mask_negated_sext_bool1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
@@ -49,8 +49,8 @@ define i32 @mask_negated_sext_bool1(i1 %x) {
define i32 @mask_negated_sext_bool2(i1 zeroext %x) {
; CHECK-LABEL: mask_negated_sext_bool2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%ext = sext i1 %x to i32
%neg = sub i32 0, %ext
@@ -60,7 +60,7 @@ define i32 @mask_negated_sext_bool2(i1 zeroext %x) {
define <4 x i32> @mask_negated_sext_bool_vec(<4 x i1> %x) {
; CHECK-LABEL: mask_negated_sext_bool_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%ext = sext <4 x i1> %x to <4 x i32>
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index 8c0a4d4f1752..aca02a94dacc 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-- | FileCheck %s
; Optimize away zext-inreg and sext-inreg on the loop induction
; variable using trip-count information.
; CHECK-LABEL: count_up
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up(double* %d, i64 %n) nounwind {
@@ -38,7 +38,7 @@ return:
; CHECK-LABEL: count_down
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_down(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@ return:
; CHECK-LABEL: count_up_signed
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -106,7 +106,7 @@ return:
; CHECK-LABEL: count_down_signed
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_down_signed(double* %d, i64 %n) nounwind {
@@ -141,7 +141,7 @@ return:
; CHECK-LABEL: another_count_up
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_up(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@ return:
; CHECK-LABEL: another_count_down
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq $-8,
+; CHECK: addq $-8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_down(double* %d, i64 %n) nounwind {
@@ -207,7 +207,7 @@ return:
; CHECK-LABEL: another_count_up_signed
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_up_signed(double* %d, i64 %n) nounwind {
@@ -242,7 +242,7 @@ return:
; CHECK-LABEL: another_count_down_signed
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_down_signed(double* %d, i64 %n) nounwind {
diff --git a/test/CodeGen/X86/masked-iv-unsafe.ll b/test/CodeGen/X86/masked-iv-unsafe.ll
index 974a1cfb90d1..53a1f0619ff7 100644
--- a/test/CodeGen/X86/masked-iv-unsafe.ll
+++ b/test/CodeGen/X86/masked-iv-unsafe.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 > %t
+; RUN: llc < %s -mtriple=x86_64-- > %t
; RUN: grep and %t | count 6
; RUN: grep movzb %t | count 6
; RUN: grep sar %t | count 12
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 77254ba6760f..1eb2631e26ef 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,14 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
+@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
; SCALAR-LABEL: test1
; SCALAR: extractelement <16 x float*>
@@ -19,14 +18,14 @@ target triple = "x86_64-unknown-linux-gnu"
define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test1:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -34,14 +33,14 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test1:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test1:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -76,14 +75,14 @@ declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> ,
define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test2:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test2:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -91,14 +90,14 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test2:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test2:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -117,14 +116,14 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test3:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test3:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
@@ -132,14 +131,14 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test3:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test3:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
@@ -159,7 +158,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test4:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
@@ -169,7 +168,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test4:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
@@ -180,7 +179,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test4:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
@@ -190,7 +189,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test4:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
@@ -228,7 +227,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {
define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-LABEL: test5:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
@@ -237,7 +236,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
@@ -247,7 +246,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
@@ -256,7 +255,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test5:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
@@ -290,7 +289,7 @@ declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32
define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-LABEL: test6:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
@@ -299,9 +298,9 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test6:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
+; KNL_32: # %bb.0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm2
; KNL_32-NEXT: kxnorw %k0, %k0, %k2
; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2}
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1}
@@ -309,7 +308,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test6:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
@@ -318,7 +317,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test6:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: kxnorw %k0, %k0, %k2
; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
@@ -335,7 +334,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
;
; KNL_64-LABEL: test7:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: kmovw %k1, %k2
@@ -346,7 +345,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test7:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT: kmovw %ecx, %k1
@@ -359,7 +358,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test7:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovw %esi, %k1
; SKX-NEXT: kmovw %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
@@ -369,7 +368,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test7:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
@@ -394,7 +393,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
; each gather call will be split into two
define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-LABEL: test8:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kmovw %edi, %k1
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: kmovw %k2, %k3
@@ -409,7 +408,7 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test8:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
@@ -419,22 +418,22 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test8:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kmovw %edi, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: kmovw %k2, %k3
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT: kmovw %k1, %k3
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
-; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
+; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test8:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kmovw %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
@@ -459,9 +458,9 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-LABEL: test9:
-; KNL_64: # BB#0: # %entry
+; KNL_64: # %bb.0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
-; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
@@ -477,15 +476,15 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test9:
-; KNL_32: # BB#0: # %entry
+; KNL_32: # %bb.0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
-; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
+; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
-; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
+; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
+; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
@@ -493,21 +492,37 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
-; SKX-LABEL: test9:
-; SKX: # BB#0: # %entry
-; SKX-NEXT: vpbroadcastq %rdi, %zmm2
-; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
-; SKX-NEXT: retq
+; SKX_SMALL-LABEL: test9:
+; SKX_SMALL: # %bb.0: # %entry
+; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test9:
+; SKX_LARGE: # %bb.0: # %entry
+; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX_LARGE-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
+; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT: retq
;
; SKX_32-LABEL: test9:
-; SKX_32: # BB#0: # %entry
+; SKX_32: # %bb.0: # %entry
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
@@ -528,9 +543,9 @@ entry:
define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-LABEL: test10:
-; KNL_64: # BB#0: # %entry
+; KNL_64: # %bb.0: # %entry
; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2
-; KNL_64-NEXT: vpbroadcastq {{.*}}(%rip), %zmm3
+; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824]
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4
; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0
; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0
@@ -546,15 +561,15 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test10:
-; KNL_32: # BB#0: # %entry
+; KNL_32: # %bb.0: # %entry
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2
-; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
+; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80]
; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; KNL_32-NEXT: vpmovqd %zmm0, %ymm0
-; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm3
+; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
+; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
@@ -562,21 +577,37 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
; KNL_32-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
; KNL_32-NEXT: retl
;
-; SKX-LABEL: test10:
-; SKX: # BB#0: # %entry
-; SKX-NEXT: vpbroadcastq %rdi, %zmm2
-; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; SKX-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
-; SKX-NEXT: retq
+; SKX_SMALL-LABEL: test10:
+; SKX_SMALL: # %bb.0: # %entry
+; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_SMALL-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX_SMALL-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test10:
+; SKX_LARGE: # %bb.0: # %entry
+; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2
+; SKX_LARGE-NEXT: vpmovsxdq %ymm1, %zmm1
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
+; SKX_LARGE-NEXT: movabsq ${{\.LCPI.*}}, %rax
+; SKX_LARGE-NEXT: vpaddq (%rax){1to8}, %zmm0, %zmm1
+; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT: retq
;
; SKX_32-LABEL: test10:
-; SKX_32: # BB#0: # %entry
+; SKX_32: # %bb.0: # %entry
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm1, %ymm1
; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
; SKX_32-NEXT: vpmulld {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
@@ -598,14 +629,14 @@ entry:
; Splat index in GEP, requires broadcast
define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_64-LABEL: test11:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpbroadcastd %esi, %zmm1
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test11:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
@@ -613,14 +644,14 @@ define <16 x float> @test11(float* %base, i32 %ind) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test11:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpbroadcastd %esi, %zmm1
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test11:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
@@ -639,14 +670,14 @@ define <16 x float> @test11(float* %base, i32 %ind) {
; We are checking the uniform base here. It is taken directly from input to vgatherdps
define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test12:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test12:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -654,14 +685,14 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test12:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test12:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -678,14 +709,14 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
; The same as the previous, but the mask is undefined
define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test13:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT: vmovaps %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test13:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -693,14 +724,14 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test13:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test13:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
@@ -717,7 +748,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
; The base pointer is not splat, can't find a uniform base
define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-LABEL: test14:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
; KNL_64-NEXT: vmovd %esi, %xmm1
@@ -726,14 +757,12 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: kshiftrw $8, %k1, %k2
-; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
-; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
-; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
+; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test14:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
@@ -743,7 +772,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test14:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
; SKX-NEXT: vpbroadcastd %esi, %ymm1
@@ -751,14 +780,12 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
-; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
-; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
+; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test14:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
@@ -782,38 +809,33 @@ declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x
; Gather smaller than existing instruction
define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
-;
; KNL_64-LABEL: test15:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL_64-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_64-NEXT: vmovdqa %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm0
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL_32-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT: vmovdqa %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm2
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm0
; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1}
@@ -821,7 +843,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test15:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -837,16 +859,14 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
; Gather smaller than existing instruction
define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) {
-;
; KNL_64-LABEL: test16:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_64-NEXT: vmovdqa %ymm1, %ymm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -855,14 +875,13 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test16:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_32-NEXT: vmovdqa %ymm1, %ymm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
@@ -872,7 +891,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; KNL_32-NEXT: retl
;
; SKX-LABEL: test16:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1}
@@ -880,7 +899,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
; SKX-NEXT: retq
;
; SKX_32-LABEL: test16:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -895,13 +914,12 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x
}
define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
-;
; KNL_64-LABEL: test17:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vmovdqa %xmm1, %xmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
@@ -910,11 +928,11 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_32-NEXT: vmovdqa %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -924,7 +942,9 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
-; SKX: # BB#0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
+; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqpd (%rdi,%xmm0,8), %xmm2 {%k1}
@@ -932,7 +952,9 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
; SKX-NEXT: retq
;
; SKX_32-LABEL: test17:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
+; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -953,14 +975,11 @@ declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <
declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
-;
; KNL_64-LABEL: test18:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; KNL_64-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm2
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -968,12 +987,10 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -982,7 +999,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm2, %xmm2
; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
@@ -990,7 +1007,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test18:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1}
@@ -1000,16 +1017,14 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
}
define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind) {
-;
; KNL_64-LABEL: test19:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; KNL_64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_64-NEXT: vmovdqa %ymm1, %ymm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
@@ -1017,14 +1032,13 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; KNL_32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; KNL_32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm3, %zmm1
+; KNL_32-NEXT: vmovdqa %ymm1, %ymm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1033,7 +1047,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
@@ -1041,7 +1055,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
; SKX-NEXT: retq
;
; SKX_32-LABEL: test19:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1
; SKX_32-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1055,14 +1069,12 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
; Data type requires widening
define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
-;
; KNL_64-LABEL: test20:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
-; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vmovaps %xmm2, %xmm2
; KNL_64-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
@@ -1070,12 +1082,11 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
-; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm2, %xmm2
; KNL_32-NEXT: vpmovsxdq %ymm1, %zmm1
; KNL_32-NEXT: vpslld $31, %ymm2, %ymm2
; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1084,23 +1095,19 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
-; SKX: # BB#0:
-; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX: # %bb.0:
+; SKX-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vscatterqps %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test20:
-; SKX_32: # BB#0:
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vscatterdps %xmm0, (,%xmm1) {%k1}
; SKX_32-NEXT: retl
call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
@@ -1109,12 +1116,10 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
; Data type requires promotion
define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
-;
; KNL_64-LABEL: test21:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_64-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
@@ -1123,10 +1128,10 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test21:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_32-NEXT: vinserti32x4 $0, %xmm2, %zmm3, %zmm2
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
+; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
@@ -1135,24 +1140,21 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
-; SKX: # BB#0:
-; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX: # %bb.0:
+; SKX-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test21:
-; SKX_32: # BB#0:
-; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1
+; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
-; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vzeroupper
@@ -1165,15 +1167,12 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
-;
-;
; KNL_64-LABEL: test22:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm2 killed %xmm2 def %ymm2
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_64-NEXT: vmovaps %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1183,12 +1182,11 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm2 killed %xmm2 def %ymm2
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
@@ -1199,23 +1197,19 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22:
-; SKX: # BB#0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX-NEXT: kshiftlb $6, %k0, %k0
-; SKX-NEXT: kshiftrb $6, %k0, %k1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm2 {%k1}
; SKX-NEXT: vmovaps %xmm2, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test22:
-; SKX_32: # BB#0:
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
-; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k0
-; SKX_32-NEXT: kshiftlb $6, %k0, %k0
-; SKX_32-NEXT: kshiftrb $6, %k0, %k1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm2 {%k1}
; SKX_32-NEXT: vmovaps %xmm2, %xmm0
@@ -1228,12 +1222,11 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) {
; KNL_64-LABEL: test22a:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm2 killed %xmm2 def %ymm2
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL_64-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_64-NEXT: vmovaps %xmm1, %xmm1
; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
@@ -1242,12 +1235,11 @@ define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x f
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22a:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm2 killed %xmm2 def %ymm2
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; KNL_32-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; KNL_32-NEXT: vmovaps %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1257,7 +1249,7 @@ define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x f
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22a:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm2 {%k1}
@@ -1265,7 +1257,7 @@ define <2 x float> @test22a(float* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x f
; SKX-NEXT: retq
;
; SKX_32-LABEL: test22a:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1281,51 +1273,54 @@ declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <
declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
-;
; KNL_64-LABEL: test23:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
-; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
-; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_64-NEXT: vmovaps %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
-; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX-NEXT: retq
;
; SKX_32-LABEL: test23:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
@@ -1333,43 +1328,98 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
ret <2 x i32>%res
}
+define <2 x i32> @test23b(i32* %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) {
+; KNL_64-LABEL: test23b:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_64-NEXT: vmovaps %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1}
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test23b:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %ymm1, %ymm1
+; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1}
+; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test23b:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test23b:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
+; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SKX_32-NEXT: retl
+ %gep.random = getelementptr i32, i32* %base, <2 x i64> %ind
+ %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
+ ret <2 x i32>%res
+}
+
define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
; KNL_64-LABEL: test24:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
-; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
-; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm1, %zmm1
-; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
-; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
-; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
-; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT: movb $3, %cl
+; KNL_32-NEXT: kmovw %ecx, %k1
+; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
-; SKX: # BB#0:
-; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vpgatherqd (%rdi,%xmm0,4), %xmm1 {%k1}
-; SKX-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX: # %bb.0:
+; SKX-NEXT: movb $3, %al
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX-NEXT: retq
;
; SKX_32-LABEL: test24:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vpgatherqd (%eax,%xmm0,4), %xmm1 {%k1}
-; SKX_32-NEXT: vpmovsxdq %xmm1, %xmm0
+; SKX_32-NEXT: movb $3, %cl
+; SKX_32-NEXT: kmovw %ecx, %k1
+; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SKX_32-NEXT: vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SKX_32-NEXT: retl
%sext_ind = sext <2 x i32> %ind to <2 x i64>
%gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
@@ -1378,13 +1428,12 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
}
define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) {
-;
; KNL_64-LABEL: test25:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL_64-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_64-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_64-NEXT: vmovdqa %xmm1, %xmm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
@@ -1393,11 +1442,11 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM2<def> %XMM2<kill> %ZMM2<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL_32-NEXT: vpxord %zmm3, %zmm3, %zmm3
-; KNL_32-NEXT: vinserti32x4 $0, %xmm1, %zmm3, %zmm1
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm2 killed %xmm2 def %zmm2
+; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
+; KNL_32-NEXT: vmovdqa %xmm1, %xmm1
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -1407,7 +1456,9 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
-; SKX: # BB#0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
+; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm2 {%k1}
@@ -1415,7 +1466,9 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
; SKX-NEXT: retq
;
; SKX_32-LABEL: test25:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
+; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1
; SKX_32-NEXT: vptestmq %xmm1, %xmm1, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -1429,11 +1482,11 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
}
define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
-;
; KNL_64-LABEL: test26:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; KNL_64-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL_64-NEXT: vpsraq $32, %zmm0, %zmm0
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
@@ -1442,12 +1495,12 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test26:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; KNL_32-NEXT: vpsllq $32, %xmm0, %xmm0
+; KNL_32-NEXT: vpsraq $32, %zmm0, %zmm0
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
+; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
@@ -1456,14 +1509,18 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
-; SKX: # BB#0:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $32, %xmm0, %xmm0
+; SKX-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpgatherqq (%rdi,%xmm0,8), %xmm1 {%k1}
; SKX-NEXT: vmovdqa %xmm1, %xmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test26:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $32, %xmm0, %xmm0
+; SKX_32-NEXT: vpsraq $32, %xmm0, %xmm0
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherqq (%eax,%xmm0,8), %xmm1 {%k1}
@@ -1477,41 +1534,40 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
; Result type requires widening; all-ones mask
define <2 x float> @test27(float* %base, <2 x i32> %ind) {
-;
; KNL_64-LABEL: test27:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
-; KNL_64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test27:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm1
; KNL_32-NEXT: movb $3, %cl
; KNL_32-NEXT: kmovw %ecx, %k1
; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test27:
-; SKX: # BB#0:
-; SKX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SKX: # %bb.0:
+; SKX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; SKX-NEXT: retq
;
; SKX_32-LABEL: test27:
-; SKX_32: # BB#0:
-; SKX_32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movb $3, %cl
; SKX_32-NEXT: kmovw %ecx, %k1
@@ -1525,11 +1581,9 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
; Data type requires promotion, mask is all-ones
define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
-;
-;
; KNL_64-LABEL: test28:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
; KNL_64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; KNL_64-NEXT: movb $3, %al
; KNL_64-NEXT: kmovw %eax, %k1
@@ -1538,11 +1592,11 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test28:
-; KNL_32: # BB#0:
-; KNL_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
+; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL_32-NEXT: vinserti32x4 $0, {{\.LCPI.*}}, %zmm2, %zmm2
+; KNL_32-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,1,0]
; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
@@ -1550,8 +1604,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test28:
-; SKX: # BB#0:
-; SKX-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX: # %bb.0:
+; SKX-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
; SKX-NEXT: movb $3, %al
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1560,8 +1614,9 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test28:
-; SKX_32: # BB#0:
-; SKX_32-NEXT: # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1
+; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1
; SKX_32-NEXT: movb $3, %al
; SKX_32-NEXT: kmovw %eax, %k1
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1572,7 +1627,6 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
ret void
}
-
; SCALAR-LABEL: test29
; SCALAR: extractelement <16 x float*>
; SCALAR-NEXT: load float
@@ -1582,7 +1636,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-LABEL: test29:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: movw $44, %ax
; KNL_64-NEXT: kmovw %eax, %k1
; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
@@ -1590,7 +1644,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test29:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: movw $44, %cx
; KNL_32-NEXT: kmovw %ecx, %k1
@@ -1599,7 +1653,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; KNL_32-NEXT: retl
;
; SKX-LABEL: test29:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: movw $44, %ax
; SKX-NEXT: kmovw %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
@@ -1607,7 +1661,7 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; SKX-NEXT: retq
;
; SKX_32-LABEL: test29:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: movw $44, %cx
; SKX_32-NEXT: kmovw %ecx, %k1
@@ -1628,8 +1682,159 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
; Check non-power-of-2 case. It should be scalarized.
declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
-; ALL-LABEL: test30
-; ALL-NOT: gather
+; KNL_64-LABEL: test30:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
+; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
+; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; KNL_64-NEXT: testb $1, %dil
+; KNL_64-NEXT: # implicit-def: %xmm0
+; KNL_64-NEXT: jne .LBB31_1
+; KNL_64-NEXT: # %bb.2: # %else
+; KNL_64-NEXT: testb $1, %sil
+; KNL_64-NEXT: jne .LBB31_3
+; KNL_64-NEXT: .LBB31_4: # %else2
+; KNL_64-NEXT: testb $1, %dl
+; KNL_64-NEXT: jne .LBB31_5
+; KNL_64-NEXT: .LBB31_6: # %else5
+; KNL_64-NEXT: vmovd %edi, %xmm1
+; KNL_64-NEXT: vpinsrb $4, %esi, %xmm1, %xmm1
+; KNL_64-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1
+; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_64-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: retq
+; KNL_64-NEXT: .LBB31_1: # %cond.load
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_64-NEXT: testb $1, %sil
+; KNL_64-NEXT: je .LBB31_4
+; KNL_64-NEXT: .LBB31_3: # %cond.load1
+; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
+; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
+; KNL_64-NEXT: testb $1, %dl
+; KNL_64-NEXT: je .LBB31_6
+; KNL_64-NEXT: .LBB31_5: # %cond.load4
+; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL_64-NEXT: vmovq %xmm1, %rax
+; KNL_64-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
+; KNL_64-NEXT: jmp .LBB31_6
+;
+; KNL_32-LABEL: test30:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: pushl %esi
+; KNL_32-NEXT: .cfi_def_cfa_offset 8
+; KNL_32-NEXT: .cfi_offset %esi, -8
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
+; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm1
+; KNL_32-NEXT: testb $1, %dl
+; KNL_32-NEXT: # implicit-def: %xmm0
+; KNL_32-NEXT: jne .LBB31_1
+; KNL_32-NEXT: # %bb.2: # %else
+; KNL_32-NEXT: testb $1, %cl
+; KNL_32-NEXT: jne .LBB31_3
+; KNL_32-NEXT: .LBB31_4: # %else2
+; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: jne .LBB31_5
+; KNL_32-NEXT: .LBB31_6: # %else5
+; KNL_32-NEXT: vmovd %edx, %xmm1
+; KNL_32-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
+; KNL_32-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: popl %esi
+; KNL_32-NEXT: retl
+; KNL_32-NEXT: .LBB31_1: # %cond.load
+; KNL_32-NEXT: vmovd %xmm1, %esi
+; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_32-NEXT: testb $1, %cl
+; KNL_32-NEXT: je .LBB31_4
+; KNL_32-NEXT: .LBB31_3: # %cond.load1
+; KNL_32-NEXT: vpextrd $1, %xmm1, %esi
+; KNL_32-NEXT: vpinsrd $1, (%esi), %xmm0, %xmm0
+; KNL_32-NEXT: testb $1, %al
+; KNL_32-NEXT: je .LBB31_6
+; KNL_32-NEXT: .LBB31_5: # %cond.load4
+; KNL_32-NEXT: vpextrd $2, %xmm1, %esi
+; KNL_32-NEXT: vpinsrd $2, (%esi), %xmm0, %xmm0
+; KNL_32-NEXT: jmp .LBB31_6
+;
+; SKX-LABEL: test30:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld $31, %xmm2, %xmm2
+; SKX-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX-NEXT: kmovw %k1, %eax
+; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
+; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
+; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: # implicit-def: %xmm0
+; SKX-NEXT: je .LBB31_2
+; SKX-NEXT: # %bb.1: # %cond.load
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT: .LBB31_2: # %else
+; SKX-NEXT: kshiftrw $1, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: je .LBB31_4
+; SKX-NEXT: # %bb.3: # %cond.load1
+; SKX-NEXT: vpextrq $1, %xmm1, %rax
+; SKX-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB31_4: # %else2
+; SKX-NEXT: kshiftrw $2, %k1, %k0
+; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: testb $1, %al
+; SKX-NEXT: je .LBB31_6
+; SKX-NEXT: # %bb.5: # %cond.load4
+; SKX-NEXT: vextracti128 $1, %ymm1, %xmm1
+; SKX-NEXT: vmovq %xmm1, %rax
+; SKX-NEXT: vpinsrd $2, (%rax), %xmm0, %xmm0
+; SKX-NEXT: .LBB31_6: # %else5
+; SKX-NEXT: vmovdqa32 %xmm0, %xmm3 {%k1}
+; SKX-NEXT: vmovdqa %xmm3, %xmm0
+; SKX-NEXT: vzeroupper
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test30:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: subl $12, %esp
+; SKX_32-NEXT: .cfi_def_cfa_offset 16
+; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2
+; SKX_32-NEXT: vptestmd %xmm2, %xmm2, %k1
+; SKX_32-NEXT: kmovw %k1, %eax
+; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
+; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm2
+; SKX_32-NEXT: testb $1, %al
+; SKX_32-NEXT: # implicit-def: %xmm1
+; SKX_32-NEXT: je .LBB31_2
+; SKX_32-NEXT: # %bb.1: # %cond.load
+; SKX_32-NEXT: vmovd %xmm2, %eax
+; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SKX_32-NEXT: .LBB31_2: # %else
+; SKX_32-NEXT: kshiftrw $1, %k1, %k0
+; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: testb $1, %al
+; SKX_32-NEXT: je .LBB31_4
+; SKX_32-NEXT: # %bb.3: # %cond.load1
+; SKX_32-NEXT: vpextrd $1, %xmm2, %eax
+; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: .LBB31_4: # %else2
+; SKX_32-NEXT: vmovdqa {{[0-9]+}}(%esp), %xmm0
+; SKX_32-NEXT: kshiftrw $2, %k1, %k0
+; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: testb $1, %al
+; SKX_32-NEXT: je .LBB31_6
+; SKX_32-NEXT: # %bb.5: # %cond.load4
+; SKX_32-NEXT: vpextrd $2, %xmm2, %eax
+; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm1
+; SKX_32-NEXT: .LBB31_6: # %else5
+; SKX_32-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+; SKX_32-NEXT: addl $12, %esp
+; SKX_32-NEXT: retl
%sext_ind = sext <3 x i32> %ind to <3 x i64>
%gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
@@ -1638,42 +1843,36 @@ define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x
}
declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
-
-; KNL-LABEL: test31
-; KNL: vpgatherqq
-; KNL: vpgatherqq
define <16 x float*> @test31(<16 x float**> %ptrs) {
; KNL_64-LABEL: test31:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: kxnorw %k0, %k0, %k1
; KNL_64-NEXT: kxnorw %k0, %k0, %k2
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
-; KNL_64-NEXT: kshiftrw $8, %k1, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0
; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test31:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test31:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: kxnorw %k0, %k0, %k2
; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
-; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test31:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -1685,7 +1884,7 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_gather_16i32:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1697,7 +1896,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i32:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1706,19 +1905,19 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
-; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i32:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1730,7 +1929,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
}
define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_gather_16i64:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1742,14 +1941,11 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16i64:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi0:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi1:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi2:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1767,7 +1963,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1779,14 +1975,11 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i64:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: pushl %ebp
-; SKX_32-NEXT: .Lcfi1:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: .Lcfi2:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
-; SKX_32-NEXT: .Lcfi3:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
@@ -1796,7 +1989,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
-; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
@@ -1808,7 +2001,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_gather_16f32:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1820,7 +2013,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f32:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1829,19 +2022,19 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
-; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
+; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f32:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1853,7 +2046,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
}
define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_gather_16f64:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1865,14 +2058,11 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_gather_16f64:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi3:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi4:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi5:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -1882,7 +2072,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
-; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; KNL_32-NEXT: vmovapd %zmm2, %zmm0
; KNL_32-NEXT: movl %ebp, %esp
@@ -1890,7 +2080,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_gather_16f64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1902,14 +2092,11 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f64:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: pushl %ebp
-; SKX_32-NEXT: .Lcfi4:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: .Lcfi5:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
-; SKX_32-NEXT: .Lcfi6:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
@@ -1919,7 +2106,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
-; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
@@ -1931,7 +2118,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: test_scatter_16i32:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1943,7 +2130,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1952,19 +2139,19 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
-; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -1976,7 +2163,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
}
define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0) {
; KNL_64-LABEL: test_scatter_16i64:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -1987,14 +2174,11 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi6:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi7:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi8:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -2012,7 +2196,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -2023,14 +2207,11 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i64:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: pushl %ebp
-; SKX_32-NEXT: .Lcfi7:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: .Lcfi8:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
-; SKX_32-NEXT: .Lcfi9:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
@@ -2040,7 +2221,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
-; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
@@ -2052,7 +2233,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0) {
; KNL_64-LABEL: test_scatter_16f32:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -2064,7 +2245,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f32:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -2073,19 +2254,19 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f32:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
-; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
+; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f32:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1
@@ -2098,7 +2279,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_16f64:
-; KNL_64: # BB#0:
+; KNL_64: # %bb.0:
; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2
; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -2109,14 +2290,11 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f64:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi9:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi10:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi11:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-64, %esp
; KNL_32-NEXT: subl $64, %esp
@@ -2126,7 +2304,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1
; KNL_32-NEXT: kshiftrw $8, %k1, %k2
; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
-; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
@@ -2134,7 +2312,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f64:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
@@ -2145,14 +2323,11 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f64:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: pushl %ebp
-; SKX_32-NEXT: .Lcfi10:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: .Lcfi11:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
-; SKX_32-NEXT: .Lcfi12:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
@@ -2162,7 +2337,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
-; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
@@ -2175,13 +2350,12 @@ declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x dou
define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
; KNL_64-LABEL: test_pr28312:
-; KNL_64: # BB#0:
-; KNL_64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_64-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL_64-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL_64-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL_64-NEXT: vmovdqa %ymm1, %ymm1
; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm1 {%k1}
@@ -2190,23 +2364,19 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_pr28312:
-; KNL_32: # BB#0:
+; KNL_32: # %bb.0:
; KNL_32-NEXT: pushl %ebp
-; KNL_32-NEXT: .Lcfi12:
; KNL_32-NEXT: .cfi_def_cfa_offset 8
-; KNL_32-NEXT: .Lcfi13:
; KNL_32-NEXT: .cfi_offset %ebp, -8
; KNL_32-NEXT: movl %esp, %ebp
-; KNL_32-NEXT: .Lcfi14:
; KNL_32-NEXT: .cfi_def_cfa_register %ebp
; KNL_32-NEXT: andl $-32, %esp
; KNL_32-NEXT: subl $32, %esp
-; KNL_32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1
; KNL_32-NEXT: vpsrad $31, %xmm1, %xmm1
; KNL_32-NEXT: vpmovsxdq %xmm1, %ymm1
-; KNL_32-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; KNL_32-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL_32-NEXT: vmovdqa %ymm1, %ymm1
; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0
; KNL_32-NEXT: vpsllq $63, %zmm1, %zmm1
; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -2218,7 +2388,7 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_pr28312:
-; SKX: # BB#0:
+; SKX: # %bb.0:
; SKX-NEXT: vpslld $31, %xmm1, %xmm1
; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1
; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1}
@@ -2227,14 +2397,11 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_pr28312:
-; SKX_32: # BB#0:
+; SKX_32: # %bb.0:
; SKX_32-NEXT: pushl %ebp
-; SKX_32-NEXT: .Lcfi13:
; SKX_32-NEXT: .cfi_def_cfa_offset 8
-; SKX_32-NEXT: .Lcfi14:
; SKX_32-NEXT: .cfi_offset %ebp, -8
; SKX_32-NEXT: movl %esp, %ebp
-; SKX_32-NEXT: .Lcfi15:
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-32, %esp
; SKX_32-NEXT: subl $32, %esp
@@ -2254,3 +2421,346 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
ret <4 x i64> %b
}
declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
+
+define <8 x i32> @test_global_array(<8 x i64> %indxs) {
+; KNL_64-LABEL: test_global_array:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; KNL_64-NEXT: vmovdqa %ymm1, %ymm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_global_array:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; KNL_32-NEXT: vmovdqa %ymm1, %ymm0
+; KNL_32-NEXT: retl
+;
+; SKX_SMALL-LABEL: test_global_array:
+; SKX_SMALL: # %bb.0:
+; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0
+; SKX_SMALL-NEXT: retq
+;
+; SKX_LARGE-LABEL: test_global_array:
+; SKX_LARGE: # %bb.0:
+; SKX_LARGE-NEXT: movabsq $glob_array, %rax
+; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
+; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0
+; SKX_LARGE-NEXT: retq
+;
+; SKX_32-LABEL: test_global_array:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; SKX_32-NEXT: vmovdqa %ymm1, %ymm0
+; SKX_32-NEXT: retl
+ %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <8 x i64> %indxs
+ %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+ ret <8 x i32> %g
+}
+
+define void @v1_scatter(<1 x i32>%a1, <1 x i32*> %ptr, <1 x i1> %mask) {
+; KNL_64-LABEL: v1_scatter:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: testb $1, %dl
+; KNL_64-NEXT: jne .LBB43_1
+; KNL_64-NEXT: # %bb.2: # %else
+; KNL_64-NEXT: retq
+; KNL_64-NEXT: .LBB43_1: # %cond.store
+; KNL_64-NEXT: movl %edi, (%rsi)
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: v1_scatter:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; KNL_32-NEXT: jne .LBB43_1
+; KNL_32-NEXT: # %bb.2: # %else
+; KNL_32-NEXT: retl
+; KNL_32-NEXT: .LBB43_1: # %cond.store
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; KNL_32-NEXT: movl %ecx, (%eax)
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: v1_scatter:
+; SKX: # %bb.0:
+; SKX-NEXT: testb $1, %dl
+; SKX-NEXT: jne .LBB43_1
+; SKX-NEXT: # %bb.2: # %else
+; SKX-NEXT: retq
+; SKX-NEXT: .LBB43_1: # %cond.store
+; SKX-NEXT: movl %edi, (%rsi)
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: v1_scatter:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp)
+; SKX_32-NEXT: jne .LBB43_1
+; SKX_32-NEXT: # %bb.2: # %else
+; SKX_32-NEXT: retl
+; SKX_32-NEXT: .LBB43_1: # %cond.store
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SKX_32-NEXT: movl %ecx, (%eax)
+; SKX_32-NEXT: retl
+ call void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32> %a1, <1 x i32*> %ptr, i32 4, <1 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v1i32.v1p0i32(<1 x i32>, <1 x i32*>, i32, <1 x i1>)
+
+define <1 x i32> @v1_gather(<1 x i32*> %ptr, <1 x i1> %mask, <1 x i32> %src0) {
+; KNL_64-LABEL: v1_gather:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: movl (%rdi), %eax
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: v1_gather:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: movl (%eax), %eax
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: v1_gather:
+; SKX: # %bb.0:
+; SKX-NEXT: movl (%rdi), %eax
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: v1_gather:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: movl (%eax), %eax
+; SKX_32-NEXT: retl
+ %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0)
+ ret <1 x i32>%res
+}
+declare <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*>, i32, <1 x i1>, <1 x i32>)
+
+; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result.
+; This case previously hit a bad interaction when we widened and then tried to split.
+define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) {
+; KNL_64-LABEL: large_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
+; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; KNL_64-NEXT: vmovaps %xmm0, %xmm0
+; KNL_64-NEXT: vmovq %rcx, %xmm2
+; KNL_64-NEXT: vmovq %rsi, %xmm3
+; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; KNL_64-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm2,4), %ymm1 {%k1}
+; KNL_64-NEXT: vmovaps %xmm1, %xmm0
+; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: large_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm1 killed %xmm1 def %ymm1
+; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; KNL_32-NEXT: vmovaps %xmm0, %xmm0
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; KNL_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; KNL_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2
+; KNL_32-NEXT: vpslld $31, %ymm0, %ymm0
+; KNL_32-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm2,4), %ymm1 {%k1}
+; KNL_32-NEXT: vmovaps %xmm1, %xmm0
+; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: large_index:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT: vmovq %rcx, %xmm0
+; SKX-NEXT: vmovq %rsi, %xmm2
+; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; SKX-NEXT: vgatherqps (%rdi,%xmm0,4), %xmm1 {%k1}
+; SKX-NEXT: vmovaps %xmm1, %xmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: large_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0
+; SKX_32-NEXT: vptestmq %xmm0, %xmm0, %k1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; SKX_32-NEXT: vgatherqps (%eax,%xmm0,4), %xmm1 {%k1}
+; SKX_32-NEXT: vmovaps %xmm1, %xmm0
+; SKX_32-NEXT: retl
+ %gep.random = getelementptr float, float* %base, <2 x i128> %ind
+ %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
+ ret <2 x float>%res
+}
+
+; Make sure we allow the index to be sign extended from an element type smaller than i32.
+define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {
+; KNL_64-LABEL: sext_i8_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovsxbw %xmm0, %ymm0
+; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm1
+; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL_64-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: kxnorw %k0, %k0, %k2
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: sext_i8_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpmovsxbw %xmm0, %ymm0
+; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm1
+; KNL_32-NEXT: vextracti128 $1, %ymm0, %xmm0
+; KNL_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: kxnorw %k0, %k0, %k2
+; KNL_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
+; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: sext_i8_index:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbw %xmm0, %ymm0
+; SKX-NEXT: vpmovsxwq %xmm0, %zmm1
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: kxnorw %k0, %k0, %k2
+; SKX-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2}
+; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: sext_i8_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpmovsxbw %xmm0, %ymm0
+; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm1
+; SKX_32-NEXT: vextracti128 $1, %ymm0, %xmm0
+; SKX_32-NEXT: vpmovsxwq %xmm0, %zmm0
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: kxnorw %k0, %k0, %k2
+; SKX_32-NEXT: vgatherqps (%eax,%zmm0,4), %ymm2 {%k2}
+; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; SKX_32-NEXT: retl
+
+ %sext_ind = sext <16 x i8> %ind to <16 x i64>
+ %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind
+
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+ ret <16 x float>%res
+}
+
+; Make sure we allow the index to be sign extended from an element type smaller than i32.
+define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
+; KNL_64-LABEL: sext_v8i8_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_64-NEXT: vpsllq $56, %zmm0, %zmm0
+; KNL_64-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_64-NEXT: kxnorw %k0, %k0, %k1
+; KNL_64-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: sext_v8i8_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpsllq $56, %zmm0, %zmm0
+; KNL_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; KNL_32-NEXT: kxnorw %k0, %k0, %k1
+; KNL_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: sext_v8i8_index:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX-NEXT: vpsllq $56, %zmm0, %zmm0
+; SKX-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX-NEXT: kxnorw %k0, %k0, %k1
+; SKX-NEXT: vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: sext_v8i8_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vpsllq $56, %zmm0, %zmm0
+; SKX_32-NEXT: vpsraq $56, %zmm0, %zmm1
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
+; SKX_32-NEXT: retl
+
+ %sext_ind = sext <8 x i8> %ind to <8 x i64>
+ %gep.random = getelementptr float, float *%base, <8 x i64> %sext_ind
+
+ %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
+ ret <8 x float>%res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
+
+; The index requires promotion.
+define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32> %ind, <2 x i1> %mask) {
+; KNL_64-LABEL: test_scatter_2i32_index:
+; KNL_64: # %bb.0:
+; KNL_64-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; KNL_64-NEXT: vpsllq $32, %xmm1, %xmm1
+; KNL_64-NEXT: vpsraq $32, %zmm1, %zmm1
+; KNL_64-NEXT: vmovdqa %xmm2, %xmm2
+; KNL_64-NEXT: vpsllq $63, %zmm2, %zmm2
+; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_64-NEXT: vscatterqpd %zmm0, (%rdi,%zmm1,8) {%k1}
+; KNL_64-NEXT: vzeroupper
+; KNL_64-NEXT: retq
+;
+; KNL_32-LABEL: test_scatter_2i32_index:
+; KNL_32: # %bb.0:
+; KNL_32-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; KNL_32-NEXT: vpsllq $32, %xmm1, %xmm1
+; KNL_32-NEXT: vpsraq $32, %zmm1, %zmm1
+; KNL_32-NEXT: vmovdqa %xmm2, %xmm2
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT: vpsllq $63, %zmm2, %zmm2
+; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; KNL_32-NEXT: vscatterqpd %zmm0, (%eax,%zmm1,8) {%k1}
+; KNL_32-NEXT: vzeroupper
+; KNL_32-NEXT: retl
+;
+; SKX-LABEL: test_scatter_2i32_index:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
+; SKX-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX-NEXT: vpsllq $32, %xmm1, %xmm1
+; SKX-NEXT: vpsraq $32, %xmm1, %xmm1
+; SKX-NEXT: vscatterqpd %xmm0, (%rdi,%xmm1,8) {%k1}
+; SKX-NEXT: retq
+;
+; SKX_32-LABEL: test_scatter_2i32_index:
+; SKX_32: # %bb.0:
+; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
+; SKX_32-NEXT: vptestmq %xmm2, %xmm2, %k1
+; SKX_32-NEXT: vpsllq $32, %xmm1, %xmm1
+; SKX_32-NEXT: vpsraq $32, %xmm1, %xmm1
+; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: vscatterqpd %xmm0, (%eax,%xmm1,8) {%k1}
+; SKX_32-NEXT: retl
+ %gep = getelementptr double, double *%base, <2 x i32> %ind
+ call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
+ ret void
+}
+declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
+
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 7a2e41e10a37..82f097e4e0f7 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -8,9 +8,89 @@
; that does not have AVX, but that case should probably be a separate test file using fewer tests,
; because it takes over 1.2 seconds to codegen these tests on a 4GHz Haswell if there's no maskmov.
+define <1 x double> @loadv1(<1 x i64> %trigger, <1 x double>* %addr, <1 x double> %dst) {
+; AVX-LABEL: loadv1:
+; AVX: ## %bb.0:
+; AVX-NEXT: testq %rdi, %rdi
+; AVX-NEXT: ## implicit-def: %xmm1
+; AVX-NEXT: je LBB0_1
+; AVX-NEXT: ## %bb.2: ## %else
+; AVX-NEXT: testq %rdi, %rdi
+; AVX-NEXT: jne LBB0_3
+; AVX-NEXT: LBB0_4: ## %else
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+; AVX-NEXT: LBB0_1: ## %cond.load
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: testq %rdi, %rdi
+; AVX-NEXT: je LBB0_4
+; AVX-NEXT: LBB0_3: ## %else
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: vmovaps %xmm1, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: loadv1:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: testq %rdi, %rdi
+; AVX512F-NEXT: ## implicit-def: %xmm1
+; AVX512F-NEXT: jne LBB0_2
+; AVX512F-NEXT: ## %bb.1: ## %cond.load
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: LBB0_2: ## %else
+; AVX512F-NEXT: testq %rdi, %rdi
+; AVX512F-NEXT: sete %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: loadv1:
+; SKX: ## %bb.0:
+; SKX-NEXT: testq %rdi, %rdi
+; SKX-NEXT: ## implicit-def: %xmm1
+; SKX-NEXT: jne LBB0_2
+; SKX-NEXT: ## %bb.1: ## %cond.load
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; SKX-NEXT: LBB0_2: ## %else
+; SKX-NEXT: testq %rdi, %rdi
+; SKX-NEXT: sete %al
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; SKX-NEXT: retq
+ %mask = icmp eq <1 x i64> %trigger, zeroinitializer
+ %res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1>%mask, <1 x double>%dst)
+ ret <1 x double> %res
+}
+declare <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>*, i32, <1 x i1>, <1 x double>)
+
+define void @storev1(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %val) {
+; AVX-LABEL: storev1:
+; AVX: ## %bb.0:
+; AVX-NEXT: testl %edi, %edi
+; AVX-NEXT: je LBB1_1
+; AVX-NEXT: ## %bb.2: ## %else
+; AVX-NEXT: retq
+; AVX-NEXT: LBB1_1: ## %cond.store
+; AVX-NEXT: movl %edx, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: storev1:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: testl %edi, %edi
+; AVX512-NEXT: je LBB1_1
+; AVX512-NEXT: ## %bb.2: ## %else
+; AVX512-NEXT: retq
+; AVX512-NEXT: LBB1_1: ## %cond.store
+; AVX512-NEXT: movl %edx, (%rsi)
+; AVX512-NEXT: retq
+ %mask = icmp eq <1 x i32> %trigger, zeroinitializer
+ call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>%val, <1 x i32>* %addr, i32 4, <1 x i1>%mask)
+ ret void
+}
+declare void @llvm.masked.store.v1i32.p0v1i32(<1 x i32>, <1 x i32>*, i32, <1 x i1>)
+
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: test6:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
@@ -18,7 +98,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
; AVX-NEXT: retq
;
; AVX512F-LABEL: test6:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2
@@ -26,7 +106,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
; AVX512F-NEXT: retq
;
; SKX-LABEL: test6:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
@@ -38,7 +118,7 @@ define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double>
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: test7:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
@@ -46,7 +126,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
; AVX-NEXT: retq
;
; AVX512F-LABEL: test7:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
@@ -54,7 +134,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
; AVX512F-NEXT: retq
;
; SKX-LABEL: test7:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
; SKX-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
@@ -66,7 +146,7 @@ define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %d
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: test8:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
@@ -74,7 +154,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
@@ -82,7 +162,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test8:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
@@ -90,7 +170,7 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
; AVX512F-NEXT: retq
;
; SKX-LABEL: test8:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
@@ -102,28 +182,28 @@ define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test9:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test9:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; SKX-LABEL: test9:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
@@ -135,7 +215,7 @@ define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
@@ -147,7 +227,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -156,7 +236,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test10:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -165,7 +245,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
; AVX512F-NEXT: retq
;
; SKX-LABEL: test10:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %xmm2, %xmm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
@@ -177,7 +257,7 @@ define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double
define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
; AVX1-LABEL: test10b:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
@@ -188,7 +268,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -196,7 +276,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test10b:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -204,7 +284,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl
; AVX512F-NEXT: retq
;
; SKX-LABEL: test10b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1
; SKX-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
@@ -216,7 +296,7 @@ define <4 x double> @test10b(<4 x i32> %trigger, <4 x double>* %addr, <4 x doubl
define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
; AVX1-LABEL: test11a:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
@@ -227,28 +307,28 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float>
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11a:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11a:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
; AVX512F-NEXT: vblendmps (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11a:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; SKX: ## %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; SKX-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
@@ -259,7 +339,7 @@ define <8 x float> @test11a(<8 x i32> %trigger, <8 x float>* %addr, <8 x float>
define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX1-LABEL: test11b:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
@@ -272,7 +352,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
@@ -281,19 +361,17 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11b:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vpblendmd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
@@ -304,7 +382,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) {
define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX1-LABEL: test11c:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
@@ -316,7 +394,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11c:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
@@ -324,18 +402,16 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11c:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11c:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
@@ -346,7 +422,7 @@ define <8 x float> @test11c(<8 x i1> %mask, <8 x float>* %addr) {
define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX1-LABEL: test11d:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
@@ -358,7 +434,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
@@ -366,18 +442,16 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test11d:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k0
-; AVX512F-NEXT: kshiftrw $8, %k0, %k1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test11d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $15, %xmm0, %xmm0
; SKX-NEXT: vpmovw2m %xmm0, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
@@ -388,7 +462,7 @@ define <8 x i32> @test11d(<8 x i1> %mask, <8 x i32>* %addr) {
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX1-LABEL: test12:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
@@ -399,18 +473,18 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test12:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; AVX512F-NEXT: kshiftlw $8, %k0, %k0
; AVX512F-NEXT: kshiftrw $8, %k0, %k1
@@ -419,8 +493,8 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
; AVX512F-NEXT: retq
;
; SKX-LABEL: test12:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; SKX: ## %bb.0:
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1
; SKX-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; SKX-NEXT: vzeroupper
@@ -432,7 +506,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX1-LABEL: test14:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -441,7 +515,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -450,7 +524,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test14:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -459,7 +533,7 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
; AVX512F-NEXT: retq
;
; SKX-LABEL: test14:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
@@ -472,17 +546,17 @@ define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX1-LABEL: test15:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -492,7 +566,7 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test15:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -502,7 +576,7 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
; AVX512F-NEXT: retq
;
; SKX-LABEL: test15:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
@@ -515,7 +589,7 @@ define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
; AVX1-LABEL: test16:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -525,7 +599,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -535,7 +609,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test16:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
@@ -545,7 +619,7 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
; AVX512F-NEXT: retq
;
; SKX-LABEL: test16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
@@ -558,43 +632,43 @@ define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
; AVX1-LABEL: test17:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test17:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test17:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512F-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: test17:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; SKX-NEXT: vpcmpeqq %xmm2, %xmm0, %k1
@@ -609,7 +683,7 @@ define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX1-LABEL: test18:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
@@ -618,7 +692,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test18:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
@@ -627,7 +701,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test18:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX512F-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
@@ -636,7 +710,7 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
; AVX512F-NEXT: retq
;
; SKX-LABEL: test18:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1
@@ -649,18 +723,18 @@ define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
; AVX-LABEL: load_all:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: load_all:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmaskmovps (%rdi), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: load_all:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; SKX-NEXT: retq
@@ -675,19 +749,19 @@ define <4 x float> @load_all(<4 x i32> %trigger, <4 x float>* %addr) {
define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst) {
; AVX-LABEL: mload_constmask_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2,3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f32:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [4294967295,0,4294967295,4294967295]
; AVX512F-NEXT: vmaskmovps (%rdi), %xmm1, %xmm2
; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $13, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovups (%rdi), %xmm0 {%k1}
@@ -700,28 +774,28 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst)
define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; AVX1-LABEL: mload_constmask_v4i32:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX1-NEXT: vmaskmovps (%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i32:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i32:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4294967295,4294967295,4294967295]
; AVX512F-NEXT: vpmaskmovd (%rdi), %xmm1, %xmm2
; AVX512F-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $14, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
@@ -734,23 +808,23 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst) {
; AVX-LABEL: mload_constmask_v8f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,0,0,0,0,0]
; AVX-NEXT: vmaskmovps (%rdi), %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f32:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: movw $7, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovups (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovups (%rdi), %ymm0 {%k1}
@@ -761,21 +835,21 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst)
define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %dst) {
; AVX-LABEL: mload_constmask_v4f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm1, %ymm2
; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1}
@@ -787,27 +861,22 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %ds
; 256-bit integer vectors are supported with AVX2.
define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
-; AVX1-LABEL: mload_constmask_v8i32:
-; AVX1: ## BB#0:
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: mload_constmask_v8i32:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
-; AVX2-NEXT: retq
+; AVX-LABEL: mload_constmask_v8i32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2],ymm0[3,4,5,6],mem[7]
+; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8i32:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: movw $135, %ax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: ## kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8i32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
@@ -818,24 +887,24 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; AVX1-LABEL: mload_constmask_v4i64:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = mem[0],ymm0[1,2],mem[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i64:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2,3,4,5],mem[6,7]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,0,0,18446744073709551615]
; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm1, %ymm2
; AVX512F-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $9, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
@@ -848,20 +917,20 @@ define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %dst) {
; AVX-LABEL: mload_constmask_v8f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],mem[3]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v8f64:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: movb $-121, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v8f64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $-121, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1}
@@ -874,19 +943,19 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds
define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr) {
; AVX-LABEL: mload_constmask_v4f64_undef_passthrough:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4f64_undef_passthrough:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,18446744073709551615,0]
; AVX512F-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $7, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
@@ -897,25 +966,25 @@ define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr
define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
; AVX1-LABEL: mload_constmask_v4i64_undef_passthrough:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mload_constmask_v4i64_undef_passthrough:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX2-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mload_constmask_v4i64_undef_passthrough:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [0,18446744073709551615,18446744073709551615,0]
; AVX512F-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movb $6, %al
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
@@ -926,25 +995,25 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
; AVX1-LABEL: test21:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test21:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test21:
-; AVX512F: ## BB#0:
+; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; SKX-LABEL: test21:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; SKX-NEXT: retq
@@ -957,12 +1026,12 @@ define void @test21(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: one_mask_bit_set1:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set1:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vmovss %xmm0, (%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
@@ -973,12 +1042,12 @@ define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: one_mask_bit_set2:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set2:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX512-NEXT: retq
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
@@ -989,25 +1058,18 @@ define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX-LABEL: one_mask_bit_set3:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512F-LABEL: one_mask_bit_set3:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovlps %xmm0, 16(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; SKX-LABEL: one_mask_bit_set3:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vmovq %xmm0, 16(%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; AVX512-LABEL: one_mask_bit_set3:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, 16(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
@@ -1016,14 +1078,14 @@ define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: one_mask_bit_set4:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set4:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512-NEXT: vmovhpd %xmm0, 24(%rdi)
; AVX512-NEXT: vzeroupper
@@ -1036,14 +1098,14 @@ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: one_mask_bit_set5:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512-LABEL: one_mask_bit_set5:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: vzeroupper
@@ -1056,12 +1118,12 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX-LABEL: load_one_mask_bit_set1:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set1:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX512-NEXT: retq
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
@@ -1072,12 +1134,12 @@ define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX-LABEL: load_one_mask_bit_set2:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set2:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX512-NEXT: retq
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
@@ -1088,21 +1150,21 @@ define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val)
define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX1-LABEL: load_one_mask_bit_set3:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_one_mask_bit_set3:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set3:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
@@ -1115,14 +1177,14 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set4:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set4:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1135,15 +1197,15 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v
define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX-LABEL: load_one_mask_bit_set5:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: load_one_mask_bit_set5:
-; AVX512: ## BB#0:
+; AVX512: ## %bb.0:
; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
; AVX512-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
@@ -1152,6 +1214,31 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v
ret <8 x double> %res
}
+; The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
+; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
+
+define void @trunc_mask(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
+; AVX-LABEL: trunc_mask:
+; AVX: ## %bb.0:
+; AVX-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: trunc_mask:
+; AVX512F: ## %bb.0:
+; AVX512F-NEXT: vmaskmovps %xmm0, %xmm2, (%rdi)
+; AVX512F-NEXT: retq
+;
+; SKX-LABEL: trunc_mask:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT: vpcmpgtd %xmm2, %xmm1, %k1
+; SKX-NEXT: vmovups %xmm0, (%rdi) {%k1}
+; SKX-NEXT: retq
+ %bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
+ ret void
+}
+
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>*, i32, <4 x i1>, <4 x i64>)
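; Not part of this patch: a minimal, hypothetical sketch of the 512-bit test that the
; FIXME above asks for, assuming the same masked.store intrinsic naming used elsewhere
; in this file and a made-up function name (trunc_mask_v16f32).

declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)

define void @trunc_mask_v16f32(<16 x float> %x, <16 x float>* %ptr, <16 x i32> %mask) {
  ; The sign bit of each i32 mask element decides whether that lane is stored,
  ; so no explicit compare should be needed once 'vpmovd2m' is used.
  %bool_mask = icmp slt <16 x i32> %mask, zeroinitializer
  call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %x, <16 x float>* %ptr, i32 1, <16 x i1> %bool_mask)
  ret void
}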
diff --git a/test/CodeGen/X86/maskmovdqu.ll b/test/CodeGen/X86/maskmovdqu.ll
index 0b3334d19f89..2f13c535e50d 100644
--- a/test/CodeGen/X86/maskmovdqu.ll
+++ b/test/CodeGen/X86/maskmovdqu.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | grep -i EDI
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | grep -i RDI
-; RUN: llc < %s -march=x86 -mattr=+avx | grep -i EDI
-; RUN: llc < %s -march=x86-64 -mattr=+avx | grep -i RDI
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2,-avx | grep -i edi
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2,-avx | grep -i rdi
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | grep -i edi
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | grep -i rdi
; rdar://6573467
define void @test(<16 x i8> %a, <16 x i8> %b, i32 %dummy, i8* %c) nounwind {
diff --git a/test/CodeGen/X86/mature-mc-support.ll b/test/CodeGen/X86/mature-mc-support.ll
index 3d6f0f66c187..fefd456966ca 100644
--- a/test/CodeGen/X86/mature-mc-support.ll
+++ b/test/CodeGen/X86/mature-mc-support.ll
@@ -1,16 +1,16 @@
; Test that inline assembly is parsed by the MC layer when MC support is mature
; (even when the output is assembly).
-; RUN: not llc -march=x86 < %s > /dev/null 2> %t1
+; RUN: not llc -mtriple=i686-- < %s > /dev/null 2> %t1
; RUN: FileCheck %s < %t1
-; RUN: not llc -march=x86 -filetype=obj < %s > /dev/null 2> %t2
+; RUN: not llc -mtriple=i686-- -filetype=obj < %s > /dev/null 2> %t2
; RUN: FileCheck %s < %t2
-; RUN: not llc -march=x86-64 < %s > /dev/null 2> %t3
+; RUN: not llc -mtriple=x86_64-- < %s > /dev/null 2> %t3
; RUN: FileCheck %s < %t3
-; RUN: not llc -march=x86-64 -filetype=obj < %s > /dev/null 2> %t4
+; RUN: not llc -mtriple=x86_64-- -filetype=obj < %s > /dev/null 2> %t4
; RUN: FileCheck %s < %t4
module asm " .this_directive_is_very_unlikely_to_exist"
diff --git a/test/CodeGen/X86/mbp-false-cfg-break.ll b/test/CodeGen/X86/mbp-false-cfg-break.ll
index bc8b0de3eef0..f18ae43ff59d 100644
--- a/test/CodeGen/X86/mbp-false-cfg-break.ll
+++ b/test/CodeGen/X86/mbp-false-cfg-break.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
define void @test(i1 %cnd) !prof !{!"function_entry_count", i64 1024} {
; CHECK-LABEL: @test
diff --git a/test/CodeGen/X86/mem-promote-integers.ll b/test/CodeGen/X86/mem-promote-integers.ll
index 3023cf2e900e..688173e2acc6 100644
--- a/test/CodeGen/X86/mem-promote-integers.ll
+++ b/test/CodeGen/X86/mem-promote-integers.ll
@@ -1,8 +1,8 @@
; Test the basic functionality of integer element promotions of different types.
; This test checks passing of arguments, loading and storing to memory and
; basic arithmetic.
-; RUN: llc -march=x86 < %s > /dev/null
-; RUN: llc -march=x86-64 < %s > /dev/null
+; RUN: llc -mtriple=i686-- < %s > /dev/null
+; RUN: llc -mtriple=x86_64-- < %s > /dev/null
define <1 x i8> @test_1xi8(<1 x i8> %x, <1 x i8>* %b) {
%bb = load <1 x i8>, <1 x i8>* %b
diff --git a/test/CodeGen/X86/membarrier.ll b/test/CodeGen/X86/membarrier.ll
index 5e569aabcadd..45827ae73ecd 100644
--- a/test/CodeGen/X86/membarrier.ll
+++ b/test/CodeGen/X86/membarrier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=-sse -O0
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse -O0
; PR9675
define i32 @t() {
diff --git a/test/CodeGen/X86/memcmp-minsize.ll b/test/CodeGen/X86/memcmp-minsize.ll
index a55c40f5bda8..a1ab4e130069 100644
--- a/test/CodeGen/X86/memcmp-minsize.ll
+++ b/test/CodeGen/X86/memcmp-minsize.ll
@@ -13,20 +13,17 @@ declare i32 @memcmp(i8*, i8*, i64)
define i32 @length2(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length2:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $2, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $2
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $2
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -36,7 +33,7 @@ define i32 @length2(i8* %X, i8* %Y) nounwind minsize {
define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length2_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
@@ -45,7 +42,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
; X64-NEXT: sete %al
@@ -57,14 +54,14 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind minsize {
define i1 @length2_eq_const(i8* %X) nounwind minsize {
; X86-LABEL: length2_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpw $12849, (%eax) # imm = 0x3231
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length2_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpw $12849, (%rdi) # imm = 0x3231
; X64-NEXT: setne %al
; X64-NEXT: retq
@@ -75,22 +72,19 @@ define i1 @length2_eq_const(i8* %X) nounwind minsize {
define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $2, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $2
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $2
; X64-NEXT: popq %rdx
@@ -106,20 +100,17 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind minsize {
define i32 @length3(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length3:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $3
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $3
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -129,22 +120,19 @@ define i32 @length3(i8* %X, i8* %Y) nounwind minsize {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length3_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $3, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $3
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $3
; X64-NEXT: popq %rdx
@@ -160,20 +148,17 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind minsize {
define i32 @length4(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length4:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $4, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $4
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $4
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -183,7 +168,7 @@ define i32 @length4(i8* %X, i8* %Y) nounwind minsize {
define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length4_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
@@ -192,7 +177,7 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-NEXT: retl
;
; X64-LABEL: length4_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
; X64-NEXT: setne %al
@@ -204,14 +189,14 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind minsize {
define i1 @length4_eq_const(i8* %X) nounwind minsize {
; X86-LABEL: length4_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length4_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231
; X64-NEXT: sete %al
; X64-NEXT: retq
@@ -222,20 +207,17 @@ define i1 @length4_eq_const(i8* %X) nounwind minsize {
define i32 @length5(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length5:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $5, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $5
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $5
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -245,22 +227,19 @@ define i32 @length5(i8* %X, i8* %Y) nounwind minsize {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length5_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $5, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $5
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $5
; X64-NEXT: popq %rdx
@@ -276,20 +255,17 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind minsize {
define i32 @length8(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length8:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $8, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $8
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $8
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -299,22 +275,19 @@ define i32 @length8(i8* %X, i8* %Y) nounwind minsize {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length8_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $8, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $8
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length8_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
; X64-NEXT: sete %al
@@ -326,21 +299,19 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind minsize {
define i1 @length8_eq_const(i8* %X) nounwind minsize {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $8, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $8
+; X86-NEXT: pushl $.L.str
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length8_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
; X64-NEXT: cmpq %rax, (%rdi)
; X64-NEXT: setne %al
@@ -352,22 +323,19 @@ define i1 @length8_eq_const(i8* %X) nounwind minsize {
define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length12_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $12, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $12
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $12
; X64-NEXT: popq %rdx
@@ -383,20 +351,17 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind minsize {
define i32 @length12(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length12:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $12, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $12
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $12
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -408,20 +373,17 @@ define i32 @length12(i8* %X, i8* %Y) nounwind minsize {
define i32 @length16(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length16:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $16, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $16
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $16
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -431,22 +393,19 @@ define i32 @length16(i8* %X, i8* %Y) nounwind minsize {
define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize {
; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE: # BB#0:
-; X86-NOSSE-NEXT: subl $16, %esp
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT: movl %eax, (%esp)
-; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp)
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $16
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: setne %al
-; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -458,7 +417,7 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rsi), %xmm0
; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
@@ -468,7 +427,7 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length16_eq:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -482,21 +441,19 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind minsize {
define i1 @length16_eq_const(i8* %X) nounwind minsize {
; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE: # BB#0:
-; X86-NOSSE-NEXT: subl $16, %esp
-; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT: movl %eax, (%esp)
-; X86-NOSSE-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT: movl $16, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT: movl $.L.str, {{[0-9]+}}(%esp)
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $16
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: testl %eax, %eax
; X86-NOSSE-NEXT: sete %al
-; X86-NOSSE-NEXT: addl $16, %esp
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
@@ -506,7 +463,7 @@ define i1 @length16_eq_const(i8* %X) nounwind minsize {
; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
@@ -515,7 +472,7 @@ define i1 @length16_eq_const(i8* %X) nounwind minsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length16_eq_const:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
@@ -531,20 +488,17 @@ define i1 @length16_eq_const(i8* %X) nounwind minsize {
define i32 @length24(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length24:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $24, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $24
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length24:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $24
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -554,22 +508,19 @@ define i32 @length24(i8* %X, i8* %Y) nounwind minsize {
define i1 @length24_eq(i8* %x, i8* %y) nounwind minsize {
; X86-LABEL: length24_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $24, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $24
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length24_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $24
; X64-NEXT: popq %rdx
@@ -585,21 +536,19 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind minsize {
define i1 @length24_eq_const(i8* %X) nounwind minsize {
; X86-LABEL: length24_eq_const:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $24, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $24
+; X86-NEXT: pushl $.L.str
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $24
; X64-NEXT: popq %rdx
@@ -616,20 +565,17 @@ define i1 @length24_eq_const(i8* %X) nounwind minsize {
define i32 @length32(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length32:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $32, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $32
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $32
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -641,22 +587,19 @@ define i32 @length32(i8* %X, i8* %Y) nounwind minsize {
define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize {
; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $32, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $32
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rax
; X64-SSE2-NEXT: pushq $32
; X64-SSE2-NEXT: popq %rdx
@@ -667,7 +610,7 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
@@ -682,21 +625,19 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind minsize {
define i1 @length32_eq_const(i8* %X) nounwind minsize {
; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $32, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $32
+; X86-NEXT: pushl $.L.str
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: pushq %rax
; X64-SSE2-NEXT: pushq $32
; X64-SSE2-NEXT: popq %rdx
@@ -708,7 +649,7 @@ define i1 @length32_eq_const(i8* %X) nounwind minsize {
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
@@ -723,20 +664,17 @@ define i1 @length32_eq_const(i8* %X) nounwind minsize {
define i32 @length64(i8* %X, i8* %Y) nounwind minsize {
; X86-LABEL: length64:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $64, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $64
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq $64
; X64-NEXT: popq %rdx
; X64-NEXT: jmp memcmp # TAILCALL
@@ -746,22 +684,19 @@ define i32 @length64(i8* %X, i8* %Y) nounwind minsize {
define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize {
; X86-LABEL: length64_eq:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $64, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $64
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length64_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $64
; X64-NEXT: popq %rdx
@@ -777,21 +712,19 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind minsize {
define i1 @length64_eq_const(i8* %X) nounwind minsize {
; X86-LABEL: length64_eq_const:
-; X86: # BB#0:
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, (%esp)
-; X86-NEXT: andl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $64, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $.L.str, {{[0-9]+}}(%esp)
+; X86: # %bb.0:
+; X86-NEXT: pushl $0
+; X86-NEXT: pushl $64
+; X86-NEXT: pushl $.L.str
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: testl %eax, %eax
; X86-NEXT: sete %al
-; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
;
; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: pushq $64
; X64-NEXT: popq %rdx
diff --git a/test/CodeGen/X86/memcmp-optsize.ll b/test/CodeGen/X86/memcmp-optsize.ll
index 4a5f30890513..a5fb85fae5ed 100644
--- a/test/CodeGen/X86/memcmp-optsize.ll
+++ b/test/CodeGen/X86/memcmp-optsize.ll
@@ -13,39 +13,27 @@ declare i32 @memcmp(i8*, i8*, i64)
define i32 @length2(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length2:
-; X86: # BB#0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: rolw $8, %cx
; X86-NEXT: rolw $8, %dx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: incl %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
-; X86-NEXT: cmpw %dx, %cx
-; X86-NEXT: cmovael %edi, %eax
-; X86-NEXT: cmovel %esi, %eax
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
+; X86-NEXT: movzwl %cx, %eax
+; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: length2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %ax
; X64-NEXT: rolw $8, %cx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
ret i32 %m
@@ -53,7 +41,7 @@ define i32 @length2(i8* %X, i8* %Y) nounwind optsize {
define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length2_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
@@ -62,7 +50,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
; X64-NEXT: sete %al
@@ -74,7 +62,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind optsize {
define i1 @length2_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length2_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: cmpl $12849, %eax # imm = 0x3231
@@ -82,7 +70,7 @@ define i1 @length2_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpl $12849, %eax # imm = 0x3231
; X64-NEXT: setne %al
@@ -94,7 +82,7 @@ define i1 @length2_eq_const(i8* %X) nounwind optsize {
define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $2
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -106,7 +94,7 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movl $2, %edx
; X64-NEXT: callq memcmp
@@ -121,7 +109,7 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind optsize {
define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length3:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0: # %loadbb
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -129,45 +117,38 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: movzwl (%ecx), %esi
; X86-NEXT: rolw $8, %dx
; X86-NEXT: rolw $8, %si
-; X86-NEXT: movzwl %dx, %edx
-; X86-NEXT: movzwl %si, %esi
-; X86-NEXT: cmpl %esi, %edx
+; X86-NEXT: cmpw %si, %dx
; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: # %bb.2: # %loadbb1
; X86-NEXT: movzbl 2(%eax), %eax
; X86-NEXT: movzbl 2(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: jmp .LBB4_3
; X86-NEXT: .LBB4_1: # %res_block
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: incl %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
-; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB4_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: length3:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0: # %loadbb
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %ax
; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
-; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: cmpw %cx, %ax
; X64-NEXT: jne .LBB4_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: # %bb.2: # %loadbb1
; X64-NEXT: movzbl 2(%rdi), %eax
; X64-NEXT: movzbl 2(%rsi), %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB4_1: # %res_block
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
ret i32 %m
@@ -175,36 +156,36 @@ define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB5_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB5_2
+; X86-NEXT: # %bb.1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
; X86-NEXT: je .LBB5_3
-; X86-NEXT: .LBB5_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB5_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB5_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB5_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB5_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
; X64-NEXT: je .LBB5_3
-; X64-NEXT: .LBB5_1: # %res_block
+; X64-NEXT: .LBB5_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB5_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -217,39 +198,29 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind optsize {
define i32 @length4(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length4:
-; X86: # BB#0:
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: incl %edi
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: cmovael %edi, %eax
-; X86-NEXT: cmovel %esi, %eax
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
+; X86-NEXT: seta %al
+; X86-NEXT: sbbl $0, %eax
; X86-NEXT: retl
;
; X64-LABEL: length4:
-; X64: # BB#0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl (%rsi), %ecx
-; X64-NEXT: bswapl %eax
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: movl (%rsi), %edx
; X64-NEXT: bswapl %ecx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
ret i32 %m
@@ -257,7 +228,7 @@ define i32 @length4(i8* %X, i8* %Y) nounwind optsize {
define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length4_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
@@ -266,7 +237,7 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length4_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
; X64-NEXT: setne %al
@@ -278,14 +249,14 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind optsize {
define i1 @length4_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length4_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length4_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231
; X64-NEXT: sete %al
; X64-NEXT: retq
@@ -296,7 +267,7 @@ define i1 @length4_eq_const(i8* %X) nounwind optsize {
define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length5:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0: # %loadbb
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -306,39 +277,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: bswapl %esi
; X86-NEXT: cmpl %esi, %edx
; X86-NEXT: jne .LBB9_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: # %bb.2: # %loadbb1
; X86-NEXT: movzbl 4(%eax), %eax
; X86-NEXT: movzbl 4(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: jmp .LBB9_3
; X86-NEXT: .LBB9_1: # %res_block
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: incl %ecx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
-; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: cmovael %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB9_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: length5:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0: # %loadbb
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl (%rsi), %ecx
; X64-NEXT: bswapl %eax
; X64-NEXT: bswapl %ecx
-; X64-NEXT: cmpq %rcx, %rax
+; X64-NEXT: cmpl %ecx, %eax
; X64-NEXT: jne .LBB9_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: # %bb.2: # %loadbb1
; X64-NEXT: movzbl 4(%rdi), %eax
; X64-NEXT: movzbl 4(%rsi), %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
; X64-NEXT: .LBB9_1: # %res_block
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
ret i32 %m
@@ -346,36 +314,36 @@ define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB10_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB10_2
+; X86-NEXT: # %bb.1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
; X86-NEXT: je .LBB10_3
-; X86-NEXT: .LBB10_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB10_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB10_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB10_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB10_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
; X64-NEXT: je .LBB10_3
-; X64-NEXT: .LBB10_1: # %res_block
+; X64-NEXT: .LBB10_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB10_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -388,7 +356,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind optsize {
define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -397,8 +365,8 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB11_2
+; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
@@ -406,29 +374,25 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
; X86-NEXT: je .LBB11_3
-; X86-NEXT: .LBB11_1: # %res_block
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: incl %esi
+; X86-NEXT: .LBB11_2: # %res_block
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: decl %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: cmovael %esi, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: .LBB11_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: length8:
-; X64: # BB#0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
-; X64-NEXT: bswapq %rax
+; X64: # %bb.0:
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpq %rcx, %rax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapq %rdx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %rdx, %rcx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
ret i32 %m
@@ -436,27 +400,27 @@ define i32 @length8(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # %bb.1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: incl %eax
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length8_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
; X64-NEXT: sete %al
@@ -468,15 +432,15 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind optsize {
define i1 @length8_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
; X86-NEXT: je .LBB13_3
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: .LBB13_2: # %res_block
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: incl %eax
; X86-NEXT: .LBB13_3: # %endblock
@@ -485,7 +449,7 @@ define i1 @length8_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length8_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
; X64-NEXT: cmpq %rax, (%rdi)
; X64-NEXT: setne %al
@@ -497,7 +461,7 @@ define i1 @length8_eq_const(i8* %X) nounwind optsize {
define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length12_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -509,16 +473,16 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB14_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB14_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
; X64-NEXT: je .LBB14_3
-; X64-NEXT: .LBB14_1: # %res_block
+; X64-NEXT: .LBB14_2: # %res_block
; X64-NEXT: movl $1, %eax
; X64-NEXT: .LBB14_3: # %endblock
; X64-NEXT: testl %eax, %eax
@@ -531,7 +495,7 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind optsize {
define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length12:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -541,28 +505,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB15_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB15_1: # %res_block
+; X64-NEXT: je .LBB15_3
+; X64-NEXT: .LBB15_2: # %res_block
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: setae %al
+; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB15_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -572,7 +535,7 @@ define i32 @length12(i8* %X, i8* %Y) nounwind optsize {
define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $16
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -582,28 +545,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: je .LBB16_3
+; X64-NEXT: .LBB16_2: # %res_block
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: setae %al
+; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -611,7 +573,7 @@ define i32 @length16(i8* %X, i8* %Y) nounwind optsize {
define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE: # BB#0:
+; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $16
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
@@ -623,7 +585,7 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -634,22 +596,24 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize {
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
-; X64-LABEL: length16_eq:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpq 8(%rsi), %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB17_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length16_eq:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
@@ -657,7 +621,7 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind optsize {
define i1 @length16_eq_const(i8* %X) nounwind optsize {
; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE: # BB#0:
+; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $16
; X86-NOSSE-NEXT: pushl $.L.str
@@ -669,7 +633,7 @@ define i1 @length16_eq_const(i8* %X) nounwind optsize {
; X86-NOSSE-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
@@ -678,22 +642,23 @@ define i1 @length16_eq_const(i8* %X) nounwind optsize {
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
-; X64-LABEL: length16_eq_const:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT: cmpq %rax, (%rdi)
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
-; X64-NEXT: cmpq %rcx, 8(%rdi)
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB18_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length16_eq_const:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
@@ -703,7 +668,7 @@ define i1 @length16_eq_const(i8* %X) nounwind optsize {
define i32 @length24(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -713,7 +678,7 @@ define i32 @length24(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length24:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $24, %edx
; X64-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind
@@ -722,7 +687,7 @@ define i32 @length24(i8* %X, i8* %Y) nounwind optsize {
define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X86-LABEL: length24_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -733,15 +698,44 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NEXT: sete %al
; X86-NEXT: retl
;
-; X64-LABEL: length24_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB20_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: movq 16(%rdi), %rcx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
+; X64-SSE2-NEXT: je .LBB20_3
+; X64-SSE2-NEXT: .LBB20_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB20_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length24_eq:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB20_2
+; X64-AVX2-NEXT: # %bb.1: # %loadbb1
+; X64-AVX2-NEXT: movq 16(%rdi), %rcx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpq 16(%rsi), %rcx
+; X64-AVX2-NEXT: je .LBB20_3
+; X64-AVX2-NEXT: .LBB20_2: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB20_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
@@ -749,7 +743,7 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind optsize {
define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length24_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl $.L.str
@@ -760,16 +754,43 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: setne %al
; X86-NEXT: retl
;
-; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB21_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-SSE2-NEXT: je .LBB21_3
+; X64-SSE2-NEXT: .LBB21_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB21_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length24_eq_const:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX2-NEXT: jne .LBB21_2
+; X64-AVX2-NEXT: # %bb.1: # %loadbb1
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-AVX2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-AVX2-NEXT: je .LBB21_3
+; X64-AVX2-NEXT: .LBB21_2: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB21_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
@@ -777,7 +798,7 @@ define i1 @length24_eq_const(i8* %X) nounwind optsize {
define i32 @length32(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $32
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -787,7 +808,7 @@ define i32 @length32(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $32, %edx
; X64-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
@@ -797,30 +818,69 @@ define i32 @length32(i8* %X, i8* %Y) nounwind optsize {
; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
-; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: sete %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
+; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB23_2
+; X86-SSE2-NEXT: # %bb.1: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB23_3
+; X86-SSE2-NEXT: .LBB23_2: # %res_block
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: incl %eax
+; X86-SSE2-NEXT: .LBB23_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: sete %al
+; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB23_3
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB23_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
; X64-SSE2-NEXT: sete %al
-; X64-SSE2-NEXT: popq %rcx
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
@@ -834,31 +894,64 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind optsize {
}
define i1 @length32_eq_const(i8* %X) nounwind optsize {
-; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl $.L.str
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: setne %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB24_2
+; X86-SSE2-NEXT: # %bb.1: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB24_3
+; X86-SSE2-NEXT: .LBB24_2: # %res_block
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: incl %eax
+; X86-SSE2-NEXT: .LBB24_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: setne %al
+; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $.L.str, %esi
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB24_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB24_3
+; X64-SSE2-NEXT: .LBB24_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB24_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
; X64-SSE2-NEXT: setne %al
-; X64-SSE2-NEXT: popq %rcx
; X64-SSE2-NEXT: retq
;
; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
@@ -873,7 +966,7 @@ define i1 @length32_eq_const(i8* %X) nounwind optsize {
define i32 @length64(i8* %X, i8* %Y) nounwind optsize {
; X86-LABEL: length64:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -883,7 +976,7 @@ define i32 @length64(i8* %X, i8* %Y) nounwind optsize {
; X86-NEXT: retl
;
; X64-LABEL: length64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $64, %edx
; X64-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind
@@ -892,7 +985,7 @@ define i32 @length64(i8* %X, i8* %Y) nounwind optsize {
define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
; X86-LABEL: length64_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -903,15 +996,37 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
; X86-NEXT: setne %al
; X86-NEXT: retl
;
-; X64-LABEL: length64_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB26_2
+; X64-AVX2-NEXT: # %bb.1: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB26_3
+; X64-AVX2-NEXT: .LBB26_2: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB26_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
@@ -919,7 +1034,7 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind optsize {
define i1 @length64_eq_const(i8* %X) nounwind optsize {
; X86-LABEL: length64_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl $.L.str
@@ -930,16 +1045,38 @@ define i1 @length64_eq_const(i8* %X) nounwind optsize {
; X86-NEXT: sete %al
; X86-NEXT: retl
;
-; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $.L.str, %esi
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB27_2
+; X64-AVX2-NEXT: # %bb.1: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB27_3
+; X64-AVX2-NEXT: .LBB27_2: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB27_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 889f6a74bf7f..ed7f496ee342 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
@@ -11,35 +13,58 @@
declare i32 @memcmp(i8*, i8*, i64)
+define i32 @length0(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length0:
+; X86: # %bb.0:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: length0:
+; X64: # %bb.0:
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: retq
+ %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
+ ret i32 %m
+}
+
+define i1 @length0_eq(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: length0_eq:
+; X86: # %bb.0:
+; X86-NEXT: movb $1, %al
+; X86-NEXT: retl
+;
+; X64-LABEL: length0_eq:
+; X64: # %bb.0:
+; X64-NEXT: movb $1, %al
+; X64-NEXT: retq
+ %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 0) nounwind
+ %c = icmp eq i32 %m, 0
+ ret i1 %c
+}
+
define i32 @length2(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: rolw $8, %cx
-; X86-NEXT: rolw $8, %ax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpw %ax, %cx
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
-; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: rolw $8, %dx
+; X86-NEXT: movzwl %cx, %eax
+; X86-NEXT: movzwl %dx, %ecx
+; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: length2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %ax
; X64-NEXT: rolw $8, %cx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %cx, %ecx
+; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
ret i32 %m
@@ -47,7 +72,7 @@ define i32 @length2(i8* %X, i8* %Y) nounwind {
define i1 @length2_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
@@ -56,7 +81,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
; X64-NEXT: sete %al
@@ -68,7 +93,7 @@ define i1 @length2_eq(i8* %X, i8* %Y) nounwind {
define i1 @length2_eq_const(i8* %X) nounwind {
; X86-LABEL: length2_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: cmpl $12849, %eax # imm = 0x3231
@@ -76,7 +101,7 @@ define i1 @length2_eq_const(i8* %X) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpl $12849, %eax # imm = 0x3231
; X64-NEXT: setne %al
@@ -88,7 +113,7 @@ define i1 @length2_eq_const(i8* %X) nounwind {
define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $2
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -100,7 +125,7 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movl $2, %edx
; X64-NEXT: callq memcmp
@@ -115,7 +140,7 @@ define i1 @length2_eq_nobuiltin_attr(i8* %X, i8* %Y) nounwind {
define i32 @length3(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0: # %loadbb
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -123,42 +148,38 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
; X86-NEXT: movzwl (%ecx), %esi
; X86-NEXT: rolw $8, %dx
; X86-NEXT: rolw $8, %si
-; X86-NEXT: movzwl %dx, %edx
-; X86-NEXT: movzwl %si, %esi
-; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: cmpw %si, %dx
+; X86-NEXT: jne .LBB6_1
+; X86-NEXT: # %bb.2: # %loadbb1
; X86-NEXT: movzbl 2(%eax), %eax
; X86-NEXT: movzbl 2(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
-; X86-NEXT: .LBB4_1: # %res_block
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
+; X86-NEXT: .LBB6_1: # %res_block
+; X86-NEXT: setae %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: length3:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0: # %loadbb
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %ax
; X64-NEXT: rolw $8, %cx
-; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
-; X64-NEXT: cmpq %rcx, %rax
-; X64-NEXT: jne .LBB4_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: cmpw %cx, %ax
+; X64-NEXT: jne .LBB6_1
+; X64-NEXT: # %bb.2: # %loadbb1
; X64-NEXT: movzbl 2(%rdi), %eax
; X64-NEXT: movzbl 2(%rsi), %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
-; X64-NEXT: .LBB4_1: # %res_block
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: .LBB6_1: # %res_block
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 3) nounwind
ret i32 %m
@@ -166,37 +187,37 @@ define i32 @length3(i8* %X, i8* %Y) nounwind {
define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length3_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: cmpw (%ecx), %dx
-; X86-NEXT: jne .LBB5_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 2(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 2(%ecx), %dl
-; X86-NEXT: je .LBB5_3
-; X86-NEXT: .LBB5_1: # %res_block
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: .LBB5_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
+; X86-NEXT: cmpw (%eax), %dx
+; X86-NEXT: jne .LBB7_2
+; X86-NEXT: # %bb.1: # %loadbb1
+; X86-NEXT: movb 2(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 2(%eax), %dl
+; X86-NEXT: je .LBB7_3
+; X86-NEXT: .LBB7_2: # %res_block
+; X86-NEXT: movl $1, %ecx
+; X86-NEXT: .LBB7_3: # %endblock
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length3_eq:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: cmpw (%rsi), %ax
-; X64-NEXT: jne .LBB5_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB7_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movb 2(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 2(%rsi), %cl
-; X64-NEXT: je .LBB5_3
-; X64-NEXT: .LBB5_1: # %res_block
+; X64-NEXT: je .LBB7_3
+; X64-NEXT: .LBB7_2: # %res_block
; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB5_3: # %endblock
+; X64-NEXT: .LBB7_3: # %endblock
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setne %al
; X64-NEXT: retq
@@ -207,33 +228,29 @@ define i1 @length3_eq(i8* %X, i8* %Y) nounwind {
define i32 @length4(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
-; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl (%eax), %edx
; X86-NEXT: bswapl %ecx
-; X86-NEXT: bswapl %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpl %eax, %ecx
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
-; X86-NEXT: cmovel %edx, %eax
+; X86-NEXT: bswapl %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpl %edx, %ecx
+; X86-NEXT: seta %al
+; X86-NEXT: sbbl $0, %eax
; X86-NEXT: retl
;
; X64-LABEL: length4:
-; X64: # BB#0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl (%rsi), %ecx
-; X64-NEXT: bswapl %eax
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %ecx
+; X64-NEXT: movl (%rsi), %edx
; X64-NEXT: bswapl %ecx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %edx, %ecx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
ret i32 %m
@@ -241,7 +258,7 @@ define i32 @length4(i8* %X, i8* %Y) nounwind {
define i1 @length4_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length4_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %ecx
@@ -250,7 +267,7 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length4_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
; X64-NEXT: setne %al
@@ -262,14 +279,14 @@ define i1 @length4_eq(i8* %X, i8* %Y) nounwind {
define i1 @length4_eq_const(i8* %X) nounwind {
; X86-LABEL: length4_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl $875770417, (%eax) # imm = 0x34333231
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length4_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpl $875770417, (%rdi) # imm = 0x34333231
; X64-NEXT: sete %al
; X64-NEXT: retq
@@ -280,7 +297,7 @@ define i1 @length4_eq_const(i8* %X) nounwind {
define i32 @length5(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0: # %loadbb
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -289,37 +306,37 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
; X86-NEXT: bswapl %edx
; X86-NEXT: bswapl %esi
; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB9_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB11_1
+; X86-NEXT: # %bb.2: # %loadbb1
; X86-NEXT: movzbl 4(%eax), %eax
; X86-NEXT: movzbl 4(%ecx), %ecx
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
-; X86-NEXT: .LBB9_1: # %res_block
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
+; X86-NEXT: .LBB11_1: # %res_block
+; X86-NEXT: setae %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: leal -1(%eax,%eax), %eax
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: length5:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0: # %loadbb
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: movl (%rsi), %ecx
; X64-NEXT: bswapl %eax
; X64-NEXT: bswapl %ecx
-; X64-NEXT: cmpq %rcx, %rax
-; X64-NEXT: jne .LBB9_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: cmpl %ecx, %eax
+; X64-NEXT: jne .LBB11_1
+; X64-NEXT: # %bb.2: # %loadbb1
; X64-NEXT: movzbl 4(%rdi), %eax
; X64-NEXT: movzbl 4(%rsi), %ecx
; X64-NEXT: subl %ecx, %eax
; X64-NEXT: retq
-; X64-NEXT: .LBB9_1: # %res_block
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: .LBB11_1: # %res_block
+; X64-NEXT: setae %al
+; X64-NEXT: movzbl %al, %eax
+; X64-NEXT: leal -1(%rax,%rax), %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 5) nounwind
ret i32 %m
@@ -327,37 +344,37 @@ define i32 @length5(i8* %X, i8* %Y) nounwind {
define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length5_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB10_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movb 4(%eax), %dl
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpb 4(%ecx), %dl
-; X86-NEXT: je .LBB10_3
-; X86-NEXT: .LBB10_1: # %res_block
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: .LBB10_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB12_2
+; X86-NEXT: # %bb.1: # %loadbb1
+; X86-NEXT: movb 4(%ecx), %dl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpb 4(%eax), %dl
+; X86-NEXT: je .LBB12_3
+; X86-NEXT: .LBB12_2: # %res_block
+; X86-NEXT: movl $1, %ecx
+; X86-NEXT: .LBB12_3: # %endblock
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length5_eq:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: cmpl (%rsi), %eax
-; X64-NEXT: jne .LBB10_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB12_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movb 4(%rdi), %cl
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpb 4(%rsi), %cl
-; X64-NEXT: je .LBB10_3
-; X64-NEXT: .LBB10_1: # %res_block
+; X64-NEXT: je .LBB12_3
+; X64-NEXT: .LBB12_2: # %res_block
; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB10_3: # %endblock
+; X64-NEXT: .LBB12_3: # %endblock
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setne %al
; X64-NEXT: retq
@@ -368,7 +385,7 @@ define i1 @length5_eq(i8* %X, i8* %Y) nounwind {
define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -377,38 +394,34 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB13_2
+; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: movl 4(%esi), %ecx
; X86-NEXT: movl 4(%eax), %edx
; X86-NEXT: bswapl %ecx
; X86-NEXT: bswapl %edx
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # BB#3: # %endblock
-; X86-NEXT: popl %esi
-; X86-NEXT: retl
-; X86-NEXT: .LBB11_1: # %res_block
+; X86-NEXT: je .LBB13_3
+; X86-NEXT: .LBB13_2: # %res_block
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl %edx, %ecx
-; X86-NEXT: movl $-1, %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: cmovbl %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: leal -1(%eax,%eax), %eax
+; X86-NEXT: .LBB13_3: # %endblock
; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: length8:
-; X64: # BB#0:
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
-; X64-NEXT: bswapq %rax
+; X64: # %bb.0:
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
-; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: cmpq %rcx, %rax
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: bswapq %rdx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpq %rdx, %rcx
+; X64-NEXT: seta %al
+; X64-NEXT: sbbl $0, %eax
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
ret i32 %m
@@ -416,26 +429,26 @@ define i32 @length8(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length8_eq:
-; X86: # BB#0: # %loadbb
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: cmpl (%ecx), %edx
-; X86-NEXT: jne .LBB12_1
-; X86-NEXT: # BB#2: # %loadbb1
-; X86-NEXT: movl 4(%eax), %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: cmpl 4(%ecx), %edx
-; X86-NEXT: je .LBB12_3
-; X86-NEXT: .LBB12_1: # %res_block
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: .LBB12_3: # %endblock
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: cmpl (%eax), %edx
+; X86-NEXT: jne .LBB14_2
+; X86-NEXT: # %bb.1: # %loadbb1
+; X86-NEXT: movl 4(%ecx), %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpl 4(%eax), %edx
+; X86-NEXT: je .LBB14_3
+; X86-NEXT: .LBB14_2: # %res_block
+; X86-NEXT: movl $1, %ecx
+; X86-NEXT: .LBB14_3: # %endblock
+; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: sete %al
; X86-NEXT: retl
;
; X64-LABEL: length8_eq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
; X64-NEXT: sete %al
@@ -447,23 +460,23 @@ define i1 @length8_eq(i8* %X, i8* %Y) nounwind {
define i1 @length8_eq_const(i8* %X) nounwind {
; X86-LABEL: length8_eq_const:
-; X86: # BB#0: # %loadbb
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl $858927408, (%ecx) # imm = 0x33323130
-; X86-NEXT: jne .LBB13_1
-; X86-NEXT: # BB#2: # %loadbb1
+; X86-NEXT: jne .LBB15_2
+; X86-NEXT: # %bb.1: # %loadbb1
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $926299444, 4(%ecx) # imm = 0x37363534
-; X86-NEXT: je .LBB13_3
-; X86-NEXT: .LBB13_1: # %res_block
+; X86-NEXT: je .LBB15_3
+; X86-NEXT: .LBB15_2: # %res_block
; X86-NEXT: movl $1, %eax
-; X86-NEXT: .LBB13_3: # %endblock
+; X86-NEXT: .LBB15_3: # %endblock
; X86-NEXT: testl %eax, %eax
; X86-NEXT: setne %al
; X86-NEXT: retl
;
; X64-LABEL: length8_eq_const:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
; X64-NEXT: cmpq %rax, (%rdi)
; X64-NEXT: setne %al
@@ -475,7 +488,7 @@ define i1 @length8_eq_const(i8* %X) nounwind {
define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length12_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -487,18 +500,18 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12_eq:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB14_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB16_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl 8(%rsi), %ecx
-; X64-NEXT: je .LBB14_3
-; X64-NEXT: .LBB14_1: # %res_block
+; X64-NEXT: je .LBB16_3
+; X64-NEXT: .LBB16_2: # %res_block
; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB14_3: # %endblock
+; X64-NEXT: .LBB16_3: # %endblock
; X64-NEXT: testl %eax, %eax
; X64-NEXT: setne %al
; X64-NEXT: retq
@@ -509,7 +522,7 @@ define i1 @length12_eq(i8* %X, i8* %Y) nounwind {
define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length12:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -519,28 +532,27 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length12:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB17_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl 8(%rsi), %edx
; X64-NEXT: bswapl %ecx
; X64-NEXT: bswapl %edx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB15_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB15_1: # %res_block
+; X64-NEXT: je .LBB17_3
+; X64-NEXT: .LBB17_2: # %res_block
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: setae %al
+; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB17_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 12) nounwind
ret i32 %m
@@ -550,7 +562,7 @@ define i32 @length12(i8* %X, i8* %Y) nounwind {
define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $16
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -560,28 +572,27 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length16:
-; X64: # BB#0: # %loadbb
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#2: # %loadbb1
+; X64-NEXT: jne .LBB18_2
+; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
; X64-NEXT: bswapq %rdx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # BB#3: # %endblock
-; X64-NEXT: retq
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: je .LBB18_3
+; X64-NEXT: .LBB18_2: # %res_block
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpq %rdx, %rcx
-; X64-NEXT: movl $-1, %ecx
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: cmovbl %ecx, %eax
+; X64-NEXT: setae %al
+; X64-NEXT: leal -1(%rax,%rax), %eax
+; X64-NEXT: .LBB18_3: # %endblock
; X64-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 16) nounwind
ret i32 %m
@@ -589,7 +600,7 @@ define i32 @length16(i8* %X, i8* %Y) nounwind {
define i1 @length16_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE: # BB#0:
+; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $16
; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
@@ -600,8 +611,20 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind {
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: retl
;
+; X86-SSE1-LABEL: length16_eq:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl $0
+; X86-SSE1-NEXT: pushl $16
+; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: calll memcmp
+; X86-SSE1-NEXT: addl $16, %esp
+; X86-SSE1-NEXT: testl %eax, %eax
+; X86-SSE1-NEXT: setne %al
+; X86-SSE1-NEXT: retl
+;
; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
@@ -612,22 +635,24 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind {
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
;
-; X64-LABEL: length16_eq:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: cmpq (%rsi), %rax
-; X64-NEXT: jne .LBB17_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpq 8(%rsi), %rcx
-; X64-NEXT: je .LBB17_3
-; X64-NEXT: .LBB17_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB17_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: setne %al
+; X64-AVX-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
@@ -635,7 +660,7 @@ define i1 @length16_eq(i8* %x, i8* %y) nounwind {
define i1 @length16_eq_const(i8* %X) nounwind {
; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE: # BB#0:
+; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl $0
; X86-NOSSE-NEXT: pushl $16
; X86-NOSSE-NEXT: pushl $.L.str
@@ -646,8 +671,20 @@ define i1 @length16_eq_const(i8* %X) nounwind {
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: retl
;
+; X86-SSE1-LABEL: length16_eq_const:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl $0
+; X86-SSE1-NEXT: pushl $16
+; X86-SSE1-NEXT: pushl $.L.str
+; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: calll memcmp
+; X86-SSE1-NEXT: addl $16, %esp
+; X86-SSE1-NEXT: testl %eax, %eax
+; X86-SSE1-NEXT: sete %al
+; X86-SSE1-NEXT: retl
+;
; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2: # BB#0:
+; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
@@ -656,22 +693,23 @@ define i1 @length16_eq_const(i8* %X) nounwind {
; X86-SSE2-NEXT: sete %al
; X86-SSE2-NEXT: retl
;
-; X64-LABEL: length16_eq_const:
-; X64: # BB#0: # %loadbb
-; X64-NEXT: movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT: cmpq %rax, (%rdi)
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # BB#2: # %loadbb1
-; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: movabsq $3833745473465760056, %rcx # imm = 0x3534333231303938
-; X64-NEXT: cmpq %rcx, 8(%rdi)
-; X64-NEXT: je .LBB18_3
-; X64-NEXT: .LBB18_1: # %res_block
-; X64-NEXT: movl $1, %eax
-; X64-NEXT: .LBB18_3: # %endblock
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: sete %al
+; X64-AVX-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
@@ -681,7 +719,7 @@ define i1 @length16_eq_const(i8* %X) nounwind {
define i32 @length24(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -691,7 +729,7 @@ define i32 @length24(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length24:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $24, %edx
; X64-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 24) nounwind
@@ -700,7 +738,7 @@ define i32 @length24(i8* %X, i8* %Y) nounwind {
define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length24_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -711,15 +749,44 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: retl
;
-; X64-LABEL: length24_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB22_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: movq 16(%rdi), %rcx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpq 16(%rsi), %rcx
+; X64-SSE2-NEXT: je .LBB22_3
+; X64-SSE2-NEXT: .LBB22_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB22_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: jne .LBB22_2
+; X64-AVX-NEXT: # %bb.1: # %loadbb1
+; X64-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: cmpq 16(%rsi), %rcx
+; X64-AVX-NEXT: je .LBB22_3
+; X64-AVX-NEXT: .LBB22_2: # %res_block
+; X64-AVX-NEXT: movl $1, %eax
+; X64-AVX-NEXT: .LBB22_3: # %endblock
+; X64-AVX-NEXT: testl %eax, %eax
+; X64-AVX-NEXT: sete %al
+; X64-AVX-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 24) nounwind
%cmp = icmp eq i32 %call, 0
ret i1 %cmp
@@ -727,7 +794,7 @@ define i1 @length24_eq(i8* %x, i8* %y) nounwind {
define i1 @length24_eq_const(i8* %X) nounwind {
; X86-LABEL: length24_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $24
; X86-NEXT: pushl $.L.str
@@ -738,16 +805,43 @@ define i1 @length24_eq_const(i8* %X) nounwind {
; X86-NEXT: setne %al
; X86-NEXT: retl
;
-; X64-LABEL: length24_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $24, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB23_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-SSE2-NEXT: cmpq %rcx, 16(%rdi)
+; X64-SSE2-NEXT: je .LBB23_3
+; X64-SSE2-NEXT: .LBB23_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB23_3: # %endblock
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX-NEXT: jne .LBB23_2
+; X64-AVX-NEXT: # %bb.1: # %loadbb1
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: movabsq $3689065127958034230, %rcx # imm = 0x3332313039383736
+; X64-AVX-NEXT: cmpq %rcx, 16(%rdi)
+; X64-AVX-NEXT: je .LBB23_3
+; X64-AVX-NEXT: .LBB23_2: # %res_block
+; X64-AVX-NEXT: movl $1, %eax
+; X64-AVX-NEXT: .LBB23_3: # %endblock
+; X64-AVX-NEXT: testl %eax, %eax
+; X64-AVX-NEXT: setne %al
+; X64-AVX-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 24) nounwind
%c = icmp ne i32 %m, 0
ret i1 %c
@@ -755,7 +849,7 @@ define i1 @length24_eq_const(i8* %X) nounwind {
define i32 @length32(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $32
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -765,7 +859,7 @@ define i32 @length32(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $32, %edx
; X64-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 32) nounwind
@@ -775,30 +869,101 @@ define i32 @length32(i8* %X, i8* %Y) nounwind {
; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
define i1 @length32_eq(i8* %x, i8* %y) nounwind {
-; X86-LABEL: length32_eq:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: sete %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: length32_eq:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl $0
+; X86-SSE1-NEXT: pushl $32
+; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: calll memcmp
+; X86-SSE1-NEXT: addl $16, %esp
+; X86-SSE1-NEXT: testl %eax, %eax
+; X86-SSE1-NEXT: sete %al
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %edx
+; X86-SSE2-NEXT: cmpl $65535, %edx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB25_2
+; X86-SSE2-NEXT: # %bb.1: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB25_3
+; X86-SSE2-NEXT: .LBB25_2: # %res_block
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: .LBB25_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: sete %al
+; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB25_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB25_3
+; X64-SSE2-NEXT: .LBB25_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB25_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
; X64-SSE2-NEXT: sete %al
-; X64-SSE2-NEXT: popq %rcx
; X64-SSE2-NEXT: retq
;
+; X64-AVX1-LABEL: length32_eq:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX1-NEXT: vpcmpeqb (%rsi), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX1-NEXT: jne .LBB25_2
+; X64-AVX1-NEXT: # %bb.1: # %loadbb1
+; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
+; X64-AVX1-NEXT: vpcmpeqb 16(%rsi), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-AVX1-NEXT: je .LBB25_3
+; X64-AVX1-NEXT: .LBB25_2: # %res_block
+; X64-AVX1-NEXT: movl $1, %eax
+; X64-AVX1-NEXT: .LBB25_3: # %endblock
+; X64-AVX1-NEXT: testl %eax, %eax
+; X64-AVX1-NEXT: sete %al
+; X64-AVX1-NEXT: retq
+;
; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
@@ -812,31 +977,96 @@ define i1 @length32_eq(i8* %x, i8* %y) nounwind {
}
define i1 @length32_eq_const(i8* %X) nounwind {
-; X86-LABEL: length32_eq_const:
-; X86: # BB#0:
-; X86-NEXT: pushl $0
-; X86-NEXT: pushl $32
-; X86-NEXT: pushl $.L.str
-; X86-NEXT: pushl {{[0-9]+}}(%esp)
-; X86-NEXT: calll memcmp
-; X86-NEXT: addl $16, %esp
-; X86-NEXT: testl %eax, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: retl
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl $0
+; X86-NOSSE-NEXT: pushl $32
+; X86-NOSSE-NEXT: pushl $.L.str
+; X86-NOSSE-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT: calll memcmp
+; X86-NOSSE-NEXT: addl $16, %esp
+; X86-NOSSE-NEXT: testl %eax, %eax
+; X86-NOSSE-NEXT: setne %al
+; X86-NOSSE-NEXT: retl
+;
+; X86-SSE1-LABEL: length32_eq_const:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl $0
+; X86-SSE1-NEXT: pushl $32
+; X86-SSE1-NEXT: pushl $.L.str
+; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-SSE1-NEXT: calll memcmp
+; X86-SSE1-NEXT: addl $16, %esp
+; X86-SSE1-NEXT: testl %eax, %eax
+; X86-SSE1-NEXT: setne %al
+; X86-SSE1-NEXT: retl
+;
+; X86-SSE2-LABEL: length32_eq_const:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: jne .LBB26_2
+; X86-SSE2-NEXT: # %bb.1: # %loadbb1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
+; X86-SSE2-NEXT: pcmpeqb {{\.LCPI.*}}, %xmm0
+; X86-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X86-SSE2-NEXT: xorl %eax, %eax
+; X86-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X86-SSE2-NEXT: je .LBB26_3
+; X86-SSE2-NEXT: .LBB26_2: # %res_block
+; X86-SSE2-NEXT: movl $1, %eax
+; X86-SSE2-NEXT: .LBB26_3: # %endblock
+; X86-SSE2-NEXT: testl %eax, %eax
+; X86-SSE2-NEXT: setne %al
+; X86-SSE2-NEXT: retl
;
; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2: # BB#0:
-; X64-SSE2-NEXT: pushq %rax
-; X64-SSE2-NEXT: movl $.L.str, %esi
-; X64-SSE2-NEXT: movl $32, %edx
-; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT: jne .LBB26_2
+; X64-SSE2-NEXT: # %bb.1: # %loadbb1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
+; X64-SSE2-NEXT: pcmpeqb {{.*}}(%rip), %xmm0
+; X64-SSE2-NEXT: pmovmskb %xmm0, %ecx
+; X64-SSE2-NEXT: xorl %eax, %eax
+; X64-SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-SSE2-NEXT: je .LBB26_3
+; X64-SSE2-NEXT: .LBB26_2: # %res_block
+; X64-SSE2-NEXT: movl $1, %eax
+; X64-SSE2-NEXT: .LBB26_3: # %endblock
; X64-SSE2-NEXT: testl %eax, %eax
; X64-SSE2-NEXT: setne %al
-; X64-SSE2-NEXT: popq %rcx
; X64-SSE2-NEXT: retq
;
+; X64-AVX1-LABEL: length32_eq_const:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmovmskb %xmm0, %eax
+; X64-AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; X64-AVX1-NEXT: jne .LBB26_2
+; X64-AVX1-NEXT: # %bb.1: # %loadbb1
+; X64-AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
+; X64-AVX1-NEXT: vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; X64-AVX1-NEXT: xorl %eax, %eax
+; X64-AVX1-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
+; X64-AVX1-NEXT: je .LBB26_3
+; X64-AVX1-NEXT: .LBB26_2: # %res_block
+; X64-AVX1-NEXT: movl $1, %eax
+; X64-AVX1-NEXT: .LBB26_3: # %endblock
+; X64-AVX1-NEXT: testl %eax, %eax
+; X64-AVX1-NEXT: setne %al
+; X64-AVX1-NEXT: retq
+;
; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2: # BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
@@ -851,7 +1081,7 @@ define i1 @length32_eq_const(i8* %X) nounwind {
define i32 @length64(i8* %X, i8* %Y) nounwind {
; X86-LABEL: length64:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -861,7 +1091,7 @@ define i32 @length64(i8* %X, i8* %Y) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: length64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $64, %edx
; X64-NEXT: jmp memcmp # TAILCALL
%m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 64) nounwind
@@ -870,7 +1100,7 @@ define i32 @length64(i8* %X, i8* %Y) nounwind {
define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X86-LABEL: length64_eq:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -881,15 +1111,47 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
; X86-NEXT: setne %al
; X86-NEXT: retl
;
-; X64-LABEL: length64_eq:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: setne %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: setne %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX1-LABEL: length64_eq:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: movl $64, %edx
+; X64-AVX1-NEXT: callq memcmp
+; X64-AVX1-NEXT: testl %eax, %eax
+; X64-AVX1-NEXT: setne %al
+; X64-AVX1-NEXT: popq %rcx
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb (%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB28_2
+; X64-AVX2-NEXT: # %bb.1: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB28_3
+; X64-AVX2-NEXT: .LBB28_2: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB28_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: setne %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
%cmp = icmp ne i32 %call, 0
ret i1 %cmp
@@ -897,7 +1159,7 @@ define i1 @length64_eq(i8* %x, i8* %y) nounwind {
define i1 @length64_eq_const(i8* %X) nounwind {
; X86-LABEL: length64_eq_const:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl $0
; X86-NEXT: pushl $64
; X86-NEXT: pushl $.L.str
@@ -908,18 +1170,72 @@ define i1 @length64_eq_const(i8* %X) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: retl
;
-; X64-LABEL: length64_eq_const:
-; X64: # BB#0:
-; X64-NEXT: pushq %rax
-; X64-NEXT: movl $.L.str, %esi
-; X64-NEXT: movl $64, %edx
-; X64-NEXT: callq memcmp
-; X64-NEXT: testl %eax, %eax
-; X64-NEXT: sete %al
-; X64-NEXT: popq %rcx
-; X64-NEXT: retq
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: pushq %rax
+; X64-SSE2-NEXT: movl $.L.str, %esi
+; X64-SSE2-NEXT: movl $64, %edx
+; X64-SSE2-NEXT: callq memcmp
+; X64-SSE2-NEXT: testl %eax, %eax
+; X64-SSE2-NEXT: sete %al
+; X64-SSE2-NEXT: popq %rcx
+; X64-SSE2-NEXT: retq
+;
+; X64-AVX1-LABEL: length64_eq_const:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: movl $.L.str, %esi
+; X64-AVX1-NEXT: movl $64, %edx
+; X64-AVX1-NEXT: callq memcmp
+; X64-AVX1-NEXT: testl %eax, %eax
+; X64-AVX1-NEXT: sete %al
+; X64-AVX1-NEXT: popq %rcx
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: length64_eq_const:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %eax
+; X64-AVX2-NEXT: cmpl $-1, %eax
+; X64-AVX2-NEXT: jne .LBB29_2
+; X64-AVX2-NEXT: # %bb.1: # %loadbb1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; X64-AVX2-NEXT: xorl %eax, %eax
+; X64-AVX2-NEXT: cmpl $-1, %ecx
+; X64-AVX2-NEXT: je .LBB29_3
+; X64-AVX2-NEXT: .LBB29_2: # %res_block
+; X64-AVX2-NEXT: movl $1, %eax
+; X64-AVX2-NEXT: .LBB29_3: # %endblock
+; X64-AVX2-NEXT: testl %eax, %eax
+; X64-AVX2-NEXT: sete %al
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
%m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
%c = icmp eq i32 %m, 0
ret i1 %c
}
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(i8* %X, i8* %Y) nounwind {
+; X86-LABEL: huge_length:
+; X86: # %bb.0:
+; X86-NEXT: pushl $2147483647 # imm = 0x7FFFFFFF
+; X86-NEXT: pushl $-1
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll memcmp
+; X86-NEXT: addl $16, %esp
+; X86-NEXT: retl
+;
+; X64-LABEL: huge_length:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NEXT: jmp memcmp # TAILCALL
+ %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 9223372036854775807) nounwind
+ ret i32 %m
+}
+
+
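As an aside, here is a minimal standalone sketch (not part of the patch; function names are hypothetical) of the two memcmp shapes the tests above exercise: a small constant-size equality compare, which the backend may expand into vector loads plus pcmpeqb as in the regenerated checks, and the huge_length case, which should stay an ordinary memcmp libcall.

declare i32 @memcmp(i8*, i8*, i64)

; Small constant size: a candidate for expansion into two 16-byte
; vector compares on SSE2/AVX targets.
define i1 @sketch_eq32(i8* %x, i8* %y) nounwind {
  %m = call i32 @memcmp(i8* %x, i8* %y, i64 32)
  %c = icmp eq i32 %m, 0
  ret i1 %c
}

; Huge constant size: expansion would be wasteful, so this should
; remain a plain libcall (tail call / jmp in the checks above).
define i32 @sketch_huge(i8* %x, i8* %y) nounwind {
  %m = call i32 @memcmp(i8* %x, i8* %y, i64 9223372036854775807)
  ret i32 %m
}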
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index 7ef61c9a677b..040dd153d643 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -27,8 +27,8 @@ entry:
; SSE1-LABEL: t1:
; SSE1: movaps _.str, %xmm0
-; SSE1: movaps %xmm0
; SSE1: movb $0, 24(%esp)
+; SSE1: movaps %xmm0
; SSE1: movl $0, 20(%esp)
; SSE1: movl $0, 16(%esp)
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index 1ac972048f12..e94432884b10 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -3,7 +3,7 @@
define fastcc void @t1() nounwind {
; CHECK-LABEL: t1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: subl $16, %esp
; CHECK-NEXT: pushl $188
; CHECK-NEXT: pushl $0
@@ -17,7 +17,7 @@ entry:
define fastcc void @t2(i8 signext %c) nounwind {
; CHECK-LABEL: t2:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $76, {{[0-9]+}}(%esp)
@@ -31,7 +31,7 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
define void @t3(i8* nocapture %s, i8 %a) nounwind {
; CHECK-LABEL: t3:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101
@@ -45,7 +45,7 @@ entry:
define void @t4(i8* nocapture %s, i8 %a) nounwind {
; CHECK-LABEL: t4:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: imull $16843009, %ecx, %ecx ## imm = 0x1010101
diff --git a/test/CodeGen/X86/memset-nonzero.ll b/test/CodeGen/X86/memset-nonzero.ll
index 13258fd81de5..1c97e8c768cc 100644
--- a/test/CodeGen/X86/memset-nonzero.ll
+++ b/test/CodeGen/X86/memset-nonzero.ll
@@ -9,20 +9,20 @@
define void @memset_16_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 8(%rdi)
; SSE-NEXT: movq %rax, (%rdi)
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -32,7 +32,7 @@ define void @memset_16_nonzero_bytes(i8* %x) {
define void @memset_32_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 24(%rdi)
; SSE-NEXT: movq %rax, 16(%rdi)
@@ -41,14 +41,14 @@ define void @memset_32_nonzero_bytes(i8* %x) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT: movups %xmm0, (%rdi)
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
@@ -59,7 +59,7 @@ define void @memset_32_nonzero_bytes(i8* %x) {
define void @memset_64_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 56(%rdi)
; SSE-NEXT: movq %rax, 48(%rdi)
@@ -72,7 +72,7 @@ define void @memset_64_nonzero_bytes(i8* %x) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 32(%rdi)
@@ -81,7 +81,7 @@ define void @memset_64_nonzero_bytes(i8* %x) {
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_64_nonzero_bytes:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, 32(%rdi)
; AVX-NEXT: vmovups %ymm0, (%rdi)
@@ -93,7 +93,7 @@ define void @memset_64_nonzero_bytes(i8* %x) {
define void @memset_128_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT: movq %rax, 120(%rdi)
; SSE-NEXT: movq %rax, 112(%rdi)
@@ -114,7 +114,7 @@ define void @memset_128_nonzero_bytes(i8* %x) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 96(%rdi)
@@ -127,7 +127,7 @@ define void @memset_128_nonzero_bytes(i8* %x) {
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_128_nonzero_bytes:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, 96(%rdi)
; AVX-NEXT: vmovups %ymm0, 64(%rdi)
@@ -141,9 +141,8 @@ define void @memset_128_nonzero_bytes(i8* %x) {
define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %rax
-; SSE-NEXT: .Lcfi0:
; SSE-NEXT: .cfi_def_cfa_offset 16
; SSE-NEXT: movl $42, %esi
; SSE-NEXT: movl $256, %edx # imm = 0x100
@@ -152,7 +151,7 @@ define void @memset_256_nonzero_bytes(i8* %x) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT: movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT: movups %xmm0, 224(%rdi)
@@ -173,7 +172,7 @@ define void @memset_256_nonzero_bytes(i8* %x) {
; SSE2FAST-NEXT: retq
;
; AVX-LABEL: memset_256_nonzero_bytes:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT: vmovups %ymm0, 224(%rdi)
; AVX-NEXT: vmovups %ymm0, 192(%rdi)
@@ -195,7 +194,7 @@ declare i8* @__memset_chk(i8*, i32, i64, i64)
define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
@@ -204,7 +203,7 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -213,7 +212,7 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -221,7 +220,7 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rdi)
@@ -232,7 +231,7 @@ define void @memset_16_nonconst_bytes(i8* %x, i8 %c) {
define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
@@ -243,7 +242,7 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -253,7 +252,7 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -263,7 +262,7 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
@@ -275,7 +274,7 @@ define void @memset_32_nonconst_bytes(i8* %x, i8 %c) {
define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
@@ -290,7 +289,7 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -302,7 +301,7 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -313,7 +312,7 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
@@ -326,7 +325,7 @@ define void @memset_64_nonconst_bytes(i8* %x, i8 %c) {
define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzbl %sil, %eax
; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT: imulq %rax, %rcx
@@ -349,7 +348,7 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE-NEXT: retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -365,7 +364,7 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -378,7 +377,7 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi)
@@ -393,12 +392,12 @@ define void @memset_128_nonconst_bytes(i8* %x, i8 %c) {
define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $256, %edx # imm = 0x100
; SSE-NEXT: jmp memset # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
-; SSE2FAST: # BB#0:
+; SSE2FAST: # %bb.0:
; SSE2FAST-NEXT: movd %esi, %xmm0
; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -422,7 +421,7 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; SSE2FAST-NEXT: retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -439,7 +438,7 @@ define void @memset_256_nonconst_bytes(i8* %x, i8 %c) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %esi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi)
diff --git a/test/CodeGen/X86/memset.ll b/test/CodeGen/X86/memset.ll
index 96a22e885675..c9d8fbd58aaa 100644
--- a/test/CodeGen/X86/memset.ll
+++ b/test/CodeGen/X86/memset.ll
@@ -1,39 +1,65 @@
-; RUN: llc < %s -march=x86 -mcpu=pentium2 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -march=x86 -mcpu=pentium3 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=XMM
-; RUN: llc < %s -march=x86 -mcpu=bdver1 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=YMM
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=pentium2 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mcpu=pentium3 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=XMM
+; RUN: llc < %s -mcpu=bdver1 -mtriple=i686-apple-darwin8.8.0 | FileCheck %s --check-prefix=YMM
- %struct.x = type { i16, i16 }
+%struct.x = type { i16, i16 }
define void @t() nounwind {
+; X86-LABEL: t:
+; X86: ## %bb.0: ## %entry
+; X86-NEXT: subl $44, %esp
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: calll _foo
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: retl
+; X86-NEXT: ## -- End function
+;
+; XMM-LABEL: t:
+; XMM: ## %bb.0: ## %entry
+; XMM-NEXT: subl $60, %esp
+; XMM-NEXT: xorps %xmm0, %xmm0
+; XMM-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; XMM-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; XMM-NEXT: leal {{[0-9]+}}(%esp), %eax
+; XMM-NEXT: movl %eax, (%esp)
+; XMM-NEXT: calll _foo
+; XMM-NEXT: addl $60, %esp
+; XMM-NEXT: retl
+; XMM-NEXT: ## -- End function
+;
+; YMM-LABEL: t:
+; YMM: ## %bb.0: ## %entry
+; YMM-NEXT: pushl %ebp
+; YMM-NEXT: movl %esp, %ebp
+; YMM-NEXT: andl $-32, %esp
+; YMM-NEXT: subl $96, %esp
+; YMM-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; YMM-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
+; YMM-NEXT: leal {{[0-9]+}}(%esp), %eax
+; YMM-NEXT: movl %eax, (%esp)
+; YMM-NEXT: vzeroupper
+; YMM-NEXT: calll _foo
+; YMM-NEXT: movl %ebp, %esp
+; YMM-NEXT: popl %ebp
+; YMM-NEXT: retl
+; YMM-NEXT: ## -- End function
entry:
%up_mvd = alloca [8 x %struct.x] ; <[8 x %struct.x]*> [#uses=2]
%up_mvd116 = getelementptr [8 x %struct.x], [8 x %struct.x]* %up_mvd, i32 0, i32 0 ; <%struct.x*> [#uses=1]
%tmp110117 = bitcast [8 x %struct.x]* %up_mvd to i8* ; <i8*> [#uses=1]
call void @llvm.memset.p0i8.i64(i8* %tmp110117, i8 0, i64 32, i32 8, i1 false)
-; X86: movl $0,
-; X86: movl $0,
-; X86: movl $0,
-; X86: movl $0,
-; X86: movl $0,
-; X86: movl $0,
-; X86: movl $0,
-; X86: movl $0,
-; X86-NOT: movl $0,
-; X86: ret
-
-; XMM: xorps %xmm{{[0-9]+}}, [[Z:%xmm[0-9]+]]
-; XMM: movaps [[Z]],
-; XMM: movaps [[Z]],
-; XMM-NOT: movaps
-; XMM: ret
-
-; YMM: vxorps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, [[Z:%ymm[0-9]+]]
-; YMM: vmovaps [[Z]],
-; YMM-NOT: movaps
-; YMM: ret
-
- call void @foo( %struct.x* %up_mvd116 ) nounwind
+ call void @foo( %struct.x* %up_mvd116 ) nounwind
ret void
}
@@ -41,15 +67,36 @@ declare void @foo(%struct.x*)
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
-define void @PR15348(i8* %a) {
; Ensure that alignment of '0' in an @llvm.memset intrinsic results in
; unaligned loads and stores.
-; XMM: PR15348
-; XMM: movb $0,
-; XMM: movl $0,
-; XMM: movl $0,
-; XMM: movl $0,
-; XMM: movl $0,
+define void @PR15348(i8* %a) {
+; X86-LABEL: PR15348:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb $0, 16(%eax)
+; X86-NEXT: movl $0, 12(%eax)
+; X86-NEXT: movl $0, 8(%eax)
+; X86-NEXT: movl $0, 4(%eax)
+; X86-NEXT: movl $0, (%eax)
+; X86-NEXT: retl
+;
+; XMM-LABEL: PR15348:
+; XMM: ## %bb.0:
+; XMM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; XMM-NEXT: movb $0, 16(%eax)
+; XMM-NEXT: movl $0, 12(%eax)
+; XMM-NEXT: movl $0, 8(%eax)
+; XMM-NEXT: movl $0, 4(%eax)
+; XMM-NEXT: movl $0, (%eax)
+; XMM-NEXT: retl
+;
+; YMM-LABEL: PR15348:
+; YMM: ## %bb.0:
+; YMM-NEXT: movl {{[0-9]+}}(%esp), %eax
+; YMM-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; YMM-NEXT: vmovups %xmm0, (%eax)
+; YMM-NEXT: movb $0, 16(%eax)
+; YMM-NEXT: retl
call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 17, i32 0, i1 false)
ret void
}
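For reference, a small sketch (separate from the patch, using the same pre-LLVM-7 memset intrinsic signature whose fourth operand is the alignment) of the PR15348 contrast covered above: an alignment argument of 0 means the pointer has no known alignment, so only unaligned stores are legal, whereas a known alignment would permit aligned stores. The function names are hypothetical.

declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind

; Alignment 0 (unknown): lowering must use unaligned stores
; (movups / scalar stores), never movaps.
define void @sketch_unaligned_clear(i8* %p) {
  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 17, i32 0, i1 false)
  ret void
}

; Hypothetical contrast: with 16-byte alignment the first 16 bytes of the
; same 17-byte clear could legally be written with an aligned store.
define void @sketch_aligned_clear(i8* %p) {
  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 17, i32 16, i1 false)
  ret void
}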
diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll
index a7a3c61b1392..0fc21920409b 100644
--- a/test/CodeGen/X86/memset64-on-x86-32.ll
+++ b/test/CodeGen/X86/memset64-on-x86-32.ll
@@ -5,7 +5,7 @@
define void @bork() nounwind {
; FAST-LABEL: bork:
-; FAST: # BB#0:
+; FAST: # %bb.0:
; FAST-NEXT: xorps %xmm0, %xmm0
; FAST-NEXT: movups %xmm0, 64
; FAST-NEXT: movups %xmm0, 48
@@ -15,7 +15,7 @@ define void @bork() nounwind {
; FAST-NEXT: retl
;
; SLOW_32-LABEL: bork:
-; SLOW_32: # BB#0:
+; SLOW_32: # %bb.0:
; SLOW_32-NEXT: movl $0, 4
; SLOW_32-NEXT: movl $0, 0
; SLOW_32-NEXT: movl $0, 12
@@ -39,7 +39,7 @@ define void @bork() nounwind {
; SLOW_32-NEXT: retl
;
; SLOW_64-LABEL: bork:
-; SLOW_64: # BB#0:
+; SLOW_64: # %bb.0:
; SLOW_64-NEXT: movq $0, 72
; SLOW_64-NEXT: movq $0, 64
; SLOW_64-NEXT: movq $0, 56
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 1d5829407b71..8c96b2bec8a9 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -11,17 +11,17 @@
define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2f64_f64_23:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups 16(%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: merge_2f64_f64_23:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 16(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_2f64_f64_23:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: fldl 16(%eax)
; X32-SSE1-NEXT: fldl 24(%eax)
@@ -29,7 +29,7 @@ define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_2f64_f64_23:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movups 16(%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -44,26 +44,22 @@ define <2 x double> @merge_2f64_f64_23(double* %ptr) nounwind uwtable noinline s
define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2i64_i64_12:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups 8(%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: merge_2i64_i64_12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 8(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_2i64_i64_12:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: pushl %edi
-; X32-SSE1-NEXT: .Lcfi0:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi1:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi2:
; X32-SSE1-NEXT: .cfi_offset %esi, -12
-; X32-SSE1-NEXT: .Lcfi3:
; X32-SSE1-NEXT: .cfi_offset %edi, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -80,7 +76,7 @@ define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movups 8(%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -95,17 +91,17 @@ define <2 x i64> @merge_2i64_i64_12(i64* %ptr) nounwind uwtable noinline ssp {
define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_2345:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups 8(%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_2345:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 8(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE-LABEL: merge_4f32_f32_2345:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movups 8(%eax), %xmm0
; X32-SSE-NEXT: retl
@@ -126,17 +122,17 @@ define <4 x float> @merge_4f32_f32_2345(float* %ptr) nounwind uwtable noinline s
define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_3zuu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_3zuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-SSE-LABEL: merge_4f32_f32_3zuu:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: retl
@@ -149,17 +145,17 @@ define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline s
define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_34uu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_34uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_34uu:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -167,7 +163,7 @@ define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_34uu:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: retl
@@ -182,7 +178,7 @@ define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline s
define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_34z6:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movups 12(%rdi), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
@@ -190,20 +186,20 @@ define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline s
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_34z6:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movups 12(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm0, %xmm0
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_34z6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2],mem[3]
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_34z6:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movups 12(%eax), %xmm0
; X32-SSE1-NEXT: xorps %xmm1, %xmm1
@@ -212,7 +208,7 @@ define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_34z6:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movups 12(%eax), %xmm1
; X32-SSE41-NEXT: xorps %xmm0, %xmm0
@@ -232,17 +228,17 @@ define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline s
define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_45zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_45zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_45zz:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -252,7 +248,7 @@ define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_45zz:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: retl
@@ -267,26 +263,26 @@ define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline s
define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_012u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_012u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_012u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_012u:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -296,7 +292,7 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_012u:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -316,26 +312,26 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s
define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_019u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_019u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_019u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_019u:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -345,7 +341,7 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_019u:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -365,21 +361,19 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s
define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups 8(%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4i32_i32_23u5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 8(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4i32_i32_23u5:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi4:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: .Lcfi5:
; X32-SSE1-NEXT: .cfi_offset %esi, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -393,7 +387,7 @@ define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_23u5:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movups 8(%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -409,19 +403,129 @@ define <4 x i32> @merge_4i32_i32_23u5(i32* %ptr) nounwind uwtable noinline ssp {
ret <4 x i32> %res3
}
+define <4 x i32> @merge_4i32_i32_23u5_inc2(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5_inc2:
+; SSE: # %bb.0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: incl 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5_inc2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: incl 8(%rdi)
+; AVX-NEXT: retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc2:
+; X32-SSE1: # %bb.0:
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT: movl 8(%ecx), %edx
+; X32-SSE1-NEXT: movl 12(%ecx), %esi
+; X32-SSE1-NEXT: leal 1(%edx), %edi
+; X32-SSE1-NEXT: movl %edi, 8(%ecx)
+; X32-SSE1-NEXT: movl 20(%ecx), %ecx
+; X32-SSE1-NEXT: movl %esi, 4(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl %ecx, 12(%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc2:
+; X32-SSE41: # %bb.0:
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movups 8(%eax), %xmm0
+; X32-SSE41-NEXT: incl 8(%eax)
+; X32-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %inc = add i32 %val0, 1
+ store i32 %inc, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %val3 = load i32, i32* %ptr3
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+ ret <4 x i32> %res3
+}
+
+define <4 x i32> @merge_4i32_i32_23u5_inc3(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_23u5_inc3:
+; SSE: # %bb.0:
+; SSE-NEXT: movups 8(%rdi), %xmm0
+; SSE-NEXT: incl 12(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_23u5_inc3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovups 8(%rdi), %xmm0
+; AVX-NEXT: incl 12(%rdi)
+; AVX-NEXT: retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_23u5_inc3:
+; X32-SSE1: # %bb.0:
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT: movl 8(%ecx), %edx
+; X32-SSE1-NEXT: movl 12(%ecx), %esi
+; X32-SSE1-NEXT: leal 1(%esi), %edi
+; X32-SSE1-NEXT: movl %edi, 12(%ecx)
+; X32-SSE1-NEXT: movl 20(%ecx), %ecx
+; X32-SSE1-NEXT: movl %esi, 4(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl %ecx, 12(%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_23u5_inc3:
+; X32-SSE41: # %bb.0:
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movups 8(%eax), %xmm0
+; X32-SSE41-NEXT: incl 12(%eax)
+; X32-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
+ %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %inc = add i32 %val1, 1
+ store i32 %inc, i32* %ptr1
+ %val3 = load i32, i32* %ptr3
+ %res0 = insertelement <4 x i32> undef, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ %res3 = insertelement <4 x i32> %res1, i32 %val3, i32 3
+ ret <4 x i32> %res3
+}
+
define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_3zuu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4i32_i32_3zuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4i32_i32_3zuu:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl 12(%ecx), %ecx
@@ -430,7 +534,7 @@ define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_3zuu:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: retl
@@ -443,17 +547,17 @@ define <4 x i32> @merge_4i32_i32_3zuu(i32* %ptr) nounwind uwtable noinline ssp {
define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_34uu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4i32_i32_34uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4i32_i32_34uu:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl 12(%ecx), %edx
@@ -463,7 +567,7 @@ define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_34uu:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: retl
@@ -478,17 +582,17 @@ define <4 x i32> @merge_4i32_i32_34uu(i32* %ptr) nounwind uwtable noinline ssp {
define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_45zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4i32_i32_45zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4i32_i32_45zz:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl 16(%ecx), %edx
@@ -500,14 +604,118 @@ define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_4i32_i32_45zz:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_45zz_inc4(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz_inc4:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: incl 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz_inc4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: incl 16(%rdi)
+; AVX-NEXT: retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc4:
+; X32-SSE1: # %bb.0:
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT: movl 16(%ecx), %edx
+; X32-SSE1-NEXT: movl 20(%ecx), %esi
+; X32-SSE1-NEXT: leal 1(%edx), %edi
+; X32-SSE1-NEXT: movl %edi, 16(%ecx)
+; X32-SSE1-NEXT: movl %esi, 4(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl $0, 12(%eax)
+; X32-SSE1-NEXT: movl $0, 8(%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc4:
+; X32-SSE41: # %bb.0:
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT: incl 16(%eax)
+; X32-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
+ %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
+ %val0 = load i32, i32* %ptr0
+ %inc = add i32 %val0, 1
+ store i32 %inc, i32* %ptr0
+ %val1 = load i32, i32* %ptr1
+ %res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
+ ret <4 x i32> %res1
+}
+
+define <4 x i32> @merge_4i32_i32_45zz_inc5(i32* %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_4i32_i32_45zz_inc5:
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: incl 20(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_4i32_i32_45zz_inc5:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: incl 20(%rdi)
+; AVX-NEXT: retq
+;
+; X32-SSE1-LABEL: merge_4i32_i32_45zz_inc5:
+; X32-SSE1: # %bb.0:
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT: movl 16(%ecx), %edx
+; X32-SSE1-NEXT: movl 20(%ecx), %esi
+; X32-SSE1-NEXT: leal 1(%esi), %edi
+; X32-SSE1-NEXT: movl %edi, 20(%ecx)
+; X32-SSE1-NEXT: movl %esi, 4(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl $0, 12(%eax)
+; X32-SSE1-NEXT: movl $0, 8(%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
+; X32-SSE1-NEXT: retl $4
+;
+; X32-SSE41-LABEL: merge_4i32_i32_45zz_inc5:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE41-NEXT: incl 20(%eax)
; X32-SSE41-NEXT: retl
%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 4
%ptr1 = getelementptr inbounds i32, i32* %ptr, i64 5
%val0 = load i32, i32* %ptr0
%val1 = load i32, i32* %ptr1
+ %inc = add i32 %val1, 1
+ store i32 %inc, i32* %ptr1
%res0 = insertelement <4 x i32> zeroinitializer, i32 %val0, i32 0
%res1 = insertelement <4 x i32> %res0, i32 %val1, i32 1
ret <4 x i32> %res1
@@ -515,59 +723,39 @@ define <4 x i32> @merge_4i32_i32_45zz(i32* %ptr) nounwind uwtable noinline ssp {
define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_23u567u9:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups 4(%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: merge_8i16_i16_23u567u9:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 4(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebp
-; X32-SSE1-NEXT: .Lcfi6:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: pushl %ebx
-; X32-SSE1-NEXT: .Lcfi7:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: pushl %edi
-; X32-SSE1-NEXT: .Lcfi8:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi9:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT: .Lcfi10:
-; X32-SSE1-NEXT: .cfi_offset %esi, -20
-; X32-SSE1-NEXT: .Lcfi11:
-; X32-SSE1-NEXT: .cfi_offset %edi, -16
-; X32-SSE1-NEXT: .Lcfi12:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -12
-; X32-SSE1-NEXT: .Lcfi13:
-; X32-SSE1-NEXT: .cfi_offset %ebp, -8
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 4(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 6(%ecx), %esi
-; X32-SSE1-NEXT: movzwl 10(%ecx), %edi
-; X32-SSE1-NEXT: movzwl 12(%ecx), %ebx
-; X32-SSE1-NEXT: movzwl 14(%ecx), %ebp
+; X32-SSE1-NEXT: movl 4(%ecx), %edx
+; X32-SSE1-NEXT: movl 10(%ecx), %esi
+; X32-SSE1-NEXT: movzwl 14(%ecx), %edi
; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
-; X32-SSE1-NEXT: movw %bp, 10(%eax)
-; X32-SSE1-NEXT: movw %bx, 8(%eax)
+; X32-SSE1-NEXT: movw %di, 10(%eax)
; X32-SSE1-NEXT: movw %cx, 14(%eax)
-; X32-SSE1-NEXT: movw %si, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
-; X32-SSE1-NEXT: movw %di, 6(%eax)
+; X32-SSE1-NEXT: movl %esi, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: popl %esi
; X32-SSE1-NEXT: popl %edi
-; X32-SSE1-NEXT: popl %ebx
-; X32-SSE1-NEXT: popl %ebp
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movups 4(%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -594,27 +782,25 @@ define <8 x i16> @merge_8i16_i16_23u567u9(i16* %ptr) nounwind uwtable noinline s
define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_34uuuuuu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_8i16_i16_34uuuuuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_8i16_i16_34uuuuuu:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 6(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 8(%ecx), %ecx
-; X32-SSE1-NEXT: movw %cx, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
+; X32-SSE1-NEXT: movl 6(%ecx), %ecx
+; X32-SSE1-NEXT: movl %ecx, (%eax)
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: retl
@@ -629,39 +815,29 @@ define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline s
define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_45u7zzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_8i16_i16_45u7zzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi14:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: .Lcfi15:
-; X32-SSE1-NEXT: .cfi_offset %esi, -8
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 8(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 10(%ecx), %esi
+; X32-SSE1-NEXT: movl 8(%ecx), %edx
; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
-; X32-SSE1-NEXT: movw %si, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
; X32-SSE1-NEXT: movw %cx, 6(%eax)
-; X32-SSE1-NEXT: movw $0, 14(%eax)
-; X32-SSE1-NEXT: movw $0, 12(%eax)
-; X32-SSE1-NEXT: movw $0, 10(%eax)
-; X32-SSE1-NEXT: movw $0, 8(%eax)
-; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl $0, 12(%eax)
+; X32-SSE1-NEXT: movl $0, 8(%eax)
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: retl
@@ -683,79 +859,51 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline s
define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: merge_16i8_i8_01u3456789ABCDuF:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebx
-; X32-SSE1-NEXT: .Lcfi16:
+; X32-SSE1: # %bb.0:
+; X32-SSE1-NEXT: pushl %ebp
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: subl $12, %esp
-; X32-SSE1-NEXT: .Lcfi17:
+; X32-SSE1-NEXT: pushl %ebx
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X32-SSE1-NEXT: pushl %esi
; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT: .Lcfi18:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -8
+; X32-SSE1-NEXT: .cfi_offset %esi, -20
+; X32-SSE1-NEXT: .cfi_offset %edi, -16
+; X32-SSE1-NEXT: .cfi_offset %ebx, -12
+; X32-SSE1-NEXT: .cfi_offset %ebp, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 1(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 3(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 4(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 5(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 6(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 7(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 8(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 9(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 10(%ecx), %bh
-; X32-SSE1-NEXT: movb 11(%ecx), %bl
-; X32-SSE1-NEXT: movb 12(%ecx), %dh
+; X32-SSE1-NEXT: movzwl (%ecx), %ebp
+; X32-SSE1-NEXT: movl 3(%ecx), %esi
+; X32-SSE1-NEXT: movl 7(%ecx), %edi
+; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx
; X32-SSE1-NEXT: movb 13(%ecx), %dl
; X32-SSE1-NEXT: movb 15(%ecx), %cl
; X32-SSE1-NEXT: movb %dl, 13(%eax)
-; X32-SSE1-NEXT: movb %dh, 12(%eax)
; X32-SSE1-NEXT: movb %cl, 15(%eax)
-; X32-SSE1-NEXT: movb %bl, 11(%eax)
-; X32-SSE1-NEXT: movb %bh, 10(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 9(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 8(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 7(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 6(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 5(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 4(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 1(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, (%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 3(%eax)
-; X32-SSE1-NEXT: addl $12, %esp
+; X32-SSE1-NEXT: movw %bx, 11(%eax)
+; X32-SSE1-NEXT: movl %edi, 7(%eax)
+; X32-SSE1-NEXT: movl %esi, 3(%eax)
+; X32-SSE1-NEXT: movw %bp, (%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: popl %ebp
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movups (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -806,34 +954,30 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(i8* %ptr) nounwind uwtable noin
define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb 1(%ecx), %dh
+; X32-SSE1-NEXT: movzwl (%ecx), %edx
; X32-SSE1-NEXT: movb 3(%ecx), %cl
-; X32-SSE1-NEXT: movb %dh, 1(%eax)
-; X32-SSE1-NEXT: movb %dl, (%eax)
; X32-SSE1-NEXT: movb %cl, 3(%eax)
+; X32-SSE1-NEXT: movw %dx, (%eax)
; X32-SSE1-NEXT: movb $0, 15(%eax)
-; X32-SSE1-NEXT: movb $0, 14(%eax)
-; X32-SSE1-NEXT: movb $0, 13(%eax)
-; X32-SSE1-NEXT: movb $0, 7(%eax)
-; X32-SSE1-NEXT: movb $0, 6(%eax)
+; X32-SSE1-NEXT: movw $0, 13(%eax)
+; X32-SSE1-NEXT: movw $0, 6(%eax)
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: retl
@@ -856,50 +1000,29 @@ define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noin
define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebx
-; X32-SSE1-NEXT: .Lcfi19:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: pushl %eax
-; X32-SSE1-NEXT: .Lcfi20:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi21:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -8
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 1(%ecx), %dh
-; X32-SSE1-NEXT: movb 2(%ecx), %bl
-; X32-SSE1-NEXT: movb 3(%ecx), %bh
-; X32-SSE1-NEXT: movb 6(%ecx), %dl
-; X32-SSE1-NEXT: movb 7(%ecx), %cl
-; X32-SSE1-NEXT: movb %cl, 7(%eax)
-; X32-SSE1-NEXT: movb %dl, 6(%eax)
-; X32-SSE1-NEXT: movb %bh, 3(%eax)
-; X32-SSE1-NEXT: movb %bl, 2(%eax)
-; X32-SSE1-NEXT: movb %dh, 1(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, (%eax)
+; X32-SSE1-NEXT: movl (%ecx), %edx
+; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx
+; X32-SSE1-NEXT: movw %cx, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
; X32-SSE1-NEXT: movb $0, 15(%eax)
-; X32-SSE1-NEXT: movb $0, 14(%eax)
-; X32-SSE1-NEXT: movb $0, 13(%eax)
-; X32-SSE1-NEXT: addl $4, %esp
-; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: movw $0, 13(%eax)
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-SSE41-NEXT: retl
@@ -929,30 +1052,29 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noin
define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
; SSE-LABEL: merge_4i32_i32_combine:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4i32_i32_combine:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovaps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4i32_i32_combine:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movl (%ecx), %ecx
-; X32-SSE1-NEXT: movl %ecx, (%eax)
-; X32-SSE1-NEXT: movl $0, 12(%eax)
-; X32-SSE1-NEXT: movl $0, 8(%eax)
-; X32-SSE1-NEXT: movl $0, 4(%eax)
+; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT: andps %xmm0, %xmm1
+; X32-SSE1-NEXT: movaps %xmm1, (%eax)
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4i32_i32_combine:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -974,30 +1096,26 @@ define void @merge_4i32_i32_combine(<4 x i32>* %dst, i32* %src) {
define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_2i64_i64_12_volatile:
-; SSE: # BB#0:
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: merge_2i64_i64_12_volatile:
-; AVX: # BB#0:
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: pushl %edi
-; X32-SSE1-NEXT: .Lcfi22:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi23:
; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi24:
; X32-SSE1-NEXT: .cfi_offset %esi, -12
-; X32-SSE1-NEXT: .Lcfi25:
; X32-SSE1-NEXT: .cfi_offset %edi, -8
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1014,7 +1132,7 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: merge_2i64_i64_12_volatile:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: pinsrd $1, 12(%eax), %xmm0
@@ -1032,16 +1150,16 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin
define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
; SSE2-LABEL: merge_4f32_f32_2345_volatile:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSE41-LABEL: merge_4f32_f32_2345_volatile:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1049,7 +1167,7 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
; SSE41-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_2345_volatile:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1057,7 +1175,7 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: merge_4f32_f32_2345_volatile:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -1069,7 +1187,7 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
; X32-SSE1-NEXT: retl
;
; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
@@ -1097,21 +1215,21 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_X0YY:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_X0YY:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT: retq
;
; X32-SSE-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -1134,17 +1252,17 @@ define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwt
; PR31309
define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
; SSE-LABEL: load_i32_zext_i128_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_zext_i128_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-SSE1-LABEL: load_i32_zext_i128_v4i32:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE1-NEXT: movl (%ecx), %ecx
@@ -1155,7 +1273,7 @@ define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
; X32-SSE1-NEXT: retl $4
;
; X32-SSE41-LABEL: load_i32_zext_i128_v4i32:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE41-NEXT: retl
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
index b00d732889e3..5693149b5921 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -8,12 +8,12 @@
define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_2f64_23:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 32(%rdi), %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_2f64_23:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovups 32(%eax), %ymm0
; X32-AVX-NEXT: retl
@@ -27,18 +27,14 @@ define <4 x double> @merge_4f64_2f64_23(<2 x double>* %ptr) nounwind uwtable noi
define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_2f64_2z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps 32(%rdi), %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_2f64_2z:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0
-; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
%val0 = load <2 x double>, <2 x double>* %ptr0
@@ -48,12 +44,12 @@ define <4 x double> @merge_4f64_2f64_2z(<2 x double>* %ptr) nounwind uwtable noi
define <4 x double> @merge_4f64_f64_2345(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_2345:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 16(%rdi), %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_f64_2345:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovups 16(%eax), %ymm0
; X32-AVX-NEXT: retl
@@ -74,12 +70,12 @@ define <4 x double> @merge_4f64_f64_2345(double* %ptr) nounwind uwtable noinline
define <4 x double> @merge_4f64_f64_3zuu(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_3zuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_f64_3zuu:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: retl
@@ -92,12 +88,12 @@ define <4 x double> @merge_4f64_f64_3zuu(double* %ptr) nounwind uwtable noinline
define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_34uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 24(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_f64_34uu:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovups 24(%eax), %xmm0
; X32-AVX-NEXT: retl
@@ -112,18 +108,14 @@ define <4 x double> @merge_4f64_f64_34uu(double* %ptr) nounwind uwtable noinline
define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_45zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovups 32(%rdi), %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps 32(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_f64_45zz:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vmovups 32(%eax), %xmm0
-; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps 32(%eax), %xmm0
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 4
%ptr1 = getelementptr inbounds double, double* %ptr, i64 5
@@ -136,15 +128,15 @@ define <4 x double> @merge_4f64_f64_45zz(double* %ptr) nounwind uwtable noinline
define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_34z6:
-; AVX: # BB#0:
-; AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_f64_34z6:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 3
@@ -162,18 +154,14 @@ define <4 x double> @merge_4f64_f64_34z6(double* %ptr) nounwind uwtable noinline
define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_2i64_3z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps 48(%rdi), %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4i64_2i64_3z:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps 48(%eax), %xmm0
-; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 3
%val0 = load <2 x i64>, <2 x i64>* %ptr0
@@ -183,12 +171,12 @@ define <4 x i64> @merge_4i64_2i64_3z(<2 x i64>* %ptr) nounwind uwtable noinline
define <4 x i64> @merge_4i64_i64_1234(i64* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_i64_1234:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups 8(%rdi), %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4i64_i64_1234:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovups 8(%eax), %ymm0
; X32-AVX-NEXT: retl
@@ -209,12 +197,12 @@ define <4 x i64> @merge_4i64_i64_1234(i64* %ptr) nounwind uwtable noinline ssp {
define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_i64_1zzu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4i64_i64_1zzu:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: retl
@@ -228,18 +216,14 @@ define <4 x i64> @merge_4i64_i64_1zzu(i64* %ptr) nounwind uwtable noinline ssp {
define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4i64_i64_23zz:
-; AVX: # BB#0:
-; AVX-NEXT: vmovups 16(%rdi), %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps 16(%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4i64_i64_23zz:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vmovups 16(%eax), %xmm0
-; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps 16(%eax), %xmm0
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 2
%ptr1 = getelementptr inbounds i64, i64* %ptr, i64 3
@@ -252,7 +236,7 @@ define <4 x i64> @merge_4i64_i64_23zz(i64* %ptr) nounwind uwtable noinline ssp {
define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
; AVX1-LABEL: merge_8f32_2f32_23z5:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vmovups 16(%rdi), %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
@@ -260,7 +244,7 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
; AVX1-NEXT: retq
;
; AVX2-LABEL: merge_8f32_2f32_23z5:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
@@ -268,7 +252,7 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
; AVX2-NEXT: retq
;
; AVX512F-LABEL: merge_8f32_2f32_23z5:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX512F-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
@@ -276,9 +260,9 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
; AVX512F-NEXT: retq
;
; X32-AVX-LABEL: merge_8f32_2f32_23z5:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3]
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
@@ -295,13 +279,13 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noi
define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_4f32_z2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_8f32_4f32_z2:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vinsertf128 $1, 32(%eax), %ymm0, %ymm0
@@ -314,12 +298,12 @@ define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinl
define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_f32_12zzuuzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_8f32_f32_12zzuuzz:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: retl
@@ -338,15 +322,15 @@ define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinli
define <8 x float> @merge_8f32_f32_1u3u5zu8(float* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_f32_1u3u5zu8:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_8f32_f32_1u3u5zu8:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds float, float* %ptr, i64 1
@@ -367,13 +351,13 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(float* %ptr) nounwind uwtable noinli
define <8 x i32> @merge_8i32_4i32_z3(<4 x i32>* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8i32_4i32_z3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_8i32_4i32_z3:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vinsertf128 $1, 48(%eax), %ymm0, %ymm0
@@ -386,14 +370,14 @@ define <8 x i32> @merge_8i32_4i32_z3(<4 x i32>* %ptr) nounwind uwtable noinline
define <8 x i32> @merge_8i32_i32_56zz9uzz(i32* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8i32_i32_56zz9uzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_8i32_i32_56zz9uzz:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -416,28 +400,16 @@ define <8 x i32> @merge_8i32_i32_56zz9uzz(i32* %ptr) nounwind uwtable noinline s
}
define <8 x i32> @merge_8i32_i32_1u3u5zu8(i32* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_8i32_i32_1u3u5zu8:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: merge_8i32_i32_1u3u5zu8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: merge_8i32_i32_1u3u5zu8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
-; AVX512F-NEXT: retq
+; AVX-LABEL: merge_8i32_i32_1u3u5zu8:
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
+; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_8i32_i32_1u3u5zu8:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4],ymm0[5],mem[6,7]
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
@@ -458,12 +430,12 @@ define <8 x i32> @merge_8i32_i32_1u3u5zu8(i32* %ptr) nounwind uwtable noinline s
define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX-NEXT: retl
@@ -482,12 +454,12 @@ define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable
define <16 x i16> @merge_16i16_i16_45u7uuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_16i16_i16_45u7uuuuuuuuuuuu:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: retl
@@ -505,12 +477,12 @@ define <16 x i16> @merge_16i16_i16_45u7uuuuuuuuuuuu(i16* %ptr) nounwind uwtable
define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_16i16_i16_0uu3uuuuuuuuCuEF:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovups (%eax), %ymm0
; X32-AVX-NEXT: retl
@@ -534,13 +506,13 @@ define <16 x i16> @merge_16i16_i16_0uu3uuuuuuuuCuEF(i16* %ptr) nounwind uwtable
define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovups (%eax), %ymm0
; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
@@ -568,12 +540,12 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(i16* %ptr) nounwind uwtable
define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX-NEXT: retl
@@ -591,12 +563,12 @@ define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounw
define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX-NEXT: retl
@@ -621,37 +593,19 @@ define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounw
;
define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_4f64_f64_34uz_volatile:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: merge_4f64_f64_34uz_volatile:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: merge_4f64_f64_34uz_volatile:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
+; AVX-LABEL: merge_4f64_f64_34uz_volatile:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovapd %xmm0, %xmm0
+; AVX-NEXT: retq
;
; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovapd %xmm0, %xmm0
; X32-AVX-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 3
%ptr1 = getelementptr inbounds double, double* %ptr, i64 4
@@ -665,7 +619,7 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable
define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind uwtable noinline ssp {
; AVX1-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
; AVX1-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
@@ -676,7 +630,7 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
; AVX2-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
@@ -687,7 +641,7 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
; AVX2-NEXT: retq
;
; AVX512F-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm1
; AVX512F-NEXT: vpinsrw $4, 24(%rdi), %xmm0, %xmm0
@@ -698,7 +652,7 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
; AVX512F-NEXT: retq
;
; X32-AVX-LABEL: merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X32-AVX-NEXT: vpinsrw $0, (%eax), %xmm0, %xmm1
diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll
index c3500f0ad399..62102eb382cb 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -7,16 +7,16 @@
define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_12u4:
-; ALL: # BB#0:
-; ALL-NEXT: vmovupd 16(%rdi), %ymm0
+; ALL: # %bb.0:
+; ALL-NEXT: vmovups 16(%rdi), %ymm0
; ALL-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_2f64_12u4:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 16(%eax), %ymm0
+; X32-AVX512F-NEXT: vmovups 16(%eax), %ymm0
; X32-AVX512F-NEXT: vinsertf128 $1, 64(%eax), %ymm0, %ymm1
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
@@ -34,18 +34,18 @@ define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable n
define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_23z5:
-; ALL: # BB#0:
-; ALL-NEXT: vmovupd 32(%rdi), %ymm0
-; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; ALL: # %bb.0:
+; ALL-NEXT: vmovups 32(%rdi), %ymm0
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_2f64_23z5:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 32(%eax), %ymm0
-; X32-AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX512F-NEXT: vmovups 32(%eax), %ymm0
+; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vinsertf128 $1, 80(%eax), %ymm1, %ymm1
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
@@ -63,15 +63,15 @@ define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable n
define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_4f64_z2:
-; ALL: # BB#0:
-; ALL-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT: vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_4f64_z2:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vxorpd %ymm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-AVX512F-NEXT: vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
%ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
@@ -82,12 +82,12 @@ define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noi
define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups 16(%rdi), %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups 16(%eax), %zmm0
; X32-AVX512F-NEXT: retl
@@ -105,22 +105,14 @@ define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noin
define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_12zzuuzz:
-; ALL: # BB#0:
-; ALL-NEXT: vmovupd 8(%rdi), %xmm0
-; ALL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vmovaps 8(%rdi), %xmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 8(%eax), %xmm0
-; X32-AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X32-AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1
-; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: vmovaps 8(%eax), %xmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds double, double* %ptr, i64 1
%ptr1 = getelementptr inbounds double, double* %ptr, i64 2
@@ -137,7 +129,7 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noin
define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movb $32, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: knotw %k0, %k1
@@ -145,7 +137,7 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movb $32, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
@@ -153,7 +145,7 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noin
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movb $32, %cl
; X32-AVX512F-NEXT: kmovw %ecx, %k0
@@ -178,16 +170,16 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noin
define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_4i64_z3:
-; ALL: # BB#0:
-; ALL-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; ALL-NEXT: vinserti64x4 $1, 96(%rdi), %zmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8i64_4i64_z3:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vpxor %ymm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: vinserti64x4 $1, 96(%eax), %zmm0, %zmm0
+; X32-AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; X32-AVX512F-NEXT: vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
%ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
%val1 = load <4 x i64>, <4 x i64>* %ptr1
@@ -197,22 +189,18 @@ define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline
define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_56zz9uzz:
-; ALL: # BB#0:
-; ALL-NEXT: vmovdqu 40(%rdi), %xmm0
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vmovaps 40(%rdi), %xmm0
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovdqu 40(%eax), %xmm0
-; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X32-AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; X32-AVX512F-NEXT: vmovaps 40(%eax), %xmm0
+; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
%ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
@@ -232,7 +220,7 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s
define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movb $32, %al
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: knotw %k0, %k1
@@ -240,7 +228,7 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline s
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movb $32, %al
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
@@ -248,7 +236,7 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline s
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movb $32, %cl
; X32-AVX512F-NEXT: kmovw %ecx, %k0
@@ -273,12 +261,12 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline s
define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT: retl
@@ -297,12 +285,12 @@ define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwt
define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups 16(%rdi), %xmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups 16(%eax), %xmm0
; X32-AVX512F-NEXT: retl
@@ -320,12 +308,12 @@ define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwta
define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups (%rdi), %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
; X32-AVX512F-NEXT: retl
@@ -349,18 +337,18 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwta
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups (%rdi), %zmm1
-; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ALL-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups (%eax), %zmm1
-; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; X32-AVX512F-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT: retl
@@ -387,12 +375,12 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwta
define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT: retl
@@ -411,12 +399,12 @@ define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable
define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups 8(%rdi), %xmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups 8(%eax), %xmm0
; X32-AVX512F-NEXT: retl
@@ -434,12 +422,12 @@ define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups (%rdi), %zmm0
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
; X32-AVX512F-NEXT: retl
@@ -463,7 +451,7 @@ define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable
define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movw $8240, %ax # imm = 0x2030
; AVX512F-NEXT: kmovw %eax, %k0
; AVX512F-NEXT: knotw %k0, %k1
@@ -471,7 +459,7 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movw $8240, %ax # imm = 0x2030
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: knotw %k0, %k1
@@ -479,7 +467,7 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movw $8240, %cx # imm = 0x2030
; X32-AVX512F-NEXT: kmovw %ecx, %k0
@@ -509,21 +497,21 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable
define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
@@ -541,12 +529,12 @@ define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) n
define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT: retl
@@ -564,21 +552,21 @@ define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) n
define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
@@ -596,21 +584,21 @@ define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) n
define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
@@ -634,21 +622,21 @@ define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu
define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: retq
;
; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: retl
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
@@ -673,7 +661,7 @@ define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu
define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: vbroadcastsd 72(%rdi), %ymm1
@@ -681,7 +669,7 @@ define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwt
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -702,7 +690,7 @@ define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwt
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: vpinsrd $3, 12(%rdi), %xmm0, %xmm0
; ALL-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -713,7 +701,7 @@ define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind
; ALL-NEXT: retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
-; X32-AVX512F: # BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX512F-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
diff --git a/test/CodeGen/X86/merge-consecutive-stores-i1.ll b/test/CodeGen/X86/merge-consecutive-stores-i1.ll
index a7f5c2142271..89aa77903718 100644
--- a/test/CodeGen/X86/merge-consecutive-stores-i1.ll
+++ b/test/CodeGen/X86/merge-consecutive-stores-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s
+; RUN: llc -mtriple=x86_64-- < %s
; Ensure that MergeConsecutiveStores doesn't crash when dealing with
; i1 operands.
diff --git a/test/CodeGen/X86/merge-consecutive-stores.ll b/test/CodeGen/X86/merge-consecutive-stores.ll
index 426529529891..af5fb478e522 100644
--- a/test/CodeGen/X86/merge-consecutive-stores.ll
+++ b/test/CodeGen/X86/merge-consecutive-stores.ll
@@ -6,7 +6,7 @@
define i32 @foo (i64* %so) nounwind uwtable ssp {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl $0, 28(%eax)
; CHECK-NEXT: movl $0, 24(%eax)
@@ -16,11 +16,9 @@ define i32 @foo (i64* %so) nounwind uwtable ssp {
; CHECK-NEXT: cmpl 16(%eax), %edx
; CHECK-NEXT: movl $0, 16(%eax)
; CHECK-NEXT: sbbl %ecx, %edx
-; CHECK-NEXT: movl $-1, %eax
-; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: negl %eax
; CHECK-NEXT: retl
%used = getelementptr inbounds i64, i64* %so, i32 3
store i64 0, i64* %used, align 8
diff --git a/test/CodeGen/X86/merge-store-constants.ll b/test/CodeGen/X86/merge-store-constants.ll
new file mode 100644
index 000000000000..b38019f860ae
--- /dev/null
+++ b/test/CodeGen/X86/merge-store-constants.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64
+
+define void @big_nonzero_16_bytes(i32* nocapture %a) {
+; X32-LABEL: big_nonzero_16_bytes:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: big_nonzero_16_bytes:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4]
+; X64-NEXT: vmovups %xmm0, (%rdi)
+; X64-NEXT: retq
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
+ %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3
+
+ store i32 1, i32* %a, align 4
+ store i32 2, i32* %arrayidx1, align 4
+ store i32 3, i32* %arrayidx2, align 4
+ store i32 4, i32* %arrayidx3, align 4
+ ret void
+}
+
+; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
+; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
+; it takes extra instructions to do this in scalar.
+
+define void @big_nonzero_16_bytes_big64bit_constants(i64* nocapture %a) {
+; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,3]
+; X32-NEXT: vmovups %xmm0, (%eax)
+; X32-NEXT: retl
+;
+; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
+; X64: # %bb.0:
+; X64-NEXT: movabsq $4294967297, %rax # imm = 0x100000001
+; X64-NEXT: movq %rax, (%rdi)
+; X64-NEXT: movabsq $12884901889, %rax # imm = 0x300000001
+; X64-NEXT: movq %rax, 8(%rdi)
+; X64-NEXT: retq
+ %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 1
+
+ store i64 4294967297, i64* %a
+ store i64 12884901889, i64* %arrayidx1
+ ret void
+}
+
+; Splats may be an opportunity to use a broadcast op.
+
+define void @big_nonzero_32_bytes_splat(i32* nocapture %a) {
+; X32-LABEL: big_nonzero_32_bytes_splat:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: big_nonzero_32_bytes_splat:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 1
+ %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 2
+ %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 3
+ %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 4
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 5
+ %arrayidx6 = getelementptr inbounds i32, i32* %a, i64 6
+ %arrayidx7 = getelementptr inbounds i32, i32* %a, i64 7
+
+ store i32 42, i32* %a, align 4
+ store i32 42, i32* %arrayidx1, align 4
+ store i32 42, i32* %arrayidx2, align 4
+ store i32 42, i32* %arrayidx3, align 4
+ store i32 42, i32* %arrayidx4, align 4
+ store i32 42, i32* %arrayidx5, align 4
+ store i32 42, i32* %arrayidx6, align 4
+ store i32 42, i32* %arrayidx7, align 4
+ ret void
+}
+
+; Verify that we choose the best-sized store(s) for each chunk.
+
+define void @big_nonzero_63_bytes(i8* nocapture %a) {
+; X32-LABEL: big_nonzero_63_bytes:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
+; X32-NEXT: vmovups %ymm0, (%eax)
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [5,0,6,0]
+; X32-NEXT: vmovups %xmm0, 32(%eax)
+; X32-NEXT: movl $0, 52(%eax)
+; X32-NEXT: movl $7, 48(%eax)
+; X32-NEXT: movl $8, 56(%eax)
+; X32-NEXT: movw $9, 60(%eax)
+; X32-NEXT: movb $10, 62(%eax)
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: big_nonzero_63_bytes:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,3,4]
+; X64-NEXT: vmovups %ymm0, (%rdi)
+; X64-NEXT: movq $5, 32(%rdi)
+; X64-NEXT: movq $6, 40(%rdi)
+; X64-NEXT: movq $7, 48(%rdi)
+; X64-NEXT: movl $8, 56(%rdi)
+; X64-NEXT: movw $9, 60(%rdi)
+; X64-NEXT: movb $10, 62(%rdi)
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+ %a8 = bitcast i8* %a to i64*
+ %arrayidx8 = getelementptr inbounds i64, i64* %a8, i64 1
+ %arrayidx16 = getelementptr inbounds i64, i64* %a8, i64 2
+ %arrayidx24 = getelementptr inbounds i64, i64* %a8, i64 3
+ %arrayidx32 = getelementptr inbounds i64, i64* %a8, i64 4
+ %arrayidx40 = getelementptr inbounds i64, i64* %a8, i64 5
+ %arrayidx48 = getelementptr inbounds i64, i64* %a8, i64 6
+ %a4 = bitcast i8* %a to i32*
+ %arrayidx56 = getelementptr inbounds i32, i32* %a4, i64 14
+ %a2 = bitcast i8* %a to i16*
+ %arrayidx60 = getelementptr inbounds i16, i16* %a2, i64 30
+ %arrayidx62 = getelementptr inbounds i8, i8* %a, i64 62
+
+ store i64 1, i64* %a8
+ store i64 2, i64* %arrayidx8
+ store i64 3, i64* %arrayidx16
+ store i64 4, i64* %arrayidx24
+ store i64 5, i64* %arrayidx32
+ store i64 6, i64* %arrayidx40
+ store i64 7, i64* %arrayidx48
+ store i32 8, i32* %arrayidx56
+ store i16 9, i16* %arrayidx60
+ store i8 10, i8* %arrayidx62
+ ret void
+}
+
diff --git a/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
index 6ca964be9570..8e3c4305d50a 100644
--- a/test/CodeGen/X86/merge-store-partially-alias-loads.ll
+++ b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck -check-prefix=X86 %s
-; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -debug-only=isel < %s 2>&1 | FileCheck -check-prefix=DBGDAG %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck -check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -debug-only=isel < %s -o /dev/null 2>&1 | FileCheck -check-prefix=DBGDAG %s
; It's OK to merge the load / store of the first 2 components, but
; they must not be placed on the same chain after merging.
@@ -13,10 +13,10 @@
; X86-NEXT: movb [[HI1]], 3([[BASEREG]])
; X86-NEXT: retq
-; DBGDAG-LABEL: Optimized lowered selection DAG: BB#0 'merge_store_partial_overlap_load:'
+; DBGDAG-LABEL: Optimized legalized selection DAG: %bb.0 'merge_store_partial_overlap_load:'
; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken
; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]],
-; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add [[BASEPTR]], Constant:i64<2>
+; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add {{(nuw )?}}[[BASEPTR]], Constant:i64<2>
; DBGDAG-DAG: [[LD2:t[0-9]+]]: i16,ch = load<LD2[%tmp81](align=1)> [[ENTRYTOKEN]], [[BASEPTR]], undef:i64
; DBGDAG-DAG: [[LD1:t[0-9]+]]: i8,ch = load<LD1[%tmp12]> [[ENTRYTOKEN]], [[ADDPTR]], undef:i64
@@ -27,7 +27,7 @@
; DBGDAG: X86ISD::RET_FLAG t{{[0-9]+}},
-; DBGDAG: Type-legalized selection DAG: BB#0 'merge_store_partial_overlap_load:'
+; DBGDAG-LABEL: Instruction selection begins
define void @merge_store_partial_overlap_load([4 x i8]* %tmp) {
%tmp8 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 0
%tmp10 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 1
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
index f4c4c6d36067..f03175057fdf 100644
--- a/test/CodeGen/X86/merge_store.ll
+++ b/test/CodeGen/X86/merge_store.ll
@@ -1,9 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
define void @merge_store(i32* nocapture %a) {
; CHECK-LABEL: merge_store:
-; CHECK: movq
-; CHECK: movq
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: movabsq $4294967297, %rcx # imm = 0x100000001
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB0_1: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %rcx, (%rdi,%rax,4)
+; CHECK-NEXT: movq %rcx, 8(%rdi,%rax,4)
+; CHECK-NEXT: addq $4, %rax
+; CHECK-NEXT: cmpl $1000, %eax # imm = 0x3E8
+; CHECK-NEXT: jl .LBB0_1
+; CHECK-NEXT: # %bb.2: # %for.end
+; CHECK-NEXT: retq
entry:
br label %for.body
@@ -29,10 +41,12 @@ entry:
ret void
}
-;; CHECK-LABEL: indexed-store-merge
-;; CHECK: movl $0, 2(%rsi,%rdi)
-;; CHECK: movb $0, (%rsi)
-define void @indexed-store-merge(i64 %p, i8* %v) {
+define void @indexed_store_merge(i64 %p, i8* %v) {
+; CHECK-LABEL: indexed_store_merge:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl $0, 2(%rsi,%rdi)
+; CHECK-NEXT: movb $0, (%rsi)
+; CHECK-NEXT: retq
entry:
%p2 = add nsw i64 %p, 2
%v2 = getelementptr i8, i8* %v, i64 %p2
diff --git a/test/CodeGen/X86/merge_store_duplicated_loads.ll b/test/CodeGen/X86/merge_store_duplicated_loads.ll
index cfc39035e403..9ef3255123c7 100644
--- a/test/CodeGen/X86/merge_store_duplicated_loads.ll
+++ b/test/CodeGen/X86/merge_store_duplicated_loads.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-linux-gnu"
define void @merge_double(double* noalias nocapture %st, double* noalias nocapture readonly %ld) #0 {
; CHECK-LABEL: merge_double:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: movsd %xmm0, (%rdi)
@@ -31,7 +31,7 @@ define void @merge_double(double* noalias nocapture %st, double* noalias nocaptu
define void @merge_loadstore_int(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
; CHECK-LABEL: merge_loadstore_int:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq (%rdi), %rax
; CHECK-NEXT: movq 8(%rdi), %rcx
; CHECK-NEXT: movq %rax, (%rsi)
@@ -55,7 +55,7 @@ entry:
define i64 @merge_loadstore_int_with_extra_use(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
; CHECK-LABEL: merge_loadstore_int_with_extra_use:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq (%rdi), %rax
; CHECK-NEXT: movq 8(%rdi), %rcx
; CHECK-NEXT: movq %rax, (%rsi)
diff --git a/test/CodeGen/X86/mfence.ll b/test/CodeGen/X86/mfence.ll
index b67a5c355044..93d99076d825 100644
--- a/test/CodeGen/X86/mfence.ll
+++ b/test/CodeGen/X86/mfence.ll
@@ -6,12 +6,12 @@
define void @test() {
; X32-LABEL: test:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mfence
; X32-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: retq
fence seq_cst
@@ -20,14 +20,14 @@ define void @test() {
define i32 @fence(i32* %ptr) {
; X32-LABEL: fence:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: mfence
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
;
; X64-LABEL: fence:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/misched-code-difference-with-debug.ll b/test/CodeGen/X86/misched-code-difference-with-debug.ll
index 4d7badb2be24..5e38aab46e2b 100644
--- a/test/CodeGen/X86/misched-code-difference-with-debug.ll
+++ b/test/CodeGen/X86/misched-code-difference-with-debug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-unknown -mcpu=generic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=generic | FileCheck %s
; Both functions should produce the same code. The presence of debug values
; should not affect the scheduling strategy.
; Generated from:
@@ -67,7 +67,7 @@ attributes #0 = { nounwind readnone }
!llvm.dbg.cu = !{!4}
!llvm.module.flags = !{!15, !16}
-!0 = !DIGlobalVariableExpression(var: !1)
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = !DIGlobalVariable(name: "argc", scope: null, file: !2, line: 1, type: !3, isLocal: false, isDefinition: true)
!2 = !DIFile(filename: "test.cpp", directory: "")
!3 = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll
index 7abd157f147a..d43ec941b701 100644
--- a/test/CodeGen/X86/misched-copy.ll
+++ b/test/CodeGen/X86/misched-copy.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-- -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s
;
; Test scheduling of copy instructions.
;
@@ -8,11 +8,11 @@
; MUL_HiLo PhysReg use copies should be just above the mul.
; MUL_HiLo PhysReg def copies should be just below the mul.
;
-; CHECK: *** Final schedule for BB#1 ***
-; CHECK: %EAX<def> = COPY
-; CHECK-NEXT: MUL32r %vreg{{[0-9]+}}, %EAX<imp-def>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use>;
-; CHECK-NEXT: COPY %E{{[AD]}}X
-; CHECK-NEXT: COPY %E{{[AD]}}X
+; CHECK: *** Final schedule for %bb.1 ***
+; CHECK: %eax = COPY
+; CHECK-NEXT: MUL32r %{{[0-9]+}}, implicit-def %eax, implicit-def %edx, implicit-def dead %eflags, implicit %eax;
+; CHECK-NEXT: COPY %e{{[ad]}}x
+; CHECK-NEXT: COPY %e{{[ad]}}x
; CHECK: DIVSSrm
define i64 @mulhoist(i32 %a, i32 %b) #0 {
entry:
diff --git a/test/CodeGen/X86/misched-fusion.ll b/test/CodeGen/X86/misched-fusion.ll
index 0975faacb9ed..ec739093c9c9 100644
--- a/test/CodeGen/X86/misched-fusion.ll
+++ b/test/CodeGen/X86/misched-fusion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -disable-lsr -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx -mattr=-slow-incdec -disable-lsr -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
; Verify that TEST+JE are scheduled together.
; CHECK: test_je
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 384344691f9b..be3f086809d9 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -stats 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched -stats 2>&1 | FileCheck %s
;
; Verify that register pressure heuristics are working in MachineScheduler.
;
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index 94bbe75702cb..495ca711e989 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched \
; RUN: -misched-topdown -verify-machineinstrs \
; RUN: | FileCheck %s -check-prefix=TOPDOWN
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched \
; RUN: -misched=ilpmin -verify-machineinstrs \
; RUN: | FileCheck %s -check-prefix=ILPMIN
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -pre-RA-sched=source -enable-misched \
; RUN: -misched=ilpmax -verify-machineinstrs \
; RUN: | FileCheck %s -check-prefix=ILPMAX
;
diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll
index 410a7f320643..4e42c9314541 100644
--- a/test/CodeGen/X86/misched-new.ll
+++ b/test/CodeGen/X86/misched-new.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -x86-early-ifcvt -enable-misched \
; RUN: -misched=shuffle -misched-bottomup -verify-machineinstrs \
; RUN: | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core2 -x86-early-ifcvt -enable-misched \
; RUN: -misched=shuffle -misched-topdown -verify-machineinstrs \
; RUN: | FileCheck %s --check-prefix TOPDOWN
; REQUIRES: asserts
diff --git a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
index 41f9a7822b27..b88916053bec 100644
--- a/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
+++ b/test/CodeGen/X86/mmx-arg-passing-x86-64.ll
@@ -8,7 +8,7 @@
define void @t3() nounwind {
; X86-64-LABEL: t3:
-; X86-64: ## BB#0:
+; X86-64: ## %bb.0:
; X86-64-NEXT: movq _g_v8qi@{{.*}}(%rip), %rax
; X86-64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-64-NEXT: movb $1, %al
@@ -21,7 +21,7 @@ define void @t3() nounwind {
define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
; X86-64-LABEL: t4:
-; X86-64: ## BB#0:
+; X86-64: ## %bb.0:
; X86-64-NEXT: movdq2q %xmm1, %mm0
; X86-64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
; X86-64-NEXT: movdq2q %xmm0, %mm0
@@ -41,7 +41,7 @@ define void @t4(x86_mmx %v1, x86_mmx %v2) nounwind {
define void @t5() nounwind {
; X86-64-LABEL: t5:
-; X86-64: ## BB#0:
+; X86-64: ## %bb.0:
; X86-64-NEXT: pushq %rax
; X86-64-NEXT: xorl %edi, %edi
; X86-64-NEXT: callq _pass_v1di
diff --git a/test/CodeGen/X86/mmx-arg-passing.ll b/test/CodeGen/X86/mmx-arg-passing.ll
index 67ccb9e32dde..4ea00b2e9ac1 100644
--- a/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/test/CodeGen/X86/mmx-arg-passing.ll
@@ -12,13 +12,13 @@
define void @t1(x86_mmx %v1) nounwind {
; X86-32-LABEL: t1:
-; X86-32: ## BB#0:
+; X86-32: ## %bb.0:
; X86-32-NEXT: movl L_u1$non_lazy_ptr, %eax
; X86-32-NEXT: movq %mm0, (%eax)
; X86-32-NEXT: retl
;
; X86-64-LABEL: t1:
-; X86-64: ## BB#0:
+; X86-64: ## %bb.0:
; X86-64-NEXT: movdq2q %xmm0, %mm0
; X86-64-NEXT: movq _u1@{{.*}}(%rip), %rax
; X86-64-NEXT: movq %mm0, (%rax)
@@ -31,7 +31,7 @@ define void @t1(x86_mmx %v1) nounwind {
define void @t2(<1 x i64> %v1) nounwind {
; X86-32-LABEL: t2:
-; X86-32: ## BB#0:
+; X86-32: ## %bb.0:
; X86-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-32-NEXT: movl L_u2$non_lazy_ptr, %edx
@@ -40,7 +40,7 @@ define void @t2(<1 x i64> %v1) nounwind {
; X86-32-NEXT: retl
;
; X86-64-LABEL: t2:
-; X86-64: ## BB#0:
+; X86-64: ## %bb.0:
; X86-64-NEXT: movq _u2@{{.*}}(%rip), %rax
; X86-64-NEXT: movq %rdi, (%rax)
; X86-64-NEXT: retq
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index 114d2535d603..7664ec2684f6 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx,+sse2 | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+sse2 | FileCheck -check-prefix=X64 %s
;; A basic sanity check to make sure that MMX arithmetic actually compiles.
;; First is a straight translation of the original with bitcasts as needed.
diff --git a/test/CodeGen/X86/mmx-bitcast.ll b/test/CodeGen/X86/mmx-bitcast.ll
index 30cf474dc38b..d3befdaeff8f 100644
--- a/test/CodeGen/X86/mmx-bitcast.ll
+++ b/test/CodeGen/X86/mmx-bitcast.ll
@@ -3,7 +3,7 @@
define i64 @t0(x86_mmx* %p) {
; CHECK-LABEL: t0:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq (%rdi), %mm0
; CHECK-NEXT: paddq %mm0, %mm0
; CHECK-NEXT: movd %mm0, %rax
@@ -16,7 +16,7 @@ define i64 @t0(x86_mmx* %p) {
define i64 @t1(x86_mmx* %p) {
; CHECK-LABEL: t1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq (%rdi), %mm0
; CHECK-NEXT: paddd %mm0, %mm0
; CHECK-NEXT: movd %mm0, %rax
@@ -29,7 +29,7 @@ define i64 @t1(x86_mmx* %p) {
define i64 @t2(x86_mmx* %p) {
; CHECK-LABEL: t2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq (%rdi), %mm0
; CHECK-NEXT: paddw %mm0, %mm0
; CHECK-NEXT: movd %mm0, %rax
@@ -42,7 +42,7 @@ define i64 @t2(x86_mmx* %p) {
define i64 @t3(x86_mmx* %p) {
; CHECK-LABEL: t3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq (%rdi), %mm0
; CHECK-NEXT: paddb %mm0, %mm0
; CHECK-NEXT: movd %mm0, %rax
@@ -57,7 +57,7 @@ define i64 @t3(x86_mmx* %p) {
define void @t4(<1 x i64> %A, <1 x i64> %B) {
; CHECK-LABEL: t4:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movd %rdi, %mm0
; CHECK-NEXT: movd %rsi, %mm1
; CHECK-NEXT: paddusw %mm0, %mm1
@@ -76,7 +76,7 @@ entry:
define i64 @t5(i32 %a, i32 %b) nounwind readnone {
; CHECK-LABEL: t5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movd %esi, %xmm0
; CHECK-NEXT: movd %edi, %xmm1
; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -92,7 +92,7 @@ declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
define <1 x i64> @t6(i64 %t) {
; CHECK-LABEL: t6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movd %rdi, %mm0
; CHECK-NEXT: psllq $48, %mm0
; CHECK-NEXT: movd %mm0, %rax
diff --git a/test/CodeGen/X86/mmx-coalescing.ll b/test/CodeGen/X86/mmx-coalescing.ll
index a515e5ee3754..c23e732d9bff 100644
--- a/test/CodeGen/X86/mmx-coalescing.ll
+++ b/test/CodeGen/X86/mmx-coalescing.ll
@@ -8,7 +8,7 @@
define i32 @test(%SA* %pSA, i16* %A, i32 %B, i32 %C, i32 %D, i8* %E) {
entry:
; CHECK-LABEL: test
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufw
; CHECK-NEXT: movd
; CHECK-NOT: movd
diff --git a/test/CodeGen/X86/mmx-copy-gprs.ll b/test/CodeGen/X86/mmx-copy-gprs.ll
index 6d39713833e8..02b94e7e632d 100644
--- a/test/CodeGen/X86/mmx-copy-gprs.ll
+++ b/test/CodeGen/X86/mmx-copy-gprs.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=-sse2 | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
; This test should use GPRs to copy the mmx value, not MMX regs. Using mmx regs
; increases the number of places that need to use emms.
diff --git a/test/CodeGen/X86/mmx-cvt.ll b/test/CodeGen/X86/mmx-cvt.ll
index fd6c5081b5a3..ff4edcc82aee 100644
--- a/test/CodeGen/X86/mmx-cvt.ll
+++ b/test/CodeGen/X86/mmx-cvt.ll
@@ -7,7 +7,7 @@
define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -25,7 +25,7 @@ define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2f64_v2i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtpd2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -43,7 +43,7 @@ define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -61,7 +61,7 @@ define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttpd2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -79,7 +79,7 @@ define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -97,7 +97,7 @@ define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttpd2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -113,7 +113,7 @@ define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -131,7 +131,7 @@ define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2f32_v2i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtps2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -149,7 +149,7 @@ define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -167,7 +167,7 @@ define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttps2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -185,7 +185,7 @@ define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v4f32_v4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -203,7 +203,7 @@ define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v4f32_v4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttps2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -220,7 +220,7 @@ define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -238,7 +238,7 @@ define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttps2pi %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
@@ -259,7 +259,7 @@ define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -274,7 +274,7 @@ define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq2dq %mm0, %xmm0
@@ -293,7 +293,7 @@ define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -309,7 +309,7 @@ define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: sitofp_v2i32_v2f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
@@ -327,7 +327,7 @@ define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-LABEL: cvt_v2i32_v2f32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -343,7 +343,7 @@ define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2i32_v2f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movd %mm0, %rax
diff --git a/test/CodeGen/X86/mmx-fold-load.ll b/test/CodeGen/X86/mmx-fold-load.ll
index 832743870fb4..601d72c0d086 100644
--- a/test/CodeGen/X86/mmx-fold-load.ll
+++ b/test/CodeGen/X86/mmx-fold-load.ll
@@ -4,7 +4,7 @@
define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t0:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -22,7 +22,7 @@ define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t0:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psllq %mm1, %mm0
@@ -40,7 +40,7 @@ declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
define i64 @t1(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t1:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -58,7 +58,7 @@ define i64 @t1(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psrlq %mm1, %mm0
@@ -76,7 +76,7 @@ declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)
define i64 @t2(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -94,7 +94,7 @@ define i64 @t2(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psllw %mm1, %mm0
@@ -112,7 +112,7 @@ declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)
define i64 @t3(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -130,7 +130,7 @@ define i64 @t3(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psrlw %mm1, %mm0
@@ -148,7 +148,7 @@ declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)
define i64 @t4(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t4:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -166,7 +166,7 @@ define i64 @t4(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: pslld %mm1, %mm0
@@ -184,7 +184,7 @@ declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)
define i64 @t5(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t5:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -202,7 +202,7 @@ define i64 @t5(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t5:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psrld %mm1, %mm0
@@ -220,7 +220,7 @@ declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)
define i64 @t6(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t6:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -238,7 +238,7 @@ define i64 @t6(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t6:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psraw %mm1, %mm0
@@ -256,7 +256,7 @@ declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)
define i64 @t7(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t7:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -274,7 +274,7 @@ define i64 @t7(<1 x i64>* %a, i32* %b) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t7:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: movd (%rsi), %mm1
; X64-NEXT: psrad %mm1, %mm0
@@ -292,7 +292,7 @@ declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
define i64 @tt0(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt0:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -308,7 +308,7 @@ define i64 @tt0(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt0:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddb (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -325,7 +325,7 @@ declare void @llvm.x86.mmx.emms()
define i64 @tt1(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt1:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -341,7 +341,7 @@ define i64 @tt1(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddw (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -357,7 +357,7 @@ declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
define i64 @tt2(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -373,7 +373,7 @@ define i64 @tt2(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddd (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -389,7 +389,7 @@ declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
define i64 @tt3(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -405,7 +405,7 @@ define i64 @tt3(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddq (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -421,7 +421,7 @@ declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
define i64 @tt4(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt4:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -437,7 +437,7 @@ define i64 @tt4(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddusb (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -453,7 +453,7 @@ declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
define i64 @tt5(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt5:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -469,7 +469,7 @@ define i64 @tt5(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt5:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddusw (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -485,7 +485,7 @@ declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
define i64 @tt6(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt6:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -501,7 +501,7 @@ define i64 @tt6(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt6:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrlw (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -517,7 +517,7 @@ declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)
define i64 @tt7(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt7:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -533,7 +533,7 @@ define i64 @tt7(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt7:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrld (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -549,7 +549,7 @@ declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)
define i64 @tt8(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-LABEL: tt8:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -565,7 +565,7 @@ define i64 @tt8(x86_mmx %t, x86_mmx* %q) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: tt8:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrlq (%rdi), %mm0
; X64-NEXT: movd %mm0, %rax
; X64-NEXT: emms
@@ -581,7 +581,7 @@ declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)
define void @test_psrlq_by_volatile_shift_amount(x86_mmx* %t) nounwind {
; X86-LABEL: test_psrlq_by_volatile_shift_amount:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -599,7 +599,7 @@ define void @test_psrlq_by_volatile_shift_amount(x86_mmx* %t) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test_psrlq_by_volatile_shift_amount:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl $1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movd -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT: movl $255, %eax
diff --git a/test/CodeGen/X86/mmx-intrinsics.ll b/test/CodeGen/X86/mmx-intrinsics.ll
index 7647fccb5803..b9655830619e 100644
--- a/test/CodeGen/X86/mmx-intrinsics.ll
+++ b/test/CodeGen/X86/mmx-intrinsics.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefix=ALL --check-prefix=X86
-; RUN: llc < %s -march=x86 -mattr=+mmx,+avx | FileCheck %s --check-prefix=ALL --check-prefix=X86
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefix=ALL --check-prefix=X64
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+avx | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefix=ALL --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx,+avx | FileCheck %s --check-prefix=ALL --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+ssse3,-avx | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+avx | FileCheck %s --check-prefix=ALL --check-prefix=X64
declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/mmx-only.ll b/test/CodeGen/X86/mmx-only.ll
index 35598d5f6e19..eab67e08b957 100644
--- a/test/CodeGen/X86/mmx-only.ll
+++ b/test/CodeGen/X86/mmx-only.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+mmx,-sse | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx,-sse | FileCheck %s
; Test that turning off sse doesn't turn off mmx.
diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll
new file mode 100644
index 000000000000..42159fea8f56
--- /dev/null
+++ b/test/CodeGen/X86/mmx-schedule.ll
@@ -0,0 +1,6967 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i64 @test_cvtpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
+; GENERIC-LABEL: test_cvtpd2pi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; GENERIC-NEXT: cvtpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; GENERIC-NEXT: por %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cvtpd2pi:
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtpd2pi (%rdi), %mm0 # sched: [8:4.00]
+; ATOM-NEXT: cvtpd2pi %xmm0, %mm1 # sched: [7:3.50]
+; ATOM-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cvtpd2pi:
+; SLM: # %bb.0:
+; SLM-NEXT: cvtpd2pi (%rdi), %mm1 # sched: [7:1.00]
+; SLM-NEXT: cvtpd2pi %xmm0, %mm0 # sched: [4:0.50]
+; SLM-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SLM-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cvtpd2pi:
+; SANDY: # %bb.0:
+; SANDY-NEXT: cvtpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; SANDY-NEXT: cvtpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; SANDY-NEXT: por %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvtpd2pi:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: cvtpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: cvtpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; HASWELL-NEXT: por %mm1, %mm0 # sched: [1:0.33]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpd2pi:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: cvtpd2pi %xmm0, %mm0 # sched: [4:1.00]
+; BROADWELL-NEXT: cvtpd2pi (%rdi), %mm1 # sched: [9:1.00]
+; BROADWELL-NEXT: por %mm0, %mm1 # sched: [1:0.33]
+; BROADWELL-NEXT: movd %mm1, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpd2pi:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: cvtpd2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: cvtpd2pi (%rdi), %mm1 # sched: [11:1.00]
+; SKYLAKE-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpd2pi:
+; SKX: # %bb.0:
+; SKX-NEXT: cvtpd2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKX-NEXT: cvtpd2pi (%rdi), %mm1 # sched: [11:1.00]
+; SKX-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKX-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvtpd2pi:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: cvtpd2pi (%rdi), %mm1 # sched: [8:1.00]
+; BTVER2-NEXT: cvtpd2pi %xmm0, %mm0 # sched: [3:1.00]
+; BTVER2-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpd2pi:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: cvtpd2pi (%rdi), %mm1 # sched: [12:1.00]
+; ZNVER1-NEXT: cvtpd2pi %xmm0, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: por %mm0, %mm1 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm1, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %a0)
+ %2 = load <2 x double>, <2 x double> *%a1, align 16
+ %3 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %2)
+ %4 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %1, x86_mmx %3)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>) nounwind readnone
+
+define <2 x double> @test_cvtpi2pd(x86_mmx %a0, x86_mmx* %a1) optsize {
+; GENERIC-LABEL: test_cvtpi2pd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtpi2pd %mm0, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvtpi2pd (%rdi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cvtpi2pd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtpi2pd (%rdi), %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: cvtpi2pd %mm0, %xmm1 # sched: [7:3.50]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cvtpi2pd:
+; SLM: # %bb.0:
+; SLM-NEXT: cvtpi2pd (%rdi), %xmm0 # sched: [7:1.00]
+; SLM-NEXT: cvtpi2pd %mm0, %xmm1 # sched: [4:0.50]
+; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cvtpi2pd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: cvtpi2pd %mm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: cvtpi2pd (%rdi), %xmm1 # sched: [10:1.00]
+; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvtpi2pd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: cvtpi2pd %mm0, %xmm0 # sched: [4:1.00]
+; HASWELL-NEXT: cvtpi2pd (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpi2pd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: cvtpi2pd (%rdi), %xmm0 # sched: [9:1.00]
+; BROADWELL-NEXT: cvtpi2pd %mm0, %xmm1 # sched: [4:1.00]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpi2pd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: cvtpi2pd %mm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: cvtpi2pd (%rdi), %xmm1 # sched: [10:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpi2pd:
+; SKX: # %bb.0:
+; SKX-NEXT: cvtpi2pd %mm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: cvtpi2pd (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvtpi2pd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: cvtpi2pd (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: cvtpi2pd %mm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpi2pd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: cvtpi2pd (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: cvtpi2pd %mm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %a0)
+ %2 = load x86_mmx, x86_mmx *%a1, align 8
+ %3 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %2)
+ %4 = fadd <2 x double> %1, %3
+ ret <2 x double> %4
+}
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx) nounwind readnone
+
+define <4 x float> @test_cvtpi2ps(x86_mmx %a0, x86_mmx* %a1, <4 x float> %a2, <4 x float> %a3) optsize {
+; GENERIC-LABEL: test_cvtpi2ps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cvtpi2ps:
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtpi2ps (%rdi), %xmm1
+; ATOM-NEXT: cvtpi2ps %mm0, %xmm0
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cvtpi2ps:
+; SLM: # %bb.0:
+; SLM-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [7:1.00]
+; SLM-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [4:0.50]
+; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cvtpi2ps:
+; SANDY: # %bb.0:
+; SANDY-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvtpi2ps:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpi2ps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpi2ps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpi2ps:
+; SKX: # %bb.0:
+; SKX-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [6:2.00]
+; SKX-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [9:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvtpi2ps:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [8:1.00]
+; BTVER2-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtpi2ps:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: cvtpi2ps (%rdi), %xmm1 # sched: [12:1.00]
+; ZNVER1-NEXT: cvtpi2ps %mm0, %xmm0 # sched: [5:1.00]
+; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a2, x86_mmx %a0)
+ %2 = load x86_mmx, x86_mmx *%a1, align 8
+ %3 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a3, x86_mmx %2)
+ %4 = fadd <4 x float> %1, %3
+ ret <4 x float> %4
+}
+declare <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx) nounwind readnone
+
+define i64 @test_cvtps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
+; GENERIC-LABEL: test_cvtps2pi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtps2pi %xmm0, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: cvtps2pi (%rdi), %mm1 # sched: [9:1.00]
+; GENERIC-NEXT: por %mm0, %mm1 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cvtps2pi:
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtps2pi %xmm0, %mm0 # sched: [5:5.00]
+; ATOM-NEXT: cvtps2pi (%rdi), %mm1 # sched: [5:5.00]
+; ATOM-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm1, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cvtps2pi:
+; SLM: # %bb.0:
+; SLM-NEXT: cvtps2pi (%rdi), %mm1 # sched: [7:1.00]
+; SLM-NEXT: cvtps2pi %xmm0, %mm0 # sched: [4:0.50]
+; SLM-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SLM-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cvtps2pi:
+; SANDY: # %bb.0:
+; SANDY-NEXT: cvtps2pi %xmm0, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: cvtps2pi (%rdi), %mm1 # sched: [9:1.00]
+; SANDY-NEXT: por %mm0, %mm1 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvtps2pi:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: cvtps2pi %xmm0, %mm0 # sched: [4:1.00]
+; HASWELL-NEXT: cvtps2pi (%rdi), %mm1 # sched: [8:1.00]
+; HASWELL-NEXT: por %mm0, %mm1 # sched: [1:0.33]
+; HASWELL-NEXT: movd %mm1, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtps2pi:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: cvtps2pi %xmm0, %mm0 # sched: [4:1.00]
+; BROADWELL-NEXT: cvtps2pi (%rdi), %mm1 # sched: [8:1.00]
+; BROADWELL-NEXT: por %mm0, %mm1 # sched: [1:0.33]
+; BROADWELL-NEXT: movd %mm1, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtps2pi:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: cvtps2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: cvtps2pi (%rdi), %mm1 # sched: [9:0.50]
+; SKYLAKE-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtps2pi:
+; SKX: # %bb.0:
+; SKX-NEXT: cvtps2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKX-NEXT: cvtps2pi (%rdi), %mm1 # sched: [9:0.50]
+; SKX-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKX-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvtps2pi:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: cvtps2pi (%rdi), %mm1 # sched: [8:1.00]
+; BTVER2-NEXT: cvtps2pi %xmm0, %mm0 # sched: [3:1.00]
+; BTVER2-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvtps2pi:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: cvtps2pi (%rdi), %mm1 # sched: [12:1.00]
+; ZNVER1-NEXT: cvtps2pi %xmm0, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: por %mm0, %mm1 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm1, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
+ %2 = load <4 x float>, <4 x float> *%a1, align 16
+ %3 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %2)
+ %4 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %1, x86_mmx %3)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>) nounwind readnone
+
+define i64 @test_cvttpd2pi(<2 x double> %a0, <2 x double>* %a1) optsize {
+; GENERIC-LABEL: test_cvttpd2pi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; GENERIC-NEXT: cvttpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; GENERIC-NEXT: por %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cvttpd2pi:
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttpd2pi (%rdi), %mm0 # sched: [8:4.00]
+; ATOM-NEXT: cvttpd2pi %xmm0, %mm1 # sched: [7:3.50]
+; ATOM-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cvttpd2pi:
+; SLM: # %bb.0:
+; SLM-NEXT: cvttpd2pi (%rdi), %mm1 # sched: [7:1.00]
+; SLM-NEXT: cvttpd2pi %xmm0, %mm0 # sched: [4:0.50]
+; SLM-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SLM-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cvttpd2pi:
+; SANDY: # %bb.0:
+; SANDY-NEXT: cvttpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; SANDY-NEXT: cvttpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; SANDY-NEXT: por %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvttpd2pi:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: cvttpd2pi (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: cvttpd2pi %xmm0, %mm1 # sched: [4:1.00]
+; HASWELL-NEXT: por %mm1, %mm0 # sched: [1:0.33]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttpd2pi:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: cvttpd2pi %xmm0, %mm0 # sched: [4:1.00]
+; BROADWELL-NEXT: cvttpd2pi (%rdi), %mm1 # sched: [9:1.00]
+; BROADWELL-NEXT: por %mm0, %mm1 # sched: [1:0.33]
+; BROADWELL-NEXT: movd %mm1, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttpd2pi:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: cvttpd2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: cvttpd2pi (%rdi), %mm1 # sched: [11:1.00]
+; SKYLAKE-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttpd2pi:
+; SKX: # %bb.0:
+; SKX-NEXT: cvttpd2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKX-NEXT: cvttpd2pi (%rdi), %mm1 # sched: [11:1.00]
+; SKX-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKX-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvttpd2pi:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: cvttpd2pi (%rdi), %mm1 # sched: [8:1.00]
+; BTVER2-NEXT: cvttpd2pi %xmm0, %mm0 # sched: [3:1.00]
+; BTVER2-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttpd2pi:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: cvttpd2pi (%rdi), %mm1 # sched: [12:1.00]
+; ZNVER1-NEXT: cvttpd2pi %xmm0, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: por %mm0, %mm1 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm1, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %a0)
+ %2 = load <2 x double>, <2 x double> *%a1, align 16
+ %3 = call x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double> %2)
+ %4 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %1, x86_mmx %3)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>) nounwind readnone
+
+define i64 @test_cvttps2pi(<4 x float> %a0, <4 x float>* %a1) optsize {
+; GENERIC-LABEL: test_cvttps2pi:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttps2pi %xmm0, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: cvttps2pi (%rdi), %mm1 # sched: [9:1.00]
+; GENERIC-NEXT: por %mm0, %mm1 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cvttps2pi:
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttps2pi %xmm0, %mm0 # sched: [5:5.00]
+; ATOM-NEXT: cvttps2pi (%rdi), %mm1 # sched: [5:5.00]
+; ATOM-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm1, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cvttps2pi:
+; SLM: # %bb.0:
+; SLM-NEXT: cvttps2pi (%rdi), %mm1 # sched: [7:1.00]
+; SLM-NEXT: cvttps2pi %xmm0, %mm0 # sched: [4:0.50]
+; SLM-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SLM-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cvttps2pi:
+; SANDY: # %bb.0:
+; SANDY-NEXT: cvttps2pi %xmm0, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: cvttps2pi (%rdi), %mm1 # sched: [9:1.00]
+; SANDY-NEXT: por %mm0, %mm1 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cvttps2pi:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: cvttps2pi %xmm0, %mm0 # sched: [4:1.00]
+; HASWELL-NEXT: cvttps2pi (%rdi), %mm1 # sched: [8:1.00]
+; HASWELL-NEXT: por %mm0, %mm1 # sched: [1:0.33]
+; HASWELL-NEXT: movd %mm1, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttps2pi:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: cvttps2pi %xmm0, %mm0 # sched: [4:1.00]
+; BROADWELL-NEXT: cvttps2pi (%rdi), %mm1 # sched: [8:1.00]
+; BROADWELL-NEXT: por %mm0, %mm1 # sched: [1:0.33]
+; BROADWELL-NEXT: movd %mm1, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttps2pi:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: cvttps2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: cvttps2pi (%rdi), %mm1 # sched: [9:0.50]
+; SKYLAKE-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttps2pi:
+; SKX: # %bb.0:
+; SKX-NEXT: cvttps2pi %xmm0, %mm0 # sched: [5:1.00]
+; SKX-NEXT: cvttps2pi (%rdi), %mm1 # sched: [9:0.50]
+; SKX-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; SKX-NEXT: movd %mm1, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cvttps2pi:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: cvttps2pi (%rdi), %mm1 # sched: [8:1.00]
+; BTVER2-NEXT: cvttps2pi %xmm0, %mm0 # sched: [3:1.00]
+; BTVER2-NEXT: por %mm0, %mm1 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm1, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cvttps2pi:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: cvttps2pi (%rdi), %mm1 # sched: [12:1.00]
+; ZNVER1-NEXT: cvttps2pi %xmm0, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: por %mm0, %mm1 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm1, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %a0)
+ %2 = load <4 x float>, <4 x float> *%a1, align 16
+ %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2)
+ %4 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %1, x86_mmx %3)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>) nounwind readnone
+
+define void @test_emms() optsize {
+; GENERIC-LABEL: test_emms:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: emms
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_emms:
+; ATOM: # %bb.0:
+; ATOM-NEXT: emms # sched: [5:2.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_emms:
+; SLM: # %bb.0:
+; SLM-NEXT: emms
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_emms:
+; SANDY: # %bb.0:
+; SANDY-NEXT: emms
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_emms:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: emms # sched: [31:10.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_emms:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: emms # sched: [31:10.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_emms:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: emms # sched: [10:4.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_emms:
+; SKX: # %bb.0:
+; SKX-NEXT: emms # sched: [10:4.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_emms:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: emms
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_emms:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: emms
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.mmx.emms()
+ ret void
+}
+declare void @llvm.x86.mmx.emms()
+
+define void @test_maskmovq(x86_mmx %a0, x86_mmx %a1, i8* %a2) optsize {
+; GENERIC-LABEL: test_maskmovq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_maskmovq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_maskmovq:
+; SLM: # %bb.0:
+; SLM-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_maskmovq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_maskmovq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maskmovq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maskmovq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maskmovq:
+; SKX: # %bb.0:
+; SKX-NEXT: maskmovq %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_maskmovq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: maskmovq %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_maskmovq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: maskmovq %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.mmx.maskmovq(x86_mmx %a0, x86_mmx %a1, i8* %a2)
+ ret void
+}
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) nounwind
+
+define i32 @test_movd(x86_mmx %a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_movd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movd %edi, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [4:0.50]
+; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [7:1.00]
+; GENERIC-NEXT: paddd %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: movd %mm1, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: movd %mm0, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movl %ecx, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movd %edi, %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; ATOM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:1.00]
+; ATOM-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [1:1.00]
+; ATOM-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; ATOM-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [1:1.00]
+; ATOM-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm1, %ecx # sched: [3:3.00]
+; ATOM-NEXT: movd %mm0, %eax # sched: [3:3.00]
+; ATOM-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movd:
+; SLM: # %bb.0:
+; SLM-NEXT: movd %edi, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SLM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [3:1.00]
+; SLM-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [3:1.00]
+; SLM-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SLM-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [4:1.00]
+; SLM-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: movd %mm1, %ecx # sched: [1:0.50]
+; SLM-NEXT: movd %mm0, %eax # sched: [1:0.50]
+; SLM-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vmovlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [7:1.00]
+; SANDY-NEXT: paddd %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: movd %mm1, %ecx # sched: [1:0.33]
+; SANDY-NEXT: movd %mm0, %eax # sched: [1:0.33]
+; SANDY-NEXT: movl %ecx, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; HASWELL-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [5:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; HASWELL-NEXT: vmovlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; HASWELL-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [6:0.50]
+; HASWELL-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: movd %mm1, %ecx # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BROADWELL-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [5:0.50]
+; BROADWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BROADWELL-NEXT: vmovlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BROADWELL-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [6:0.50]
+; BROADWELL-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: movd %mm1, %ecx # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %eax # sched: [1:1.00]
+; BROADWELL-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKYLAKE-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [5:0.50]
+; SKYLAKE-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKYLAKE-NEXT: vmovlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKYLAKE-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [6:0.50]
+; SKYLAKE-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm1, %ecx # sched: [2:1.00]
+; SKYLAKE-NEXT: movd %mm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) # sched: [4:1.00]
+; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [5:0.50]
+; SKX-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp) # sched: [4:1.00]
+; SKX-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [6:0.50]
+; SKX-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: movd %mm1, %ecx # sched: [2:1.00]
+; SKX-NEXT: movd %mm0, %eax # sched: [2:1.00]
+; SKX-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovd %edi, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BTVER2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:1.00]
+; BTVER2-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [5:1.00]
+; BTVER2-NEXT: vmovlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BTVER2-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [6:1.00]
+; BTVER2-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm1, %ecx # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: movl %ecx, (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovd %edi, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50]
+; ZNVER1-NEXT: movq -{{[0-9]+}}(%rsp), %mm1 # sched: [8:0.50]
+; ZNVER1-NEXT: vmovlps %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; ZNVER1-NEXT: paddd -{{[0-9]+}}(%rsp), %mm1 # sched: [8:0.50]
+; ZNVER1-NEXT: paddd %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm1, %ecx # sched: [2:1.00]
+; ZNVER1-NEXT: movd %mm0, %eax # sched: [2:1.00]
+; ZNVER1-NEXT: movl %ecx, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = insertelement <2 x i32> undef, i32 %a1, i32 0
+ %2 = bitcast <2 x i32> %1 to x86_mmx
+ %3 = load i32, i32 *%a2
+ %4 = insertelement <2 x i32> undef, i32 %3, i32 0
+ %5 = bitcast <2 x i32> %4 to x86_mmx
+ %6 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %5)
+ %7 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a0, x86_mmx %6)
+ %8 = bitcast x86_mmx %6 to <2 x i32>
+ %9 = bitcast x86_mmx %7 to <2 x i32>
+ %10 = extractelement <2 x i32> %8, i32 0
+ %11 = extractelement <2 x i32> %9, i32 0
+ store i32 %10, i32* %a2
+ ret i32 %11
+}
+
+define i64 @test_movdq2q(<2 x i64> %a0) optsize {
+; GENERIC-LABEL: test_movdq2q:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdq2q %xmm0, %mm0 # sched: [2:1.00]
+; GENERIC-NEXT: paddd %mm0, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movdq2q:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movdq2q %xmm0, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movdq2q:
+; SLM: # %bb.0:
+; SLM-NEXT: movdq2q %xmm0, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movdq2q:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movdq2q %xmm0, %mm0 # sched: [2:1.00]
+; SANDY-NEXT: paddd %mm0, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movdq2q:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movdq2q %xmm0, %mm0 # sched: [2:0.67]
+; HASWELL-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movdq2q:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movdq2q %xmm0, %mm0 # sched: [2:0.67]
+; BROADWELL-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movdq2q:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movdq2q %xmm0, %mm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movdq2q:
+; SKX: # %bb.0:
+; SKX-NEXT: movdq2q %xmm0, %mm0 # sched: [2:1.00]
+; SKX-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movdq2q:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movdq2q %xmm0, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movdq2q:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movdq2q %xmm0, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddd %mm0, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = extractelement <2 x i64> %a0, i32 0
+ %2 = bitcast i64 %1 to x86_mmx
+ %3 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+
+define void @test_movntq(x86_mmx* %a0, x86_mmx %a1) optsize {
+; GENERIC-LABEL: test_movntq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movntq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movntq:
+; SLM: # %bb.0:
+; SLM-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movntq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movntq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntq:
+; SKX: # %bb.0:
+; SKX-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movntq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movntq %mm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movntq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movntq %mm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void @llvm.x86.mmx.movnt.dq(x86_mmx* %a0, x86_mmx %a1)
+ ret void
+}
+declare void @llvm.x86.mmx.movnt.dq(x86_mmx*, x86_mmx) nounwind
+
+define void @test_movq(i64 *%a0) {
+; GENERIC-LABEL: test_movq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq (%rdi), %mm0 # sched: [4:0.50]
+; GENERIC-NEXT: paddd %mm0, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movq:
+; SLM: # %bb.0:
+; SLM-NEXT: movq (%rdi), %mm0 # sched: [3:1.00]
+; SLM-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; SLM-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq (%rdi), %mm0 # sched: [4:0.50]
+; SANDY-NEXT: paddd %mm0, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq (%rdi), %mm0 # sched: [5:0.50]
+; HASWELL-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq (%rdi), %mm0 # sched: [5:0.50]
+; BROADWELL-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq (%rdi), %mm0 # sched: [5:0.50]
+; SKYLAKE-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movq:
+; SKX: # %bb.0:
+; SKX-NEXT: movq (%rdi), %mm0 # sched: [5:0.50]
+; SKX-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; SKX-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq (%rdi), %mm0 # sched: [5:1.00]
+; BTVER2-NEXT: paddd %mm0, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movq %mm0, (%rdi) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: paddd %mm0, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movq %mm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64* %a0, align 8
+ %2 = bitcast i64 %1 to x86_mmx
+ %3 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %2, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ store i64 %4, i64* %a0, align 8
+ ret void
+}
+
+define <2 x i64> @test_movq2dq(x86_mmx %a0) optsize {
+; GENERIC-LABEL: test_movq2dq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq2dq %mm0, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movq2dq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq2dq %mm0, %xmm0
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movq2dq:
+; SLM: # %bb.0:
+; SLM-NEXT: movq2dq %mm0, %xmm0 # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movq2dq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq2dq %mm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movq2dq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq2dq %mm0, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movq2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq2dq %mm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movq2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq2dq %mm0, %xmm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movq2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: movq2dq %mm0, %xmm0 # sched: [2:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movq2dq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq2dq %mm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movq2dq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq2dq %mm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = bitcast x86_mmx %a0 to i64
+ %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+ ret <2 x i64> %2
+}
+
+define i64 @test_pabsb(x86_mmx *%a0) optsize {
+; GENERIC-LABEL: test_pabsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pabsb (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pabsb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pabsb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pabsb:
+; SLM: # %bb.0:
+; SLM-NEXT: pabsb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pabsb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pabsb (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pabsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pabsb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pabsb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pabsb (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsb:
+; SKX: # %bb.0:
+; SKX-NEXT: pabsb (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pabsb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pabsb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: pabsb %mm0, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pabsb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: pabsb %mm0, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.pabs.b(x86_mmx) nounwind readnone
+
+define i64 @test_pabsd(x86_mmx *%a0) optsize {
+; GENERIC-LABEL: test_pabsd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pabsd (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pabsd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pabsd (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pabsd:
+; SLM: # %bb.0:
+; SLM-NEXT: pabsd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pabsd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pabsd (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pabsd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pabsd (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pabsd (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pabsd (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsd:
+; SKX: # %bb.0:
+; SKX-NEXT: pabsd (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pabsd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pabsd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: pabsd %mm0, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pabsd (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: pabsd %mm0, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.pabs.d(x86_mmx) nounwind readnone
+
+define i64 @test_pabsw(x86_mmx *%a0) optsize {
+; GENERIC-LABEL: test_pabsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pabsw (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pabsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pabsw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pabsw:
+; SLM: # %bb.0:
+; SLM-NEXT: pabsw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pabsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pabsw (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pabsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pabsw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pabsw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pabsw (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsw:
+; SKX: # %bb.0:
+; SKX-NEXT: pabsw (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pabsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pabsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: pabsw %mm0, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pabsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pabsw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: pabsw %mm0, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.pabs.w(x86_mmx) nounwind readnone
+
+define i64 @test_packssdw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_packssdw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packssdw %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: packssdw (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_packssdw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: packssdw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: packssdw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_packssdw:
+; SLM: # %bb.0:
+; SLM-NEXT: packssdw %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: packssdw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_packssdw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: packssdw %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: packssdw (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packssdw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: packssdw %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: packssdw (%rdi), %mm0 # sched: [7:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packssdw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: packssdw %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: packssdw (%rdi), %mm0 # sched: [7:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packssdw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: packssdw %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: packssdw (%rdi), %mm0 # sched: [7:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packssdw:
+; SKX: # %bb.0:
+; SKX-NEXT: packssdw %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: packssdw (%rdi), %mm0 # sched: [7:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_packssdw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: packssdw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: packssdw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packssdw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: packssdw %mm1, %mm0 # sched: [1:0.50]
+; ZNVER1-NEXT: packssdw (%rdi), %mm0 # sched: [1:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.packssdw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.packssdw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_packsswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_packsswb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packsswb %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: packsswb (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_packsswb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: packsswb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: packsswb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_packsswb:
+; SLM: # %bb.0:
+; SLM-NEXT: packsswb %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: packsswb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_packsswb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: packsswb %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: packsswb (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packsswb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: packsswb %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: packsswb (%rdi), %mm0 # sched: [7:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packsswb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: packsswb %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: packsswb (%rdi), %mm0 # sched: [7:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packsswb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: packsswb %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: packsswb (%rdi), %mm0 # sched: [7:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packsswb:
+; SKX: # %bb.0:
+; SKX-NEXT: packsswb %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: packsswb (%rdi), %mm0 # sched: [7:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_packsswb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: packsswb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: packsswb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packsswb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: packsswb %mm1, %mm0 # sched: [1:0.50]
+; ZNVER1-NEXT: packsswb (%rdi), %mm0 # sched: [1:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.packsswb(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.packsswb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_packuswb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_packuswb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packuswb %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: packuswb (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_packuswb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: packuswb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: packuswb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_packuswb:
+; SLM: # %bb.0:
+; SLM-NEXT: packuswb %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: packuswb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_packuswb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: packuswb %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: packuswb (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_packuswb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: packuswb %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: packuswb (%rdi), %mm0 # sched: [7:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packuswb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: packuswb %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: packuswb (%rdi), %mm0 # sched: [7:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packuswb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: packuswb %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: packuswb (%rdi), %mm0 # sched: [7:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packuswb:
+; SKX: # %bb.0:
+; SKX-NEXT: packuswb %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: packuswb (%rdi), %mm0 # sched: [7:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_packuswb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: packuswb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: packuswb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_packuswb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: packuswb %mm1, %mm0 # sched: [1:0.50]
+; ZNVER1-NEXT: packuswb (%rdi), %mm0 # sched: [1:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.packuswb(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.packuswb(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddb:
+; SLM: # %bb.0:
+; SLM-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: paddb (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddb:
+; SKX: # %bb.0:
+; SKX-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: paddb (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddd %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddd (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddd (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddd:
+; SLM: # %bb.0:
+; SLM-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddd %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddd (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddd (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddd (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: paddd (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddd:
+; SKX: # %bb.0:
+; SKX-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: paddd (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddd %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddd (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddq (%rdi), %mm0 # sched: [7:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddq %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: paddq (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddq:
+; SLM: # %bb.0:
+; SLM-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddq (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: paddq (%rdi), %mm0 # sched: [7:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddq (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddq (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: paddq (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddq:
+; SKX: # %bb.0:
+; SKX-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: paddq (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddq %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddq (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddq %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddq (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddsb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddsb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddsb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddsb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddsb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddsb:
+; SLM: # %bb.0:
+; SLM-NEXT: paddsb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddsb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddsb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddsb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddsb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddsb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddsb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddsb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddsb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddsb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: paddsb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddsb:
+; SKX: # %bb.0:
+; SKX-NEXT: paddsb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: paddsb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddsb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddsb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddsb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddsb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddsb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddsw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddsw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddsw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddsw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddsw:
+; SLM: # %bb.0:
+; SLM-NEXT: paddsw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddsw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddsw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddsw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddsw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddsw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddsw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddsw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddsw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: paddsw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddsw:
+; SKX: # %bb.0:
+; SKX-NEXT: paddsw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: paddsw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddsw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddsw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddsw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.padds.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddusb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddusb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddusb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddusb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddusb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddusb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddusb:
+; SLM: # %bb.0:
+; SLM-NEXT: paddusb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddusb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddusb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddusb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddusb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddusb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddusb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddusb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddusb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddusb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddusb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddusb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddusb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: paddusb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddusb:
+; SKX: # %bb.0:
+; SKX-NEXT: paddusb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: paddusb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddusb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddusb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddusb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddusb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddusb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddusb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddusw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddusw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddusw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddusw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddusw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddusw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddusw:
+; SLM: # %bb.0:
+; SLM-NEXT: paddusw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddusw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddusw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddusw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddusw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddusw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddusw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddusw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddusw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddusw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddusw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddusw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddusw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: paddusw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddusw:
+; SKX: # %bb.0:
+; SKX-NEXT: paddusw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: paddusw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddusw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddusw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddusw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddusw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddusw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddusw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_paddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_paddw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: paddw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_paddw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: paddw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_paddw:
+; SLM: # %bb.0:
+; SLM-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: paddw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_paddw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: paddw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: paddw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_paddw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: paddw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: paddw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: paddw (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddw:
+; SKX: # %bb.0:
+; SKX-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: paddw (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_paddw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: paddw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: paddw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_paddw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: paddw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: paddw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_palignr(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_palignr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: palignr $1, %mm1, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_palignr:
+; ATOM: # %bb.0:
+; ATOM-NEXT: palignr $1, %mm1, %mm0
+; ATOM-NEXT: palignr $1, (%rdi), %mm0
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_palignr:
+; SLM: # %bb.0:
+; SLM-NEXT: palignr $1, %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: palignr $1, (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_palignr:
+; SANDY: # %bb.0:
+; SANDY-NEXT: palignr $1, %mm1, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_palignr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: palignr $1, %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_palignr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: palignr $1, %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_palignr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: palignr $1, %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_palignr:
+; SKX: # %bb.0:
+; SKX-NEXT: palignr $1, %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_palignr:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: palignr $1, %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: palignr $1, (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_palignr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: palignr $1, %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: palignr $1, (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %a0, x86_mmx %a1, i8 1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx %1, x86_mmx %2, i8 1)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.palignr.b(x86_mmx, x86_mmx, i8) nounwind readnone
+
+define i64 @test_pand(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pand:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pand %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: pand (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pand:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pand %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pand (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pand:
+; SLM: # %bb.0:
+; SLM-NEXT: pand %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pand (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pand:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pand %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: pand (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pand:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pand %mm1, %mm0 # sched: [1:0.33]
+; HASWELL-NEXT: pand (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pand:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pand %mm1, %mm0 # sched: [1:0.33]
+; BROADWELL-NEXT: pand (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pand:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pand %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: pand (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pand:
+; SKX: # %bb.0:
+; SKX-NEXT: pand %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: pand (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pand:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pand %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pand (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pand:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pand %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pand (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pand(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pandn(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pandn:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pandn %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: pandn (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pandn:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pandn %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pandn (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pandn:
+; SLM: # %bb.0:
+; SLM-NEXT: pandn %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pandn (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pandn:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pandn %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: pandn (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pandn:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pandn %mm1, %mm0 # sched: [1:0.33]
+; HASWELL-NEXT: pandn (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pandn:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pandn %mm1, %mm0 # sched: [1:0.33]
+; BROADWELL-NEXT: pandn (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pandn:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pandn %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: pandn (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pandn:
+; SKX: # %bb.0:
+; SKX-NEXT: pandn %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: pandn (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pandn:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pandn %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pandn (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pandn:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pandn %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pandn (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pandn(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pandn(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pavgb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pavgb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pavgb %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pavgb (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pavgb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pavgb %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pavgb (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pavgb:
+; SLM: # %bb.0:
+; SLM-NEXT: pavgb %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pavgb (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pavgb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pavgb %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pavgb (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pavgb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pavgb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pavgb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pavgb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pavgb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pavgb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pavgb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pavgb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pavgb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pavgb:
+; SKX: # %bb.0:
+; SKX-NEXT: pavgb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pavgb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pavgb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pavgb %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pavgb (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pavgb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pavgb %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pavgb (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pavg.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pavgw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pavgw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pavgw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pavgw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pavgw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pavgw %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pavgw (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pavgw:
+; SLM: # %bb.0:
+; SLM-NEXT: pavgw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pavgw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pavgw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pavgw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pavgw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pavgw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pavgw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pavgw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pavgw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pavgw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pavgw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pavgw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pavgw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pavgw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pavgw:
+; SKX: # %bb.0:
+; SKX-NEXT: pavgw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pavgw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pavgw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pavgw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pavgw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pavgw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pavgw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pavgw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pavg.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pcmpeqb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pcmpeqb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pcmpeqb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pcmpeqb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pcmpeqb:
+; SLM: # %bb.0:
+; SLM-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpeqb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpeqb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pcmpeqb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pcmpeqb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pcmpeqb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pcmpeqb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pcmpeqb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqb:
+; SKX: # %bb.0:
+; SKX-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pcmpeqb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pcmpeqb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pcmpeqb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pcmpeqb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pcmpeqb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pcmpeqd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pcmpeqd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqd %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pcmpeqd (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pcmpeqd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqd (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pcmpeqd:
+; SLM: # %bb.0:
+; SLM-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpeqd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpeqd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pcmpeqd %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pcmpeqd (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pcmpeqd (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pcmpeqd (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pcmpeqd (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqd:
+; SKX: # %bb.0:
+; SKX-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pcmpeqd (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pcmpeqd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pcmpeqd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pcmpeqd %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pcmpeqd (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pcmpeqw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pcmpeqw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pcmpeqw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pcmpeqw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pcmpeqw:
+; SLM: # %bb.0:
+; SLM-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpeqw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpeqw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pcmpeqw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pcmpeqw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpeqw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pcmpeqw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pcmpeqw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pcmpeqw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqw:
+; SKX: # %bb.0:
+; SKX-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pcmpeqw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pcmpeqw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pcmpeqw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpeqw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pcmpeqw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pcmpeqw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pcmpeq.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pcmpgtb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pcmpgtb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpgtb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pcmpgtb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pcmpgtb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pcmpgtb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pcmpgtb:
+; SLM: # %bb.0:
+; SLM-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpgtb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpgtb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pcmpgtb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pcmpgtb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pcmpgtb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pcmpgtb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pcmpgtb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtb:
+; SKX: # %bb.0:
+; SKX-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pcmpgtb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pcmpgtb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pcmpgtb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pcmpgtb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pcmpgtb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pcmpgtd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pcmpgtd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpgtd %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pcmpgtd (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pcmpgtd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pcmpgtd (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pcmpgtd:
+; SLM: # %bb.0:
+; SLM-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpgtd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpgtd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pcmpgtd %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pcmpgtd (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pcmpgtd (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pcmpgtd (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pcmpgtd (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtd:
+; SKX: # %bb.0:
+; SKX-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pcmpgtd (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pcmpgtd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pcmpgtd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pcmpgtd %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pcmpgtd (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pcmpgtw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pcmpgtw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpgtw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pcmpgtw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pcmpgtw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pcmpgtw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pcmpgtw:
+; SLM: # %bb.0:
+; SLM-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pcmpgtw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pcmpgtw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pcmpgtw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pcmpgtw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pcmpgtw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pcmpgtw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pcmpgtw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pcmpgtw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtw:
+; SKX: # %bb.0:
+; SKX-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pcmpgtw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pcmpgtw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pcmpgtw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pcmpgtw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pcmpgtw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pcmpgtw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pcmpgt.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i32 @test_pextrw(x86_mmx %a0) optsize {
+; GENERIC-LABEL: test_pextrw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextrw $0, %mm0, %eax # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pextrw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pextrw $0, %mm0, %eax # sched: [4:2.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pextrw:
+; SLM: # %bb.0:
+; SLM-NEXT: pextrw $0, %mm0, %eax # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pextrw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pextrw $0, %mm0, %eax # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pextrw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pextrw $0, %mm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pextrw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pextrw $0, %mm0, %eax # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pextrw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pextrw $0, %mm0, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pextrw:
+; SKX: # %bb.0:
+; SKX-NEXT: pextrw $0, %mm0, %eax # sched: [3:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pextrw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pextrw $0, %mm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pextrw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pextrw $0, %mm0, %eax # sched: [2:2.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call i32 @llvm.x86.mmx.pextr.w(x86_mmx %a0, i32 0)
+ ret i32 %1
+}
+declare i32 @llvm.x86.mmx.pextr.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test_phaddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_phaddd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phaddd %mm1, %mm0 # sched: [3:1.50]
+; GENERIC-NEXT: phaddd (%rdi), %mm0 # sched: [8:1.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_phaddd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: phaddd %mm1, %mm0 # sched: [3:1.50]
+; ATOM-NEXT: phaddd (%rdi), %mm0 # sched: [4:2.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_phaddd:
+; SLM: # %bb.0:
+; SLM-NEXT: phaddd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: phaddd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phaddd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: phaddd %mm1, %mm0 # sched: [3:1.50]
+; SANDY-NEXT: phaddd (%rdi), %mm0 # sched: [8:1.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phaddd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: phaddd %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: phaddd (%rdi), %mm0 # sched: [8:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: phaddd %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: phaddd (%rdi), %mm0 # sched: [8:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: phaddd %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: phaddd (%rdi), %mm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddd:
+; SKX: # %bb.0:
+; SKX-NEXT: phaddd %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: phaddd (%rdi), %mm0 # sched: [8:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_phaddd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: phaddd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: phaddd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: phaddd %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: phaddd (%rdi), %mm0 # sched: [100:?]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.phadd.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_phaddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_phaddsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phaddsw %mm1, %mm0 # sched: [3:1.50]
+; GENERIC-NEXT: phaddsw (%rdi), %mm0 # sched: [8:1.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_phaddsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: phaddsw %mm1, %mm0 # sched: [5:2.50]
+; ATOM-NEXT: phaddsw (%rdi), %mm0 # sched: [6:3.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_phaddsw:
+; SLM: # %bb.0:
+; SLM-NEXT: phaddsw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: phaddsw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phaddsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: phaddsw %mm1, %mm0 # sched: [3:1.50]
+; SANDY-NEXT: phaddsw (%rdi), %mm0 # sched: [8:1.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phaddsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: phaddsw %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: phaddsw (%rdi), %mm0 # sched: [8:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: phaddsw %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: phaddsw (%rdi), %mm0 # sched: [8:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: phaddsw %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: phaddsw (%rdi), %mm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddsw:
+; SKX: # %bb.0:
+; SKX-NEXT: phaddsw %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: phaddsw (%rdi), %mm0 # sched: [8:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_phaddsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: phaddsw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: phaddsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: phaddsw %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: phaddsw (%rdi), %mm0 # sched: [100:?]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.phadd.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_phaddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_phaddw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phaddw %mm1, %mm0 # sched: [3:1.50]
+; GENERIC-NEXT: phaddw (%rdi), %mm0 # sched: [8:1.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_phaddw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: phaddw %mm1, %mm0 # sched: [5:2.50]
+; ATOM-NEXT: phaddw (%rdi), %mm0 # sched: [6:3.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_phaddw:
+; SLM: # %bb.0:
+; SLM-NEXT: phaddw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: phaddw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phaddw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: phaddw %mm1, %mm0 # sched: [3:1.50]
+; SANDY-NEXT: phaddw (%rdi), %mm0 # sched: [8:1.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phaddw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: phaddw %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: phaddw (%rdi), %mm0 # sched: [8:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: phaddw %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: phaddw (%rdi), %mm0 # sched: [8:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: phaddw %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: phaddw (%rdi), %mm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddw:
+; SKX: # %bb.0:
+; SKX-NEXT: phaddw %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: phaddw (%rdi), %mm0 # sched: [8:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_phaddw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: phaddw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: phaddw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phaddw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: phaddw %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: phaddw (%rdi), %mm0 # sched: [100:?]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_phsubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_phsubd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phsubd %mm1, %mm0 # sched: [3:1.50]
+; GENERIC-NEXT: phsubd (%rdi), %mm0 # sched: [8:1.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_phsubd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: phsubd %mm1, %mm0 # sched: [3:1.50]
+; ATOM-NEXT: phsubd (%rdi), %mm0 # sched: [4:2.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_phsubd:
+; SLM: # %bb.0:
+; SLM-NEXT: phsubd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: phsubd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phsubd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: phsubd %mm1, %mm0 # sched: [3:1.50]
+; SANDY-NEXT: phsubd (%rdi), %mm0 # sched: [8:1.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phsubd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: phsubd %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: phsubd (%rdi), %mm0 # sched: [8:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: phsubd %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: phsubd (%rdi), %mm0 # sched: [8:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: phsubd %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: phsubd (%rdi), %mm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubd:
+; SKX: # %bb.0:
+; SKX-NEXT: phsubd %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: phsubd (%rdi), %mm0 # sched: [8:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_phsubd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: phsubd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: phsubd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: phsubd %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: phsubd (%rdi), %mm0 # sched: [100:?]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.phsub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_phsubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_phsubsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phsubsw %mm1, %mm0 # sched: [3:1.50]
+; GENERIC-NEXT: phsubsw (%rdi), %mm0 # sched: [8:1.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_phsubsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: phsubsw %mm1, %mm0 # sched: [5:2.50]
+; ATOM-NEXT: phsubsw (%rdi), %mm0 # sched: [6:3.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_phsubsw:
+; SLM: # %bb.0:
+; SLM-NEXT: phsubsw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: phsubsw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phsubsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: phsubsw %mm1, %mm0 # sched: [3:1.50]
+; SANDY-NEXT: phsubsw (%rdi), %mm0 # sched: [8:1.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phsubsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: phsubsw %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: phsubsw (%rdi), %mm0 # sched: [8:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: phsubsw %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: phsubsw (%rdi), %mm0 # sched: [8:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: phsubsw %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: phsubsw (%rdi), %mm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: phsubsw %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: phsubsw (%rdi), %mm0 # sched: [8:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_phsubsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: phsubsw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: phsubsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: phsubsw %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: phsubsw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.phsub.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_phsubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_phsubw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phsubw %mm1, %mm0 # sched: [3:1.50]
+; GENERIC-NEXT: phsubw (%rdi), %mm0 # sched: [8:1.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_phsubw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: phsubw %mm1, %mm0 # sched: [5:2.50]
+; ATOM-NEXT: phsubw (%rdi), %mm0 # sched: [6:3.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_phsubw:
+; SLM: # %bb.0:
+; SLM-NEXT: phsubw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: phsubw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_phsubw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: phsubw %mm1, %mm0 # sched: [3:1.50]
+; SANDY-NEXT: phsubw (%rdi), %mm0 # sched: [8:1.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_phsubw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: phsubw %mm1, %mm0 # sched: [3:2.00]
+; HASWELL-NEXT: phsubw (%rdi), %mm0 # sched: [8:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: phsubw %mm1, %mm0 # sched: [3:2.00]
+; BROADWELL-NEXT: phsubw (%rdi), %mm0 # sched: [8:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: phsubw %mm1, %mm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: phsubw (%rdi), %mm0 # sched: [8:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubw:
+; SKX: # %bb.0:
+; SKX-NEXT: phsubw %mm1, %mm0 # sched: [3:2.00]
+; SKX-NEXT: phsubw (%rdi), %mm0 # sched: [8:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_phsubw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: phsubw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: phsubw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_phsubw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: phsubw %mm1, %mm0 # sched: [100:?]
+; ZNVER1-NEXT: phsubw (%rdi), %mm0 # sched: [100:?]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.phsub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pinsrw(x86_mmx %a0, i32 %a1, i16* %a2) optsize {
+; GENERIC-LABEL: test_pinsrw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movswl (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pinsrw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movswl (%rsi), %eax # sched: [1:1.00]
+; ATOM-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pinsrw:
+; SLM: # %bb.0:
+; SLM-NEXT: movswl (%rsi), %eax # sched: [4:1.00]
+; SLM-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:1.00]
+; SLM-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pinsrw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movswl (%rsi), %eax # sched: [5:0.50]
+; SANDY-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pinsrw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:2.00]
+; HASWELL-NEXT: movswl (%rsi), %eax # sched: [5:0.50]
+; HASWELL-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:2.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pinsrw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:2.00]
+; BROADWELL-NEXT: movswl (%rsi), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:2.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pinsrw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: movswl (%rsi), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pinsrw:
+; SKX: # %bb.0:
+; SKX-NEXT: pinsrw $0, %edi, %mm0 # sched: [2:2.00]
+; SKX-NEXT: movswl (%rsi), %eax # sched: [5:0.50]
+; SKX-NEXT: pinsrw $1, %eax, %mm0 # sched: [2:2.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pinsrw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movswl (%rsi), %eax # sched: [4:1.00]
+; BTVER2-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pinsrw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movswl (%rsi), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: pinsrw $0, %edi, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pinsrw $1, %eax, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %a0, i32 %a1, i32 0)
+ %2 = load i16, i16 *%a2, align 2
+ %3 = sext i16 %2 to i32
+ %4 = call x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx %1, i32 %3, i32 1)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.pinsr.w(x86_mmx, i32, i32) nounwind readnone
+
+define i64 @test_pmaddwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmaddwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaddwd %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pmaddwd (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmaddwd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaddwd %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmaddwd (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmaddwd:
+; SLM: # %bb.0:
+; SLM-NEXT: pmaddwd %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmaddwd (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaddwd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmaddwd %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pmaddwd (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaddwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmaddwd %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmaddwd (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaddwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmaddwd %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmaddwd (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaddwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmaddwd %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmaddwd (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaddwd:
+; SKX: # %bb.0:
+; SKX-NEXT: pmaddwd %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmaddwd (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmaddwd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmaddwd %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmaddwd (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaddwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmaddwd %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmaddwd (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmaddubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmaddubsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaddubsw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmaddubsw (%rdi), %mm0 # sched: [8:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmaddubsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaddubsw %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmaddubsw (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmaddubsw:
+; SLM: # %bb.0:
+; SLM-NEXT: pmaddubsw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmaddubsw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaddubsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmaddubsw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pmaddubsw (%rdi), %mm0 # sched: [8:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaddubsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmaddubsw %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmaddubsw (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaddubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmaddubsw %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmaddubsw (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaddubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmaddubsw %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmaddubsw (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaddubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: pmaddubsw %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmaddubsw (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmaddubsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmaddubsw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmaddubsw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaddubsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmaddubsw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmaddubsw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.pmadd.ub.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmaxsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmaxsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxsw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pmaxsw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmaxsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaxsw %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pmaxsw (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmaxsw:
+; SLM: # %bb.0:
+; SLM-NEXT: pmaxsw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmaxsw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaxsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmaxsw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pmaxsw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmaxsw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pmaxsw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmaxsw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pmaxsw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmaxsw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pmaxsw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsw:
+; SKX: # %bb.0:
+; SKX-NEXT: pmaxsw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pmaxsw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmaxsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmaxsw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmaxsw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmaxsw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmaxsw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmaxs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmaxub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmaxub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxub %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pmaxub (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmaxub:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaxub %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pmaxub (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmaxub:
+; SLM: # %bb.0:
+; SLM-NEXT: pmaxub %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmaxub (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmaxub:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmaxub %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pmaxub (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmaxub:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmaxub %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pmaxub (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmaxub %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pmaxub (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmaxub %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pmaxub (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxub:
+; SKX: # %bb.0:
+; SKX-NEXT: pmaxub %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pmaxub (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmaxub:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmaxub %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmaxub (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmaxub:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmaxub %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmaxub (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmaxu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pminsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pminsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminsw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pminsw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pminsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pminsw %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pminsw (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pminsw:
+; SLM: # %bb.0:
+; SLM-NEXT: pminsw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pminsw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pminsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pminsw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pminsw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pminsw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pminsw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pminsw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pminsw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pminsw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pminsw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsw:
+; SKX: # %bb.0:
+; SKX-NEXT: pminsw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pminsw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pminsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pminsw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pminsw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pminsw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pminsw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmins.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pminub(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pminub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminub %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pminub (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pminub:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pminub %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pminub (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pminub:
+; SLM: # %bb.0:
+; SLM-NEXT: pminub %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pminub (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pminub:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pminub %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pminub (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pminub:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pminub %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: pminub (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pminub %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: pminub (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pminub %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pminub (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminub:
+; SKX: # %bb.0:
+; SKX-NEXT: pminub %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pminub (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pminub:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pminub %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pminub (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pminub:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pminub %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pminub (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pminu.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i32 @test_pmovmskb(x86_mmx %a0) optsize {
+; GENERIC-LABEL: test_pmovmskb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovmskb %mm0, %eax # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmovmskb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmovmskb %mm0, %eax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmovmskb:
+; SLM: # %bb.0:
+; SLM-NEXT: pmovmskb %mm0, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmovmskb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmovmskb %mm0, %eax # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmovmskb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmovmskb %mm0, %eax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovmskb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmovmskb %mm0, %eax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovmskb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmovmskb %mm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovmskb:
+; SKX: # %bb.0:
+; SKX-NEXT: pmovmskb %mm0, %eax # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmovmskb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmovmskb %mm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmovmskb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmovmskb %mm0, %eax # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call i32 @llvm.x86.mmx.pmovmskb(x86_mmx %a0)
+ ret i32 %1
+}
+declare i32 @llvm.x86.mmx.pmovmskb(x86_mmx) nounwind readnone
+
+define i64 @test_pmulhrsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmulhrsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulhrsw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmulhrsw (%rdi), %mm0 # sched: [8:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmulhrsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmulhrsw %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmulhrsw (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmulhrsw:
+; SLM: # %bb.0:
+; SLM-NEXT: pmulhrsw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmulhrsw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmulhrsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmulhrsw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pmulhrsw (%rdi), %mm0 # sched: [8:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmulhrsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmulhrsw %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmulhrsw (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhrsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmulhrsw %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmulhrsw (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhrsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmulhrsw %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmulhrsw (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhrsw:
+; SKX: # %bb.0:
+; SKX-NEXT: pmulhrsw %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmulhrsw (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmulhrsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmulhrsw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmulhrsw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhrsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmulhrsw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmulhrsw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.pmul.hr.sw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmulhw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmulhw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulhw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pmulhw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmulhw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmulhw %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmulhw (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmulhw:
+; SLM: # %bb.0:
+; SLM-NEXT: pmulhw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmulhw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmulhw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmulhw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pmulhw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmulhw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmulhw %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmulhw (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmulhw %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmulhw (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmulhw %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmulhw (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhw:
+; SKX: # %bb.0:
+; SKX-NEXT: pmulhw %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmulhw (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmulhw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmulhw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmulhw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmulhw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmulhw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmulhuw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmulhuw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulhuw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pmulhuw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmulhuw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmulhuw %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmulhuw (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmulhuw:
+; SLM: # %bb.0:
+; SLM-NEXT: pmulhuw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmulhuw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmulhuw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmulhuw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pmulhuw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmulhuw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmulhuw %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmulhuw (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmulhuw %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmulhuw (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmulhuw %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmulhuw (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhuw:
+; SKX: # %bb.0:
+; SKX-NEXT: pmulhuw %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmulhuw (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmulhuw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmulhuw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmulhuw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmulhuw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmulhuw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmulhuw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmulhu.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmullw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmullw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmullw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pmullw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmullw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmullw %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmullw (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmullw:
+; SLM: # %bb.0:
+; SLM-NEXT: pmullw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmullw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmullw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmullw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pmullw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmullw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmullw %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmullw (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmullw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmullw %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmullw (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmullw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmullw %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmullw (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmullw:
+; SKX: # %bb.0:
+; SKX-NEXT: pmullw %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmullw (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmullw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmullw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmullw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmullw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmullw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmullw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pmuludq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pmuludq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmuludq %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmuludq (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pmuludq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmuludq %mm1, %mm0 # sched: [4:4.00]
+; ATOM-NEXT: pmuludq (%rdi), %mm0 # sched: [4:4.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pmuludq:
+; SLM: # %bb.0:
+; SLM-NEXT: pmuludq %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: pmuludq (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pmuludq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pmuludq %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: pmuludq (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pmuludq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pmuludq %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: pmuludq (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmuludq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pmuludq %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: pmuludq (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmuludq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pmuludq %mm1, %mm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: pmuludq (%rdi), %mm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmuludq:
+; SKX: # %bb.0:
+; SKX-NEXT: pmuludq %mm1, %mm0 # sched: [4:1.00]
+; SKX-NEXT: pmuludq (%rdi), %mm0 # sched: [9:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pmuludq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pmuludq %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: pmuludq (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pmuludq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pmuludq %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: pmuludq (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_por(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_por:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: por %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: por (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_por:
+; ATOM: # %bb.0:
+; ATOM-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: por (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_por:
+; SLM: # %bb.0:
+; SLM-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: por (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_por:
+; SANDY: # %bb.0:
+; SANDY-NEXT: por %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: por (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_por:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: por %mm1, %mm0 # sched: [1:0.33]
+; HASWELL-NEXT: por (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_por:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: por %mm1, %mm0 # sched: [1:0.33]
+; BROADWELL-NEXT: por (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_por:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: por (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_por:
+; SKX: # %bb.0:
+; SKX-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: por (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_por:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: por %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: por (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_por:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: por %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: por (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.por(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psadbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psadbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psadbw %mm1, %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psadbw (%rdi), %mm0 # sched: [9:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psadbw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psadbw %mm1, %mm0 # sched: [4:2.00]
+; ATOM-NEXT: psadbw (%rdi), %mm0 # sched: [4:2.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psadbw:
+; SLM: # %bb.0:
+; SLM-NEXT: psadbw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: psadbw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psadbw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psadbw %mm1, %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psadbw (%rdi), %mm0 # sched: [9:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psadbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psadbw %mm1, %mm0 # sched: [5:1.00]
+; HASWELL-NEXT: psadbw (%rdi), %mm0 # sched: [10:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psadbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psadbw %mm1, %mm0 # sched: [5:1.00]
+; BROADWELL-NEXT: psadbw (%rdi), %mm0 # sched: [10:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psadbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psadbw %mm1, %mm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: psadbw (%rdi), %mm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psadbw:
+; SKX: # %bb.0:
+; SKX-NEXT: psadbw %mm1, %mm0 # sched: [3:1.00]
+; SKX-NEXT: psadbw (%rdi), %mm0 # sched: [8:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psadbw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psadbw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: psadbw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psadbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psadbw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: psadbw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psad.bw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pshufb(x86_mmx %a0, x86_mmx %a1, x86_mmx *%a2) optsize {
+; GENERIC-LABEL: test_pshufb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pshufb %mm1, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: pshufb (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pshufb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pshufb %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: pshufb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pshufb:
+; SLM: # %bb.0:
+; SLM-NEXT: pshufb %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: pshufb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pshufb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pshufb %mm1, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: pshufb (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pshufb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pshufb %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: pshufb (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pshufb %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: pshufb (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pshufb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pshufb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufb:
+; SKX: # %bb.0:
+; SKX-NEXT: pshufb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pshufb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pshufb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pshufb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pshufb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pshufb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pshufb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.pshuf.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pshufw(x86_mmx *%a0) optsize {
+; GENERIC-LABEL: test_pshufw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [5:1.00]
+; GENERIC-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pshufw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [1:1.00]
+; ATOM-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pshufw:
+; SLM: # %bb.0:
+; SLM-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [4:1.00]
+; SLM-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pshufw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [5:1.00]
+; SANDY-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pshufw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
+; HASWELL-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
+; BROADWELL-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
+; SKYLAKE-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufw:
+; SKX: # %bb.0:
+; SKX-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
+; SKX-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pshufw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [6:1.00]
+; BTVER2-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pshufw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pshufw $0, (%rdi), %mm0 # mm0 = mem[0,0,0,0] sched: [8:0.50]
+; ZNVER1-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load x86_mmx, x86_mmx *%a0, align 8
+ %2 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 0)
+ %3 = call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %2, i8 0)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx, i8) nounwind readnone
+
+define i64 @test_psignb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psignb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psignb %mm1, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: psignb (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psignb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psignb %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: psignb (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psignb:
+; SLM: # %bb.0:
+; SLM-NEXT: psignb %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: psignb (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psignb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psignb %mm1, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: psignb (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psignb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psignb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psignb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psignb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psignb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psignb %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psignb (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignb:
+; SKX: # %bb.0:
+; SKX-NEXT: psignb %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psignb (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psignb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psignb %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: psignb (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psignb %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: psignb (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.psign.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psignd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psignd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psignd %mm1, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: psignd (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psignd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psignd %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: psignd (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psignd:
+; SLM: # %bb.0:
+; SLM-NEXT: psignd %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: psignd (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psignd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psignd %mm1, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: psignd (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psignd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psignd %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psignd (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psignd %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psignd (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psignd %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psignd (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignd:
+; SKX: # %bb.0:
+; SKX-NEXT: psignd %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psignd (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psignd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psignd %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: psignd (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psignd %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: psignd (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.psign.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psignw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psignw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psignw %mm1, %mm0 # sched: [1:0.50]
+; GENERIC-NEXT: psignw (%rdi), %mm0 # sched: [6:0.50]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psignw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psignw %mm1, %mm0 # sched: [1:1.00]
+; ATOM-NEXT: psignw (%rdi), %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psignw:
+; SLM: # %bb.0:
+; SLM-NEXT: psignw %mm1, %mm0 # sched: [4:1.00]
+; SLM-NEXT: psignw (%rdi), %mm0 # sched: [7:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psignw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psignw %mm1, %mm0 # sched: [1:0.50]
+; SANDY-NEXT: psignw (%rdi), %mm0 # sched: [6:0.50]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psignw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psignw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psignw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psignw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psignw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psignw %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psignw (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignw:
+; SKX: # %bb.0:
+; SKX-NEXT: psignw %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psignw (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psignw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psignw %mm1, %mm0 # sched: [2:1.00]
+; BTVER2-NEXT: psignw (%rdi), %mm0 # sched: [7:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psignw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psignw %mm1, %mm0 # sched: [4:1.00]
+; ZNVER1-NEXT: psignw (%rdi), %mm0 # sched: [11:1.00]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.ssse3.psign.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pslld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pslld:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: pslld (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pslld:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pslld %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: pslld (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: pslld $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pslld:
+; SLM: # %bb.0:
+; SLM-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: pslld (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pslld:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: pslld (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pslld:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: pslld (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pslld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: pslld (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pslld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: pslld (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pslld:
+; SKX: # %bb.0:
+; SKX-NEXT: pslld %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: pslld (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: pslld $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pslld:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pslld %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pslld (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: pslld $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pslld:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pslld %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pslld (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: pslld $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psll.d(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psll.d(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psllq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psllq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psllq (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psllq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psllq %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psllq (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psllq $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psllq:
+; SLM: # %bb.0:
+; SLM-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psllq (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psllq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psllq (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psllq (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psllq (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psllq (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllq:
+; SKX: # %bb.0:
+; SKX-NEXT: psllq %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psllq (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psllq $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psllq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psllq %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psllq (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psllq $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psllq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psllq %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psllq (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psllq $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psll.q(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psll.q(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psllw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psllw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psllw (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psllw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psllw %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psllw (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psllw $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psllw:
+; SLM: # %bb.0:
+; SLM-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psllw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psllw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psllw (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psllw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psllw (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psllw (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psllw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllw:
+; SKX: # %bb.0:
+; SKX-NEXT: psllw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psllw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psllw $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psllw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psllw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psllw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psllw $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psllw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psllw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psllw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psllw $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psll.w(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psll.w(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psrad(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psrad:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psrad (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psrad:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrad %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psrad (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psrad $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psrad:
+; SLM: # %bb.0:
+; SLM-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psrad (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psrad:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psrad (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrad:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psrad (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrad:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psrad (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrad:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psrad (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrad:
+; SKX: # %bb.0:
+; SKX-NEXT: psrad %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psrad (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psrad $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psrad:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psrad %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psrad (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psrad $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrad:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psrad %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psrad (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psrad $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psra.d(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psraw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psraw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psraw (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psraw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psraw %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psraw (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psraw $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psraw:
+; SLM: # %bb.0:
+; SLM-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psraw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psraw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psraw (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psraw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psraw (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psraw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psraw (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psraw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psraw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psraw:
+; SKX: # %bb.0:
+; SKX-NEXT: psraw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psraw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psraw $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psraw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psraw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psraw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psraw $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psraw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psraw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psraw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psraw $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psra.w(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psra.w(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psrld(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psrld:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psrld (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psrld:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrld %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psrld (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psrld $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psrld:
+; SLM: # %bb.0:
+; SLM-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psrld (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psrld:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psrld (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrld:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psrld (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psrld (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psrld (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrld:
+; SKX: # %bb.0:
+; SKX-NEXT: psrld %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psrld (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psrld $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psrld:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psrld %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psrld (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psrld $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrld:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psrld %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psrld (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psrld $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psrlq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psrlq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psrlq (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psrlq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrlq %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psrlq (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psrlq $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psrlq:
+; SLM: # %bb.0:
+; SLM-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psrlq (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psrlq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psrlq (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psrlq (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psrlq (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psrlq (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlq:
+; SKX: # %bb.0:
+; SKX-NEXT: psrlq %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psrlq (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psrlq $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psrlq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psrlq %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psrlq (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psrlq $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrlq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psrlq %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psrlq (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psrlq $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psrlw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psrlw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: psrlw (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psrlw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrlw %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psrlw (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: psrlw $7, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psrlw:
+; SLM: # %bb.0:
+; SLM-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; SLM-NEXT: psrlw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psrlw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: psrlw (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psrlw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: psrlw (%rdi), %mm0 # sched: [6:1.00]
+; HASWELL-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: psrlw (%rdi), %mm0 # sched: [6:1.00]
+; BROADWELL-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psrlw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlw:
+; SKX: # %bb.0:
+; SKX-NEXT: psrlw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psrlw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: psrlw $7, %mm0 # sched: [1:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psrlw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psrlw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psrlw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: psrlw $7, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psrlw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psrlw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psrlw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: psrlw $7, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %1, x86_mmx %2)
+ %4 = call x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %3, i32 7)
+ %5 = bitcast x86_mmx %4 to i64
+ ret i64 %5
+}
+declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
+
+define i64 @test_psubb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubb:
+; SLM: # %bb.0:
+; SLM-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psubb (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubb:
+; SKX: # %bb.0:
+; SKX-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psubb (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psub.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubd %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubd (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubd (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubd:
+; SLM: # %bb.0:
+; SLM-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubd (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubd %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubd (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubd (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubd (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psubd (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubd:
+; SKX: # %bb.0:
+; SKX-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psubd (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubd %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubd %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubd (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psub.d(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubq %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubq (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubq %mm1, %mm0 # sched: [2:1.00]
+; ATOM-NEXT: psubq (%rdi), %mm0 # sched: [3:1.50]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubq:
+; SLM: # %bb.0:
+; SLM-NEXT: psubq %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubq (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubq %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubq (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubq %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubq (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubq %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubq (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubq %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psubq (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubq:
+; SKX: # %bb.0:
+; SKX-NEXT: psubq %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psubq (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubq %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubq (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubq %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubq (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psub.q(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubsb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubsb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubsb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubsb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubsb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubsb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubsb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubsb:
+; SLM: # %bb.0:
+; SLM-NEXT: psubsb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubsb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubsb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubsb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubsb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubsb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubsb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubsb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubsb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubsb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubsb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psubsb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubsb:
+; SKX: # %bb.0:
+; SKX-NEXT: psubsb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psubsb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubsb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubsb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubsb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubsb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubsb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubsb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubsw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubsw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubsw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubsw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubsw:
+; SLM: # %bb.0:
+; SLM-NEXT: psubsw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubsw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubsw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubsw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubsw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubsw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubsw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubsw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubsw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psubsw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: psubsw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psubsw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubsw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubsw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubsw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubusb:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubusb %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubusb (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubusb:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubusb %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubusb (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubusb:
+; SLM: # %bb.0:
+; SLM-NEXT: psubusb %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubusb (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubusb:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubusb %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubusb (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubusb:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubusb %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubusb (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubusb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubusb %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubusb (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubusb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubusb %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psubusb (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubusb:
+; SKX: # %bb.0:
+; SKX-NEXT: psubusb %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psubusb (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubusb:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubusb %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubusb (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubusb:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubusb %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubusb (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubusw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubusw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubusw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubusw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubusw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubusw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubusw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubusw:
+; SLM: # %bb.0:
+; SLM-NEXT: psubusw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubusw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubusw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubusw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubusw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubusw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubusw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubusw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubusw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubusw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubusw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubusw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubusw %mm1, %mm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: psubusw (%rdi), %mm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubusw:
+; SKX: # %bb.0:
+; SKX-NEXT: psubusw %mm1, %mm0 # sched: [1:1.00]
+; SKX-NEXT: psubusw (%rdi), %mm0 # sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubusw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubusw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubusw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubusw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubusw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubusw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_psubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_psubw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubw %mm1, %mm0 # sched: [3:1.00]
+; GENERIC-NEXT: psubw (%rdi), %mm0 # sched: [7:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_psubw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: psubw (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_psubw:
+; SLM: # %bb.0:
+; SLM-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: psubw (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_psubw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: psubw %mm1, %mm0 # sched: [3:1.00]
+; SANDY-NEXT: psubw (%rdi), %mm0 # sched: [7:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_psubw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; HASWELL-NEXT: psubw (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; BROADWELL-NEXT: psubw (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: psubw (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubw:
+; SKX: # %bb.0:
+; SKX-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: psubw (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_psubw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: psubw %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: psubw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_psubw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: psubw %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: psubw (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.psub.w(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_punpckhbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_punpckhbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; GENERIC-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_punpckhbw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:0.50]
+; ATOM-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_punpckhbw:
+; SLM: # %bb.0:
+; SLM-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; SLM-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_punpckhbw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; SANDY-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; HASWELL-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; BROADWELL-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; SKYLAKE-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhbw:
+; SKX: # %bb.0:
+; SKX-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:1.00]
+; SKX-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_punpckhbw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:0.50]
+; BTVER2-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: punpckhbw %mm1, %mm0 # mm0 = mm0[4],mm1[4],mm0[5],mm1[5],mm0[6],mm1[6],mm0[7],mm1[7] sched: [1:0.25]
+; ZNVER1-NEXT: punpckhbw (%rdi), %mm0 # mm0 = mm0[4],mem[4],mm0[5],mem[5],mm0[6],mem[6],mm0[7],mem[7] sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.punpckhbw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_punpckhdq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_punpckhdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; GENERIC-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_punpckhdq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:0.50]
+; ATOM-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_punpckhdq:
+; SLM: # %bb.0:
+; SLM-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; SLM-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_punpckhdq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; SANDY-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; HASWELL-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; BROADWELL-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; SKYLAKE-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhdq:
+; SKX: # %bb.0:
+; SKX-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:1.00]
+; SKX-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_punpckhdq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:0.50]
+; BTVER2-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: punpckhdq %mm1, %mm0 # mm0 = mm0[1],mm1[1] sched: [1:0.25]
+; ZNVER1-NEXT: punpckhdq (%rdi), %mm0 # mm0 = mm0[1],mem[1] sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_punpckhwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_punpckhwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; GENERIC-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_punpckhwd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
+; ATOM-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_punpckhwd:
+; SLM: # %bb.0:
+; SLM-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SLM-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_punpckhwd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SANDY-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckhwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; HASWELL-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhwd:
+; SKX: # %bb.0:
+; SKX-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SKX-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_punpckhwd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
+; BTVER2-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckhwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: punpckhwd %mm1, %mm0 # mm0 = mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.25]
+; ZNVER1-NEXT: punpckhwd (%rdi), %mm0 # mm0 = mm0[2],mem[2],mm0[3],mem[3] sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.punpckhwd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_punpcklbw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_punpcklbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; GENERIC-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_punpcklbw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; ATOM-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_punpcklbw:
+; SLM: # %bb.0:
+; SLM-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SLM-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_punpcklbw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SANDY-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpcklbw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; HASWELL-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklbw:
+; SKX: # %bb.0:
+; SKX-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:1.00]
+; SKX-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_punpcklbw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.50]
+; BTVER2-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpcklbw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: punpcklbw %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1],mm0[2],mm1[2],mm0[3],mm1[3] sched: [1:0.25]
+; ZNVER1-NEXT: punpcklbw (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1],mm0[2],mem[2],mm0[3],mem[3] sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.punpcklbw(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_punpckldq(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_punpckldq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; GENERIC-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_punpckldq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; ATOM-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_punpckldq:
+; SLM: # %bb.0:
+; SLM-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; SLM-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_punpckldq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; SANDY-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpckldq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; HASWELL-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; BROADWELL-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; SKYLAKE-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckldq:
+; SKX: # %bb.0:
+; SKX-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:1.00]
+; SKX-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_punpckldq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:0.50]
+; BTVER2-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpckldq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] sched: [1:0.25]
+; ZNVER1-NEXT: punpckldq (%rdi), %mm0 # mm0 = mm0[0],mem[0] sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.punpckldq(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_punpcklwd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_punpcklwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; GENERIC-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_punpcklwd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; ATOM-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_punpcklwd:
+; SLM: # %bb.0:
+; SLM-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; SLM-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_punpcklwd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; SANDY-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_punpcklwd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; HASWELL-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; BROADWELL-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; SKYLAKE-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklwd:
+; SKX: # %bb.0:
+; SKX-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:1.00]
+; SKX-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_punpcklwd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:0.50]
+; BTVER2-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_punpcklwd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] sched: [1:0.25]
+; ZNVER1-NEXT: punpcklwd (%rdi), %mm0 # mm0 = mm0[0],mem[0],mm0[1],mem[1] sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.punpcklwd(x86_mmx, x86_mmx) nounwind readnone
+
+define i64 @test_pxor(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
+; GENERIC-LABEL: test_pxor:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pxor %mm1, %mm0 # sched: [1:1.00]
+; GENERIC-NEXT: pxor (%rdi), %mm0 # sched: [5:1.00]
+; GENERIC-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pxor:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pxor %mm1, %mm0 # sched: [1:0.50]
+; ATOM-NEXT: pxor (%rdi), %mm0 # sched: [1:1.00]
+; ATOM-NEXT: movd %mm0, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pxor:
+; SLM: # %bb.0:
+; SLM-NEXT: pxor %mm1, %mm0 # sched: [1:0.50]
+; SLM-NEXT: pxor (%rdi), %mm0 # sched: [4:1.00]
+; SLM-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pxor:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pxor %mm1, %mm0 # sched: [1:1.00]
+; SANDY-NEXT: pxor (%rdi), %mm0 # sched: [5:1.00]
+; SANDY-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pxor:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pxor %mm1, %mm0 # sched: [1:0.33]
+; HASWELL-NEXT: pxor (%rdi), %mm0 # sched: [6:0.50]
+; HASWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pxor:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pxor %mm1, %mm0 # sched: [1:0.33]
+; BROADWELL-NEXT: pxor (%rdi), %mm0 # sched: [6:0.50]
+; BROADWELL-NEXT: movd %mm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pxor:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pxor %mm1, %mm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: pxor (%rdi), %mm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pxor:
+; SKX: # %bb.0:
+; SKX-NEXT: pxor %mm1, %mm0 # sched: [1:0.50]
+; SKX-NEXT: pxor (%rdi), %mm0 # sched: [6:0.50]
+; SKX-NEXT: movd %mm0, %rax # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pxor:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pxor %mm1, %mm0 # sched: [1:0.50]
+; BTVER2-NEXT: pxor (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT: movd %mm0, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pxor:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pxor %mm1, %mm0 # sched: [1:0.25]
+; ZNVER1-NEXT: pxor (%rdi), %mm0 # sched: [8:0.50]
+; ZNVER1-NEXT: movd %mm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %a0, x86_mmx %a1)
+ %2 = load x86_mmx, x86_mmx *%a2, align 8
+ %3 = call x86_mmx @llvm.x86.mmx.pxor(x86_mmx %1, x86_mmx %2)
+ %4 = bitcast x86_mmx %3 to i64
+ ret i64 %4
+}
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/movbe-schedule.ll b/test/CodeGen/X86/movbe-schedule.ll
new file mode 100644
index 000000000000..667b0d84c392
--- /dev/null
+++ b/test/CodeGen/X86/movbe-schedule.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+movbe | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i16 @test_movbe_i16(i16 *%a0, i16 %a1, i16 *%a2) {
+; GENERIC-LABEL: test_movbe_i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movbew (%rdi), %ax # sched: [5:0.50]
+; GENERIC-NEXT: movbew %si, (%rdx) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movbe_i16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movbew (%rdi), %ax # sched: [1:1.00]
+; ATOM-NEXT: movbew %si, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movbe_i16:
+; SLM: # %bb.0:
+; SLM-NEXT: movbew (%rdi), %ax # sched: [4:1.00]
+; SLM-NEXT: movbew %si, (%rdx) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; HASWELL-LABEL: test_movbe_i16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movbew (%rdi), %ax # sched: [6:0.50]
+; HASWELL-NEXT: movbew %si, (%rdx) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movbe_i16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movbew (%rdi), %ax # sched: [6:0.50]
+; BROADWELL-NEXT: movbew %si, (%rdx) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movbe_i16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movbew (%rdi), %ax # sched: [6:0.50]
+; SKYLAKE-NEXT: movbew %si, (%rdx) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movbe_i16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movbew (%rdi), %ax # sched: [4:1.00]
+; BTVER2-NEXT: movbew %si, (%rdx) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movbe_i16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movbew (%rdi), %ax # sched: [5:0.50]
+; ZNVER1-NEXT: movbew %si, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i16, i16 *%a0
+ %2 = tail call i16 @llvm.bswap.i16( i16 %1 )
+ %3 = tail call i16 @llvm.bswap.i16( i16 %a1 )
+ store i16 %3, i16* %a2, align 2
+ ret i16 %2
+}
+declare i16 @llvm.bswap.i16(i16)
+
+define i32 @test_movbe_i32(i32 *%a0, i32 %a1, i32 *%a2) {
+; GENERIC-LABEL: test_movbe_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movbel (%rdi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: movbel %esi, (%rdx) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movbe_i32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movbel (%rdi), %eax # sched: [1:1.00]
+; ATOM-NEXT: movbel %esi, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movbe_i32:
+; SLM: # %bb.0:
+; SLM-NEXT: movbel (%rdi), %eax # sched: [4:1.00]
+; SLM-NEXT: movbel %esi, (%rdx) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; HASWELL-LABEL: test_movbe_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movbel (%rdi), %eax # sched: [6:0.50]
+; HASWELL-NEXT: movbel %esi, (%rdx) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movbe_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movbel (%rdi), %eax # sched: [6:0.50]
+; BROADWELL-NEXT: movbel %esi, (%rdx) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movbe_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movbel (%rdi), %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: movbel %esi, (%rdx) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movbe_i32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movbel (%rdi), %eax # sched: [4:1.00]
+; BTVER2-NEXT: movbel %esi, (%rdx) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movbe_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movbel (%rdi), %eax # sched: [5:0.50]
+; ZNVER1-NEXT: movbel %esi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i32, i32 *%a0
+ %2 = tail call i32 @llvm.bswap.i32( i32 %1 )
+ %3 = tail call i32 @llvm.bswap.i32( i32 %a1 )
+ store i32 %3, i32* %a2, align 2
+ ret i32 %2
+}
+declare i32 @llvm.bswap.i32(i32)
+
+define i64 @test_movbe_i64(i64 *%a0, i64 %a1, i64 *%a2) {
+; GENERIC-LABEL: test_movbe_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movbeq (%rdi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: movbeq %rsi, (%rdx) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movbe_i64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movbeq (%rdi), %rax # sched: [1:1.00]
+; ATOM-NEXT: movbeq %rsi, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movbe_i64:
+; SLM: # %bb.0:
+; SLM-NEXT: movbeq (%rdi), %rax # sched: [4:1.00]
+; SLM-NEXT: movbeq %rsi, (%rdx) # sched: [1:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; HASWELL-LABEL: test_movbe_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movbeq (%rdi), %rax # sched: [6:0.50]
+; HASWELL-NEXT: movbeq %rsi, (%rdx) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movbe_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movbeq (%rdi), %rax # sched: [6:0.50]
+; BROADWELL-NEXT: movbeq %rsi, (%rdx) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movbe_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movbeq (%rdi), %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: movbeq %rsi, (%rdx) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movbe_i64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movbeq (%rdi), %rax # sched: [4:1.00]
+; BTVER2-NEXT: movbeq %rsi, (%rdx) # sched: [1:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movbe_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movbeq (%rdi), %rax # sched: [5:0.50]
+; ZNVER1-NEXT: movbeq %rsi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a0
+ %2 = tail call i64 @llvm.bswap.i64( i64 %1 )
+ %3 = tail call i64 @llvm.bswap.i64( i64 %a1 )
+ store i64 %3, i64* %a2, align 2
+ ret i64 %2
+}
+declare i64 @llvm.bswap.i64(i64)
diff --git a/test/CodeGen/X86/movfs.ll b/test/CodeGen/X86/movfs.ll
index 75b2404ec56e..e6ac21d2614e 100644
--- a/test/CodeGen/X86/movfs.ll
+++ b/test/CodeGen/X86/movfs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep fs
+; RUN: llc < %s -mtriple=i686-- | grep fs
define i32 @foo() nounwind readonly {
entry:
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index 8e964bf16898..00fc598ec658 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=i386-linux-gnu -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
define i32 @test1() nounwind readonly {
; X32-LABEL: test1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl %gs:196, %eax
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %gs:320, %rax
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: retq
@@ -22,7 +22,7 @@ entry:
define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind {
; X32-LABEL: test2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: calll *%gs:(%eax)
@@ -32,7 +32,7 @@ define i64 @test2(void (i8*)* addrspace(256)* %tmp8) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: {{(subq.*%rsp|pushq)}}
; X64-NEXT: callq *%gs:(%{{(rcx|rdi)}})
; X64-NEXT: xorl %eax, %eax
@@ -46,13 +46,13 @@ entry:
define <2 x i64> @pmovsxwd_1(i64 addrspace(256)* %p) nounwind readonly {
; X32-LABEL: pmovsxwd_1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pmovsxwd %gs:(%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pmovsxwd_1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pmovsxwd %gs:(%{{(rcx|rdi)}}), %xmm0
; X64-NEXT: retq
entry:
@@ -69,7 +69,7 @@ entry:
; address spaces. Make sure they aren't CSE'd.
define i32 @test_no_cse() nounwind readonly {
; X32-LABEL: test_no_cse:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl %gs:196, %eax
; X32-NEXT: movl (%eax), %eax
; X32-NEXT: movl %fs:196, %ecx
@@ -77,7 +77,7 @@ define i32 @test_no_cse() nounwind readonly {
; X32-NEXT: retl
;
; X64-LABEL: test_no_cse:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %gs:320, %rax
; X64-NEXT: movl (%rax), %eax
; X64-NEXT: movq %fs:320, %rcx
diff --git a/test/CodeGen/X86/movmsk.ll b/test/CodeGen/X86/movmsk.ll
index e40f64eb39b2..d2ee19d97ffe 100644
--- a/test/CodeGen/X86/movmsk.ll
+++ b/test/CodeGen/X86/movmsk.ll
@@ -6,7 +6,7 @@
define i32 @double_signbit(double %d1) nounwind uwtable readnone ssp {
; CHECK-LABEL: double_signbit:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movmskpd %xmm0, %eax
@@ -28,7 +28,7 @@ entry:
define i32 @double_add_signbit(double %d1, double %d2) nounwind uwtable readnone ssp {
; CHECK-LABEL: double_add_signbit:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: addsd %xmm1, %xmm0
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
@@ -52,7 +52,7 @@ entry:
define i32 @float_signbit(float %f1) nounwind uwtable readnone ssp {
; CHECK-LABEL: float_signbit:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movmskps %xmm0, %eax
@@ -73,7 +73,7 @@ entry:
define i32 @float_add_signbit(float %f1, float %f2) nounwind uwtable readnone ssp {
; CHECK-LABEL: float_add_signbit:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
@@ -99,10 +99,10 @@ entry:
; in this case, though.
define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %xmm0, %rdi
; CHECK-NEXT: shrq $63, %rdi
-; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<kill>
+; CHECK-NEXT: ## kill: def %edi killed %edi killed %rdi
; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
entry:
%t0 = bitcast double %n to i64
@@ -118,7 +118,7 @@ declare void @float_call_signbit_callee(i1 zeroext)
define i32 @t1(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
; CHECK-LABEL: t1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movmskps %xmm0, %eax
; CHECK-NEXT: movl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
@@ -132,7 +132,7 @@ entry:
define i32 @t2(<4 x float> %x, i32* nocapture %indexTable) nounwind uwtable readonly ssp {
; CHECK-LABEL: t2:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movmskpd %xmm0, %eax
; CHECK-NEXT: movl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/movpc32-check.ll b/test/CodeGen/X86/movpc32-check.ll
index 42ee332290f9..e22981aed9af 100644
--- a/test/CodeGen/X86/movpc32-check.ll
+++ b/test/CodeGen/X86/movpc32-check.ll
@@ -33,9 +33,7 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
!12 = !DILocation(line: 5, column: 1, scope: !4)
; CHECK: calll .L0$pb
-; CHECK-NEXT: .Lcfi3:
; CHECK-NEXT: .cfi_adjust_cfa_offset 4
; CHECK-NEXT: .L0$pb:
; CHECK-NEXT: popl
-; CHECK-NEXT: .Lcfi4:
; CHECK-NEXT: .cfi_adjust_cfa_offset -4
diff --git a/test/CodeGen/X86/movtopush.ll b/test/CodeGen/X86/movtopush.ll
index d715ccfa8c69..051c8a710c85 100644
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@@ -12,6 +12,8 @@ declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
declare x86_thiscallcc void @thiscall(%class.Class* %class, i32 %a, i32 %b, i32 %c, i32 %d)
declare void @oneparam(i32 %a)
declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
+declare void @eightparams16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h)
+declare void @eightparams64(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
declare void @struct(%struct.s* byval %a, i32 %b, i32 %c, i32 %d)
declare void @inalloca(<{ %struct.s }>* inalloca)
@@ -228,16 +230,16 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: call
-; NORMAL-NEXT: subl $4, %esp
-; NORMAL-NEXT: movl 20(%esp), [[E1:%e..]]
-; NORMAL-NEXT: movl 24(%esp), [[E2:%e..]]
-; NORMAL-NEXT: movl [[E2]], 4(%esp)
-; NORMAL-NEXT: movl [[E1]], (%esp)
-; NORMAL-NEXT: leal 32(%esp), [[E3:%e..]]
-; NORMAL-NEXT: movl [[E3]], 16(%esp)
-; NORMAL-NEXT: leal 28(%esp), [[E4:%e..]]
-; NORMAL-NEXT: movl [[E4]], 12(%esp)
-; NORMAL-NEXT: movl $6, 8(%esp)
+; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: movl (%esp), [[E1:%e..]]
+; NORMAL-NEXT: movl 4(%esp), [[E2:%e..]]
+; NORMAL-NEXT: leal 16(%esp), [[E3:%e..]]
+; NORMAL-NEXT: leal 12(%esp), [[E4:%e..]]
+; NORMAL-NEXT: pushl [[E3]]
+; NORMAL-NEXT: pushl [[E4]]
+; NORMAL-NEXT: pushl $6
+; NORMAL-NEXT: pushl [[E2]]
+; NORMAL-NEXT: pushl [[E1]]
; NORMAL-NEXT: call
; NORMAL-NEXT: addl $20, %esp
define void @test9() optsize {
@@ -297,10 +299,10 @@ define void @test11() optsize {
; Converting one mov into a push isn't worth it when
; doing so forces too much overhead for other calls.
; NORMAL-LABEL: test12:
-; NORMAL: movl $8, 12(%esp)
-; NORMAL-NEXT: movl $7, 8(%esp)
-; NORMAL-NEXT: movl $6, 4(%esp)
-; NORMAL-NEXT: movl $5, (%esp)
+; NORMAL: pushl $8
+; NORMAL-NEXT: pushl $7
+; NORMAL-NEXT: pushl $6
+; NORMAL-NEXT: pushl $5
; NORMAL-NEXT: calll _good
define void @test12() optsize {
entry:
@@ -318,18 +320,22 @@ entry:
; NORMAL-NEXT: pushl $2
; NORMAL-NEXT: pushl $1
; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: subl $4, %esp
-; NORMAL: movl $8, 16(%esp)
-; NORMAL-NEXT: movl $7, 12(%esp)
-; NORMAL-NEXT: movl $6, 8(%esp)
-; NORMAL-NEXT: calll _struct
-; NORMAL-NEXT: addl $20, %esp
-; NORMAL-NEXT: pushl $12
-; NORMAL-NEXT: pushl $11
-; NORMAL-NEXT: pushl $10
-; NORMAL-NEXT: pushl $9
-; NORMAL-NEXT: calll _good
-; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: addl $16, %esp
+; NORMAL-NEXT: movl (%esp), %eax
+; NORMAL-NEXT: movl 4(%esp), %ecx
+; NORMAL-NEXT: pushl $8
+; NORMAL-NEXT: pushl $7
+; NORMAL-NEXT: pushl $6
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: calll _struct
+; NORMAL-NEXT: addl $20, %esp
+; NORMAL-NEXT: pushl $12
+; NORMAL-NEXT: pushl $11
+; NORMAL-NEXT: pushl $10
+; NORMAL-NEXT: pushl $9
+; NORMAL-NEXT: calll _good
+; NORMAL-NEXT: addl $16, %esp
define void @test12b() optsize {
entry:
%s = alloca %struct.s, align 4
@@ -412,3 +418,117 @@ entry:
call void @B_func(%struct.B* sret %tmp, %struct.B* %ref.tmp, i32 1)
ret void
}
+
+; NORMAL-LABEL: pr34863_16
+; NORMAL: movl 4(%esp), %eax
+; NORMAL-NEXT: pushl $65535
+; NORMAL-NEXT: pushl $0
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: calll _eightparams16
+; NORMAL-NEXT: addl $32, %esp
+;
+; NOPUSH-LABEL: pr34863_16
+; NOPUSH: subl $32, %esp
+; NOPUSH-NEXT: movl 36(%esp), %eax
+; NOPUSH-NEXT: movl %eax, 20(%esp)
+; NOPUSH-NEXT: movl %eax, 16(%esp)
+; NOPUSH-NEXT: movl %eax, 12(%esp)
+; NOPUSH-NEXT: movl %eax, 8(%esp)
+; NOPUSH-NEXT: movl %eax, 4(%esp)
+; NOPUSH-NEXT: movl %eax, (%esp)
+; NOPUSH-NEXT: movl $65535, 28(%esp)
+; NOPUSH-NEXT: andl $0, 24(%esp)
+; NOPUSH-NEXT: calll _eightparams16
+; NOPUSH-NEXT: addl $32, %esp
+define void @pr34863_16(i16 %x) minsize nounwind {
+entry:
+ tail call void @eightparams16(i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 0, i16 -1)
+ ret void
+}
+
+; NORMAL-LABEL: pr34863_32
+; NORMAL: movl 4(%esp), %eax
+; NORMAL-NEXT: pushl $-1
+; NORMAL-NEXT: pushl $0
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: calll _eightparams
+; NORMAL-NEXT: addl $32, %esp
+;
+; NOPUSH-LABEL: pr34863_32
+; NOPUSH: subl $32, %esp
+; NOPUSH-NEXT: movl 36(%esp), %eax
+; NOPUSH-NEXT: movl %eax, 20(%esp)
+; NOPUSH-NEXT: movl %eax, 16(%esp)
+; NOPUSH-NEXT: movl %eax, 12(%esp)
+; NOPUSH-NEXT: movl %eax, 8(%esp)
+; NOPUSH-NEXT: movl %eax, 4(%esp)
+; NOPUSH-NEXT: movl %eax, (%esp)
+; NOPUSH-NEXT: orl $-1, 28(%esp)
+; NOPUSH-NEXT: andl $0, 24(%esp)
+; NOPUSH-NEXT: calll _eightparams
+; NOPUSH-NEXT: addl $32, %esp
+define void @pr34863_32(i32 %x) minsize nounwind {
+entry:
+ tail call void @eightparams(i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 0, i32 -1)
+ ret void
+}
+
+; NORMAL-LABEL: pr34863_64
+; NORMAL: movl 4(%esp), %eax
+; NORMAL-NEXT: movl 8(%esp), %ecx
+; NORMAL-NEXT: pushl $-1
+; NORMAL-NEXT: pushl $-1
+; NORMAL-NEXT: pushl $0
+; NORMAL-NEXT: pushl $0
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: pushl %ecx
+; NORMAL-NEXT: pushl %eax
+; NORMAL-NEXT: calll _eightparams64
+; NORMAL-NEXT: addl $64, %esp
+;
+; NOPUSH-LABEL: pr34863_64
+; NOPUSH: subl $64, %esp
+; NOPUSH-NEXT: movl 68(%esp), %eax
+; NOPUSH-NEXT: movl 72(%esp), %ecx
+; NOPUSH-NEXT: movl %ecx, 44(%esp)
+; NOPUSH-NEXT: movl %eax, 40(%esp)
+; NOPUSH-NEXT: movl %ecx, 36(%esp)
+; NOPUSH-NEXT: movl %eax, 32(%esp)
+; NOPUSH-NEXT: movl %ecx, 28(%esp)
+; NOPUSH-NEXT: movl %eax, 24(%esp)
+; NOPUSH-NEXT: movl %ecx, 20(%esp)
+; NOPUSH-NEXT: movl %eax, 16(%esp)
+; NOPUSH-NEXT: movl %ecx, 12(%esp)
+; NOPUSH-NEXT: movl %eax, 8(%esp)
+; NOPUSH-NEXT: movl %ecx, 4(%esp)
+; NOPUSH-NEXT: movl %eax, (%esp)
+; NOPUSH-NEXT: orl $-1, 60(%esp)
+; NOPUSH-NEXT: orl $-1, 56(%esp)
+; NOPUSH-NEXT: andl $0, 52(%esp)
+; NOPUSH-NEXT: andl $0, 48(%esp)
+; NOPUSH-NEXT: calll _eightparams64
+; NOPUSH-NEXT: addl $64, %esp
+define void @pr34863_64(i64 %x) minsize nounwind {
+entry:
+ tail call void @eightparams64(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 0, i64 -1)
+ ret void
+}
diff --git a/test/CodeGen/X86/movtopush.mir b/test/CodeGen/X86/movtopush.mir
new file mode 100644
index 000000000000..4c1dfc57627c
--- /dev/null
+++ b/test/CodeGen/X86/movtopush.mir
@@ -0,0 +1,125 @@
+# RUN: llc -mtriple=i686-windows --run-pass="x86-cf-opt" %s -o - | FileCheck %s
+
+# PR34903
+--- |
+ target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+ target triple = "i686--windows-msvc"
+
+ %struct.s = type { i64 }
+
+ declare void @good(i32, i32, i32, i32)
+
+ declare void @struct(%struct.s* byval, i32, i32, i32)
+
+ ; Function Attrs: optsize
+ define void @test9() #0 {
+ entry:
+ %p = alloca i32, align 4
+ %q = alloca i32, align 4
+ %s = alloca %struct.s, align 4
+ call void @good(i32 1, i32 2, i32 3, i32 4)
+ %pv = ptrtoint i32* %p to i32
+ %qv = ptrtoint i32* %q to i32
+ call void @struct(%struct.s* byval %s, i32 6, i32 %qv, i32 %pv)
+ ret void
+ }
+
+ ; Function Attrs: nounwind
+ declare void @llvm.stackprotector(i8*, i8**) #1
+
+ attributes #0 = { optsize }
+ attributes #1 = { nounwind }
+
+...
+---
+# CHECK-LABEL: test9
+# CHECK: ADJCALLSTACKDOWN32 16, 0, 16, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+# CHECK-NEXT: PUSH32i8 4, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32i8 3, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32i8 2, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32i8 1, implicit-def %esp, implicit %esp
+# CHECK-NEXT: CALLpcrel32 @good, csr_32, implicit %esp, implicit %ssp, implicit-def %esp, implicit-def %ssp
+# CHECK-NEXT: ADJCALLSTACKUP32 16, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+# CHECK-NEXT: ADJCALLSTACKDOWN32 20, 0, 20, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+# CHECK-NEXT: %1:gr32 = MOV32rm %stack.2.s, 1, %noreg, 0, %noreg :: (load 4 from %stack.2.s, align 8)
+# CHECK-NEXT: %2:gr32 = MOV32rm %stack.2.s, 1, %noreg, 4, %noreg :: (load 4 from %stack.2.s + 4)
+# CHECK-NEXT: %4:gr32 = LEA32r %stack.0.p, 1, %noreg, 0, %noreg
+# CHECK-NEXT: %5:gr32 = LEA32r %stack.1.q, 1, %noreg, 0, %noreg
+# CHECK-NEXT: PUSH32r %4, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32r %5, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32i8 6, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32r %2, implicit-def %esp, implicit %esp
+# CHECK-NEXT: PUSH32r %1, implicit-def %esp, implicit %esp
+# CHECK-NEXT: CALLpcrel32 @struct, csr_32, implicit %esp, implicit %ssp, implicit-def %esp, implicit-def %ssp
+# CHECK-NEXT: ADJCALLSTACKUP32 20, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+# CHECK-NEXT: RET 0
+name: test9
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr32, preferred-register: '' }
+ - { id: 1, class: gr32, preferred-register: '' }
+ - { id: 2, class: gr32, preferred-register: '' }
+ - { id: 3, class: gr32, preferred-register: '' }
+ - { id: 4, class: gr32, preferred-register: '' }
+ - { id: 5, class: gr32, preferred-register: '' }
+liveins:
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 8
+ adjustsStack: false
+ hasCalls: true
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+ - { id: 0, name: p, type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 1, name: q, type: default, offset: 0, size: 4, alignment: 4,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+ - { id: 2, name: s, type: default, offset: 0, size: 8, alignment: 8,
+ stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+ di-variable: '', di-expression: '', di-location: '' }
+constants:
+body: |
+ bb.0.entry:
+ ADJCALLSTACKDOWN32 16, 0, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+ %0 = COPY %esp
+ MOV32mi %0, 1, %noreg, 12, %noreg, 4 :: (store 4 into stack + 12)
+ MOV32mi %0, 1, %noreg, 8, %noreg, 3 :: (store 4 into stack + 8)
+ MOV32mi %0, 1, %noreg, 4, %noreg, 2 :: (store 4 into stack + 4)
+ MOV32mi %0, 1, %noreg, 0, %noreg, 1 :: (store 4 into stack)
+ CALLpcrel32 @good, csr_32, implicit %esp, implicit %ssp, implicit-def %esp, implicit-def %ssp
+ ADJCALLSTACKUP32 16, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+ ADJCALLSTACKDOWN32 20, 0, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+ %1 = MOV32rm %stack.2.s, 1, %noreg, 0, %noreg :: (load 4 from %stack.2.s, align 8)
+ %2 = MOV32rm %stack.2.s, 1, %noreg, 4, %noreg :: (load 4 from %stack.2.s + 4)
+ %3 = COPY %esp
+ MOV32mr %3, 1, %noreg, 4, %noreg, killed %2 :: (store 4)
+ MOV32mr %3, 1, %noreg, 0, %noreg, killed %1 :: (store 4)
+ %4 = LEA32r %stack.0.p, 1, %noreg, 0, %noreg
+ MOV32mr %3, 1, %noreg, 16, %noreg, killed %4 :: (store 4 into stack + 16)
+ %5 = LEA32r %stack.1.q, 1, %noreg, 0, %noreg
+ MOV32mr %3, 1, %noreg, 12, %noreg, killed %5 :: (store 4 into stack + 12)
+ MOV32mi %3, 1, %noreg, 8, %noreg, 6 :: (store 4 into stack + 8)
+ CALLpcrel32 @struct, csr_32, implicit %esp, implicit %ssp, implicit-def %esp, implicit-def %ssp,
+ ADJCALLSTACKUP32 20, 0, implicit-def dead %esp, implicit-def dead %eflags, implicit-def dead %ssp, implicit %esp, implicit %ssp
+ RET 0
+
+...
diff --git a/test/CodeGen/X86/movtopush64.ll b/test/CodeGen/X86/movtopush64.ll
index 1f4aa18c3227..76dd7402bfac 100644
--- a/test/CodeGen/X86/movtopush64.ll
+++ b/test/CodeGen/X86/movtopush64.ll
@@ -4,6 +4,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -no-x86-call-frame-opt | FileCheck %s -check-prefix=NOPUSH
declare void @seven_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g)
+declare void @eightparams(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h)
+declare void @eightparams16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h)
+declare void @eightparams64(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h)
declare void @ten_params(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i64 %h, i32 %i, i64 %j)
declare void @ten_params_ptr(i32 %a, i64 %b, i32 %c, i64 %d, i32 %e, i64 %f, i32 %g, i8* %h, i32 %i, i64 %j)
declare void @cannot_push(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i)
@@ -191,3 +194,33 @@ define void @test10(float %p1) {
call void @ten_params(i32 1, i64 2, i32 3, i64 4, i32 5, i64 6, i32 7, i64 8, i32 9, i64 10)
ret void
}
+
+; NORMAL-LABEL: pr34863_16
+; NORMAL: pushq ${{-1|65535}}
+; NORMAL-NEXT: pushq $0
+; NORMAL-NEXT: call
+define void @pr34863_16(i16 %x) minsize nounwind {
+entry:
+ tail call void @eightparams16(i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 %x, i16 0, i16 -1)
+ ret void
+}
+
+; NORMAL-LABEL: pr34863_32
+; NORMAL: pushq ${{-1|65535}}
+; NORMAL-NEXT: pushq $0
+; NORMAL-NEXT: call
+define void @pr34863_32(i32 %x) minsize nounwind {
+entry:
+ tail call void @eightparams(i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 %x, i32 0, i32 -1)
+ ret void
+}
+
+; NORMAL-LABEL: pr34863_64
+; NORMAL: pushq ${{-1|65535}}
+; NORMAL-NEXT: pushq $0
+; NORMAL-NEXT: call
+define void @pr34863_64(i64 %x) minsize nounwind {
+entry:
+ tail call void @eightparams64(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 0, i64 -1)
+ ret void
+}
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index ec0630a4ce08..62525b072bb5 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=core2 -no-integrated-as | FileCheck %s
define i32 @t1() nounwind {
entry:
diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll
index 7b39bfe1c484..2036eae670f6 100644
--- a/test/CodeGen/X86/mul-constant-i16.ll
+++ b/test/CodeGen/X86/mul-constant-i16.ll
@@ -4,12 +4,12 @@
define i16 @test_mul_by_1(i16 %x) {
; X86-LABEL: test_mul_by_1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 1
@@ -18,17 +18,17 @@ define i16 @test_mul_by_1(i16 %x) {
define i16 @test_mul_by_2(i16 %x) {
; X86-LABEL: test_mul_by_2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_2:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 2
ret i16 %mul
@@ -36,17 +36,17 @@ define i16 @test_mul_by_2(i16 %x) {
define i16 @test_mul_by_3(i16 %x) {
; X86-LABEL: test_mul_by_3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_3:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 3
ret i16 %mul
@@ -54,17 +54,17 @@ define i16 @test_mul_by_3(i16 %x) {
define i16 @test_mul_by_4(i16 %x) {
; X86-LABEL: test_mul_by_4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_4:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (,%rdi,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 4
ret i16 %mul
@@ -72,17 +72,17 @@ define i16 @test_mul_by_4(i16 %x) {
define i16 @test_mul_by_5(i16 %x) {
; X86-LABEL: test_mul_by_5:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_5:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 5
ret i16 %mul
@@ -90,19 +90,19 @@ define i16 @test_mul_by_5(i16 %x) {
define i16 @test_mul_by_6(i16 %x) {
; X86-LABEL: test_mul_by_6:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_6:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: addl %edi, %edi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 6
ret i16 %mul
@@ -110,19 +110,19 @@ define i16 @test_mul_by_6(i16 %x) {
define i16 @test_mul_by_7(i16 %x) {
; X86-LABEL: test_mul_by_7:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (,%ecx,8), %eax
; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_7:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (,%rdi,8), %eax
; X64-NEXT: subl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 7
ret i16 %mul
@@ -130,17 +130,17 @@ define i16 @test_mul_by_7(i16 %x) {
define i16 @test_mul_by_8(i16 %x) {
; X86-LABEL: test_mul_by_8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $3, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_8:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (,%rdi,8), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 8
ret i16 %mul
@@ -148,17 +148,17 @@ define i16 @test_mul_by_8(i16 %x) {
define i16 @test_mul_by_9(i16 %x) {
; X86-LABEL: test_mul_by_9:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_9:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 9
ret i16 %mul
@@ -166,19 +166,19 @@ define i16 @test_mul_by_9(i16 %x) {
define i16 @test_mul_by_10(i16 %x) {
; X86-LABEL: test_mul_by_10:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_10:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: addl %edi, %edi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 10
ret i16 %mul
@@ -186,19 +186,19 @@ define i16 @test_mul_by_10(i16 %x) {
define i16 @test_mul_by_11(i16 %x) {
; X86-LABEL: test_mul_by_11:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_11:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
; X64-NEXT: leal (%rdi,%rax,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 11
ret i16 %mul
@@ -206,19 +206,19 @@ define i16 @test_mul_by_11(i16 %x) {
define i16 @test_mul_by_12(i16 %x) {
; X86-LABEL: test_mul_by_12:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_12:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: shll $2, %edi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 12
ret i16 %mul
@@ -226,19 +226,19 @@ define i16 @test_mul_by_12(i16 %x) {
define i16 @test_mul_by_13(i16 %x) {
; X86-LABEL: test_mul_by_13:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_13:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
; X64-NEXT: leal (%rdi,%rax,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 13
ret i16 %mul
@@ -246,21 +246,21 @@ define i16 @test_mul_by_13(i16 %x) {
define i16 @test_mul_by_14(i16 %x) {
; X86-LABEL: test_mul_by_14:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %eax
; X86-NEXT: leal (%ecx,%eax,4), %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_14:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: addl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 14
ret i16 %mul
@@ -268,19 +268,19 @@ define i16 @test_mul_by_14(i16 %x) {
define i16 @test_mul_by_15(i16 %x) {
; X86-LABEL: test_mul_by_15:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_15:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 15
ret i16 %mul
@@ -288,14 +288,14 @@ define i16 @test_mul_by_15(i16 %x) {
define i16 @test_mul_by_16(i16 %x) {
; X86-LABEL: test_mul_by_16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $4, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shll $4, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -305,21 +305,21 @@ define i16 @test_mul_by_16(i16 %x) {
define i16 @test_mul_by_17(i16 %x) {
; X86-LABEL: test_mul_by_17:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $4, %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_17:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $4, %eax
; X64-NEXT: leal (%rax,%rdi), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 17
ret i16 %mul
@@ -327,19 +327,19 @@ define i16 @test_mul_by_17(i16 %x) {
define i16 @test_mul_by_18(i16 %x) {
; X86-LABEL: test_mul_by_18:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_18:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: addl %edi, %edi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 18
ret i16 %mul
@@ -347,21 +347,21 @@ define i16 @test_mul_by_18(i16 %x) {
define i16 @test_mul_by_19(i16 %x) {
; X86-LABEL: test_mul_by_19:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,4), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_19:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
; X64-NEXT: shll $2, %eax
; X64-NEXT: subl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 19
ret i16 %mul
@@ -369,19 +369,19 @@ define i16 @test_mul_by_19(i16 %x) {
define i16 @test_mul_by_20(i16 %x) {
; X86-LABEL: test_mul_by_20:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_20:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: shll $2, %edi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 20
ret i16 %mul
@@ -389,19 +389,19 @@ define i16 @test_mul_by_20(i16 %x) {
define i16 @test_mul_by_21(i16 %x) {
; X86-LABEL: test_mul_by_21:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_21:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
; X64-NEXT: leal (%rdi,%rax,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 21
ret i16 %mul
@@ -409,21 +409,21 @@ define i16 @test_mul_by_21(i16 %x) {
define i16 @test_mul_by_22(i16 %x) {
; X86-LABEL: test_mul_by_22:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,4), %eax
; X86-NEXT: leal (%ecx,%eax,4), %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_22:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
; X64-NEXT: leal (%rdi,%rax,4), %eax
; X64-NEXT: addl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 22
ret i16 %mul
@@ -431,21 +431,21 @@ define i16 @test_mul_by_22(i16 %x) {
define i16 @test_mul_by_23(i16 %x) {
; X86-LABEL: test_mul_by_23:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_23:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
; X64-NEXT: shll $3, %eax
; X64-NEXT: subl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 23
ret i16 %mul
@@ -453,19 +453,19 @@ define i16 @test_mul_by_23(i16 %x) {
define i16 @test_mul_by_24(i16 %x) {
; X86-LABEL: test_mul_by_24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_24:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: shll $3, %edi
; X64-NEXT: leal (%rdi,%rdi,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 24
ret i16 %mul
@@ -473,19 +473,19 @@ define i16 @test_mul_by_24(i16 %x) {
define i16 @test_mul_by_25(i16 %x) {
; X86-LABEL: test_mul_by_25:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_25:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,4), %eax
; X64-NEXT: leal (%rax,%rax,4), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 25
ret i16 %mul
@@ -493,21 +493,21 @@ define i16 @test_mul_by_25(i16 %x) {
define i16 @test_mul_by_26(i16 %x) {
; X86-LABEL: test_mul_by_26:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_26:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
; X64-NEXT: leal (%rax,%rax,2), %eax
; X64-NEXT: subl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 26
ret i16 %mul
@@ -515,19 +515,19 @@ define i16 @test_mul_by_26(i16 %x) {
define i16 @test_mul_by_27(i16 %x) {
; X86-LABEL: test_mul_by_27:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_27:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
; X64-NEXT: leal (%rax,%rax,2), %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 27
ret i16 %mul
@@ -535,21 +535,21 @@ define i16 @test_mul_by_27(i16 %x) {
define i16 @test_mul_by_28(i16 %x) {
; X86-LABEL: test_mul_by_28:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_28:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
; X64-NEXT: leal (%rax,%rax,2), %eax
; X64-NEXT: addl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 28
ret i16 %mul
@@ -557,23 +557,23 @@ define i16 @test_mul_by_28(i16 %x) {
define i16 @test_mul_by_29(i16 %x) {
; X86-LABEL: test_mul_by_29:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_29:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal (%rdi,%rdi,8), %eax
; X64-NEXT: leal (%rax,%rax,2), %eax
; X64-NEXT: addl %edi, %eax
; X64-NEXT: addl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 29
ret i16 %mul
@@ -581,22 +581,22 @@ define i16 @test_mul_by_29(i16 %x) {
define i16 @test_mul_by_30(i16 %x) {
; X86-LABEL: test_mul_by_30:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_30:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: subl %edi, %eax
; X64-NEXT: subl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 30
ret i16 %mul
@@ -604,20 +604,20 @@ define i16 @test_mul_by_30(i16 %x) {
define i16 @test_mul_by_31(i16 %x) {
; X86-LABEL: test_mul_by_31:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_31:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $5, %eax
; X64-NEXT: subl %edi, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 31
ret i16 %mul
@@ -625,14 +625,14 @@ define i16 @test_mul_by_31(i16 %x) {
define i16 @test_mul_by_32(i16 %x) {
; X86-LABEL: test_mul_by_32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $5, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_by_32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shll $5, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -643,21 +643,21 @@ define i16 @test_mul_by_32(i16 %x) {
; (x*9+42)*(x*5+2)
define i16 @test_mul_spec(i16 %x) nounwind {
; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal 42(%eax,%eax,8), %ecx
; X86-NEXT: leal 2(%eax,%eax,4), %eax
; X86-NEXT: imull %ecx, %eax
-; X86-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mul_spec:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal 42(%rdi,%rdi,8), %ecx
; X64-NEXT: leal 2(%rdi,%rdi,4), %eax
; X64-NEXT: imull %ecx, %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 9
%add = add nsw i16 %mul, 42
diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll
index d545b477e102..83024f5c6f99 100644
--- a/test/CodeGen/X86/mul-constant-i32.ll
+++ b/test/CodeGen/X86/mul-constant-i32.ll
@@ -10,42 +10,42 @@
define i32 @test_mul_by_1(i32 %x) {
; X86-LABEL: test_mul_by_1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_1:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_1:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_1:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_1:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_1:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 1
@@ -54,50 +54,50 @@ define i32 @test_mul_by_1(i32 %x) {
define i32 @test_mul_by_2(i32 %x) {
; X86-LABEL: test_mul_by_2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_2:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_2:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: addl %eax, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_2:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_2:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_2:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 2
@@ -106,48 +106,48 @@ define i32 @test_mul_by_2(i32 %x) {
define i32 @test_mul_by_3(i32 %x) {
; X86-LABEL: test_mul_by_3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_3:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_3:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_3:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_3:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_3:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 3
@@ -156,50 +156,50 @@ define i32 @test_mul_by_3(i32 %x) {
define i32 @test_mul_by_4(i32 %x) {
; X86-LABEL: test_mul_by_4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_4:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_4:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $2, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_4:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_4:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_4:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 4
@@ -208,48 +208,48 @@ define i32 @test_mul_by_4(i32 %x) {
define i32 @test_mul_by_5(i32 %x) {
; X86-LABEL: test_mul_by_5:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_5:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_5:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_5:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_5:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_5:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 5
@@ -258,50 +258,50 @@ define i32 @test_mul_by_5(i32 %x) {
define i32 @test_mul_by_6(i32 %x) {
; X86-LABEL: test_mul_by_6:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_6:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_6:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_6:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_6:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_6:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 6
@@ -310,50 +310,50 @@ define i32 @test_mul_by_6(i32 %x) {
define i32 @test_mul_by_7(i32 %x) {
; X86-LABEL: test_mul_by_7:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (,%ecx,8), %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_7:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_7:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_7:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_7:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_7:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 7
@@ -362,50 +362,50 @@ define i32 @test_mul_by_7(i32 %x) {
define i32 @test_mul_by_8(i32 %x) {
; X86-LABEL: test_mul_by_8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_8:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_8:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $3, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_8:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_8:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_8:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 8
@@ -414,48 +414,48 @@ define i32 @test_mul_by_8(i32 %x) {
define i32 @test_mul_by_9(i32 %x) {
; X86-LABEL: test_mul_by_9:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_9:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_9:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_9:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_9:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_9:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 9
@@ -464,50 +464,50 @@ define i32 @test_mul_by_9(i32 %x) {
define i32 @test_mul_by_10(i32 %x) {
; X86-LABEL: test_mul_by_10:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_10:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_10:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_10:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_10:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_10:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 10
@@ -516,48 +516,48 @@ define i32 @test_mul_by_10(i32 %x) {
define i32 @test_mul_by_11(i32 %x) {
; X86-LABEL: test_mul_by_11:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_11:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_11:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_11:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_11:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_11:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 11
@@ -566,50 +566,50 @@ define i32 @test_mul_by_11(i32 %x) {
define i32 @test_mul_by_12(i32 %x) {
; X86-LABEL: test_mul_by_12:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_12:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_12:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_12:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_12:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_12:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 12
@@ -618,48 +618,48 @@ define i32 @test_mul_by_12(i32 %x) {
define i32 @test_mul_by_13(i32 %x) {
; X86-LABEL: test_mul_by_13:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_13:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_13:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_13:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_13:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_13:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 13
@@ -668,7 +668,7 @@ define i32 @test_mul_by_13(i32 %x) {
define i32 @test_mul_by_14(i32 %x) {
; X86-LABEL: test_mul_by_14:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %eax
; X86-NEXT: leal (%ecx,%eax,4), %eax
@@ -676,43 +676,43 @@ define i32 @test_mul_by_14(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_14:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_14:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_14:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_14:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_14:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 14
@@ -721,50 +721,50 @@ define i32 @test_mul_by_14(i32 %x) {
define i32 @test_mul_by_15(i32 %x) {
; X86-LABEL: test_mul_by_15:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_15:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_15:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_15:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_15:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_15:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 15
@@ -773,49 +773,49 @@ define i32 @test_mul_by_15(i32 %x) {
define i32 @test_mul_by_16(i32 %x) {
; X86-LABEL: test_mul_by_16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $4, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_16:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_16:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $4, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_16:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
+; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_16:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00]
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_16:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00]
; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
@@ -825,7 +825,7 @@ define i32 @test_mul_by_16(i32 %x) {
define i32 @test_mul_by_17(i32 %x) {
; X86-LABEL: test_mul_by_17:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $4, %eax
@@ -833,46 +833,46 @@ define i32 @test_mul_by_17(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_17:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_17:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_17:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_17:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_17:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 17
@@ -881,50 +881,50 @@ define i32 @test_mul_by_17(i32 %x) {
define i32 @test_mul_by_18(i32 %x) {
; X86-LABEL: test_mul_by_18:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_18:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_18:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_18:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_18:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_18:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 18
@@ -933,7 +933,7 @@ define i32 @test_mul_by_18(i32 %x) {
define i32 @test_mul_by_19(i32 %x) {
; X86-LABEL: test_mul_by_19:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,4), %eax
; X86-NEXT: shll $2, %eax
@@ -941,43 +941,43 @@ define i32 @test_mul_by_19(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_19:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: shll $2, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $2, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_19:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_19:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_19:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_19:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 19
@@ -986,50 +986,50 @@ define i32 @test_mul_by_19(i32 %x) {
define i32 @test_mul_by_20(i32 %x) {
; X86-LABEL: test_mul_by_20:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_20:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_20:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_20:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_20:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_20:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 20
@@ -1038,48 +1038,48 @@ define i32 @test_mul_by_20(i32 %x) {
define i32 @test_mul_by_21(i32 %x) {
; X86-LABEL: test_mul_by_21:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_21:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_21:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_21:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_21:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_21:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 21
@@ -1088,7 +1088,7 @@ define i32 @test_mul_by_21(i32 %x) {
define i32 @test_mul_by_22(i32 %x) {
; X86-LABEL: test_mul_by_22:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,4), %eax
; X86-NEXT: leal (%ecx,%eax,4), %eax
@@ -1096,43 +1096,43 @@ define i32 @test_mul_by_22(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_22:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_22:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_22:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_22:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_22:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 22
@@ -1141,7 +1141,7 @@ define i32 @test_mul_by_22(i32 %x) {
define i32 @test_mul_by_23(i32 %x) {
; X86-LABEL: test_mul_by_23:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %eax
; X86-NEXT: shll $3, %eax
@@ -1149,43 +1149,43 @@ define i32 @test_mul_by_23(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_23:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_23:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_23:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_23:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_23:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 23
@@ -1194,50 +1194,50 @@ define i32 @test_mul_by_23(i32 %x) {
define i32 @test_mul_by_24(i32 %x) {
; X86-LABEL: test_mul_by_24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_24:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_24:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_24:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_24:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_24:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 24
@@ -1246,50 +1246,50 @@ define i32 @test_mul_by_24(i32 %x) {
define i32 @test_mul_by_25(i32 %x) {
; X86-LABEL: test_mul_by_25:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_25:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_25:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_25:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_25:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_25:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 25
@@ -1298,7 +1298,7 @@ define i32 @test_mul_by_25(i32 %x) {
define i32 @test_mul_by_26(i32 %x) {
; X86-LABEL: test_mul_by_26:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
@@ -1306,43 +1306,43 @@ define i32 @test_mul_by_26(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_26:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_26:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_26:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_26:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_26:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 26
@@ -1351,50 +1351,50 @@ define i32 @test_mul_by_26(i32 %x) {
define i32 @test_mul_by_27(i32 %x) {
; X86-LABEL: test_mul_by_27:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_27:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_27:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_27:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_27:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_27:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 27
@@ -1403,7 +1403,7 @@ define i32 @test_mul_by_27(i32 %x) {
define i32 @test_mul_by_28(i32 %x) {
; X86-LABEL: test_mul_by_28:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
@@ -1411,43 +1411,43 @@ define i32 @test_mul_by_28(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_28:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_28:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_28:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_28:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_28:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 28
@@ -1456,7 +1456,7 @@ define i32 @test_mul_by_28(i32 %x) {
define i32 @test_mul_by_29(i32 %x) {
; X86-LABEL: test_mul_by_29:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
@@ -1465,17 +1465,17 @@ define i32 @test_mul_by_29(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_29:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
@@ -1483,27 +1483,27 @@ define i32 @test_mul_by_29(i32 %x) {
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_29:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_29:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_29:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_29:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 29
@@ -1512,7 +1512,7 @@ define i32 @test_mul_by_29(i32 %x) {
define i32 @test_mul_by_30(i32 %x) {
; X86-LABEL: test_mul_by_30:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
@@ -1521,43 +1521,43 @@ define i32 @test_mul_by_30(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_30:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_30:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_30:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_30:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_30:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 30
@@ -1566,7 +1566,7 @@ define i32 @test_mul_by_30(i32 %x) {
define i32 @test_mul_by_31(i32 %x) {
; X86-LABEL: test_mul_by_31:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
@@ -1574,43 +1574,43 @@ define i32 @test_mul_by_31(i32 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_31:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_31:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_31:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_31:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_31:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 31
@@ -1619,49 +1619,49 @@ define i32 @test_mul_by_31(i32 %x) {
define i32 @test_mul_by_32(i32 %x) {
; X86-LABEL: test_mul_by_32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_32:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50]
-; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_32:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $5, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_32:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.17]
+; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_32:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00]
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_32:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00]
; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
@@ -1672,7 +1672,7 @@ define i32 @test_mul_by_32(i32 %x) {
; (x*9+42)*(x*5+2)
define i32 @test_mul_spec(i32 %x) nounwind {
; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal 42(%eax,%eax,8), %ecx
; X86-NEXT: leal 2(%eax,%eax,4), %eax
@@ -1680,25 +1680,25 @@ define i32 @test_mul_spec(i32 %x) nounwind {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_spec:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25]
-; X64-HSW-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: # kill: def %edi killed %edi def %rdi
; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_spec:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx
; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax
@@ -1706,34 +1706,34 @@ define i32 @test_mul_spec(i32 %x) nounwind {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_spec:
-; HSW-NOOPT: # BB#0:
-; HSW-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT: # %bb.0:
+; HSW-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25]
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [4:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_spec:
-; X64-SLM: # BB#0:
-; X64-SLM-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM: # %bb.0:
+; X64-SLM-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_spec:
-; SLM-NOOPT: # BB#0:
-; SLM-NOOPT-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT: # %bb.0:
+; SLM-NOOPT-NEXT: # kill: def %edi killed %edi def %rdi
; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll
index ea841c761c7b..538bc5c2d60f 100644
--- a/test/CodeGen/X86/mul-constant-i64.ll
+++ b/test/CodeGen/X86/mul-constant-i64.ll
@@ -10,44 +10,44 @@
define i64 @test_mul_by_1(i64 %x) nounwind {
; X86-LABEL: test_mul_by_1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_1:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_1:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_1:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
-; JAG-NOOPT: # BB#0:
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT: # %bb.0:
+; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_1:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_1:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 1
@@ -56,7 +56,7 @@ define i64 @test_mul_by_1(i64 %x) nounwind {
define i64 @test_mul_by_2(i64 %x) {
; X86-LABEL: test_mul_by_2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shldl $1, %eax, %edx
@@ -64,17 +64,17 @@ define i64 @test_mul_by_2(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_2:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_2:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOOPT-NEXT: shldl $1, %eax, %edx
@@ -82,22 +82,22 @@ define i64 @test_mul_by_2(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_2:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_2:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_2:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq (%rdi,%rdi), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 2
@@ -106,7 +106,7 @@ define i64 @test_mul_by_2(i64 %x) {
define i64 @test_mul_by_3(i64 %x) {
; X86-LABEL: test_mul_by_3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $3, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx
@@ -114,17 +114,17 @@ define i64 @test_mul_by_3(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_3:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_3:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $3, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %ecx
@@ -132,22 +132,22 @@ define i64 @test_mul_by_3(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_3:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_3:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_3:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 3
@@ -156,7 +156,7 @@ define i64 @test_mul_by_3(i64 %x) {
define i64 @test_mul_by_4(i64 %x) {
; X86-LABEL: test_mul_by_4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shldl $2, %eax, %edx
@@ -164,17 +164,17 @@ define i64 @test_mul_by_4(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_4:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_4:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOOPT-NEXT: shldl $2, %eax, %edx
@@ -182,22 +182,22 @@ define i64 @test_mul_by_4(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_4:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_4:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_4:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq (,%rdi,4), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 4
@@ -206,7 +206,7 @@ define i64 @test_mul_by_4(i64 %x) {
define i64 @test_mul_by_5(i64 %x) {
; X86-LABEL: test_mul_by_5:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $5, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx
@@ -214,17 +214,17 @@ define i64 @test_mul_by_5(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_5:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_5:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $5, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %ecx
@@ -232,22 +232,22 @@ define i64 @test_mul_by_5(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_5:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_5:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_5:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 5
@@ -256,7 +256,7 @@ define i64 @test_mul_by_5(i64 %x) {
define i64 @test_mul_by_6(i64 %x) {
; X86-LABEL: test_mul_by_6:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: movl $6, %eax
@@ -265,19 +265,19 @@ define i64 @test_mul_by_6(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_6:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_6:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $6, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %ecx
@@ -285,23 +285,23 @@ define i64 @test_mul_by_6(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_6:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_6:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_6:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $6, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 6
@@ -310,7 +310,7 @@ define i64 @test_mul_by_6(i64 %x) {
define i64 @test_mul_by_7(i64 %x) {
; X86-LABEL: test_mul_by_7:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (,%eax,8), %ecx
; X86-NEXT: subl %eax, %ecx
@@ -320,19 +320,19 @@ define i64 @test_mul_by_7(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_7:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_7:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $7, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %ecx
@@ -340,23 +340,23 @@ define i64 @test_mul_by_7(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_7:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_7:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00]
; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_7:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $7, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 7
@@ -365,7 +365,7 @@ define i64 @test_mul_by_7(i64 %x) {
define i64 @test_mul_by_8(i64 %x) {
; X86-LABEL: test_mul_by_8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shldl $3, %eax, %edx
@@ -373,17 +373,17 @@ define i64 @test_mul_by_8(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_8:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_8:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOOPT-NEXT: shldl $3, %eax, %edx
@@ -391,22 +391,22 @@ define i64 @test_mul_by_8(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_8:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_8:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_8:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq (,%rdi,8), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 8
@@ -415,7 +415,7 @@ define i64 @test_mul_by_8(i64 %x) {
define i64 @test_mul_by_9(i64 %x) {
; X86-LABEL: test_mul_by_9:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $9, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx
@@ -423,17 +423,17 @@ define i64 @test_mul_by_9(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_9:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_9:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $9, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %ecx
@@ -441,22 +441,22 @@ define i64 @test_mul_by_9(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_9:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_9:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_9:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 9
@@ -465,7 +465,7 @@ define i64 @test_mul_by_9(i64 %x) {
define i64 @test_mul_by_10(i64 %x) {
; X86-LABEL: test_mul_by_10:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: movl $10, %eax
@@ -474,19 +474,19 @@ define i64 @test_mul_by_10(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_10:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_10:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $10, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %ecx
@@ -494,23 +494,23 @@ define i64 @test_mul_by_10(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_10:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_10:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_10:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $10, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 10
@@ -519,7 +519,7 @@ define i64 @test_mul_by_10(i64 %x) {
define i64 @test_mul_by_11(i64 %x) {
; X86-LABEL: test_mul_by_11:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %ecx
@@ -529,19 +529,19 @@ define i64 @test_mul_by_11(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_11:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_11:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $11, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %ecx
@@ -549,22 +549,22 @@ define i64 @test_mul_by_11(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_11:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_11:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_11:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $11, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 11
@@ -573,7 +573,7 @@ define i64 @test_mul_by_11(i64 %x) {
define i64 @test_mul_by_12(i64 %x) {
; X86-LABEL: test_mul_by_12:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: movl $12, %eax
@@ -582,19 +582,19 @@ define i64 @test_mul_by_12(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_12:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_12:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $12, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %ecx
@@ -602,23 +602,23 @@ define i64 @test_mul_by_12(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_12:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_12:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00]
; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_12:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $12, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 12
@@ -627,7 +627,7 @@ define i64 @test_mul_by_12(i64 %x) {
define i64 @test_mul_by_13(i64 %x) {
; X86-LABEL: test_mul_by_13:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %ecx
@@ -637,19 +637,19 @@ define i64 @test_mul_by_13(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_13:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_13:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $13, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %ecx
@@ -657,22 +657,22 @@ define i64 @test_mul_by_13(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_13:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_13:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_13:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $13, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 13
@@ -681,7 +681,7 @@ define i64 @test_mul_by_13(i64 %x) {
define i64 @test_mul_by_14(i64 %x) {
; X86-LABEL: test_mul_by_14:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %ecx
@@ -692,21 +692,21 @@ define i64 @test_mul_by_14(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_14:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_14:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $14, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %ecx
@@ -714,22 +714,22 @@ define i64 @test_mul_by_14(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_14:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_14:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_14:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $14, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 14
@@ -738,7 +738,7 @@ define i64 @test_mul_by_14(i64 %x) {
define i64 @test_mul_by_15(i64 %x) {
; X86-LABEL: test_mul_by_15:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $15, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
@@ -748,19 +748,19 @@ define i64 @test_mul_by_15(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_15:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_15:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $15, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %ecx
@@ -768,23 +768,23 @@ define i64 @test_mul_by_15(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_15:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_15:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_15:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $15, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 15
@@ -793,7 +793,7 @@ define i64 @test_mul_by_15(i64 %x) {
define i64 @test_mul_by_16(i64 %x) {
; X86-LABEL: test_mul_by_16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shldl $4, %eax, %edx
@@ -801,19 +801,19 @@ define i64 @test_mul_by_16(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_16:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shlq $4, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shlq $4, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_16:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOOPT-NEXT: shldl $4, %eax, %edx
@@ -821,25 +821,25 @@ define i64 @test_mul_by_16(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_16:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: shlq $4, %rdi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_16:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shlq $4, %rdi # sched: [1:1.00]
; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_16:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: shlq $4, %rdi # sched: [1:1.00]
; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
@@ -849,7 +849,7 @@ define i64 @test_mul_by_16(i64 %x) {
define i64 @test_mul_by_17(i64 %x) {
; X86-LABEL: test_mul_by_17:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $4, %ecx
@@ -860,21 +860,21 @@ define i64 @test_mul_by_17(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_17:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $4, %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: shlq $4, %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rdi), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_17:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $17, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %ecx
@@ -882,24 +882,24 @@ define i64 @test_mul_by_17(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_17:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_17:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: shlq $4, %rax # sched: [1:1.00]
; X64-SLM-NEXT: addq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_17:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $17, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 17
@@ -908,7 +908,7 @@ define i64 @test_mul_by_17(i64 %x) {
define i64 @test_mul_by_18(i64 %x) {
; X86-LABEL: test_mul_by_18:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: movl $18, %eax
@@ -917,19 +917,19 @@ define i64 @test_mul_by_18(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_18:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: addq %rdi, %rdi # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: addq %rdi, %rdi # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_18:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $18, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %ecx
@@ -937,23 +937,23 @@ define i64 @test_mul_by_18(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_18:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_18:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_18:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $18, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 18
@@ -962,7 +962,7 @@ define i64 @test_mul_by_18(i64 %x) {
define i64 @test_mul_by_19(i64 %x) {
; X86-LABEL: test_mul_by_19:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: shll $2, %ecx
@@ -973,21 +973,21 @@ define i64 @test_mul_by_19(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_19:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: shlq $2, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: shlq $2, %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_19:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $19, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %ecx
@@ -995,22 +995,22 @@ define i64 @test_mul_by_19(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_19:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_19:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_19:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $19, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 19
@@ -1019,7 +1019,7 @@ define i64 @test_mul_by_19(i64 %x) {
define i64 @test_mul_by_20(i64 %x) {
; X86-LABEL: test_mul_by_20:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: movl $20, %eax
@@ -1028,19 +1028,19 @@ define i64 @test_mul_by_20(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_20:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shlq $2, %rdi # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_20:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $20, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %ecx
@@ -1048,23 +1048,23 @@ define i64 @test_mul_by_20(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_20:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_20:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shlq $2, %rdi # sched: [1:1.00]
; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_20:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $20, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 20
@@ -1073,7 +1073,7 @@ define i64 @test_mul_by_20(i64 %x) {
define i64 @test_mul_by_21(i64 %x) {
; X86-LABEL: test_mul_by_21:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %ecx
@@ -1083,19 +1083,19 @@ define i64 @test_mul_by_21(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_21:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_21:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $21, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %ecx
@@ -1103,22 +1103,22 @@ define i64 @test_mul_by_21(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_21:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_21:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_21:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $21, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 21
@@ -1127,7 +1127,7 @@ define i64 @test_mul_by_21(i64 %x) {
define i64 @test_mul_by_22(i64 %x) {
; X86-LABEL: test_mul_by_22:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %ecx
@@ -1138,21 +1138,21 @@ define i64 @test_mul_by_22(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_22:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_22:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $22, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %ecx
@@ -1160,22 +1160,22 @@ define i64 @test_mul_by_22(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_22:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_22:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_22:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $22, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 22
@@ -1184,7 +1184,7 @@ define i64 @test_mul_by_22(i64 %x) {
define i64 @test_mul_by_23(i64 %x) {
; X86-LABEL: test_mul_by_23:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: shll $3, %ecx
@@ -1195,21 +1195,21 @@ define i64 @test_mul_by_23(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_23:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: shlq $3, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: shlq $3, %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_23:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $23, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %ecx
@@ -1217,22 +1217,22 @@ define i64 @test_mul_by_23(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_23:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_23:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_23:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $23, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 23
@@ -1241,7 +1241,7 @@ define i64 @test_mul_by_23(i64 %x) {
define i64 @test_mul_by_24(i64 %x) {
; X86-LABEL: test_mul_by_24:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: movl $24, %eax
@@ -1250,19 +1250,19 @@ define i64 @test_mul_by_24(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_24:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shlq $3, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shlq $3, %rdi # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_24:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $24, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %ecx
@@ -1270,23 +1270,23 @@ define i64 @test_mul_by_24(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_24:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_24:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shlq $3, %rdi # sched: [1:1.00]
; X64-SLM-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_24:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $24, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 24
@@ -1295,7 +1295,7 @@ define i64 @test_mul_by_24(i64 %x) {
define i64 @test_mul_by_25(i64 %x) {
; X86-LABEL: test_mul_by_25:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $25, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
@@ -1305,19 +1305,19 @@ define i64 @test_mul_by_25(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_25:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_25:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $25, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %ecx
@@ -1325,23 +1325,23 @@ define i64 @test_mul_by_25(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_25:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_25:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: leaq (%rax,%rax,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_25:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $25, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 25
@@ -1350,7 +1350,7 @@ define i64 @test_mul_by_25(i64 %x) {
define i64 @test_mul_by_26(i64 %x) {
; X86-LABEL: test_mul_by_26:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
@@ -1361,21 +1361,21 @@ define i64 @test_mul_by_26(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_26:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_26:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $26, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %ecx
@@ -1383,22 +1383,22 @@ define i64 @test_mul_by_26(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_26:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_26:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_26:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $26, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 26
@@ -1407,7 +1407,7 @@ define i64 @test_mul_by_26(i64 %x) {
define i64 @test_mul_by_27(i64 %x) {
; X86-LABEL: test_mul_by_27:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $27, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
@@ -1417,19 +1417,19 @@ define i64 @test_mul_by_27(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_27:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_27:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $27, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %ecx
@@ -1437,23 +1437,23 @@ define i64 @test_mul_by_27(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_27:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_27:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
; X64-SLM-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_27:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $27, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 27
@@ -1462,7 +1462,7 @@ define i64 @test_mul_by_27(i64 %x) {
define i64 @test_mul_by_28(i64 %x) {
; X86-LABEL: test_mul_by_28:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
@@ -1473,21 +1473,21 @@ define i64 @test_mul_by_28(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_28:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_28:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $28, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %ecx
@@ -1495,22 +1495,22 @@ define i64 @test_mul_by_28(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_28:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_28:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_28:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $28, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 28
@@ -1519,7 +1519,7 @@ define i64 @test_mul_by_28(i64 %x) {
define i64 @test_mul_by_29(i64 %x) {
; X86-LABEL: test_mul_by_29:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %ecx
@@ -1531,15 +1531,15 @@ define i64 @test_mul_by_29(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_29:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-HSW-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: addq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
; X64-JAG-NEXT: leaq (%rax,%rax,2), %rax # sched: [1:0.50]
; X64-JAG-NEXT: addq %rdi, %rax # sched: [1:0.50]
@@ -1547,7 +1547,7 @@ define i64 @test_mul_by_29(i64 %x) {
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_29:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $29, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %ecx
@@ -1555,22 +1555,22 @@ define i64 @test_mul_by_29(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_29:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_29:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_29:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $29, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 29
@@ -1579,7 +1579,7 @@ define i64 @test_mul_by_29(i64 %x) {
define i64 @test_mul_by_30(i64 %x) {
; X86-LABEL: test_mul_by_30:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
@@ -1591,23 +1591,23 @@ define i64 @test_mul_by_30(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_30:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_30:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $30, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %ecx
@@ -1615,22 +1615,22 @@ define i64 @test_mul_by_30(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_30:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_30:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_30:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $30, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 30
@@ -1639,7 +1639,7 @@ define i64 @test_mul_by_30(i64 %x) {
define i64 @test_mul_by_31(i64 %x) {
; X86-LABEL: test_mul_by_31:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: shll $5, %ecx
@@ -1650,21 +1650,21 @@ define i64 @test_mul_by_31(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_31:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
; X64-HSW-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-HSW-NEXT: subq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
-; X64-JAG: # BB#0:
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG: # %bb.0:
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: shlq $5, %rax # sched: [1:0.50]
; X64-JAG-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_31:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl $31, %eax
; X86-NOOPT-NEXT: mull {{[0-9]+}}(%esp)
; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %ecx
@@ -1672,24 +1672,24 @@ define i64 @test_mul_by_31(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_31:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_31:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: shlq $5, %rax # sched: [1:1.00]
; X64-SLM-NEXT: subq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_31:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imulq $31, %rdi, %rax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i64 %x, 31
@@ -1698,7 +1698,7 @@ define i64 @test_mul_by_31(i64 %x) {
define i64 @test_mul_by_32(i64 %x) {
; X86-LABEL: test_mul_by_32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: shldl $5, %eax, %edx
@@ -1706,19 +1706,19 @@ define i64 @test_mul_by_32(i64 %x) {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_32:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shlq $5, %rdi # sched: [1:0.50]
; X64-HSW-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_32:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NOOPT-NEXT: shldl $5, %eax, %edx
@@ -1726,25 +1726,25 @@ define i64 @test_mul_by_32(i64 %x) {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_32:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: shlq $5, %rdi # sched: [1:0.50]
-; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_32:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shlq $5, %rdi # sched: [1:1.00]
; X64-SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_32:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: shlq $5, %rdi # sched: [1:1.00]
; SLM-NOOPT-NEXT: movq %rdi, %rax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
@@ -1755,7 +1755,7 @@ define i64 @test_mul_by_32(i64 %x) {
; (x*9+42)*(x*5+2)
define i64 @test_mul_spec(i64 %x) nounwind {
; X86-LABEL: test_mul_spec:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
@@ -1787,23 +1787,23 @@ define i64 @test_mul_spec(i64 %x) nounwind {
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_spec:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
; X64-HSW-NEXT: addq $42, %rcx # sched: [1:0.25]
; X64-HSW-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-HSW-NEXT: addq $2, %rax # sched: [1:0.25]
; X64-HSW-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; X64-HSW-NEXT: retq # sched: [1:1.00]
+; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
-; X64-JAG: # BB#0:
+; X64-JAG: # %bb.0:
; X64-JAG-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
; X64-JAG-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
; X64-JAG-NEXT: imulq %rcx, %rax # sched: [3:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_spec:
-; X86-NOOPT: # BB#0:
+; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: pushl %ebx
; X86-NOOPT-NEXT: pushl %edi
; X86-NOOPT-NEXT: pushl %esi
@@ -1835,30 +1835,30 @@ define i64 @test_mul_spec(i64 %x) nounwind {
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_spec:
-; HSW-NOOPT: # BB#0:
+; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
; HSW-NOOPT-NEXT: addq $42, %rcx # sched: [1:0.25]
; HSW-NOOPT-NEXT: leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addq $2, %rax # sched: [1:0.25]
; HSW-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
-; HSW-NOOPT-NEXT: retq # sched: [1:1.00]
+; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
-; JAG-NOOPT: # BB#0:
+; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
; JAG-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
; JAG-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_spec:
-; X64-SLM: # BB#0:
+; X64-SLM: # %bb.0:
; X64-SLM-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
; X64-SLM-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
; X64-SLM-NEXT: imulq %rcx, %rax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_spec:
-; SLM-NOOPT: # BB#0:
+; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
; SLM-NOOPT-NEXT: leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
; SLM-NOOPT-NEXT: imulq %rcx, %rax # sched: [3:1.00]
diff --git a/test/CodeGen/X86/mul-constant-result.ll b/test/CodeGen/X86/mul-constant-result.ll
index 65d80a699e24..bec0ed990dc3 100644
--- a/test/CodeGen/X86/mul-constant-result.ll
+++ b/test/CodeGen/X86/mul-constant-result.ll
@@ -1,34 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0| FileCheck %s --check-prefix=X64-HSW
; Function Attrs: norecurse nounwind readnone uwtable
define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-LABEL: mult:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi0:
; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .Lcfi1:
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: cmpl $1, %edx
; X86-NEXT: movl $1, %eax
; X86-NEXT: movl $1, %esi
; X86-NEXT: jg .LBB0_2
-; X86-NEXT: # BB#1:
+; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edx, %esi
; X86-NEXT: .LBB0_2:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: testl %edx, %edx
; X86-NEXT: je .LBB0_4
-; X86-NEXT: # BB#3:
+; X86-NEXT: # %bb.3:
; X86-NEXT: movl %esi, %eax
; X86-NEXT: .LBB0_4:
; X86-NEXT: decl %ecx
; X86-NEXT: cmpl $31, %ecx
; X86-NEXT: ja .LBB0_39
-; X86-NEXT: # BB#5:
+; X86-NEXT: # %bb.5:
; X86-NEXT: jmpl *.LJTI0_0(,%ecx,4)
; X86-NEXT: .LBB0_6:
; X86-NEXT: addl %eax, %eax
@@ -186,8 +187,8 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: retl
;
; X64-HSW-LABEL: mult:
-; X64-HSW: # BB#0:
-; X64-HSW-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW: # %bb.0:
+; X64-HSW-NEXT: # kill: def %edi killed %edi def %rdi
; X64-HSW-NEXT: cmpl $1, %esi
; X64-HSW-NEXT: movl $1, %ecx
; X64-HSW-NEXT: movl %esi, %eax
@@ -197,64 +198,64 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: addl $-1, %edi
; X64-HSW-NEXT: cmpl $31, %edi
; X64-HSW-NEXT: ja .LBB0_36
-; X64-HSW-NEXT: # BB#1:
+; X64-HSW-NEXT: # %bb.1:
; X64-HSW-NEXT: jmpq *.LJTI0_0(,%rdi,8)
; X64-HSW-NEXT: .LBB0_2:
; X64-HSW-NEXT: addl %eax, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_36:
; X64-HSW-NEXT: xorl %eax, %eax
; X64-HSW-NEXT: .LBB0_37:
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_3:
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_4:
; X64-HSW-NEXT: shll $2, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_5:
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_6:
; X64-HSW-NEXT: addl %eax, %eax
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_7:
; X64-HSW-NEXT: leal (,%rax,8), %ecx
; X64-HSW-NEXT: jmp .LBB0_8
; X64-HSW-NEXT: .LBB0_9:
; X64-HSW-NEXT: shll $3, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_10:
; X64-HSW-NEXT: leal (%rax,%rax,8), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_11:
; X64-HSW-NEXT: addl %eax, %eax
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_12:
; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
; X64-HSW-NEXT: leal (%rax,%rcx,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_13:
; X64-HSW-NEXT: shll $2, %eax
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_14:
; X64-HSW-NEXT: leal (%rax,%rax,2), %ecx
; X64-HSW-NEXT: leal (%rax,%rcx,4), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_15:
; X64-HSW-NEXT: leal (%rax,%rax,2), %ecx
@@ -262,11 +263,11 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_18:
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_19:
; X64-HSW-NEXT: shll $4, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_20:
; X64-HSW-NEXT: movl %eax, %ecx
@@ -275,7 +276,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_21:
; X64-HSW-NEXT: addl %eax, %eax
; X64-HSW-NEXT: leal (%rax,%rax,8), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_22:
; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
@@ -284,12 +285,12 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_23:
; X64-HSW-NEXT: shll $2, %eax
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_24:
; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
; X64-HSW-NEXT: leal (%rax,%rcx,4), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_25:
; X64-HSW-NEXT: leal (%rax,%rax,4), %ecx
@@ -303,12 +304,12 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_27:
; X64-HSW-NEXT: shll $3, %eax
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_28:
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_29:
; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx
@@ -317,7 +318,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_30:
; X64-HSW-NEXT: leal (%rax,%rax,8), %eax
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_31:
; X64-HSW-NEXT: leal (%rax,%rax,8), %ecx
@@ -330,7 +331,7 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_17:
; X64-HSW-NEXT: addl %eax, %ecx
; X64-HSW-NEXT: movl %ecx, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_33:
; X64-HSW-NEXT: movl %eax, %ecx
@@ -343,11 +344,11 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X64-HSW-NEXT: .LBB0_8:
; X64-HSW-NEXT: subl %eax, %ecx
; X64-HSW-NEXT: movl %ecx, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
; X64-HSW-NEXT: .LBB0_35:
; X64-HSW-NEXT: shll $5, %eax
-; X64-HSW-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; X64-HSW-NEXT: # kill: def %eax killed %eax killed %rax
; X64-HSW-NEXT: retq
%3 = icmp eq i32 %1, 0
%4 = icmp sgt i32 %1, 1
@@ -523,474 +524,359 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; Function Attrs: norecurse nounwind readnone uwtable
define i32 @foo() local_unnamed_addr #0 {
; X86-LABEL: foo:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: .Lcfi2:
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: pushl %edi
-; X86-NEXT: .Lcfi3:
; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi4:
; X86-NEXT: .cfi_def_cfa_offset 16
-; X86-NEXT: .Lcfi5:
; X86-NEXT: .cfi_offset %esi, -16
-; X86-NEXT: .Lcfi6:
; X86-NEXT: .cfi_offset %edi, -12
-; X86-NEXT: .Lcfi7:
; X86-NEXT: .cfi_offset %ebx, -8
; X86-NEXT: pushl $0
-; X86-NEXT: .Lcfi8:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $1
-; X86-NEXT: .Lcfi9:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi10:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %esi
; X86-NEXT: xorl $1, %esi
; X86-NEXT: pushl $1
-; X86-NEXT: .Lcfi11:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $2
-; X86-NEXT: .Lcfi12:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi13:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $2, %edi
; X86-NEXT: pushl $1
-; X86-NEXT: .Lcfi14:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $3
-; X86-NEXT: .Lcfi15:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi16:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $3, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $2
-; X86-NEXT: .Lcfi17:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $4
-; X86-NEXT: .Lcfi18:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi19:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $4, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $2
-; X86-NEXT: .Lcfi20:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $5
-; X86-NEXT: .Lcfi21:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi22:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $5, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $3
-; X86-NEXT: .Lcfi23:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $6
-; X86-NEXT: .Lcfi24:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi25:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $6, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $3
-; X86-NEXT: .Lcfi26:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $7
-; X86-NEXT: .Lcfi27:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi28:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $7, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $4
-; X86-NEXT: .Lcfi29:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $8
-; X86-NEXT: .Lcfi30:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi31:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $8, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $4
-; X86-NEXT: .Lcfi32:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $9
-; X86-NEXT: .Lcfi33:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi34:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $9, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $5
-; X86-NEXT: .Lcfi35:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $10
-; X86-NEXT: .Lcfi36:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi37:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $10, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $5
-; X86-NEXT: .Lcfi38:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $11
-; X86-NEXT: .Lcfi39:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi40:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $11, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $6
-; X86-NEXT: .Lcfi41:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $12
-; X86-NEXT: .Lcfi42:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi43:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $12, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $6
-; X86-NEXT: .Lcfi44:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $13
-; X86-NEXT: .Lcfi45:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi46:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $13, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $7
-; X86-NEXT: .Lcfi47:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $14
-; X86-NEXT: .Lcfi48:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi49:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $14, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $7
-; X86-NEXT: .Lcfi50:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $15
-; X86-NEXT: .Lcfi51:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi52:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $15, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $8
-; X86-NEXT: .Lcfi53:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $16
-; X86-NEXT: .Lcfi54:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi55:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $16, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $8
-; X86-NEXT: .Lcfi56:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $17
-; X86-NEXT: .Lcfi57:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi58:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $17, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $9
-; X86-NEXT: .Lcfi59:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $18
-; X86-NEXT: .Lcfi60:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi61:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $18, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $9
-; X86-NEXT: .Lcfi62:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $19
-; X86-NEXT: .Lcfi63:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi64:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $19, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $10
-; X86-NEXT: .Lcfi65:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $20
-; X86-NEXT: .Lcfi66:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi67:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $20, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $10
-; X86-NEXT: .Lcfi68:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $21
-; X86-NEXT: .Lcfi69:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi70:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $21, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $11
-; X86-NEXT: .Lcfi71:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $22
-; X86-NEXT: .Lcfi72:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi73:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $22, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $11
-; X86-NEXT: .Lcfi74:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $23
-; X86-NEXT: .Lcfi75:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi76:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $23, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $12
-; X86-NEXT: .Lcfi77:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $24
-; X86-NEXT: .Lcfi78:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi79:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $24, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $12
-; X86-NEXT: .Lcfi80:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $25
-; X86-NEXT: .Lcfi81:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi82:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $25, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $13
-; X86-NEXT: .Lcfi83:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $26
-; X86-NEXT: .Lcfi84:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi85:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $26, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $13
-; X86-NEXT: .Lcfi86:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $27
-; X86-NEXT: .Lcfi87:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi88:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $27, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $14
-; X86-NEXT: .Lcfi89:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $28
-; X86-NEXT: .Lcfi90:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi91:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $28, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $14
-; X86-NEXT: .Lcfi92:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $29
-; X86-NEXT: .Lcfi93:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi94:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $29, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: pushl $15
-; X86-NEXT: .Lcfi95:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $30
-; X86-NEXT: .Lcfi96:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi97:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %edi
; X86-NEXT: xorl $30, %edi
; X86-NEXT: orl %ebx, %edi
; X86-NEXT: pushl $15
-; X86-NEXT: .Lcfi98:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $31
-; X86-NEXT: .Lcfi99:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi100:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: xorl $31, %ebx
; X86-NEXT: orl %edi, %ebx
; X86-NEXT: orl %esi, %ebx
; X86-NEXT: pushl $16
-; X86-NEXT: .Lcfi101:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $32
-; X86-NEXT: .Lcfi102:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll mult
; X86-NEXT: addl $8, %esp
-; X86-NEXT: .Lcfi103:
; X86-NEXT: .cfi_adjust_cfa_offset -8
; X86-NEXT: xorl $32, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: jne .LBB1_2
-; X86-NEXT: # BB#1:
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB1_2:
+; X86-NEXT: setne %cl
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
;
; X64-HSW-LABEL: foo:
-; X64-HSW: # BB#0:
+; X64-HSW: # %bb.0:
; X64-HSW-NEXT: pushq %rbp
-; X64-HSW-NEXT: .Lcfi0:
; X64-HSW-NEXT: .cfi_def_cfa_offset 16
; X64-HSW-NEXT: pushq %r15
-; X64-HSW-NEXT: .Lcfi1:
; X64-HSW-NEXT: .cfi_def_cfa_offset 24
; X64-HSW-NEXT: pushq %r14
-; X64-HSW-NEXT: .Lcfi2:
; X64-HSW-NEXT: .cfi_def_cfa_offset 32
-; X64-HSW-NEXT: pushq %r12
-; X64-HSW-NEXT: .Lcfi3:
-; X64-HSW-NEXT: .cfi_def_cfa_offset 40
; X64-HSW-NEXT: pushq %rbx
-; X64-HSW-NEXT: .Lcfi4:
+; X64-HSW-NEXT: .cfi_def_cfa_offset 40
+; X64-HSW-NEXT: pushq %rax
; X64-HSW-NEXT: .cfi_def_cfa_offset 48
-; X64-HSW-NEXT: .Lcfi5:
-; X64-HSW-NEXT: .cfi_offset %rbx, -48
-; X64-HSW-NEXT: .Lcfi6:
-; X64-HSW-NEXT: .cfi_offset %r12, -40
-; X64-HSW-NEXT: .Lcfi7:
+; X64-HSW-NEXT: .cfi_offset %rbx, -40
; X64-HSW-NEXT: .cfi_offset %r14, -32
-; X64-HSW-NEXT: .Lcfi8:
; X64-HSW-NEXT: .cfi_offset %r15, -24
-; X64-HSW-NEXT: .Lcfi9:
; X64-HSW-NEXT: .cfi_offset %rbp, -16
-; X64-HSW-NEXT: xorl %r12d, %r12d
; X64-HSW-NEXT: movl $1, %edi
; X64-HSW-NEXT: xorl %esi, %esi
; X64-HSW-NEXT: callq mult
@@ -1180,11 +1066,13 @@ define i32 @foo() local_unnamed_addr #0 {
; X64-HSW-NEXT: movl $16, %esi
; X64-HSW-NEXT: callq mult
; X64-HSW-NEXT: xorl $32, %eax
+; X64-HSW-NEXT: xorl %ecx, %ecx
; X64-HSW-NEXT: orl %ebx, %eax
-; X64-HSW-NEXT: movl $-1, %eax
-; X64-HSW-NEXT: cmovel %r12d, %eax
+; X64-HSW-NEXT: setne %cl
+; X64-HSW-NEXT: negl %ecx
+; X64-HSW-NEXT: movl %ecx, %eax
+; X64-HSW-NEXT: addq $8, %rsp
; X64-HSW-NEXT: popq %rbx
-; X64-HSW-NEXT: popq %r12
; X64-HSW-NEXT: popq %r14
; X64-HSW-NEXT: popq %r15
; X64-HSW-NEXT: popq %rbp
diff --git a/test/CodeGen/X86/mul-i1024.ll b/test/CodeGen/X86/mul-i1024.ll
index 87661004373f..9980042a4ccc 100644
--- a/test/CodeGen/X86/mul-i1024.ll
+++ b/test/CodeGen/X86/mul-i1024.ll
@@ -4,4232 +4,6721 @@
define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-LABEL: test_1024:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $2632, %esp # imm = 0xA48
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl 64(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 68(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 72(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 76(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 80(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 84(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 88(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 92(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 96(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 100(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 104(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 108(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 112(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 116(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 120(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 124(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 4(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 8(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 12(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 16(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 24(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 28(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 32(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 36(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 40(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 44(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 48(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 52(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 56(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 60(%eax), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: subl $996, %esp # imm = 0x3E4
; X32-NEXT: movl 12(%ebp), %eax
-; X32-NEXT: movl 48(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 52(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 56(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 60(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 32(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 36(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 40(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 44(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 16(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 24(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 28(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 8(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 12(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 112(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 116(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 120(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 124(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 96(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 100(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 104(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 108(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 80(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 84(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 88(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 92(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 64(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 68(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 72(%eax), %edi
-; X32-NEXT: movl 76(%eax), %esi
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 4(%eax), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edx
-; X32-NEXT: pushl %ecx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
+; X32-NEXT: movl 32(%eax), %eax
+; X32-NEXT: movl %eax, -188(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 8(%ebp), %esi
+; X32-NEXT: movl 48(%esi), %eax
+; X32-NEXT: movl %eax, -440(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: movl %edx, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -132(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl %edx, -884(%ebp) # 4-byte Spill
+; X32-NEXT: movl 32(%esi), %eax
+; X32-NEXT: movl %eax, -416(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -400(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -324(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: movl %ecx, -204(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -892(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 36(%eax), %eax
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, -236(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl %edi, -304(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %eax, -220(%ebp) # 4-byte Spill
+; X32-NEXT: movl 36(%esi), %eax
+; X32-NEXT: movl %eax, -316(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %ecx, -124(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -184(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl -400(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -324(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -656(%ebp) # 4-byte Spill
+; X32-NEXT: leal (%ebx,%edi), %eax
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: leal (%ecx,%edi), %edx
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, -700(%ebp) # 4-byte Spill
+; X32-NEXT: seto %al
+; X32-NEXT: lahf
+; X32-NEXT: movl %eax, %eax
+; X32-NEXT: movl %eax, -640(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -96(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, -112(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, -64(%ebp) # 4-byte Folded Spill
; X32-NEXT: movl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: pushl %ecx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
+; X32-NEXT: setb -160(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: movl %eax, -168(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movl 16(%ecx), %eax
+; X32-NEXT: movl %eax, -348(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, -320(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -180(%ebp) # 4-byte Spill
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl %edx, -428(%ebp) # 4-byte Spill
+; X32-NEXT: movl (%ecx), %eax
+; X32-NEXT: movl %eax, -260(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -264(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -136(%ebp) # 4-byte Spill
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -452(%ebp) # 4-byte Spill
+; X32-NEXT: movl -132(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl -140(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -764(%ebp) # 4-byte Spill
+; X32-NEXT: movl -324(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: movl %ebx, -424(%ebp) # 4-byte Spill
; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: movl %ebx, -256(%ebp) # 4-byte Spill
+; X32-NEXT: movl -100(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -204(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -220(%ebp) # 4-byte Folded Spill
+; X32-NEXT: setb -388(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: movl %eax, -92(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, -28(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %bh
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movzbl %bh, %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl %edi, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 8(%eax), %eax
+; X32-NEXT: movl %eax, -108(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, -104(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -156(%ebp) # 4-byte Spill
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl -256(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movl %ecx, -120(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -60(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 52(%eax), %eax
+; X32-NEXT: movl %eax, -340(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl -140(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl -132(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl %edi, -192(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl %ecx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -216(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 56(%eax), %eax
+; X32-NEXT: movl %eax, -408(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -392(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -412(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl %ebx, -272(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -216(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -24(%ebp) # 4-byte Spill
+; X32-NEXT: addl -28(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -16(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -420(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl -120(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -616(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl -60(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -612(%ebp) # 4-byte Spill
+; X32-NEXT: movl -64(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -184(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movzbl -160(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl -124(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -152(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 40(%eax), %eax
+; X32-NEXT: movl %eax, -352(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -364(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %ebx, -396(%ebp) # 4-byte Spill
+; X32-NEXT: movl -324(%ebp), %edx # 4-byte Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -400(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %edi, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -152(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: addl -28(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -112(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -16(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -196(%ebp) # 4-byte Spill
+; X32-NEXT: seto %al
+; X32-NEXT: lahf
+; X32-NEXT: movl %eax, %eax
+; X32-NEXT: movl %eax, -456(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl -120(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -504(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: adcl -60(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -508(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %ecx
+; X32-NEXT: movl 16(%ecx), %eax
+; X32-NEXT: movl %eax, -212(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %esi, -84(%ebp) # 4-byte Spill
+; X32-NEXT: movl 20(%ecx), %eax
+; X32-NEXT: movl %eax, -252(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl %ebx, -164(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 24(%eax), %eax
+; X32-NEXT: movl %eax, -284(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, -308(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -208(%ebp) # 4-byte Spill
; X32-NEXT: movl %edi, %ebx
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl %esi, %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl -84(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %ebx, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl -324(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edi, -116(%ebp) # 4-byte Spill
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl -400(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -84(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -768(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -296(%ebp) # 4-byte Spill
+; X32-NEXT: movl -112(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -164(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, -776(%ebp) # 4-byte Spill
+; X32-NEXT: movl -44(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -772(%ebp) # 4-byte Spill
+; X32-NEXT: movl -52(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %ebx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -780(%ebp) # 4-byte Spill
+; X32-NEXT: movl -132(%ebp), %edx # 4-byte Reload
; X32-NEXT: movl %edx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl -140(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -448(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -332(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl %esi, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, -648(%ebp) # 4-byte Spill
+; X32-NEXT: movl -272(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -40(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -644(%ebp) # 4-byte Spill
+; X32-NEXT: movl -24(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -572(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 20(%eax), %eax
+; X32-NEXT: movl %eax, -216(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -320(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl -180(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl %esi, -48(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 24(%eax), %eax
+; X32-NEXT: movl %eax, -288(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, -280(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -312(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -320(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -312(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: movl %ebx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: addl -28(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -228(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -16(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -596(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl -120(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -464(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl -60(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -536(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: movl %eax, -124(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -264(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl -136(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %esi, -276(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl %edi, -584(%ebp) # 4-byte Spill
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -432(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 8(%eax), %eax
+; X32-NEXT: movl %eax, -184(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl %ecx, -160(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -268(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl -264(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -240(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -432(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: addl -28(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -344(%ebp) # 4-byte Spill
+; X32-NEXT: movl -276(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: adcl -16(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -404(%ebp) # 4-byte Spill
+; X32-NEXT: pushl %eax
+; X32-NEXT: seto %al
+; X32-NEXT: lahf
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: popl %eax
+; X32-NEXT: movl %edx, -736(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: adcl -120(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -532(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ecx, -172(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -60(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -592(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: movl -116(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -84(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -328(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -368(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -620(%ebp) # 4-byte Spill
+; X32-NEXT: movl -240(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -40(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -788(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: adcl -56(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -784(%ebp) # 4-byte Spill
+; X32-NEXT: movl -180(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -100(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl -320(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -204(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -804(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl -264(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -820(%ebp) # 4-byte Spill
+; X32-NEXT: movl -180(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -116(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: movl %esi, -576(%ebp) # 4-byte Spill
; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, -540(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -800(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -796(%ebp) # 4-byte Spill
+; X32-NEXT: movl -20(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -56(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -792(%ebp) # 4-byte Spill
+; X32-NEXT: movl -220(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -220(%ebp) # 4-byte Spill
+; X32-NEXT: movzbl -388(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl -236(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -376(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 40(%eax), %eax
+; X32-NEXT: movl %eax, -236(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -304(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: movl -100(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %edi
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -204(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl -376(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: movl -180(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -468(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -80(%ebp), %ecx # 4-byte Reload
; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %eax, -816(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %edi, -372(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -812(%ebp) # 4-byte Spill
+; X32-NEXT: movl -20(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %edx, -292(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -808(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -512(%ebp) # 4-byte Spill
+; X32-NEXT: movl -276(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -676(%ebp) # 4-byte Spill
+; X32-NEXT: seto %al
+; X32-NEXT: lahf
+; X32-NEXT: movl %eax, %eax
+; X32-NEXT: movl %eax, -740(%ebp) # 4-byte Spill
+; X32-NEXT: movl -240(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -624(%ebp) # 4-byte Spill
+; X32-NEXT: movl -172(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -628(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %esi
+; X32-NEXT: movl 48(%esi), %eax
+; X32-NEXT: movl %eax, -300(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %ebx, -336(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl 52(%esi), %eax
+; X32-NEXT: movl %eax, -144(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %ebx, %edi
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %esi, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 56(%eax), %eax
+; X32-NEXT: movl %eax, -244(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, -224(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -360(%ebp) # 4-byte Spill
+; X32-NEXT: movl -336(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: movl %edx, -176(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -360(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %ebx, -472(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: movl %edi, -436(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -336(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl -264(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -824(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -588(%ebp) # 4-byte Spill
+; X32-NEXT: movl -276(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -200(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -632(%ebp) # 4-byte Spill
+; X32-NEXT: movl -240(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -828(%ebp) # 4-byte Spill
+; X32-NEXT: movl -172(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -636(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 64(%eax), %eax
+; X32-NEXT: movl %eax, -476(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %esi, -384(%ebp) # 4-byte Spill
+; X32-NEXT: movl -116(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: movl %edx, -480(%ebp) # 4-byte Spill
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl -84(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, -920(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl -256(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: adcl -384(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -932(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 80(%eax), %eax
+; X32-NEXT: movl %eax, -548(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, -380(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -380(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %edx, -356(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl %ecx, -948(%ebp) # 4-byte Spill
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl %ebx, -960(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %ecx
+; X32-NEXT: movl 80(%ecx), %eax
+; X32-NEXT: movl %eax, -552(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, -528(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -524(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: movl -264(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -976(%ebp) # 4-byte Spill
+; X32-NEXT: movl 64(%ecx), %eax
+; X32-NEXT: movl %eax, -520(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, -500(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %esi, -496(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: movl %ecx, -992(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl -180(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -320(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -1008(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: movl -336(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: adcl -176(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -832(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -672(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -200(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -836(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -472(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -840(%ebp) # 4-byte Spill
+; X32-NEXT: movl -20(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -436(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -844(%ebp) # 4-byte Spill
+; X32-NEXT: movl -132(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl -100(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -680(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -80(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -856(%ebp) # 4-byte Spill
+; X32-NEXT: movl -272(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -372(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -852(%ebp) # 4-byte Spill
+; X32-NEXT: movl -24(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -292(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -848(%ebp) # 4-byte Spill
+; X32-NEXT: movl -44(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: addb $127, %al
+; X32-NEXT: sahf
+; X32-NEXT: popl %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -860(%ebp) # 4-byte Spill
+; X32-NEXT: movl -52(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl %eax, -864(%ebp) # 4-byte Spill
+; X32-NEXT: movl -324(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl -400(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -176(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -868(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -684(%ebp) # 4-byte Spill
+; X32-NEXT: movl -112(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -200(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -876(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -472(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -872(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -436(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, -880(%ebp) # 4-byte Spill
+; X32-NEXT: movl -132(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl -140(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, -888(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -688(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -200(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -900(%ebp) # 4-byte Spill
+; X32-NEXT: movl -272(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -896(%ebp) # 4-byte Spill
+; X32-NEXT: movl -24(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, -904(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 68(%eax), %eax
+; X32-NEXT: movl %eax, -248(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -384(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl -480(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %esi, -652(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -96(%ebp) # 1-byte Folded Spill
; X32-NEXT: addl %eax, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: movzbl -96(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 72(%eax), %eax
+; X32-NEXT: movl %eax, -516(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %esi, -484(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -488(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl -384(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: movl -116(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl -480(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, -692(%ebp) # 4-byte Spill
+; X32-NEXT: movl -164(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl -652(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl %esi, -908(%ebp) # 4-byte Spill
+; X32-NEXT: movl -40(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: addl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, -916(%ebp) # 4-byte Spill
+; X32-NEXT: movl -56(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: movl %esi, -912(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %esi # 4-byte Reload
; X32-NEXT: addl %edx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl %esi, -696(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -16(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -652(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -120(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -924(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -60(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -928(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl 84(%eax), %eax
+; X32-NEXT: movl %eax, -544(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -356(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl -380(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl %esi, -660(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 88(%eax), %eax
+; X32-NEXT: movl %eax, -580(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, -600(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -604(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -356(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %dl
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movzbl %dl, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, -704(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %edx # 4-byte Reload
+; X32-NEXT: movl -660(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, -940(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl %edx, -944(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: movl -60(%ebp), %edi # 4-byte Reload
; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, -936(%ebp) # 4-byte Spill
+; X32-NEXT: movl -116(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl %edi, -708(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -660(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -40(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -952(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -56(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -956(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 84(%eax), %eax
+; X32-NEXT: movl %eax, -460(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl -528(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -524(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -668(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %edi
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 88(%eax), %eax
+; X32-NEXT: movl %eax, -492(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %esi, -556(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -560(%ebp) # 4-byte Spill
+; X32-NEXT: movl -524(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movl -528(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %ebx, -732(%ebp) # 4-byte Spill
; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: movl %edx, -728(%ebp) # 4-byte Spill
+; X32-NEXT: addl -136(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -712(%ebp) # 4-byte Spill
+; X32-NEXT: movl -668(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -276(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -968(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl -240(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -964(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -172(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -972(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 68(%eax), %eax
+; X32-NEXT: movl %eax, -444(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -496(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -500(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -664(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -96(%ebp) # 1-byte Folded Spill
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl -96(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 72(%eax), %eax
+; X32-NEXT: movl %eax, -388(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, -564(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %ebx, -568(%ebp) # 4-byte Spill
+; X32-NEXT: movl -500(%ebp), %edx # 4-byte Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -496(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: addl -136(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -716(%ebp) # 4-byte Spill
+; X32-NEXT: movl -664(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: adcl -276(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -988(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: adcl -240(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -984(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: adcl -172(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -980(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl -180(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, -720(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, -664(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: movl %edi, -996(%ebp) # 4-byte Spill
+; X32-NEXT: movl -20(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: movl %ecx, -1000(%ebp) # 4-byte Spill
+; X32-NEXT: movl -524(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl -528(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -320(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -1004(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %eax, -724(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, -668(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ebx, -732(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, -728(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 12(%eax), %eax
+; X32-NEXT: movl %eax, -96(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl -156(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl -104(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, -232(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb -88(%ebp) # 1-byte Folded Spill
; X32-NEXT: addl %eax, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movzbl -88(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %eax, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl -76(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movl -72(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -256(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl %edx, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl -232(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edx, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %edx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl -88(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl %edi, -72(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edi, -76(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: setb %dl
+; X32-NEXT: addl -104(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -232(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movzbl %dl, %edx
+; X32-NEXT: adcl %ebx, %edx
+; X32-NEXT: movl %edx, -608(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -88(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -28(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -116(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -164(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -40(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -120(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -56(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -60(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %ebx, -232(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, -164(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -608(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -88(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 44(%eax), %eax
+; X32-NEXT: movl %eax, -120(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, %ecx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl -364(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -60(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: setb -16(%ebp) # 1-byte Folded Spill
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movzbl -16(%ebp), %ebx # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: movl -64(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl -324(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -152(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -400(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %esi, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %edi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl -324(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %esi, -64(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -112(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl -88(%ebp), %eax # 4-byte Folded Reload
; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %esi, %eax
+; X32-NEXT: setb -88(%ebp) # 1-byte Folded Spill
+; X32-NEXT: addl -364(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -60(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl -88(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -60(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl -324(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl -132(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -112(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -44(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -272(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -52(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -24(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, -88(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, -192(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -60(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: movl %edi, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl -64(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -456(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: pushl %eax
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addb $127, %al
+; X32-NEXT: sahf
+; X32-NEXT: popl %eax
+; X32-NEXT: adcl -72(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -608(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -76(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -760(%ebp) # 4-byte Spill
+; X32-NEXT: movl -88(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -232(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -756(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: adcl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -752(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: adcl -40(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -748(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl -56(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -744(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 12(%eax), %eax
+; X32-NEXT: movl %eax, -60(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl -268(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl -160(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movl %ecx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edx
+; X32-NEXT: setb %cl
; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl %cl, %eax
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, -112(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl -584(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movl -432(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -264(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: movl %esi, -432(%ebp) # 4-byte Spill
+; X32-NEXT: movl -24(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, -456(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl -112(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl %edi, -432(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -276(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edi, -456(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: setb %bl
+; X32-NEXT: addl -160(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -24(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -112(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -136(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -180(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %edi, %edx
+; X32-NEXT: adcl -48(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -240(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -36(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -172(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -20(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %ebx, -584(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, -276(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -24(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -240(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -112(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl -736(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %eax
+; X32-NEXT: addb $127, %al
+; X32-NEXT: sahf
+; X32-NEXT: movl -72(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -432(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -456(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, -232(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edx, -164(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %esi, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, -56(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 44(%eax), %eax
+; X32-NEXT: movl %eax, -112(%ebp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl -128(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl -304(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movl %ebx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl -100(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl -220(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl -376(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl %edx, -376(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl %edx, -220(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl %edi, -376(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -220(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -80(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -220(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %eax
; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -20(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb %dl
+; X32-NEXT: addl -304(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -36(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movzbl %dl, %edx
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %edx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -48(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: addl -336(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -200(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -80(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -472(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -372(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -436(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -292(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl %edx, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -36(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -48(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -740(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %eax
+; X32-NEXT: addb $127, %al
+; X32-NEXT: sahf
+; X32-NEXT: movl -376(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -432(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -220(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -456(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %ebx, -584(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -200(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -276(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, -240(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl %esi, -172(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -640(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %eax
+; X32-NEXT: addb $127, %al
+; X32-NEXT: sahf
+; X32-NEXT: adcl -64(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -376(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -16(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -220(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -88(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -640(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -200(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -44(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -472(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -52(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -436(%ebp) # 4-byte Spill
+; X32-NEXT: movl -408(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 60(%eax), %eax
+; X32-NEXT: movl %eax, -192(%ebp) # 4-byte Spill
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -16(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -92(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -392(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -28(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -256(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -440(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -168(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -92(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -340(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -68(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -764(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -48(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -36(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -16(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -440(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -44(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -132(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -140(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -20(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -48(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl -80(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -16(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -36(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -108(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -80(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -96(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -20(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -20(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -20(%ebp), %edi # 1-byte Folded Reload
; X32-NEXT: adcl %edi, %edx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movzbl %al, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl -392(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -16(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -80(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -36(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: addl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edi, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl -68(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -16(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -420(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -616(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -88(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -612(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -272(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -92(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -364(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -28(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -256(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -36(%ebp) # 4-byte Spill
+; X32-NEXT: movl -416(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, -616(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -92(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -612(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -152(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -316(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -152(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -32(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -424(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -44(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -152(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -68(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -20(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -36(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -416(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, -424(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -96(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -420(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -324(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -400(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -152(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -424(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -44(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -420(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edi, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -20(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -36(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -68(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -108(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -20(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl %ebx, %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ebx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ebx, %eax
-; X32-NEXT: movl %eax, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: addl %esi, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edx, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -44(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -44(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -44(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -364(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl -20(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl -68(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl -44(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl -32(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -196(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -504(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -508(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl -24(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -64(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -48(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -292(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -52(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -16(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -88(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -272(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -52(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -252(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -252(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -364(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -116(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -84(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl -416(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -508(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -68(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -504(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -296(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -768(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -48(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -372(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -64(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -24(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -416(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -284(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -152(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 28(%eax), %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: movl %esi, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -324(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -400(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -372(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -152(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -64(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -68(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -24(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -52(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -372(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -24(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -48(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -68(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -68(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -68(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -364(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %esi # 4-byte Folded Reload
; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -52(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -24(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl -372(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -68(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl -296(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -776(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -772(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -780(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -36(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, -508(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -20(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -504(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -292(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -152(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -44(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -64(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl -16(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -80(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -88(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -272(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -68(%ebp) # 4-byte Spill
+; X32-NEXT: setb -20(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -252(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -272(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -16(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -16(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -392(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -116(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -84(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -440(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -292(%ebp) # 4-byte Spill
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -252(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -372(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -88(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -340(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -88(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -332(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -448(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -36(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -448(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -272(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -16(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -440(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -284(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -48(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -296(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -132(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -140(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -448(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -88(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -296(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl -80(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -16(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -16(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -272(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -80(%ebp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl %edx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -48(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -80(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -80(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -80(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -392(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl -272(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -36(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl -16(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -80(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl -332(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl -648(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -644(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -332(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -572(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl -292(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -52(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -372(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -24(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -88(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -44(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -296(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -68(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl -20(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl %ebx, -272(%ebp) # 4-byte Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl -332(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: addl -32(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -292(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -196(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -372(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -608(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -88(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -760(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -296(%ebp) # 4-byte Spill
+; X32-NEXT: movl -756(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -272(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -752(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -748(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -332(%ebp) # 4-byte Spill
+; X32-NEXT: movl -744(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -168(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 28(%eax), %eax
+; X32-NEXT: movl %eax, -16(%ebp) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -92(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -280(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -28(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -256(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -348(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -572(%ebp) # 4-byte Spill
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -32(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -92(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -448(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -216(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -228(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -428(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -52(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -24(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -44(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -20(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -348(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -196(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -24(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -428(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -180(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -320(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -32(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -196(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -52(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -428(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl -44(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -20(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -52(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -108(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -44(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -96(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -24(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -24(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -24(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -280(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -20(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -44(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -52(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -228(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -596(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -464(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -464(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -536(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -92(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -228(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -160(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -28(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -256(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl -260(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -648(%ebp) # 4-byte Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -92(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -644(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -536(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -124(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -536(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -344(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -452(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -32(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -452(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -228(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -24(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -260(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, -536(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -96(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -596(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -136(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -264(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -452(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -536(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -32(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -596(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -24(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -52(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -228(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -108(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -24(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -32(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -32(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -32(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -160(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -52(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl -24(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl -228(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl -32(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl -344(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -404(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -532(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -592(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl -572(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -448(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -196(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -572(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -428(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -20(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -44(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -464(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -68(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -228(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -428(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -228(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -252(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -452(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -252(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -160(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -116(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -84(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -196(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -228(%ebp) # 4-byte Spill
+; X32-NEXT: movl -260(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -532(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -592(%ebp) # 4-byte Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -532(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -532(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -124(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -368(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -328(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -428(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -448(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -452(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -328(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -196(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -228(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -260(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -284(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -428(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -48(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -452(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -136(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -264(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -448(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -428(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -328(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -452(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -196(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -228(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -448(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -196(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -228(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -196(%ebp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %edx, -328(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -48(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -196(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -328(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -328(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -328(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -160(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -328(%ebp) # 4-byte Spill
+; X32-NEXT: movl -228(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -196(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl -448(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -328(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: addl -368(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -620(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -788(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -784(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -52(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, -592(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -24(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -532(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -572(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -428(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -32(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -452(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: addl %edx, %esi
-; X32-NEXT: adcl %ecx, %edi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movzbl %al, %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: addl -20(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -228(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -44(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -196(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -464(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -620(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -68(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -328(%ebp) # 4-byte Spill
+; X32-NEXT: setb -464(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl %eax, %edi
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -252(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -44(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -44(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -280(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -116(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -84(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl -348(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -252(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -368(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -216(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -368(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -540(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -576(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -20(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -576(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -24(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -44(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -348(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -284(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -368(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -368(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, %eax
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edi, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %edx, %eax
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -48(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -368(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -180(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -320(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl -576(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -20(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -24(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -368(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl -52(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -44(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: setb -576(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -52(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -48(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -24(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: setb -24(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -24(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -280(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: addl %eax, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl %esi, -24(%ebp) # 4-byte Spill
; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: setb %dl
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movzbl %dl, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl -44(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -52(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: movzbl -576(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: movl -24(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -540(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -800(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -796(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -792(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, %edi
+; X32-NEXT: movl -32(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -228(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -68(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -196(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -20(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -620(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -368(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -328(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movzbl -464(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, -44(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -24(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl -344(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -404(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -72(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -76(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -44(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -232(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -52(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -164(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -24(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -40(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -56(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -32(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -616(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -68(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -612(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -20(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -424(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -420(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -368(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -508(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -504(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -152(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -64(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -464(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -292(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -372(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -88(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -296(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -272(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -36(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -332(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -188(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -164(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -280(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl -348(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -152(%ebp) # 4-byte Spill
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -72(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -228(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -216(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -468(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -804(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -76(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -56(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -348(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -236(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -164(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -112(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -232(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -180(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -320(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: addl %eax, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %edx, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -72(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -164(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -232(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -56(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -40(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -72(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -236(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -40(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -112(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -40(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -40(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -40(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -280(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl -56(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -76(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %eax, %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl -72(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -40(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl -468(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -816(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -812(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -344(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -808(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -196(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -148(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -328(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -160(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl -260(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -468(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -508(%ebp) # 4-byte Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -468(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -504(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -124(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -512(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -820(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -196(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -404(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -328(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -196(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -64(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -72(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -260(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -236(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -328(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -112(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -468(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -136(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -264(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -404(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -328(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -196(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -468(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl -64(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -72(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -196(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -236(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -64(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl %edx, -404(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -112(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -404(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -404(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -404(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -160(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -72(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl -64(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzbl -196(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -512(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -676(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -624(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -628(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl -152(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -228(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -164(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -628(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -232(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -624(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -56(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -76(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -344(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -300(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -232(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -164(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -232(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -144(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -228(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull -144(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -160(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -336(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -176(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -152(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -232(%ebp) # 4-byte Spill
+; X32-NEXT: movl -260(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -300(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -404(%ebp) # 4-byte Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -144(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -540(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -196(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -124(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -196(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -588(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -824(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -420(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -228(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -424(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -152(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -232(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -260(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -244(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -228(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 60(%eax), %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, -164(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -196(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -136(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -264(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -420(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -228(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -424(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -196(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl -152(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -232(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -232(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -244(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -152(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -424(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -152(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -152(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -164(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -420(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -152(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -152(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -152(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -160(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -424(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -420(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -232(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -588(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -632(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -828(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -636(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -72(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -404(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -64(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -540(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -628(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -228(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -624(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -196(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -56(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -424(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -76(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -420(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -344(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -636(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -40(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -632(%ebp) # 4-byte Spill
+; X32-NEXT: setb -588(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -300(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -280(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -336(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -176(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl -348(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -300(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -232(%ebp) # 4-byte Spill
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -152(%ebp) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: setb -64(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -216(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -64(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -672(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -832(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -76(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -344(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -72(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -56(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -348(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -244(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -164(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -216(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -180(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -320(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -344(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -72(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -64(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl -56(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -40(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -56(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -244(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -344(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -40(%ebp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl %edx, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -164(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -76(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -76(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -76(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -280(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -312(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl -344(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -40(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movzbl -56(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %ecx
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl -672(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl -836(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -840(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -844(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl -232(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -424(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -152(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -420(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -72(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -636(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -64(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -632(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl -588(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl %ebx, -344(%ebp) # 4-byte Spill
+; X32-NEXT: movl -40(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl -56(%ebp), %ebx # 4-byte Reload
; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, -76(%ebp) # 4-byte Folded Spill
+; X32-NEXT: addl -512(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -232(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -676(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -152(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -432(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -456(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -344(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -584(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -276(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -240(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -172(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -32(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, -508(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -68(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -504(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -20(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -328(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -368(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -468(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -44(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -404(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -52(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -540(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -24(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -228(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -464(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl %edx, -196(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -232(%ebp), %edx # 4-byte Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl -152(%ebp), %esi # 4-byte Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl -72(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl -64(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl $0, %ebx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl $0, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -56(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: addl -292(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -232(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -372(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -152(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -88(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -296(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -272(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -344(%ebp) # 4-byte Spill
+; X32-NEXT: movl -40(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -36(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl -56(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -332(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -80(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: setb -372(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -188(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -240(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -276(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -240(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -148(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -240(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -392(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl -440(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -272(%ebp) # 4-byte Spill
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -36(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -296(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -340(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -680(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -884(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -276(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -240(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -172(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -440(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -236(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -276(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -112(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -240(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -132(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -140(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl %al, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl -20(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -276(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -36(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -240(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: addl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl %ebx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -80(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -172(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -20(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -236(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -80(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -172(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -112(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -172(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -172(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -172(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -392(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl -80(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -36(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl -20(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -172(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -680(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -856(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -36(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -852(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -292(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -848(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -20(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -148(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -364(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -24(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl %edi, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -416(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -432(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -88(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -456(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -316(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -656(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -892(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -44(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -52(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -88(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -24(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -20(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -416(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -236(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -44(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -112(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -52(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -324(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -400(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -32(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -44(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -88(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl -24(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -20(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -24(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -236(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -88(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -112(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -88(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -32(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -32(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -32(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -364(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -20(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -88(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -24(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -656(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -700(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -860(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -864(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl -272(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -296(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -88(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -276(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -332(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -240(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -368(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -36(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -292(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -172(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -300(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -276(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -240(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -364(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -336(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -176(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -296(%ebp) # 4-byte Spill
+; X32-NEXT: movl -416(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -300(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -24(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -272(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -68(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -316(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -68(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -684(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -868(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -276(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -512(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -240(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -32(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -296(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -416(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -244(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -276(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -164(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -240(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -324(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -400(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -512(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -276(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -68(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -240(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -32(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -296(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -512(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -244(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -296(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -32(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -164(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -68(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb -68(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl -68(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -364(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -68(%ebp) # 4-byte Spill
+; X32-NEXT: movl -296(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl -32(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl -512(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl -68(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl -684(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -876(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -872(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -880(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -20(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, -24(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -88(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -272(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -332(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -276(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -368(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -240(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: addl -80(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -296(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -36(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -292(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -292(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -172(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -68(%ebp) # 4-byte Spill
+; X32-NEXT: setb -88(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -300(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -172(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -172(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -336(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -392(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -176(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -412(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, -336(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl %ecx, -176(%ebp) # 4-byte Spill
+; X32-NEXT: movl -440(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -300(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -172(%ebp) # 4-byte Spill
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -332(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movzbl -332(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl -688(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -888(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -36(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -20(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -332(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -336(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -176(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -440(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -244(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -20(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -36(%ebp) # 4-byte Spill
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -20(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull -164(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -20(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -340(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -164(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -132(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -140(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %edi, -36(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -332(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -20(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: addl -336(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -132(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -176(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -140(%ebp) # 4-byte Spill
+; X32-NEXT: setb -176(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -408(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -244(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -332(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -336(%ebp) # 4-byte Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -332(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -332(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -192(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -332(%ebp), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl -392(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -224(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -412(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -360(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -336(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -132(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -140(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl -176(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: addl -688(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -900(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -360(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -896(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -392(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -904(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -412(%ebp) # 4-byte Spill
+; X32-NEXT: movl -172(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -296(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -80(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -32(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -36(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -292(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -20(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -68(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl -88(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl %ebx, -336(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -360(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -392(%ebp), %eax # 4-byte Reload
; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl -412(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -656(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -172(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -700(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -80(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -376(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -220(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -336(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -640(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -360(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -200(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -472(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -392(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -436(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -232(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -432(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -152(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -456(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -72(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -44(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -64(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -52(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -344(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -24(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -40(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -272(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -56(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -276(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -240(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movzbl -372(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, -172(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -80(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, -36(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %edi, -20(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %ecx, -336(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %edx, -360(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -392(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ebx, -412(%ebp) # 4-byte Spill
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -476(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -132(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -140(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: addl %ecx, %esi
-; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -248(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -140(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -308(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -480(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -208(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -384(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -176(%ebp) # 4-byte Spill
+; X32-NEXT: movl -212(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -476(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -252(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -248(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -220(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -40(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -252(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movzbl -40(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl -692(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -920(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -132(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -140(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -200(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -176(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -212(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -516(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -132(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -132(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 76(%eax), %edx
+; X32-NEXT: movl %edx, -132(%ebp) # 4-byte Spill
+; X32-NEXT: movl -212(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
; X32-NEXT: setb %cl
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movzbl %cl, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl -252(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -132(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -116(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -484(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -84(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -488(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: addl %ebx, -140(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -56(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl -200(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -176(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -56(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -284(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -516(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -200(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -176(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -200(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %edx, -224(%ebp) # 4-byte Spill
+; X32-NEXT: movl -284(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -132(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -224(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -224(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -224(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -308(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -484(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -208(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -488(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -176(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -200(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -56(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -692(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -176(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -908(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -916(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -912(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -32(%ebp) # 4-byte Spill
+; X32-NEXT: movl -108(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -476(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -56(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -248(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -104(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -480(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -156(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -384(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -224(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl -168(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -476(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -436(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -344(%ebp) # 4-byte Spill
+; X32-NEXT: movl -92(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -436(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -248(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -232(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -92(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -696(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -932(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -76(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -88(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -72(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -224(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -56(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -516(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -436(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -72(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -132(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -472(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -92(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -28(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -484(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -256(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -488(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl -88(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -436(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -472(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -224(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -56(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -56(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -108(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -516(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -224(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -76(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl -108(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -132(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -72(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -72(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -72(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl -104(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: addl -484(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -156(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -488(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -72(%ebp) # 4-byte Spill
+; X32-NEXT: movl -224(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl -76(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl -56(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: movl -72(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: addl -696(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -652(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -924(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -928(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl -64(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -224(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -220(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -76(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -140(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -152(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -40(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -176(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -200(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -68(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -32(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -548(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -40(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -544(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -40(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -544(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -104(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -380(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -156(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -356(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -220(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -168(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -548(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -56(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -88(%ebp) # 4-byte Spill
+; X32-NEXT: movl -92(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -56(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -296(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -56(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -92(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -56(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -704(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -948(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -140(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -292(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -40(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -376(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -220(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -64(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -580(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -40(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -140(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl 8(%ebp), %eax
+; X32-NEXT: movl 92(%eax), %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: movl %esi, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -56(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -92(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -28(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -600(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -256(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -604(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -292(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -376(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -56(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl -220(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -64(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -376(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -108(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -580(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -220(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -220(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -292(%ebp) # 4-byte Spill
+; X32-NEXT: movl -108(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -140(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -220(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -292(%ebp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: setb -292(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -292(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -104(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -600(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -156(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -604(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -64(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -220(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -376(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl -704(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -940(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -944(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -936(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -224(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -88(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -76(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -296(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -152(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -40(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -72(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -56(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -176(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -64(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -200(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -220(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -68(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -68(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -32(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -152(%ebp) # 4-byte Spill
+; X32-NEXT: setb -32(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -548(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, -176(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -544(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -380(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -308(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -356(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -208(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, -380(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl %ecx, -356(%ebp) # 4-byte Spill
+; X32-NEXT: movl -212(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -548(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -76(%ebp) # 4-byte Spill
+; X32-NEXT: movl -252(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -544(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -72(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -252(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -708(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -960(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -176(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -376(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -200(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -224(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -380(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -356(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -212(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -580(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, -176(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl -212(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -140(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -200(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -252(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edi, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -116(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -600(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -84(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -604(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -376(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -176(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -224(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -200(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -380(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -116(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -356(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -84(%ebp) # 4-byte Spill
+; X32-NEXT: setb -356(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -284(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -580(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -380(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -224(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -380(%ebp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -380(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -380(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -308(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -600(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -208(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -604(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -224(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl -116(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -84(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl -356(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -708(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -660(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -952(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -956(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -64(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -76(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -220(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -72(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -68(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -176(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -152(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -200(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movzbl -32(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, -224(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %edi, -380(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %esi, -308(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %ecx, -208(%ebp) # 4-byte Spill
+; X32-NEXT: movl -516(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -188(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -116(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -356(%ebp) # 4-byte Spill
+; X32-NEXT: movl -132(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -116(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -132(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -484(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -488(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -116(%ebp) # 4-byte Spill
+; X32-NEXT: movl -476(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -188(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -220(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -64(%ebp) # 4-byte Spill
+; X32-NEXT: movl -248(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -220(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -220(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -248(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -100(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -480(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -204(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -384(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl -356(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -100(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -32(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -204(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -84(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -116(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -476(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -236(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -356(%ebp) # 4-byte Spill
+; X32-NEXT: movl -248(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -112(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -32(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -248(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -32(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -480(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -384(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -100(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -356(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -204(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -32(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -84(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -480(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -116(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -384(%ebp) # 4-byte Spill
+; X32-NEXT: setb -204(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -516(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -236(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -116(%ebp) # 4-byte Spill
+; X32-NEXT: movl -132(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -100(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -112(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -100(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -84(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -112(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -84(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -484(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -304(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -488(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -480(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -116(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -384(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -100(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movzbl -204(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, -484(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, -488(%ebp) # 4-byte Spill
+; X32-NEXT: movl -548(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl -236(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, -204(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull -544(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, -236(%ebp) # 4-byte Spill
+; X32-NEXT: movl -580(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -148(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %esi
+; X32-NEXT: movl -188(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl -140(%ebp), %esi # 4-byte Reload
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl -204(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -236(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: movl -548(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -236(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -544(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -204(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -148(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -84(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -304(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -140(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -84(%ebp) # 4-byte Spill
+; X32-NEXT: movl -476(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -164(%ebp), %esi # 4-byte Reload
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: movl -244(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: imull -248(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, -244(%ebp) # 4-byte Spill
+; X32-NEXT: movl -516(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -144(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %esi
+; X32-NEXT: movl -300(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -132(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl -148(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -128(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -244(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -132(%ebp) # 4-byte Spill
+; X32-NEXT: movl -476(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl -300(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -248(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb -244(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -144(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -248(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -244(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl -128(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -132(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -148(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -236(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -204(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -304(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -84(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -116(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -148(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -164(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -484(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -384(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -488(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -300(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %esi
+; X32-NEXT: movl 104(%esi), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, -244(%ebp) # 4-byte Spill
+; X32-NEXT: movl -168(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, -236(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 108(%esi), %eax
+; X32-NEXT: movl %eax, -100(%ebp) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -92(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -204(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb -116(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -100(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movzbl -116(%ebp), %eax # 1-byte Folded Reload
; X32-NEXT: adcl %eax, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl %eax, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl %edi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl -244(%ebp), %eax # 4-byte Reload
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -248(%ebp) # 4-byte Spill
+; X32-NEXT: addl -28(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -256(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -112(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %edx, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movl 96(%ecx), %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, -84(%ebp) # 4-byte Spill
+; X32-NEXT: movl -168(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, -304(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -132(%ebp) # 4-byte Spill
+; X32-NEXT: movl 100(%ecx), %eax
+; X32-NEXT: movl %eax, -116(%ebp) # 4-byte Spill
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: addl -132(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -92(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -132(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb -144(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -116(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %ecx
+; X32-NEXT: movzbl -144(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl -84(%ebp), %eax # 4-byte Reload
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, -188(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -144(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -256(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: addl -236(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -28(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -204(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -256(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -112(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -140(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -84(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -204(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -236(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -204(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -204(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -116(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -144(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -188(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -236(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -256(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -204(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl -112(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -144(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -140(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -188(%ebp) # 4-byte Spill
+; X32-NEXT: setb -112(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -244(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -108(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -256(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -28(%ebp) # 4-byte Spill
+; X32-NEXT: movl -100(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -256(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -96(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -256(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -96(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -248(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -104(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -128(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -156(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -144(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -28(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -188(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -256(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movzbl -112(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl %edi, -248(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: movl 8(%ebp), %ecx
+; X32-NEXT: movl 112(%ecx), %eax
+; X32-NEXT: movl %eax, -156(%ebp) # 4-byte Spill
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: movl -108(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, -144(%ebp) # 4-byte Spill
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl 116(%ecx), %eax
+; X32-NEXT: movl %eax, -104(%ebp) # 4-byte Spill
+; X32-NEXT: imull %eax, %edi
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl %edi, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl 120(%ecx), %eax
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl -92(%ebp), %esi # 4-byte Reload
+; X32-NEXT: imull %esi, %edi
+; X32-NEXT: movl -168(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -96(%ebp) # 4-byte Spill
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl 124(%ebx), %ebx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: imull %eax, %ebx
; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %eax, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl -144(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, -96(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -108(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -156(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -144(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -144(%ebp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl -168(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -104(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -168(%ebp) # 4-byte Spill
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -92(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -104(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -96(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -92(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edx
+; X32-NEXT: movl %edx, -96(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl -84(%ebp), %eax # 4-byte Reload
+; X32-NEXT: imull %eax, %edi
+; X32-NEXT: movl -284(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -104(%ebp) # 4-byte Spill
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: imull -116(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, -284(%ebp) # 4-byte Spill
+; X32-NEXT: movl -244(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl -252(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: movl -212(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -100(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl -104(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -104(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -284(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: movl -84(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -284(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -116(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -252(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -116(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl -104(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -100(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -284(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -108(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -168(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -92(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -96(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -28(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, %edi
+; X32-NEXT: adcl -256(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: adcl -248(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -116(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -128(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -256(%ebp) # 4-byte Spill
+; X32-NEXT: movl -304(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl -64(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -132(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -220(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -236(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -356(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -204(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -32(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -148(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -284(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -164(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -384(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl %edi, -116(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -256(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -300(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl -76(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -304(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -72(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -132(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -176(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -236(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -200(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -204(%ebp) # 4-byte Spill
+; X32-NEXT: movl -224(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -284(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -380(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -140(%ebp) # 4-byte Spill
+; X32-NEXT: movl -308(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -116(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -208(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -256(%ebp) # 4-byte Spill
+; X32-NEXT: movl -492(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -260(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -28(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 92(%eax), %eax
+; X32-NEXT: movl %eax, -96(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -28(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -124(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -104(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -556(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -136(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -560(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -264(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -92(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -28(%ebp) # 4-byte Spill
+; X32-NEXT: movl -552(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -260(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -168(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -168(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -124(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -128(%ebp) # 4-byte Spill
; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -460(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -712(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -976(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -108(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -104(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -168(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -92(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -28(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -552(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -184(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -104(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -104(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -60(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -104(%ebp) # 4-byte Spill
; X32-NEXT: adcl %ebx, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl %edi, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -524(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -160(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -528(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -268(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -48(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -108(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -168(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -104(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: addl -92(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -28(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -28(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -492(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -184(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -92(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -212(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -92(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -92(%ebp) # 4-byte Spill
+; X32-NEXT: movl -492(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -60(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -208(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -92(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -92(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -92(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -556(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -160(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -560(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -268(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -212(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -208(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -28(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: addl -712(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -212(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -968(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -208(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -964(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -244(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -972(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -248(%ebp) # 4-byte Spill
+; X32-NEXT: movl -388(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -260(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -92(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -168(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 76(%eax), %eax
+; X32-NEXT: movl %eax, -28(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -168(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -124(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -252(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -564(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -136(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -568(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -264(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -156(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl -520(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -260(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -308(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -168(%ebp) # 4-byte Spill
+; X32-NEXT: movl -444(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -308(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -124(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -308(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -444(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -716(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -992(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -92(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -252(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -156(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -48(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -520(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -184(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -92(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -252(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -92(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -60(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -92(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -444(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -500(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -160(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -496(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -268(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: addl %eax, %esi
-; X32-NEXT: adcl %edx, %edi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %edi, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl -84(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -252(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -100(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -92(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: addl -156(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -48(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb -48(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -388(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -184(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -156(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -100(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -156(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, -156(%ebp) # 4-byte Spill
+; X32-NEXT: movl -388(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -60(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -156(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -156(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -156(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -564(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -160(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -568(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -268(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -100(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl -84(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl -48(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -716(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -988(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -984(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -980(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl -148(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -128(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -108(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -144(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -104(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -188(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -212(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -208(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -244(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -248(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -388(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -348(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -216(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -564(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -180(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -568(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -320(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -148(%ebp) # 4-byte Spill
+; X32-NEXT: movl -520(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -348(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, -156(%ebp) # 4-byte Spill
+; X32-NEXT: movl -444(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -216(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -104(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -112(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -444(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -112(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -720(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -1008(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -108(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -300(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -48(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -112(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -128(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -148(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -520(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -288(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %edi, %ecx
; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %edi, %edx
-; X32-NEXT: adcl %ebx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -16(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -108(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -444(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -500(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -280(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -496(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -312(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl -300(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -48(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -112(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -108(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl -128(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -148(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: setb -112(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -388(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -288(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -128(%ebp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %edx, -300(%ebp) # 4-byte Spill
+; X32-NEXT: movl -388(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -16(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -128(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -300(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: setb -300(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -300(%ebp), %edi # 1-byte Folded Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl -564(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -280(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -568(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -312(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl -148(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -128(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movzbl -112(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -720(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -664(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -996(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -1000(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -100(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -156(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -84(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -104(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -144(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -48(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -188(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -108(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl -212(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -148(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -208(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -244(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -248(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -144(%ebp) # 4-byte Spill
+; X32-NEXT: setb -100(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -492(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -348(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -212(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -216(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, -208(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb -248(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -96(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -248(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -180(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -556(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -320(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -560(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, -180(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl %ecx, -320(%ebp) # 4-byte Spill
+; X32-NEXT: movl -552(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -348(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -248(%ebp) # 4-byte Spill
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -216(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -244(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -188(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movzbl -188(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl -724(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -1004(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -212(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -208(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -188(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -180(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -320(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -552(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -288(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -208(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -212(%ebp) # 4-byte Spill
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -208(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull -16(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -208(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -16(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -524(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -280(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -528(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -312(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %edi, -212(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -188(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -208(%ebp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %edx, %eax
+; X32-NEXT: addl -180(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -524(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -320(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -528(%ebp) # 4-byte Spill
+; X32-NEXT: setb -180(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -492(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -288(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -188(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -320(%ebp) # 4-byte Spill
+; X32-NEXT: movl -96(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -188(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -16(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: setb %dl
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl %dl, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: setb -188(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull -16(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -188(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -556(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -280(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -560(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -312(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: addl %eax, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -320(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl -524(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -528(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movzbl -180(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl -724(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -668(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -732(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -728(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -148(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -248(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -128(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -244(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -84(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -212(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -144(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -208(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movzbl -100(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, -320(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %edi, -300(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
+; X32-NEXT: movl %esi, -556(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, -560(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %ebx
+; X32-NEXT: movl 96(%ebx), %ecx
+; X32-NEXT: movl %ecx, -312(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -180(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -100(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl 100(%ebx), %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %ebx, -100(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb -280(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: movzbl -280(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl -312(%ebp), %eax # 4-byte Reload
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, -84(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edx, -280(%ebp) # 4-byte Spill
+; X32-NEXT: movl -160(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl -268(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movl %edi, -188(%ebp) # 4-byte Spill
; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl %esi, -144(%ebp) # 4-byte Spill
+; X32-NEXT: movl -260(%ebp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -312(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -164(%ebp) # 4-byte Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -100(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: addl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %ecx, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ebx, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %eax, -384(%ebp) # 4-byte Spill
; X32-NEXT: adcl %esi, %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: setb %cl
+; X32-NEXT: movl -124(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -84(%ebp), %edi # 4-byte Reload
+; X32-NEXT: addl -136(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -280(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -264(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl -180(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -148(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -280(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -188(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -144(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 104(%eax), %ecx
+; X32-NEXT: movl %ecx, -180(%ebp) # 4-byte Spill
+; X32-NEXT: movl -260(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -128(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -148(%ebp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl -128(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl 12(%ebp), %eax
+; X32-NEXT: movl 108(%eax), %edx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %ebx, -112(%ebp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -128(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb -176(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movzbl -176(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl -180(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, -200(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -176(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl -264(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl -84(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, -148(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -280(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, -128(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl -188(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -136(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -144(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -264(%ebp) # 4-byte Spill
+; X32-NEXT: setb -84(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -184(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -144(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -280(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -144(%ebp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: addl %edx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -112(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
; X32-NEXT: adcl %esi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, %edx
-; X32-NEXT: adcl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: setb -144(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -112(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -144(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -160(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -176(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -268(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -200(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -136(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -280(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -264(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -136(%ebp) # 4-byte Spill
+; X32-NEXT: movzbl -84(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, -160(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, -268(%ebp) # 4-byte Spill
+; X32-NEXT: movl -348(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %ebx, %ecx
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl -180(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, -264(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull -216(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, -180(%ebp) # 4-byte Spill
+; X32-NEXT: movl -288(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -100(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %esi
+; X32-NEXT: movl -312(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl -16(%ebp), %esi # 4-byte Reload
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl -264(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -84(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -180(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -348(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -288(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -216(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -264(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -100(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -84(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -348(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -16(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -180(%ebp) # 4-byte Spill
+; X32-NEXT: movl 12(%ebp), %edx
+; X32-NEXT: movl 124(%edx), %ecx
+; X32-NEXT: movl -260(%ebp), %eax # 4-byte Reload
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl 120(%edx), %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, -216(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull -124(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl 112(%edi), %ebx
+; X32-NEXT: movl 116(%edi), %ecx
+; X32-NEXT: movl %ecx, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: imull %ecx, %edi
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl -60(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl -216(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -184(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movl %ecx, -60(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -260(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -312(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -216(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl -312(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull -124(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl %edi, %ebx
+; X32-NEXT: setb -260(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull -124(%ebp) # 4-byte Folded Reload
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl -260(%ebp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: addl -184(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -60(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -216(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -288(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl -264(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -348(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -180(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl -280(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -216(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -136(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -264(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -160(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -180(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -268(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -288(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -520(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -60(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -16(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl -444(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -136(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -364(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -500(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -496(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, -160(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -416(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -520(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -124(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -184(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl -124(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -444(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -124(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -500(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -324(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -496(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -400(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl -60(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -500(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -136(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -496(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, -160(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, -16(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -416(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -388(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -60(%ebp) # 4-byte Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -28(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -136(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -316(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -136(%ebp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl -324(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -564(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -400(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -568(%ebp), %ecx # 4-byte Folded Reload
; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -500(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -60(%ebp) # 4-byte Folded Spill
+; X32-NEXT: adcl -496(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -136(%ebp) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl -160(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -324(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -16(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -400(%ebp) # 4-byte Spill
+; X32-NEXT: setb -160(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl -352(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl -388(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, -268(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl -268(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull -28(%ebp) # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, -268(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb -260(%ebp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl -28(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl -260(%ebp), %ecx # 1-byte Folded Reload
; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl -364(%ebp), %esi # 4-byte Reload
+; X32-NEXT: addl -564(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -396(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -568(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl -324(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, -16(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movl -400(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, -268(%ebp) # 4-byte Folded Spill
+; X32-NEXT: movzbl -160(%ebp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, -364(%ebp) # 4-byte Spill
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %esi, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl %edx, %edi
-; X32-NEXT: movl %ebx, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
-; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, -396(%ebp) # 4-byte Spill
+; X32-NEXT: movl -440(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %edi, %ecx
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl -388(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, -28(%ebp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull -340(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, -388(%ebp) # 4-byte Spill
+; X32-NEXT: movl -408(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl -444(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %esi
+; X32-NEXT: movl -520(%ebp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl -192(%ebp), %esi # 4-byte Reload
+; X32-NEXT: imull %edi, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl -28(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -28(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -388(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -192(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -440(%ebp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, -324(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl -340(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, -260(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl -444(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl -28(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -340(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -192(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -192(%ebp) # 4-byte Spill
+; X32-NEXT: movl -416(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -96(%ebp), %edi # 4-byte Reload
+; X32-NEXT: imull %eax, %edi
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl -492(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, -28(%ebp) # 4-byte Spill
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: imull -316(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, -492(%ebp) # 4-byte Spill
+; X32-NEXT: movl -352(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl -460(%ebp), %edi # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: movl -552(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl -120(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl -28(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -96(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -492(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -120(%ebp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, -28(%ebp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %ebx, %ecx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl -552(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl -316(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, -160(%ebp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
; X32-NEXT: setb %cl
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl -460(%ebp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movzbl %cl, %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: addl %ebx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl %edx, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl 16(%ebp), %edx
-; X32-NEXT: movl %ecx, 4(%edx)
-; X32-NEXT: movl %eax, (%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 8(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 12(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 16(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 20(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 24(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 28(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 32(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 36(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 40(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 44(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 48(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 52(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 56(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 60(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 64(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 68(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 72(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 76(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 80(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 84(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 88(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 92(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 96(%edx)
-; X32-NEXT: movl %esi, 100(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 104(%edx)
-; X32-NEXT: movl %edi, 108(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 112(%edx)
-; X32-NEXT: movl %ebx, 116(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 120(%edx)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: movl %eax, 124(%edx)
-; X32-NEXT: leal -12(%ebp), %esp
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: addl -96(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -120(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -28(%ebp), %edx # 4-byte Reload
+; X32-NEXT: addl -324(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -160(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -260(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl -340(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl -192(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl -16(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl -268(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -160(%ebp) # 4-byte Spill
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: adcl -364(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -396(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl -164(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -124(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -384(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl -60(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -148(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -136(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -128(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl -216(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -28(%ebp) # 4-byte Spill
+; X32-NEXT: movl -160(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -264(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl -180(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -120(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -288(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -16(%ebp) # 4-byte Spill
+; X32-NEXT: addl -248(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -184(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -244(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -124(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -212(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -60(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -208(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -136(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -320(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -300(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -160(%ebp) # 4-byte Spill
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -556(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -120(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -560(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -168(%ebp), %eax # 4-byte Reload
+; X32-NEXT: addl -344(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -168(%ebp) # 4-byte Spill
+; X32-NEXT: movl -308(%ebp), %esi # 4-byte Reload
+; X32-NEXT: adcl -232(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl -252(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -436(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -92(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -472(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -92(%ebp) # 4-byte Spill
+; X32-NEXT: movl -156(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -88(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -104(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -296(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -104(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -40(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -48(%ebp) # 4-byte Spill
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -56(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -304(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -184(%ebp) # 4-byte Spill
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -132(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -124(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -236(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -60(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -204(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -136(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -284(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, -28(%ebp) # 4-byte Spill
+; X32-NEXT: movl -160(%ebp), %edx # 4-byte Reload
+; X32-NEXT: adcl -140(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -116(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl -16(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -256(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl -168(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: addl -432(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -168(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -456(%ebp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, -308(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -44(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, -252(%ebp) # 4-byte Spill
+; X32-NEXT: movl -92(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -52(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -92(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -24(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, -156(%ebp) # 4-byte Spill
+; X32-NEXT: movl -104(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -272(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -104(%ebp) # 4-byte Spill
+; X32-NEXT: movl -48(%ebp), %ebx # 4-byte Reload
+; X32-NEXT: adcl -276(%ebp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl -108(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -240(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -108(%ebp) # 4-byte Spill
+; X32-NEXT: movl -184(%ebp), %edi # 4-byte Reload
+; X32-NEXT: adcl -172(%ebp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl -124(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -80(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -124(%ebp) # 4-byte Spill
+; X32-NEXT: movl -60(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -36(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -60(%ebp) # 4-byte Spill
+; X32-NEXT: movl -136(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -20(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -136(%ebp) # 4-byte Spill
+; X32-NEXT: movl -28(%ebp), %ecx # 4-byte Reload
+; X32-NEXT: adcl -336(%ebp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, -28(%ebp) # 4-byte Spill
+; X32-NEXT: adcl -360(%ebp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl -392(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -120(%ebp) # 4-byte Spill
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: adcl -412(%ebp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, -16(%ebp) # 4-byte Spill
+; X32-NEXT: movl 16(%ebp), %ecx
+; X32-NEXT: movl -648(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, (%ecx)
+; X32-NEXT: movl -644(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 4(%ecx)
+; X32-NEXT: movl -536(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 8(%ecx)
+; X32-NEXT: movl -596(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 12(%ecx)
+; X32-NEXT: movl -592(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 16(%ecx)
+; X32-NEXT: movl -532(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 20(%ecx)
+; X32-NEXT: movl -428(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 24(%ecx)
+; X32-NEXT: movl -452(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 28(%ecx)
+; X32-NEXT: movl -508(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 32(%ecx)
+; X32-NEXT: movl -504(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 36(%ecx)
+; X32-NEXT: movl -328(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 40(%ecx)
+; X32-NEXT: movl -468(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 44(%ecx)
+; X32-NEXT: movl -404(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 48(%ecx)
+; X32-NEXT: movl -540(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 52(%ecx)
+; X32-NEXT: movl -228(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 56(%ecx)
+; X32-NEXT: movl -196(%ebp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, 60(%ecx)
+; X32-NEXT: movl -168(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 64(%ecx)
+; X32-NEXT: movl -308(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 68(%ecx)
+; X32-NEXT: movl -252(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 72(%ecx)
+; X32-NEXT: movl -92(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 76(%ecx)
+; X32-NEXT: movl -156(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 80(%ecx)
+; X32-NEXT: movl -104(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 84(%ecx)
+; X32-NEXT: movl %ebx, 88(%ecx)
+; X32-NEXT: movl -108(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 92(%ecx)
+; X32-NEXT: movl %edi, 96(%ecx)
+; X32-NEXT: movl -124(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 100(%ecx)
+; X32-NEXT: movl -60(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 104(%ecx)
+; X32-NEXT: movl -136(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 108(%ecx)
+; X32-NEXT: movl -28(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 112(%ecx)
+; X32-NEXT: movl %edx, 116(%ecx)
+; X32-NEXT: movl -120(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 120(%ecx)
+; X32-NEXT: movl -16(%ebp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, 124(%ecx)
+; X32-NEXT: addl $996, %esp # imm = 0x3E4
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
@@ -4237,7 +6726,7 @@ define void @test_1024(i1024* %a, i1024* %b, i1024* %out) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_1024:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
index acd86e949894..c79685aecd08 100644
--- a/test/CodeGen/X86/mul-i256.ll
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -6,189 +6,349 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-LABEL: test:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
-; X32-NEXT: .Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: .Lcfi1:
-; X32-NEXT: .cfi_offset %ebp, -8
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: .Lcfi2:
-; X32-NEXT: .cfi_def_cfa_register %ebp
; X32-NEXT: pushl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 12
; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 16
; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $168, %esp
-; X32-NEXT: .Lcfi3:
+; X32-NEXT: .cfi_def_cfa_offset 20
+; X32-NEXT: subl $88, %esp
+; X32-NEXT: .cfi_def_cfa_offset 108
; X32-NEXT: .cfi_offset %esi, -20
-; X32-NEXT: .Lcfi4:
; X32-NEXT: .cfi_offset %edi, -16
-; X32-NEXT: .Lcfi5:
; X32-NEXT: .cfi_offset %ebx, -12
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl 16(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 24(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 28(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 8(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 12(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 4(%eax), %ebx
-; X32-NEXT: movl 12(%ebp), %eax
-; X32-NEXT: movl 16(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 24(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 28(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 12(%ecx), %ebp
+; X32-NEXT: movl 8(%ecx), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl (%eax), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 4(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 8(%eax), %esi
-; X32-NEXT: movl 12(%eax), %edi
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebx
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: adcl %ebp, %edx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl (%esi), %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl 4(%esi), %esi
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %ebp, %ecx
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl 8(%ecx), %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %esi, %edi
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl 12(%ecx), %ecx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %edi
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %edi, %ebp
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %ebx, %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: adcl %esi, %eax
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %eax
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb (%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 16(%ecx), %esi
+; X32-NEXT: imull %esi, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl 20(%ecx), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: imull %eax, %edi
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 24(%ecx), %eax
+; X32-NEXT: movl %ecx, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: imull %ecx, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl 28(%ebp), %ebp
+; X32-NEXT: imull %ebx, %ebp
+; X32-NEXT: addl %edx, %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: addl %edx, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: addl %ecx, %edi
-; X32-NEXT: adcl %eax, %ebx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 28(%ebx), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: imull %esi, %ecx
+; X32-NEXT: movl 24(%ebx), %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %edi
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl 16(%ebx), %ebp
+; X32-NEXT: movl 20(%ebx), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: imull %ebp, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl %edi, %esi
-; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl 16(%ebp), %edi
-; X32-NEXT: movl %ebx, 4(%edi)
-; X32-NEXT: movl 16(%ebp), %ebx
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, (%ecx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, (%ebx)
+; X32-NEXT: movl %edi, 4(%ecx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 8(%ebx)
+; X32-NEXT: movl %edi, 8(%ecx)
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: movl %edi, 12(%ebx)
-; X32-NEXT: movl %esi, 16(%ebx)
-; X32-NEXT: movl %ecx, 20(%ebx)
-; X32-NEXT: movl %edx, 24(%ebx)
-; X32-NEXT: movl %eax, 28(%ebx)
-; X32-NEXT: leal -12(%ebp), %esp
+; X32-NEXT: movl %edi, 12(%ecx)
+; X32-NEXT: movl %ebx, 16(%ecx)
+; X32-NEXT: movl %esi, 20(%ecx)
+; X32-NEXT: movl %eax, 24(%ecx)
+; X32-NEXT: movl %edx, 28(%ecx)
+; X32-NEXT: addl $88, %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
@@ -196,21 +356,15 @@ define void @test(i256* %a, i256* %b, i256* %out) #0 {
; X32-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pushq %r15
-; X64-NEXT: .Lcfi0:
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: pushq %r14
-; X64-NEXT: .Lcfi1:
; X64-NEXT: .cfi_def_cfa_offset 24
; X64-NEXT: pushq %rbx
-; X64-NEXT: .Lcfi2:
; X64-NEXT: .cfi_def_cfa_offset 32
-; X64-NEXT: .Lcfi3:
; X64-NEXT: .cfi_offset %rbx, -32
-; X64-NEXT: .Lcfi4:
; X64-NEXT: .cfi_offset %r14, -24
-; X64-NEXT: .Lcfi5:
; X64-NEXT: .cfi_offset %r15, -16
; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq (%rdi), %r11
diff --git a/test/CodeGen/X86/mul-i512.ll b/test/CodeGen/X86/mul-i512.ll
index 3da17b69ffb5..d846729096e1 100644
--- a/test/CodeGen/X86/mul-i512.ll
+++ b/test/CodeGen/X86/mul-i512.ll
@@ -4,888 +4,1525 @@
define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-LABEL: test_512:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $656, %esp # imm = 0x290
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl 48(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 52(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 56(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 60(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 40(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 44(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 32(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 36(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 4(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 16(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 20(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 8(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 12(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 24(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 28(%eax), %eax
+; X32-NEXT: subl $244, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 20(%ecx), %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 16(%ecx), %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 12(%ebp), %eax
-; X32-NEXT: movl 48(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 52(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 56(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 60(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 32(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 36(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 40(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 44(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 4(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 8(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 12(%eax), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl 16(%eax), %esi
-; X32-NEXT: movl 20(%eax), %edi
-; X32-NEXT: movl 24(%eax), %ebx
-; X32-NEXT: movl 28(%eax), %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %eax
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
+; X32-NEXT: movl %ecx, %ebp
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: movl %ecx, %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl 24(%ebp), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: addl %ebx, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
+; X32-NEXT: adcl %ecx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl (%ecx), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 4(%ecx), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %esi
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ebp, %ebx
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %ebx
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl 8(%esi), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl (%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %ebp, %ebp
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: movl %ecx, %edx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 16(%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %edi
+; X32-NEXT: movl %ecx, %ebp
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %edi, %edi
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl %ebp, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %ebx
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 8(%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %edi
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 20(%esi), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movzbl %bl, %ebx
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl 24(%esi), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %esi
+; X32-NEXT: addl %eax, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: addl %ecx, %esi
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: adcl %ebp, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %esi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: addl %edx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 28(%eax), %esi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ebp, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl (%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %ecx, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: pushl %edi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %edi, %ebp
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %ebx
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl 12(%eax), %ecx
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull (%esp) # 4-byte Folded Reload
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebp
; X32-NEXT: adcl %edx, %ecx
-; X32-NEXT: setb %dl
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movzbl %dl, %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: addl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: adcl $0, %edx
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edx, %ebx
-; X32-NEXT: movl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: addl %eax, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebp, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 12(%eax), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl %eax, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %eax, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebx
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %ebp, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %ebx, %eax
-; X32-NEXT: addl %edx, %edi
-; X32-NEXT: movl %ecx, %edx
-; X32-NEXT: adcl %esi, %edx
-; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl (%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %esi # 1-byte Folded Reload
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebp
+; X32-NEXT: adcl %edx, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ebp
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 28(%eax), %ebp
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl %eax, %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb %cl
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl %ecx, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %edi, %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %edx, %ebx
-; X32-NEXT: adcl %esi, %eax
-; X32-NEXT: movl %eax, %esi
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %ebp
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %edi, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl %bl, %esi
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
+; X32-NEXT: addl %ebp, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: adcl $0, %ebx
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: adcl $0, %ecx
; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edx, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl %ebp, %edi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ecx, %ebx
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: setb (%esp) # 1-byte Folded Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %ebp
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl %ebx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %esi
; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl $0, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: addl %eax, %esi
-; X32-NEXT: adcl %ecx, %edx
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movzbl %al, %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ecx, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %edi, %ebp
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
-; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: addl %esi, %edi
-; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl %eax, %esi
-; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
-; X32-NEXT: adcl %ebx, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
-; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl $0, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl $0, %edi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 32(%ecx), %edi
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %esi, %ebx
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movl 36(%eax), %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %esi, %ebp
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: adcl %edx, %esi
+; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %esi
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl $0, %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %edi, %ebp
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edx
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, %esi
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl 40(%eax), %ebp
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl 44(%ebx), %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %ecx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: mull %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, %edi
; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: addl %edx, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl %esi, %ebx
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: imull %eax, %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %ebp, %edx
+; X32-NEXT: imull {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl %esi, %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: movl %eax, %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl $0, %ecx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: imull %ebx, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: imull %eax, %esi
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebp, %esi
+; X32-NEXT: movl %esi, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %ebp, %ebx
; X32-NEXT: adcl $0, %esi
-; X32-NEXT: addl %eax, %edi
-; X32-NEXT: adcl %ecx, %esi
-; X32-NEXT: setb %al
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movzbl %al, %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebx, %eax
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb %bl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl %bl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: movl 60(%edx), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: imull %eax, %ecx
+; X32-NEXT: movl 56(%edx), %esi
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: imull {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %esi
+; X32-NEXT: movl 48(%edi), %ebx
+; X32-NEXT: movl 52(%edi), %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: imull %ebp, %edi
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %esi, %edi
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %ebx
+; X32-NEXT: addl %edi, %ebx
+; X32-NEXT: adcl %ebp, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %ecx
+; X32-NEXT: movl 40(%ecx), %ebx
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl 44(%ecx), %ecx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebp, %edi
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %ebx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl %ebx, %edi
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl 32(%esi), %edi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 36(%esi), %esi
+; X32-NEXT: movl %esi, %eax
; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl (%esp), %edi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ebx
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edi
+; X32-NEXT: setb %bl
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %esi
+; X32-NEXT: movzbl %bl, %eax
+; X32-NEXT: adcl %eax, %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: adcl %edx, %eax
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl %ebp, %eax
+; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl $0, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: movl %edi, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %esi, %edi
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %esi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, (%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %esi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %esi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: adcl %esi, %ecx
+; X32-NEXT: setb {{[0-9]+}}(%esp) # 1-byte Folded Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebx
+; X32-NEXT: addl %ecx, %eax
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # 1-byte Folded Reload
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %eax, %edi
+; X32-NEXT: adcl %edx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 1-byte Folded Reload
+; X32-NEXT: adcl %eax, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 48(%ecx), %ebp
+; X32-NEXT: imull %ebp, %ebx
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %ebx, %edx
+; X32-NEXT: movl 52(%ecx), %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: imull %eax, %edi
+; X32-NEXT: addl %edx, %edi
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl 56(%ecx), %eax
+; X32-NEXT: movl %ecx, %ebx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT: imull %esi, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: movl 60(%ebx), %ebx
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: imull %eax, %ebx
+; X32-NEXT: addl %edx, %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl %ecx, {{[0-9]+}}(%esp) # 4-byte Folded Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ecx
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: addl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: adcl $0, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: addl %edi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebp
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %edx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl %ebx, %edx
+; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: imull %ebp, %edi
+; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %edi, %edx
+; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: movl %eax, %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
-; X32-NEXT: addl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
-; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: imull %ebx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: mull %edi
+; X32-NEXT: addl %ecx, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: imull %edi, %ecx
+; X32-NEXT: addl %edx, %ecx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %edi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl %ebx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ecx
+; X32-NEXT: addl %esi, %ecx
+; X32-NEXT: adcl $0, %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ecx, %ebp
+; X32-NEXT: adcl %edi, %esi
+; X32-NEXT: setb %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; X32-NEXT: mull %ebx
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: movzbl %cl, %ecx
+; X32-NEXT: adcl %ecx, %ebx
+; X32-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
-; X32-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl (%esp), %edx # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: adcl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: movl %ebp, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx # 4-byte Folded Reload
+; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edi # 4-byte Folded Reload
+; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %esi # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx # 4-byte Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebp # 4-byte Folded Reload
; X32-NEXT: adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl 16(%ebp), %esi
-; X32-NEXT: movl %edi, 4(%esi)
-; X32-NEXT: movl 16(%ebp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, (%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 8(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 16(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 20(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 24(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 28(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 32(%edi)
-; X32-NEXT: movl %ebx, 36(%edi)
-; X32-NEXT: movl (%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 40(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 44(%edi)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; X32-NEXT: movl %esi, 48(%edi)
-; X32-NEXT: movl %ecx, 52(%edi)
-; X32-NEXT: movl %edx, 56(%edi)
-; X32-NEXT: movl %eax, 60(%edi)
-; X32-NEXT: leal -12(%ebp), %esp
+; X32-NEXT: adcl {{[0-9]+}}(%esp), %ebx # 4-byte Folded Reload
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, (%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 4(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 8(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 12(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 16(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 20(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 24(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 28(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 32(%ecx)
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 36(%ecx)
+; X32-NEXT: movl (%esp), %edi # 4-byte Reload
+; X32-NEXT: movl %edi, 40(%ecx)
+; X32-NEXT: movl %esi, 44(%ecx)
+; X32-NEXT: movl %edx, 48(%ecx)
+; X32-NEXT: movl %ebp, 52(%ecx)
+; X32-NEXT: movl %eax, 56(%ecx)
+; X32-NEXT: movl %ebx, 60(%ecx)
+; X32-NEXT: addl $244, %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
@@ -893,7 +1530,7 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_512:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rbp
; X64-NEXT: pushq %r15
; X64-NEXT: pushq %r14
diff --git a/test/CodeGen/X86/mul-legalize.ll b/test/CodeGen/X86/mul-legalize.ll
index 339de3104335..372186f0e554 100644
--- a/test/CodeGen/X86/mul-legalize.ll
+++ b/test/CodeGen/X86/mul-legalize.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; PR2135
; CHECK: 24576
diff --git a/test/CodeGen/X86/mul-remat.ll b/test/CodeGen/X86/mul-remat.ll
index 3fa005079de7..87921324f05d 100644
--- a/test/CodeGen/X86/mul-remat.ll
+++ b/test/CodeGen/X86/mul-remat.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 1
+; RUN: llc < %s -mtriple=i686-- | grep mov | count 1
; PR1874
define i32 @test(i32 %a, i32 %b) {
diff --git a/test/CodeGen/X86/mul-shift-reassoc.ll b/test/CodeGen/X86/mul-shift-reassoc.ll
index 3777d8b8cfb4..c1139b01fb63 100644
--- a/test/CodeGen/X86/mul-shift-reassoc.ll
+++ b/test/CodeGen/X86/mul-shift-reassoc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | grep lea
-; RUN: llc < %s -march=x86 | not grep add
+; RUN: llc < %s -mtriple=i686-- | grep lea
+; RUN: llc < %s -mtriple=i686-- | not grep add
define i32 @test(i32 %X, i32 %Y) {
; Push the shl through the mul to allow an LEA to be formed, instead
diff --git a/test/CodeGen/X86/mul128.ll b/test/CodeGen/X86/mul128.ll
index 2b3a13509b3c..e851c3a3d5b3 100644
--- a/test/CodeGen/X86/mul128.ll
+++ b/test/CodeGen/X86/mul128.ll
@@ -1,9 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X86
define i128 @foo(i128 %t, i128 %u) {
; X64-LABEL: foo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %r8
; X64-NEXT: imulq %rdi, %rcx
; X64-NEXT: movq %rdi, %rax
@@ -12,6 +13,84 @@ define i128 @foo(i128 %t, i128 %u) {
; X64-NEXT: imulq %r8, %rsi
; X64-NEXT: addq %rsi, %rdx
; X64-NEXT: retq
+;
+; X86-LABEL: foo:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: pushl %ebx
+; X86-NEXT: .cfi_def_cfa_offset 12
+; X86-NEXT: pushl %edi
+; X86-NEXT: .cfi_def_cfa_offset 16
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 20
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: .cfi_def_cfa_offset 28
+; X86-NEXT: .cfi_offset %esi, -20
+; X86-NEXT: .cfi_offset %edi, -16
+; X86-NEXT: .cfi_offset %ebx, -12
+; X86-NEXT: .cfi_offset %ebp, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: imull %edx, %esi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull %ebp, %edi
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl %ecx, %ebp
+; X86-NEXT: adcl %esi, %ebx
+; X86-NEXT: setb %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: adcl %ecx, %edx
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, (%ecx)
+; X86-NEXT: movl %ebp, 4(%ecx)
+; X86-NEXT: movl %eax, 8(%ecx)
+; X86-NEXT: movl %edx, 12(%ecx)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $8, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
%k = mul i128 %t, %u
ret i128 %k
}
diff --git a/test/CodeGen/X86/mul128_sext_loop.ll b/test/CodeGen/X86/mul128_sext_loop.ll
index efb7e02720b4..3bbcccda81a6 100644
--- a/test/CodeGen/X86/mul128_sext_loop.ll
+++ b/test/CodeGen/X86/mul128_sext_loop.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
define void @test(i64* nocapture %arr, i64 %arrsize, i64 %factor) nounwind uwtable {
%1 = icmp sgt i64 %arrsize, 0
diff --git a/test/CodeGen/X86/mul64.ll b/test/CodeGen/X86/mul64.ll
index f5ca52a93b51..f8a7aaade6c9 100644
--- a/test/CodeGen/X86/mul64.ll
+++ b/test/CodeGen/X86/mul64.ll
@@ -4,7 +4,7 @@
define i64 @foo(i64 %t, i64 %u) nounwind {
; X32-LABEL: foo:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -18,7 +18,7 @@ define i64 @foo(i64 %t, i64 %u) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: foo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: imulq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/mult-alt-generic-i686.ll b/test/CodeGen/X86/mult-alt-generic-i686.ll
index 9ebdf55d0e03..e55a3a2d35b3 100644
--- a/test/CodeGen/X86/mult-alt-generic-i686.ll
+++ b/test/CodeGen/X86/mult-alt-generic-i686.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686"
diff --git a/test/CodeGen/X86/mult-alt-generic-x86_64.ll b/test/CodeGen/X86/mult-alt-generic-x86_64.ll
index a87655e5eef3..026d0a636e88 100644
--- a/test/CodeGen/X86/mult-alt-generic-x86_64.ll
+++ b/test/CodeGen/X86/mult-alt-generic-x86_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -no-integrated-as
+; RUN: llc < %s -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64"
diff --git a/test/CodeGen/X86/mult-alt-x86.ll b/test/CodeGen/X86/mult-alt-x86.ll
index 1c83fedad3ce..18e245e80dc4 100644
--- a/test/CodeGen/X86/mult-alt-x86.ll
+++ b/test/CodeGen/X86/mult-alt-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 -no-integrated-as
+; RUN: llc < %s -mattr=+mmx,+sse2 -no-integrated-as
; ModuleID = 'mult-alt-x86.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686-pc-win32"
diff --git a/test/CodeGen/X86/multiple-loop-post-inc.ll b/test/CodeGen/X86/multiple-loop-post-inc.ll
index be778da57332..d54aea160c7e 100644
--- a/test/CodeGen/X86/multiple-loop-post-inc.ll
+++ b/test/CodeGen/X86/multiple-loop-post-inc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false -disable-branch-fold -disable-block-placement -disable-tail-duplicate -march=x86-64 -mcpu=nehalem -no-integrated-as < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -disable-branch-fold -disable-block-placement -disable-tail-duplicate -mtriple=x86_64-- -mcpu=nehalem -no-integrated-as < %s | FileCheck %s
; rdar://7236213
;
; The scheduler's 2-address hack has been disabled, so there is
diff --git a/test/CodeGen/X86/multiple-return-values-cross-block.ll b/test/CodeGen/X86/multiple-return-values-cross-block.ll
index b0cb06111348..b55e3b24a58f 100644
--- a/test/CodeGen/X86/multiple-return-values-cross-block.ll
+++ b/test/CodeGen/X86/multiple-return-values-cross-block.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
declare {x86_fp80, x86_fp80} @test()
diff --git a/test/CodeGen/X86/mulvi32.ll b/test/CodeGen/X86/mulvi32.ll
new file mode 100644
index 000000000000..570299fed5b7
--- /dev/null
+++ b/test/CodeGen/X86/mulvi32.ll
@@ -0,0 +1,472 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; PR6399
+
+define <2 x i32> @_mul2xi32a(<2 x i32>, <2 x i32>) {
+; SSE-LABEL: _mul2xi32a:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm0, %xmm3
+; SSE-NEXT: paddq %xmm2, %xmm3
+; SSE-NEXT: psllq $32, %xmm3
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: paddq %xmm3, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _mul2xi32a:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = mul <2 x i32> %0, %1
+ ret <2 x i32> %r
+}
+
+define <2 x i32> @_mul2xi32b(<2 x i32>, <2 x i32>) {
+; SSE2-LABEL: _mul2xi32b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul2xi32b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE42-NEXT: pmuludq %xmm0, %xmm1
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: _mul2xi32b:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: retq
+ %factor0 = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+ %factor1 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+ %product64 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %factor0, <4 x i32> %factor1) readnone
+ %product = bitcast <2 x i64> %product64 to <4 x i32>
+ %r = shufflevector <4 x i32> %product, <4 x i32> undef, <2 x i32> <i32 0, i32 4>
+ ret <2 x i32> %r
+}
+
+define <4 x i32> @_mul4xi32a(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmulld %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: _mul4xi32a:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = mul <4 x i32> %0, %1
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @_mul4xi32b(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32b:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32b:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE42-NEXT: pmuludq %xmm1, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE42-NEXT: pmuludq %xmm2, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2-NEXT: retq
+ %even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+ %even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+ %evenMul64 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %even0, <4 x i32> %even1) readnone
+ %evenMul = bitcast <2 x i64> %evenMul64 to <4 x i32>
+ %odd0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+ %odd1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+ %oddMul64 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %odd0, <4 x i32> %odd1) readnone
+ %oddMul = bitcast <2 x i64> %oddMul64 to <4 x i32>
+ %r = shufflevector <4 x i32> %evenMul, <4 x i32> %oddMul, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %r
+}
+
+; the following extractelements and insertelements
+; are just an unrolled 'zext' on a vector
+; %ext0 = zext <4 x i32> %0 to <4 x i64>
+; %ext1 = zext <4 x i32> %1 to <4 x i64>
+define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32toi64a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: shrq $32, %rcx
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movd %edx, %xmm2
+; SSE2-NEXT: shrq $32, %rdx
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rsi
+; SSE2-NEXT: movd %esi, %xmm3
+; SSE2-NEXT: shrq $32, %rsi
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movd %edx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: shrq $32, %rax
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: movd %eax, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32toi64a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movq %xmm1, %rax
+; SSE42-NEXT: pextrq $1, %xmm1, %rcx
+; SSE42-NEXT: movd %ecx, %xmm1
+; SSE42-NEXT: shrq $32, %rcx
+; SSE42-NEXT: movq %xmm0, %rdx
+; SSE42-NEXT: movd %edx, %xmm2
+; SSE42-NEXT: shrq $32, %rdx
+; SSE42-NEXT: pextrq $1, %xmm0, %rsi
+; SSE42-NEXT: movd %esi, %xmm3
+; SSE42-NEXT: shrq $32, %rsi
+; SSE42-NEXT: movd %esi, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE42-NEXT: movd %edx, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE42-NEXT: movd %ecx, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE42-NEXT: movd %eax, %xmm0
+; SSE42-NEXT: shrq $32, %rax
+; SSE42-NEXT: pmuludq %xmm3, %xmm1
+; SSE42-NEXT: movd %eax, %xmm3
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE42-NEXT: pmuludq %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32toi64a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: vmovq %xmm1, %rcx
+; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: shrq $32, %rdx
+; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT: vmovd %esi, %xmm1
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: vmovd %esi, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vmovd %edx, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX1-NEXT: vmovd %ecx, %xmm3
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX1-NEXT: vmovd %eax, %xmm3
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32toi64a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vmovd %edx, %xmm1
+; AVX2-NEXT: shrq $32, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: vmovd %esi, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; AVX2-NEXT: vmovd %edx, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2-NEXT: vmovd %ecx, %xmm3
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX2-NEXT: vmovd %eax, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+ %f00 = extractelement <4 x i32> %0, i32 0
+ %f01 = extractelement <4 x i32> %0, i32 1
+ %f02 = extractelement <4 x i32> %0, i32 2
+ %f03 = extractelement <4 x i32> %0, i32 3
+ %f10 = extractelement <4 x i32> %1, i32 0
+ %f11 = extractelement <4 x i32> %1, i32 1
+ %f12 = extractelement <4 x i32> %1, i32 2
+ %f13 = extractelement <4 x i32> %1, i32 3
+ %ext00 = zext i32 %f00 to i64
+ %ext01 = zext i32 %f01 to i64
+ %ext02 = zext i32 %f02 to i64
+ %ext03 = zext i32 %f03 to i64
+ %ext10 = zext i32 %f10 to i64
+ %ext11 = zext i32 %f11 to i64
+ %ext12 = zext i32 %f12 to i64
+ %ext13 = zext i32 %f13 to i64
+ %extv00 = insertelement <4 x i64> undef, i64 %ext00, i32 0
+ %extv01 = insertelement <4 x i64> %extv00, i64 %ext01, i32 1
+ %extv02 = insertelement <4 x i64> %extv01, i64 %ext02, i32 2
+ %extv03 = insertelement <4 x i64> %extv02, i64 %ext03, i32 3
+ %extv10 = insertelement <4 x i64> undef, i64 %ext10, i32 0
+ %extv11 = insertelement <4 x i64> %extv10, i64 %ext11, i32 1
+ %extv12 = insertelement <4 x i64> %extv11, i64 %ext12, i32 2
+ %extv13 = insertelement <4 x i64> %extv12, i64 %ext13, i32 3
+ %r = mul <4 x i64> %extv03, %extv13
+ ret <4 x i64> %r
+}
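+
+; In vector form the unrolled sequence above is simply a zext-based widening
+; multiply; a minimal sketch of the equivalent IR (not exercised by this
+; test):
+;   %ext0 = zext <4 x i32> %0 to <4 x i64>
+;   %ext1 = zext <4 x i32> %1 to <4 x i64>
+;   %r = mul <4 x i64> %ext0, %ext1
+;   ret <4 x i64> %r
+; The heavily scalarized SSE/AVX expansions checked above suggest the
+; unrolled form is not yet recognized as such.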
+
+; very similar to mul4xi32 above
+; there is no bitcast and the final shuffle is a little different
+define <4 x i64> @_mul4xi32toi64b(<4 x i32>, <4 x i32>) {
+; SSE-LABEL: _mul4xi32toi64b:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: pmuludq %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32toi64b:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm2[1],xmm0[1]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32toi64b:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %even0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+ %even1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 2, i32 undef>
+ %evenMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %even0, <4 x i32> %even1) readnone
+ %odd0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+ %odd1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+ %oddMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %odd0, <4 x i32> %odd1) readnone
+ %r = shufflevector <2 x i64> %evenMul, <2 x i64> %oddMul, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x i64> %r
+}
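+
+; Compared with mul4xi32, the products here stay as <2 x i64>: %evenMul holds
+; the lane 0/2 products and %oddMul the lane 1/3 products, and the
+; <0, 2, 1, 3> mask interleaves them back into lane order instead of
+; extracting the low 32-bit halves. A sketch of the selection (indices count
+; across both shuffle operands):
+;   %r = [ evenMul[0], oddMul[0], evenMul[1], oddMul[1] ]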
+
+; Here we do not split into even and odd indexed elements
+; but into the lower and the upper half of the factor vectors.
+; This makes the initial shuffle more complicated,
+; but the final shuffle is a no-op.
+define <4 x i64> @_mul4xi32toi64c(<4 x i32>, <4 x i32>) {
+; SSE2-LABEL: _mul4xi32toi64c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul4xi32toi64c:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: pmuludq %xmm3, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; SSE42-NEXT: pmuludq %xmm0, %xmm1
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul4xi32toi64c:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul4xi32toi64c:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+ %lower0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+ %lower1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
+ %lowerMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %lower0, <4 x i32> %lower1) readnone
+ %upper0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 3, i32 undef>
+ %upper1 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 3, i32 undef>
+ %upperMul = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %upper0, <4 x i32> %upper1) readnone
+ %r = shufflevector <2 x i64> %lowerMul, <2 x i64> %upperMul, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i64> %r
+}
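+
+; Because %lowerMul already holds the lane 0/1 products and %upperMul the
+; lane 2/3 products, the identity mask <0, 1, 2, 3> is plain concatenation of
+; the two halves and folds away; lane by lane the result is simply (a sketch,
+; not checked by the test):
+;   %r[i] = zext(%0[i]) * zext(%1[i])   for i = 0..3
+; which is why the AVX variants need only a single vinsertf128/vinserti128.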
+
+; If we know that the most significant half of each i64 element is zero,
+; then the multiplication can be simplified drastically.
+; In the following example we assert a zero upper half
+; by a 'trunc' followed by a 'zext'.
+;
+; the following extractelement and insertelement instructions
+; are just an unrolled 'trunc' plus 'zext' on a vector
+; %trunc0 = trunc <2 x i64> %0 to <2 x i32>
+; %trunc1 = trunc <2 x i64> %1 to <2 x i32>
+; %ext0 = zext <2 x i32> %trunc0 to <2 x i64>
+; %ext1 = zext <2 x i32> %trunc1 to <2 x i64>
+define <2 x i64> @_mul2xi64toi64a(<2 x i64>, <2 x i64>) {
+; SSE2-LABEL: _mul2xi64toi64a:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: _mul2xi64toi64a:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; SSE42-NEXT: pmuludq %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: _mul2xi64toi64a:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: _mul2xi64toi64a:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %f00 = extractelement <2 x i64> %0, i32 0
+ %f01 = extractelement <2 x i64> %0, i32 1
+ %f10 = extractelement <2 x i64> %1, i32 0
+ %f11 = extractelement <2 x i64> %1, i32 1
+ %trunc00 = trunc i64 %f00 to i32
+ %trunc01 = trunc i64 %f01 to i32
+ %ext00 = zext i32 %trunc00 to i64
+ %ext01 = zext i32 %trunc01 to i64
+ %trunc10 = trunc i64 %f10 to i32
+ %trunc11 = trunc i64 %f11 to i32
+ %ext10 = zext i32 %trunc10 to i64
+ %ext11 = zext i32 %trunc11 to i64
+ %extv00 = insertelement <2 x i64> undef, i64 %ext00, i32 0
+ %extv01 = insertelement <2 x i64> %extv00, i64 %ext01, i32 1
+ %extv10 = insertelement <2 x i64> undef, i64 %ext10, i32 0
+ %extv11 = insertelement <2 x i64> %extv10, i64 %ext11, i32 1
+ %r = mul <2 x i64> %extv01, %extv11
+ ret <2 x i64> %r
+}
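+
+; Since both factors have known-zero upper halves, the full 64x64 product
+; equals the 32x32->64 unsigned product, so a single pmuludq suffices once
+; the upper bits are cleared. A sketch of the masked form this effectively
+; reduces to (visible in the SSE2 checks above as pand + pmuludq):
+;   %m0 = and <2 x i64> %0, <i64 4294967295, i64 4294967295>
+;   %m1 = and <2 x i64> %1, <i64 4294967295, i64 4294967295>
+;   %r  = mul <2 x i64> %m0, %m1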
+
+define <2 x i64> @_mul2xi64toi64b(<2 x i64>, <2 x i64>) {
+; SSE-LABEL: _mul2xi64toi64b:
+; SSE: # %bb.0:
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: _mul2xi64toi64b:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %f0 = bitcast <2 x i64> %0 to <4 x i32>
+ %f1 = bitcast <2 x i64> %1 to <4 x i32>
+ %r = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %f0, <4 x i32> %f1) readnone
+ ret <2 x i64> %r
+}
+
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll
index 9ebd380170d3..d099f31189ce 100644
--- a/test/CodeGen/X86/mulx32.ll
+++ b/test/CodeGen/X86/mulx32.ll
@@ -4,7 +4,7 @@
define i64 @f1(i32 %a, i32 %b) {
; CHECK-LABEL: f1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: mulxl {{[0-9]+}}(%esp), %eax, %edx
; CHECK-NEXT: retl
@@ -16,7 +16,7 @@ define i64 @f1(i32 %a, i32 %b) {
define i64 @f2(i32 %a, i32* %p) {
; CHECK-LABEL: f2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
; CHECK-NEXT: mulxl (%eax), %eax, %edx
diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll
index 7cc10e017fc6..e038f3300093 100644
--- a/test/CodeGen/X86/mulx64.ll
+++ b/test/CodeGen/X86/mulx64.ll
@@ -4,7 +4,7 @@
define i128 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rdx
; CHECK-NEXT: mulxq %rsi, %rax, %rdx
; CHECK-NEXT: retq
@@ -16,7 +16,7 @@ define i128 @f1(i64 %a, i64 %b) {
define i128 @f2(i64 %a, i64* %p) {
; CHECK-LABEL: f2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rdx
; CHECK-NEXT: mulxq (%rsi), %rax, %rdx
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/musttail-thiscall.ll b/test/CodeGen/X86/musttail-thiscall.ll
index 1402f10b091a..454c66cd675e 100644
--- a/test/CodeGen/X86/musttail-thiscall.ll
+++ b/test/CodeGen/X86/musttail-thiscall.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
-; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -O0 < %s | FileCheck %s
; CHECK-LABEL: t1:
; CHECK: jmp {{_?}}t1_callee
diff --git a/test/CodeGen/X86/musttail.ll b/test/CodeGen/X86/musttail.ll
index ca5d3119cf10..927322b5723a 100644
--- a/test/CodeGen/X86/musttail.ll
+++ b/test/CodeGen/X86/musttail.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
-; RUN: llc -march=x86 -O0 < %s | FileCheck %s
-; RUN: llc -march=x86 -disable-tail-calls < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -disable-tail-calls < %s | FileCheck %s
declare void @t1_callee(i8*)
define void @t1(i32* %a) {
diff --git a/test/CodeGen/X86/mwaitx-schedule.ll b/test/CodeGen/X86/mwaitx-schedule.ll
new file mode 100644
index 000000000000..3ceef57b409d
--- /dev/null
+++ b/test/CodeGen/X86/mwaitx-schedule.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+mwaitx | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
+
+define void @foo(i8* %P, i32 %E, i32 %H) nounwind {
+; GENERIC-LABEL: foo:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: monitorx # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER4-LABEL: foo:
+; BDVER4: # %bb.0:
+; BDVER4-NEXT: leaq (%rdi), %rax
+; BDVER4-NEXT: movl %esi, %ecx
+; BDVER4-NEXT: monitorx
+; BDVER4-NEXT: retq
+;
+; ZNVER1-LABEL: foo:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq (%rdi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: monitorx # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.monitorx(i8* %P, i32 %E, i32 %H)
+ ret void
+}
+declare void @llvm.x86.monitorx(i8*, i32, i32) nounwind
+
+define void @bar(i32 %E, i32 %H, i32 %C) nounwind {
+; GENERIC-LABEL: bar:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pushq %rbx # sched: [5:1.00]
+; GENERIC-NEXT: movl %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: movl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movl %edx, %ebx # sched: [1:0.33]
+; GENERIC-NEXT: mwaitx # sched: [100:0.33]
+; GENERIC-NEXT: popq %rbx # sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER4-LABEL: bar:
+; BDVER4: # %bb.0:
+; BDVER4-NEXT: pushq %rbx
+; BDVER4-NEXT: movl %edi, %ecx
+; BDVER4-NEXT: movl %esi, %eax
+; BDVER4-NEXT: movl %edx, %ebx
+; BDVER4-NEXT: mwaitx
+; BDVER4-NEXT: popq %rbx
+; BDVER4-NEXT: retq
+;
+; ZNVER1-LABEL: bar:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pushq %rbx # sched: [1:0.50]
+; ZNVER1-NEXT: movl %edi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: movl %esi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: movl %edx, %ebx # sched: [1:0.25]
+; ZNVER1-NEXT: mwaitx # sched: [100:?]
+; ZNVER1-NEXT: popq %rbx # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.mwaitx(i32 %E, i32 %H, i32 %C)
+ ret void
+}
+declare void @llvm.x86.mwaitx(i32, i32, i32) nounwind
diff --git a/test/CodeGen/X86/narrow-shl-cst.ll b/test/CodeGen/X86/narrow-shl-cst.ll
index c9e9a3d2a976..20d1641015ae 100644
--- a/test/CodeGen/X86/narrow-shl-cst.ll
+++ b/test/CodeGen/X86/narrow-shl-cst.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; PR5039
define i32 @test1(i32 %x) nounwind {
diff --git a/test/CodeGen/X86/narrow-shl-load.ll b/test/CodeGen/X86/narrow-shl-load.ll
index 9dc0d749cb23..542944c3fe37 100644
--- a/test/CodeGen/X86/narrow-shl-load.ll
+++ b/test/CodeGen/X86/narrow-shl-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-pc-linux-gnu"
diff --git a/test/CodeGen/X86/narrow_op-1.ll b/test/CodeGen/X86/narrow_op-1.ll
index 592ff94c57ba..96751abde28d 100644
--- a/test/CodeGen/X86/narrow_op-1.ll
+++ b/test/CodeGen/X86/narrow_op-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
%struct.bf = type { i64, i16, i16, i32 }
@bfi = common global %struct.bf zeroinitializer, align 16
diff --git a/test/CodeGen/X86/neg-shl-add.ll b/test/CodeGen/X86/neg-shl-add.ll
index 7aebc383ddeb..71d65074f845 100644
--- a/test/CodeGen/X86/neg-shl-add.ll
+++ b/test/CodeGen/X86/neg-shl-add.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | not grep negq
+; RUN: llc -mtriple=x86_64-- < %s | not grep negq
; These sequences don't need neg instructions; they can be done with
; a single shift and sub each.
diff --git a/test/CodeGen/X86/neg_cmp.ll b/test/CodeGen/X86/neg_cmp.ll
index cc82857706c0..47fa7fbb88f0 100644
--- a/test/CodeGen/X86/neg_cmp.ll
+++ b/test/CodeGen/X86/neg_cmp.ll
@@ -8,10 +8,10 @@ declare void @g()
define void @neg_cmp(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: neg_cmp:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addl %esi, %edi
; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # BB#2: # %if.then
+; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: jmp g # TAILCALL
; CHECK-NEXT: .LBB0_1: # %if.end
; CHECK-NEXT: retq
@@ -29,10 +29,10 @@ if.end:
define void @neg_cmp_commuted(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: neg_cmp_commuted:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addl %esi, %edi
; CHECK-NEXT: jne .LBB1_1
-; CHECK-NEXT: # BB#2: # %if.then
+; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: jmp g # TAILCALL
; CHECK-NEXT: .LBB1_1: # %if.end
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/neg_fp.ll b/test/CodeGen/X86/neg_fp.ll
index efb02f8832e6..9cfe686b277e 100644
--- a/test/CodeGen/X86/neg_fp.ll
+++ b/test/CodeGen/X86/neg_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse4.1 -o %t
; RUN: grep xorps %t | count 1
; Test that when we don't -enable-unsafe-fp-math, we don't do the optimization
diff --git a/test/CodeGen/X86/negate-add-zero.ll b/test/CodeGen/X86/negate-add-zero.ll
index 5911312053dd..64f20a6f81be 100644
--- a/test/CodeGen/X86/negate-add-zero.ll
+++ b/test/CodeGen/X86/negate-add-zero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-unsafe-fp-math -march=x86 | not grep xor
+; RUN: llc < %s -enable-unsafe-fp-math | not grep xor
; PR3374
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/negate-i1.ll b/test/CodeGen/X86/negate-i1.ll
index 13f831fd37b7..c9ca52b92758 100644
--- a/test/CodeGen/X86/negate-i1.ll
+++ b/test/CodeGen/X86/negate-i1.ll
@@ -4,14 +4,14 @@
define i8 @select_i8_neg1_or_0(i1 %a) {
; X64-LABEL: select_i8_neg1_or_0:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andb $1, %dil
; X64-NEXT: negb %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: select_i8_neg1_or_0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $1, %al
; X32-NEXT: negb %al
@@ -22,13 +22,13 @@ define i8 @select_i8_neg1_or_0(i1 %a) {
define i8 @select_i8_neg1_or_0_zeroext(i1 zeroext %a) {
; X64-LABEL: select_i8_neg1_or_0_zeroext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: negb %dil
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: select_i8_neg1_or_0_zeroext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: negb %al
; X32-NEXT: retl
@@ -38,18 +38,18 @@ define i8 @select_i8_neg1_or_0_zeroext(i1 zeroext %a) {
define i16 @select_i16_neg1_or_0(i1 %a) {
; X64-LABEL: select_i16_neg1_or_0:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $1, %edi
; X64-NEXT: negl %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: select_i16_neg1_or_0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: negl %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
%b = sext i1 %a to i16
ret i16 %b
@@ -57,17 +57,16 @@ define i16 @select_i16_neg1_or_0(i1 %a) {
define i16 @select_i16_neg1_or_0_zeroext(i1 zeroext %a) {
; X64-LABEL: select_i16_neg1_or_0_zeroext:
-; X64: # BB#0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: negl %eax
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64: # %bb.0:
+; X64-NEXT: negl %edi
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: select_i16_neg1_or_0_zeroext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: negl %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
%b = sext i1 %a to i16
ret i16 %b
@@ -75,14 +74,14 @@ define i16 @select_i16_neg1_or_0_zeroext(i1 zeroext %a) {
define i32 @select_i32_neg1_or_0(i1 %a) {
; X64-LABEL: select_i32_neg1_or_0:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $1, %edi
; X64-NEXT: negl %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: select_i32_neg1_or_0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: negl %eax
@@ -93,13 +92,13 @@ define i32 @select_i32_neg1_or_0(i1 %a) {
define i32 @select_i32_neg1_or_0_zeroext(i1 zeroext %a) {
; X64-LABEL: select_i32_neg1_or_0_zeroext:
-; X64: # BB#0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: negl %eax
+; X64: # %bb.0:
+; X64-NEXT: negl %edi
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
;
; X32-LABEL: select_i32_neg1_or_0_zeroext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: negl %eax
; X32-NEXT: retl
@@ -109,15 +108,15 @@ define i32 @select_i32_neg1_or_0_zeroext(i1 zeroext %a) {
define i64 @select_i64_neg1_or_0(i1 %a) {
; X64-LABEL: select_i64_neg1_or_0:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: andl $1, %edi
; X64-NEXT: negq %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
;
; X32-LABEL: select_i64_neg1_or_0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: negl %eax
@@ -129,13 +128,13 @@ define i64 @select_i64_neg1_or_0(i1 %a) {
define i64 @select_i64_neg1_or_0_zeroext(i1 zeroext %a) {
; X64-LABEL: select_i64_neg1_or_0_zeroext:
-; X64: # BB#0:
-; X64-NEXT: movzbl %dil, %eax
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: negq %rax
; X64-NEXT: retq
;
; X32-LABEL: select_i64_neg1_or_0_zeroext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: negl %eax
; X32-NEXT: movl %eax, %edx
diff --git a/test/CodeGen/X86/negate-shift.ll b/test/CodeGen/X86/negate-shift.ll
index cbe2f9456fa1..8804460f3805 100644
--- a/test/CodeGen/X86/negate-shift.ll
+++ b/test/CodeGen/X86/negate-shift.ll
@@ -3,7 +3,7 @@
define i32 @neg_lshr_signbit(i32 %x) {
; X64-LABEL: neg_lshr_signbit:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sarl $31, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -14,7 +14,7 @@ define i32 @neg_lshr_signbit(i32 %x) {
define i64 @neg_ashr_signbit(i64 %x) {
; X64-LABEL: neg_ashr_signbit:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shrq $63, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
@@ -25,7 +25,7 @@ define i64 @neg_ashr_signbit(i64 %x) {
define <4 x i32> @neg_ashr_signbit_vec(<4 x i32> %x) {
; X64-LABEL: neg_ashr_signbit_vec:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrld $31, %xmm0
; X64-NEXT: retq
%sh = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
@@ -35,7 +35,7 @@ define <4 x i32> @neg_ashr_signbit_vec(<4 x i32> %x) {
define <8 x i16> @neg_lshr_signbit_vec(<8 x i16> %x) {
; X64-LABEL: neg_lshr_signbit_vec:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psraw $15, %xmm0
; X64-NEXT: retq
%sh = lshr <8 x i16> %x, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
diff --git a/test/CodeGen/X86/negate.ll b/test/CodeGen/X86/negate.ll
index 5bdb11479afc..62e4dff4593c 100644
--- a/test/CodeGen/X86/negate.ll
+++ b/test/CodeGen/X86/negate.ll
@@ -3,7 +3,7 @@
define i32 @negate_nuw(i32 %x) {
; CHECK-LABEL: negate_nuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
%neg = sub nuw i32 0, %x
@@ -12,7 +12,7 @@ define i32 @negate_nuw(i32 %x) {
define <4 x i32> @negate_nuw_vec(<4 x i32> %x) {
; CHECK-LABEL: negate_nuw_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%neg = sub nuw <4 x i32> zeroinitializer, %x
@@ -21,7 +21,7 @@ define <4 x i32> @negate_nuw_vec(<4 x i32> %x) {
define i8 @negate_zero_or_minsigned_nsw(i8 %x) {
; CHECK-LABEL: negate_zero_or_minsigned_nsw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
%signbit = and i8 %x, 128
@@ -31,7 +31,7 @@ define i8 @negate_zero_or_minsigned_nsw(i8 %x) {
define <4 x i32> @negate_zero_or_minsigned_nsw_vec(<4 x i32> %x) {
; CHECK-LABEL: negate_zero_or_minsigned_nsw_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%signbit = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
@@ -41,7 +41,7 @@ define <4 x i32> @negate_zero_or_minsigned_nsw_vec(<4 x i32> %x) {
define i8 @negate_zero_or_minsigned(i8 %x) {
; CHECK-LABEL: negate_zero_or_minsigned:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shlb $7, %dil
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
@@ -52,7 +52,7 @@ define i8 @negate_zero_or_minsigned(i8 %x) {
define <4 x i32> @negate_zero_or_minsigned_vec(<4 x i32> %x) {
; CHECK-LABEL: negate_zero_or_minsigned_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%signbit = and <4 x i32> %x, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
diff --git a/test/CodeGen/X86/negative-sin.ll b/test/CodeGen/X86/negative-sin.ll
index 94369e3e8d0f..c30cd2741e6b 100644
--- a/test/CodeGen/X86/negative-sin.ll
+++ b/test/CodeGen/X86/negative-sin.ll
@@ -7,7 +7,7 @@ declare double @sin(double %f)
define double @strict(double %e) nounwind {
; CHECK-LABEL: strict:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vsubsd %xmm0, %xmm1, %xmm0
@@ -27,7 +27,7 @@ define double @strict(double %e) nounwind {
define double @fast(double %e) nounwind {
; CHECK-LABEL: fast:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: jmp sin # TAILCALL
%f = fsub fast double 0.0, %e
%g = call double @sin(double %f) readonly
@@ -39,7 +39,7 @@ define double @fast(double %e) nounwind {
define double @nsz(double %e) nounwind {
; CHECK-LABEL: nsz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: jmp sin # TAILCALL
%f = fsub nsz double 0.0, %e
%g = call double @sin(double %f) readonly
@@ -51,7 +51,7 @@ define double @nsz(double %e) nounwind {
define double @semi_strict1(double %e) nounwind {
; CHECK-LABEL: semi_strict1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vsubsd %xmm0, %xmm1, %xmm0
@@ -69,7 +69,7 @@ define double @semi_strict1(double %e) nounwind {
define double @semi_strict2(double %e) nounwind {
; CHECK-LABEL: semi_strict2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: callq sin
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
@@ -87,7 +87,7 @@ define double @semi_strict2(double %e) nounwind {
define double @fn_attr(double %e) nounwind #0 {
; CHECK-LABEL: fn_attr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: jmp sin # TAILCALL
%f = fsub double 0.0, %e
%g = call double @sin(double %f) readonly
diff --git a/test/CodeGen/X86/negative-stride-fptosi-user.ll b/test/CodeGen/X86/negative-stride-fptosi-user.ll
index 332e0b9cc6e1..d42b3444424c 100644
--- a/test/CodeGen/X86/negative-stride-fptosi-user.ll
+++ b/test/CodeGen/X86/negative-stride-fptosi-user.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep cvtsi2sd
+; RUN: llc < %s -mtriple=x86_64-- | grep cvtsi2sd
; LSR previously eliminated the sitofp by introducing an induction
; variable which stepped by a bogus ((double)UINT32_C(-1)). It's theoretically
diff --git a/test/CodeGen/X86/negative-subscript.ll b/test/CodeGen/X86/negative-subscript.ll
index f69157551b7a..f5dac9c5db15 100644
--- a/test/CodeGen/X86/negative-subscript.ll
+++ b/test/CodeGen/X86/negative-subscript.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; rdar://6559995
@a = external global [255 x i8*], align 32
diff --git a/test/CodeGen/X86/negative_zero.ll b/test/CodeGen/X86/negative_zero.ll
index c8c2cd753e08..534cfc67eea7 100644
--- a/test/CodeGen/X86/negative_zero.ll
+++ b/test/CodeGen/X86/negative_zero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=-sse2,-sse3 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse2,-sse3 | FileCheck %s
; CHECK: fchs
diff --git a/test/CodeGen/X86/no-cmov.ll b/test/CodeGen/X86/no-cmov.ll
index 8fc0f7075c0e..5a40f9d5ad69 100644
--- a/test/CodeGen/X86/no-cmov.ll
+++ b/test/CodeGen/X86/no-cmov.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -mcpu=i486 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -mcpu=i486 < %s | FileCheck %s
define i32 @test1(i32 %g, i32* %j) {
%tobool = icmp eq i32 %g, 0
diff --git a/test/CodeGen/X86/no-plt.ll b/test/CodeGen/X86/no-plt.ll
new file mode 100644
index 000000000000..d6383c2d7d14
--- /dev/null
+++ b/test/CodeGen/X86/no-plt.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic \
+; RUN: | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu \
+; RUN: | FileCheck -check-prefix=X64 %s
+
+define i32 @main() #0 {
+; X64: callq *_Z3foov@GOTPCREL(%rip)
+; X64: callq _Z3barv
+; X64: callq _Z3bazv
+
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval, align 4
+ %call1 = call i32 @_Z3foov()
+ %call2 = call i32 @_Z3barv()
+ %call3 = call i32 @_Z3bazv()
+ ret i32 0
+}
+
+; Function Attrs: nonlazybind
+declare i32 @_Z3foov() #1
+
+declare i32 @_Z3barv() #2
+
+; Function Attrs: nonlazybind
+declare hidden i32 @_Z3bazv() #3
+
+
+attributes #1 = { nonlazybind }
+attributes #3 = { nonlazybind }
diff --git a/test/CodeGen/X86/no-sse2-avg.ll b/test/CodeGen/X86/no-sse2-avg.ll
index e4b97c17047c..0472cc27d841 100644
--- a/test/CodeGen/X86/no-sse2-avg.ll
+++ b/test/CodeGen/X86/no-sse2-avg.ll
@@ -4,23 +4,9 @@
define <16 x i8> @PR27973() {
; CHECK-LABEL: PR27973:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb $0, 15(%rdi)
-; CHECK-NEXT: movb $0, 14(%rdi)
-; CHECK-NEXT: movb $0, 13(%rdi)
-; CHECK-NEXT: movb $0, 12(%rdi)
-; CHECK-NEXT: movb $0, 11(%rdi)
-; CHECK-NEXT: movb $0, 10(%rdi)
-; CHECK-NEXT: movb $0, 9(%rdi)
-; CHECK-NEXT: movb $0, 8(%rdi)
-; CHECK-NEXT: movb $0, 7(%rdi)
-; CHECK-NEXT: movb $0, 6(%rdi)
-; CHECK-NEXT: movb $0, 5(%rdi)
-; CHECK-NEXT: movb $0, 4(%rdi)
-; CHECK-NEXT: movb $0, 3(%rdi)
-; CHECK-NEXT: movb $0, 2(%rdi)
-; CHECK-NEXT: movb $0, 1(%rdi)
-; CHECK-NEXT: movb $0, (%rdi)
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq $0, 8(%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: retq
%t0 = zext <16 x i8> zeroinitializer to <16 x i32>
diff --git a/test/CodeGen/X86/nobt.ll b/test/CodeGen/X86/nobt.ll
index 35090e372916..b60723e9cfaa 100644
--- a/test/CodeGen/X86/nobt.ll
+++ b/test/CodeGen/X86/nobt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep btl
+; RUN: llc < %s -mtriple=i686-- | not grep btl
; This tests some cases where BT must not be generated. See also bt.ll.
; Fixes 20040709-[12].c in gcc testsuite.
diff --git a/test/CodeGen/X86/nocx16.ll b/test/CodeGen/X86/nocx16.ll
index 8b995dafa75a..f9a18b7022b3 100644
--- a/test/CodeGen/X86/nocx16.ll
+++ b/test/CodeGen/X86/nocx16.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=-cx16 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 -mattr=-cx16 | FileCheck %s
define void @test(i128* %a) nounwind {
entry:
; CHECK: __sync_val_compare_and_swap_16
diff --git a/test/CodeGen/X86/non-value-mem-operand.mir b/test/CodeGen/X86/non-value-mem-operand.mir
index 3e969a56170e..b537a637d8d8 100644
--- a/test/CodeGen/X86/non-value-mem-operand.mir
+++ b/test/CodeGen/X86/non-value-mem-operand.mir
@@ -175,14 +175,14 @@ body: |
successors: %bb.4.bb7(0x80000000)
liveins: %rax
- MOV64mr %rsp, 1, _, 32, _, %rax :: (store 8 into %stack.5)
+ MOV64mr %rsp, 1, %noreg, 32, %noreg, %rax :: (store 8 into %stack.5)
%r12 = MOV64rr killed %rax
%r12 = ADD64ri8 killed %r12, 16, implicit-def dead %eflags
%xmm0 = XORPSrr undef %xmm0, undef %xmm0
%esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags
%rax = MOV64ri %const.0
- %xmm1 = MOVSDrm killed %rax, 1, _, 0, _ :: (load 8 from constant-pool)
- MOVSDmr %rsp, 1, _, 40, _, killed %xmm1 :: (store 8 into %stack.4)
+ %xmm1 = MOVSDrm killed %rax, 1, %noreg, 0, %noreg :: (load 8 from constant-pool)
+ MOVSDmr %rsp, 1, %noreg, 40, %noreg, killed %xmm1 :: (store 8 into %stack.4)
%eax = IMPLICIT_DEF
%ecx = XOR32rr undef %ecx, undef %ecx, implicit-def dead %eflags
@@ -200,11 +200,11 @@ body: |
successors: %bb.6.bb26(0x80000000)
liveins: %ebp, %rbx, %r14, %xmm0
- MOV32mr %rsp, 1, _, 24, _, %ebx :: (store 4 into %stack.0, align 8)
- MOV32mr %rsp, 1, _, 16, _, %ebp :: (store 4 into %stack.1, align 8)
- MOVSDmr %rsp, 1, _, 8, _, killed %xmm0 :: (store 8 into %stack.2)
- %rax = MOV64rm %rsp, 1, _, 32, _ :: (load 8 from %stack.5)
- MOV64mr %rsp, 1, _, 48, _, killed %rax :: (store 8 into %stack.3)
+ MOV32mr %rsp, 1, %noreg, 24, %noreg, %ebx :: (store 4 into %stack.0, align 8)
+ MOV32mr %rsp, 1, %noreg, 16, %noreg, %ebp :: (store 4 into %stack.1, align 8)
+ MOVSDmr %rsp, 1, %noreg, 8, %noreg, killed %xmm0 :: (store 8 into %stack.2)
+ %rax = MOV64rm %rsp, 1, %noreg, 32, %noreg :: (load 8 from %stack.5)
+ MOV64mr %rsp, 1, %noreg, 48, %noreg, killed %rax :: (store 8 into %stack.3)
%rax = MOV64ri @wibble
STATEPOINT 2882400000, 0, 0, killed %rax, 2, 0, 2, 0, 2, 30, 2, 1, 2, 0, 2, 99, 2, 0, 2, 12, 2, 0, 2, 10, 1, 8, %rsp, 24, 2, 10, 2, 0, 2, 10, 1, 8, %rsp, 16, 2, 10, 2, 4278124286, 2, 6, 2, 4278124286, 2, 7, 1, 8, %rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 1, 8, %rsp, 48, 2, 7, 2, 4278124286, 2, 99, 2, 0, csr_64, implicit-def %rsp :: (volatile load 8 from %stack.0), (volatile load 8 from %stack.1), (volatile load 8 from %stack.2), (volatile load 8 from %stack.3)
%esi = XOR32rr undef %esi, undef %esi, implicit-def dead %eflags
@@ -215,16 +215,16 @@ body: |
liveins: %ebp, %esi, %rbx, %r12, %r14
%rax = MOV64ri @global.1
- %rax = MOV64rm killed %rax, 1, _, 0, _ :: (dereferenceable load 8 from @global.1)
+ %rax = MOV64rm killed %rax, 1, %noreg, 0, %noreg :: (dereferenceable load 8 from @global.1)
TEST64rr %rax, %rax, implicit-def %eflags
%rax = CMOVE64rr undef %rax, killed %rax, implicit killed %eflags
- %ecx = MOV32rm undef %rax, 1, _, 0, _ :: (load 4 from `i32* undef`)
- %rdx = MOV64rm %r12, 8, %r14, 0, _ :: (load 8 from %ir.tmp3)
- %r15 = LEA64r %rdx, 1, _, 1, _
- MOV64mr %r12, 8, %r14, 0, _, %r15 :: (store 8 into %ir.tmp3)
+ %ecx = MOV32rm undef %rax, 1, %noreg, 0, %noreg :: (load 4 from `i32* undef`)
+ %rdx = MOV64rm %r12, 8, %r14, 0, %noreg :: (load 8 from %ir.tmp3)
+ %r15 = LEA64r %rdx, 1, %noreg, 1, _
+ MOV64mr %r12, 8, %r14, 0, %noreg, %r15 :: (store 8 into %ir.tmp3)
%ecx = SUB32rr killed %ecx, %edx, implicit-def dead %eflags, implicit killed %rdx
- MOV32mr undef %rax, 1, _, 0, _, killed %ecx :: (store 4 into `i32* undef`)
- %r13 = MOV64rm killed %rax, 1, _, 768, _ :: (load 8 from %ir.tmp33)
+ MOV32mr undef %rax, 1, %noreg, 0, %noreg, killed %ecx :: (store 4 into `i32* undef`)
+ %r13 = MOV64rm killed %rax, 1, %noreg, 768, %noreg :: (load 8 from %ir.tmp33)
TEST8rr %sil, %sil, implicit-def %eflags
%rax = IMPLICIT_DEF
JNE_1 %bb.8.bb37, implicit %eflags
@@ -242,7 +242,7 @@ body: |
successors: %bb.9.bb37(0x40000000), %bb.10.bb37(0x40000000)
liveins: %ebp, %esi, %rax, %rbx, %r12, %r13, %r14, %r15
- %rcx = MOV64rm killed %rax, 1, _, 760, _ :: (load 8 from %ir.tmp40)
+ %rcx = MOV64rm killed %rax, 1, %noreg, 760, %noreg :: (load 8 from %ir.tmp40)
CMP64rr %r13, %rcx, implicit-def %eflags
JL_1 %bb.10.bb37, implicit %eflags
@@ -258,12 +258,12 @@ body: |
%cl = KILL %cl, implicit killed %rcx
%r15 = SAR64rCL killed %r15, implicit-def dead %eflags, implicit %cl
- MOV64mr %r12, 8, killed %r14, 0, _, killed %r15 :: (store 8 into %ir.tmp7)
- MOV64mi32 undef %rax, 1, _, 0, _, 0 :: (store 8 into `i64* undef`)
- %eax = LEA64_32r %rbx, 1, _, 1, _
+ MOV64mr %r12, 8, killed %r14, 0, %noreg, killed %r15 :: (store 8 into %ir.tmp7)
+ MOV64mi32 undef %rax, 1, %noreg, 0, %noreg, 0 :: (store 8 into `i64* undef`)
+ %eax = LEA64_32r %rbx, 1, %noreg, 1, _
%ecx = MOV32ri 6
CMP32ri %eax, 15141, implicit-def %eflags
- %xmm0 = MOVSDrm %rsp, 1, _, 40, _ :: (load 8 from %stack.4)
+ %xmm0 = MOVSDrm %rsp, 1, %noreg, 40, %noreg :: (load 8 from %stack.4)
JL_1 %bb.4.bb7, implicit %eflags
bb.11.bb51.loopexit:
@@ -273,14 +273,14 @@ body: |
%ebp = INC32r killed %ebp, implicit-def dead %eflags
%ebx = INC32r %ebx, implicit-def dead %eflags, implicit killed %rbx, implicit-def %rbx
%rax = MOV64ri %const.0
- %xmm0 = MOVSDrm killed %rax, 1, _, 0, _ :: (load 8 from constant-pool)
+ %xmm0 = MOVSDrm killed %rax, 1, %noreg, 0, %noreg :: (load 8 from constant-pool)
bb.12.bb51:
liveins: %ebp, %rbx, %xmm0
- MOV32mr %rsp, 1, _, 24, _, %ebx, implicit killed %rbx :: (store 4 into %stack.0, align 8)
- MOV32mr %rsp, 1, _, 16, _, killed %ebp :: (store 4 into %stack.1, align 8)
- MOVSDmr %rsp, 1, _, 8, _, killed %xmm0 :: (store 8 into %stack.2)
+ MOV32mr %rsp, 1, %noreg, 24, %noreg, %ebx, implicit killed %rbx :: (store 4 into %stack.0, align 8)
+ MOV32mr %rsp, 1, %noreg, 16, %noreg, killed %ebp :: (store 4 into %stack.1, align 8)
+ MOVSDmr %rsp, 1, %noreg, 8, %noreg, killed %xmm0 :: (store 8 into %stack.2)
%rax = MOV64ri @wobble
%edi = MOV32ri -121
STATEPOINT 2882400000, 0, 1, killed %rax, %edi, 2, 0, 2, 0, 2, 38, 2, 1, 2, 0, 2, 270, 2, 4, 2, 12, 2, 0, 2, 11, 2, 4278124286, 2, 99, 2, 0, 2, 10, 1, 8, %rsp, 24, 2, 6, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, 2, 10, 1, 8, %rsp, 16, 2, 10, 2, 4278124286, 2, 99, 2, 0, 2, 7, 1, 8, %rsp, 8, 2, 99, 2, 0, 2, 7, 2, 4278124286, 2, 99, 2, 0, 2, 13, 2, 4278124286, 2, 99, 2, 0, 2, 99, 2, 0, csr_64, implicit-def %rsp :: (volatile load 8 from %stack.0), (volatile load 8 from %stack.1), (volatile load 8 from %stack.2)
diff --git a/test/CodeGen/X86/nonconst-static-ev.ll b/test/CodeGen/X86/nonconst-static-ev.ll
index 5449791f3fa3..a0aa6152bd47 100644
--- a/test/CodeGen/X86/nonconst-static-ev.ll
+++ b/test/CodeGen/X86/nonconst-static-ev.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
+; RUN: not llc -mtriple=i686-linux-gnu < %s 2> %t
; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
@0 = global i8 extractvalue ([1 x i8] select (i1 ptrtoint (i32* @1 to i1), [1 x i8] [ i8 1 ], [1 x i8] [ i8 2 ]), 0)
diff --git a/test/CodeGen/X86/nonconst-static-iv.ll b/test/CodeGen/X86/nonconst-static-iv.ll
index 30613ef383a3..b1a03cf8b2e5 100644
--- a/test/CodeGen/X86/nonconst-static-iv.ll
+++ b/test/CodeGen/X86/nonconst-static-iv.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -march=x86 -mtriple=x86_64-linux-gnu < %s 2> %t
+; RUN: not llc -mtriple=i686-linux-gnu < %s 2> %t
; RUN: FileCheck --check-prefix=CHECK-ERRORS < %t %s
@0 = global i8 insertvalue( { i8 } select (i1 ptrtoint (i32* @1 to i1), { i8 } { i8 1 }, { i8 } { i8 2 }), i8 0, 0)
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index 337e625df168..47c1f7c0fbf7 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -13,19 +13,19 @@
define void @test_zero_f32(float* %dst) {
; SSE-LABEL: test_zero_f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntil %eax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
@@ -35,19 +35,19 @@ define void @test_zero_f32(float* %dst) {
define void @test_zero_i32(i32* %dst) {
; SSE-LABEL: test_zero_i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntil %eax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
@@ -57,19 +57,19 @@ define void @test_zero_i32(i32* %dst) {
define void @test_zero_f64(double* %dst) {
; SSE-LABEL: test_zero_f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
@@ -79,19 +79,19 @@ define void @test_zero_f64(double* %dst) {
define void @test_zero_i64(i64* %dst) {
; SSE-LABEL: test_zero_i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorl %eax, %eax
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: xorl %eax, %eax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
@@ -103,19 +103,19 @@ define void @test_zero_i64(i64* %dst) {
define void @test_zero_v4f32(<4 x float>* %dst) {
; SSE-LABEL: test_zero_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -125,19 +125,19 @@ define void @test_zero_v4f32(<4 x float>* %dst) {
define void @test_zero_v4i32(<4 x i32>* %dst) {
; SSE-LABEL: test_zero_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -148,19 +148,19 @@ define void @test_zero_v4i32(<4 x i32>* %dst) {
define void @test_zero_v2f64(<2 x double>* %dst) {
; SSE-LABEL: test_zero_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v2f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -170,19 +170,19 @@ define void @test_zero_v2f64(<2 x double>* %dst) {
define void @test_zero_v2i64(<2 x i64>* %dst) {
; SSE-LABEL: test_zero_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v2i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -192,19 +192,19 @@ define void @test_zero_v2i64(<2 x i64>* %dst) {
define void @test_zero_v8i16(<8 x i16>* %dst) {
; SSE-LABEL: test_zero_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8i16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -214,19 +214,19 @@ define void @test_zero_v8i16(<8 x i16>* %dst) {
define void @test_zero_v16i8(<16 x i8>* %dst) {
; SSE-LABEL: test_zero_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v16i8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -238,22 +238,22 @@ define void @test_zero_v16i8(<16 x i8>* %dst) {
define void @test_zero_v8f32(<8 x float>* %dst) {
; SSE-LABEL: test_zero_v8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f32:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8f32:
-; VLX: # BB#0:
-; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -263,22 +263,22 @@ define void @test_zero_v8f32(<8 x float>* %dst) {
define void @test_zero_v8i32(<8 x i32>* %dst) {
; SSE-LABEL: test_zero_v8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i32:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v8i32:
-; VLX: # BB#0:
-; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -288,22 +288,22 @@ define void @test_zero_v8i32(<8 x i32>* %dst) {
define void @test_zero_v4f64(<4 x double>* %dst) {
; SSE-LABEL: test_zero_v4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f64:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4f64:
-; VLX: # BB#0:
-; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -313,22 +313,22 @@ define void @test_zero_v4f64(<4 x double>* %dst) {
define void @test_zero_v4i64(<4 x i64>* %dst) {
; SSE-LABEL: test_zero_v4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i64:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v4i64:
-; VLX: # BB#0:
-; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -338,22 +338,22 @@ define void @test_zero_v4i64(<4 x i64>* %dst) {
define void @test_zero_v16i16(<16 x i16>* %dst) {
; SSE-LABEL: test_zero_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i16:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v16i16:
-; VLX: # BB#0:
-; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -363,22 +363,22 @@ define void @test_zero_v16i16(<16 x i16>* %dst) {
define void @test_zero_v32i8(<32 x i8>* %dst) {
; SSE-LABEL: test_zero_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i8:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_zero_v32i8:
-; VLX: # BB#0:
-; VLX-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; VLX: # %bb.0:
+; VLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -393,27 +393,27 @@ define void @test_zero_v32i8(<32 x i8>* %dst) {
define void @test_arg_f32(float %arg, float* %dst) {
; SSE2-LABEL: test_arg_f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_arg_f32:
-; SSE4A: # BB#0:
+; SSE4A: # %bb.0:
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_arg_f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movss %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovss %xmm0, (%rdi)
; VLX-NEXT: retq
store float %arg, float* %dst, align 1, !nontemporal !1
@@ -422,17 +422,17 @@ define void @test_arg_f32(float %arg, float* %dst) {
define void @test_arg_i32(i32 %arg, i32* %dst) {
; SSE-LABEL: test_arg_i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntil %edi, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movntil %edi, (%rsi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: movntil %edi, (%rsi)
; VLX-NEXT: retq
store i32 %arg, i32* %dst, align 1, !nontemporal !1
@@ -441,27 +441,27 @@ define void @test_arg_i32(i32 %arg, i32* %dst) {
define void @test_arg_f64(double %arg, double* %dst) {
; SSE2-LABEL: test_arg_f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_arg_f64:
-; SSE4A: # BB#0:
+; SSE4A: # %bb.0:
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_arg_f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovsd %xmm0, (%rdi)
; VLX-NEXT: retq
store double %arg, double* %dst, align 1, !nontemporal !1
@@ -470,17 +470,17 @@ define void @test_arg_f64(double %arg, double* %dst) {
define void @test_arg_i64(i64 %arg, i64* %dst) {
; SSE-LABEL: test_arg_i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntiq %rdi, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movntiq %rdi, (%rsi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: movntiq %rdi, (%rsi)
; VLX-NEXT: retq
store i64 %arg, i64* %dst, align 1, !nontemporal !1
@@ -491,31 +491,31 @@ define void @test_arg_i64(i64 %arg, i64* %dst) {
define void @test_extract_f32(<4 x float> %arg, float* %dst) {
; SSE2-LABEL: test_extract_f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movss %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_f32:
-; SSE4A: # BB#0:
+; SSE4A: # %bb.0:
; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: extractps $1, %xmm0, %eax
; SSE41-NEXT: movntil %eax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vextractps $1, %xmm0, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
@@ -526,34 +526,34 @@ define void @test_extract_f32(<4 x float> %arg, float* %dst) {
define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
; SSE2-LABEL: test_extract_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: movntil %eax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i32:
-; SSE4A: # BB#0:
+; SSE4A: # %bb.0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE4A-NEXT: movd %xmm0, %eax
; SSE4A-NEXT: movntil %eax, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: pextrd $1, %xmm0, %eax
+; SSE41: # %bb.0:
+; SSE41-NEXT: extractps $1, %xmm0, %eax
; SSE41-NEXT: movntil %eax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_i32:
-; AVX: # BB#0:
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
+; AVX: # %bb.0:
+; AVX-NEXT: vextractps $1, %xmm0, %eax
; AVX-NEXT: movntil %eax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_i32:
-; VLX: # BB#0:
-; VLX-NEXT: vpextrd $1, %xmm0, %eax
+; VLX: # %bb.0:
+; VLX-NEXT: vextractps $1, %xmm0, %eax
; VLX-NEXT: movntil %eax, (%rdi)
; VLX-NEXT: retq
%1 = extractelement <4 x i32> %arg, i32 1
@@ -563,28 +563,28 @@ define void @test_extract_i32(<4 x i32> %arg, i32* %dst) {
define void @test_extract_f64(<2 x double> %arg, double* %dst) {
; SSE2-LABEL: test_extract_f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movhpd %xmm0, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_f64:
-; SSE4A: # BB#0:
+; SSE4A: # %bb.0:
; SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movhpd %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovhpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovhpd %xmm0, (%rdi)
; VLX-NEXT: retq
%1 = extractelement <2 x double> %arg, i32 1
@@ -594,33 +594,33 @@ define void @test_extract_f64(<2 x double> %arg, double* %dst) {
define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
; SSE2-LABEL: test_extract_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movntiq %rax, (%rdi)
; SSE2-NEXT: retq
;
; SSE4A-LABEL: test_extract_i64:
-; SSE4A: # BB#0:
+; SSE4A: # %bb.0:
; SSE4A-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE4A-NEXT: movq %xmm0, %rax
; SSE4A-NEXT: movntiq %rax, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movntiq %rax, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_extract_i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpextrq $1, %xmm0, %rax
; VLX-NEXT: movntiq %rax, (%rdi)
; VLX-NEXT: retq
@@ -633,17 +633,17 @@ define void @test_extract_i64(<2 x i64> %arg, i64* %dst) {
define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
; SSE-LABEL: test_arg_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <4 x float> %arg, <4 x float>* %dst, align 16, !nontemporal !1
@@ -652,17 +652,17 @@ define void @test_arg_v4f32(<4 x float> %arg, <4 x float>* %dst) {
define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
; SSE-LABEL: test_arg_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
@@ -671,17 +671,17 @@ define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
; SSE-LABEL: test_arg_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v2f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
@@ -690,17 +690,17 @@ define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
; SSE-LABEL: test_arg_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v2i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
@@ -709,17 +709,17 @@ define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
; SSE-LABEL: test_arg_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8i16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
@@ -728,17 +728,17 @@ define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
; SSE-LABEL: test_arg_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v16i8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
@@ -749,19 +749,19 @@ define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
; SSE-LABEL: test_arg_v8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -771,19 +771,19 @@ define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
; SSE-LABEL: test_arg_v8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v8i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -793,19 +793,19 @@ define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
; SSE-LABEL: test_arg_v4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -815,19 +815,19 @@ define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
; SSE-LABEL: test_arg_v4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v4i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -837,19 +837,19 @@ define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
; SSE-LABEL: test_arg_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v16i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v16i16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -859,19 +859,19 @@ define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
; SSE-LABEL: test_arg_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movntps %xmm1, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_arg_v32i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_arg_v32i8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
@@ -885,19 +885,19 @@ define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
; SSE-LABEL: test_op_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm1, %xmm0
; SSE-NEXT: movntps %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -908,19 +908,19 @@ define void @test_op_v4f32(<4 x float> %a, <4 x float> %b, <4 x float>* %dst) {
define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
; SSE-LABEL: test_op_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -931,19 +931,19 @@ define void @test_op_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %dst) {
define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst) {
; SSE-LABEL: test_op_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm1, %xmm0
; SSE-NEXT: movntpd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntpd %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v2f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntpd %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -954,19 +954,19 @@ define void @test_op_v2f64(<2 x double> %a, <2 x double> %b, <2 x double>* %dst)
define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
; SSE-LABEL: test_op_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v2i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -977,19 +977,19 @@ define void @test_op_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64>* %dst) {
define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
; SSE-LABEL: test_op_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v8i16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -1000,19 +1000,19 @@ define void @test_op_v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16>* %dst) {
define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
; SSE-LABEL: test_op_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm1, %xmm0
; SSE-NEXT: movntdq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v16i8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; VLX-NEXT: vmovntdq %xmm0, (%rdi)
; VLX-NEXT: retq
@@ -1025,7 +1025,7 @@ define void @test_op_v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8>* %dst) {
define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_op_v8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movntps %xmm1, 16(%rdi)
@@ -1033,14 +1033,14 @@ define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovntps %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v8f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntps %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
@@ -1052,7 +1052,7 @@ define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; SSE-LABEL: test_op_v8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
@@ -1060,7 +1060,7 @@ define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
@@ -1071,14 +1071,14 @@ define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v8i32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
@@ -1090,7 +1090,7 @@ define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst) {
; SSE-LABEL: test_op_v4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addpd %xmm2, %xmm0
; SSE-NEXT: addpd %xmm3, %xmm1
; SSE-NEXT: movntpd %xmm1, 16(%rdi)
@@ -1098,14 +1098,14 @@ define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst)
; SSE-NEXT: retq
;
; AVX-LABEL: test_op_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovntpd %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_op_v4f64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntpd %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
@@ -1117,7 +1117,7 @@ define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst)
define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; SSE-LABEL: test_op_v4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
@@ -1125,7 +1125,7 @@ define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
@@ -1136,14 +1136,14 @@ define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v4i64:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
@@ -1155,7 +1155,7 @@ define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; SSE-LABEL: test_op_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
@@ -1163,7 +1163,7 @@ define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2
@@ -1174,14 +1174,14 @@ define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v16i16:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
@@ -1193,7 +1193,7 @@ define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; SSE-LABEL: test_op_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddb %xmm2, %xmm0
; SSE-NEXT: paddb %xmm3, %xmm1
; SSE-NEXT: movntdq %xmm1, 16(%rdi)
@@ -1201,7 +1201,7 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_op_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
@@ -1212,14 +1212,14 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_op_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovntdq %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; VLX-LABEL: test_op_v32i8:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovntdq %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
@@ -1235,7 +1235,7 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
; probably always worth even some 20 instruction scalarization.
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_unaligned_v8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addps %xmm2, %xmm0
; SSE-NEXT: addps %xmm3, %xmm1
; SSE-NEXT: movntps %xmm1, 16(%rdi)
@@ -1243,14 +1243,14 @@ define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_unaligned_v8f32:
-; VLX: # BB#0:
+; VLX: # %bb.0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
; VLX-NEXT: vmovups %ymm0, (%rdi)
; VLX-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
index 3c916fd38c6c..308395d365cc 100644
--- a/test/CodeGen/X86/nontemporal-loads.ll
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -9,22 +9,22 @@
define <4 x float> @test_v4f32(<4 x float>* %src) {
; SSE2-LABEL: test_v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
@@ -33,22 +33,22 @@ define <4 x float> @test_v4f32(<4 x float>* %src) {
define <4 x i32> @test_v4i32(<4 x i32>* %src) {
; SSE2-LABEL: test_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %src, align 16, !nontemporal !1
@@ -57,22 +57,22 @@ define <4 x i32> @test_v4i32(<4 x i32>* %src) {
define <2 x double> @test_v2f64(<2 x double>* %src) {
; SSE2-LABEL: test_v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
@@ -81,22 +81,22 @@ define <2 x double> @test_v2f64(<2 x double>* %src) {
define <2 x i64> @test_v2i64(<2 x i64>* %src) {
; SSE2-LABEL: test_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
@@ -105,22 +105,22 @@ define <2 x i64> @test_v2i64(<2 x i64>* %src) {
define <8 x i16> @test_v8i16(<8 x i16>* %src) {
; SSE2-LABEL: test_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
@@ -129,22 +129,22 @@ define <8 x i16> @test_v8i16(<8 x i16>* %src) {
define <16 x i8> @test_v16i8(<16 x i8>* %src) {
; SSE2-LABEL: test_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
@@ -155,31 +155,31 @@ define <16 x i8> @test_v16i8(<16 x i8>* %src) {
define <8 x float> @test_v8f32(<8 x float>* %src) {
; SSE2-LABEL: test_v8f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
@@ -188,74 +188,64 @@ define <8 x float> @test_v8f32(<8 x float>* %src) {
define <8 x i32> @test_v8i32(<8 x i32>* %src) {
; SSE2-LABEL: test_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: test_v8i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
+; AVX512-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1
ret <8 x i32> %1
}

define <4 x double> @test_v4f64(<4 x double>* %src) {
; SSE2-LABEL: test_v4f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
@@ -264,31 +254,31 @@ define <4 x double> @test_v4f64(<4 x double>* %src) {
define <4 x i64> @test_v4i64(<4 x i64>* %src) {
; SSE2-LABEL: test_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
@@ -297,31 +287,31 @@ define <4 x i64> @test_v4i64(<4 x i64>* %src) {
define <16 x i16> @test_v16i16(<16 x i16>* %src) {
; SSE2-LABEL: test_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
@@ -330,31 +320,31 @@ define <16 x i16> @test_v16i16(<16 x i16>* %src) {
define <32 x i8> @test_v32i8(<32 x i8>* %src) {
; SSE2-LABEL: test_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
@@ -365,7 +355,7 @@ define <32 x i8> @test_v32i8(<32 x i8>* %src) {
define <16 x float> @test_v16f32(<16 x float>* %src) {
; SSE2-LABEL: test_v16f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -373,7 +363,7 @@ define <16 x float> @test_v16f32(<16 x float>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -381,7 +371,7 @@ define <16 x float> @test_v16f32(<16 x float>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -391,13 +381,13 @@ define <16 x float> @test_v16f32(<16 x float>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
@@ -406,7 +396,7 @@ define <16 x float> @test_v16f32(<16 x float>* %src) {
define <16 x i32> @test_v16i32(<16 x i32>* %src) {
; SSE2-LABEL: test_v16i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -414,7 +404,7 @@ define <16 x i32> @test_v16i32(<16 x i32>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v16i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -422,7 +412,7 @@ define <16 x i32> @test_v16i32(<16 x i32>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -432,13 +422,13 @@ define <16 x i32> @test_v16i32(<16 x i32>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
@@ -447,7 +437,7 @@ define <16 x i32> @test_v16i32(<16 x i32>* %src) {
define <8 x double> @test_v8f64(<8 x double>* %src) {
; SSE2-LABEL: test_v8f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -455,7 +445,7 @@ define <8 x double> @test_v8f64(<8 x double>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -463,7 +453,7 @@ define <8 x double> @test_v8f64(<8 x double>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -473,13 +463,13 @@ define <8 x double> @test_v8f64(<8 x double>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
@@ -488,7 +478,7 @@ define <8 x double> @test_v8f64(<8 x double>* %src) {
define <8 x i64> @test_v8i64(<8 x i64>* %src) {
; SSE2-LABEL: test_v8i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -496,7 +486,7 @@ define <8 x i64> @test_v8i64(<8 x i64>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v8i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -504,7 +494,7 @@ define <8 x i64> @test_v8i64(<8 x i64>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -514,13 +504,13 @@ define <8 x i64> @test_v8i64(<8 x i64>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
@@ -529,7 +519,7 @@ define <8 x i64> @test_v8i64(<8 x i64>* %src) {
define <32 x i16> @test_v32i16(<32 x i16>* %src) {
; SSE2-LABEL: test_v32i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -537,7 +527,7 @@ define <32 x i16> @test_v32i16(<32 x i16>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v32i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -545,7 +535,7 @@ define <32 x i16> @test_v32i16(<32 x i16>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -555,24 +545,24 @@ define <32 x i16> @test_v32i16(<32 x i16>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: retq
@@ -582,7 +572,7 @@ define <32 x i16> @test_v32i16(<32 x i16>* %src) {
define <64 x i8> @test_v64i8(<64 x i8>* %src) {
; SSE2-LABEL: test_v64i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: movaps 32(%rdi), %xmm2
@@ -590,7 +580,7 @@ define <64 x i8> @test_v64i8(<64 x i8>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_v64i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm0
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
@@ -598,7 +588,7 @@ define <64 x i8> @test_v64i8(<64 x i8>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm0
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -608,24 +598,24 @@ define <64 x i8> @test_v64i8(<64 x i8>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: retq
@@ -638,24 +628,24 @@ define <64 x i8> @test_v64i8(<64 x i8>* %src) {
define <4 x float> @test_arg_v4f32(<4 x float> %arg, <4 x float>* %src) {
; SSE2-LABEL: test_arg_v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: addps (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v4f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm1
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm1
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_arg_v4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm1
; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -666,24 +656,24 @@ define <4 x float> @test_arg_v4f32(<4 x float> %arg, <4 x float>* %src) {
define <4 x i32> @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %src) {
; SSE2-LABEL: test_arg_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddd (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm1
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_arg_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -694,24 +684,24 @@ define <4 x i32> @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %src) {
define <2 x double> @test_arg_v2f64(<2 x double> %arg, <2 x double>* %src) {
; SSE2-LABEL: test_arg_v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: addpd (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v2f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm1
; SSE41-NEXT: addpd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm1
; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_arg_v2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm1
; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -722,24 +712,24 @@ define <2 x double> @test_arg_v2f64(<2 x double> %arg, <2 x double>* %src) {
define <2 x i64> @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %src) {
; SSE2-LABEL: test_arg_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddq (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm1
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm1
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_arg_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm1
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -750,24 +740,24 @@ define <2 x i64> @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %src) {
define <8 x i16> @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %src) {
; SSE2-LABEL: test_arg_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddw (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm1
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_arg_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -778,24 +768,24 @@ define <8 x i16> @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %src) {
define <16 x i8> @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %src) {
; SSE2-LABEL: test_arg_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddb (%rdi), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa (%rdi), %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovntdqa (%rdi), %xmm1
; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_arg_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
@@ -808,13 +798,13 @@ define <16 x i8> @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %src) {
define <8 x float> @test_arg_v8f32(<8 x float> %arg, <8 x float>* %src) {
; SSE2-LABEL: test_arg_v8f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: addps (%rdi), %xmm0
; SSE2-NEXT: addps 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v8f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 16(%rdi), %xmm2
; SSE41-NEXT: movntdqa (%rdi), %xmm3
; SSE41-NEXT: addps %xmm3, %xmm0
@@ -822,7 +812,7 @@ define <8 x float> @test_arg_v8f32(<8 x float> %arg, <8 x float>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -830,13 +820,13 @@ define <8 x float> @test_arg_v8f32(<8 x float> %arg, <8 x float>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm1
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm1
; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -847,13 +837,13 @@ define <8 x float> @test_arg_v8f32(<8 x float> %arg, <8 x float>* %src) {
define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
; SSE2-LABEL: test_arg_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddd (%rdi), %xmm0
; SSE2-NEXT: paddd 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 16(%rdi), %xmm2
; SSE41-NEXT: movntdqa (%rdi), %xmm3
; SSE41-NEXT: paddd %xmm3, %xmm0
@@ -861,7 +851,7 @@ define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -871,27 +861,16 @@ define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: test_arg_v8i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovntdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: test_arg_v8i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_arg_v8i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpaddd (%rdi), %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_arg_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovntdqa (%rdi), %ymm1
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1
%2 = add <8 x i32> %arg, %1
ret <8 x i32> %2
@@ -899,13 +878,13 @@ define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
define <4 x double> @test_arg_v4f64(<4 x double> %arg, <4 x double>* %src) {
; SSE2-LABEL: test_arg_v4f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: addpd (%rdi), %xmm0
; SSE2-NEXT: addpd 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v4f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 16(%rdi), %xmm2
; SSE41-NEXT: movntdqa (%rdi), %xmm3
; SSE41-NEXT: addpd %xmm3, %xmm0
@@ -913,7 +892,7 @@ define <4 x double> @test_arg_v4f64(<4 x double> %arg, <4 x double>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
@@ -921,13 +900,13 @@ define <4 x double> @test_arg_v4f64(<4 x double> %arg, <4 x double>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm1
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm1
; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -938,13 +917,13 @@ define <4 x double> @test_arg_v4f64(<4 x double> %arg, <4 x double>* %src) {
define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
; SSE2-LABEL: test_arg_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddq (%rdi), %xmm0
; SSE2-NEXT: paddq 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 16(%rdi), %xmm2
; SSE41-NEXT: movntdqa (%rdi), %xmm3
; SSE41-NEXT: paddq %xmm3, %xmm0
@@ -952,7 +931,7 @@ define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -962,13 +941,13 @@ define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm1
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -979,13 +958,13 @@ define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
; SSE2-LABEL: test_arg_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddw (%rdi), %xmm0
; SSE2-NEXT: paddw 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v16i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 16(%rdi), %xmm2
; SSE41-NEXT: movntdqa (%rdi), %xmm3
; SSE41-NEXT: paddw %xmm3, %xmm0
@@ -993,7 +972,7 @@ define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1003,13 +982,13 @@ define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm1
; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -1020,13 +999,13 @@ define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
; SSE2-LABEL: test_arg_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddb (%rdi), %xmm0
; SSE2-NEXT: paddb 16(%rdi), %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v32i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 16(%rdi), %xmm2
; SSE41-NEXT: movntdqa (%rdi), %xmm3
; SSE41-NEXT: paddb %xmm3, %xmm0
@@ -1034,7 +1013,7 @@ define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa (%rdi), %xmm1
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1044,13 +1023,13 @@ define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa (%rdi), %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %ymm1
; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -1063,7 +1042,7 @@ define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
; SSE2-LABEL: test_arg_v16f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: addps (%rdi), %xmm0
; SSE2-NEXT: addps 16(%rdi), %xmm1
; SSE2-NEXT: addps 32(%rdi), %xmm2
@@ -1071,7 +1050,7 @@ define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v16f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
; SSE41-NEXT: movntdqa 32(%rdi), %xmm5
; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
@@ -1083,7 +1062,7 @@ define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v16f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
@@ -1095,7 +1074,7 @@ define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v16f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
; AVX2-NEXT: vaddps %ymm3, %ymm0, %ymm0
@@ -1103,7 +1082,7 @@ define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v16f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
@@ -1114,7 +1093,7 @@ define <16 x float> @test_arg_v16f32(<16 x float> %arg, <16 x float>* %src) {
define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
; SSE2-LABEL: test_arg_v16i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddd (%rdi), %xmm0
; SSE2-NEXT: paddd 16(%rdi), %xmm1
; SSE2-NEXT: paddd 32(%rdi), %xmm2
@@ -1122,7 +1101,7 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v16i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
; SSE41-NEXT: movntdqa 32(%rdi), %xmm5
; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
@@ -1134,7 +1113,7 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
@@ -1150,7 +1129,7 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
@@ -1158,7 +1137,7 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
@@ -1169,7 +1148,7 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
; SSE2-LABEL: test_arg_v8f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: addpd (%rdi), %xmm0
; SSE2-NEXT: addpd 16(%rdi), %xmm1
; SSE2-NEXT: addpd 32(%rdi), %xmm2
@@ -1177,7 +1156,7 @@ define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v8f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
; SSE41-NEXT: movntdqa 32(%rdi), %xmm5
; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
@@ -1189,7 +1168,7 @@ define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v8f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
@@ -1201,7 +1180,7 @@ define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
@@ -1209,7 +1188,7 @@ define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v8f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
@@ -1220,7 +1199,7 @@ define <8 x double> @test_arg_v8f64(<8 x double> %arg, <8 x double>* %src) {
define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
; SSE2-LABEL: test_arg_v8i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddq (%rdi), %xmm0
; SSE2-NEXT: paddq 16(%rdi), %xmm1
; SSE2-NEXT: paddq 32(%rdi), %xmm2
@@ -1228,7 +1207,7 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v8i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
; SSE41-NEXT: movntdqa 32(%rdi), %xmm5
; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
@@ -1240,7 +1219,7 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
@@ -1256,7 +1235,7 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
@@ -1264,7 +1243,7 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_arg_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
@@ -1275,7 +1254,7 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
; SSE2-LABEL: test_arg_v32i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddw (%rdi), %xmm0
; SSE2-NEXT: paddw 16(%rdi), %xmm1
; SSE2-NEXT: paddw 32(%rdi), %xmm2
@@ -1283,7 +1262,7 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v32i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
; SSE41-NEXT: movntdqa 32(%rdi), %xmm5
; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
@@ -1295,7 +1274,7 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
@@ -1311,7 +1290,7 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
; AVX2-NEXT: vpaddw %ymm3, %ymm0, %ymm0
@@ -1319,7 +1298,7 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_arg_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm3
; AVX512F-NEXT: vpaddw %ymm3, %ymm0, %ymm0
@@ -1327,13 +1306,13 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_arg_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_arg_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm3
; AVX512VL-NEXT: vpaddw %ymm3, %ymm0, %ymm0
@@ -1346,7 +1325,7 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
; SSE2-LABEL: test_arg_v64i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: paddb (%rdi), %xmm0
; SSE2-NEXT: paddb 16(%rdi), %xmm1
; SSE2-NEXT: paddb 32(%rdi), %xmm2
@@ -1354,7 +1333,7 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_arg_v64i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movntdqa 48(%rdi), %xmm4
; SSE41-NEXT: movntdqa 32(%rdi), %xmm5
; SSE41-NEXT: movntdqa 16(%rdi), %xmm6
@@ -1366,7 +1345,7 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_arg_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3
; AVX1-NEXT: vmovntdqa (%rdi), %xmm4
@@ -1382,7 +1361,7 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_arg_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
; AVX2-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -1390,7 +1369,7 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_arg_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm3
; AVX512F-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -1398,13 +1377,13 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_arg_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_arg_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm2
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm3
; AVX512VL-NEXT: vpaddb %ymm3, %ymm0, %ymm0
@@ -1420,17 +1399,17 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
define <4 x float> @test_unaligned_v4f32(<4 x float>* %src) {
; SSE-LABEL: test_unaligned_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
@@ -1439,17 +1418,17 @@ define <4 x float> @test_unaligned_v4f32(<4 x float>* %src) {
define <4 x i32> @test_unaligned_v4i32(<4 x i32>* %src) {
; SSE-LABEL: test_unaligned_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %src, align 1, !nontemporal !1
@@ -1458,17 +1437,17 @@ define <4 x i32> @test_unaligned_v4i32(<4 x i32>* %src) {
define <2 x double> @test_unaligned_v2f64(<2 x double>* %src) {
; SSE-LABEL: test_unaligned_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double>* %src, align 1, !nontemporal !1
@@ -1477,17 +1456,17 @@ define <2 x double> @test_unaligned_v2f64(<2 x double>* %src) {
define <2 x i64> @test_unaligned_v2i64(<2 x i64>* %src) {
; SSE-LABEL: test_unaligned_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %src, align 1, !nontemporal !1
@@ -1496,17 +1475,17 @@ define <2 x i64> @test_unaligned_v2i64(<2 x i64>* %src) {
define <8 x i16> @test_unaligned_v8i16(<8 x i16>* %src) {
; SSE-LABEL: test_unaligned_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %src, align 1, !nontemporal !1
@@ -1515,17 +1494,17 @@ define <8 x i16> @test_unaligned_v8i16(<8 x i16>* %src) {
define <16 x i8> @test_unaligned_v16i8(<16 x i8>* %src) {
; SSE-LABEL: test_unaligned_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %xmm0
; AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %src, align 1, !nontemporal !1
@@ -1536,18 +1515,18 @@ define <16 x i8> @test_unaligned_v16i8(<16 x i8>* %src) {
define <8 x float> @test_unaligned_v8f32(<8 x float>* %src) {
; SSE-LABEL: test_unaligned_v8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float>* %src, align 1, !nontemporal !1
@@ -1556,18 +1535,18 @@ define <8 x float> @test_unaligned_v8f32(<8 x float>* %src) {
define <8 x i32> @test_unaligned_v8i32(<8 x i32>* %src) {
; SSE-LABEL: test_unaligned_v8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <8 x i32>, <8 x i32>* %src, align 1, !nontemporal !1
@@ -1576,18 +1555,18 @@ define <8 x i32> @test_unaligned_v8i32(<8 x i32>* %src) {
define <4 x double> @test_unaligned_v4f64(<4 x double>* %src) {
; SSE-LABEL: test_unaligned_v4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double>* %src, align 1, !nontemporal !1
@@ -1596,18 +1575,18 @@ define <4 x double> @test_unaligned_v4f64(<4 x double>* %src) {
define <4 x i64> @test_unaligned_v4i64(<4 x i64>* %src) {
; SSE-LABEL: test_unaligned_v4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <4 x i64>, <4 x i64>* %src, align 1, !nontemporal !1
@@ -1616,18 +1595,18 @@ define <4 x i64> @test_unaligned_v4i64(<4 x i64>* %src) {
define <16 x i16> @test_unaligned_v16i16(<16 x i16>* %src) {
; SSE-LABEL: test_unaligned_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v16i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <16 x i16>, <16 x i16>* %src, align 1, !nontemporal !1
@@ -1636,18 +1615,18 @@ define <16 x i16> @test_unaligned_v16i16(<16 x i16>* %src) {
define <32 x i8> @test_unaligned_v32i8(<32 x i8>* %src) {
; SSE-LABEL: test_unaligned_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v32i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %ymm0
; AVX512-NEXT: retq
%1 = load <32 x i8>, <32 x i8>* %src, align 1, !nontemporal !1
@@ -1658,7 +1637,7 @@ define <32 x i8> @test_unaligned_v32i8(<32 x i8>* %src) {
define <16 x float> @test_unaligned_v16f32(<16 x float>* %src) {
; SSE-LABEL: test_unaligned_v16f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -1666,13 +1645,13 @@ define <16 x float> @test_unaligned_v16f32(<16 x float>* %src) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v16f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v16f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <16 x float>, <16 x float>* %src, align 1, !nontemporal !1
@@ -1681,7 +1660,7 @@ define <16 x float> @test_unaligned_v16f32(<16 x float>* %src) {
define <16 x i32> @test_unaligned_v16i32(<16 x i32>* %src) {
; SSE-LABEL: test_unaligned_v16i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -1689,13 +1668,13 @@ define <16 x i32> @test_unaligned_v16i32(<16 x i32>* %src) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v16i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <16 x i32>, <16 x i32>* %src, align 1, !nontemporal !1
@@ -1704,7 +1683,7 @@ define <16 x i32> @test_unaligned_v16i32(<16 x i32>* %src) {
define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
; SSE-LABEL: test_unaligned_v8f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -1712,13 +1691,13 @@ define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v8f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
@@ -1727,7 +1706,7 @@ define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
define <8 x i64> @test_unaligned_v8i64(<8 x i64>* %src) {
; SSE-LABEL: test_unaligned_v8i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -1735,13 +1714,13 @@ define <8 x i64> @test_unaligned_v8i64(<8 x i64>* %src) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v8i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
; AVX512-LABEL: test_unaligned_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovups (%rdi), %zmm0
; AVX512-NEXT: retq
%1 = load <8 x i64>, <8 x i64>* %src, align 1, !nontemporal !1
@@ -1750,7 +1729,7 @@ define <8 x i64> @test_unaligned_v8i64(<8 x i64>* %src) {
define <32 x i16> @test_unaligned_v32i16(<32 x i16>* %src) {
; SSE-LABEL: test_unaligned_v32i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -1758,24 +1737,24 @@ define <32 x i16> @test_unaligned_v32i16(<32 x i16>* %src) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v32i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
; AVX512F-LABEL: test_unaligned_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovups (%rdi), %ymm0
; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_unaligned_v32i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovups (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_unaligned_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovups (%rdi), %ymm0
; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
; AVX512VL-NEXT: retq
@@ -1785,7 +1764,7 @@ define <32 x i16> @test_unaligned_v32i16(<32 x i16>* %src) {
define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
; SSE-LABEL: test_unaligned_v64i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups (%rdi), %xmm0
; SSE-NEXT: movups 16(%rdi), %xmm1
; SSE-NEXT: movups 32(%rdi), %xmm2
@@ -1793,24 +1772,24 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_unaligned_v64i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vmovups 32(%rdi), %ymm1
; AVX-NEXT: retq
;
; AVX512F-LABEL: test_unaligned_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovups (%rdi), %ymm0
; AVX512F-NEXT: vmovups 32(%rdi), %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_unaligned_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovups (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: test_unaligned_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovups (%rdi), %ymm0
; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
; AVX512VL-NEXT: retq
@@ -1818,4 +1797,119 @@ define <64 x i8> @test_unaligned_v64i8(<64 x i8>* %src) {
ret <64 x i8> %1
}
+define <16 x i32> @test_masked_v16i32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; SSE2-LABEL: test_masked_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: pxor %xmm12, %xmm12
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm0, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm6
+; SSE2-NEXT: movdqa %xmm6, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm11
+; SSE2-NEXT: pxor %xmm0, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm12, %xmm4
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pandn (%rdi), %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pandn 16(%rdi), %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm11
+; SSE2-NEXT: por %xmm5, %xmm11
+; SSE2-NEXT: pandn 32(%rdi), %xmm6
+; SSE2-NEXT: pandn %xmm2, %xmm9
+; SSE2-NEXT: por %xmm6, %xmm9
+; SSE2-NEXT: pandn 48(%rdi), %xmm7
+; SSE2-NEXT: pandn %xmm3, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa %xmm11, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_masked_v16i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm8
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm9, %xmm9
+; SSE41-NEXT: pxor %xmm9, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
+; SSE41-NEXT: pxor %xmm9, %xmm6
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE41-NEXT: pxor %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm9, %xmm0
+; SSE41-NEXT: movntdqa 48(%rdi), %xmm9
+; SSE41-NEXT: movntdqa 32(%rdi), %xmm10
+; SSE41-NEXT: movntdqa 16(%rdi), %xmm11
+; SSE41-NEXT: movntdqa (%rdi), %xmm4
+; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm8
+; SSE41-NEXT: movdqa %xmm5, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm1
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm2
+; SSE41-NEXT: movdqa %xmm7, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm9, %xmm3
+; SSE41-NEXT: movaps %xmm8, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_masked_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm4
+; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1
+; AVX1-NEXT: vmovntdqa (%rdi), %xmm3
+; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_masked_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm4
+; AVX2-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vmovntdqa (%rdi), %ymm3
+; AVX2-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_masked_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpcmpneqd %zmm2, %zmm1, %k1
+; AVX512-NEXT: vmovntdqa (%rdi), %zmm1
+; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: retq
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %r = load <16 x i32>, <16 x i32>* %vaddr, align 64, !nontemporal !1
+ %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+ ret <16 x i32>%res
+}
+
!1 = !{i32 1}
diff --git a/test/CodeGen/X86/nontemporal.ll b/test/CodeGen/X86/nontemporal.ll
index d49c88724331..f53982a85421 100644
--- a/test/CodeGen/X86/nontemporal.ll
+++ b/test/CodeGen/X86/nontemporal.ll
@@ -4,125 +4,186 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX
-define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I) nounwind {
+define i32 @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4 x i32> %F, <8 x i16> %G, <16 x i8> %H, i64 %I, i32* %loadptr) nounwind {
; X32-SSE-LABEL: f:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
+; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: andl $-16, %esp
; X32-SSE-NEXT: subl $16, %esp
-; X32-SSE-NEXT: movl 72(%ebp), %eax
-; X32-SSE-NEXT: movl 76(%ebp), %ecx
-; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
-; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
-; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
+; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; X32-SSE-NEXT: movl 12(%ebp), %eax
+; X32-SSE-NEXT: movdqa 56(%ebp), %xmm4
+; X32-SSE-NEXT: movdqa 40(%ebp), %xmm5
+; X32-SSE-NEXT: movdqa 24(%ebp), %xmm6
; X32-SSE-NEXT: movl 8(%ebp), %edx
+; X32-SSE-NEXT: movl 80(%ebp), %ecx
+; X32-SSE-NEXT: movl (%ecx), %esi
; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movntps %xmm0, (%edx)
; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm2, (%edx)
; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntpd %xmm1, (%edx)
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm5
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm6
+; X32-SSE-NEXT: addl (%ecx), %esi
+; X32-SSE-NEXT: movntdq %xmm6, (%edx)
+; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm5
+; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm5, (%edx)
-; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntdq %xmm4, (%edx)
-; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm3
-; X32-SSE-NEXT: movntdq %xmm3, (%edx)
-; X32-SSE-NEXT: movntil %ecx, 4(%edx)
+; X32-SSE-NEXT: addl (%ecx), %esi
; X32-SSE-NEXT: movntil %eax, (%edx)
-; X32-SSE-NEXT: movl %ebp, %esp
+; X32-SSE-NEXT: movl (%ecx), %eax
+; X32-SSE-NEXT: addl %esi, %eax
+; X32-SSE-NEXT: movsd %xmm3, (%edx)
+; X32-SSE-NEXT: addl (%ecx), %eax
+; X32-SSE-NEXT: leal -4(%ebp), %esp
+; X32-SSE-NEXT: popl %esi
; X32-SSE-NEXT: popl %ebp
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: f:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
+; X32-AVX-NEXT: pushl %esi
; X32-AVX-NEXT: andl $-16, %esp
; X32-AVX-NEXT: subl $16, %esp
-; X32-AVX-NEXT: movl 72(%ebp), %eax
-; X32-AVX-NEXT: movl 76(%ebp), %ecx
-; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
-; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
-; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
-; X32-AVX-NEXT: movl 8(%ebp), %edx
+; X32-AVX-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; X32-AVX-NEXT: movl 12(%ebp), %eax
+; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm4
+; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm5
+; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm6
+; X32-AVX-NEXT: movl 8(%ebp), %ecx
+; X32-AVX-NEXT: movl 80(%ebp), %edx
+; X32-AVX-NEXT: movl (%edx), %esi
; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
+; X32-AVX-NEXT: vmovntps %xmm0, (%ecx)
; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: addl (%edx), %esi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
-; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
-; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm5, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
-; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm4, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
-; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm3, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
-; X32-AVX-NEXT: movntil %ecx, 4(%edx)
-; X32-AVX-NEXT: movntil %eax, (%edx)
-; X32-AVX-NEXT: movl %ebp, %esp
+; X32-AVX-NEXT: addl (%edx), %esi
+; X32-AVX-NEXT: vmovntpd %xmm0, (%ecx)
+; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm6, %xmm0
+; X32-AVX-NEXT: addl (%edx), %esi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
+; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm5, %xmm0
+; X32-AVX-NEXT: addl (%edx), %esi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
+; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm4, %xmm0
+; X32-AVX-NEXT: addl (%edx), %esi
+; X32-AVX-NEXT: vmovntdq %xmm0, (%ecx)
+; X32-AVX-NEXT: addl (%edx), %esi
+; X32-AVX-NEXT: movntil %eax, (%ecx)
+; X32-AVX-NEXT: movl (%edx), %eax
+; X32-AVX-NEXT: addl %esi, %eax
+; X32-AVX-NEXT: vmovsd %xmm3, (%ecx)
+; X32-AVX-NEXT: addl (%edx), %eax
+; X32-AVX-NEXT: leal -4(%ebp), %esp
+; X32-AVX-NEXT: popl %esi
; X32-AVX-NEXT: popl %ebp
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: f:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movl (%rcx), %eax
; X64-SSE-NEXT: addps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: movntps %xmm0, (%rdi)
; X64-SSE-NEXT: paddq {{.*}}(%rip), %xmm2
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm2, (%rdi)
; X64-SSE-NEXT: addpd {{.*}}(%rip), %xmm1
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntpd %xmm1, (%rdi)
; X64-SSE-NEXT: paddd {{.*}}(%rip), %xmm3
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm3, (%rdi)
; X64-SSE-NEXT: paddw {{.*}}(%rip), %xmm4
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm4, (%rdi)
; X64-SSE-NEXT: paddb {{.*}}(%rip), %xmm5
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntdq %xmm5, (%rdi)
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntil %esi, (%rdi)
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: movntiq %rdx, (%rdi)
+; X64-SSE-NEXT: addl (%rcx), %eax
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: f:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl (%rcx), %eax
; X64-AVX-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vmovntps %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
; X64-AVX-NEXT: vaddpd {{.*}}(%rip), %xmm1, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntpd %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %xmm3, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddw {{.*}}(%rip), %xmm4, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
; X64-AVX-NEXT: vpaddb {{.*}}(%rip), %xmm5, %xmm0
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: vmovntdq %xmm0, (%rdi)
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: movntil %esi, (%rdi)
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: movntiq %rdx, (%rdi)
+; X64-AVX-NEXT: addl (%rcx), %eax
; X64-AVX-NEXT: retq
+ %v0 = load i32, i32* %loadptr, align 1
%cast = bitcast i8* %B to <4 x float>*
%A2 = fadd <4 x float> %A, <float 1.0, float 2.0, float 3.0, float 4.0>
store <4 x float> %A2, <4 x float>* %cast, align 16, !nontemporal !0
+ %v1 = load i32, i32* %loadptr, align 1
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, <i64 1, i64 2>
store <2 x i64> %E2, <2 x i64>* %cast1, align 16, !nontemporal !0
+ %v2 = load i32, i32* %loadptr, align 1
%cast2 = bitcast i8* %B to <2 x double>*
%C2 = fadd <2 x double> %C, <double 1.0, double 2.0>
store <2 x double> %C2, <2 x double>* %cast2, align 16, !nontemporal !0
+ %v3 = load i32, i32* %loadptr, align 1
%cast3 = bitcast i8* %B to <4 x i32>*
%F2 = add <4 x i32> %F, <i32 1, i32 2, i32 3, i32 4>
store <4 x i32> %F2, <4 x i32>* %cast3, align 16, !nontemporal !0
+ %v4 = load i32, i32* %loadptr, align 1
%cast4 = bitcast i8* %B to <8 x i16>*
%G2 = add <8 x i16> %G, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
store <8 x i16> %G2, <8 x i16>* %cast4, align 16, !nontemporal !0
+ %v5 = load i32, i32* %loadptr, align 1
%cast5 = bitcast i8* %B to <16 x i8>*
%H2 = add <16 x i8> %H, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
store <16 x i8> %H2, <16 x i8>* %cast5, align 16, !nontemporal !0
+ %v6 = load i32, i32* %loadptr, align 1
%cast6 = bitcast i8* %B to i32*
store i32 %D, i32* %cast6, align 1, !nontemporal !0
+ %v7 = load i32, i32* %loadptr, align 1
%cast7 = bitcast i8* %B to i64*
store i64 %I, i64* %cast7, align 1, !nontemporal !0
- ret void
+ %v8 = load i32, i32* %loadptr, align 1
+ %sum1 = add i32 %v0, %v1
+ %sum2 = add i32 %sum1, %v2
+ %sum3 = add i32 %sum2, %v3
+ %sum4 = add i32 %sum3, %v4
+ %sum5 = add i32 %sum4, %v5
+ %sum6 = add i32 %sum5, %v6
+ %sum7 = add i32 %sum6, %v7
+ %sum8 = add i32 %sum7, %v8
+ ret i32 %sum8
}
!0 = !{i32 1}
diff --git a/test/CodeGen/X86/norex-subreg.ll b/test/CodeGen/X86/norex-subreg.ll
index dd47af9ae9ab..205fb4e00114 100644
--- a/test/CodeGen/X86/norex-subreg.ll
+++ b/test/CodeGen/X86/norex-subreg.ll
@@ -4,10 +4,10 @@ target triple = "x86_64-apple-macosx10.7"
; This test case extracts a sub_8bit_hi sub-register:
;
-; %R8B<def> = COPY %BH, %EBX<imp-use,kill>
-; %ESI<def> = MOVZX32_NOREXrr8 %R8B<kill>
+; %r8b = COPY %bh, implicit killed %ebx
+; %esi = MOVZX32_NOREXrr8 killed %r8b
;
-; The register allocation above is invalid, %BH can only be encoded without an
+; The register allocation above is invalid, %bh can only be encoded without an
; REX prefix, so the destination register must be GR8_NOREX. The code above
; triggers an assertion in copyPhysReg.
;
@@ -41,10 +41,10 @@ entry:
; This test case extracts a sub_8bit_hi sub-register:
;
-; %vreg2<def> = COPY %vreg1:sub_8bit_hi; GR8:%vreg2 GR64_ABCD:%vreg1
-; TEST8ri %vreg2, 1, %EFLAGS<imp-def>; GR8:%vreg2
+; %2 = COPY %1:sub_8bit_hi; GR8:%2 GR64_ABCD:%1
+; TEST8ri %2, 1, implicit-def %eflags; GR8:%2
;
-; %vreg2 must be constrained to GR8_NOREX, or the COPY could become impossible.
+; %2 must be constrained to GR8_NOREX, or the COPY could become impossible.
;
; PR11088
diff --git a/test/CodeGen/X86/nosse-error1.ll b/test/CodeGen/X86/nosse-error1.ll
index 7617d59f4a05..9a2242fde24a 100644
--- a/test/CodeGen/X86/nosse-error1.ll
+++ b/test/CodeGen/X86/nosse-error1.ll
@@ -1,5 +1,5 @@
-; RUN: not llc < %s -march=x86-64 -mattr=-sse 2>&1 | FileCheck --check-prefix NOSSE %s
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: not llc < %s -mattr=-sse 2>&1 | FileCheck --check-prefix NOSSE %s
+; RUN: llc < %s | FileCheck %s
; NOSSE: {{SSE register return with SSE disabled}}
diff --git a/test/CodeGen/X86/nosse-error2.ll b/test/CodeGen/X86/nosse-error2.ll
index 3da80aae686f..b88ddf85e0ef 100644
--- a/test/CodeGen/X86/nosse-error2.ll
+++ b/test/CodeGen/X86/nosse-error2.ll
@@ -1,5 +1,5 @@
-; RUN: not llc < %s -march=x86 -mcpu=i686 -mattr=-sse 2>&1 | FileCheck --check-prefix NOSSE %s
-; RUN: llc < %s -march=x86 -mcpu=i686 -mattr=+sse | FileCheck %s
+; RUN: not llc < %s -mcpu=i686 -mattr=-sse 2>&1 | FileCheck --check-prefix NOSSE %s
+; RUN: llc < %s -mcpu=i686 -mattr=+sse | FileCheck %s
; NOSSE: {{SSE register return with SSE disabled}}
diff --git a/test/CodeGen/X86/nosse-varargs.ll b/test/CodeGen/X86/nosse-varargs.ll
index 8a81d0e71953..5b6da24bba72 100644
--- a/test/CodeGen/X86/nosse-varargs.ll
+++ b/test/CodeGen/X86/nosse-varargs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=NOSSE
-; RUN: llc < %s -march=x86-64 | FileCheck %s -check-prefix=YESSSE
+; RUN: llc < %s -mattr=-sse | FileCheck %s -check-prefix=NOSSE
+; RUN: llc < %s | FileCheck %s -check-prefix=YESSSE
; PR3403
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/nosse-vector.ll b/test/CodeGen/X86/nosse-vector.ll
index 398234a6d03c..ec97b1ed9c00 100644
--- a/test/CodeGen/X86/nosse-vector.ll
+++ b/test/CodeGen/X86/nosse-vector.ll
@@ -4,7 +4,7 @@
define void @fadd_2f64_mem(<2 x double>* %p0, <2 x double>* %p1, <2 x double>* %p2) nounwind {
; X32-LABEL: fadd_2f64_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -18,7 +18,7 @@ define void @fadd_2f64_mem(<2 x double>* %p0, <2 x double>* %p1, <2 x double>* %
; X32-NEXT: retl
;
; X64-LABEL: fadd_2f64_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: fldl 8(%rdi)
; X64-NEXT: fldl (%rdi)
; X64-NEXT: faddl (%rsi)
@@ -36,7 +36,7 @@ define void @fadd_2f64_mem(<2 x double>* %p0, <2 x double>* %p1, <2 x double>* %
define void @fadd_4f32_mem(<4 x float>* %p0, <4 x float>* %p1, <4 x float>* %p2) nounwind {
; X32-LABEL: fadd_4f32_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -59,7 +59,7 @@ define void @fadd_4f32_mem(<4 x float>* %p0, <4 x float>* %p1, <4 x float>* %p2)
; X32-NEXT: retl
;
; X64-LABEL: fadd_4f32_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: flds 12(%rdi)
; X64-NEXT: flds 8(%rdi)
; X64-NEXT: flds 4(%rdi)
@@ -86,7 +86,7 @@ define void @fadd_4f32_mem(<4 x float>* %p0, <4 x float>* %p1, <4 x float>* %p2)
define void @fdiv_4f32_mem(<4 x float>* %p0, <4 x float>* %p1, <4 x float>* %p2) nounwind {
; X32-LABEL: fdiv_4f32_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -109,7 +109,7 @@ define void @fdiv_4f32_mem(<4 x float>* %p0, <4 x float>* %p1, <4 x float>* %p2)
; X32-NEXT: retl
;
; X64-LABEL: fdiv_4f32_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: flds 12(%rdi)
; X64-NEXT: flds 8(%rdi)
; X64-NEXT: flds 4(%rdi)
@@ -136,7 +136,7 @@ define void @fdiv_4f32_mem(<4 x float>* %p0, <4 x float>* %p1, <4 x float>* %p2)
define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind {
; X32-LABEL: sitofp_4i64_4f32_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: pushl %ebx
@@ -182,7 +182,7 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sitofp_4i64_4f32_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq 24(%rdi), %rax
; X64-NEXT: movq 16(%rdi), %rcx
; X64-NEXT: movq (%rdi), %rdx
@@ -208,7 +208,7 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind {
define void @sitofp_4i32_4f32_mem(<4 x i32>* %p0, <4 x float>* %p1) nounwind {
; X32-LABEL: sitofp_4i32_4f32_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: subl $16, %esp
@@ -236,7 +236,7 @@ define void @sitofp_4i32_4f32_mem(<4 x i32>* %p0, <4 x float>* %p1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sitofp_4i32_4f32_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl 12(%rdi), %eax
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl (%rdi), %edx
@@ -262,7 +262,7 @@ define void @sitofp_4i32_4f32_mem(<4 x i32>* %p0, <4 x float>* %p1) nounwind {
define void @add_2i64_mem(<2 x i64>* %p0, <2 x i64>* %p1, <2 x i64>* %p2) nounwind {
; X32-LABEL: add_2i64_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
@@ -287,7 +287,7 @@ define void @add_2i64_mem(<2 x i64>* %p0, <2 x i64>* %p1, <2 x i64>* %p2) nounwi
; X32-NEXT: retl
;
; X64-LABEL: add_2i64_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movq 8(%rdi), %rcx
; X64-NEXT: addq (%rsi), %rax
@@ -304,7 +304,7 @@ define void @add_2i64_mem(<2 x i64>* %p0, <2 x i64>* %p1, <2 x i64>* %p2) nounwi
define void @add_4i32_mem(<4 x i32>* %p0, <4 x i32>* %p1, <4 x i32>* %p2) nounwind {
; X32-LABEL: add_4i32_mem:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
@@ -329,7 +329,7 @@ define void @add_4i32_mem(<4 x i32>* %p0, <4 x i32>* %p1, <4 x i32>* %p2) nounwi
; X32-NEXT: retl
;
; X64-LABEL: add_4i32_mem:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl 12(%rdi), %eax
; X64-NEXT: movl 8(%rdi), %ecx
; X64-NEXT: movl (%rdi), %r8d
diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll
index 87aa10a6e296..e753aeb16d58 100644
--- a/test/CodeGen/X86/not-and-simplify.ll
+++ b/test/CodeGen/X86/not-and-simplify.ll
@@ -6,7 +6,7 @@
define i32 @shrink_xor_constant1(i32 %x) {
; ALL-LABEL: shrink_xor_constant1:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: shrl $31, %edi
; ALL-NEXT: xorl $1, %edi
; ALL-NEXT: movl %edi, %eax
@@ -19,7 +19,7 @@ define i32 @shrink_xor_constant1(i32 %x) {
define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) {
; ALL-LABEL: shrink_xor_constant1_splat:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: psrld $31, %xmm0
; ALL-NEXT: pandn {{.*}}(%rip), %xmm0
; ALL-NEXT: retq
@@ -33,7 +33,7 @@ define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) {
define i8 @shrink_xor_constant2(i8 %x) {
; ALL-LABEL: shrink_xor_constant2:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: shlb $5, %dil
; ALL-NEXT: xorb $-32, %dil
; ALL-NEXT: movl %edi, %eax
@@ -46,10 +46,8 @@ define i8 @shrink_xor_constant2(i8 %x) {
define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) {
; ALL-LABEL: shrink_xor_constant2_splat:
-; ALL: # BB#0:
-; ALL-NEXT: psllw $5, %xmm0
-; ALL-NEXT: pand {{.*}}(%rip), %xmm0
-; ALL-NEXT: pandn {{.*}}(%rip), %xmm0
+; ALL: # %bb.0:
+; ALL-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; ALL-NEXT: retq
%sh = shl <16 x i8> %x, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
%not = xor <16 x i8> %sh, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
diff --git a/test/CodeGen/X86/null-streamer.ll b/test/CodeGen/X86/null-streamer.ll
index 3571d37ed423..8528be3ca593 100644
--- a/test/CodeGen/X86/null-streamer.ll
+++ b/test/CodeGen/X86/null-streamer.ll
@@ -1,6 +1,6 @@
; Check the MCNullStreamer operates correctly, at least on a minimal test case.
;
-; RUN: llc -filetype=null -o %t -march=x86 %s
+; RUN: llc -filetype=null -o %t -mtriple=i686-- %s
; RUN: llc -filetype=null -o %t -mtriple=i686-cygwin %s
source_filename = "test/CodeGen/X86/null-streamer.ll"
@@ -20,7 +20,7 @@ define void @f1() {
!1 = !DIFile(filename: "file.c", directory: "")
!2 = !{}
!3 = !{!4}
-!4 = !DIGlobalVariableExpression(var: !5)
+!4 = !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
!5 = !DIGlobalVariable(name: "i", linkageName: "_ZL1i", scope: null, file: !1, line: 1, type: !6, isLocal: true, isDefinition: true)
!6 = !DIBasicType(size: 32, align: 32, encoding: DW_ATE_signed)
!7 = !{i32 2, !"Dwarf Version", i32 3}
diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll
index 9f42af4aea95..b795e0fda8b9 100644
--- a/test/CodeGen/X86/object-size.ll
+++ b/test/CodeGen/X86/object-size.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 < %s -march=x86-64 | FileCheck %s
+; RUN: llc -O0 < %s | FileCheck %s
; ModuleID = 'ts.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 0bda41a30c69..df97973aecbd 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -3,10 +3,11 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP
define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
; SSE2-LABEL: v3i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: movq %xmm2, 16(%rdi)
@@ -14,56 +15,70 @@ define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v3i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pextrq $1, %xmm0, 16(%rdi)
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: movdqa %xmm0, (%rdi)
; SSE42-NEXT: retq
;
; AVX1-LABEL: v3i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpextrq $1, %xmm0, 16(%rdi)
-; AVX1-NEXT: vmovapd %xmm1, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v3i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpextrq $1, %xmm0, 16(%rdi)
; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v3i64:
+; XOP: # %bb.0:
+; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; XOP-NEXT: vpextrq $1, %xmm0, 16(%rdi)
+; XOP-NEXT: vmovdqa %xmm1, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <2 x i64> %a, <2 x i64> %b, <3 x i32> <i32 0, i32 2, i32 1>
store <3 x i64> %r, <3 x i64>* %p
ret void
}
define void @v3f64(<2 x double> %a, <2 x double> %b, <3 x double>* %p) nounwind {
; SSE-LABEL: v3f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhpd %xmm0, 16(%rdi)
; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movapd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: v3f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; AVX1-NEXT: vmovhpd %xmm0, 16(%rdi)
; AVX1-NEXT: vmovapd %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v3f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vmovhpd %xmm0, 16(%rdi)
; AVX2-NEXT: vmovapd %xmm1, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v3f64:
+; XOP: # %bb.0:
+; XOP-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; XOP-NEXT: vmovhpd %xmm0, 16(%rdi)
+; XOP-NEXT: vmovapd %xmm1, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <2 x double> %a, <2 x double> %b, <3 x i32> <i32 0, i32 2, i32 1>
store <3 x double> %r, <3 x double>* %p
ret void
@@ -71,7 +86,7 @@ define void @v3f64(<2 x double> %a, <2 x double> %b, <3 x double>* %p) nounwind
define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE2-LABEL: v3i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -80,7 +95,7 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v3i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE42-NEXT: pextrd $2, %xmm0, 8(%rdi)
@@ -88,7 +103,7 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: v3i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi)
@@ -96,12 +111,20 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v3i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi)
-; AVX2-NEXT: vmovq %xmm1, (%rdi)
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vextractps $2, %xmm0, 8(%rdi)
+; AVX2-NEXT: vmovlps %xmm1, (%rdi)
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v3i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
+; XOP-NEXT: vmovq %xmm1, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <2 x i32> %a, <2 x i32> %b, <3 x i32> <i32 0, i32 2, i32 1>
store <3 x i32> %r, <3 x i32>* %p
ret void
@@ -109,7 +132,7 @@ define void @v3i32(<2 x i32> %a, <2 x i32> %b, <3 x i32>* %p) nounwind {
define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE2-LABEL: v5i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
@@ -121,7 +144,7 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
@@ -132,7 +155,7 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX-LABEL: v5i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
@@ -141,6 +164,13 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
; AVX-NEXT: vpextrw $6, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm1, (%rdi)
; AVX-NEXT: retq
+;
+; XOP-LABEL: v5i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[4,5],xmm0[4,5],xmm1[8,9],xmm0[12,13],xmm1[4,5],xmm0[14,15],xmm1[6,7]
+; XOP-NEXT: vpextrw $6, %xmm0, 8(%rdi)
+; XOP-NEXT: vmovq %xmm1, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
store <5 x i16> %r, <5 x i16>* %p
ret void
@@ -148,7 +178,7 @@ define void @v5i16(<4 x i16> %a, <4 x i16> %b, <5 x i16>* %p) nounwind {
define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
; SSE2-LABEL: v5i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -157,7 +187,7 @@ define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,2]
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
@@ -166,23 +196,31 @@ define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: v5i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; AVX1-NEXT: vpextrd $3, %xmm0, 16(%rdi)
+; AVX1-NEXT: vextractps $3, %xmm0, 16(%rdi)
; AVX1-NEXT: vmovaps %xmm1, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: v5i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,1,6,3,u,u,u>
-; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpextrd $3, %xmm0, 16(%rdi)
-; AVX2-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,5,1,6,3,u,u,u>
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vextractps $3, %xmm0, 16(%rdi)
+; AVX2-NEXT: vmovaps %xmm1, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v5i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
+; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; XOP-NEXT: vextractps $3, %xmm0, 16(%rdi)
+; XOP-NEXT: vmovaps %xmm1, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <4 x i32> %a, <4 x i32> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
store <5 x i32> %r, <5 x i32>* %p
ret void
@@ -190,7 +228,7 @@ define void @v5i32(<4 x i32> %a, <4 x i32> %b, <5 x i32>* %p) nounwind {
define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; SSE2-LABEL: v5f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
@@ -200,7 +238,7 @@ define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v5f32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: extractps $3, %xmm0, 16(%rdi)
; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,2]
; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
@@ -208,7 +246,7 @@ define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: v5f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; AVX1-NEXT: vextractps $3, %xmm0, 16(%rdi)
@@ -216,8 +254,8 @@ define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v5f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,5,1,6,3,u,u,u>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
@@ -225,6 +263,14 @@ define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
; AVX2-NEXT: vmovaps %xmm1, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v5f32:
+; XOP: # %bb.0:
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2]
+; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; XOP-NEXT: vextractps $3, %xmm0, 16(%rdi)
+; XOP-NEXT: vmovaps %xmm1, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <4 x float> %a, <4 x float> %b, <5 x i32> <i32 0, i32 5, i32 1, i32 6, i32 3>
store <5 x float> %r, <5 x float>* %p
ret void
@@ -232,7 +278,7 @@ define void @v5f32(<4 x float> %a, <4 x float> %b, <5 x float>* %p) nounwind {
define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE2-LABEL: v7i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
@@ -253,7 +299,7 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
@@ -264,7 +310,7 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
@@ -273,6 +319,14 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi)
; AVX-NEXT: vmovd %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; XOP-LABEL: v7i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[8],xmm0[12],xmm1[8],xmm0[4],xmm1[12,0,14,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpextrb $0, %xmm1, 6(%rdi)
+; XOP-NEXT: vpextrw $2, %xmm0, 4(%rdi)
+; XOP-NEXT: vmovd %xmm0, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
store <7 x i8> %r, <7 x i8>* %p
ret void
@@ -280,7 +334,7 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-LABEL: v7i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
@@ -297,7 +351,7 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; SSE42-NEXT: pextrw $0, %xmm1, 12(%rdi)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
@@ -307,7 +361,7 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX-LABEL: v7i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
@@ -315,6 +369,14 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; XOP-LABEL: v7i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],xmm1[8,9],xmm0[12,13],xmm1[8,9],xmm0[4,5],xmm1[12,13,0,1,14,15]
+; XOP-NEXT: vpextrw $0, %xmm1, 12(%rdi)
+; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
+; XOP-NEXT: vmovq %xmm0, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
store <7 x i16> %r, <7 x i16>* %p
ret void
@@ -323,7 +385,7 @@ define void @v7i16(<4 x i16> %a, <4 x i16> %b, <7 x i16>* %p) nounwind {
define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; SSE2-LABEL: v7i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,2]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -335,7 +397,7 @@ define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v7i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
@@ -347,7 +409,7 @@ define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: v7i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
@@ -358,17 +420,28 @@ define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v7i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovd %xmm1, 24(%rdi)
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovq %xmm1, 16(%rdi)
-; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,6,3,6,1,7,4,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vmovss %xmm1, 24(%rdi)
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovlps %xmm1, 16(%rdi)
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v7i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2]
+; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,0,3]
+; XOP-NEXT: vmovss %xmm1, 24(%rdi)
+; XOP-NEXT: vmovlps %xmm0, 16(%rdi)
+; XOP-NEXT: vmovaps %xmm2, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <4 x i32> %a, <4 x i32> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
store <7 x i32> %r, <7 x i32>* %p
ret void
@@ -376,7 +449,7 @@ define void @v7i32(<4 x i32> %a, <4 x i32> %b, <7 x i32>* %p) nounwind {
define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-LABEL: v12i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
@@ -398,7 +471,7 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; SSE42-NEXT: por %xmm1, %xmm0
@@ -407,13 +480,22 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX-LABEL: v12i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; XOP-LABEL: v12i8:
+; XOP: # %bb.0:
+; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[0],zero,zero,xmm1[2],zero,zero,xmm1[4],zero,zero,xmm1[6,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,xmm0[2,10],zero,xmm0[4,12],zero,xmm0[6,14],zero,xmm0[u,u,u,u]
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
+; XOP-NEXT: vmovq %xmm0, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <8 x i8> %a, <8 x i8> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
store <12 x i8> %r, <12 x i8>* %p
ret void
@@ -421,7 +503,7 @@ define void @v12i8(<8 x i8> %a, <8 x i8> %b, <12 x i8>* %p) nounwind {
define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE2-LABEL: v12i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,0,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,4,7]
@@ -443,7 +525,7 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
@@ -456,7 +538,7 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: v12i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
@@ -469,7 +551,7 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: v12i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
@@ -480,6 +562,14 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
; AVX2-NEXT: vmovq %xmm2, 16(%rdi)
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v12i16:
+; XOP: # %bb.0:
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,8,9],xmm1[0,1],xmm0[2,3,10,11],xmm1[2,3],xmm0[4,5,12,13]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[4,5],xmm0[6,7,14,15],xmm1[6,7],xmm0[8,9,10,11,12,13,14,15]
+; XOP-NEXT: vmovq %xmm0, 16(%rdi)
+; XOP-NEXT: vmovdqa %xmm2, (%rdi)
+; XOP-NEXT: retq
%r = shufflevector <8 x i16> %a, <8 x i16> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
store <12 x i16> %r, <12 x i16>* %p
ret void
@@ -487,7 +577,7 @@ define void @v12i16(<8 x i16> %a, <8 x i16> %b, <12 x i16>* %p) nounwind {
define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE2-LABEL: v12i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2]
@@ -510,7 +600,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: v12i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,1]
; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
@@ -530,7 +620,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: v12i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6]
; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6]
@@ -542,25 +632,41 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX1-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX1-NEXT: vmovapd %xmm0, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: v12i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
-; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovdqa %ymm0, (%rdi)
-; AVX2-NEXT: vmovdqa %xmm2, 32(%rdi)
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6>
+; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vmovaps %ymm0, (%rdi)
+; AVX2-NEXT: vmovaps %xmm2, 32(%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: v12i32:
+; XOP: # %bb.0:
+; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6]
+; XOP-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[3,3]
+; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,1]
+; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
+; XOP-NEXT: vmovapd %xmm0, 32(%rdi)
+; XOP-NEXT: vmovaps %ymm2, (%rdi)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
%r = shufflevector <8 x i32> %a, <8 x i32> %b, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
store <12 x i32> %r, <12 x i32>* %p
ret void
@@ -568,7 +674,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind {
define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounwind {
; SSE2-LABEL: pr29025:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm0
@@ -598,7 +704,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
; SSE2-NEXT: retq
;
; SSE42-LABEL: pr29025:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: pshufb %xmm3, %xmm0
@@ -611,7 +717,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
; SSE42-NEXT: retq
;
; AVX-LABEL: pr29025:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -622,6 +728,14 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
; AVX-NEXT: vpextrd $2, %xmm0, 8(%rdi)
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; XOP-LABEL: pr29025:
+; XOP: # %bb.0:
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4,8,12],xmm1[0,4,8,12],xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,4],xmm2[0],xmm0[1,5],xmm2[4],xmm0[2,6],xmm2[8],xmm0[3,7],xmm2[12],xmm0[u,u,u,u]
+; XOP-NEXT: vpextrd $2, %xmm0, 8(%rdi)
+; XOP-NEXT: vmovq %xmm0, (%rdi)
+; XOP-NEXT: retq
%s1 = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s2 = shufflevector <4 x i8> %c, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%r = shufflevector <8 x i8> %s1, <8 x i8> %s2, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -631,7 +745,7 @@ define void @pr29025(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <12 x i8> *%p) nounw
define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
; SSE2-LABEL: interleave_24i8_out:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
@@ -694,7 +808,7 @@ define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i8_out:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rdi), %xmm0
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE42-NEXT: movdqa %xmm1, %xmm2
@@ -716,7 +830,7 @@ define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE42-NEXT: retq
;
; AVX-LABEL: interleave_24i8_out:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
@@ -732,6 +846,24 @@ define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: retq
+;
+; XOP-LABEL: interleave_24i8_out:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovdqu (%rdi), %xmm0
+; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vmovq %xmm2, (%rsi)
+; XOP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; XOP-NEXT: vmovq %xmm2, (%rdx)
+; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vmovq %xmm0, (%rcx)
+; XOP-NEXT: retq
%wide.vec = load <24 x i8>, <24 x i8>* %p, align 4
%s1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%s2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -744,7 +876,7 @@ define void @interleave_24i8_out(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 x i8>* %q3) nounwind {
; SSE2-LABEL: interleave_24i8_in:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -788,7 +920,7 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i8_in:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -806,7 +938,7 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; SSE42-NEXT: retq
;
; AVX-LABEL: interleave_24i8_in:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -820,6 +952,22 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
; AVX-NEXT: vmovdqu %xmm2, (%rdi)
; AVX-NEXT: retq
+;
+; XOP-LABEL: interleave_24i8_in:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; XOP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; XOP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; XOP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; XOP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
+; XOP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; XOP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; XOP-NEXT: vmovq %xmm0, 16(%rdi)
+; XOP-NEXT: vmovdqu %xmm2, (%rdi)
+; XOP-NEXT: retq
%s1 = load <8 x i8>, <8 x i8>* %q1, align 4
%s2 = load <8 x i8>, <8 x i8>* %q2, align 4
%s3 = load <8 x i8>, <8 x i8>* %q3, align 4
@@ -833,7 +981,7 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8
define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind {
; SSE2-LABEL: interleave_24i16_out:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm3
; SSE2-NEXT: movdqu 16(%rdi), %xmm2
; SSE2-NEXT: movdqu 32(%rdi), %xmm8
@@ -889,7 +1037,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i16_out:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rdi), %xmm0
; SSE42-NEXT: movdqu 16(%rdi), %xmm1
; SSE42-NEXT: movdqu 32(%rdi), %xmm2
@@ -915,7 +1063,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i16_out:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu 32(%rdi), %xmm0
; AVX1-NEXT: vmovdqu (%rdi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -939,7 +1087,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i16_out:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
@@ -959,6 +1107,23 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
; AVX2-NEXT: vmovdqu %xmm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: interleave_24i16_out:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovdqu 32(%rdi), %xmm0
+; XOP-NEXT: vmovdqu (%rdi), %ymm1
+; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm0[4,5,10,11]
+; XOP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm0[0,1,6,7,12,13]
+; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[4,5,10,11],xmm2[0,1,6,7,12,13,14,15,0,1,2,3]
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6,7,8,9],xmm0[2,3,8,9,14,15]
+; XOP-NEXT: vmovdqu %xmm3, (%rsi)
+; XOP-NEXT: vmovdqu %xmm4, (%rdx)
+; XOP-NEXT: vmovdqu %xmm0, (%rcx)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
%wide.vec = load <24 x i16>, <24 x i16>* %p, align 4
%s1 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%s2 = shufflevector <24 x i16> %wide.vec, <24 x i16> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -971,7 +1136,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2, <8 x i16>* %q3) nounwind {
; SSE2-LABEL: interleave_24i16_in:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rsi), %xmm3
; SSE2-NEXT: movdqu (%rdx), %xmm2
; SSE2-NEXT: movdqu (%rcx), %xmm1
@@ -1011,7 +1176,7 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i16_in:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rsi), %xmm0
; SSE42-NEXT: movdqu (%rdx), %xmm1
; SSE42-NEXT: movdqu (%rcx), %xmm2
@@ -1035,7 +1200,7 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i16_in:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rsi), %xmm0
; AVX1-NEXT: vmovdqu (%rdx), %xmm1
; AVX1-NEXT: vmovdqu (%rcx), %xmm2
@@ -1060,13 +1225,13 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i16_in:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rsi), %xmm0
; AVX2-NEXT: vmovdqu (%rdx), %xmm1
; AVX2-NEXT: vmovdqu (%rcx), %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[0,1,2,3,6,7,2,3,8,9,8,9,4,5,6,7,16,17,18,19,22,23,18,19,24,25,24,25,20,21,22,23]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,0,0,u,1,1,u,2>
@@ -1081,6 +1246,25 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: interleave_24i16_in:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovdqu (%rsi), %xmm0
+; XOP-NEXT: vmovdqu (%rdx), %xmm1
+; XOP-NEXT: vmovdqu (%rcx), %xmm2
+; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm0[4,5,6,7],xmm1[6,7],xmm0[6,7,8,9],xmm1[8,9],xmm0[8,9,10,11]
+; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,2,2]
+; XOP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7]
+; XOP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; XOP-NEXT: vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[10,11],xmm0[12,13,12,13],xmm1[12,13,12,13],xmm0[14,15],xmm1[14,15],xmm0[14,15]
+; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
+; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; XOP-NEXT: vmovdqu %xmm0, 32(%rdi)
+; XOP-NEXT: vmovups %ymm3, (%rdi)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
%s1 = load <8 x i16>, <8 x i16>* %q1, align 4
%s2 = load <8 x i16>, <8 x i16>* %q2, align 4
%s3 = load <8 x i16>, <8 x i16>* %q3, align 4
@@ -1093,7 +1277,7 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movups 80(%rdi), %xmm5
; SSE2-NEXT: movups 64(%rdi), %xmm8
; SSE2-NEXT: movups (%rdi), %xmm0
@@ -1137,7 +1321,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqu 80(%rdi), %xmm9
; SSE42-NEXT: movdqu 64(%rdi), %xmm10
; SSE42-NEXT: movdqu (%rdi), %xmm4
@@ -1177,7 +1361,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_out:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: vmovups 64(%rdi), %ymm2
@@ -1217,33 +1401,73 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i32_out:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
-; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
-; AVX2-NEXT: vpermd %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
-; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
-; AVX2-NEXT: vpermd %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT: vmovdqu %ymm3, (%rsi)
-; AVX2-NEXT: vmovdqu %ymm4, (%rdx)
-; AVX2-NEXT: vmovdqu %ymm0, (%rcx)
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rdi), %ymm0
+; AVX2-NEXT: vmovups 32(%rdi), %ymm1
+; AVX2-NEXT: vmovups 64(%rdi), %ymm2
+; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,u,u,u,u,2,5>
+; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u>
+; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = <u,u,u,u,u,0,3,6>
+; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u>
+; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u>
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-NEXT: vmovups %ymm3, (%rsi)
+; AVX2-NEXT: vmovups %ymm4, (%rdx)
+; AVX2-NEXT: vmovups %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: interleave_24i32_out:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovups (%rdi), %ymm0
+; XOP-NEXT: vmovups 32(%rdi), %ymm1
+; XOP-NEXT: vmovups 64(%rdi), %ymm2
+; XOP-NEXT: vextractf128 $1, %ymm2, %xmm3
+; XOP-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; XOP-NEXT: vextractf128 $1, %ymm5, %xmm6
+; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,1]
+; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3]
+; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; XOP-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3]
+; XOP-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2]
+; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; XOP-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; XOP-NEXT: vextractf128 $1, %ymm6, %xmm7
+; XOP-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,3,2]
+; XOP-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3]
+; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; XOP-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; XOP-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; XOP-NEXT: vmovupd %ymm4, (%rsi)
+; XOP-NEXT: vmovups %ymm5, (%rdx)
+; XOP-NEXT: vmovups %ymm0, (%rcx)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
%wide.vec = load <24 x i32>, <24 x i32>* %p, align 4
%s1 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%s2 = shufflevector <24 x i32> %wide.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -1256,7 +1480,7 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2
define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_in:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rsi), %xmm5
; SSE2-NEXT: movdqu 16(%rsi), %xmm2
; SSE2-NEXT: movdqu (%rdx), %xmm6
@@ -1304,7 +1528,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_in:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqu (%rsi), %xmm5
; SSE42-NEXT: movdqu 16(%rsi), %xmm2
; SSE42-NEXT: movdqu (%rdx), %xmm6
@@ -1346,7 +1570,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; SSE42-NEXT: retq
;
; AVX1-LABEL: interleave_24i32_in:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rsi), %ymm0
; AVX1-NEXT: vmovups (%rdx), %ymm1
; AVX1-NEXT: vmovupd (%rcx), %ymm2
@@ -1373,39 +1597,71 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm4, 64(%rdi)
+; AVX1-NEXT: vmovupd %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovupd %ymm4, 64(%rdi)
; AVX1-NEXT: vmovups %ymm3, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave_24i32_in:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqu (%rsi), %ymm0
-; AVX2-NEXT: vmovdqu (%rdx), %ymm1
-; AVX2-NEXT: vmovdqu (%rcx), %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,0,2,2]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1,0,1]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX2-NEXT: vpbroadcastq %xmm2, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,1,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-NEXT: vpbroadcastq 24(%rsi), %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,2,2]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi)
-; AVX2-NEXT: vmovdqu %ymm4, 64(%rdi)
-; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovups (%rsi), %ymm0
+; AVX2-NEXT: vmovups (%rdx), %ymm1
+; AVX2-NEXT: vmovups (%rcx), %ymm2
+; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
+; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vmovups %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovups %ymm4, 64(%rdi)
+; AVX2-NEXT: vmovups %ymm3, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: interleave_24i32_in:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovups (%rsi), %ymm0
+; XOP-NEXT: vmovups (%rdx), %ymm1
+; XOP-NEXT: vmovupd (%rcx), %ymm2
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1]
+; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; XOP-NEXT: vextractf128 $1, %ymm2, %xmm4
+; XOP-NEXT: vextractf128 $1, %ymm1, %xmm5
+; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2]
+; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3]
+; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
+; XOP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm2[2],ymm0[3],ymm2[2,3],ymm0[4],ymm2[5,4],ymm0[5]
+; XOP-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; XOP-NEXT: vmovups %ymm0, 32(%rdi)
+; XOP-NEXT: vmovupd %ymm4, 64(%rdi)
+; XOP-NEXT: vmovups %ymm3, (%rdi)
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
%s1 = load <8 x i32>, <8 x i32>* %q1, align 4
%s2 = load <8 x i32>, <8 x i32>* %q2, align 4
%s3 = load <8 x i32>, <8 x i32>* %q3, align 4
@@ -1418,7 +1674,7 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2,
define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
; SSE2-LABEL: wrongorder:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: movaps %xmm0, 48(%rdi)
; SSE2-NEXT: movaps %xmm0, 32(%rdi)
@@ -1427,7 +1683,7 @@ define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
; SSE2-NEXT: retq
;
; SSE42-LABEL: wrongorder:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE42-NEXT: movapd %xmm0, 48(%rdi)
; SSE42-NEXT: movapd %xmm0, 32(%rdi)
@@ -1436,23 +1692,33 @@ define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
; SSE42-NEXT: retq
;
; AVX1-LABEL: wrongorder:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm1, 32(%rdi)
; AVX1-NEXT: vmovaps %ymm1, (%rdi)
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: wrongorder:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm1
; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
; AVX2-NEXT: vmovapd %ymm1, (%rdi)
; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; XOP-LABEL: wrongorder:
+; XOP: # %bb.0:
+; XOP-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; XOP-NEXT: vmovaps %ymm1, 32(%rdi)
+; XOP-NEXT: vmovaps %ymm1, (%rdi)
+; XOP-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; XOP-NEXT: vzeroupper
+; XOP-NEXT: retq
%shuffle = shufflevector <4 x double> %A, <4 x double> %A, <8 x i32> zeroinitializer
store <8 x double> %shuffle, <8 x double>* %P, align 64
%m2 = load <8 x double>, <8 x double>* %P, align 64
diff --git a/test/CodeGen/X86/opt-ext-uses.ll b/test/CodeGen/X86/opt-ext-uses.ll
index b654a81c11cd..49429b258a29 100644
--- a/test/CodeGen/X86/opt-ext-uses.ll
+++ b/test/CodeGen/X86/opt-ext-uses.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; This test should get one and only one register to register mov.
; CHECK-LABEL: t:
diff --git a/test/CodeGen/X86/optimize-max-0.ll b/test/CodeGen/X86/optimize-max-0.ll
index 006592aaade2..2dde95738d1f 100644
--- a/test/CodeGen/X86/optimize-max-0.ll
+++ b/test/CodeGen/X86/optimize-max-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep cmov
+; RUN: llc < %s | not grep cmov
; LSR should be able to eliminate the max computations by
; making the loops use slt/ult comparisons instead of ne comparisons.
diff --git a/test/CodeGen/X86/optimize-max-1.ll b/test/CodeGen/X86/optimize-max-1.ll
index 08cb86ab3989..aa560c4ecadb 100644
--- a/test/CodeGen/X86/optimize-max-1.ll
+++ b/test/CodeGen/X86/optimize-max-1.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define void @fs(double* nocapture %p, i64 %n) nounwind {
; CHECK-LABEL: fs:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %bb
@@ -17,7 +17,7 @@ define void @fs(double* nocapture %p, i64 %n) nounwind {
; CHECK-NEXT: incq %rax
; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: jl .LBB0_1
-; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: # %bb.2: # %return
; CHECK-NEXT: retq
entry:
%tmp = icmp slt i64 %n, 1 ; <i1> [#uses=1]
@@ -38,7 +38,7 @@ return: ; preds = %bb
define void @bs(double* nocapture %p, i64 %n) nounwind {
; CHECK-LABEL: bs:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_1: # %bb
@@ -47,7 +47,7 @@ define void @bs(double* nocapture %p, i64 %n) nounwind {
; CHECK-NEXT: incq %rax
; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: jl .LBB1_1
-; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: # %bb.2: # %return
; CHECK-NEXT: retq
entry:
%tmp = icmp sge i64 %n, 1 ; <i1> [#uses=1]
@@ -68,7 +68,7 @@ return: ; preds = %bb
define void @fu(double* nocapture %p, i64 %n) nounwind {
; CHECK-LABEL: fu:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB2_1: # %bb
@@ -77,7 +77,7 @@ define void @fu(double* nocapture %p, i64 %n) nounwind {
; CHECK-NEXT: incq %rax
; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: jb .LBB2_1
-; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: # %bb.2: # %return
; CHECK-NEXT: retq
entry:
%tmp = icmp eq i64 %n, 0 ; <i1> [#uses=1]
@@ -98,7 +98,7 @@ return: ; preds = %bb
define void @bu(double* nocapture %p, i64 %n) nounwind {
; CHECK-LABEL: bu:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB3_1: # %bb
@@ -107,7 +107,7 @@ define void @bu(double* nocapture %p, i64 %n) nounwind {
; CHECK-NEXT: incq %rax
; CHECK-NEXT: cmpq %rsi, %rax
; CHECK-NEXT: jb .LBB3_1
-; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: # %bb.2: # %return
; CHECK-NEXT: retq
entry:
%tmp = icmp ne i64 %n, 0 ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/optimize-max-2.ll b/test/CodeGen/X86/optimize-max-2.ll
index 37d2a20975a0..04e17f066ba1 100644
--- a/test/CodeGen/X86/optimize-max-2.ll
+++ b/test/CodeGen/X86/optimize-max-2.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define void @foo(double* nocapture %p, i64 %x, i64 %y) nounwind {
; CHECK-LABEL: foo:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testq %rdx, %rdx
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: cmovneq %rdx, %rax
@@ -23,7 +23,7 @@ define void @foo(double* nocapture %p, i64 %x, i64 %y) nounwind {
; CHECK-NEXT: addq $8, %rdi
; CHECK-NEXT: decq %rax
; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # BB#2: # %return
+; CHECK-NEXT: # %bb.2: # %return
; CHECK-NEXT: retq
entry:
%tmp = icmp eq i64 %y, 0 ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll
index 71d7746642e9..276258a3d404 100644
--- a/test/CodeGen/X86/or-branch.ll
+++ b/test/CodeGen/X86/or-branch.ll
@@ -4,20 +4,20 @@
define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind {
; JUMP2-LABEL: foo:
-; JUMP2: # BB#0: # %entry
+; JUMP2: # %bb.0: # %entry
; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp)
; JUMP2-NEXT: jl .LBB0_3
-; JUMP2-NEXT: # BB#1: # %entry
+; JUMP2-NEXT: # %bb.1: # %entry
; JUMP2-NEXT: movl {{[0-9]+}}(%esp), %eax
; JUMP2-NEXT: testl %eax, %eax
; JUMP2-NEXT: je .LBB0_3
-; JUMP2-NEXT: # BB#2: # %UnifiedReturnBlock
+; JUMP2-NEXT: # %bb.2: # %UnifiedReturnBlock
; JUMP2-NEXT: retl
; JUMP2-NEXT: .LBB0_3: # %cond_true
; JUMP2-NEXT: jmp bar # TAILCALL
;
; JUMP1-LABEL: foo:
-; JUMP1: # BB#0: # %entry
+; JUMP1: # %bb.0: # %entry
; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; JUMP1-NEXT: sete %al
; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp)
@@ -25,7 +25,7 @@ define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind {
; JUMP1-NEXT: orb %al, %cl
; JUMP1-NEXT: cmpb $1, %cl
; JUMP1-NEXT: jne .LBB0_1
-; JUMP1-NEXT: # BB#2: # %cond_true
+; JUMP1-NEXT: # %bb.2: # %cond_true
; JUMP1-NEXT: jmp bar # TAILCALL
; JUMP1-NEXT: .LBB0_1: # %UnifiedReturnBlock
; JUMP1-NEXT: retl
@@ -48,7 +48,7 @@ UnifiedReturnBlock:
define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind {
; JUMP2-LABEL: unpredictable:
-; JUMP2: # BB#0: # %entry
+; JUMP2: # %bb.0: # %entry
; JUMP2-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; JUMP2-NEXT: sete %al
; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp)
@@ -56,13 +56,13 @@ define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind {
; JUMP2-NEXT: orb %al, %cl
; JUMP2-NEXT: cmpb $1, %cl
; JUMP2-NEXT: jne .LBB1_1
-; JUMP2-NEXT: # BB#2: # %cond_true
+; JUMP2-NEXT: # %bb.2: # %cond_true
; JUMP2-NEXT: jmp bar # TAILCALL
; JUMP2-NEXT: .LBB1_1: # %UnifiedReturnBlock
; JUMP2-NEXT: retl
;
; JUMP1-LABEL: unpredictable:
-; JUMP1: # BB#0: # %entry
+; JUMP1: # %bb.0: # %entry
; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; JUMP1-NEXT: sete %al
; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp)
@@ -70,7 +70,7 @@ define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind {
; JUMP1-NEXT: orb %al, %cl
; JUMP1-NEXT: cmpb $1, %cl
; JUMP1-NEXT: jne .LBB1_1
-; JUMP1-NEXT: # BB#2: # %cond_true
+; JUMP1-NEXT: # %bb.2: # %cond_true
; JUMP1-NEXT: jmp bar # TAILCALL
; JUMP1-NEXT: .LBB1_1: # %UnifiedReturnBlock
; JUMP1-NEXT: retl
diff --git a/test/CodeGen/X86/or-lea.ll b/test/CodeGen/X86/or-lea.ll
index e65056a91c43..9447ceb3c4f5 100644
--- a/test/CodeGen/X86/or-lea.ll
+++ b/test/CodeGen/X86/or-lea.ll
@@ -8,9 +8,9 @@
define i32 @or_shift1_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq
@@ -23,9 +23,9 @@ define i32 @or_shift1_and1(i32 %x, i32 %y) {
define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift1_and1_swapped:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,2), %eax
; CHECK-NEXT: retq
@@ -38,9 +38,9 @@ define i32 @or_shift1_and1_swapped(i32 %x, i32 %y) {
define i32 @or_shift2_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift2_and1:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,4), %eax
; CHECK-NEXT: retq
@@ -53,9 +53,9 @@ define i32 @or_shift2_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and1:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq
@@ -68,9 +68,9 @@ define i32 @or_shift3_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and7(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and7:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $7, %esi
; CHECK-NEXT: leal (%rsi,%rdi,8), %eax
; CHECK-NEXT: retq
@@ -85,9 +85,9 @@ define i32 @or_shift3_and7(i32 %x, i32 %y) {
define i32 @or_shift4_and1(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift4_and1:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %esi killed %esi def %rsi
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: shll $4, %edi
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leal (%rsi,%rdi), %eax
@@ -103,8 +103,8 @@ define i32 @or_shift4_and1(i32 %x, i32 %y) {
define i32 @or_shift3_and8(i32 %x, i32 %y) {
; CHECK-LABEL: or_shift3_and8:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: leal (,%rdi,8), %eax
; CHECK-NEXT: andl $8, %esi
; CHECK-NEXT: orl %esi, %eax
@@ -120,7 +120,7 @@ define i32 @or_shift3_and8(i32 %x, i32 %y) {
define i64 @or_shift1_and1_64(i64 %x, i64 %y) {
; CHECK-LABEL: or_shift1_and1_64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $1, %esi
; CHECK-NEXT: leaq (%rsi,%rdi,2), %rax
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/overflow-intrinsic-setcc-fold.ll b/test/CodeGen/X86/overflow-intrinsic-setcc-fold.ll
new file mode 100644
index 000000000000..1f26933e24bc
--- /dev/null
+++ b/test/CodeGen/X86/overflow-intrinsic-setcc-fold.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+define i1 @saddo_not_i32(i32 %v1, i32 %v2) {
+; CHECK-LABEL: saddo_not_i32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: addl %esi, %edi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @saddo_not_i64(i64 %v1, i64 %v2) {
+; CHECK-LABEL: saddo_not_i64:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @uaddo_not_i32(i32 %v1, i32 %v2) {
+; CHECK-LABEL: uaddo_not_i32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: addl %esi, %edi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @uaddo_not_i64(i64 %v1, i64 %v2) {
+; CHECK-LABEL: uaddo_not_i64:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: addq %rsi, %rdi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @ssubo_not_i32(i32 %v1, i32 %v2) {
+; CHECK-LABEL: ssubo_not_i32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @ssub_not_i64(i64 %v1, i64 %v2) {
+; CHECK-LABEL: ssub_not_i64:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @usubo_not_i32(i32 %v1, i32 %v2) {
+; CHECK-LABEL: usubo_not_i32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @usubo_not_i64(i64 %v1, i64 %v2) {
+; CHECK-LABEL: usubo_not_i64:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: cmpq %rsi, %rdi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @smulo_not_i32(i32 %v1, i32 %v2) {
+; CHECK-LABEL: smulo_not_i32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: imull %esi, %edi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @smulo_not_i64(i64 %v1, i64 %v2) {
+; CHECK-LABEL: smulo_not_i64:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: imulq %rsi, %rdi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @umulo_not_i32(i32 %v1, i32 %v2) {
+; CHECK-LABEL: umulo_not_i32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: mull %esi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i1 @umulo_not_i64(i64 %v1, i64 %v2) {
+; CHECK-LABEL: umulo_not_i64:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: mulq %rsi
+; CHECK-NEXT: setno %al
+; CHECK-NEXT: retq
+entry:
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+
diff --git a/test/CodeGen/X86/overflow.ll b/test/CodeGen/X86/overflow.ll
index 00dadc4a80f6..a9fd19d4f5f7 100644
--- a/test/CodeGen/X86/overflow.ll
+++ b/test/CodeGen/X86/overflow.ll
@@ -4,46 +4,67 @@
define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind {
; X32-LABEL: mulhioverflow:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
+; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $16, %esp
-; X32-NEXT: movl 8(%ebp), %esi
-; X32-NEXT: movl 28(%ebp), %edi
-; X32-NEXT: movl %esp, %eax
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl 24(%ebp)
-; X32-NEXT: pushl 20(%ebp)
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl $0
-; X32-NEXT: pushl 16(%ebp)
-; X32-NEXT: pushl 12(%ebp)
-; X32-NEXT: pushl %eax
-; X32-NEXT: calll __multi3
-; X32-NEXT: addl $32, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: movl %esi, %eax
+; X32-NEXT: mull %ebp
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: movl %eax, %esi
+; X32-NEXT: addl %ebx, %esi
+; X32-NEXT: adcl $0, %ebp
+; X32-NEXT: movl %ecx, %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %ebx
+; X32-NEXT: addl %esi, %eax
+; X32-NEXT: adcl %ebp, %ebx
+; X32-NEXT: setb %al
+; X32-NEXT: movzbl %al, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: mull %edi
+; X32-NEXT: movl %edx, %esi
+; X32-NEXT: movl %eax, %ebp
+; X32-NEXT: addl %ebx, %ebp
+; X32-NEXT: adcl %ecx, %esi
+; X32-NEXT: xorl %ecx, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: movl %edx, %edi
+; X32-NEXT: movl %eax, %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: andl $1, %edi
-; X32-NEXT: addl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: adcl $0, %eax
+; X32-NEXT: mull %ecx
+; X32-NEXT: addl %ebx, %eax
+; X32-NEXT: adcl %edi, %edx
+; X32-NEXT: addl %ebp, %eax
+; X32-NEXT: adcl %esi, %edx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: andl $1, %ecx
+; X32-NEXT: addl %eax, %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl %ecx, (%eax)
+; X32-NEXT: adcl $0, %edx
+; X32-NEXT: movl %edx, 4(%eax)
; X32-NEXT: setb %cl
; X32-NEXT: movzbl %cl, %ecx
-; X32-NEXT: movl %edi, (%esi)
-; X32-NEXT: movl %eax, 4(%esi)
-; X32-NEXT: movl %ecx, 8(%esi)
-; X32-NEXT: movl $0, 12(%esi)
-; X32-NEXT: movl %esi, %eax
-; X32-NEXT: leal -8(%ebp), %esp
+; X32-NEXT: movl %ecx, 8(%eax)
+; X32-NEXT: movl $0, 12(%eax)
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
+; X32-NEXT: popl %ebx
; X32-NEXT: popl %ebp
; X32-NEXT: retl $4
;
; X64-LABEL: mulhioverflow:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %rsi
diff --git a/test/CodeGen/X86/overlap-shift.ll b/test/CodeGen/X86/overlap-shift.ll
index e987495f2c01..c1e15f9894aa 100644
--- a/test/CodeGen/X86/overlap-shift.ll
+++ b/test/CodeGen/X86/overlap-shift.ll
@@ -6,7 +6,7 @@
; Check that the shift gets turned into an LEA.
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | \
; RUN: not grep "mov E.X, E.X"
@G = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/packed_struct.ll b/test/CodeGen/X86/packed_struct.ll
index c9aeb7deb4f3..e00a772ecb00 100644
--- a/test/CodeGen/X86/packed_struct.ll
+++ b/test/CodeGen/X86/packed_struct.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 > %t
+; RUN: llc < %s > %t
; RUN: grep foos+5 %t
; RUN: grep foos+1 %t
; RUN: grep foos+9 %t
diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll
index 24db6ba9ca2f..0b4335f2b6f3 100644
--- a/test/CodeGen/X86/packss.ll
+++ b/test/CodeGen/X86/packss.ll
@@ -1,108 +1,101 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE --check-prefix=X32-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64-SSE --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=X86-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=X64-AVX --check-prefix=X64-AVX2
define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind {
-; X32-SSE-LABEL: trunc_ashr_v4i64:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: packsswb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
+; SSE-LABEL: trunc_ashr_v4i64:
+; SSE: # %bb.0:
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
-; X64-SSE-LABEL: trunc_ashr_v4i64:
-; X64-SSE: # BB#0:
-; X64-SSE-NEXT: psrad $31, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: psrad $31, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: packsswb %xmm1, %xmm0
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: trunc_ashr_v4i64:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
+; AVX1-LABEL: trunc_ashr_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: ret{{[l|q]}}
;
-; X64-AVX2-LABEL: trunc_ashr_v4i64:
-; X64-AVX2: # BB#0:
-; X64-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; AVX2-LABEL: trunc_ashr_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: ret{{[l|q]}}
%1 = ashr <4 x i64> %a, <i64 63, i64 63, i64 63, i64 63>
%2 = trunc <4 x i64> %1 to <4 x i32>
ret <4 x i32> %2
}
define <8 x i16> @trunc_ashr_v8i32(<8 x i32> %a) nounwind {
-; X32-SSE-LABEL: trunc_ashr_v8i32:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: psrad $31, %xmm1
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: packsswb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
+; SSE-LABEL: trunc_ashr_v8i32:
+; SSE: # %bb.0:
+; SSE-NEXT: psrad $31, %xmm1
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: ret{{[l|q]}}
;
-; X64-SSE-LABEL: trunc_ashr_v8i32:
-; X64-SSE: # BB#0:
-; X64-SSE-NEXT: psrad $31, %xmm1
-; X64-SSE-NEXT: psrad $31, %xmm0
-; X64-SSE-NEXT: packsswb %xmm1, %xmm0
-; X64-SSE-NEXT: retq
+; AVX1-LABEL: trunc_ashr_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: ret{{[l|q]}}
;
-; X64-AVX1-LABEL: trunc_ashr_v8i32:
-; X64-AVX1: # BB#0:
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vzeroupper
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: trunc_ashr_v8i32:
-; X64-AVX2: # BB#0:
-; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; AVX2-LABEL: trunc_ashr_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: ret{{[l|q]}}
%1 = ashr <8 x i32> %a, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
%2 = trunc <8 x i32> %1 to <8 x i16>
ret <8 x i16> %2
}
define <8 x i16> @trunc_ashr_v4i32_icmp_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
-; X32-SSE-LABEL: trunc_ashr_v4i32_icmp_v4i32:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: psrad $31, %xmm0
-; X32-SSE-NEXT: pcmpgtd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: packsswb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
+; X86-SSE-LABEL: trunc_ashr_v4i32_icmp_v4i32:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: psrad $31, %xmm0
+; X86-SSE-NEXT: pcmpgtd {{\.LCPI.*}}, %xmm1
+; X86-SSE-NEXT: packssdw %xmm1, %xmm0
+; X86-SSE-NEXT: ret{{[l|q]}}
+;
+; X86-AVX-LABEL: trunc_ashr_v4i32_icmp_v4i32:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; X86-AVX-NEXT: vpcmpgtd {{\.LCPI.*}}, %xmm1, %xmm1
+; X86-AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: ret{{[l|q]}}
;
; X64-SSE-LABEL: trunc_ashr_v4i32_icmp_v4i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: psrad $31, %xmm0
; X64-SSE-NEXT: pcmpgtd {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: packsswb %xmm1, %xmm0
-; X64-SSE-NEXT: retq
+; X64-SSE-NEXT: packssdw %xmm1, %xmm0
+; X64-SSE-NEXT: ret{{[l|q]}}
;
; X64-AVX-LABEL: trunc_ashr_v4i32_icmp_v4i32:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; X64-AVX-NEXT: vpcmpgtd {{.*}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT: retq
+; X64-AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: ret{{[l|q]}}
%1 = ashr <4 x i32> %a, <i32 31, i32 31, i32 31, i32 31>
%2 = icmp sgt <4 x i32> %b, <i32 1, i32 16, i32 255, i32 65535>
%3 = sext <4 x i1> %2 to <4 x i32>
diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index 700c9cf5f3af..64bbf214157f 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@@ -5,13 +5,13 @@
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-SSE-LABEL: test1:
-; CHECK-SSE: # BB#0:
+; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
; CHECK-SSE-NEXT: retl
;
; CHECK-AVX-LABEL: test1:
-; CHECK-AVX: # BB#0:
-; CHECK-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,0]
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,0]
; CHECK-AVX-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 >
ret <4 x i32> %C
@@ -19,19 +19,19 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-SSE2-LABEL: test2:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test2:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; CHECK-SSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test2:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; CHECK-AVX-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 3, i32 4 >
@@ -40,18 +40,18 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-SSE2-LABEL: test3:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test3:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; CHECK-SSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test3:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; CHECK-AVX-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 1, i32 2, i32 undef, i32 4 >
@@ -60,18 +60,18 @@ define <4 x i32> @test3(<4 x i32> %A, <4 x i32> %B) nounwind {
define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-SSE2-LABEL: test4:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; CHECK-SSE2-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test4:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test4:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; CHECK-AVX-NEXT: retl
%C = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
@@ -80,13 +80,13 @@ define <4 x i32> @test4(<4 x i32> %A, <4 x i32> %B) nounwind {
define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind {
; CHECK-SSE-LABEL: test5:
-; CHECK-SSE: # BB#0:
+; CHECK-SSE: # %bb.0:
; CHECK-SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; CHECK-SSE-NEXT: movapd %xmm1, %xmm0
; CHECK-SSE-NEXT: retl
;
; CHECK-AVX-LABEL: test5:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm1[1],xmm0[0]
; CHECK-AVX-NEXT: retl
%C = shufflevector <4 x float> %A, <4 x float> %B, <4 x i32> < i32 6, i32 7, i32 undef, i32 1 >
@@ -95,20 +95,20 @@ define <4 x float> @test5(<4 x float> %A, <4 x float> %B) nounwind {
define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-SSE2-LABEL: test6:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test6:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; CHECK-SSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test6:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; CHECK-AVX-NEXT: retl
%C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 3, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10 >
@@ -117,20 +117,20 @@ define <8 x i16> @test6(<8 x i16> %A, <8 x i16> %B) nounwind {
define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-SSE2-LABEL: test7:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test7:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; CHECK-SSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test7:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; CHECK-AVX-NEXT: retl
%C = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 undef, i32 6, i32 undef, i32 8, i32 9, i32 10, i32 11, i32 12 >
@@ -139,20 +139,20 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) nounwind {
define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
; CHECK-SSE2-LABEL: test8:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test8:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; CHECK-SSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test8:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
; CHECK-AVX-NEXT: retl
%C = shufflevector <16 x i8> %A, <16 x i8> %B, <16 x i32> < i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20 >
@@ -165,7 +165,7 @@ define <16 x i8> @test8(<16 x i8> %A, <16 x i8> %B) nounwind {
; was an UNDEF.)
define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-SSE2-LABEL: test9:
-; CHECK-SSE2: # BB#0:
+; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; CHECK-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
@@ -174,13 +174,13 @@ define <8 x i16> @test9(<8 x i16> %A, <8 x i16> %B) nounwind {
; CHECK-SSE2-NEXT: retl
;
; CHECK-SSSE3-LABEL: test9:
-; CHECK-SSSE3: # BB#0:
+; CHECK-SSSE3: # %bb.0:
; CHECK-SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; CHECK-SSSE3-NEXT: movdqa %xmm1, %xmm0
; CHECK-SSSE3-NEXT: retl
;
; CHECK-AVX-LABEL: test9:
-; CHECK-AVX: # BB#0:
+; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; CHECK-AVX-NEXT: retl
%C = shufflevector <8 x i16> %B, <8 x i16> %A, <8 x i32> < i32 undef, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0 >
diff --git a/test/CodeGen/X86/patchpoint-webkit_jscc.ll b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
index da33ef915638..bc87bb8a7bb2 100644
--- a/test/CodeGen/X86/patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/X86/patchpoint-webkit_jscc.ll
@@ -7,7 +7,6 @@
define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen:
-; CHECK: Lcfi
; CHECK: movq %r{{.+}}, (%rsp)
; CHECK: movq %r{{.+}}, %rax
; CHECK: Ltmp
@@ -16,7 +15,6 @@ entry:
; CHECK: movq %rax, (%rsp)
; CHECK: callq
; FAST-LABEL: jscall_patchpoint_codegen:
-; FAST: Lcfi
; FAST: movq %r{{.+}}, (%rsp)
; FAST: movq %r{{.+}}, %rax
; FAST: Ltmp
@@ -35,7 +33,6 @@ entry:
define i64 @jscall_patchpoint_codegen2(i64 %callee) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen2:
-; CHECK: Lcfi
; CHECK: movq $6, 24(%rsp)
; CHECK-NEXT: movl $4, 16(%rsp)
; CHECK-NEXT: movq $2, (%rsp)
@@ -43,7 +40,6 @@ entry:
; CHECK-NEXT: movabsq $-559038736, %r11
; CHECK-NEXT: callq *%r11
; FAST-LABEL: jscall_patchpoint_codegen2:
-; FAST: Lcfi
; FAST: movq $2, (%rsp)
; FAST-NEXT: movl $4, 16(%rsp)
; FAST-NEXT: movq $6, 24(%rsp)
@@ -59,7 +55,6 @@ entry:
define i64 @jscall_patchpoint_codegen3(i64 %callee) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen3:
-; CHECK: Lcfi
; CHECK: movq $10, 48(%rsp)
; CHECK-NEXT: movl $8, 36(%rsp)
; CHECK-NEXT: movq $6, 24(%rsp)
@@ -69,7 +64,6 @@ entry:
; CHECK-NEXT: movabsq $-559038736, %r11
; CHECK-NEXT: callq *%r11
; FAST-LABEL: jscall_patchpoint_codegen3:
-; FAST: Lcfi
; FAST: movq $2, (%rsp)
; FAST-NEXT: movl $4, 16(%rsp)
; FAST-NEXT: movq $6, 24(%rsp)
diff --git a/test/CodeGen/X86/pause.ll b/test/CodeGen/X86/pause.ll
index 70ac79f78f6e..2bace05e0122 100644
--- a/test/CodeGen/X86/pause.ll
+++ b/test/CodeGen/X86/pause.ll
@@ -6,7 +6,7 @@
define void @test_x86_sse2_pause() {
; CHECK-LABEL: test_x86_sse2_pause:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pause ## encoding: [0xf3,0x90]
; CHECK-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.sse2.pause()
diff --git a/test/CodeGen/X86/peep-setb.ll b/test/CodeGen/X86/peep-setb.ll
index 01e445a86221..3794b378b2ce 100644
--- a/test/CodeGen/X86/peep-setb.ll
+++ b/test/CodeGen/X86/peep-setb.ll
@@ -6,7 +6,7 @@
define i8 @test1(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpb %sil, %dil
; CHECK-NEXT: adcb $0, %sil
; CHECK-NEXT: movl %esi, %eax
@@ -19,7 +19,7 @@ define i8 @test1(i8 %a, i8 %b) nounwind {
define i32 @test2(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: adcl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -32,7 +32,7 @@ define i32 @test2(i32 %a, i32 %b) nounwind {
define i64 @test3(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpq %rsi, %rdi
; CHECK-NEXT: adcq $0, %rsi
; CHECK-NEXT: movq %rsi, %rax
@@ -45,7 +45,7 @@ define i64 @test3(i64 %a, i64 %b) nounwind {
define i8 @test4(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpb %sil, %dil
; CHECK-NEXT: sbbb $0, %sil
; CHECK-NEXT: movl %esi, %eax
@@ -58,7 +58,7 @@ define i8 @test4(i8 %a, i8 %b) nounwind {
define i32 @test5(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sbbl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -71,7 +71,7 @@ define i32 @test5(i32 %a, i32 %b) nounwind {
define i64 @test6(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpq %rsi, %rdi
; CHECK-NEXT: sbbq $0, %rsi
; CHECK-NEXT: movq %rsi, %rax
@@ -84,7 +84,7 @@ define i64 @test6(i64 %a, i64 %b) nounwind {
define i8 @test7(i8 %a, i8 %b) nounwind {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpb %sil, %dil
; CHECK-NEXT: adcb $0, %sil
; CHECK-NEXT: movl %esi, %eax
@@ -97,7 +97,7 @@ define i8 @test7(i8 %a, i8 %b) nounwind {
define i32 @test8(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: test8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: adcl $0, %esi
; CHECK-NEXT: movl %esi, %eax
@@ -110,7 +110,7 @@ define i32 @test8(i32 %a, i32 %b) nounwind {
define i64 @test9(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: test9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpq %rsi, %rdi
; CHECK-NEXT: adcq $0, %rsi
; CHECK-NEXT: movq %rsi, %rax
diff --git a/test/CodeGen/X86/peep-test-0.ll b/test/CodeGen/X86/peep-test-0.ll
index 1772f008b94e..2d5e7a1484ce 100644
--- a/test/CodeGen/X86/peep-test-0.ll
+++ b/test/CodeGen/X86/peep-test-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 > %t
+; RUN: llc < %s -mtriple=x86_64-- > %t
; RUN: not grep cmp %t
; RUN: not grep test %t
diff --git a/test/CodeGen/X86/peep-test-1.ll b/test/CodeGen/X86/peep-test-1.ll
index 7448da3894da..e3d34d30f7f4 100644
--- a/test/CodeGen/X86/peep-test-1.ll
+++ b/test/CodeGen/X86/peep-test-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 > %t
+; RUN: llc < %s -mtriple=i686-- > %t
; RUN: grep dec %t | count 1
; RUN: not grep test %t
; RUN: not grep cmp %t
diff --git a/test/CodeGen/X86/peep-test-2.ll b/test/CodeGen/X86/peep-test-2.ll
index e43b8ef54cf5..276e57551c19 100644
--- a/test/CodeGen/X86/peep-test-2.ll
+++ b/test/CodeGen/X86/peep-test-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -march=x86 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs | FileCheck %s
; CHECK: testl
diff --git a/test/CodeGen/X86/peep-test-3.ll b/test/CodeGen/X86/peep-test-3.ll
index b3d4f585f45d..9f8806a379b5 100644
--- a/test/CodeGen/X86/peep-test-3.ll
+++ b/test/CodeGen/X86/peep-test-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -post-RA-scheduler=false | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- -post-RA-scheduler=false | FileCheck %s
; rdar://7226797
; LLVM should omit the testl and use the flags result from the orl.
diff --git a/test/CodeGen/X86/peep-test-4.ll b/test/CodeGen/X86/peep-test-4.ll
index 832262aba7e4..788f8fdbc7b7 100644
--- a/test/CodeGen/X86/peep-test-4.ll
+++ b/test/CodeGen/X86/peep-test-4.ll
@@ -6,10 +6,10 @@ declare void @foo64(i64)
define void @neg(i32 %x) nounwind {
; CHECK-LABEL: neg:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: negl %edi
; CHECK-NEXT: je .LBB0_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB0_1: # %return
; CHECK-NEXT: retq
@@ -27,10 +27,10 @@ return:
define void @sar(i32 %x) nounwind {
; CHECK-LABEL: sar:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: sarl %edi
; CHECK-NEXT: je .LBB1_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB1_1: # %return
; CHECK-NEXT: retq
@@ -48,10 +48,10 @@ return:
define void @shr(i32 %x) nounwind {
; CHECK-LABEL: shr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shrl %edi
; CHECK-NEXT: je .LBB2_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB2_1: # %return
; CHECK-NEXT: retq
@@ -69,10 +69,10 @@ return:
define void @shri(i32 %x) nounwind {
; CHECK-LABEL: shri:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shrl $3, %edi
; CHECK-NEXT: je .LBB3_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB3_1: # %return
; CHECK-NEXT: retq
@@ -90,10 +90,10 @@ return:
define void @shl(i32 %x) nounwind {
; CHECK-LABEL: shl:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addl %edi, %edi
; CHECK-NEXT: je .LBB4_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB4_1: # %return
; CHECK-NEXT: retq
@@ -111,10 +111,10 @@ return:
define void @shli(i32 %x) nounwind {
; CHECK-LABEL: shli:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shll $4, %edi
; CHECK-NEXT: je .LBB5_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB5_1: # %return
; CHECK-NEXT: retq
@@ -132,7 +132,7 @@ return:
define zeroext i1 @adc(i128 %x) nounwind {
; CHECK-LABEL: adc:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: adcq $0, %rsi
@@ -145,7 +145,7 @@ define zeroext i1 @adc(i128 %x) nounwind {
define zeroext i1 @sbb(i128 %x, i128 %y) nounwind {
; CHECK-LABEL: sbb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpq %rdx, %rdi
; CHECK-NEXT: sbbq %rcx, %rsi
; CHECK-NEXT: setns %al
@@ -157,10 +157,10 @@ define zeroext i1 @sbb(i128 %x, i128 %y) nounwind {
define void @andn(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: andn:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andnl %esi, %edi, %edi
; CHECK-NEXT: je .LBB8_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB8_1: # %return
; CHECK-NEXT: retq
@@ -180,10 +180,10 @@ return:
declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
define void @bextr(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: bextr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextrl %esi, %edi, %edi
; CHECK-NEXT: je .LBB9_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB9_1: # %return
; CHECK-NEXT: retq
@@ -202,10 +202,10 @@ return:
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define void @popcnt(i32 %x) nounwind {
; CHECK-LABEL: popcnt:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: popcntl %edi, %edi
; CHECK-NEXT: je .LBB10_1
-; CHECK-NEXT: # BB#2: # %bb
+; CHECK-NEXT: # %bb.2: # %bb
; CHECK-NEXT: jmp foo # TAILCALL
; CHECK-NEXT: .LBB10_1: # %return
; CHECK-NEXT: retq
@@ -222,7 +222,7 @@ return:
declare i64 @llvm.cttz.i64(i64, i1)
define i64 @testCTZ(i64 %v) nounwind {
; CHECK-LABEL: testCTZ:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzcntq %rdi, %rcx
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: cmovaeq %rcx, %rax
@@ -236,11 +236,11 @@ define i64 @testCTZ(i64 %v) nounwind {
declare i32 @llvm.cttz.i32(i32, i1)
define void @testCTZ2(i32 %v) nounwind {
; CHECK-LABEL: testCTZ2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: tzcntl %edi, %ebx
; CHECK-NEXT: jb .LBB12_2
-; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: # %bb.1: # %bb
; CHECK-NEXT: movl %ebx, %edi
; CHECK-NEXT: callq foo
; CHECK-NEXT: .LBB12_2: # %return
@@ -262,11 +262,11 @@ return:
define void @testCTZ3(i32 %v) nounwind {
; CHECK-LABEL: testCTZ3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: tzcntl %edi, %ebx
; CHECK-NEXT: jae .LBB13_2
-; CHECK-NEXT: # BB#1: # %bb
+; CHECK-NEXT: # %bb.1: # %bb
; CHECK-NEXT: movl %ebx, %edi
; CHECK-NEXT: callq foo
; CHECK-NEXT: .LBB13_2: # %return
@@ -289,7 +289,7 @@ return:
declare i64 @llvm.ctlz.i64(i64, i1)
define i64 @testCLZ(i64 %v) nounwind {
; CHECK-LABEL: testCLZ:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: lzcntq %rdi, %rcx
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: cmovaeq %rcx, %rax
@@ -303,7 +303,7 @@ define i64 @testCLZ(i64 %v) nounwind {
declare i64 @llvm.ctpop.i64(i64)
define i64 @testPOPCNT(i64 %v) nounwind {
; CHECK-LABEL: testPOPCNT:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: popcntq %rdi, %rcx
; CHECK-NEXT: movl $255, %eax
; CHECK-NEXT: cmovneq %rcx, %rax
diff --git a/test/CodeGen/X86/peephole-cvt-sse.ll b/test/CodeGen/X86/peephole-cvt-sse.ll
index ecf78a46c636..7e9290f2d61e 100644
--- a/test/CodeGen/X86/peephole-cvt-sse.ll
+++ b/test/CodeGen/X86/peephole-cvt-sse.ll
@@ -6,12 +6,12 @@
define <2 x double> @peephole_cvtps2pd(<4 x float>* %a0) {
; X86-64-LABEL: peephole_cvtps2pd:
-; X86-64: # BB#0:
+; X86-64: # %bb.0:
; X86-64-NEXT: cvtps2pd (%rdi), %xmm0
; X86-64-NEXT: retq
;
; I386-LABEL: peephole_cvtps2pd:
-; I386: # BB#0:
+; I386: # %bb.0:
; I386-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NEXT: cvtps2pd (%eax), %xmm0
; I386-NEXT: retl
@@ -23,12 +23,12 @@ define <2 x double> @peephole_cvtps2pd(<4 x float>* %a0) {
define <2 x double> @peephole_cvtdq2pd(<4 x i32>* %a0) {
; X86-64-LABEL: peephole_cvtdq2pd:
-; X86-64: # BB#0:
+; X86-64: # %bb.0:
; X86-64-NEXT: cvtdq2pd (%rdi), %xmm0
; X86-64-NEXT: retq
;
; I386-LABEL: peephole_cvtdq2pd:
-; I386: # BB#0:
+; I386: # %bb.0:
; I386-NEXT: movl {{[0-9]+}}(%esp), %eax
; I386-NEXT: cvtdq2pd (%eax), %xmm0
; I386-NEXT: retl
diff --git a/test/CodeGen/X86/peephole-multiple-folds.ll b/test/CodeGen/X86/peephole-multiple-folds.ll
index 9fcc1a20798b..848f9fe2412c 100644
--- a/test/CodeGen/X86/peephole-multiple-folds.ll
+++ b/test/CodeGen/X86/peephole-multiple-folds.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- -mcpu=core-avx2 < %s | FileCheck %s
;
; Test multiple peephole-time folds in a single basic block.
; <rdar://problem/16478629>
diff --git a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
index 4bdfee6f81eb..66047e3677f6 100644
--- a/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
+++ b/test/CodeGen/X86/peephole-na-phys-copy-folding.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i386-linux-gnu %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+sahf %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
@@ -10,16 +11,39 @@
@L = external global i32
@M = external global i8
+
declare i32 @bar(i64)
-; CHECK-LABEL: plus_one
-; CHECK-NOT: seto
-; CHECK-NOT: lahf
-; CHECK-NOT: sahf
-; CHECK-NOT: pushf
-; CHECK-NOT: popf
-; CHECK: incl L
-define i1 @plus_one() {
+define i1 @plus_one() nounwind {
+; CHECK32-LABEL: plus_one:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movb M, %al
+; CHECK32-NEXT: incl L
+; CHECK32-NEXT: jne .LBB0_2
+; CHECK32-NEXT: # %bb.1: # %entry
+; CHECK32-NEXT: andb $8, %al
+; CHECK32-NEXT: je .LBB0_2
+; CHECK32-NEXT: # %bb.3: # %exit2
+; CHECK32-NEXT: xorl %eax, %eax
+; CHECK32-NEXT: retl
+; CHECK32-NEXT: .LBB0_2: # %exit
+; CHECK32-NEXT: movb $1, %al
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: plus_one:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: movb {{.*}}(%rip), %al
+; CHECK64-NEXT: incl {{.*}}(%rip)
+; CHECK64-NEXT: jne .LBB0_2
+; CHECK64-NEXT: # %bb.1: # %entry
+; CHECK64-NEXT: andb $8, %al
+; CHECK64-NEXT: je .LBB0_2
+; CHECK64-NEXT: # %bb.3: # %exit2
+; CHECK64-NEXT: xorl %eax, %eax
+; CHECK64-NEXT: retq
+; CHECK64-NEXT: .LBB0_2: # %exit
+; CHECK64-NEXT: movb $1, %al
+; CHECK64-NEXT: retq
entry:
%loaded_L = load i32, i32* @L
%val = add nsw i32 %loaded_L, 1 ; N.B. will emit inc.
@@ -38,14 +62,36 @@ exit2:
ret i1 false
}
-; CHECK-LABEL: plus_forty_two
-; CHECK-NOT: seto
-; CHECK-NOT: lahf
-; CHECK-NOT: sahf
-; CHECK-NOT: pushf
-; CHECK-NOT: popf
-; CHECK: addl $42,
-define i1 @plus_forty_two() {
+define i1 @plus_forty_two() nounwind {
+; CHECK32-LABEL: plus_forty_two:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movb M, %al
+; CHECK32-NEXT: addl $42, L
+; CHECK32-NEXT: jne .LBB1_2
+; CHECK32-NEXT: # %bb.1: # %entry
+; CHECK32-NEXT: andb $8, %al
+; CHECK32-NEXT: je .LBB1_2
+; CHECK32-NEXT: # %bb.3: # %exit2
+; CHECK32-NEXT: xorl %eax, %eax
+; CHECK32-NEXT: retl
+; CHECK32-NEXT: .LBB1_2: # %exit
+; CHECK32-NEXT: movb $1, %al
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: plus_forty_two:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: movb {{.*}}(%rip), %al
+; CHECK64-NEXT: addl $42, {{.*}}(%rip)
+; CHECK64-NEXT: jne .LBB1_2
+; CHECK64-NEXT: # %bb.1: # %entry
+; CHECK64-NEXT: andb $8, %al
+; CHECK64-NEXT: je .LBB1_2
+; CHECK64-NEXT: # %bb.3: # %exit2
+; CHECK64-NEXT: xorl %eax, %eax
+; CHECK64-NEXT: retq
+; CHECK64-NEXT: .LBB1_2: # %exit
+; CHECK64-NEXT: movb $1, %al
+; CHECK64-NEXT: retq
entry:
%loaded_L = load i32, i32* @L
%val = add nsw i32 %loaded_L, 42 ; N.B. won't emit inc.
@@ -64,14 +110,36 @@ exit2:
ret i1 false
}
-; CHECK-LABEL: minus_one
-; CHECK-NOT: seto
-; CHECK-NOT: lahf
-; CHECK-NOT: sahf
-; CHECK-NOT: pushf
-; CHECK-NOT: popf
-; CHECK: decl L
-define i1 @minus_one() {
+define i1 @minus_one() nounwind {
+; CHECK32-LABEL: minus_one:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movb M, %al
+; CHECK32-NEXT: decl L
+; CHECK32-NEXT: jne .LBB2_2
+; CHECK32-NEXT: # %bb.1: # %entry
+; CHECK32-NEXT: andb $8, %al
+; CHECK32-NEXT: je .LBB2_2
+; CHECK32-NEXT: # %bb.3: # %exit2
+; CHECK32-NEXT: xorl %eax, %eax
+; CHECK32-NEXT: retl
+; CHECK32-NEXT: .LBB2_2: # %exit
+; CHECK32-NEXT: movb $1, %al
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: minus_one:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: movb {{.*}}(%rip), %al
+; CHECK64-NEXT: decl {{.*}}(%rip)
+; CHECK64-NEXT: jne .LBB2_2
+; CHECK64-NEXT: # %bb.1: # %entry
+; CHECK64-NEXT: andb $8, %al
+; CHECK64-NEXT: je .LBB2_2
+; CHECK64-NEXT: # %bb.3: # %exit2
+; CHECK64-NEXT: xorl %eax, %eax
+; CHECK64-NEXT: retq
+; CHECK64-NEXT: .LBB2_2: # %exit
+; CHECK64-NEXT: movb $1, %al
+; CHECK64-NEXT: retq
entry:
%loaded_L = load i32, i32* @L
%val = add nsw i32 %loaded_L, -1 ; N.B. will emit dec.
@@ -90,14 +158,36 @@ exit2:
ret i1 false
}
-; CHECK-LABEL: minus_forty_two
-; CHECK-NOT: seto
-; CHECK-NOT: lahf
-; CHECK-NOT: sahf
-; CHECK-NOT: pushf
-; CHECK-NOT: popf
-; CHECK: addl $-42,
-define i1 @minus_forty_two() {
+define i1 @minus_forty_two() nounwind {
+; CHECK32-LABEL: minus_forty_two:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movb M, %al
+; CHECK32-NEXT: addl $-42, L
+; CHECK32-NEXT: jne .LBB3_2
+; CHECK32-NEXT: # %bb.1: # %entry
+; CHECK32-NEXT: andb $8, %al
+; CHECK32-NEXT: je .LBB3_2
+; CHECK32-NEXT: # %bb.3: # %exit2
+; CHECK32-NEXT: xorl %eax, %eax
+; CHECK32-NEXT: retl
+; CHECK32-NEXT: .LBB3_2: # %exit
+; CHECK32-NEXT: movb $1, %al
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: minus_forty_two:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: movb {{.*}}(%rip), %al
+; CHECK64-NEXT: addl $-42, {{.*}}(%rip)
+; CHECK64-NEXT: jne .LBB3_2
+; CHECK64-NEXT: # %bb.1: # %entry
+; CHECK64-NEXT: andb $8, %al
+; CHECK64-NEXT: je .LBB3_2
+; CHECK64-NEXT: # %bb.3: # %exit2
+; CHECK64-NEXT: xorl %eax, %eax
+; CHECK64-NEXT: retq
+; CHECK64-NEXT: .LBB3_2: # %exit
+; CHECK64-NEXT: movb $1, %al
+; CHECK64-NEXT: retq
entry:
%loaded_L = load i32, i32* @L
%val = add nsw i32 %loaded_L, -42 ; N.B. won't emit dec.
@@ -116,14 +206,75 @@ exit2:
ret i1 false
}
-; CHECK-LABEL: test_intervening_call:
-; CHECK: cmpxchg
-; CHECK: seto %al
-; CHECK-NEXT: lahf
-; CHECK: call{{[lq]}} bar
-; CHECK: addb $127, %al
-; CHECK-NEXT: sahf
-define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) {
+define i64 @test_intervening_call(i64* %foo, i64 %bar, i64 %baz) nounwind {
+; CHECK32-LABEL: test_intervening_call:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: pushl %ebp
+; CHECK32-NEXT: movl %esp, %ebp
+; CHECK32-NEXT: pushl %ebx
+; CHECK32-NEXT: pushl %esi
+; CHECK32-NEXT: movl 12(%ebp), %eax
+; CHECK32-NEXT: movl 16(%ebp), %edx
+; CHECK32-NEXT: movl 20(%ebp), %ebx
+; CHECK32-NEXT: movl 24(%ebp), %ecx
+; CHECK32-NEXT: movl 8(%ebp), %esi
+; CHECK32-NEXT: lock cmpxchg8b (%esi)
+; CHECK32-NEXT: pushl %eax
+; CHECK32-NEXT: seto %al
+; CHECK32-NEXT: lahf
+; CHECK32-NEXT: movl %eax, %esi
+; CHECK32-NEXT: popl %eax
+; CHECK32-NEXT: subl $8, %esp
+; CHECK32-NEXT: pushl %edx
+; CHECK32-NEXT: pushl %eax
+; CHECK32-NEXT: calll bar
+; CHECK32-NEXT: addl $16, %esp
+; CHECK32-NEXT: movl %esi, %eax
+; CHECK32-NEXT: addb $127, %al
+; CHECK32-NEXT: sahf
+; CHECK32-NEXT: jne .LBB4_3
+; CHECK32-NEXT: # %bb.1: # %t
+; CHECK32-NEXT: movl $42, %eax
+; CHECK32-NEXT: jmp .LBB4_2
+; CHECK32-NEXT: .LBB4_3: # %f
+; CHECK32-NEXT: xorl %eax, %eax
+; CHECK32-NEXT: .LBB4_2: # %t
+; CHECK32-NEXT: xorl %edx, %edx
+; CHECK32-NEXT: popl %esi
+; CHECK32-NEXT: popl %ebx
+; CHECK32-NEXT: popl %ebp
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_intervening_call:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: pushq %rbp
+; CHECK64-NEXT: movq %rsp, %rbp
+; CHECK64-NEXT: pushq %rbx
+; CHECK64-NEXT: pushq %rax
+; CHECK64-NEXT: movq %rsi, %rax
+; CHECK64-NEXT: lock cmpxchgq %rdx, (%rdi)
+; CHECK64-NEXT: pushq %rax
+; CHECK64-NEXT: seto %al
+; CHECK64-NEXT: lahf
+; CHECK64-NEXT: movq %rax, %rbx
+; CHECK64-NEXT: popq %rax
+; CHECK64-NEXT: movq %rax, %rdi
+; CHECK64-NEXT: callq bar
+; CHECK64-NEXT: movq %rbx, %rax
+; CHECK64-NEXT: addb $127, %al
+; CHECK64-NEXT: sahf
+; CHECK64-NEXT: jne .LBB4_3
+; CHECK64-NEXT: # %bb.1: # %t
+; CHECK64-NEXT: movl $42, %eax
+; CHECK64-NEXT: jmp .LBB4_2
+; CHECK64-NEXT: .LBB4_3: # %f
+; CHECK64-NEXT: xorl %eax, %eax
+; CHECK64-NEXT: .LBB4_2: # %t
+; CHECK64-NEXT: addq $8, %rsp
+; CHECK64-NEXT: popq %rbx
+; CHECK64-NEXT: popq %rbp
+; CHECK64-NEXT: retq
+entry:
; cmpxchg sets EFLAGS, call clobbers it, then br uses EFLAGS.
%cx = cmpxchg i64* %foo, i64 %bar, i64 %baz seq_cst seq_cst
%v = extractvalue { i64, i1 } %cx, 0
@@ -138,37 +289,83 @@ f:
ret i64 0
}
-; CHECK-LABEL: test_two_live_flags:
-; CHECK: cmpxchg
-; CHECK: seto %al
-; CHECK-NEXT: lahf
-; Save result of the first cmpxchg into a temporary.
-; For 32-bit ISA, EDX, EAX are used by the results.
-; EAX, EBX, ECX, and EDX are used to set the arguments.
-; That leaves us EDI and ESI.
-; CHECK32-NEXT: movl %[[AX:eax]], %[[TMP:e[ds]i]]
-; For 64-bit ISA, RAX is used for both the result and argument.
-; This leaves us plenty of choices for the temporary. For now,
-; this is rdx, but any register could do.
-; CHECK64-NEXT: mov{{[lq]}} %[[AX:[er]ax]], %[[TMP:rdx]]
-; CHECK: cmpxchg
-; CHECK-NEXT: sete %al
-; Save result of the second cmpxchg onto the stack.
-; CHECK-NEXT: push{{[lq]}} %[[AX]]
-; Restore result of the first cmpxchg from D, put it back in EFLAGS.
-; CHECK-NEXT: mov{{[lq]}} %[[TMP]], %[[AX]]
-; CHECK-NEXT: addb $127, %al
-; CHECK-NEXT: sahf
-; Restore result of the second cmpxchg from the stack.
-; CHECK-NEXT: pop{{[lq]}} %[[AX]]
-; Test from EFLAGS restored from first cmpxchg, jump if that fails.
-; CHECK-NEXT: jne
-; Fallthrough to test the second cmpxchg's result.
-; CHECK: testb %al, %al
-; CHECK-NEXT: je
-define i64 @test_two_live_flags(
- i64* %foo0, i64 %bar0, i64 %baz0,
- i64* %foo1, i64 %bar1, i64 %baz1) {
+define i64 @test_two_live_flags(i64* %foo0, i64 %bar0, i64 %baz0, i64* %foo1, i64 %bar1, i64 %baz1) nounwind {
+; CHECK32-LABEL: test_two_live_flags:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: pushl %ebp
+; CHECK32-NEXT: movl %esp, %ebp
+; CHECK32-NEXT: pushl %ebx
+; CHECK32-NEXT: pushl %edi
+; CHECK32-NEXT: pushl %esi
+; CHECK32-NEXT: movl 44(%ebp), %edi
+; CHECK32-NEXT: movl 12(%ebp), %eax
+; CHECK32-NEXT: movl 16(%ebp), %edx
+; CHECK32-NEXT: movl 20(%ebp), %ebx
+; CHECK32-NEXT: movl 24(%ebp), %ecx
+; CHECK32-NEXT: movl 8(%ebp), %esi
+; CHECK32-NEXT: lock cmpxchg8b (%esi)
+; CHECK32-NEXT: seto %al
+; CHECK32-NEXT: lahf
+; CHECK32-NEXT: movl %eax, %esi
+; CHECK32-NEXT: movl 32(%ebp), %eax
+; CHECK32-NEXT: movl 36(%ebp), %edx
+; CHECK32-NEXT: movl %edi, %ecx
+; CHECK32-NEXT: movl 40(%ebp), %ebx
+; CHECK32-NEXT: movl 28(%ebp), %edi
+; CHECK32-NEXT: lock cmpxchg8b (%edi)
+; CHECK32-NEXT: sete %al
+; CHECK32-NEXT: pushl %eax
+; CHECK32-NEXT: movl %esi, %eax
+; CHECK32-NEXT: addb $127, %al
+; CHECK32-NEXT: sahf
+; CHECK32-NEXT: popl %eax
+; CHECK32-NEXT: jne .LBB5_4
+; CHECK32-NEXT: # %bb.1: # %entry
+; CHECK32-NEXT: testb %al, %al
+; CHECK32-NEXT: je .LBB5_4
+; CHECK32-NEXT: # %bb.2: # %t
+; CHECK32-NEXT: movl $42, %eax
+; CHECK32-NEXT: jmp .LBB5_3
+; CHECK32-NEXT: .LBB5_4: # %f
+; CHECK32-NEXT: xorl %eax, %eax
+; CHECK32-NEXT: .LBB5_3: # %t
+; CHECK32-NEXT: xorl %edx, %edx
+; CHECK32-NEXT: popl %esi
+; CHECK32-NEXT: popl %edi
+; CHECK32-NEXT: popl %ebx
+; CHECK32-NEXT: popl %ebp
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: test_two_live_flags:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: pushq %rbp
+; CHECK64-NEXT: movq %rsp, %rbp
+; CHECK64-NEXT: movq %rsi, %rax
+; CHECK64-NEXT: lock cmpxchgq %rdx, (%rdi)
+; CHECK64-NEXT: seto %al
+; CHECK64-NEXT: lahf
+; CHECK64-NEXT: movq %rax, %rdx
+; CHECK64-NEXT: movq %r8, %rax
+; CHECK64-NEXT: lock cmpxchgq %r9, (%rcx)
+; CHECK64-NEXT: sete %al
+; CHECK64-NEXT: pushq %rax
+; CHECK64-NEXT: movq %rdx, %rax
+; CHECK64-NEXT: addb $127, %al
+; CHECK64-NEXT: sahf
+; CHECK64-NEXT: popq %rax
+; CHECK64-NEXT: jne .LBB5_3
+; CHECK64-NEXT: # %bb.1: # %entry
+; CHECK64-NEXT: testb %al, %al
+; CHECK64-NEXT: je .LBB5_3
+; CHECK64-NEXT: # %bb.2: # %t
+; CHECK64-NEXT: movl $42, %eax
+; CHECK64-NEXT: popq %rbp
+; CHECK64-NEXT: retq
+; CHECK64-NEXT: .LBB5_3: # %f
+; CHECK64-NEXT: xorl %eax, %eax
+; CHECK64-NEXT: popq %rbp
+; CHECK64-NEXT: retq
+entry:
%cx0 = cmpxchg i64* %foo0, i64 %bar0, i64 %baz0 seq_cst seq_cst
%p0 = extractvalue { i64, i1 } %cx0, 1
%cx1 = cmpxchg i64* %foo1, i64 %bar1, i64 %baz1 seq_cst seq_cst
@@ -183,15 +380,30 @@ f:
ret i64 0
}
-; CHECK-LABEL: asm_clobbering_flags:
-; CHECK: test
-; CHECK-NEXT: setg
-; CHECK-NEXT: #APP
-; CHECK-NEXT: bsfl
-; CHECK-NEXT: #NO_APP
-; CHECK-NEXT: movl
-; CHECK-NEXT: ret
-define i1 @asm_clobbering_flags(i32* %mem) {
+define i1 @asm_clobbering_flags(i32* %mem) nounwind {
+; CHECK32-LABEL: asm_clobbering_flags:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT: movl (%ecx), %edx
+; CHECK32-NEXT: testl %edx, %edx
+; CHECK32-NEXT: setg %al
+; CHECK32-NEXT: #APP
+; CHECK32-NEXT: bsfl %edx, %edx
+; CHECK32-NEXT: #NO_APP
+; CHECK32-NEXT: movl %edx, (%ecx)
+; CHECK32-NEXT: retl
+;
+; CHECK64-LABEL: asm_clobbering_flags:
+; CHECK64: # %bb.0: # %entry
+; CHECK64-NEXT: movl (%rdi), %ecx
+; CHECK64-NEXT: testl %ecx, %ecx
+; CHECK64-NEXT: setg %al
+; CHECK64-NEXT: #APP
+; CHECK64-NEXT: bsfl %ecx, %ecx
+; CHECK64-NEXT: #NO_APP
+; CHECK64-NEXT: movl %ecx, (%rdi)
+; CHECK64-NEXT: retq
+entry:
%val = load i32, i32* %mem, align 4
%cmp = icmp sgt i32 %val, 0
%res = tail call i32 asm "bsfl $1,$0", "=r,r,~{cc},~{dirflag},~{fpsr},~{flags}"(i32 %val)
diff --git a/test/CodeGen/X86/peephole-recurrence.mir b/test/CodeGen/X86/peephole-recurrence.mir
index af57a4fd526f..3fc8b2a31373 100644
--- a/test/CodeGen/X86/peephole-recurrence.mir
+++ b/test/CodeGen/X86/peephole-recurrence.mir
@@ -4,54 +4,54 @@
define i32 @foo(i32 %a) {
bb0:
br label %bb1
-
+
bb1: ; preds = %bb7, %bb0
%vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
%cond0 = icmp eq i32 %a, 0
br i1 %cond0, label %bb4, label %bb3
-
+
bb3: ; preds = %bb1
br label %bb4
-
+
bb4: ; preds = %bb1, %bb3
%vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
%cond1 = icmp eq i32 %vreg5, 0
br i1 %cond1, label %bb7, label %bb6
-
+
bb6: ; preds = %bb4
br label %bb7
-
+
bb7: ; preds = %bb4, %bb6
%vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
%vreg2 = add i32 %vreg5, %vreg0
%vreg3 = add i32 %vreg1, %vreg2
%cond2 = icmp slt i32 %vreg3, 10
br i1 %cond2, label %bb1, label %bb8
-
+
bb8: ; preds = %bb7
ret i32 0
}
-
+
define i32 @bar(i32 %a, i32* %p) {
bb0:
br label %bb1
-
+
bb1: ; preds = %bb7, %bb0
%vreg0 = phi i32 [ 0, %bb0 ], [ %vreg3, %bb7 ]
%cond0 = icmp eq i32 %a, 0
br i1 %cond0, label %bb4, label %bb3
-
+
bb3: ; preds = %bb1
br label %bb4
-
+
bb4: ; preds = %bb1, %bb3
%vreg5 = phi i32 [ 2, %bb3 ], [ 1, %bb1 ]
%cond1 = icmp eq i32 %vreg5, 0
br i1 %cond1, label %bb7, label %bb6
-
+
bb6: ; preds = %bb4
br label %bb7
-
+
bb7: ; preds = %bb4, %bb6
%vreg1 = phi i32 [ 2, %bb6 ], [ 1, %bb4 ]
%vreg2 = add i32 %vreg5, %vreg0
@@ -59,7 +59,7 @@
%vreg3 = add i32 %vreg1, %vreg2
%cond2 = icmp slt i32 %vreg3, 10
br i1 %cond2, label %bb1, label %bb8
-
+
bb8: ; preds = %bb7
ret i32 0
}
@@ -71,7 +71,7 @@
# the recurrence are tied. This will remove redundant copy instruction.
name: foo
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr32, preferred-register: '' }
- { id: 1, class: gr32, preferred-register: '' }
- { id: 2, class: gr32, preferred-register: '' }
@@ -85,60 +85,60 @@ registers:
- { id: 10, class: gr32, preferred-register: '' }
- { id: 11, class: gr32, preferred-register: '' }
- { id: 12, class: gr32, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%edi', virtual-reg: '%4' }
body: |
bb.0.bb0:
- successors: %bb.1.bb1(0x80000000)
+ successors: %bb.1(0x80000000)
liveins: %edi
-
+
%4 = COPY %edi
%5 = MOV32r0 implicit-def dead %eflags
-
+
bb.1.bb1:
- successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
-
- ; CHECK: %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
- %0 = PHI %5, %bb.0.bb0, %3, %bb.5.bb7
+ successors: %bb.3(0x30000000), %bb.2(0x50000000)
+
+ ; CHECK: %0:gr32 = PHI %5, %bb.0, %3, %bb.5
+ %0 = PHI %5, %bb.0, %3, %bb.5
%6 = MOV32ri 1
TEST32rr %4, %4, implicit-def %eflags
- JE_1 %bb.3.bb4, implicit %eflags
- JMP_1 %bb.2.bb3
-
+ JE_1 %bb.3, implicit %eflags
+ JMP_1 %bb.2
+
bb.2.bb3:
- successors: %bb.3.bb4(0x80000000)
-
+ successors: %bb.3(0x80000000)
+
%7 = MOV32ri 2
-
+
bb.3.bb4:
- successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
-
- %1 = PHI %6, %bb.1.bb1, %7, %bb.2.bb3
+ successors: %bb.5(0x30000000), %bb.4(0x50000000)
+
+ %1 = PHI %6, %bb.1, %7, %bb.2
TEST32rr %1, %1, implicit-def %eflags
- JE_1 %bb.5.bb7, implicit %eflags
- JMP_1 %bb.4.bb6
-
+ JE_1 %bb.5, implicit %eflags
+ JMP_1 %bb.4
+
bb.4.bb6:
- successors: %bb.5.bb7(0x80000000)
-
+ successors: %bb.5(0x80000000)
+
%9 = MOV32ri 2
-
+
bb.5.bb7:
- successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
-
- %2 = PHI %6, %bb.3.bb4, %9, %bb.4.bb6
+ successors: %bb.1(0x7c000000), %bb.6(0x04000000)
+
+ %2 = PHI %6, %bb.3, %9, %bb.4
%10 = ADD32rr %1, %0, implicit-def dead %eflags
- ; CHECK: %10 = ADD32rr
+ ; CHECK: %10:gr32 = ADD32rr
; CHECK-SAME: %0,
; CHECK-SAME: %1,
%3 = ADD32rr %2, killed %10, implicit-def dead %eflags
- ; CHECK: %3 = ADD32rr
+ ; CHECK: %3:gr32 = ADD32rr
; CHECK-SAME: %10,
; CHECK-SAME: %2,
%11 = SUB32ri8 %3, 10, implicit-def %eflags
- JL_1 %bb.1.bb1, implicit %eflags
- JMP_1 %bb.6.bb8
-
+ JL_1 %bb.1, implicit %eflags
+ JMP_1 %bb.6
+
bb.6.bb8:
%12 = MOV32r0 implicit-def dead %eflags
%eax = COPY %12
@@ -149,10 +149,10 @@ body: |
# Here a recurrence is formulated around %0, %11, and %3, but operands should
# not be commuted because %0 has a use outside of recurrence. This is to
# prevent the case of commuting operands ties the values with overlapping live
-# ranges.
+# ranges.
name: bar
tracksRegLiveness: true
-registers:
+registers:
- { id: 0, class: gr32, preferred-register: '' }
- { id: 1, class: gr32, preferred-register: '' }
- { id: 2, class: gr32, preferred-register: '' }
@@ -167,63 +167,63 @@ registers:
- { id: 11, class: gr32, preferred-register: '' }
- { id: 12, class: gr32, preferred-register: '' }
- { id: 13, class: gr32, preferred-register: '' }
-liveins:
+liveins:
- { reg: '%edi', virtual-reg: '%4' }
- { reg: '%rsi', virtual-reg: '%5' }
body: |
bb.0.bb0:
- successors: %bb.1.bb1(0x80000000)
+ successors: %bb.1(0x80000000)
liveins: %edi, %rsi
-
+
%5 = COPY %rsi
%4 = COPY %edi
%6 = MOV32r0 implicit-def dead %eflags
-
+
bb.1.bb1:
- successors: %bb.3.bb4(0x30000000), %bb.2.bb3(0x50000000)
-
- %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
- ; CHECK: %0 = PHI %6, %bb.0.bb0, %3, %bb.5.bb7
+ successors: %bb.3(0x30000000), %bb.2(0x50000000)
+
+ %0 = PHI %6, %bb.0, %3, %bb.5
+ ; CHECK: %0:gr32 = PHI %6, %bb.0, %3, %bb.5
%7 = MOV32ri 1
TEST32rr %4, %4, implicit-def %eflags
- JE_1 %bb.3.bb4, implicit %eflags
- JMP_1 %bb.2.bb3
-
+ JE_1 %bb.3, implicit %eflags
+ JMP_1 %bb.2
+
bb.2.bb3:
- successors: %bb.3.bb4(0x80000000)
-
+ successors: %bb.3(0x80000000)
+
%8 = MOV32ri 2
-
+
bb.3.bb4:
- successors: %bb.5.bb7(0x30000000), %bb.4.bb6(0x50000000)
-
- %1 = PHI %7, %bb.1.bb1, %8, %bb.2.bb3
+ successors: %bb.5(0x30000000), %bb.4(0x50000000)
+
+ %1 = PHI %7, %bb.1, %8, %bb.2
TEST32rr %1, %1, implicit-def %eflags
- JE_1 %bb.5.bb7, implicit %eflags
- JMP_1 %bb.4.bb6
-
+ JE_1 %bb.5, implicit %eflags
+ JMP_1 %bb.4
+
bb.4.bb6:
- successors: %bb.5.bb7(0x80000000)
-
+ successors: %bb.5(0x80000000)
+
%10 = MOV32ri 2
-
+
bb.5.bb7:
- successors: %bb.1.bb1(0x7c000000), %bb.6.bb8(0x04000000)
-
- %2 = PHI %7, %bb.3.bb4, %10, %bb.4.bb6
+ successors: %bb.1(0x7c000000), %bb.6(0x04000000)
+
+ %2 = PHI %7, %bb.3, %10, %bb.4
%11 = ADD32rr %1, %0, implicit-def dead %eflags
- ; CHECK: %11 = ADD32rr
+ ; CHECK: %11:gr32 = ADD32rr
; CHECK-SAME: %1,
; CHECK-SAME: %0,
- MOV32mr %5, 1, _, 0, _, %0 :: (store 4 into %ir.p)
+ MOV32mr %5, 1, %noreg, 0, %noreg, %0 :: (store 4 into %ir.p)
%3 = ADD32rr %2, killed %11, implicit-def dead %eflags
- ; CHECK: %3 = ADD32rr
+ ; CHECK: %3:gr32 = ADD32rr
; CHECK-SAME: %2,
; CHECK-SAME: %11,
%12 = SUB32ri8 %3, 10, implicit-def %eflags
- JL_1 %bb.1.bb1, implicit %eflags
- JMP_1 %bb.6.bb8
-
+ JL_1 %bb.1, implicit %eflags
+ JMP_1 %bb.6
+
bb.6.bb8:
%13 = MOV32r0 implicit-def dead %eflags
%eax = COPY %13
diff --git a/test/CodeGen/X86/peephole.mir b/test/CodeGen/X86/peephole.mir
index 6391836e9ca2..28ce9f1f0e82 100644
--- a/test/CodeGen/X86/peephole.mir
+++ b/test/CodeGen/X86/peephole.mir
@@ -19,18 +19,18 @@ registers:
body: |
bb.0:
- ; CHECK: %1 = VMOVDI2SSrr %0
- ; CHECK: %7 = COPY %0
+ ; CHECK: %1:fr32 = VMOVDI2SSrr %0
+ ; CHECK: %7:gr32 = COPY %0
; CHECK: NOOP implicit %7
%0 = MOV32ri 42
%1 = VMOVDI2SSrr %0
%2 = MOVSS2DIrr %1
NOOP implicit %2
- ; CHECK: %4 = VMOVDI2SSrr %3
+ ; CHECK: %4:fr32 = VMOVDI2SSrr %3
; CHECK-NOT: COPY
- ; CHECK: %5 = MOVSS2DIrr %4
- ; CHECK: %6 = SUBREG_TO_REG %5, 0
+ ; CHECK: %5:gr32 = MOVSS2DIrr %4
+ ; CHECK: %6:gr64 = SUBREG_TO_REG %5, 0
; CHECK: NOOP implicit %6
%3 = MOV32ri 42
%4 = VMOVDI2SSrr %3
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 08015258867b..64f89354136b 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -4,12 +4,12 @@
define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -20,12 +20,12 @@ define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) {
define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phaddw2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddw2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14>
@@ -36,12 +36,12 @@ define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) {
define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -52,12 +52,12 @@ define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phaddd2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6>
@@ -68,12 +68,12 @@ define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @phaddd3(<4 x i32> %x) {
; SSSE3-LABEL: phaddd3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
@@ -84,12 +84,12 @@ define <4 x i32> @phaddd3(<4 x i32> %x) {
define <4 x i32> @phaddd4(<4 x i32> %x) {
; SSSE3-LABEL: phaddd4:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -100,12 +100,12 @@ define <4 x i32> @phaddd4(<4 x i32> %x) {
define <4 x i32> @phaddd5(<4 x i32> %x) {
; SSSE3-LABEL: phaddd5:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef>
@@ -116,12 +116,12 @@ define <4 x i32> @phaddd5(<4 x i32> %x) {
define <4 x i32> @phaddd6(<4 x i32> %x) {
; SSSE3-LABEL: phaddd6:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -132,12 +132,12 @@ define <4 x i32> @phaddd6(<4 x i32> %x) {
define <4 x i32> @phaddd7(<4 x i32> %x) {
; SSSE3-LABEL: phaddd7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phaddd7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
@@ -148,12 +148,12 @@ define <4 x i32> @phaddd7(<4 x i32> %x) {
define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -164,12 +164,12 @@ define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) {
define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -180,12 +180,12 @@ define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @phsubd2(<4 x i32> %x) {
; SSSE3-LABEL: phsubd2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
@@ -196,12 +196,12 @@ define <4 x i32> @phsubd2(<4 x i32> %x) {
define <4 x i32> @phsubd3(<4 x i32> %x) {
; SSSE3-LABEL: phsubd3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -212,12 +212,12 @@ define <4 x i32> @phsubd3(<4 x i32> %x) {
define <4 x i32> @phsubd4(<4 x i32> %x) {
; SSSE3-LABEL: phsubd4:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -228,7 +228,7 @@ define <4 x i32> @phsubd4(<4 x i32> %x) {
define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-LABEL: phsubw1_reverse:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm3, %xmm4
@@ -244,7 +244,7 @@ define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubw1_reverse:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
@@ -263,7 +263,7 @@ define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-LABEL: phsubd1_reverse:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -272,7 +272,7 @@ define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsubd1_reverse:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0
diff --git a/test/CodeGen/X86/phi-bit-propagation.ll b/test/CodeGen/X86/phi-bit-propagation.ll
index 37f3f096556f..5d382344c211 100644
--- a/test/CodeGen/X86/phi-bit-propagation.ll
+++ b/test/CodeGen/X86/phi-bit-propagation.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
%"class.std::bitset" = type { [8 x i8] }
diff --git a/test/CodeGen/X86/phi-immediate-factoring.ll b/test/CodeGen/X86/phi-immediate-factoring.ll
index 8405ca436fde..e90f66df871c 100644
--- a/test/CodeGen/X86/phi-immediate-factoring.ll
+++ b/test/CodeGen/X86/phi-immediate-factoring.ll
@@ -1,7 +1,7 @@
; REQUIRES: asserts
-; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
-; RUN: llc < %s -disable-preheader-prot=true -march=x86 -stats -cgp-freq-ratio-to-skip-merge=10 2>&1 | grep "Number of blocks eliminated" | grep 6
-; RUN: llc < %s -disable-preheader-prot=false -march=x86 -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
+; RUN: llc < %s -disable-preheader-prot=true -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
+; RUN: llc < %s -disable-preheader-prot=true -stats -cgp-freq-ratio-to-skip-merge=10 2>&1 | grep "Number of blocks eliminated" | grep 6
+; RUN: llc < %s -disable-preheader-prot=false -stats 2>&1 | grep "Number of blocks eliminated" | grep 3
; PR1296
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/X86/phys-reg-local-regalloc.ll b/test/CodeGen/X86/phys-reg-local-regalloc.ll
index 8b370d93afdb..a5453b9e1f8c 100644
--- a/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ b/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck %s
-; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -no-x86-call-frame-opt | FileCheck %s
-; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc -O0 < %s -stack-symbol-ordering=0 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -no-x86-call-frame-opt | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 -no-x86-call-frame-opt | FileCheck -check-prefix=ATOM %s
; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
@.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-2.ll b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
index 13605b789bfc..9c329018a136 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-2.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc -no-phi-elim-live-out-early-exit -terminal-rule < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+; RUN: llc -no-phi-elim-live-out-early-exit -terminal-rule < %s -mtriple=i686-- | FileCheck %s
; PR2659
define i32 @binomial(i32 %n, i32 %k) nounwind {
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 74e3d1291c05..507009683214 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://5571034
-; This requires physreg joining, %vreg13 is live everywhere:
-; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
-; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
-; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
+; This requires physreg joining, %13 is live everywhere:
+; 304L %cl = COPY %13:sub_8bit; GR32_ABCD:%13
+; 320L %15 = COPY %19; GR32:%15 GR32_NOSP:%19
+; 336L %15 = SAR32rCL %15, implicit dead %eflags, implicit killed %cl; GR32:%15
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK-LABEL: foo:
diff --git a/test/CodeGen/X86/pku.ll b/test/CodeGen/X86/pku.ll
index 79b8c474ade0..10875a589a6e 100644
--- a/test/CodeGen/X86/pku.ll
+++ b/test/CodeGen/X86/pku.ll
@@ -4,7 +4,7 @@ declare void @llvm.x86.wrpkru(i32)
define void @test_x86_wrpkru(i32 %src) {
; CHECK-LABEL: test_x86_wrpkru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: movl %edi, %eax
@@ -16,7 +16,7 @@ define void @test_x86_wrpkru(i32 %src) {
define i32 @test_x86_rdpkru() {
; CHECK-LABEL: test_x86_rdpkru:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: rdpkru
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
index 6c76949fb78b..795bf27af921 100644
--- a/test/CodeGen/X86/pmovext.ll
+++ b/test/CodeGen/X86/pmovext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 | FileCheck %s
; rdar://11897677
diff --git a/test/CodeGen/X86/pmovsx-inreg.ll b/test/CodeGen/X86/pmovsx-inreg.ll
index 81e556fee356..f20065bd506b 100644
--- a/test/CodeGen/X86/pmovsx-inreg.ll
+++ b/test/CodeGen/X86/pmovsx-inreg.ll
@@ -9,7 +9,7 @@
define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind {
; SSE41-LABEL: test1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: movups %xmm1, (%rax)
@@ -17,7 +17,7 @@ define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovups %xmm1, (%rax)
@@ -25,7 +25,7 @@ define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind {
; AVX-NEXT: retq
;
; X32-AVX2-LABEL: test1:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxbq (%ecx), %xmm0
@@ -42,7 +42,7 @@ define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind {
define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind {
; SSE41-LABEL: test2:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm2, %xmm2
@@ -52,33 +52,33 @@ define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; X32-AVX2-LABEL: test2:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxbq (%ecx), %ymm0
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovups %ymm1, (%eax)
; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax)
; X32-AVX2-NEXT: vzeroupper
@@ -92,7 +92,7 @@ define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind {
define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind {
; SSE41-LABEL: test3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: movups %xmm1, (%rax)
@@ -100,7 +100,7 @@ define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovups %xmm1, (%rax)
@@ -108,7 +108,7 @@ define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind {
; AVX-NEXT: retq
;
; X32-AVX2-LABEL: test3:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxbd (%ecx), %xmm0
@@ -125,7 +125,7 @@ define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind {
define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind {
; SSE41-LABEL: test4:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm2, %xmm2
@@ -135,33 +135,33 @@ define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test4:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; X32-AVX2-LABEL: test4:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxbd (%ecx), %ymm0
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovups %ymm1, (%eax)
; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax)
; X32-AVX2-NEXT: vzeroupper
@@ -175,7 +175,7 @@ define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind {
define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind {
; SSE41-LABEL: test5:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: movups %xmm1, (%rax)
@@ -183,7 +183,7 @@ define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovups %xmm1, (%rax)
@@ -191,7 +191,7 @@ define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind {
; AVX-NEXT: retq
;
; X32-AVX2-LABEL: test5:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxbw (%ecx), %xmm0
@@ -208,7 +208,7 @@ define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind {
define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
; SSE41-LABEL: test6:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm2, %xmm2
@@ -218,31 +218,31 @@ define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test6:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; X32-AVX2-LABEL: test6:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxbw (%ecx), %ymm0
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovups %ymm1, (%eax)
; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax)
; X32-AVX2-NEXT: vzeroupper
@@ -256,7 +256,7 @@ define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
; SSE41-LABEL: test7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: movups %xmm1, (%rax)
@@ -264,7 +264,7 @@ define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovups %xmm1, (%rax)
@@ -272,7 +272,7 @@ define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
; AVX-NEXT: retq
;
; X32-AVX2-LABEL: test7:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxwq (%ecx), %xmm0
@@ -289,7 +289,7 @@ define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind {
; SSE41-LABEL: test8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm2, %xmm2
@@ -299,33 +299,33 @@ define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; X32-AVX2-LABEL: test8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxwq (%ecx), %ymm0
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovups %ymm1, (%eax)
; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax)
; X32-AVX2-NEXT: vzeroupper
@@ -339,7 +339,7 @@ define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind {
define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind {
; SSE41-LABEL: test9:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: movups %xmm1, (%rax)
@@ -347,7 +347,7 @@ define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test9:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovups %xmm1, (%rax)
@@ -355,7 +355,7 @@ define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind {
; AVX-NEXT: retq
;
; X32-AVX2-LABEL: test9:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxwd (%ecx), %xmm0
@@ -372,7 +372,7 @@ define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind {
define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind {
; SSE41-LABEL: test10:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm2, %xmm2
@@ -382,31 +382,31 @@ define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test10:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; X32-AVX2-LABEL: test10:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxwd (%ecx), %ymm0
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovups %ymm1, (%eax)
; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax)
; X32-AVX2-NEXT: vzeroupper
@@ -420,7 +420,7 @@ define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind {
define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind {
; SSE41-LABEL: test11:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: movups %xmm1, (%rax)
@@ -428,7 +428,7 @@ define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test11:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovups %xmm1, (%rax)
@@ -436,7 +436,7 @@ define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind {
; AVX-NEXT: retq
;
; X32-AVX2-LABEL: test11:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxdq (%ecx), %xmm0
@@ -453,7 +453,7 @@ define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind {
define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind {
; SSE41-LABEL: test12:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1
; SSE41-NEXT: xorps %xmm2, %xmm2
@@ -463,31 +463,31 @@ define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %ymm1, (%rax)
; AVX1-NEXT: vmovups %ymm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; X32-AVX2-LABEL: test12:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX2-NEXT: vpmovsxdq (%ecx), %ymm0
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovups %ymm1, (%eax)
; X32-AVX2-NEXT: vmovdqu %ymm0, (%eax)
; X32-AVX2-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index 76d750855cd4..f7d236ef8054 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -7,7 +7,7 @@
define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2-LABEL: mul_v16i8c:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm1
@@ -23,7 +23,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8c:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm2, %xmm1
@@ -38,7 +38,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8c:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -50,7 +50,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8c:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
@@ -59,11 +59,11 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
@@ -73,12 +73,12 @@ entry:
define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
; SSE-LABEL: mul_v8i16c:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -88,7 +88,7 @@ entry:
define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
; SSE2-LABEL: mul_v4i32c:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
@@ -99,12 +99,12 @@ define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32c:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -115,7 +115,7 @@ entry:
define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE-LABEL: mul_v2i64c:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -126,7 +126,7 @@ define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
@@ -141,7 +141,7 @@ entry:
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-LABEL: mul_v16i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
@@ -161,7 +161,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pmullw %xmm3, %xmm2
@@ -178,7 +178,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -191,7 +191,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
@@ -201,12 +201,12 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
@@ -216,12 +216,12 @@ entry:
define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
; SSE-LABEL: mul_v8i16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -231,7 +231,7 @@ entry:
define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -242,12 +242,12 @@ define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -257,7 +257,7 @@ entry:
define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -271,7 +271,7 @@ define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
@@ -290,7 +290,7 @@ declare void @foo()
define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32spill:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
@@ -308,7 +308,7 @@ define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32spill:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
@@ -319,7 +319,7 @@ define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32spill:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -337,7 +337,7 @@ entry:
define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64spill:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
@@ -358,7 +358,7 @@ define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64spill:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
@@ -384,7 +384,7 @@ entry:
define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2-LABEL: mul_v32i8c:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
@@ -410,7 +410,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8c:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm4, %xmm2
@@ -434,7 +434,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
@@ -454,7 +454,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8c:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
@@ -469,7 +469,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8c:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -481,14 +481,14 @@ entry:
define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
; SSE-LABEL: mul_v16i16c:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
entry:
@@ -498,7 +498,7 @@ entry:
define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
; SSE2-LABEL: mul_v8i32c:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
@@ -515,14 +515,14 @@ define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32c:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117]
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
@@ -533,7 +533,7 @@ entry:
define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
; SSE-LABEL: mul_v4i64c:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
@@ -550,7 +550,7 @@ define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117]
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
@@ -565,7 +565,7 @@ entry:
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; SSE2-LABEL: mul_v32i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
@@ -600,7 +600,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
; SSE41-NEXT: pmullw %xmm5, %xmm4
@@ -629,7 +629,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
@@ -651,7 +651,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
@@ -668,7 +668,7 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
@@ -681,13 +681,13 @@ entry:
define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind {
; SSE-LABEL: mul_v16i16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
@@ -697,7 +697,7 @@ entry:
define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
; SSE2-LABEL: mul_v8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -715,13 +715,13 @@ define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
@@ -731,7 +731,7 @@ entry:
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-LABEL: mul_v4i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
@@ -755,7 +755,7 @@ define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
@@ -772,7 +772,7 @@ entry:
define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2-LABEL: mul_v64i8c:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
@@ -818,7 +818,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v64i8c:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
@@ -860,7 +860,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8c:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
@@ -894,7 +894,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v64i8c:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
@@ -919,9 +919,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8c:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
-; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
@@ -937,7 +937,7 @@ entry:
define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; SSE2-LABEL: mul_v64i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm4, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm8
@@ -1002,7 +1002,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v64i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw %xmm4, %xmm9
@@ -1055,7 +1055,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
@@ -1094,7 +1094,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v64i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4
@@ -1124,7 +1124,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
@@ -1145,7 +1145,7 @@ entry:
; PR30845
define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -1160,7 +1160,7 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
@@ -1173,7 +1173,7 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_upper:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
@@ -1192,7 +1192,7 @@ entry:
define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper_left:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
@@ -1213,7 +1213,7 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper_left:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
@@ -1233,7 +1233,7 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_upper_left:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
@@ -1254,7 +1254,7 @@ entry:
define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_lower:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
@@ -1270,7 +1270,7 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_lower:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -1284,7 +1284,7 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_lower:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
@@ -1304,7 +1304,7 @@ entry:
define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_zero_upper:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
@@ -1329,7 +1329,7 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_zero_upper:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
@@ -1351,7 +1351,7 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_zero_upper:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1361,17 +1361,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_zero_upper:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: retq
entry:
%val1a = zext <8 x i32> %val1 to <8 x i64>
@@ -1384,7 +1384,7 @@ entry:
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
@@ -1465,7 +1465,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -1487,7 +1487,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
@@ -1500,7 +1500,7 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1
; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index d5297b9c70ce..739e66c7bad3 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -4,7 +4,7 @@
define <8 x i32*> @SHUFF0(<4 x i32*> %ptrv) nounwind {
; CHECK-LABEL: SHUFF0:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,1,2]
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,1]
; CHECK-NEXT: movdqa %xmm2, %xmm0
@@ -16,7 +16,7 @@ entry:
define <4 x i32*> @SHUFF1(<4 x i32*> %ptrv) nounwind {
; CHECK-LABEL: SHUFF1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,3,2]
; CHECK-NEXT: retl
entry:
@@ -26,7 +26,7 @@ entry:
define <4 x i8*> @SHUFF3(<4 x i8*> %ptrv) nounwind {
; CHECK-LABEL: SHUFF3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,1,2]
; CHECK-NEXT: retl
entry:
@@ -36,7 +36,7 @@ entry:
define <4 x i8*> @LOAD0(<4 x i8*>* %p) nounwind {
; CHECK-LABEL: LOAD0:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movaps (%eax), %xmm0
; CHECK-NEXT: retl
@@ -47,7 +47,7 @@ entry:
define <4 x i8*> @LOAD1(<4 x i8*>* %p) nounwind {
; CHECK-LABEL: LOAD1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movdqa (%eax), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,0,3]
@@ -62,7 +62,7 @@ entry:
define <4 x i8*> @LOAD2(<4 x i8*>* %p) nounwind {
; CHECK-LABEL: LOAD2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subl $28, %esp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movaps (%eax), %xmm0
@@ -79,7 +79,7 @@ entry:
define <4 x i32> @INT2PTR0(<4 x i8*>* %p) nounwind {
; CHECK-LABEL: INT2PTR0:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movaps (%eax), %xmm0
; CHECK-NEXT: retl
@@ -91,7 +91,7 @@ entry:
define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind {
; CHECK-LABEL: INT2PTR1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; CHECK-NEXT: retl
@@ -103,7 +103,7 @@ entry:
define <4 x i32*> @BITCAST0(<4 x i8*>* %p) nounwind {
; CHECK-LABEL: BITCAST0:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movaps (%eax), %xmm0
; CHECK-NEXT: retl
@@ -115,7 +115,7 @@ entry:
define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
; CHECK-LABEL: BITCAST1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; CHECK-NEXT: retl
@@ -127,7 +127,7 @@ entry:
define <4 x i32> @ICMP0(<4 x i8*>* %p0, <4 x i8*>* %p1) nounwind {
; CHECK-LABEL: ICMP0:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movdqa (%ecx), %xmm0
@@ -146,7 +146,7 @@ entry:
define <4 x i32> @ICMP1(<4 x i8*>* %p0, <4 x i8*>* %p1) nounwind {
; CHECK-LABEL: ICMP1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movdqa (%ecx), %xmm0
diff --git a/test/CodeGen/X86/pop-stack-cleanup-msvc.ll b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
new file mode 100644
index 000000000000..6330d3de72f1
--- /dev/null
+++ b/test/CodeGen/X86/pop-stack-cleanup-msvc.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+
+target triple = "i686--windows-msvc"
+
+declare { i8*, i32 } @param2_ret2(i32, i32)
+declare i32 @__CxxFrameHandler3(...)
+
+
+define void @test_reserved_regs() minsize optsize personality i32 (...)* @__CxxFrameHandler3 {
+; CHECK-LABEL: test_reserved_regs:
+; CHECK: calll _param2_ret2
+; CHECK-NEXT: popl %ecx
+; CHECK-NEXT: popl %edi
+start:
+ %s = alloca i64
+ store i64 4, i64* %s
+ %0 = invoke { i8*, i32 } @param2_ret2(i32 0, i32 1)
+ to label %out unwind label %cleanup
+
+out:
+ ret void
+
+cleanup:
+ %cp = cleanuppad within none []
+ cleanupret from %cp unwind to caller
+}
diff --git a/test/CodeGen/X86/popcnt-schedule.ll b/test/CodeGen/X86/popcnt-schedule.ll
new file mode 100644
index 000000000000..b6ee5a9bfa0b
--- /dev/null
+++ b/test/CodeGen/X86/popcnt-schedule.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+popcnt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
+; GENERIC-LABEL: test_ctpop_i16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: popcntw (%rsi), %cx # sched: [9:1.00]
+; GENERIC-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_ctpop_i16:
+; SLM: # %bb.0:
+; SLM-NEXT: popcntw (%rsi), %cx # sched: [6:1.00]
+; SLM-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: def %ax killed %ax killed %eax
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ctpop_i16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: popcntw (%rsi), %cx # sched: [9:1.00]
+; SANDY-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: def %ax killed %ax killed %eax
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_ctpop_i16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: popcntw (%rsi), %cx # sched: [8:1.00]
+; HASWELL-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ctpop_i16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: popcntw (%rsi), %cx # sched: [8:1.00]
+; BROADWELL-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ctpop_i16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: popcntw (%rsi), %cx # sched: [8:1.00]
+; SKYLAKE-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_ctpop_i16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: popcntw (%rsi), %cx # sched: [8:1.00]
+; BTVER2-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ctpop_i16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00]
+; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00]
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i16, i16 *%a1
+ %2 = tail call i16 @llvm.ctpop.i16( i16 %1 )
+ %3 = tail call i16 @llvm.ctpop.i16( i16 %a0 )
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+declare i16 @llvm.ctpop.i16(i16)
+
+define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) {
+; GENERIC-LABEL: test_ctpop_i32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: popcntl (%rsi), %ecx # sched: [9:1.00]
+; GENERIC-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_ctpop_i32:
+; SLM: # %bb.0:
+; SLM-NEXT: popcntl (%rsi), %ecx # sched: [6:1.00]
+; SLM-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ctpop_i32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: popcntl (%rsi), %ecx # sched: [9:1.00]
+; SANDY-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_ctpop_i32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00]
+; HASWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ctpop_i32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ctpop_i32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_ctpop_i32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: popcntl (%rsi), %ecx # sched: [8:1.00]
+; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ctpop_i32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00]
+; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00]
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i32, i32 *%a1
+ %2 = tail call i32 @llvm.ctpop.i32( i32 %1 )
+ %3 = tail call i32 @llvm.ctpop.i32( i32 %a0 )
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+declare i32 @llvm.ctpop.i32(i32)
+
+define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) {
+; GENERIC-LABEL: test_ctpop_i64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: popcntq (%rsi), %rcx # sched: [9:1.00]
+; GENERIC-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_ctpop_i64:
+; SLM: # %bb.0:
+; SLM-NEXT: popcntq (%rsi), %rcx # sched: [6:1.00]
+; SLM-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ctpop_i64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: popcntq (%rsi), %rcx # sched: [9:1.00]
+; SANDY-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_ctpop_i64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00]
+; HASWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ctpop_i64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ctpop_i64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_ctpop_i64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: popcntq (%rsi), %rcx # sched: [8:1.00]
+; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ctpop_i64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00]
+; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load i64, i64 *%a1
+ %2 = tail call i64 @llvm.ctpop.i64( i64 %1 )
+ %3 = tail call i64 @llvm.ctpop.i64( i64 %a0 )
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+declare i64 @llvm.ctpop.i64(i64)
diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll
index b5d4ebba0538..d7622c8d0cab 100644
--- a/test/CodeGen/X86/popcnt.ll
+++ b/test/CodeGen/X86/popcnt.ll
@@ -6,7 +6,7 @@
define i8 @cnt8(i8 %x) nounwind readnone {
; X32-LABEL: cnt8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: shrb %al
@@ -24,7 +24,7 @@ define i8 @cnt8(i8 %x) nounwind readnone {
; X32-NEXT: retl
;
; X64-LABEL: cnt8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrb %al
; X64-NEXT: andb $85, %al
@@ -41,17 +41,17 @@ define i8 @cnt8(i8 %x) nounwind readnone {
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt8:
-; X32-POPCNT: # BB#0:
+; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X32-POPCNT-NEXT: popcntw %ax, %ax
-; X32-POPCNT-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; X32-POPCNT-NEXT: popcntl %eax, %eax
+; X32-POPCNT-NEXT: # kill: def %al killed %al killed %eax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt8:
-; X64-POPCNT: # BB#0:
+; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: movzbl %dil, %eax
-; X64-POPCNT-NEXT: popcntw %ax, %ax
-; X64-POPCNT-NEXT: # kill: %AL<def> %AL<kill> %AX<kill>
+; X64-POPCNT-NEXT: popcntl %eax, %eax
+; X64-POPCNT-NEXT: # kill: def %al killed %al killed %eax
; X64-POPCNT-NEXT: retq
%cnt = tail call i8 @llvm.ctpop.i8(i8 %x)
ret i8 %cnt
@@ -59,7 +59,7 @@ define i8 @cnt8(i8 %x) nounwind readnone {
define i16 @cnt16(i16 %x) nounwind readnone {
; X32-LABEL: cnt16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl %ecx
@@ -79,11 +79,11 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X32-NEXT: shll $8, %eax
; X32-NEXT: addl %ecx, %eax
; X32-NEXT: movzbl %ah, %eax
-; X32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: # kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: cnt16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: andl $21845, %eax # imm = 0x5555
@@ -102,16 +102,16 @@ define i16 @cnt16(i16 %x) nounwind readnone {
; X64-NEXT: shll $8, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: movzbl %ch, %eax # NOREX
-; X64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt16:
-; X32-POPCNT: # BB#0:
+; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntw {{[0-9]+}}(%esp), %ax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt16:
-; X64-POPCNT: # BB#0:
+; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntw %di, %ax
; X64-POPCNT-NEXT: retq
%cnt = tail call i16 @llvm.ctpop.i16(i16 %x)
@@ -120,7 +120,7 @@ define i16 @cnt16(i16 %x) nounwind readnone {
define i32 @cnt32(i32 %x) nounwind readnone {
; X32-LABEL: cnt32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: shrl %ecx
@@ -140,7 +140,7 @@ define i32 @cnt32(i32 %x) nounwind readnone {
; X32-NEXT: retl
;
; X64-LABEL: cnt32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555
@@ -159,12 +159,12 @@ define i32 @cnt32(i32 %x) nounwind readnone {
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt32:
-; X32-POPCNT: # BB#0:
+; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt32:
-; X64-POPCNT: # BB#0:
+; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntl %edi, %eax
; X64-POPCNT-NEXT: retq
%cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -173,7 +173,7 @@ define i32 @cnt32(i32 %x) nounwind readnone {
define i64 @cnt64(i64 %x) nounwind readnone {
; X32-LABEL: cnt64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %edx
@@ -211,7 +211,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X32-NEXT: retl
;
; X64-LABEL: cnt64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shrq %rax
; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
@@ -234,7 +234,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X64-NEXT: retq
;
; X32-POPCNT-LABEL: cnt64:
-; X32-POPCNT: # BB#0:
+; X32-POPCNT: # %bb.0:
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx
; X32-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax
; X32-POPCNT-NEXT: addl %ecx, %eax
@@ -242,7 +242,7 @@ define i64 @cnt64(i64 %x) nounwind readnone {
; X32-POPCNT-NEXT: retl
;
; X64-POPCNT-LABEL: cnt64:
-; X64-POPCNT: # BB#0:
+; X64-POPCNT: # %bb.0:
; X64-POPCNT-NEXT: popcntq %rdi, %rax
; X64-POPCNT-NEXT: retq
%cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
diff --git a/test/CodeGen/X86/post-ra-sched-with-debug.mir b/test/CodeGen/X86/post-ra-sched-with-debug.mir
index ba5c85922c7a..41321598204f 100644
--- a/test/CodeGen/X86/post-ra-sched-with-debug.mir
+++ b/test/CodeGen/X86/post-ra-sched-with-debug.mir
@@ -227,6 +227,11 @@
!47 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed)
!48 = !DILocation(line: 10, column: 8, scope: !40, inlinedAt: !45)
+# CHECK: ![[I_VAR:[0-9]+]] = !DILocalVariable(name: "i", {{.*}}line: 9, {{.*}})
+# CHECK: ![[I_LOC:[0-9]+]] = !DILocation(line: 9, column: 37, {{.*}})
+# CHECK: ![[J_VAR:[0-9]+]] = !DILocalVariable(name: "j", {{.*}}line: 10, {{.*}})
+# CHECK: ![[J_LOC:[0-9]+]] = !DILocation(line: 10, column: 8, {{.*}})
+
...
---
name: _ZN1sC2Ei
@@ -245,9 +250,9 @@ body: |
successors: %bb.3, %bb.2
liveins: %esi, %rdi, %r14, %rbx, %rbp
- ; CHECK: [[REGISTER:%r[a-z0-9]+]] = LEA64r {{%r[a-z0-9]+}}, 1, _, -20, _
- ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use _, !46, !17, debug-location !48
- ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use _, !39, !17, debug-location !44
+ ; CHECK: [[REGISTER:%r[a-z0-9]+]] = LEA64r {{%r[a-z0-9]+}}, 1, %noreg, -20, %noreg
+ ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use %noreg, ![[J_VAR]], !DIExpression(), debug-location ![[J_LOC]]
+ ; CHECK-NEXT: DBG_VALUE debug-use [[REGISTER]], debug-use %noreg, ![[I_VAR]], !DIExpression(), debug-location ![[I_LOC]]
frame-setup PUSH64r killed %rbp, implicit-def %rsp, implicit %rsp
CFI_INSTRUCTION def_cfa_offset 16
@@ -262,32 +267,32 @@ body: |
%r14d = MOV32rr %esi
%rbx = MOV64rr %rdi
CALL64pcrel32 @_ZN1lC2Ei, csr_64, implicit %rsp, implicit %rdi, implicit %esi, implicit-def %rsp
- %rdi = LEA64r %rbx, 1, _, 8, _
- DBG_VALUE debug-use %rdi, debug-use _, !20, !17, debug-location !27
- DBG_VALUE debug-use %rdi, debug-use _, !10, !17, debug-location !18
- %rax = MOV64rm %rbx, 1, _, 16, _ :: (load 8)
- MOV64mr %rbx, 1, _, 8, _, killed %rax :: (store 8)
- MOV64mr %rbx, 1, _, 24, _, %rdi :: (store 8)
+ %rdi = LEA64r %rbx, 1, %noreg, 8, %noreg
+ DBG_VALUE debug-use %rdi, debug-use %noreg, !20, !17, debug-location !27
+ DBG_VALUE debug-use %rdi, debug-use %noreg, !10, !17, debug-location !18
+ %rax = MOV64rm %rbx, 1, %noreg, 16, %noreg :: (load 8)
+ MOV64mr %rbx, 1, %noreg, 8, %noreg, killed %rax :: (store 8)
+ MOV64mr %rbx, 1, %noreg, 24, %noreg, %rdi :: (store 8)
%eax = MOV32ri -1
%cl = MOV8rr %r14b, implicit killed %r14d
%eax = SHL32rCL killed %eax, implicit-def dead %eflags, implicit %cl
- MOV32mr %rbx, 1, _, 32, _, %eax :: (store 4, align 8)
- MOV32mi %rbp, 1, _, -20, _, 0 :: (store 4)
- %rcx = MOV64rm %rbx, 1, _, 8, _ :: (load 8)
- MOV64mr %rip, 1, _, @n, _, %rcx :: (store 8)
+ MOV32mr %rbx, 1, %noreg, 32, %noreg, %eax :: (store 4, align 8)
+ MOV32mi %rbp, 1, %noreg, -20, %noreg, 0 :: (store 4)
+ %rcx = MOV64rm %rbx, 1, %noreg, 8, %noreg :: (load 8)
+ MOV64mr %rip, 1, %noreg, @n, %noreg, %rcx :: (store 8)
%edx = XOR32rr undef %edx, undef %edx, implicit-def dead %eflags, implicit-def %rdx
TEST64rr %rcx, %rcx, implicit-def %eflags
%esi = MOV32ri @o, implicit-def %rsi
%rsi = CMOVNE64rr killed %rsi, %rdx, implicit killed %eflags
%rsi = OR64rr killed %rsi, killed %rcx, implicit-def %eflags
- %rcx = LEA64r %rbp, 1, _, -20, _
- DBG_VALUE debug-use %rcx, debug-use _, !46, !17, debug-location !48
- DBG_VALUE debug-use %rcx, debug-use _, !39, !17, debug-location !44
+ %rcx = LEA64r %rbp, 1, %noreg, -20, %noreg
+ DBG_VALUE debug-use %rcx, debug-use %noreg, !46, !17, debug-location !48
+ DBG_VALUE debug-use %rcx, debug-use %noreg, !39, !17, debug-location !44
DBG_VALUE %rbp, -20, !29, !17, debug-location !36
%rcx = CMOVNE64rr killed %rcx, killed %rdx, implicit killed %eflags
%rcx = OR64rr killed %rcx, killed %rsi, implicit-def dead %eflags
- %rdx = MOVSX64rm32 %rbx, 1, _, 0, _ :: (load 4, align 8)
- TEST32rm killed %eax, killed %rcx, 4, killed %rdx, 0, _, implicit-def %eflags :: (load 4)
+ %rdx = MOVSX64rm32 %rbx, 1, %noreg, 0, %noreg :: (load 4, align 8)
+ TEST32mr killed %rcx, 4, killed %rdx, 0, %noreg, killed %eax, implicit-def %eflags :: (load 4)
JNE_1 %bb.2, implicit %eflags
JMP_1 %bb.3
@@ -295,7 +300,7 @@ body: |
successors: %bb.2
liveins: %rbx, %rbp
- %rdi = MOV64rm %rbx, 1, _, 24, _ :: (load 8)
+ %rdi = MOV64rm %rbx, 1, %noreg, 24, %noreg :: (load 8)
bb.2:
successors: %bb.1, %bb.3
@@ -303,11 +308,11 @@ body: |
CALL64pcrel32 @_ZN1p2aaEv, csr_64, implicit %rsp, implicit %rdi, implicit-def %rsp, implicit-def %eax
%eax = KILL %eax, implicit-def %rax
- %ecx = LEA64_32r %rax, 1, _, -1, _, implicit-def %rcx
+ %ecx = LEA64_32r %rax, 1, %noreg, -1, %noreg, implicit-def %rcx
%ecx = SHR32ri %ecx, 31, implicit-def dead %eflags, implicit killed %rcx, implicit-def %rcx
- %eax = LEA64_32r killed %rax, 1, killed %rcx, -1, _
+ %eax = LEA64_32r killed %rax, 1, killed %rcx, -1, %noreg
%eax = SAR32r1 killed %eax, implicit-def dead %eflags
- CMP32mr %rbx, 1, _, 0, _, killed %eax, implicit-def %eflags :: (load 4, align 8), (load 4, align 8)
+ CMP32mr %rbx, 1, %noreg, 0, %noreg, killed %eax, implicit-def %eflags :: (load 4, align 8), (load 4, align 8)
JG_1 %bb.1, implicit killed %eflags
bb.3:
diff --git a/test/CodeGen/X86/post-ra-sched.ll b/test/CodeGen/X86/post-ra-sched.ll
index c31072a8a5eb..f6de77a69883 100644
--- a/test/CodeGen/X86/post-ra-sched.ll
+++ b/test/CodeGen/X86/post-ra-sched.ll
@@ -16,7 +16,7 @@
define void @addindirect() {
; CHECK-LABEL: addindirect:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl idxb, %ecx
; CHECK-NEXT: movl idxa, %eax
; CHECK-NEXT: movl ptrs(,%ecx,4), %ecx
diff --git a/test/CodeGen/X86/postalloc-coalescing.ll b/test/CodeGen/X86/postalloc-coalescing.ll
index fe6f521f4d32..83c435e5e2d5 100644
--- a/test/CodeGen/X86/postalloc-coalescing.ll
+++ b/test/CodeGen/X86/postalloc-coalescing.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 3
+; RUN: llc < %s -mtriple=i686-- | grep mov | count 3
define fastcc i32 @_Z18yy_get_next_bufferv() nounwind {
entry:
diff --git a/test/CodeGen/X86/powi.ll b/test/CodeGen/X86/powi.ll
index fb7f570d6251..246e853eed66 100644
--- a/test/CodeGen/X86/powi.ll
+++ b/test/CodeGen/X86/powi.ll
@@ -3,7 +3,7 @@
define double @pow_wrapper(double %a) nounwind readonly ssp noredzone {
; CHECK-LABEL: pow_wrapper:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movapd %xmm0, %xmm1
; CHECK-NEXT: mulsd %xmm1, %xmm1
; CHECK-NEXT: mulsd %xmm1, %xmm0
@@ -19,7 +19,7 @@ define double @pow_wrapper(double %a) nounwind readonly ssp noredzone {
define double @pow_wrapper_optsize(double %a) optsize {
; CHECK-LABEL: pow_wrapper_optsize:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $15, %edi
; CHECK-NEXT: jmp
%ret = tail call double @llvm.powi.f64(double %a, i32 15) nounwind ; <double> [#uses=1]
@@ -28,7 +28,7 @@ define double @pow_wrapper_optsize(double %a) optsize {
define double @pow_wrapper_minsize(double %a) minsize {
; CHECK-LABEL: pow_wrapper_minsize:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq $15
; CHECK: popq %rdi
; CHECK: jmp
diff --git a/test/CodeGen/X86/pr10068.ll b/test/CodeGen/X86/pr10068.ll
index 8829c5dbbf79..7a3da26b2364 100644
--- a/test/CodeGen/X86/pr10068.ll
+++ b/test/CodeGen/X86/pr10068.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
define void @foobar() {
entry:
diff --git a/test/CodeGen/X86/pr10523.ll b/test/CodeGen/X86/pr10523.ll
index 0ec22a08e440..c5013017ce89 100644
--- a/test/CodeGen/X86/pr10523.ll
+++ b/test/CodeGen/X86/pr10523.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr10524.ll b/test/CodeGen/X86/pr10524.ll
index 12bdba9fa595..5bb4aeddb899 100644
--- a/test/CodeGen/X86/pr10524.ll
+++ b/test/CodeGen/X86/pr10524.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr10525.ll b/test/CodeGen/X86/pr10525.ll
index 436d89caabe0..34e878098414 100644
--- a/test/CodeGen/X86/pr10525.ll
+++ b/test/CodeGen/X86/pr10525.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr10526.ll b/test/CodeGen/X86/pr10526.ll
index 9fa83ce17b55..822f18cdca22 100644
--- a/test/CodeGen/X86/pr10526.ll
+++ b/test/CodeGen/X86/pr10526.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index 8a154653414a..d5c0f10324fb 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -4,12 +4,12 @@
define <2 x double> @v2f2d_ext_vec(<2 x float> %v1) nounwind {
; SSE-LABEL: v2f2d_ext_vec:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v2f2d_ext_vec:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtps2pd %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -19,7 +19,7 @@ entry:
define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
; SSE-LABEL: v3f2d_ext_vec:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvtps2pd %xmm0, %xmm0
@@ -31,7 +31,7 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: v3f2d_ext_vec:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtps2pd %xmm0, %ymm0
; AVX-NEXT: retq
entry:
@@ -41,7 +41,7 @@ entry:
define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
; SSE-LABEL: v4f2d_ext_vec:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvtps2pd %xmm0, %xmm1
@@ -49,7 +49,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: v4f2d_ext_vec:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtps2pd %xmm0, %ymm0
; AVX-NEXT: retq
entry:
@@ -59,7 +59,7 @@ entry:
define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind {
; SSE-LABEL: v8f2d_ext_vec:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm5
; SSE-NEXT: cvtps2pd %xmm1, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -71,7 +71,7 @@ define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: v8f2d_ext_vec:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvtps2pd %xmm0, %ymm2
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vcvtps2pd %xmm0, %ymm1
@@ -84,14 +84,14 @@ entry:
define void @test_vector_creation() nounwind {
; SSE-LABEL: test_vector_creation:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorpd %xmm0, %xmm0
; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: movapd %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: test_vector_creation:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/pr11468.ll b/test/CodeGen/X86/pr11468.ll
index 7a2cc5b1a60d..d2bd4d8d8b05 100644
--- a/test/CodeGen/X86/pr11468.ll
+++ b/test/CodeGen/X86/pr11468.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -stackrealign -stack-alignment=32 -mattr=+avx -mtriple=x86_64-apple-darwin10 | FileCheck %s
; PR11468
define void @f(i64 %sz) uwtable {
diff --git a/test/CodeGen/X86/pr11985.ll b/test/CodeGen/X86/pr11985.ll
index aae00de112d3..94b37215f63a 100644
--- a/test/CodeGen/X86/pr11985.ll
+++ b/test/CodeGen/X86/pr11985.ll
@@ -8,7 +8,7 @@
define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable {
; PRESCOTT-LABEL: foo:
-; PRESCOTT: # BB#0: # %entry
+; PRESCOTT: # %bb.0: # %entry
; PRESCOTT-NEXT: movq .Ltmp0+14(%rip), %rax
; PRESCOTT-NEXT: movq %rax, 14(%rdi)
; PRESCOTT-NEXT: movq .Ltmp0+8(%rip), %rax
@@ -17,7 +17,7 @@ define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable {
; PRESCOTT-NEXT: movq %rax, (%rdi)
;
; NEHALEM-LABEL: foo:
-; NEHALEM: # BB#0: # %entry
+; NEHALEM: # %bb.0: # %entry
; NEHALEM-NEXT: movq .Ltmp0+14(%rip), %rax
; NEHALEM-NEXT: movq %rax, 14(%rdi)
; NEHALEM-NEXT: movups .Ltmp0(%rip), %xmm2
diff --git a/test/CodeGen/X86/pr11998.ll b/test/CodeGen/X86/pr11998.ll
index 1baf07924d39..caaf2710fba8 100644
--- a/test/CodeGen/X86/pr11998.ll
+++ b/test/CodeGen/X86/pr11998.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7-avx -march=x86-64 -mattr=+avx
+; RUN: llc < %s -mcpu=corei7-avx -mtriple=x86_64-- -mattr=+avx
define void @autogen_51367_5000(i8) {
BB:
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
index 6575d2a73d9c..56c17f1217c9 100644
--- a/test/CodeGen/X86/pr12312.ll
+++ b/test/CodeGen/X86/pr12312.ll
@@ -4,10 +4,10 @@
define i32 @veccond128(<4 x i32> %input) {
; SSE41-LABEL: veccond128:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: je .LBB0_2
-; SSE41-NEXT: # BB#1: # %if-true-block
+; SSE41-NEXT: # %bb.1: # %if-true-block
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: retq
; SSE41-NEXT: .LBB0_2: # %endif-block
@@ -15,10 +15,10 @@ define i32 @veccond128(<4 x i32> %input) {
; SSE41-NEXT: retq
;
; AVX-LABEL: veccond128:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vptest %xmm0, %xmm0
; AVX-NEXT: je .LBB0_2
-; AVX-NEXT: # BB#1: # %if-true-block
+; AVX-NEXT: # %bb.1: # %if-true-block
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: retq
; AVX-NEXT: .LBB0_2: # %endif-block
@@ -36,11 +36,11 @@ endif-block:
define i32 @veccond256(<8 x i32> %input) {
; SSE41-LABEL: veccond256:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: je .LBB1_2
-; SSE41-NEXT: # BB#1: # %if-true-block
+; SSE41-NEXT: # %bb.1: # %if-true-block
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: retq
; SSE41-NEXT: .LBB1_2: # %endif-block
@@ -48,10 +48,10 @@ define i32 @veccond256(<8 x i32> %input) {
; SSE41-NEXT: retq
;
; AVX-LABEL: veccond256:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vptest %ymm0, %ymm0
; AVX-NEXT: je .LBB1_2
-; AVX-NEXT: # BB#1: # %if-true-block
+; AVX-NEXT: # %bb.1: # %if-true-block
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -71,13 +71,13 @@ endif-block:
define i32 @veccond512(<16 x i32> %input) {
; SSE41-LABEL: veccond512:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
; SSE41-NEXT: ptest %xmm1, %xmm1
; SSE41-NEXT: je .LBB2_2
-; SSE41-NEXT: # BB#1: # %if-true-block
+; SSE41-NEXT: # %bb.1: # %if-true-block
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: retq
; SSE41-NEXT: .LBB2_2: # %endif-block
@@ -85,11 +85,11 @@ define i32 @veccond512(<16 x i32> %input) {
; SSE41-NEXT: retq
;
; AVX-LABEL: veccond512:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vptest %ymm0, %ymm0
; AVX-NEXT: je .LBB2_2
-; AVX-NEXT: # BB#1: # %if-true-block
+; AVX-NEXT: # %bb.1: # %if-true-block
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -109,14 +109,14 @@ endif-block:
define i32 @vectest128(<4 x i32> %input) {
; SSE41-LABEL: vectest128:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
; AVX-LABEL: vectest128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %xmm0, %xmm0
; AVX-NEXT: setne %al
@@ -129,7 +129,7 @@ define i32 @vectest128(<4 x i32> %input) {
define i32 @vectest256(<8 x i32> %input) {
; SSE41-LABEL: vectest256:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: xorl %eax, %eax
; SSE41-NEXT: ptest %xmm0, %xmm0
@@ -137,7 +137,7 @@ define i32 @vectest256(<8 x i32> %input) {
; SSE41-NEXT: retq
;
; AVX-LABEL: vectest256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %ymm0, %ymm0
; AVX-NEXT: setne %al
@@ -151,7 +151,7 @@ define i32 @vectest256(<8 x i32> %input) {
define i32 @vectest512(<16 x i32> %input) {
; SSE41-LABEL: vectest512:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
@@ -161,7 +161,7 @@ define i32 @vectest512(<16 x i32> %input) {
; SSE41-NEXT: retq
;
; AVX-LABEL: vectest512:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: xorl %eax, %eax
; AVX-NEXT: vptest %ymm0, %ymm0
@@ -176,14 +176,14 @@ define i32 @vectest512(<16 x i32> %input) {
define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
; SSE41-LABEL: vecsel128:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: cmovel %esi, %edi
; SSE41-NEXT: movl %edi, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: vecsel128:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vptest %xmm0, %xmm0
; AVX-NEXT: cmovel %esi, %edi
; AVX-NEXT: movl %edi, %eax
@@ -196,7 +196,7 @@ define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
; SSE41-LABEL: vecsel256:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: ptest %xmm0, %xmm0
; SSE41-NEXT: cmovel %esi, %edi
@@ -204,7 +204,7 @@ define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
; SSE41-NEXT: retq
;
; AVX-LABEL: vecsel256:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vptest %ymm0, %ymm0
; AVX-NEXT: cmovel %esi, %edi
; AVX-NEXT: movl %edi, %eax
@@ -218,7 +218,7 @@ define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
; SSE41-LABEL: vecsel512:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: por %xmm0, %xmm1
@@ -228,7 +228,7 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
; SSE41-NEXT: retq
;
; AVX-LABEL: vecsel512:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vptest %ymm0, %ymm0
; AVX-NEXT: cmovel %esi, %edi
diff --git a/test/CodeGen/X86/pr12889.ll b/test/CodeGen/X86/pr12889.ll
index 8234fcc67e08..29e0c0416e67 100644
--- a/test/CodeGen/X86/pr12889.ll
+++ b/test/CodeGen/X86/pr12889.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "i686-unknown-linux-gnu"
@c0 = common global i8 0, align 1
diff --git a/test/CodeGen/X86/pr13220.ll b/test/CodeGen/X86/pr13220.ll
index b9ac4b63ecf0..d9e915a0974b 100644
--- a/test/CodeGen/X86/pr13220.ll
+++ b/test/CodeGen/X86/pr13220.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s
+; RUN: llc -mtriple=i686-- < %s
; PR13220
define <8 x i32> @foo(<8 x i96> %x) {
diff --git a/test/CodeGen/X86/pr13577.ll b/test/CodeGen/X86/pr13577.ll
index 665df2c183bf..66bbf4531e5c 100644
--- a/test/CodeGen/X86/pr13577.ll
+++ b/test/CodeGen/X86/pr13577.ll
@@ -8,7 +8,7 @@
define x86_fp80 @foo(x86_fp80 %a) {
; CHECK-LABEL: foo:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fstpt -{{[0-9]+}}(%rsp)
; CHECK-NEXT: testb $-128, -{{[0-9]+}}(%rsp)
@@ -28,7 +28,7 @@ declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
define float @pr26070() {
; CHECK-LABEL: pr26070:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; CHECK-NEXT: orps {{.*}}(%rip), %xmm0
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
index 95c71405bc9e..ef8cd918f13c 100644
--- a/test/CodeGen/X86/pr14161.ll
+++ b/test/CodeGen/X86/pr14161.ll
@@ -4,7 +4,7 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
; CHECK-LABEL: good:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
; CHECK-NEXT: pmovzxwq %xmm0, %xmm0
@@ -23,7 +23,7 @@ entry:
define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
; CHECK-LABEL: bad:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: pminud {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
diff --git a/test/CodeGen/X86/pr14204.ll b/test/CodeGen/X86/pr14204.ll
index ab467d6ad96d..65d5a7f51b42 100644
--- a/test/CodeGen/X86/pr14204.ll
+++ b/test/CodeGen/X86/pr14204.ll
@@ -3,7 +3,7 @@
define <8 x i32> @foo(<8 x i1> %bar) nounwind readnone {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
; CHECK-NEXT: vpsrad $31, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll
index 10733a476995..5223de39a521 100644
--- a/test/CodeGen/X86/pr14314.ll
+++ b/test/CodeGen/X86/pr14314.ll
@@ -3,7 +3,7 @@
define i64 @atomicSub(i64* %a, i64 %b) nounwind {
; CHECK-LABEL: atomicSub:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
@@ -22,7 +22,7 @@ define i64 @atomicSub(i64* %a, i64 %b) nounwind {
; CHECK-NEXT: sbbl %esi, %ecx
; CHECK-NEXT: lock cmpxchg8b (%ebp)
; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # BB#2: # %atomicrmw.end
+; CHECK-NEXT: # %bb.2: # %atomicrmw.end
; CHECK-NEXT: popl %esi
; CHECK-NEXT: popl %edi
; CHECK-NEXT: popl %ebx
diff --git a/test/CodeGen/X86/pr14562.ll b/test/CodeGen/X86/pr14562.ll
index 31674546423e..1ba962c94178 100644
--- a/test/CodeGen/X86/pr14562.ll
+++ b/test/CodeGen/X86/pr14562.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
@temp1 = global i64 -77129852189294865, align 8
diff --git a/test/CodeGen/X86/pr15267.ll b/test/CodeGen/X86/pr15267.ll
index d62aaf90587d..b515fe8c4863 100644
--- a/test/CodeGen/X86/pr15267.ll
+++ b/test/CodeGen/X86/pr15267.ll
@@ -3,7 +3,7 @@
define <4 x i3> @test1(<4 x i3>* %in) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $3, %ecx
@@ -22,7 +22,7 @@ define <4 x i3> @test1(<4 x i3>* %in) nounwind {
define <4 x i1> @test2(<4 x i1>* %in) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl %ecx
@@ -41,7 +41,7 @@ define <4 x i1> @test2(<4 x i1>* %in) nounwind {
define <4 x i64> @test3(<4 x i1>* %in) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl (%rdi), %eax
; CHECK-NEXT: movq %rax, %rcx
; CHECK-NEXT: shlq $62, %rcx
@@ -70,7 +70,7 @@ define <4 x i64> @test3(<4 x i1>* %in) nounwind {
define <16 x i4> @test4(<16 x i4>* %in) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $4, %ecx
diff --git a/test/CodeGen/X86/pr15309.ll b/test/CodeGen/X86/pr15309.ll
index 0301b58def1c..8717353377f3 100644
--- a/test/CodeGen/X86/pr15309.ll
+++ b/test/CodeGen/X86/pr15309.ll
@@ -3,7 +3,7 @@
define void @test_convert_float2_ulong2(<2 x i64>* nocapture %src, <2 x float>* nocapture %dest) nounwind {
; CHECK-LABEL: test_convert_float2_ulong2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: subl $20, %esp
diff --git a/test/CodeGen/X86/pr15705.ll b/test/CodeGen/X86/pr15705.ll
index e728bc8d34c9..d70895bac98e 100644
--- a/test/CodeGen/X86/pr15705.ll
+++ b/test/CodeGen/X86/pr15705.ll
@@ -4,16 +4,16 @@
define i32 @PR15705(i32 %x, i32 %a, i32 %b, i32 %c) #0 {
; X86-LABEL: PR15705:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: cmpl %ecx, %edx
; X86-NEXT: je .LBB0_4
-; X86-NEXT: # BB#1: # %if.end
+; X86-NEXT: # %bb.1: # %if.end
; X86-NEXT: cmpl %eax, %edx
; X86-NEXT: jne .LBB0_3
-; X86-NEXT: # BB#2:
+; X86-NEXT: # %bb.2:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: .LBB0_3: # %if.end
; X86-NEXT: movl %ecx, %eax
@@ -21,10 +21,10 @@ define i32 @PR15705(i32 %x, i32 %a, i32 %b, i32 %c) #0 {
; X86-NEXT: retl
;
; X64-LABEL: PR15705:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: cmpl %esi, %edi
; X64-NEXT: je .LBB0_2
-; X64-NEXT: # BB#1: # %if.end
+; X64-NEXT: # %bb.1: # %if.end
; X64-NEXT: cmpl %edx, %edi
; X64-NEXT: cmovel %ecx, %esi
; X64-NEXT: movl %esi, %edx
diff --git a/test/CodeGen/X86/pr15981.ll b/test/CodeGen/X86/pr15981.ll
index 6b246ef7936e..90e1cca36a0e 100644
--- a/test/CodeGen/X86/pr15981.ll
+++ b/test/CodeGen/X86/pr15981.ll
@@ -8,17 +8,17 @@
define i32 @fn1(i32, i32) {
; X86-LABEL: fn1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: je .LBB0_2
-; X86-NEXT: # BB#1:
+; X86-NEXT: # %bb.1:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: .LBB0_2:
; X86-NEXT: retl
;
; X64-LABEL: fn1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testl %esi, %esi
; X64-NEXT: cmovel %esi, %edi
; X64-NEXT: movl %edi, %eax
@@ -30,21 +30,24 @@ define i32 @fn1(i32, i32) {
define void @fn2() {
; X86-LABEL: fn2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl b, %eax
; X86-NEXT: decl a
; X86-NEXT: jne .LBB1_2
-; X86-NEXT: # BB#1:
+; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB1_2:
; X86-NEXT: movl %eax, c
; X86-NEXT: retl
;
; X64-LABEL: fn2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: decl {{.*}}(%rip)
-; X64-NEXT: cmovnel {{.*}}(%rip), %eax
+; X64-NEXT: je .LBB1_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl {{.*}}(%rip), %eax
+; X64-NEXT: .LBB1_2:
; X64-NEXT: movl %eax, {{.*}}(%rip)
; X64-NEXT: retq
%1 = load volatile i32, i32* @b, align 4
diff --git a/test/CodeGen/X86/pr16031.ll b/test/CodeGen/X86/pr16031.ll
index 01bc38a243a5..033a10fdfb31 100644
--- a/test/CodeGen/X86/pr16031.ll
+++ b/test/CodeGen/X86/pr16031.ll
@@ -3,7 +3,7 @@
define i64 @main(i1 %tobool1) nounwind {
; CHECK-LABEL: main:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $-12, %eax
diff --git a/test/CodeGen/X86/pr16360.ll b/test/CodeGen/X86/pr16360.ll
index 0d2878dc6af0..6511cf234deb 100644
--- a/test/CodeGen/X86/pr16360.ll
+++ b/test/CodeGen/X86/pr16360.ll
@@ -3,7 +3,7 @@
define i64 @foo(i32 %sum) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shrl $2, %eax
; CHECK-NEXT: orl $-67108864, %eax # imm = 0xFC000000
diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll
index ccfdb5b58344..a262fc20b542 100644
--- a/test/CodeGen/X86/pr17764.ll
+++ b/test/CodeGen/X86/pr17764.ll
@@ -3,7 +3,7 @@
define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT: vpsllw $15, %ymm0, %ymm0
; CHECK-NEXT: vpsraw $15, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/pr18014.ll b/test/CodeGen/X86/pr18014.ll
index cba065002d57..fed68e86dfbc 100644
--- a/test/CodeGen/X86/pr18014.ll
+++ b/test/CodeGen/X86/pr18014.ll
@@ -6,7 +6,7 @@
define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pslld $31, %xmm0
; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: blendvps %xmm0, %xmm1, %xmm2
diff --git a/test/CodeGen/X86/pr18344.ll b/test/CodeGen/X86/pr18344.ll
index fcf4174ec3d3..7ff489d70af5 100644
--- a/test/CodeGen/X86/pr18344.ll
+++ b/test/CodeGen/X86/pr18344.ll
@@ -6,7 +6,7 @@
define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noalias %re, <4 x i32>* noalias nocapture %ptr_cast_for_load) nounwind {
; X86-LABEL: FFT:
-; X86: # BB#0: # %begin
+; X86: # %bb.0: # %begin
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
@@ -33,7 +33,7 @@ define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noa
; X86-NEXT: retl
;
; X64-LABEL: FFT:
-; X64: # BB#0: # %begin
+; X64: # %bb.0: # %begin
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: pslld $4, %xmm0
; X64-NEXT: movq %xmm0, %rax
diff --git a/test/CodeGen/X86/pr20011.ll b/test/CodeGen/X86/pr20011.ll
new file mode 100644
index 000000000000..a502df18e777
--- /dev/null
+++ b/test/CodeGen/X86/pr20011.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
+
+%destTy = type { i2, i2 }
+
+define void @crash(i64 %x0, i64 %y0, %destTy* nocapture %dest) nounwind {
+; X86-LABEL: crash:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movb {{[0-9]+}}(%esp), %dl
+; X86-NEXT: andb $3, %dl
+; X86-NEXT: movb %dl, (%ecx)
+; X86-NEXT: andb $3, %al
+; X86-NEXT: movb %al, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: crash:
+; X64: # %bb.0:
+; X64-NEXT: andl $3, %esi
+; X64-NEXT: movb %sil, (%rdx)
+; X64-NEXT: andl $3, %edi
+; X64-NEXT: movb %dil, (%rdx)
+; X64-NEXT: retq
+ %x1 = trunc i64 %x0 to i2
+ %y1 = trunc i64 %y0 to i2
+ %1 = bitcast %destTy* %dest to <2 x i2>*
+ %2 = insertelement <2 x i2> undef, i2 %x1, i32 0
+ %3 = insertelement <2 x i2> %2, i2 %y1, i32 1
+ store <2 x i2> %3, <2 x i2>* %1, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/pr20012.ll b/test/CodeGen/X86/pr20012.ll
new file mode 100644
index 000000000000..5df781c32e0d
--- /dev/null
+++ b/test/CodeGen/X86/pr20012.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
+
+define void @test () {
+; X86-LABEL: test:
+; X86: # %bb.0:
+; X86-NEXT: movb $0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: test:
+; X64: # %bb.0:
+; X64-NEXT: movb $0, (%rax)
+; X64-NEXT: retq
+ store <2 x i4> zeroinitializer, <2 x i4>* undef, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/pr20088.ll b/test/CodeGen/X86/pr20088.ll
index 3a829622424c..75d1959a5af4 100644
--- a/test/CodeGen/X86/pr20088.ll
+++ b/test/CodeGen/X86/pr20088.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
diff --git a/test/CodeGen/X86/pr21099.ll b/test/CodeGen/X86/pr21099.ll
index cd8205dbc815..36f531db609c 100644
--- a/test/CodeGen/X86/pr21099.ll
+++ b/test/CodeGen/X86/pr21099.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O2 -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s
define void @pr21099(i64* %p) {
; CHECK-LABEL: pr21099
diff --git a/test/CodeGen/X86/pr21792.ll b/test/CodeGen/X86/pr21792.ll
index 84b7467e6a17..1bb6ea6c5921 100644
--- a/test/CodeGen/X86/pr21792.ll
+++ b/test/CodeGen/X86/pr21792.ll
@@ -8,9 +8,8 @@
define void @func(<4 x float> %vx) {
; CHECK-LABEL: func:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pextrq $1, %xmm0, %rdx
diff --git a/test/CodeGen/X86/pr22338.ll b/test/CodeGen/X86/pr22338.ll
index e0645d1ef551..ccdbe46b3435 100644
--- a/test/CodeGen/X86/pr22338.ll
+++ b/test/CodeGen/X86/pr22338.ll
@@ -4,31 +4,29 @@
define i32 @fn() {
; X86-LABEL: fn:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: cmpl $1, %eax
+; X86-NEXT: setne %al
; X86-NEXT: sete %cl
-; X86-NEXT: movl $-1, %eax
-; X86-NEXT: jne .LBB0_2
-; X86-NEXT: # BB#1: # %entry
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB0_2: # %entry
+; X86-NEXT: negl %eax
; X86-NEXT: addb %cl, %cl
; X86-NEXT: shll %cl, %eax
; X86-NEXT: .p2align 4, 0x90
-; X86-NEXT: .LBB0_3: # %bb1
+; X86-NEXT: .LBB0_1: # %bb1
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: testl %eax, %eax
-; X86-NEXT: je .LBB0_3
-; X86-NEXT: # BB#4: # %bb2
+; X86-NEXT: je .LBB0_1
+; X86-NEXT: # %bb.2: # %bb2
; X86-NEXT: retl
;
; X64-LABEL: fn:
-; X64: # BB#0: # %entry
-; X64-NEXT: xorl %edx, %edx
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl $1, %eax
+; X64-NEXT: setne %al
; X64-NEXT: sete %cl
-; X64-NEXT: movl $-1, %eax
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: negl %eax
; X64-NEXT: addb %cl, %cl
; X64-NEXT: shll %cl, %eax
; X64-NEXT: .p2align 4, 0x90
@@ -36,7 +34,7 @@ define i32 @fn() {
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: testl %eax, %eax
; X64-NEXT: je .LBB0_1
-; X64-NEXT: # BB#2: # %bb2
+; X64-NEXT: # %bb.2: # %bb2
; X64-NEXT: retq
entry:
%cmp1 = icmp ne i32 undef, 1
diff --git a/test/CodeGen/X86/pr22774.ll b/test/CodeGen/X86/pr22774.ll
index 0b2d8c04e7d9..acd394a4b43b 100644
--- a/test/CodeGen/X86/pr22774.ll
+++ b/test/CodeGen/X86/pr22774.ll
@@ -6,7 +6,7 @@
define i32 @_Z3foov() {
; CHECK-LABEL: _Z3foov:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovdqa {{.*}}(%rip), %ymm0
; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; CHECK-NEXT: vmovdqa %xmm0, {{.*}}(%rip)
diff --git a/test/CodeGen/X86/pr22970.ll b/test/CodeGen/X86/pr22970.ll
index 38c063355f64..4daa8d926ec7 100644
--- a/test/CodeGen/X86/pr22970.ll
+++ b/test/CodeGen/X86/pr22970.ll
@@ -4,7 +4,7 @@
define i32 @PR22970_i32(i32* nocapture readonly, i32) {
; X86-LABEL: PR22970_i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $4095, %ecx # imm = 0xFFF
; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
@@ -12,8 +12,8 @@ define i32 @PR22970_i32(i32* nocapture readonly, i32) {
; X86-NEXT: retl
;
; X64-LABEL: PR22970_i32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: andl $4095, %esi # imm = 0xFFF
; X64-NEXT: movl 32(%rdi,%rsi,4), %eax
; X64-NEXT: retq
@@ -27,7 +27,7 @@ define i32 @PR22970_i32(i32* nocapture readonly, i32) {
define i32 @PR22970_i64(i32* nocapture readonly, i64) {
; X86-LABEL: PR22970_i64:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $4095, %ecx # imm = 0xFFF
; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
@@ -35,7 +35,7 @@ define i32 @PR22970_i64(i32* nocapture readonly, i64) {
; X86-NEXT: retl
;
; X64-LABEL: PR22970_i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $4095, %esi # imm = 0xFFF
; X64-NEXT: movl 32(%rdi,%rsi,4), %eax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/pr2326.ll b/test/CodeGen/X86/pr2326.ll
index 88c7bb586701..ec2844c42699 100644
--- a/test/CodeGen/X86/pr2326.ll
+++ b/test/CodeGen/X86/pr2326.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep sete
+; RUN: llc < %s -mtriple=i686-- | grep sete
; PR2326
define i32 @func_59(i32 %p_60) nounwind {
diff --git a/test/CodeGen/X86/pr23273.ll b/test/CodeGen/X86/pr23273.ll
index 2702eb820f2f..5311e9400d91 100644
--- a/test/CodeGen/X86/pr23273.ll
+++ b/test/CodeGen/X86/pr23273.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-unknown-unknown -mcpu=generic -march=x86 -mattr=-sse2 -fast-isel < %s
+; RUN: llc -mtriple=i386-unknown-unknown -mcpu=generic -mattr=-sse2 -fast-isel < %s
; Verify that the backend doesn't crash during fast-isel with an assertion
; failure when selecting an int-to-double conversion. The fast selection routine
diff --git a/test/CodeGen/X86/pr23603.ll b/test/CodeGen/X86/pr23603.ll
index 315e60768613..f92d36878219 100644
--- a/test/CodeGen/X86/pr23603.ll
+++ b/test/CodeGen/X86/pr23603.ll
@@ -5,7 +5,7 @@ declare void @free_v()
define void @f(i32* %x, i32 %c32, i32* %y) nounwind {
; CHECK-LABEL: f:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
@@ -15,7 +15,7 @@ define void @f(i32* %x, i32 %c32, i32* %y) nounwind {
; CHECK-NEXT: callq free_v
; CHECK-NEXT: testl %ebp, %ebp
; CHECK-NEXT: je .LBB0_2
-; CHECK-NEXT: # BB#1: # %left
+; CHECK-NEXT: # %bb.1: # %left
; CHECK-NEXT: movl %ebx, (%r14)
; CHECK-NEXT: .LBB0_2: # %merge
; CHECK-NEXT: popq %rbx
diff --git a/test/CodeGen/X86/pr24602.ll b/test/CodeGen/X86/pr24602.ll
index 9c029aeefec9..ef676efc42fd 100644
--- a/test/CodeGen/X86/pr24602.ll
+++ b/test/CodeGen/X86/pr24602.ll
@@ -3,7 +3,7 @@
; PR24602: Make sure we don't barf on non-foldable code (with opaque constants).
; CHECK-LABEL: pr24602:
-; CHECK-NEXT: # BB#0
+; CHECK-NEXT: # %bb.0
; CHECK-NEXT: movabsq $-10000000000, [[CST:%[a-z0-9]+]]
; CHECK-NEXT: imulq [[CST]], %rsi
; CHECK-NEXT: leaq (%rdi,%rsi,8), %rax
diff --git a/test/CodeGen/X86/pr2585.ll b/test/CodeGen/X86/pr2585.ll
index 7796ee9a2628..415164c87447 100644
--- a/test/CodeGen/X86/pr2585.ll
+++ b/test/CodeGen/X86/pr2585.ll
@@ -7,7 +7,7 @@
define internal void @PR2585() {
; X32-LABEL: PR2585:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -15,7 +15,7 @@ define internal void @PR2585() {
; X32-NEXT: retl
;
; X64-LABEL: PR2585:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/test/CodeGen/X86/pr26350.ll b/test/CodeGen/X86/pr26350.ll
index 5ba5862413b5..0de1e7840ff0 100644
--- a/test/CodeGen/X86/pr26350.ll
+++ b/test/CodeGen/X86/pr26350.ll
@@ -7,7 +7,7 @@ target triple = "i386-unknown-linux-gnu"
define i32 @main() {
; CHECK-LABEL: main:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl d, %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $31, %ecx
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index c54ae3d35029..7ab295f4cb6b 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mattr=+sse2 | FileCheck %s
; PR2656
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -15,7 +15,7 @@ target triple = "i686-apple-darwin9.4.0"
define void @foo(%struct.anon* byval %p) nounwind {
; CHECK-LABEL: foo:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: subl $28, %esp
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -51,9 +51,8 @@ declare i32 @printf(...)
define double @PR22371(double %x) {
; CHECK-LABEL: PR22371:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: subl $12, %esp
-; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: andps LCPI1_0, %xmm0
diff --git a/test/CodeGen/X86/pr2659.ll b/test/CodeGen/X86/pr2659.ll
index cc8f8475cacf..05acbc8e9010 100644
--- a/test/CodeGen/X86/pr2659.ll
+++ b/test/CodeGen/X86/pr2659.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin9.4.0 -disable-branch-fold | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin9.4.0 -disable-branch-fold | FileCheck %s
; PR2659
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/pr26652.ll b/test/CodeGen/X86/pr26652.ll
index c47128a51e9a..81a2657fc4a7 100644
--- a/test/CodeGen/X86/pr26652.ll
+++ b/test/CodeGen/X86/pr26652.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR26652
define <2 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
diff --git a/test/CodeGen/X86/pr26870.ll b/test/CodeGen/X86/pr26870.ll
index 2731ed2d0125..1e8470bfba39 100644
--- a/test/CodeGen/X86/pr26870.ll
+++ b/test/CodeGen/X86/pr26870.ll
@@ -2,11 +2,11 @@
define x86_thiscallcc i32* @fn4(i32* %this, i8* dereferenceable(1) %p1) {
entry:
- %DL = getelementptr inbounds i32, i32* %this, i32 0
- %call.i = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+ %dl = getelementptr inbounds i32, i32* %this, i32 0
+ %call.i = tail call x86_thiscallcc i64 @fn1(i32* %dl)
%getTypeAllocSize___trans_tmp_2.i = getelementptr inbounds i32, i32* %this, i32 0
%0 = load i32, i32* %getTypeAllocSize___trans_tmp_2.i, align 4
- %call.i8 = tail call x86_thiscallcc i64 @fn1(i32* %DL)
+ %call.i8 = tail call x86_thiscallcc i64 @fn1(i32* %dl)
%1 = insertelement <2 x i64> undef, i64 %call.i, i32 0
%2 = insertelement <2 x i64> %1, i64 %call.i8, i32 1
%3 = add nsw <2 x i64> %2, <i64 7, i64 7>
diff --git a/test/CodeGen/X86/pr27591.ll b/test/CodeGen/X86/pr27591.ll
index b71cb8c4b3a2..9291915c7671 100644
--- a/test/CodeGen/X86/pr27591.ll
+++ b/test/CodeGen/X86/pr27591.ll
@@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test1(i32 %x) #0 {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmpl $0, %edi
; CHECK-NEXT: setne %al
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movzbl %al, %edi
@@ -22,9 +22,9 @@ entry:
define void @test2(i32 %x) #0 {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cmpl $0, %edi
; CHECK-NEXT: setne %al
; CHECK-NEXT: movzbl %al, %edi
; CHECK-NEXT: andl $1, %edi
diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir
index 956df172b253..8e0296c6d7f4 100644
--- a/test/CodeGen/X86/pr27681.mir
+++ b/test/CodeGen/X86/pr27681.mir
@@ -47,11 +47,11 @@ body: |
TEST32rr %edx, %edx, implicit-def %eflags
%cl = SETNEr implicit %eflags
; This %bl def is antidependent on the above use of %ebx
- %bl = MOV8rm %esp, 1, _, 3, _ ; :: (load 1 from %stack.0)
+ %bl = MOV8rm %esp, 1, %noreg, 3, _ ; :: (load 1 from %stack.0)
%cl = OR8rr killed %cl, %bl, implicit-def dead %eflags
%esi = MOVZX32rr8 killed %cl
%esi = ADD32rr killed %esi, killed %edi, implicit-def dead %eflags
- %ecx = MOV32rm %esp, 1, _, 24, _ ; :: (load 4 from %stack.2)
+ %ecx = MOV32rm %esp, 1, %noreg, 24, _ ; :: (load 4 from %stack.2)
%edx = SAR32rCL killed %edx, implicit-def dead %eflags, implicit %cl
TEST32rr killed %edx, %edx, implicit-def %eflags
%cl = SETNEr implicit %eflags
@@ -66,7 +66,7 @@ body: |
bb.2:
liveins: %cl, %eax, %ebp, %esi
- OR32mr %esp, 1, _, 8, _, killed %eax, implicit-def %eflags ; :: (store 4 into %stack.1)
+ OR32mr %esp, 1, %noreg, 8, %noreg, killed %eax, implicit-def %eflags ; :: (store 4 into %stack.1)
%dl = SETNEr implicit %eflags, implicit-def %edx
bb.3:
diff --git a/test/CodeGen/X86/pr28129.ll b/test/CodeGen/X86/pr28129.ll
index 15bffffa207f..f86c439ef040 100644
--- a/test/CodeGen/X86/pr28129.ll
+++ b/test/CodeGen/X86/pr28129.ll
@@ -4,15 +4,15 @@
define <4 x double> @cmp4f64_domain(<4 x double> %a) {
; X86-LABEL: cmp4f64_domain:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp4f64_domain:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
@@ -25,15 +25,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) {
define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize {
; X86-LABEL: cmp4f64_domain_optsize:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp4f64_domain_optsize:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
@@ -46,15 +46,15 @@ define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize {
define <8 x float> @cmp8f32_domain(<8 x float> %a) {
; X86-LABEL: cmp8f32_domain:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp8f32_domain:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
@@ -67,15 +67,15 @@ define <8 x float> @cmp8f32_domain(<8 x float> %a) {
define <8 x float> @cmp8f32_domain_optsize(<8 x float> %a) optsize {
; X86-LABEL: cmp8f32_domain_optsize:
-; X86: # BB#0:
-; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X86: # %bb.0:
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp8f32_domain_optsize:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/pr28173.ll b/test/CodeGen/X86/pr28173.ll
index 3279982e4641..f181217910ff 100644
--- a/test/CodeGen/X86/pr28173.ll
+++ b/test/CodeGen/X86/pr28173.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
define i64 @foo64(i1 zeroext %i) #0 {
; CHECK-LABEL: foo64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: orq $-2, %rax
; CHECK-NEXT: retq
@@ -24,10 +24,10 @@ end:
define i16 @foo16(i1 zeroext %i) #0 {
; CHECK-LABEL: foo16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: orl $65534, %eax # imm = 0xFFFE
-; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
br label %bb
@@ -42,10 +42,10 @@ end:
define i16 @foo16_1(i1 zeroext %i, i32 %j) #0 {
; CHECK-LABEL: foo16_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: orl $2, %eax
-; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
br label %bb
@@ -60,7 +60,7 @@ end:
define i32 @foo32(i1 zeroext %i) #0 {
; CHECK-LABEL: foo32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: orl $-2, %eax
; CHECK-NEXT: retq
@@ -77,7 +77,7 @@ end:
define i8 @foo8(i1 zeroext %i) #0 {
; CHECK-LABEL: foo8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: orb $-2, %dil
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/pr28472.ll b/test/CodeGen/X86/pr28472.ll
index 9d2609022b3d..603549a73139 100644
--- a/test/CodeGen/X86/pr28472.ll
+++ b/test/CodeGen/X86/pr28472.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
; CHECK-LABEL: {{^}}same_dynamic_index_fp_vector_type:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
define float @same_dynamic_index_fp_vector_type(float %val, i32 %idx) {
bb:
diff --git a/test/CodeGen/X86/pr28560.ll b/test/CodeGen/X86/pr28560.ll
index d0061f670cf1..d9da9ac9e883 100644
--- a/test/CodeGen/X86/pr28560.ll
+++ b/test/CodeGen/X86/pr28560.ll
@@ -1,6 +1,6 @@
; RUN: llc -mtriple=i686-pc-linux -print-after=postrapseudos < %s 2>&1 | FileCheck %s
-; CHECK: MOV8rr %{{[A-D]}}L, %E[[R:[A-D]]]X<imp-use,kill>, %E[[R]]X<imp-def>
+; CHECK: MOV8rr %{{[a-d]}}l, implicit killed %e[[R:[a-d]]]x, implicit-def %e[[R]]x
define i32 @foo(i32 %i, i32 %k, i8* %p) {
%f = icmp ne i32 %i, %k
%s = zext i1 %f to i8
diff --git a/test/CodeGen/X86/pr29061.ll b/test/CodeGen/X86/pr29061.ll
new file mode 100644
index 000000000000..9c29429af7d8
--- /dev/null
+++ b/test/CodeGen/X86/pr29061.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple i386-unknown-linux-gnu < %s | FileCheck %s
+
+; Previously, a reference to SIL/DIL was being emitted,
+; but those registers are only available in 64-bit mode
+
+define void @t1(i8 signext %c) {
+; CHECK-LABEL: t1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_offset %edi, -8
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT: # kill: def %di killed %di killed %edi
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: retl
+entry:
+ tail call void asm sideeffect "", "{di},~{dirflag},~{fpsr},~{flags}"(i8 %c)
+ ret void
+}
+
+define void @t2(i8 signext %c) {
+; CHECK-LABEL: t2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: .cfi_offset %esi, -8
+; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT: # kill: def %si killed %si killed %esi
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: retl
+entry:
+ tail call void asm sideeffect "", "{si},~{dirflag},~{fpsr},~{flags}"(i8 %c)
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index 8c970b3d4771..f6bf76c1f853 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -7,13 +7,12 @@ declare <4 x float> @foo(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4
define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) {
; CHECK-LABEL: bar:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $88, %rsp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 96
; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT: vextractf32x4 $1, %zmm3, %xmm1
-; CHECK-NEXT: vextractf32x4 $1, %zmm2, %xmm8
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm8
; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm1[0],xmm8[2,3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1],xmm2[1],xmm9[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0,1,2],xmm3[1]
diff --git a/test/CodeGen/X86/pr29170.ll b/test/CodeGen/X86/pr29170.ll
index ecb4c9785365..dfbad021d287 100644
--- a/test/CodeGen/X86/pr29170.ll
+++ b/test/CodeGen/X86/pr29170.ll
@@ -8,11 +8,11 @@ target triple = "i386-unknown-linux-gnu"
define i32 @main() {
; CHECK-LABEL: main:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB0_3
-; CHECK-NEXT: # BB#1: # %go
+; CHECK-NEXT: # %bb.1: # %go
; CHECK-NEXT: movl $-1, %ecx
; CHECK-NEXT: movsbl b, %edx
; CHECK-NEXT: notl %ecx
@@ -20,7 +20,7 @@ define i32 @main() {
; CHECK-NEXT: cmpl $-1, %edx
; CHECK-NEXT: sbbl %ecx, %eax
; CHECK-NEXT: jge .LBB0_3
-; CHECK-NEXT: # BB#2: # %if.then
+; CHECK-NEXT: # %bb.2: # %if.then
; CHECK-NEXT: movl $42, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB0_3: # %if.else
diff --git a/test/CodeGen/X86/pr2982.ll b/test/CodeGen/X86/pr2982.ll
index b7902b8cc3a0..3fc6f0559bc4 100644
--- a/test/CodeGen/X86/pr2982.ll
+++ b/test/CodeGen/X86/pr2982.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s
; PR2982
target datalayout =
diff --git a/test/CodeGen/X86/pr30284.ll b/test/CodeGen/X86/pr30284.ll
index 7ab1b729ea04..c6a688ebdc41 100644
--- a/test/CodeGen/X86/pr30284.ll
+++ b/test/CodeGen/X86/pr30284.ll
@@ -3,7 +3,7 @@
define void @f_f___un_3C_unf_3E_un_3C_unf_3E_() {
; CHECK-LABEL: f_f___un_3C_unf_3E_un_3C_unf_3E_:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovapd 0, %zmm0
; CHECK-NEXT: vmovapd 64, %zmm1
; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 14d81f14fc32..816fe2376c49 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -3,14 +3,11 @@
define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6, float %f7, float %f8, float %f9, float %f10, float %f11, float %f12, float %f13, float %f14, float %f15, float %f16) #0 {
; CHECK-LABEL: makefloat:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $256, %rsp # imm = 0x100
@@ -76,7 +73,7 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; CHECK-NEXT: # implicit-def: %YMM2
+; CHECK-NEXT: # implicit-def: %ymm2
; CHECK-NEXT: vmovaps %xmm1, %xmm2
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -93,10 +90,10 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
-; CHECK-NEXT: # implicit-def: %YMM3
+; CHECK-NEXT: # implicit-def: %ymm3
; CHECK-NEXT: vmovaps %xmm1, %xmm3
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm3
-; CHECK-NEXT: # implicit-def: %ZMM24
+; CHECK-NEXT: # implicit-def: %zmm24
; CHECK-NEXT: vmovaps %zmm3, %zmm24
; CHECK-NEXT: vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
; CHECK-NEXT: vmovaps %zmm24, {{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/pr30511.ll b/test/CodeGen/X86/pr30511.ll
index 3c512ba27009..7372980b41e4 100644
--- a/test/CodeGen/X86/pr30511.ll
+++ b/test/CodeGen/X86/pr30511.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-pc-linux-gnu"
define i64 @PR30511(<2 x double> %a) {
; CHECK-LABEL: PR30511:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
diff --git a/test/CodeGen/X86/pr31045.ll b/test/CodeGen/X86/pr31045.ll
new file mode 100644
index 000000000000..f62836310bb0
--- /dev/null
+++ b/test/CodeGen/X86/pr31045.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+%struct.c.2.6.10.14.38.70.74.90.94.98.106.122.362 = type { i8, %struct.b.1.5.9.13.37.69.73.89.93.97.105.121.361, i24, i24 }
+%struct.b.1.5.9.13.37.69.73.89.93.97.105.121.361 = type { %struct.a.0.4.8.12.36.68.72.88.92.96.104.120.360, %struct.a.0.4.8.12.36.68.72.88.92.96.104.120.360, i8 }
+%struct.a.0.4.8.12.36.68.72.88.92.96.104.120.360 = type <{ i8, i16 }>
+%struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363 = type <{ %struct.c.2.6.10.14.38.70.74.90.94.98.106.122.362, %struct.c.2.6.10.14.38.70.74.90.94.98.106.122.362, i8, i8 }>
+
+@var_46 = external local_unnamed_addr global i8, align 1
+@var_44 = external local_unnamed_addr global i8, align 1
+@var_163 = external local_unnamed_addr global i8, align 1
+@struct_obj_12 = external local_unnamed_addr global %struct.c.2.6.10.14.38.70.74.90.94.98.106.122.362, align 2
+@struct_obj_3 = external local_unnamed_addr global %struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363, align 2
+@struct_obj_8 = external local_unnamed_addr global %struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363, align 2
+@var_49 = external local_unnamed_addr constant i8, align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define void @_Z1av() local_unnamed_addr #0 {
+; CHECK-LABEL: _Z1av:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl struct_obj_3+{{.*}}(%rip), %eax
+; CHECK-NEXT: movsbl {{.*}}(%rip), %ecx
+; CHECK-NEXT: movzbl {{.*}}(%rip), %edx
+; CHECK-NEXT: movzbl {{.*}}(%rip), %esi
+; CHECK-NEXT: andl $1, %eax
+; CHECK-NEXT: leal (%rax,%rax), %edi
+; CHECK-NEXT: subl %ecx, %edi
+; CHECK-NEXT: subl %edx, %edi
+; CHECK-NEXT: movl %edi, %ecx
+; CHECK-NEXT: notl %ecx
+; CHECK-NEXT: movzbl %cl, %ecx
+; CHECK-NEXT: movw %cx, struct_obj_12+{{.*}}(%rip)
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: cmovel %eax, %ecx
+; CHECK-NEXT: andl struct_obj_8+{{.*}}(%rip), %ecx
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: andl %esi, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: andl %eax, %ecx
+; CHECK-NEXT: negl %ecx
+; CHECK-NEXT: testl %ecx, %edi
+; CHECK-NEXT: setne {{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+ %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363, %struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363* @struct_obj_3, i64 0, i32 0, i32 2) to i32*), align 2
+ %tmp = load i8, i8* @var_46, align 1
+ %conv1 = sext i8 %tmp to i32
+ %tmp1 = load i8, i8* @var_49, align 1
+ %tmp2 = zext i8 %tmp1 to i32
+ %tmp3 = shl i32 %bf.load, 1
+ %factor = and i32 %tmp3, 2
+ %sub = sub nsw i32 %factor, %conv1
+ %sub8 = sub nsw i32 %sub, %tmp2
+ %add = add nsw i32 %sub8, 0
+ %tmp4 = load i8, i8* @var_44, align 1
+ %tmp5 = zext i8 %tmp4 to i32
+ %xor = xor i32 %add, 255
+ %xor20 = xor i32 %xor, 0
+ %neg = xor i32 %xor20, 0
+ %or = or i32 0, %neg
+ %or55 = or i32 %or, 0
+ %conv56 = trunc i32 %or55 to i16
+ %bf.value = and i16 %conv56, 255
+ %bf.set = or i16 %bf.value, 0
+ store i16 %bf.set, i16* getelementptr inbounds (%struct.c.2.6.10.14.38.70.74.90.94.98.106.122.362, %struct.c.2.6.10.14.38.70.74.90.94.98.106.122.362* @struct_obj_12, i64 0, i32 1, i32 1, i32 1), align 1
+ %lnot = icmp eq i8 undef, 0
+ %bf.load65 = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363, %struct.d.3.7.11.15.39.71.75.91.95.99.107.123.363* @struct_obj_8, i64 0, i32 0, i32 2) to i32*), align 2
+ %tmp6 = and i32 %bf.load65, 1
+ %tmp7 = select i1 %lnot, i32 undef, i32 0
+ %mul69 = and i32 %tmp6, %tmp7
+ %tmp8 = sub nsw i32 0, %mul69
+ %mul75 = and i32 %tmp5, %tmp8
+ %tmp9 = and i32 %bf.load, 1
+ %tmp10 = sub nsw i32 0, %mul75
+ %mul80 = and i32 %tmp9, %tmp10
+ %factor109 = shl nuw nsw i32 %tmp9, 1
+ %sub86 = sub nsw i32 %factor109, %conv1
+ %sub94 = sub nsw i32 %sub86, %tmp2
+ %tmp11 = sub nsw i32 0, %mul80
+ %mul95 = and i32 %sub94, %tmp11
+ %tobool96 = icmp ne i32 %mul95, 0
+ %frombool = zext i1 %tobool96 to i8
+ store i8 %frombool, i8* @var_163, align 1
+ ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/pr31088.ll b/test/CodeGen/X86/pr31088.ll
index 0dd8eb0ece85..f443ff417cc9 100644
--- a/test/CodeGen/X86/pr31088.ll
+++ b/test/CodeGen/X86/pr31088.ll
@@ -5,7 +5,7 @@
define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; X86-LABEL: ir_fadd_v1f16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: subl $28, %esp
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movss %xmm0, (%esp)
@@ -31,7 +31,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: ir_fadd_v1f16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pushq %rax
; X64-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
; X64-NEXT: movaps %xmm1, %xmm0
@@ -49,7 +49,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
; X64-NEXT: retq
;
; F16C-LABEL: ir_fadd_v1f16:
-; F16C: # BB#0:
+; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
@@ -62,7 +62,7 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind {
define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; X86-LABEL: ir_fadd_v2f16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: subl $64, %esp
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movss %xmm0, (%esp)
@@ -110,7 +110,7 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: ir_fadd_v2f16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subq $24, %rsp
; X64-NEXT: movss %xmm2, {{[0-9]+}}(%rsp) # 4-byte Spill
; X64-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
@@ -145,7 +145,7 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind {
; X64-NEXT: retq
;
; F16C-LABEL: ir_fadd_v2f16:
-; F16C: # BB#0:
+; F16C: # %bb.0:
; F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm3
; F16C-NEXT: vcvtph2ps %xmm3, %xmm3
; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/pr31323.ll b/test/CodeGen/X86/pr31323.ll
index 6db09318cc81..e0e1dbe726db 100644
--- a/test/CodeGen/X86/pr31323.ll
+++ b/test/CodeGen/X86/pr31323.ll
@@ -6,12 +6,12 @@
define i32 @pr31323(i32) {
; X32-LABEL: pr31323:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: retl
;
; X64-LABEL: pr31323:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/pr31773.ll b/test/CodeGen/X86/pr31773.ll
index 8722df3f4b57..6b4261c24353 100644
--- a/test/CodeGen/X86/pr31773.ll
+++ b/test/CodeGen/X86/pr31773.ll
@@ -1,18 +1,47 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512
 ; The purpose of this test is to ensure that vpackus* is not used for the umin+trunc combination, since the vpackus* input is a signed number.
+
define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
-; CHECK-LABEL: usat_trunc_wb_256:
-; CHECK-NOT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-LABEL: usat_trunc_wb_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; AVX-NEXT: vpminuw %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpminuw %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: usat_trunc_wb_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovuswb %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%x6 = trunc <16 x i16> %x5 to <16 x i8>
ret <16 x i8> %x6
}
-
+
define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) {
-; CHECK-LABEL: usat_trunc_dw_256:
-; CHECK-NOT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-LABEL: usat_trunc_dw_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX-NEXT: vpminud %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: usat_trunc_dw_256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovusdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%x3 = icmp ult <8 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%x5 = select <8 x i1> %x3, <8 x i32> %i, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%x6 = trunc <8 x i32> %x5 to <8 x i16>
diff --git a/test/CodeGen/X86/pr31956.ll b/test/CodeGen/X86/pr31956.ll
index e9293048f4e5..80e4ed081f74 100644
--- a/test/CodeGen/X86/pr31956.ll
+++ b/test/CodeGen/X86/pr31956.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-scei-ps4"
define <4 x float> @foo() {
; CHECK-LABEL: foo:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3]
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
diff --git a/test/CodeGen/X86/pr32108.ll b/test/CodeGen/X86/pr32108.ll
index f14b04802a04..ff1b7d3401f8 100644
--- a/test/CodeGen/X86/pr32108.ll
+++ b/test/CodeGen/X86/pr32108.ll
@@ -3,7 +3,7 @@
define void @pr32108() {
; CHECK-LABEL: pr32108:
-; CHECK: # BB#0: # %CF257
+; CHECK: # %bb.0: # %CF257
; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %CF244
diff --git a/test/CodeGen/X86/pr3216.ll b/test/CodeGen/X86/pr3216.ll
index 23dcf5693cd0..237ed7c4d606 100644
--- a/test/CodeGen/X86/pr3216.ll
+++ b/test/CodeGen/X86/pr3216.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
@foo = global i8 127
diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll
index e1f726f0c625..69c32eaacbb6 100644
--- a/test/CodeGen/X86/pr32241.ll
+++ b/test/CodeGen/X86/pr32241.ll
@@ -3,14 +3,11 @@
define i32 @_Z3foov() {
; CHECK-LABEL: _Z3foov:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: subl $24, %esp
-; CHECK-NEXT: .Lcfi1:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .Lcfi2:
+; CHECK-NEXT: subl $16, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: .cfi_offset %esi, -8
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: movw $10959, {{[0-9]+}}(%esp) # imm = 0x2ACF
@@ -21,7 +18,7 @@ define i32 @_Z3foov() {
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
; CHECK-NEXT: movb %al, {{[0-9]+}}(%esp) # 1-byte Spill
; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: # BB#1: # %lor.rhs
+; CHECK-NEXT: # %bb.1: # %lor.rhs
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movb %al, %cl
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
@@ -32,16 +29,15 @@ define i32 @_Z3foov() {
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movzbl %al, %edx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
-; CHECK-NEXT: subl %edx, %esi
+; CHECK-NEXT: cmpl %edx, %esi
; CHECK-NEXT: setl %al
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: movzbl %al, %edx
; CHECK-NEXT: xorl $-1, %edx
; CHECK-NEXT: cmpl $0, %edx
-; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
; CHECK-NEXT: jne .LBB0_4
-; CHECK-NEXT: # BB#3: # %lor.rhs4
+; CHECK-NEXT: # %bb.3: # %lor.rhs4
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movb %al, %cl
; CHECK-NEXT: movb %cl, {{[0-9]+}}(%esp) # 1-byte Spill
@@ -53,7 +49,7 @@ define i32 @_Z3foov() {
; CHECK-NEXT: movw %cx, %dx
; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp)
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl $24, %esp
+; CHECK-NEXT: addl $16, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll
index e29b56236e26..ab6af8869702 100644
--- a/test/CodeGen/X86/pr32256.ll
+++ b/test/CodeGen/X86/pr32256.ll
@@ -6,9 +6,8 @@
; Function Attrs: noinline nounwind
define void @_Z1av() {
; CHECK-LABEL: _Z1av:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subl $2, %esp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 6
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movb %al, %cl
diff --git a/test/CodeGen/X86/pr32282.ll b/test/CodeGen/X86/pr32282.ll
index 26c4bdb2375a..1c4d48db7118 100644
--- a/test/CodeGen/X86/pr32282.ll
+++ b/test/CodeGen/X86/pr32282.ll
@@ -11,9 +11,8 @@
define void @foo() {
; X86-LABEL: foo:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %eax
-; X86-NEXT: .Lcfi0:
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: movl d, %eax
; X86-NEXT: movl d+4, %ecx
@@ -28,24 +27,18 @@ define void @foo() {
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: cmovnel %eax, %ecx
; X86-NEXT: andl $-2, %edx
-; X86-NEXT: andl $2147483647, %ecx # imm = 0x7FFFFFFF
; X86-NEXT: addl $7, %edx
; X86-NEXT: adcxl %eax, %ecx
; X86-NEXT: pushl %ecx
-; X86-NEXT: .Lcfi1:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl %edx
-; X86-NEXT: .Lcfi2:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $0
-; X86-NEXT: .Lcfi3:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: pushl $0
-; X86-NEXT: .Lcfi4:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: calll __divdi3
; X86-NEXT: addl $16, %esp
-; X86-NEXT: .Lcfi5:
; X86-NEXT: .cfi_adjust_cfa_offset -16
; X86-NEXT: orl %eax, %edx
; X86-NEXT: setne {{[0-9]+}}(%esp)
@@ -53,7 +46,7 @@ define void @foo() {
; X86-NEXT: retl
;
; X64-LABEL: foo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*}}(%rip), %rax
; X64-NEXT: movabsq $3013716102212485120, %rcx # imm = 0x29D2DED3DE400000
; X64-NEXT: andnq %rcx, %rax, %rcx
@@ -62,7 +55,7 @@ define void @foo() {
; X64-NEXT: movabsq $4393751543808, %rax # imm = 0x3FF00000000
; X64-NEXT: testq %rax, %rcx
; X64-NEXT: je .LBB0_1
-; X64-NEXT: # BB#2:
+; X64-NEXT: # %bb.2:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: idivq %rcx
@@ -71,7 +64,7 @@ define void @foo() {
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divl %ecx
-; X64-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<def>
+; X64-NEXT: # kill: def %eax killed %eax def %rax
; X64-NEXT: .LBB0_3:
; X64-NEXT: testq %rax, %rax
; X64-NEXT: setne -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
index c54909cf93c1..86bb74050ad3 100644
--- a/test/CodeGen/X86/pr32284.ll
+++ b/test/CodeGen/X86/pr32284.ll
@@ -1,17 +1,94 @@
-; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,X64
-; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686
-; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefixes=CHECK,686
-; REQUIRES: asserts
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefix=X86-O0
+; RUN: llc -mtriple=x86_64-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefix=X64
+; RUN: llc -O0 -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefix=686-O0
+; RUN: llc -mtriple=i686-unknown -mcpu=skx -o - %s | FileCheck %s --check-prefix=686
@c = external constant i8, align 1
define void @foo() {
-; CHECK-LABEL: foo:
-; CHECK: # BB#0: # %entry
-; CHECK-DAG: setne
-; CHECK-DAG: setle
-; CHECK: ret
+; X86-O0-LABEL: foo:
+; X86-O0: # %bb.0: # %entry
+; X86-O0-NEXT: xorl %eax, %eax
+; X86-O0-NEXT: movl %eax, %ecx
+; X86-O0-NEXT: xorl %eax, %eax
+; X86-O0-NEXT: movzbl c, %edx
+; X86-O0-NEXT: subl %edx, %eax
+; X86-O0-NEXT: movslq %eax, %rsi
+; X86-O0-NEXT: subq %rsi, %rcx
+; X86-O0-NEXT: movb %cl, %dil
+; X86-O0-NEXT: cmpb $0, %dil
+; X86-O0-NEXT: setne %dil
+; X86-O0-NEXT: andb $1, %dil
+; X86-O0-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT: cmpb $0, c
+; X86-O0-NEXT: setne %dil
+; X86-O0-NEXT: xorb $-1, %dil
+; X86-O0-NEXT: xorb $-1, %dil
+; X86-O0-NEXT: andb $1, %dil
+; X86-O0-NEXT: movzbl %dil, %eax
+; X86-O0-NEXT: movzbl c, %edx
+; X86-O0-NEXT: cmpl %edx, %eax
+; X86-O0-NEXT: setle %dil
+; X86-O0-NEXT: andb $1, %dil
+; X86-O0-NEXT: movzbl %dil, %eax
+; X86-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT: retq
+;
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movzbl {{.*}}(%rip), %eax
+; X64-NEXT: testb %al, %al
+; X64-NEXT: setne -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testl %eax, %eax
+; X64-NEXT: setne %cl
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: cmpl %eax, %ecx
+; X64-NEXT: setle %dl
+; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: retq
+;
+; 686-O0-LABEL: foo:
+; 686-O0: # %bb.0: # %entry
+; 686-O0-NEXT: subl $8, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
+; 686-O0-NEXT: movb c, %al
+; 686-O0-NEXT: cmpb $0, %al
+; 686-O0-NEXT: setne %al
+; 686-O0-NEXT: andb $1, %al
+; 686-O0-NEXT: movb %al, {{[0-9]+}}(%esp)
+; 686-O0-NEXT: cmpb $0, c
+; 686-O0-NEXT: setne %al
+; 686-O0-NEXT: xorb $-1, %al
+; 686-O0-NEXT: xorb $-1, %al
+; 686-O0-NEXT: andb $1, %al
+; 686-O0-NEXT: movzbl %al, %ecx
+; 686-O0-NEXT: movzbl c, %edx
+; 686-O0-NEXT: cmpl %edx, %ecx
+; 686-O0-NEXT: setle %al
+; 686-O0-NEXT: andb $1, %al
+; 686-O0-NEXT: movzbl %al, %ecx
+; 686-O0-NEXT: movl %ecx, (%esp)
+; 686-O0-NEXT: addl $8, %esp
+; 686-O0-NEXT: retl
+;
+; 686-LABEL: foo:
+; 686: # %bb.0: # %entry
+; 686-NEXT: subl $8, %esp
+; 686-NEXT: .cfi_def_cfa_offset 12
+; 686-NEXT: movzbl c, %eax
+; 686-NEXT: xorl %ecx, %ecx
+; 686-NEXT: testl %eax, %eax
+; 686-NEXT: setne %cl
+; 686-NEXT: testb %al, %al
+; 686-NEXT: setne {{[0-9]+}}(%esp)
+; 686-NEXT: xorl %edx, %edx
+; 686-NEXT: cmpl %eax, %ecx
+; 686-NEXT: setle %dl
+; 686-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; 686-NEXT: addl $8, %esp
+; 686-NEXT: retl
entry:
%a = alloca i8, align 1
%b = alloca i32, align 4
@@ -42,12 +119,161 @@ entry:
@_ZN8struct_210member_2_0E = external global i64, align 8
define void @f1() {
-; CHECK-LABEL: f1:
-; CHECK: # BB#0: # %entry
-; CHECK: sete
-; X64: addq $7093, {{.*}}
-; 686: addl $7093, {{.*}}
-; CHECK: ret
+; X86-O0-LABEL: f1:
+; X86-O0: # %bb.0: # %entry
+; X86-O0-NEXT: movabsq $8381627093, %rax # imm = 0x1F3957AD5
+; X86-O0-NEXT: movslq var_5, %rcx
+; X86-O0-NEXT: addq %rax, %rcx
+; X86-O0-NEXT: cmpq $0, %rcx
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movb %dl, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT: movl var_5, %esi
+; X86-O0-NEXT: xorl $-1, %esi
+; X86-O0-NEXT: cmpl $0, %esi
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: xorb $-1, %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %esi
+; X86-O0-NEXT: movl %esi, %eax
+; X86-O0-NEXT: movslq var_5, %rcx
+; X86-O0-NEXT: addq $7093, %rcx # imm = 0x1BB5
+; X86-O0-NEXT: cmpq %rcx, %rax
+; X86-O0-NEXT: setg %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %esi
+; X86-O0-NEXT: movl %esi, %eax
+; X86-O0-NEXT: movq %rax, var_57
+; X86-O0-NEXT: movl var_5, %esi
+; X86-O0-NEXT: xorl $-1, %esi
+; X86-O0-NEXT: cmpl $0, %esi
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: xorb $-1, %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %esi
+; X86-O0-NEXT: movl %esi, %eax
+; X86-O0-NEXT: movq %rax, _ZN8struct_210member_2_0E
+; X86-O0-NEXT: retq
+;
+; X64-LABEL: f1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movslq {{.*}}(%rip), %rax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: cmpq $-1, %rax
+; X64-NEXT: sete %cl
+; X64-NEXT: movabsq $-8381627093, %rdx # imm = 0xFFFFFFFE0C6A852B
+; X64-NEXT: cmpq %rdx, %rax
+; X64-NEXT: setne -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: cmpl $-1, %eax
+; X64-NEXT: sete %dl
+; X64-NEXT: addq $7093, %rax # imm = 0x1BB5
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: cmpq %rax, %rdx
+; X64-NEXT: setg %sil
+; X64-NEXT: movq %rsi, {{.*}}(%rip)
+; X64-NEXT: movq %rcx, {{.*}}(%rip)
+; X64-NEXT: retq
+;
+; 686-O0-LABEL: f1:
+; 686-O0: # %bb.0: # %entry
+; 686-O0-NEXT: pushl %ebp
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
+; 686-O0-NEXT: pushl %ebx
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
+; 686-O0-NEXT: pushl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 16
+; 686-O0-NEXT: pushl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 20
+; 686-O0-NEXT: subl $24, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 44
+; 686-O0-NEXT: .cfi_offset %esi, -20
+; 686-O0-NEXT: .cfi_offset %edi, -16
+; 686-O0-NEXT: .cfi_offset %ebx, -12
+; 686-O0-NEXT: .cfi_offset %ebp, -8
+; 686-O0-NEXT: movl var_5, %eax
+; 686-O0-NEXT: movl %eax, %ecx
+; 686-O0-NEXT: sarl $31, %ecx
+; 686-O0-NEXT: xorl $208307499, %eax # imm = 0xC6A852B
+; 686-O0-NEXT: xorl $-2, %ecx
+; 686-O0-NEXT: orl %ecx, %eax
+; 686-O0-NEXT: setne {{[0-9]+}}(%esp)
+; 686-O0-NEXT: movl var_5, %ecx
+; 686-O0-NEXT: movl %ecx, %edx
+; 686-O0-NEXT: subl $-1, %edx
+; 686-O0-NEXT: sete %bl
+; 686-O0-NEXT: movzbl %bl, %esi
+; 686-O0-NEXT: movl %ecx, %edi
+; 686-O0-NEXT: sarl $31, %edi
+; 686-O0-NEXT: xorl %ebp, %ebp
+; 686-O0-NEXT: addl $7093, %ecx # imm = 0x1BB5
+; 686-O0-NEXT: adcxl %ebp, %edi
+; 686-O0-NEXT: subl %esi, %ecx
+; 686-O0-NEXT: sbbl $0, %edi
+; 686-O0-NEXT: setl %bl
+; 686-O0-NEXT: movzbl %bl, %esi
+; 686-O0-NEXT: movl %esi, var_57
+; 686-O0-NEXT: movl $0, var_57+4
+; 686-O0-NEXT: movl var_5, %esi
+; 686-O0-NEXT: subl $-1, %esi
+; 686-O0-NEXT: sete %bl
+; 686-O0-NEXT: movzbl %bl, %ebp
+; 686-O0-NEXT: movl %ebp, _ZN8struct_210member_2_0E
+; 686-O0-NEXT: movl $0, _ZN8struct_210member_2_0E+4
+; 686-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; 686-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; 686-O0-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; 686-O0-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; 686-O0-NEXT: movl %esi, (%esp) # 4-byte Spill
+; 686-O0-NEXT: addl $24, %esp
+; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: popl %ebx
+; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: retl
+;
+; 686-LABEL: f1:
+; 686: # %bb.0: # %entry
+; 686-NEXT: pushl %edi
+; 686-NEXT: .cfi_def_cfa_offset 8
+; 686-NEXT: pushl %esi
+; 686-NEXT: .cfi_def_cfa_offset 12
+; 686-NEXT: subl $1, %esp
+; 686-NEXT: .cfi_def_cfa_offset 13
+; 686-NEXT: .cfi_offset %esi, -12
+; 686-NEXT: .cfi_offset %edi, -8
+; 686-NEXT: movl var_5, %edx
+; 686-NEXT: movl %edx, %esi
+; 686-NEXT: sarl $31, %esi
+; 686-NEXT: movl %edx, %ecx
+; 686-NEXT: andl %esi, %ecx
+; 686-NEXT: xorl %eax, %eax
+; 686-NEXT: cmpl $-1, %ecx
+; 686-NEXT: sete %al
+; 686-NEXT: movl %edx, %ecx
+; 686-NEXT: xorl $208307499, %ecx # imm = 0xC6A852B
+; 686-NEXT: movl %esi, %edi
+; 686-NEXT: xorl $-2, %edi
+; 686-NEXT: orl %ecx, %edi
+; 686-NEXT: setne (%esp)
+; 686-NEXT: xorl %ecx, %ecx
+; 686-NEXT: cmpl $-1, %edx
+; 686-NEXT: sete %cl
+; 686-NEXT: xorl %edi, %edi
+; 686-NEXT: addl $7093, %edx # imm = 0x1BB5
+; 686-NEXT: adcxl %edi, %esi
+; 686-NEXT: cmpl %ecx, %edx
+; 686-NEXT: sbbl $0, %esi
+; 686-NEXT: setl %cl
+; 686-NEXT: movzbl %cl, %ecx
+; 686-NEXT: movl %ecx, var_57
+; 686-NEXT: movl $0, var_57+4
+; 686-NEXT: movl %eax, _ZN8struct_210member_2_0E
+; 686-NEXT: movl $0, _ZN8struct_210member_2_0E+4
+; 686-NEXT: addl $1, %esp
+; 686-NEXT: popl %esi
+; 686-NEXT: popl %edi
+; 686-NEXT: retl
entry:
%a = alloca i8, align 1
%0 = load i32, i32* @var_5, align 4
@@ -80,13 +306,109 @@ entry:
@var_7 = external global i8, align 1
define void @f2() {
-; CHECK-LABEL: f2:
-; CHECK: # BB#0: # %entry
-; X64: movzbl {{.*}}(%rip), %[[R:[a-z]*]]
-; 686: movzbl {{.*}}, %[[R:[a-z]*]]
-; CHECK: test{{[qlwb]}} %[[R]], %[[R]]
-; CHECK: sete {{.*}}
-; CHECK: ret
+; X86-O0-LABEL: f2:
+; X86-O0: # %bb.0: # %entry
+; X86-O0-NEXT: # implicit-def: %rax
+; X86-O0-NEXT: movzbl var_7, %ecx
+; X86-O0-NEXT: cmpb $0, var_7
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: xorb $-1, %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %esi
+; X86-O0-NEXT: xorl %esi, %ecx
+; X86-O0-NEXT: movw %cx, %di
+; X86-O0-NEXT: movw %di, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT: movzbl var_7, %ecx
+; X86-O0-NEXT: movw %cx, %di
+; X86-O0-NEXT: cmpw $0, %di
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: xorb $-1, %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %ecx
+; X86-O0-NEXT: movzbl var_7, %esi
+; X86-O0-NEXT: cmpl %esi, %ecx
+; X86-O0-NEXT: sete %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %ecx
+; X86-O0-NEXT: movw %cx, %di
+; X86-O0-NEXT: movw %di, (%rax)
+; X86-O0-NEXT: retq
+;
+; X64-LABEL: f2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movzbl {{.*}}(%rip), %eax
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testl %eax, %eax
+; X64-NEXT: sete %cl
+; X64-NEXT: xorl %eax, %ecx
+; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %ecx, %ecx
+; X64-NEXT: testb %al, %al
+; X64-NEXT: sete %cl
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: cmpl %eax, %ecx
+; X64-NEXT: sete %dl
+; X64-NEXT: movw %dx, (%rax)
+; X64-NEXT: retq
+;
+; 686-O0-LABEL: f2:
+; 686-O0: # %bb.0: # %entry
+; 686-O0-NEXT: pushl %edi
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
+; 686-O0-NEXT: pushl %esi
+; 686-O0-NEXT: .cfi_def_cfa_offset 12
+; 686-O0-NEXT: subl $2, %esp
+; 686-O0-NEXT: .cfi_def_cfa_offset 14
+; 686-O0-NEXT: .cfi_offset %esi, -12
+; 686-O0-NEXT: .cfi_offset %edi, -8
+; 686-O0-NEXT: # implicit-def: %eax
+; 686-O0-NEXT: movzbl var_7, %ecx
+; 686-O0-NEXT: cmpb $0, var_7
+; 686-O0-NEXT: setne %dl
+; 686-O0-NEXT: xorb $-1, %dl
+; 686-O0-NEXT: andb $1, %dl
+; 686-O0-NEXT: movzbl %dl, %esi
+; 686-O0-NEXT: xorl %esi, %ecx
+; 686-O0-NEXT: movw %cx, %di
+; 686-O0-NEXT: movw %di, (%esp)
+; 686-O0-NEXT: movzbl var_7, %ecx
+; 686-O0-NEXT: movw %cx, %di
+; 686-O0-NEXT: cmpw $0, %di
+; 686-O0-NEXT: setne %dl
+; 686-O0-NEXT: xorb $-1, %dl
+; 686-O0-NEXT: andb $1, %dl
+; 686-O0-NEXT: movzbl %dl, %ecx
+; 686-O0-NEXT: movzbl var_7, %esi
+; 686-O0-NEXT: cmpl %esi, %ecx
+; 686-O0-NEXT: sete %dl
+; 686-O0-NEXT: andb $1, %dl
+; 686-O0-NEXT: movzbl %dl, %ecx
+; 686-O0-NEXT: movw %cx, %di
+; 686-O0-NEXT: movw %di, (%eax)
+; 686-O0-NEXT: addl $2, %esp
+; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: retl
+;
+; 686-LABEL: f2:
+; 686: # %bb.0: # %entry
+; 686-NEXT: subl $2, %esp
+; 686-NEXT: .cfi_def_cfa_offset 6
+; 686-NEXT: movzbl var_7, %eax
+; 686-NEXT: xorl %ecx, %ecx
+; 686-NEXT: testl %eax, %eax
+; 686-NEXT: sete %cl
+; 686-NEXT: xorl %eax, %ecx
+; 686-NEXT: movw %cx, (%esp)
+; 686-NEXT: xorl %ecx, %ecx
+; 686-NEXT: testb %al, %al
+; 686-NEXT: sete %cl
+; 686-NEXT: xorl %edx, %edx
+; 686-NEXT: cmpl %eax, %ecx
+; 686-NEXT: sete %dl
+; 686-NEXT: movw %dx, (%eax)
+; 686-NEXT: addl $2, %esp
+; 686-NEXT: retl
entry:
%a = alloca i16, align 2
%0 = load i8, i8* @var_7, align 1
@@ -118,15 +440,120 @@ entry:
@var_46 = external global i32, align 4
define void @f3() #0 {
-; CHECK-LABEL: f3:
-; X64-DAG: movl var_13(%rip), {{.*}}
-; X64-DAG: movl var_16(%rip), {{.*}}
-; X64-DAG: movl {{.*}},{{.*}}var_46{{.*}}
-; X64: retq
-; 686-DAG: movl var_13, {{.*}}
-; 686-DAG: movl var_16, {{.*}}
-; 686-DAG: movl {{.*}},{{.*}}var_46{{.*}}
-; 686: retl
+; X86-O0-LABEL: f3:
+; X86-O0: # %bb.0: # %entry
+; X86-O0-NEXT: movl var_13, %eax
+; X86-O0-NEXT: xorl $-1, %eax
+; X86-O0-NEXT: movl %eax, %eax
+; X86-O0-NEXT: movl %eax, %ecx
+; X86-O0-NEXT: cmpl $0, var_13
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: xorb $-1, %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %eax
+; X86-O0-NEXT: movl %eax, %esi
+; X86-O0-NEXT: movl var_13, %eax
+; X86-O0-NEXT: xorl $-1, %eax
+; X86-O0-NEXT: xorl var_16, %eax
+; X86-O0-NEXT: movl %eax, %eax
+; X86-O0-NEXT: movl %eax, %edi
+; X86-O0-NEXT: andq %rdi, %rsi
+; X86-O0-NEXT: orq %rsi, %rcx
+; X86-O0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X86-O0-NEXT: movl var_13, %eax
+; X86-O0-NEXT: xorl $-1, %eax
+; X86-O0-NEXT: movl %eax, %eax
+; X86-O0-NEXT: movl %eax, %ecx
+; X86-O0-NEXT: cmpl $0, var_13
+; X86-O0-NEXT: setne %dl
+; X86-O0-NEXT: xorb $-1, %dl
+; X86-O0-NEXT: andb $1, %dl
+; X86-O0-NEXT: movzbl %dl, %eax
+; X86-O0-NEXT: movl %eax, %esi
+; X86-O0-NEXT: andq $0, %rsi
+; X86-O0-NEXT: orq %rsi, %rcx
+; X86-O0-NEXT: movl %ecx, %eax
+; X86-O0-NEXT: movl %eax, var_46
+; X86-O0-NEXT: retq
+;
+; X64-LABEL: f3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl {{.*}}(%rip), %eax
+; X64-NEXT: movl $4294967295, %ecx # imm = 0xFFFFFFFF
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: testq %rax, %rax
+; X64-NEXT: sete %dl
+; X64-NEXT: movl {{.*}}(%rip), %eax
+; X64-NEXT: xorl %ecx, %eax
+; X64-NEXT: andq %rdx, %rax
+; X64-NEXT: orq %rcx, %rax
+; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movl %ecx, {{.*}}(%rip)
+; X64-NEXT: retq
+;
+; 686-O0-LABEL: f3:
+; 686-O0: # %bb.0: # %entry
+; 686-O0-NEXT: pushl %ebp
+; 686-O0-NEXT: .cfi_def_cfa_offset 8
+; 686-O0-NEXT: .cfi_offset %ebp, -8
+; 686-O0-NEXT: movl %esp, %ebp
+; 686-O0-NEXT: .cfi_def_cfa_register %ebp
+; 686-O0-NEXT: pushl %edi
+; 686-O0-NEXT: pushl %esi
+; 686-O0-NEXT: andl $-8, %esp
+; 686-O0-NEXT: subl $8, %esp
+; 686-O0-NEXT: .cfi_offset %esi, -16
+; 686-O0-NEXT: .cfi_offset %edi, -12
+; 686-O0-NEXT: movl var_13, %eax
+; 686-O0-NEXT: movl %eax, %ecx
+; 686-O0-NEXT: notl %ecx
+; 686-O0-NEXT: testl %eax, %eax
+; 686-O0-NEXT: sete %dl
+; 686-O0-NEXT: movzbl %dl, %eax
+; 686-O0-NEXT: movl var_16, %esi
+; 686-O0-NEXT: movl %ecx, %edi
+; 686-O0-NEXT: xorl %esi, %edi
+; 686-O0-NEXT: andl %edi, %eax
+; 686-O0-NEXT: movb %al, %dl
+; 686-O0-NEXT: movzbl %dl, %eax
+; 686-O0-NEXT: orl %eax, %ecx
+; 686-O0-NEXT: movl %ecx, (%esp)
+; 686-O0-NEXT: movl $0, {{[0-9]+}}(%esp)
+; 686-O0-NEXT: movl var_13, %eax
+; 686-O0-NEXT: notl %eax
+; 686-O0-NEXT: movl %eax, var_46
+; 686-O0-NEXT: leal -8(%ebp), %esp
+; 686-O0-NEXT: popl %esi
+; 686-O0-NEXT: popl %edi
+; 686-O0-NEXT: popl %ebp
+; 686-O0-NEXT: retl
+;
+; 686-LABEL: f3:
+; 686: # %bb.0: # %entry
+; 686-NEXT: pushl %ebp
+; 686-NEXT: .cfi_def_cfa_offset 8
+; 686-NEXT: .cfi_offset %ebp, -8
+; 686-NEXT: movl %esp, %ebp
+; 686-NEXT: .cfi_def_cfa_register %ebp
+; 686-NEXT: andl $-8, %esp
+; 686-NEXT: subl $8, %esp
+; 686-NEXT: movl var_13, %ecx
+; 686-NEXT: xorl %eax, %eax
+; 686-NEXT: testl %ecx, %ecx
+; 686-NEXT: notl %ecx
+; 686-NEXT: sete %al
+; 686-NEXT: movl var_16, %edx
+; 686-NEXT: xorl %ecx, %edx
+; 686-NEXT: andl %eax, %edx
+; 686-NEXT: movzbl %dl, %eax
+; 686-NEXT: orl %ecx, %eax
+; 686-NEXT: movl %eax, (%esp)
+; 686-NEXT: movl $0, {{[0-9]+}}(%esp)
+; 686-NEXT: movl %ecx, var_46
+; 686-NEXT: movl %ebp, %esp
+; 686-NEXT: popl %ebp
+; 686-NEXT: retl
entry:
%a = alloca i64, align 8
%0 = load i32, i32* @var_13, align 4
diff --git a/test/CodeGen/X86/pr32329.ll b/test/CodeGen/X86/pr32329.ll
index f2b79b67877f..f6c3b5cf7993 100644
--- a/test/CodeGen/X86/pr32329.ll
+++ b/test/CodeGen/X86/pr32329.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s -check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s -check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s -check-prefix=X64
+; According to https://bugs.llvm.org/show_bug.cgi?id=32329, this checks for a DAG ISel failure on the SKX target
%struct.AA = type { i24, [4 x i8] }
@@ -15,26 +16,18 @@
define void @foo() local_unnamed_addr {
; X86-LABEL: foo:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
-; X86-NEXT: .Lcfi0:
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: pushl %ebx
-; X86-NEXT: .Lcfi1:
; X86-NEXT: .cfi_def_cfa_offset 12
; X86-NEXT: pushl %edi
-; X86-NEXT: .Lcfi2:
; X86-NEXT: .cfi_def_cfa_offset 16
; X86-NEXT: pushl %esi
-; X86-NEXT: .Lcfi3:
; X86-NEXT: .cfi_def_cfa_offset 20
-; X86-NEXT: .Lcfi4:
; X86-NEXT: .cfi_offset %esi, -20
-; X86-NEXT: .Lcfi5:
; X86-NEXT: .cfi_offset %edi, -16
-; X86-NEXT: .Lcfi6:
; X86-NEXT: .cfi_offset %ebx, -12
-; X86-NEXT: .Lcfi7:
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl obj, %edx
; X86-NEXT: movsbl var_27, %eax
@@ -70,24 +63,25 @@ define void @foo() local_unnamed_addr {
; X86-NEXT: retl
;
; X64-LABEL: foo:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl {{.*}}(%rip), %eax
; X64-NEXT: movsbl {{.*}}(%rip), %r9d
; X64-NEXT: movzwl {{.*}}(%rip), %r8d
-; X64-NEXT: movl {{.*}}(%rip), %esi
-; X64-NEXT: imull %r9d, %esi
-; X64-NEXT: addl {{.*}}(%rip), %esi
+; X64-NEXT: movl {{.*}}(%rip), %ecx
+; X64-NEXT: imull %r9d, %ecx
+; X64-NEXT: addl {{.*}}(%rip), %ecx
; X64-NEXT: andl $4194303, %eax # imm = 0x3FFFFF
; X64-NEXT: leal (%rax,%rax), %edi
; X64-NEXT: subl %r9d, %edi
-; X64-NEXT: movl %edi, %edx
-; X64-NEXT: subl %r8d, %edx
-; X64-NEXT: imull %edx, %esi
-; X64-NEXT: addl $-1437483407, %esi # imm = 0xAA51BE71
-; X64-NEXT: movl $9, %ecx
-; X64-NEXT: shlxq %rsi, %rcx, %rcx
-; X64-NEXT: movq %rcx, {{.*}}(%rip)
-; X64-NEXT: cmpl %eax, %edx
+; X64-NEXT: movl %edi, %esi
+; X64-NEXT: subl %r8d, %esi
+; X64-NEXT: imull %esi, %ecx
+; X64-NEXT: addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X64-NEXT: movl $9, %edx
+; X64-NEXT: # kill: def %cl killed %cl killed %ecx
+; X64-NEXT: shlq %cl, %rdx
+; X64-NEXT: movq %rdx, {{.*}}(%rip)
+; X64-NEXT: cmpl %eax, %esi
; X64-NEXT: setge {{.*}}(%rip)
; X64-NEXT: imull %r9d, %edi
; X64-NEXT: movb %dil, {{.*}}(%rip)
diff --git a/test/CodeGen/X86/pr32340.ll b/test/CodeGen/X86/pr32340.ll
index cd9b5af1dc56..f5a67c1a0520 100644
--- a/test/CodeGen/X86/pr32340.ll
+++ b/test/CodeGen/X86/pr32340.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X64
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -fast-isel-abort=1 -o - %s | FileCheck %s -check-prefix=X64
@var_825 = external global i16, align 2
@var_32 = external global i16, align 2
@@ -12,35 +12,41 @@
define void @foo() {
; X64-LABEL: foo:
-; X64: # BB#0: # %entry
-; X64-NEXT: movw $0, {{.*}}(%rip)
-; X64-NEXT: movzwl {{.*}}(%rip), %eax
-; X64-NEXT: movw %ax, %cx
-; X64-NEXT: movw {{.*}}(%rip), %dx
-; X64-NEXT: xorw %dx, %cx
-; X64-NEXT: # implicit-def: %ESI
-; X64-NEXT: movw %cx, %si
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: movabsq $-1142377792914660288, %rdx # imm = 0xF02575732E06E440
+; X64-NEXT: movw $0, var_825
+; X64-NEXT: movzwl var_32, %eax
+; X64-NEXT: movzwl var_901, %esi
; X64-NEXT: movl %eax, %edi
; X64-NEXT: xorl %esi, %edi
-; X64-NEXT: movw %di, %cx
-; X64-NEXT: movzwl %cx, %esi
-; X64-NEXT: movl %esi, %edi
-; X64-NEXT: addl %eax, %edi
-; X64-NEXT: movl %edi, %r8d
-; X64-NEXT: movq %r8, {{.*}}(%rip)
-; X64-NEXT: xorl $-772157262, %esi # imm = 0xD1F9D0B2
-; X64-NEXT: movl {{.*}}(%rip), %eax
-; X64-NEXT: movl %esi, %edi
-; X64-NEXT: orl %eax, %edi
-; X64-NEXT: orl %edi, %esi
-; X64-NEXT: movw %si, %cx
-; X64-NEXT: movw %cx, {{.*}}(%rip)
-; X64-NEXT: movq {{.*}}(%rip), %r8
-; X64-NEXT: testq %r8, %r8
-; X64-NEXT: setne %r9b
-; X64-NEXT: movzbl %r9b, %eax
-; X64-NEXT: movw %ax, %cx
-; X64-NEXT: movw %cx, var_827
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: xorl %edi, %esi
+; X64-NEXT: addl %eax, %esi
+; X64-NEXT: movslq %esi, %r8
+; X64-NEXT: movq %r8, var_826
+; X64-NEXT: movzwl var_32, %eax
+; X64-NEXT: movl %eax, %r8d
+; X64-NEXT: movzwl var_901, %eax
+; X64-NEXT: xorl $51981, %eax # imm = 0xCB0D
+; X64-NEXT: movslq %eax, %r9
+; X64-NEXT: xorq %rdx, %r9
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: xorq %r9, %rdx
+; X64-NEXT: xorq $-1, %rdx
+; X64-NEXT: xorq %rdx, %r8
+; X64-NEXT: movq %r8, %rdx
+; X64-NEXT: orq var_57, %rdx
+; X64-NEXT: orq %rdx, %r8
+; X64-NEXT: movw %r8w, %r10w
+; X64-NEXT: movw %r10w, var_900
+; X64-NEXT: cmpq var_28, %rcx
+; X64-NEXT: setne %r11b
+; X64-NEXT: andb $1, %r11b
+; X64-NEXT: movzbl %r11b, %eax
+; X64-NEXT: movw %ax, %r10w
+; X64-NEXT: movw %r10w, var_827
; X64-NEXT: retq
entry:
store i16 0, i16* @var_825, align 2
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
index e9182698dd90..99666c994a34 100644
--- a/test/CodeGen/X86/pr32345.ll
+++ b/test/CodeGen/X86/pr32345.ll
@@ -9,8 +9,8 @@
define void @foo() {
; X640-LABEL: foo:
-; X640: # BB#0: # %bb
-; X640-NEXT: # implicit-def: %RAX
+; X640: # %bb.0: # %bb
+; X640-NEXT: # implicit-def: %rax
; X640-NEXT: movzwl var_22, %ecx
; X640-NEXT: movzwl var_27, %edx
; X640-NEXT: xorl %edx, %ecx
@@ -27,46 +27,49 @@ define void @foo() {
; X640-NEXT: movzwl var_27, %ecx
; X640-NEXT: subl $16610, %ecx # imm = 0x40E2
; X640-NEXT: movl %ecx, %ecx
-; X640-NEXT: # kill: %RCX<def> %ECX<kill>
-; X640-NEXT: # kill: %CL<def> %RCX<kill>
+; X640-NEXT: # kill: def %rcx killed %ecx
+; X640-NEXT: # kill: def %cl killed %rcx
; X640-NEXT: sarq %cl, %rsi
; X640-NEXT: movb %sil, %cl
; X640-NEXT: movb %cl, (%rax)
; X640-NEXT: retq
;
; 6860-LABEL: foo:
-; 6860: # BB#0: # %bb
+; 6860: # %bb.0: # %bb
; 6860-NEXT: pushl %ebp
-; 6860-NEXT: .Lcfi0:
; 6860-NEXT: .cfi_def_cfa_offset 8
-; 6860-NEXT: .Lcfi1:
; 6860-NEXT: .cfi_offset %ebp, -8
; 6860-NEXT: movl %esp, %ebp
-; 6860-NEXT: .Lcfi2:
; 6860-NEXT: .cfi_def_cfa_register %ebp
; 6860-NEXT: pushl %ebx
; 6860-NEXT: pushl %edi
; 6860-NEXT: pushl %esi
; 6860-NEXT: andl $-8, %esp
; 6860-NEXT: subl $32, %esp
-; 6860-NEXT: .Lcfi3:
; 6860-NEXT: .cfi_offset %esi, -20
-; 6860-NEXT: .Lcfi4:
; 6860-NEXT: .cfi_offset %edi, -16
-; 6860-NEXT: .Lcfi5:
; 6860-NEXT: .cfi_offset %ebx, -12
-; 6860-NEXT: # implicit-def: %EAX
+; 6860-NEXT: # implicit-def: %eax
; 6860-NEXT: movw var_22, %cx
; 6860-NEXT: movzwl var_27, %edx
; 6860-NEXT: movw %dx, %si
; 6860-NEXT: xorw %si, %cx
-; 6860-NEXT: # implicit-def: %EDI
+; 6860-NEXT: # implicit-def: %edi
; 6860-NEXT: movw %cx, %di
; 6860-NEXT: xorl %edx, %edi
; 6860-NEXT: movw %di, %cx
-; 6860-NEXT: movzwl %cx, %edi
-; 6860-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; 6860-NEXT: movzwl %cx, %edx
+; 6860-NEXT: movl %edx, {{[0-9]+}}(%esp)
; 6860-NEXT: movl $0, {{[0-9]+}}(%esp)
+; 6860-NEXT: movw var_22, %cx
+; 6860-NEXT: movzwl var_27, %edx
+; 6860-NEXT: movw %dx, %si
+; 6860-NEXT: xorw %si, %cx
+; 6860-NEXT: # implicit-def: %edi
+; 6860-NEXT: movw %cx, %di
+; 6860-NEXT: xorl %edx, %edi
+; 6860-NEXT: movw %di, %cx
+; 6860-NEXT: movzwl %cx, %edi
; 6860-NEXT: addl $-16610, %edx # imm = 0xBF1E
; 6860-NEXT: movb %dl, %bl
; 6860-NEXT: xorl %edx, %edx
@@ -77,7 +80,7 @@ define void @foo() {
; 6860-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
; 6860-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
; 6860-NEXT: jne .LBB0_2
-; 6860-NEXT: # BB#1: # %bb
+; 6860-NEXT: # %bb.1: # %bb
; 6860-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
; 6860-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
; 6860-NEXT: .LBB0_2: # %bb
@@ -93,33 +96,30 @@ define void @foo() {
; 6860-NEXT: retl
;
; X64-LABEL: foo:
-; X64: # BB#0: # %bb
+; X64: # %bb.0: # %bb
; X64-NEXT: movzwl {{.*}}(%rip), %ecx
-; X64-NEXT: movw {{.*}}(%rip), %ax
+; X64-NEXT: movzwl {{.*}}(%rip), %eax
; X64-NEXT: xorw %cx, %ax
; X64-NEXT: xorl %ecx, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-NEXT: addl $-16610, %ecx # imm = 0xBF1E
-; X64-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-NEXT: # kill: def %cl killed %cl killed %ecx
; X64-NEXT: shrq %cl, %rax
; X64-NEXT: movb %al, (%rax)
; X64-NEXT: retq
;
; 686-LABEL: foo:
-; 686: # BB#0: # %bb
+; 686: # %bb.0: # %bb
; 686-NEXT: pushl %ebp
-; 686-NEXT: .Lcfi0:
; 686-NEXT: .cfi_def_cfa_offset 8
-; 686-NEXT: .Lcfi1:
; 686-NEXT: .cfi_offset %ebp, -8
; 686-NEXT: movl %esp, %ebp
-; 686-NEXT: .Lcfi2:
; 686-NEXT: .cfi_def_cfa_register %ebp
; 686-NEXT: andl $-8, %esp
; 686-NEXT: subl $8, %esp
; 686-NEXT: movzwl var_27, %ecx
-; 686-NEXT: movw var_22, %ax
+; 686-NEXT: movzwl var_22, %eax
; 686-NEXT: xorw %cx, %ax
; 686-NEXT: xorl %ecx, %eax
; 686-NEXT: movzwl %ax, %eax
@@ -130,7 +130,7 @@ define void @foo() {
; 686-NEXT: shrdl %cl, %edx, %eax
; 686-NEXT: testb $32, %cl
; 686-NEXT: jne .LBB0_2
-; 686-NEXT: # BB#1: # %bb
+; 686-NEXT: # %bb.1: # %bb
; 686-NEXT: movl %eax, %edx
; 686-NEXT: .LBB0_2: # %bb
; 686-NEXT: movb %dl, (%eax)
diff --git a/test/CodeGen/X86/pr32368.ll b/test/CodeGen/X86/pr32368.ll
index b0f0b123cca1..5fa771c03c85 100644
--- a/test/CodeGen/X86/pr32368.ll
+++ b/test/CodeGen/X86/pr32368.ll
@@ -6,21 +6,21 @@
define <4 x float> @PR32368_128(<4 x float>) {
; SSE-LABEL: PR32368_128:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: addps %xmm0, %xmm0
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32368_128:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vaddps %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32368_128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm0
@@ -29,7 +29,7 @@ define <4 x float> @PR32368_128(<4 x float>) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32368_128:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vaddps %xmm0, %xmm0, %xmm0
@@ -48,7 +48,7 @@ define <4 x float> @PR32368_128(<4 x float>) {
define <8 x float> @PR32368_256(<8 x float>) {
; SSE-LABEL: PR32368_256:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [4294967004,4294967004,4294967004,4294967004]
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: andps %xmm2, %xmm1
@@ -60,14 +60,14 @@ define <8 x float> @PR32368_256(<8 x float>) {
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32368_256:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32368_256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0
@@ -76,7 +76,7 @@ define <8 x float> @PR32368_256(<8 x float>) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32368_256:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vaddps %ymm0, %ymm0, %ymm0
@@ -95,7 +95,7 @@ define <8 x float> @PR32368_256(<8 x float>) {
define <16 x float> @PR32368_512(<16 x float>) {
; SSE-LABEL: PR32368_512:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm4 = [4294967004,4294967004,4294967004,4294967004]
; SSE-NEXT: andps %xmm4, %xmm0
; SSE-NEXT: andps %xmm4, %xmm1
@@ -113,7 +113,7 @@ define <16 x float> @PR32368_512(<16 x float>) {
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32368_512:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -125,7 +125,7 @@ define <16 x float> @PR32368_512(<16 x float>) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32368_512:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -137,7 +137,7 @@ define <16 x float> @PR32368_512(<16 x float>) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32368_512:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512-NEXT: vaddps %zmm0, %zmm0, %zmm0
; AVX512-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/pr3241.ll b/test/CodeGen/X86/pr3241.ll
index f89634d5b82a..a324cf2ffa94 100644
--- a/test/CodeGen/X86/pr3241.ll
+++ b/test/CodeGen/X86/pr3241.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3241
@g_620 = external global i32
diff --git a/test/CodeGen/X86/pr32420.ll b/test/CodeGen/X86/pr32420.ll
index bf3a4720c080..e635c6835025 100644
--- a/test/CodeGen/X86/pr32420.ll
+++ b/test/CodeGen/X86/pr32420.ll
@@ -9,14 +9,14 @@ target triple = "x86_64-apple-macosx10.12.0"
define i32 @PR32420() {
; CHECK-LABEL: PR32420:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq _a@{{.*}}(%rip), %rax
; CHECK-NEXT: movzwl (%rax), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shll $12, %ecx
; CHECK-NEXT: sarw $12, %cx
; CHECK-NEXT: movq _b@{{.*}}(%rip), %rdx
-; CHECK-NEXT: movw %cx, %si
+; CHECK-NEXT: movl %ecx, %esi
; CHECK-NEXT: orw (%rdx), %si
; CHECK-NEXT: andl %ecx, %esi
; CHECK-NEXT: movw %si, (%rdx)
diff --git a/test/CodeGen/X86/pr3243.ll b/test/CodeGen/X86/pr3243.ll
index 483b5bf3a2a6..f5fad20bf7df 100644
--- a/test/CodeGen/X86/pr3243.ll
+++ b/test/CodeGen/X86/pr3243.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3243
declare signext i16 @safe_mul_func_int16_t_s_s(i16 signext, i32) nounwind readnone optsize
diff --git a/test/CodeGen/X86/pr3244.ll b/test/CodeGen/X86/pr3244.ll
index c6419d8ce768..166ca90d7995 100644
--- a/test/CodeGen/X86/pr3244.ll
+++ b/test/CodeGen/X86/pr3244.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3244
@g_62 = external global i16 ; <i16*> [#uses=1]
diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll
index e4643a863f94..86a46facbb52 100644
--- a/test/CodeGen/X86/pr32451.ll
+++ b/test/CodeGen/X86/pr32451.ll
@@ -8,14 +8,11 @@ target triple = "x86_64-unknown-linux-gnu"
define i8** @japi1_convert_690(i8**, i8***, i32) {
; CHECK-LABEL: japi1_convert_690:
-; CHECK: # BB#0: # %top
+; CHECK: # %bb.0: # %top
; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: subl $16, %esp
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 24
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_offset %ebx, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
diff --git a/test/CodeGen/X86/pr32484.ll b/test/CodeGen/X86/pr32484.ll
index 74857f8d0066..dc67ec2924b6 100644
--- a/test/CodeGen/X86/pr32484.ll
+++ b/test/CodeGen/X86/pr32484.ll
@@ -3,11 +3,11 @@
define void @foo() {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
-; CHECK-NEXT: # implicit-def: %RAX
+; CHECK: # %bb.0:
+; CHECK-NEXT: # implicit-def: %rax
; CHECK-NEXT: jmpq *%rax
; CHECK-NEXT: .LBB0_1:
-; CHECK-NEXT: # implicit-def: %RAX
+; CHECK-NEXT: # implicit-def: %rax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: movdqu %xmm1, (%rax)
diff --git a/test/CodeGen/X86/pr3250.ll b/test/CodeGen/X86/pr3250.ll
index 4ab989eaf77f..ab6af4ef5312 100644
--- a/test/CodeGen/X86/pr3250.ll
+++ b/test/CodeGen/X86/pr3250.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3250
declare i32 @safe_sub_func_short_u_u(i16 signext, i16 signext) nounwind
diff --git a/test/CodeGen/X86/pr32659.ll b/test/CodeGen/X86/pr32659.ll
index aafae9c4f6c9..ad3a78052b66 100644
--- a/test/CodeGen/X86/pr32659.ll
+++ b/test/CodeGen/X86/pr32659.ll
@@ -1,5 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -o - %s | FileCheck %s
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+
target triple = "i386-unknown-linux-gnu"
@a = external global i32, align 4
@@ -14,20 +15,50 @@ target triple = "i386-unknown-linux-gnu"
@e = external global i64, align 8
@g = external global i32, align 4
-; Function Attrs: norecurse nounwind optsize readnone
-declare i32 @fn1(i32 returned) #0
+declare i32 @fn1(i32 returned) optsize readnone
+
+declare i32 @main() optsize
+declare i32 @putchar(i32) nounwind
-; CHECK-LABEL: fn2
-; CHECK: calll putchar
-; CHECK: addl $1,
-; CHECK: adcl $0,
-; Function Attrs: nounwind optsize
-define void @fn2() #1 {
+define void @fn2() nounwind optsize {
+; CHECK-LABEL: fn2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: subl $8, %esp
+; CHECK-NEXT: movl $48, (%esp)
+; CHECK-NEXT: calll putchar
+; CHECK-NEXT: movl h, %eax
+; CHECK-NEXT: movl c, %ecx
+; CHECK-NEXT: movl j, %edx
+; CHECK-NEXT: movl (%edx), %edx
+; CHECK-NEXT: movl (%edx), %edx
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: cmpl (%edx), %ecx
+; CHECK-NEXT: setg %bl
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: cmpl %ebx, i
+; CHECK-NEXT: setg %cl
+; CHECK-NEXT: movl %ecx, b
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: cmpl %ecx, %eax
+; CHECK-NEXT: setg %dl
+; CHECK-NEXT: xorl %edx, a
+; CHECK-NEXT: movl d, %eax
+; CHECK-NEXT: movl (%eax), %eax
+; CHECK-NEXT: andl %eax, e
+; CHECK-NEXT: sarl $31, %eax
+; CHECK-NEXT: andl %eax, e+4
+; CHECK-NEXT: decl g
+; CHECK-NEXT: addl $1, f
+; CHECK-NEXT: adcl $0, f+4
+; CHECK-NEXT: addl $8, %esp
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: retl
entry:
%putchar = tail call i32 @putchar(i32 48)
%0 = load volatile i32, i32* @h, align 4
- %1 = load i32, i32* @c, align 4, !tbaa !2
+ %1 = load i32, i32* @c, align 4, !tbaa !1
%2 = load i32***, i32**** @j, align 4
%3 = load i32**, i32*** %2, align 4
%4 = load i32*, i32** %3, align 4
@@ -58,26 +89,12 @@ entry:
ret void
}
-; Function Attrs: nounwind optsize
-declare i32 @main() #1
-
-; Function Attrs: nounwind
-declare i32 @putchar(i32) #2
-
-attributes #0 = { optsize readnone }
-attributes #1 = { optsize }
-attributes #2 = { nounwind }
-
-!llvm.module.flags = !{!0}
-!llvm.ident = !{!1}
-
!0 = !{i32 1, !"NumRegisterParameters", i32 0}
-!1 = !{!"clang version 5.0.0 (trunk 300074) (llvm/trunk 300078)"}
-!2 = !{!3, !3, i64 0}
-!3 = !{!"int", !4, i64 0}
-!4 = !{!"omnipotent char", !5, i64 0}
-!5 = !{!"Simple C/C++ TBAA"}
-!6 = !{!7, !7, i64 0}
-!7 = !{!"any pointer", !4, i64 0}
-!8 = !{!9, !9, i64 0}
-!9 = !{!"long long", !4, i64 0}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !3, i64 0}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"long long", !3, i64 0}
diff --git a/test/CodeGen/X86/pr32907.ll b/test/CodeGen/X86/pr32907.ll
index 8057b31c961c..a4396e86cd2b 100644
--- a/test/CodeGen/X86/pr32907.ll
+++ b/test/CodeGen/X86/pr32907.ll
@@ -6,7 +6,7 @@
define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
; SSE2-LABEL: PR32907:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -20,7 +20,7 @@ define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
; SSE2-NEXT: retq
;
; SSE42-LABEL: PR32907:
-; SSE42: # BB#0: # %entry
+; SSE42: # %bb.0: # %entry
; SSE42-NEXT: psubq %xmm1, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm1
; SSE42-NEXT: pcmpgtq %xmm0, %xmm1
@@ -29,7 +29,7 @@ define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
; SSE42-NEXT: retq
;
; AVX2-LABEL: PR32907:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
@@ -38,7 +38,7 @@ define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32907:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/pr3317.ll b/test/CodeGen/X86/pr3317.ll
index cab8ae6b73fd..9c4ba39c02b0 100644
--- a/test/CodeGen/X86/pr3317.ll
+++ b/test/CodeGen/X86/pr3317.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
; PR3317
%VT = type [0 x i32 (...)*]
diff --git a/test/CodeGen/X86/pr33290.ll b/test/CodeGen/X86/pr33290.ll
new file mode 100644
index 000000000000..b5d9754eba76
--- /dev/null
+++ b/test/CodeGen/X86/pr33290.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+@a = common global i32 0, align 4
+@c = common local_unnamed_addr global i8 0, align 1
+@b = common local_unnamed_addr global i32* null, align 8
+
+define void @e() {
+; X86-LABEL: e:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl b, %eax
+; X86-NEXT: .p2align 4, 0x90
+; X86-NEXT: .LBB0_1: # %for.cond
+; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movzbl c, %ecx
+; X86-NEXT: leal a+2(%ecx), %ecx
+; X86-NEXT: movb $0, c
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: jmp .LBB0_1
+;
+; X64-LABEL: e:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movl $a, %esi
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB0_1: # %for.cond
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movzbl {{.*}}(%rip), %edx
+; X64-NEXT: addq %rsi, %rdx
+; X64-NEXT: setb %cl
+; X64-NEXT: addq $2, %rdx
+; X64-NEXT: adcb $0, %cl
+; X64-NEXT: movb %cl, {{.*}}(%rip)
+; X64-NEXT: movl %edx, (%rax)
+; X64-NEXT: jmp .LBB0_1
+entry:
+ %0 = load i32*, i32** @b, align 8
+ br label %for.cond
+
+for.cond:
+ %1 = load i8, i8* @c, align 1
+ %conv = zext i8 %1 to i128
+ %add = add nuw nsw i128 %conv, add (i128 ptrtoint (i32* @a to i128), i128 2)
+ %2 = lshr i128 %add, 64
+ %conv1 = trunc i128 %2 to i8
+ store i8 %conv1, i8* @c, align 1
+ %conv2 = trunc i128 %add to i32
+ store i32 %conv2, i32* %0, align 4
+ br label %for.cond
+}
diff --git a/test/CodeGen/X86/pr33349.ll b/test/CodeGen/X86/pr33349.ll
index db866db22481..b1428ba6667c 100644
--- a/test/CodeGen/X86/pr33349.ll
+++ b/test/CodeGen/X86/pr33349.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test(<4 x i1> %m, <4 x x86_fp80> %v, <4 x x86_fp80>*%p) local_unnamed_addr {
; KNL-LABEL: test:
-; KNL: # BB#0: # %bb
+; KNL: # %bb.0: # %bb
; KNL-NEXT: vpextrb $0, %xmm0, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: fld1
@@ -37,52 +37,38 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL-NEXT: retq
;
; SKX-LABEL: test:
-; SKX: # BB#0: # %bb
+; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0
; SKX-NEXT: kshiftrw $2, %k0, %k1
-; SKX-NEXT: kshiftlw $15, %k1, %k2
-; SKX-NEXT: kshiftrw $15, %k2, %k2
-; SKX-NEXT: kshiftlw $15, %k2, %k2
-; SKX-NEXT: kshiftrw $15, %k2, %k2
+; SKX-NEXT: kshiftrw $1, %k1, %k2
; SKX-NEXT: kmovd %k2, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld1
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st(0)
-; SKX-NEXT: kshiftlw $14, %k1, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
-; SKX-NEXT: kshiftlw $15, %k1, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
-; SKX-NEXT: kmovd %k1, %eax
+; SKX-NEXT: kshiftrw $1, %k0, %k2
+; SKX-NEXT: kmovd %k2, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld %st(1)
; SKX-NEXT: fcmovne %st(3), %st(0)
-; SKX-NEXT: kshiftlw $15, %k0, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
-; SKX-NEXT: kshiftlw $15, %k1, %k1
-; SKX-NEXT: kshiftrw $15, %k1, %k1
; SKX-NEXT: kmovd %k1, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld %st(2)
; SKX-NEXT: fcmovne %st(4), %st(0)
-; SKX-NEXT: kshiftlw $14, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
-; SKX-NEXT: kshiftlw $15, %k0, %k0
-; SKX-NEXT: kshiftrw $15, %k0, %k0
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fxch %st(3)
; SKX-NEXT: fcmovne %st(4), %st(0)
; SKX-NEXT: fstp %st(4)
; SKX-NEXT: fxch %st(3)
-; SKX-NEXT: fstpt 10(%rdi)
-; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt (%rdi)
; SKX-NEXT: fxch %st(1)
-; SKX-NEXT: fstpt 30(%rdi)
; SKX-NEXT: fstpt 20(%rdi)
+; SKX-NEXT: fxch %st(1)
+; SKX-NEXT: fstpt 10(%rdi)
+; SKX-NEXT: fstpt 30(%rdi)
; SKX-NEXT: retq
bb:
%tmp = select <4 x i1> %m, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
diff --git a/test/CodeGen/X86/pr3366.ll b/test/CodeGen/X86/pr3366.ll
index b89a69ab7d41..f72a35185b47 100644
--- a/test/CodeGen/X86/pr3366.ll
+++ b/test/CodeGen/X86/pr3366.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -disable-cgp-branch-opts | grep movzbl
+; RUN: llc < %s -mtriple=i686-- -disable-cgp-branch-opts | grep movzbl
; PR3366
define void @_ada_c34002a() nounwind {
diff --git a/test/CodeGen/X86/pr33828.ll b/test/CodeGen/X86/pr33828.ll
index 1b7f44323b61..6314ed6bd5bf 100644
--- a/test/CodeGen/X86/pr33828.ll
+++ b/test/CodeGen/X86/pr33828.ll
@@ -6,20 +6,20 @@
define void @foo() {
; X86-LABEL: foo:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movsbl var_580, %eax
; X86-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF
; X86-NEXT: jne .LBB0_1
-; X86-NEXT: # BB#2: # %if.end13
+; X86-NEXT: # %bb.2: # %if.end13
; X86-NEXT: retl
; X86-NEXT: .LBB0_1: # %if.then11
;
; X64-LABEL: foo:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movsbl {{.*}}(%rip), %eax
; X64-NEXT: testl $-536870913, %eax # imm = 0xDFFFFFFF
; X64-NEXT: jne .LBB0_1
-; X64-NEXT: # BB#2: # %if.end13
+; X64-NEXT: # %bb.2: # %if.end13
; X64-NEXT: retq
; X64-NEXT: .LBB0_1: # %if.then11
entry:
diff --git a/test/CodeGen/X86/pr33844.ll b/test/CodeGen/X86/pr33844.ll
index 2585945aa109..f832aca6d497 100644
--- a/test/CodeGen/X86/pr33844.ll
+++ b/test/CodeGen/X86/pr33844.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
define void @patatino() {
; CHECK-LABEL: patatino:
-; CHECK: # BB#0: # %bb
+; CHECK: # %bb.0: # %bb
; CHECK-NEXT: movl {{.*}}(%rip), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $31, %ecx
diff --git a/test/CodeGen/X86/pr33954.ll b/test/CodeGen/X86/pr33954.ll
new file mode 100644
index 000000000000..084873a7dc55
--- /dev/null
+++ b/test/CodeGen/X86/pr33954.ll
@@ -0,0 +1,91 @@
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This test checks that the x86-cmov-converter optimization does not transform a CMOV
+;; instruction when the gain (in cycles) of converting it to a branch is less than
+;; a fixed threshold (measured for "-x86-cmov-converter-threshold=4").
+;;
+;; Test was created using the following command line:
+;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
+;; Where foo.c is:
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;int bar(int *a, int *b, int n) {
+;; int sum = 0;
+;; for (int i = 0; i < n; ++i) {
+;; int x = a[i] * a[i+1] * a[i+2];
+;; int y = b[i] * b[i+1];
+;; sum += y > x ? x : 0;
+;; }
+;; return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Benchmarking the above function shows that the code with CMOV is 25% faster than
+;; the code with a branch.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;#define N 10000
+;;int A[N];
+;;int B[N];
+;;
+;;
+;;
+;;int main () {
+;; for (int i=0; i< N; ++i) {
+;; A[i] = i%4;
+;; B[i] = i%5;
+;; }
+;; int sum = 0;
+;; for (int i=0; i< N*10; ++i)
+;; sum += bar(A, B, N);
+;; return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK-NOT: jg
+; CHECK: cmovle
+define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+ %cmp30 = icmp sgt i32 %n, 0
+ br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %.pre = load i32, i32* %a, align 4
+ %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
+ %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+ %.pre35 = load i32, i32* %b, align 4
+ %wide.trip.count = zext i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
+ ret i32 %sum.0.lcssa
+
+for.body: ; preds = %for.body, %for.body.preheader
+ %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
+ %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
+ %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %mul = mul nsw i32 %1, %2
+ %3 = add nuw nsw i64 %indvars.iv, 2
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
+ %4 = load i32, i32* %arrayidx5, align 4
+ %mul6 = mul nsw i32 %mul, %4
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+ %5 = load i32, i32* %arrayidx11, align 4
+ %mul12 = mul nsw i32 %5, %0
+ %cmp13 = icmp sgt i32 %mul12, %mul6
+ %cond = select i1 %cmp13, i32 %mul6, i32 0
+ %add14 = add nsw i32 %cond, %sum.032
+ %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = {"target-cpu"="skylake"}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 5.0.0 (trunk)"}
diff --git a/test/CodeGen/X86/pr33960.ll b/test/CodeGen/X86/pr33960.ll
index fb9236d3ffa2..34af4df94551 100644
--- a/test/CodeGen/X86/pr33960.ll
+++ b/test/CodeGen/X86/pr33960.ll
@@ -6,12 +6,12 @@
define void @PR33960() {
; X86-LABEL: PR33960:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl $0, b
; X86-NEXT: retl
;
; X64-LABEL: PR33960:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl $0, {{.*}}(%rip)
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/pr34080.ll b/test/CodeGen/X86/pr34080.ll
new file mode 100644
index 000000000000..72dbf3c48516
--- /dev/null
+++ b/test/CodeGen/X86/pr34080.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 -mcpu=x86-64 | FileCheck %s --check-prefix=SSE2-BROKEN
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse3 -mcpu=prescott | FileCheck %s --check-prefix=SSE3
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx -mcpu=sandybridge | FileCheck %s --check-prefix=AVX
+
+define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 {
+; SSE2-LABEL: _Z1fe:
+; SSE2: ## %bb.0: ## %entry
+; SSE2-NEXT: pushq %rbp
+; SSE2-NEXT: .cfi_def_cfa_offset 16
+; SSE2-NEXT: .cfi_offset %rbp, -16
+; SSE2-NEXT: movq %rsp, %rbp
+; SSE2-NEXT: .cfi_def_cfa_register %rbp
+; SSE2-NEXT: fldt 16(%rbp)
+; SSE2-NEXT: fnstcw -4(%rbp)
+; SSE2-NEXT: movzwl -4(%rbp), %eax
+; SSE2-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F
+; SSE2-NEXT: fldcw -4(%rbp)
+; SSE2-NEXT: movw %ax, -4(%rbp)
+; SSE2-NEXT: fistl -8(%rbp)
+; SSE2-NEXT: fldcw -4(%rbp)
+; SSE2-NEXT: cvtsi2sdl -8(%rbp), %xmm0
+; SSE2-NEXT: movsd %xmm0, -64(%rbp)
+; SSE2-NEXT: movsd %xmm0, -32(%rbp)
+; SSE2-NEXT: fsubl -32(%rbp)
+; SSE2-NEXT: flds {{.*}}(%rip)
+; SSE2-NEXT: fmul %st(0), %st(1)
+; SSE2-NEXT: fnstcw -2(%rbp)
+; SSE2-NEXT: movzwl -2(%rbp), %eax
+; SSE2-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F
+; SSE2-NEXT: fldcw -2(%rbp)
+; SSE2-NEXT: movw %ax, -2(%rbp)
+; SSE2-NEXT: fxch %st(1)
+; SSE2-NEXT: fistl -12(%rbp)
+; SSE2-NEXT: fldcw -2(%rbp)
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2sdl -12(%rbp), %xmm0
+; SSE2-NEXT: movsd %xmm0, -56(%rbp)
+; SSE2-NEXT: movsd %xmm0, -24(%rbp)
+; SSE2-NEXT: fsubl -24(%rbp)
+; SSE2-NEXT: fmulp %st(1)
+; SSE2-NEXT: fstpl -48(%rbp)
+; SSE2-NEXT: popq %rbp
+; SSE2-NEXT: retq
+;
+; SSE2-BROKEN-LABEL: _Z1fe:
+; SSE2-BROKEN: ## %bb.0: ## %entry
+; SSE2-BROKEN-NEXT: pushq %rbp
+; SSE2-BROKEN-NEXT: .cfi_def_cfa_offset 16
+; SSE2-BROKEN-NEXT: .cfi_offset %rbp, -16
+; SSE2-BROKEN-NEXT: movq %rsp, %rbp
+; SSE2-BROKEN-NEXT: .cfi_def_cfa_register %rbp
+; SSE2-BROKEN-NEXT: fnstcw -4(%rbp)
+; SSE2-BROKEN-NEXT: fldt 16(%rbp)
+; SSE2-BROKEN-NEXT: movzwl -4(%rbp), %eax
+; SSE2-BROKEN-NEXT: movw $3199, -4(%rbp) ## imm = 0xC7F
+; SSE2-BROKEN-NEXT: fldcw -4(%rbp)
+; SSE2-BROKEN-NEXT: movw %ax, -4(%rbp)
+; SSE2-BROKEN-NEXT: fistl -8(%rbp)
+; SSE2-BROKEN-NEXT: fldcw -4(%rbp)
+; SSE2-BROKEN-NEXT: cvtsi2sdl -8(%rbp), %xmm0
+; SSE2-BROKEN-NEXT: movsd %xmm0, -64(%rbp)
+; SSE2-BROKEN-NEXT: movsd %xmm0, -32(%rbp)
+; SSE2-BROKEN-NEXT: fsubl -32(%rbp)
+; SSE2-BROKEN-NEXT: flds {{.*}}(%rip)
+; SSE2-BROKEN-NEXT: fnstcw -2(%rbp)
+; SSE2-BROKEN-NEXT: fmul %st(0), %st(1)
+; SSE2-BROKEN-NEXT: movzwl -2(%rbp), %eax
+; SSE2-BROKEN-NEXT: movw $3199, -2(%rbp) ## imm = 0xC7F
+; SSE2-BROKEN-NEXT: fldcw -2(%rbp)
+; SSE2-BROKEN-NEXT: movw %ax, -2(%rbp)
+; SSE2-BROKEN-NEXT: fxch %st(1)
+; SSE2-BROKEN-NEXT: fistl -12(%rbp)
+; SSE2-BROKEN-NEXT: fldcw -2(%rbp)
+; SSE2-BROKEN-NEXT: xorps %xmm0, %xmm0
+; SSE2-BROKEN-NEXT: cvtsi2sdl -12(%rbp), %xmm0
+; SSE2-BROKEN-NEXT: movsd %xmm0, -56(%rbp)
+; SSE2-BROKEN-NEXT: movsd %xmm0, -24(%rbp)
+; SSE2-BROKEN-NEXT: fsubl -24(%rbp)
+; SSE2-BROKEN-NEXT: fmulp %st(1)
+; SSE2-BROKEN-NEXT: fstpl -48(%rbp)
+; SSE2-BROKEN-NEXT: popq %rbp
+; SSE2-BROKEN-NEXT: retq
+;
+; SSE3-LABEL: _Z1fe:
+; SSE3: ## %bb.0: ## %entry
+; SSE3-NEXT: pushq %rbp
+; SSE3-NEXT: .cfi_def_cfa_offset 16
+; SSE3-NEXT: .cfi_offset %rbp, -16
+; SSE3-NEXT: movq %rsp, %rbp
+; SSE3-NEXT: .cfi_def_cfa_register %rbp
+; SSE3-NEXT: fldt 16(%rbp)
+; SSE3-NEXT: fld %st(0)
+; SSE3-NEXT: fisttpl -4(%rbp)
+; SSE3-NEXT: cvtsi2sdl -4(%rbp), %xmm0
+; SSE3-NEXT: movsd %xmm0, -48(%rbp)
+; SSE3-NEXT: movsd %xmm0, -24(%rbp)
+; SSE3-NEXT: fsubl -24(%rbp)
+; SSE3-NEXT: flds {{.*}}(%rip)
+; SSE3-NEXT: fmul %st(0), %st(1)
+; SSE3-NEXT: fld %st(1)
+; SSE3-NEXT: fisttpl -8(%rbp)
+; SSE3-NEXT: xorps %xmm0, %xmm0
+; SSE3-NEXT: cvtsi2sdl -8(%rbp), %xmm0
+; SSE3-NEXT: movsd %xmm0, -40(%rbp)
+; SSE3-NEXT: movsd %xmm0, -16(%rbp)
+; SSE3-NEXT: fxch %st(1)
+; SSE3-NEXT: fsubl -16(%rbp)
+; SSE3-NEXT: fmulp %st(1)
+; SSE3-NEXT: fstpl -32(%rbp)
+; SSE3-NEXT: popq %rbp
+; SSE3-NEXT: retq
+;
+; AVX-LABEL: _Z1fe:
+; AVX: ## %bb.0: ## %entry
+; AVX-NEXT: pushq %rbp
+; AVX-NEXT: .cfi_def_cfa_offset 16
+; AVX-NEXT: .cfi_offset %rbp, -16
+; AVX-NEXT: movq %rsp, %rbp
+; AVX-NEXT: .cfi_def_cfa_register %rbp
+; AVX-NEXT: fldt 16(%rbp)
+; AVX-NEXT: fld %st(0)
+; AVX-NEXT: fisttpl -4(%rbp)
+; AVX-NEXT: vcvtsi2sdl -4(%rbp), %xmm0, %xmm0
+; AVX-NEXT: vmovsd %xmm0, -48(%rbp)
+; AVX-NEXT: vmovsd %xmm0, -24(%rbp)
+; AVX-NEXT: fsubl -24(%rbp)
+; AVX-NEXT: flds {{.*}}(%rip)
+; AVX-NEXT: fmul %st(0), %st(1)
+; AVX-NEXT: fld %st(1)
+; AVX-NEXT: fisttpl -8(%rbp)
+; AVX-NEXT: vcvtsi2sdl -8(%rbp), %xmm1, %xmm0
+; AVX-NEXT: vmovsd %xmm0, -40(%rbp)
+; AVX-NEXT: vmovsd %xmm0, -16(%rbp)
+; AVX-NEXT: fxch %st(1)
+; AVX-NEXT: fsubl -16(%rbp)
+; AVX-NEXT: fmulp %st(1)
+; AVX-NEXT: fstpl -32(%rbp)
+; AVX-NEXT: popq %rbp
+; AVX-NEXT: retq
+entry:
+ %tx = alloca [3 x double], align 16
+ %0 = bitcast [3 x double]* %tx to i8*
+ %conv = fptosi x86_fp80 %z to i32
+ %conv1 = sitofp i32 %conv to double
+ %arrayidx = getelementptr inbounds [3 x double], [3 x double]* %tx, i64 0, i64 0
+ store double %conv1, double* %arrayidx, align 16
+ %conv4 = fpext double %conv1 to x86_fp80
+ %sub = fsub x86_fp80 %z, %conv4
+ %mul = fmul x86_fp80 %sub, 0xK40178000000000000000
+ %conv.1 = fptosi x86_fp80 %mul to i32
+ %conv1.1 = sitofp i32 %conv.1 to double
+ %arrayidx.1 = getelementptr inbounds [3 x double], [3 x double]* %tx, i64 0, i64 1
+ store double %conv1.1, double* %arrayidx.1, align 8
+ %conv4.1 = fpext double %conv1.1 to x86_fp80
+ %sub.1 = fsub x86_fp80 %mul, %conv4.1
+ %mul.1 = fmul x86_fp80 %sub.1, 0xK40178000000000000000
+ %conv5 = fptrunc x86_fp80 %mul.1 to double
+ %arrayidx6 = getelementptr inbounds [3 x double], [3 x double]* %tx, i64 0, i64 2
+ store double %conv5, double* %arrayidx6, align 16
+ ret void
+}
+
+attributes #0 = { noinline uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/pr34088.ll b/test/CodeGen/X86/pr34088.ll
index d3667e3884d4..4fa24a506489 100644
--- a/test/CodeGen/X86/pr34088.ll
+++ b/test/CodeGen/X86/pr34088.ll
@@ -9,14 +9,11 @@
;
define i32 @pr34088() local_unnamed_addr {
; CHECK-LABEL: pr34088:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl %esp, %ebp
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_register %ebp
; CHECK-NEXT: andl $-16, %esp
; CHECK-NEXT: subl $32, %esp
@@ -25,8 +22,8 @@ define i32 @pr34088() local_unnamed_addr {
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movaps %xmm0, (%esp)
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movaps %xmm1, (%esp)
; CHECK-NEXT: movl $-842150451, {{[0-9]+}}(%esp) # imm = 0xCDCDCDCD
+; CHECK-NEXT: movaps %xmm1, (%esp)
; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
diff --git a/test/CodeGen/X86/pr34137.ll b/test/CodeGen/X86/pr34137.ll
index 3b767e4f96b0..6210103db17a 100644
--- a/test/CodeGen/X86/pr34137.ll
+++ b/test/CodeGen/X86/pr34137.ll
@@ -7,9 +7,9 @@
define void @pr34127() {
; CHECK-LABEL: pr34127:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzwl {{.*}}(%rip), %eax
-; CHECK-NEXT: movw {{.*}}(%rip), %cx
+; CHECK-NEXT: movzwl {{.*}}(%rip), %ecx
; CHECK-NEXT: andw %ax, %cx
; CHECK-NEXT: andl %eax, %ecx
; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/pr34139.ll b/test/CodeGen/X86/pr34139.ll
index c20c2cd510c7..e5c7c5be6540 100644
--- a/test/CodeGen/X86/pr34139.ll
+++ b/test/CodeGen/X86/pr34139.ll
@@ -3,17 +3,9 @@
define void @f_f(<16 x double>* %ptr) {
; CHECK-LABEL: f_f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovdqa %xmm0, (%rax)
-; CHECK-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovapd (%rdi), %zmm1
-; CHECK-NEXT: vmovapd 64(%rdi), %zmm2
-; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
-; CHECK-NEXT: vmovapd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovapd %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovapd %zmm2, 64(%rdi)
-; CHECK-NEXT: vmovapd %zmm1, (%rdi)
store <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8>* undef
%load_mask8.i.i.i = load <16 x i8>, <16 x i8>* undef
%v.i.i.i.i = load <16 x double>, <16 x double>* %ptr
diff --git a/test/CodeGen/X86/pr34149.ll b/test/CodeGen/X86/pr34149.ll
new file mode 100644
index 000000000000..017d68553fd8
--- /dev/null
+++ b/test/CodeGen/X86/pr34149.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell | FileCheck %s
+
+declare <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
+
+define <4 x double> @via_minnum(<4 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: via_minnum:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm2
+; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %z = call fast <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
+ ret <4 x double> %z
+}
+
+define <4 x double> @via_maxnum(<4 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: via_maxnum:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmaxpd %ymm0, %ymm1, %ymm2
+; CHECK-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: retq
+ %z = call fast <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y) readnone
+ ret <4 x double> %z
+}
+
+define <4 x double> @via_fcmp(<4 x double> %x, <4 x double> %y) {
+; CHECK-LABEL: via_fcmp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vminpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %c = fcmp ule <4 x double> %x, %y
+ %z = select <4 x i1> %c, <4 x double> %x, <4 x double> %y
+ ret <4 x double> %z
+}
+
+
diff --git a/test/CodeGen/X86/pr34177.ll b/test/CodeGen/X86/pr34177.ll
index 7c210058ae6c..5904e5df4a1c 100644
--- a/test/CodeGen/X86/pr34177.ll
+++ b/test/CodeGen/X86/pr34177.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() local_unnamed_addr {
; CHECK-LABEL: test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3]
; CHECK-NEXT: vpextrq $1, %xmm0, %rax
; CHECK-NEXT: vmovq %xmm0, %rcx
diff --git a/test/CodeGen/X86/pr34271-1.ll b/test/CodeGen/X86/pr34271-1.ll
index 2e2f0fd0aa94..d341ceb1c11e 100644
--- a/test/CodeGen/X86/pr34271-1.ll
+++ b/test/CodeGen/X86/pr34271-1.ll
@@ -3,7 +3,7 @@
define <16 x i16> @foo(<16 x i32> %i) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpminud {{.*}}(%rip){1to16}, %zmm0, %zmm0
; CHECK-NEXT: vpmovdw %zmm0, %ymm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/pr34271.ll b/test/CodeGen/X86/pr34271.ll
index 40d01617c30d..d626f8f6bf9f 100644
--- a/test/CodeGen/X86/pr34271.ll
+++ b/test/CodeGen/X86/pr34271.ll
@@ -6,7 +6,7 @@
define <4 x i32> @f(<4 x i32> %a) {
; CHECK-LABEL: f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: paddd .LCPI0_0(%rip), %xmm0
; CHECK-NEXT: retq
%v = add nuw nsw <4 x i32> %a, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
diff --git a/test/CodeGen/X86/pr34381.ll b/test/CodeGen/X86/pr34381.ll
new file mode 100644
index 000000000000..3053ddda5f89
--- /dev/null
+++ b/test/CodeGen/X86/pr34381.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=slow-incdec | FileCheck %s
+
+@var_21 = external constant i32, align 4
+@var_29 = external constant i8, align 1
+@var_390 = external global i32, align 4
+@var_11 = external constant i8, align 1
+@var_370 = external global i8, align 1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define void @_Z3foov() {
+; CHECK-LABEL: _Z3foov:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movsbl {{.*}}(%rip), %eax
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: cmpl %eax, {{.*}}(%rip)
+; CHECK-NEXT: setb %al
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: addb $-1, %al
+; CHECK-NEXT: sete %cl
+; CHECK-NEXT: movl %ecx, {{.*}}(%rip)
+; CHECK-NEXT: movb {{.*}}(%rip), %al
+; CHECK-NEXT: movb %al, {{.*}}(%rip)
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* @var_21, align 4
+ %1 = load i8, i8* @var_29, align 1
+ %conv = sext i8 %1 to i32
+ %sub = sub nsw i32 0, %conv
+ %cmp = icmp ult i32 %0, %sub
+ %conv1 = zext i1 %cmp to i32
+ %add = add nsw i32 %conv1, -1
+ %conv2 = trunc i32 %add to i8
+ %tobool = icmp ne i8 %conv2, 0
+ %lnot = xor i1 %tobool, true
+ %conv3 = zext i1 %lnot to i32
+ store i32 %conv3, i32* @var_390, align 4
+ %2 = load i8, i8* @var_11, align 1
+ %conv4 = sext i8 %2 to i16
+ %conv5 = trunc i16 %conv4 to i8
+ store i8 %conv5, i8* @var_370, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/pr34397.ll b/test/CodeGen/X86/pr34397.ll
new file mode 100644
index 000000000000..15f36a18479f
--- /dev/null
+++ b/test/CodeGen/X86/pr34397.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell %s -o - > /dev/null
+
+; Fix PR34397
+
+define internal fastcc <32 x i64> @test(<32 x i64> %s.0.6, <32 x i64> %s.0.7) {
+entry:
+ %s.1.6 = shufflevector <32 x i64> %s.0.6, <32 x i64> %s.0.7, <32 x i32> <i32 13, i32 12, i32 51, i32 20, i32 19, i32 14, i32 41, i32 0, i32 21, i32 11, i32 32, i32 32, i32 11, i32 7, i32 19, i32 11, i32 53, i32 11, i32 34, i32 27, i32 35, i32 5, i32 52, i32 29, i32 5, i32 24, i32 3, i32 29, i32 57, i32 0, i32 47, i32 50>
+ %s.2.5 = shufflevector <32 x i64> undef, <32 x i64> %s.1.6, <32 x i32> <i32 7, i32 56, i32 38, i32 60, i32 44, i32 38, i32 26, i32 23, i32 45, i32 52, i32 60, i32 60, i32 54, i32 25, i32 25, i32 51, i32 45, i32 62, i32 20, i32 54, i32 6, i32 30, i32 6, i32 1, i32 47, i32 47, i32 49, i32 19, i32 16, i32 40, i32 24, i32 59>
+ %s.3.4 = shufflevector <32 x i64> undef, <32 x i64> %s.2.5, <32 x i32> <i32 7, i32 38, i32 37, i32 24, i32 41, i32 43, i32 38, i32 43, i32 29, i32 47, i32 11, i32 5, i32 40, i32 61, i32 10, i32 2, i32 37, i32 51, i32 46, i32 53, i32 51, i32 25, i32 20, i32 20, i32 27, i32 60, i32 20, i32 13, i32 45, i32 58, i32 35, i32 28>
+ %s.4.4 = shufflevector <32 x i64> %s.3.4, <32 x i64> undef, <32 x i32> <i32 35, i32 17, i32 38, i32 57, i32 61, i32 9, i32 22, i32 13, i32 62, i32 49, i32 18, i32 21, i32 58, i32 30, i32 23, i32 15, i32 27, i32 11, i32 22, i32 29, i32 62, i32 61, i32 63, i32 45, i32 41, i32 57, i32 59, i32 57, i32 15, i32 58, i32 62, i32 56>
+ %s.5.4 = shufflevector <32 x i64> %s.4.4, <32 x i64> undef, <32 x i32> <i32 33, i32 45, i32 18, i32 59, i32 17, i32 33, i32 35, i32 5, i32 43, i32 30, i32 20, i32 47, i32 22, i32 48, i32 55, i32 59, i32 13, i32 15, i32 2, i32 52, i32 26, i32 57, i32 25, i32 17, i32 17, i32 17, i32 24, i32 46, i32 31, i32 49, i32 47, i32 22>
+ %s.6.3 = shufflevector <32 x i64> undef, <32 x i64> %s.5.4, <32 x i32> <i32 48, i32 15, i32 24, i32 53, i32 57, i32 40, i32 24, i32 33, i32 47, i32 53, i32 44, i32 62, i32 6, i32 15, i32 47, i32 55, i32 0, i32 59, i32 36, i32 63, i32 36, i32 50, i32 53, i32 29, i32 24, i32 49, i32 8, i32 42, i32 17, i32 58, i32 47, i32 38>
+ %s.7.2 = shufflevector <32 x i64> undef, <32 x i64> %s.6.3, <32 x i32> <i32 55, i32 0, i32 3, i32 60, i32 17, i32 20, i32 5, i32 8, i32 61, i32 44, i32 1, i32 50, i32 11, i32 17, i32 48, i32 48, i32 38, i32 41, i32 46, i32 55, i32 15, i32 54, i32 32, i32 60, i32 35, i32 40, i32 27, i32 53, i32 29, i32 44, i32 45, i32 16>
+ %s.8.2 = shufflevector <32 x i64> %s.7.2, <32 x i64> zeroinitializer, <32 x i32> <i32 26, i32 46, i32 1, i32 47, i32 29, i32 1, i32 22, i32 8, i32 55, i32 1, i32 41, i32 25, i32 63, i32 19, i32 42, i32 2, i32 17, i32 45, i32 15, i32 55, i32 52, i32 31, i32 22, i32 9, i32 34, i32 34, i32 36, i32 11, i32 48, i32 34, i32 38, i32 42>
+ %s.9.2 = shufflevector <32 x i64> %s.8.2, <32 x i64> undef, <32 x i32> <i32 19, i32 48, i32 5, i32 38, i32 38, i32 4, i32 41, i32 26, i32 7, i32 34, i32 15, i32 9, i32 22, i32 30, i32 9, i32 59, i32 12, i32 55, i32 30, i32 48, i32 34, i32 15, i32 33, i32 61, i32 63, i32 36, i32 28, i32 28, i32 22, i32 62, i32 50, i32 42>
+ %s.10.1 = shufflevector <32 x i64> undef, <32 x i64> %s.9.2, <32 x i32> <i32 38, i32 5, i32 44, i32 3, i32 4, i32 50, i32 42, i32 43, i32 9, i32 27, i32 14, i32 45, i32 5, i32 63, i32 6, i32 44, i32 49, i32 59, i32 35, i32 45, i32 30, i32 9, i32 54, i32 13, i32 56, i32 12, i32 54, i32 3, i32 21, i32 60, i32 49, i32 53>
+ %s.11.1 = shufflevector <32 x i64> %s.10.1, <32 x i64> undef, <32 x i32> <i32 50, i32 17, i32 3, i32 40, i32 26, i32 29, i32 54, i32 46, i32 2, i32 31, i32 7, i32 18, i32 51, i32 63, i32 42, i32 55, i32 15, i32 13, i32 52, i32 20, i32 50, i32 51, i32 22, i32 2, i32 49, i32 29, i32 61, i32 20, i32 2, i32 49, i32 46, i32 22>
+ %s.12.1 = shufflevector <32 x i64> %s.11.1, <32 x i64> undef, <32 x i32> <i32 26, i32 58, i32 25, i32 54, i32 53, i32 9, i32 17, i32 13, i32 18, i32 48, i32 49, i32 63, i32 19, i32 46, i32 22, i32 28, i32 23, i32 58, i32 58, i32 13, i32 22, i32 1, i32 11, i32 41, i32 19, i32 28, i32 20, i32 37, i32 12, i32 25, i32 26, i32 48>
+ %s.13.1 = shufflevector <32 x i64> %s.12.1, <32 x i64> undef, <32 x i32> <i32 15, i32 26, i32 60, i32 19, i32 16, i32 23, i32 61, i32 31, i32 45, i32 32, i32 17, i32 19, i32 35, i32 6, i32 3, i32 1, i32 19, i32 35, i32 46, i32 14, i32 36, i32 50, i32 38, i32 25, i32 21, i32 38, i32 24, i32 24, i32 5, i32 55, i32 8, i32 33>
+ %s.14.0 = shufflevector <32 x i64> undef, <32 x i64> %s.13.1, <32 x i32> <i32 6, i32 9, i32 23, i32 55, i32 45, i32 0, i32 7, i32 28, i32 6, i32 10, i32 49, i32 63, i32 50, i32 26, i32 9, i32 41, i32 41, i32 30, i32 20, i32 55, i32 17, i32 53, i32 13, i32 10, i32 32, i32 36, i32 24, i32 4, i32 36, i32 21, i32 59, i32 49>
+ %s.15.0 = shufflevector <32 x i64> %s.14.0, <32 x i64> undef, <32 x i32> <i32 8, i32 22, i32 30, i32 45, i32 10, i32 60, i32 27, i32 14, i32 21, i32 26, i32 27, i32 28, i32 5, i32 47, i32 12, i32 47, i32 26, i32 18, i32 27, i32 47, i32 47, i32 20, i32 27, i32 18, i32 60, i32 36, i32 41, i32 62, i32 26, i32 24, i32 25, i32 18>
+ %s.16.0 = shufflevector <32 x i64> %s.15.0, <32 x i64> undef, <32 x i32> <i32 63, i32 62, i32 29, i32 3, i32 49, i32 5, i32 54, i32 9, i32 53, i32 15, i32 11, i32 20, i32 27, i32 17, i32 6, i32 52, i32 35, i32 40, i32 9, i32 36, i32 63, i32 13, i32 60, i32 30, i32 54, i32 26, i32 44, i32 18, i32 47, i32 45, i32 26, i32 33>
+ ret <32 x i64> %s.16.0
+}
diff --git a/test/CodeGen/X86/pr34421.ll b/test/CodeGen/X86/pr34421.ll
new file mode 100644
index 000000000000..8241410be369
--- /dev/null
+++ b/test/CodeGen/X86/pr34421.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-macosx10.13.0 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.13.0 | FileCheck %s --check-prefix=X64
+
+define void @thread_selfcounts() noimplicitfloat noredzone nounwind {
+; X86-LABEL: thread_selfcounts:
+; X86: ## %bb.0: ## %entry
+; X86-NEXT: subl $44, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: ## -- End function
+;
+; X64-LABEL: thread_selfcounts:
+; X64: ## %bb.0: ## %entry
+; X64-NEXT: subq $40, %rsp
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq %rax, (%rsp)
+; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
+; X64-NEXT: ## -- End function
+entry:
+ %counts = alloca [2 x i64], align 16
+ %thread_counts = alloca [3 x i64], align 16
+ %arraydecay = getelementptr inbounds [3 x i64], [3 x i64]* %thread_counts, i64 0, i64 0
+ %0 = load i64, i64* %arraydecay, align 16
+ %arrayidx3 = getelementptr inbounds [2 x i64], [2 x i64]* %counts, i64 0, i64 0
+ store i64 %0, i64* %arrayidx3, align 16
+ %arrayidx6 = getelementptr inbounds [3 x i64], [3 x i64]* %thread_counts, i64 0, i64 1
+ %1 = load i64, i64* %arrayidx6, align 8
+ %arrayidx10 = getelementptr inbounds [2 x i64], [2 x i64]* %counts, i64 0, i64 1
+ store i64 %1, i64* %arrayidx10, align 8
+ unreachable
+}
+
diff --git a/test/CodeGen/X86/pr34605.ll b/test/CodeGen/X86/pr34605.ll
index 8330511d6449..2d51a53dc41e 100644
--- a/test/CodeGen/X86/pr34605.ll
+++ b/test/CodeGen/X86/pr34605.ll
@@ -3,7 +3,7 @@
define void @pr34605(i8* nocapture %s, i32 %p) {
; CHECK-LABEL: pr34605:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm0
; CHECK-NEXT: vpcmpeqd {{\.LCPI.*}}, %zmm0, %k0
@@ -13,21 +13,20 @@ define void @pr34605(i8* nocapture %s, i32 %p) {
; CHECK-NEXT: vpcmpeqd {{\.LCPI.*}}, %zmm0, %k2
; CHECK-NEXT: kunpckwd %k1, %k2, %k1
; CHECK-NEXT: kunpckdq %k0, %k1, %k0
-; CHECK-NEXT: kxord %k0, %k0, %k1
; CHECK-NEXT: movl $1, %ecx
-; CHECK-NEXT: kmovd %ecx, %k2
-; CHECK-NEXT: kunpckdq %k2, %k1, %k1
+; CHECK-NEXT: kmovd %ecx, %k1
+; CHECK-NEXT: kmovd %k1, %k1
; CHECK-NEXT: kandq %k1, %k0, %k1
; CHECK-NEXT: vmovdqu8 {{\.LCPI.*}}, %zmm0 {%k1} {z}
-; CHECK-NEXT: vmovdqu8 %zmm0, (%eax)
-; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqu32 %zmm0, 64(%eax)
-; CHECK-NEXT: vmovdqu32 %zmm0, 128(%eax)
-; CHECK-NEXT: vmovdqu32 %zmm0, 192(%eax)
-; CHECK-NEXT: vmovdqu32 %zmm0, 256(%eax)
-; CHECK-NEXT: vmovdqu32 %zmm0, 320(%eax)
-; CHECK-NEXT: vmovdqu32 %zmm0, 384(%eax)
-; CHECK-NEXT: vmovdqu32 %zmm0, 448(%eax)
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vmovdqu32 %zmm0, (%eax)
+; CHECK-NEXT: vmovups %zmm1, 64(%eax)
+; CHECK-NEXT: vmovups %zmm1, 128(%eax)
+; CHECK-NEXT: vmovups %zmm1, 192(%eax)
+; CHECK-NEXT: vmovups %zmm1, 256(%eax)
+; CHECK-NEXT: vmovups %zmm1, 320(%eax)
+; CHECK-NEXT: vmovups %zmm1, 384(%eax)
+; CHECK-NEXT: vmovups %zmm1, 448(%eax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/pr34629.ll b/test/CodeGen/X86/pr34629.ll
new file mode 100644
index 000000000000..55084b425c72
--- /dev/null
+++ b/test/CodeGen/X86/pr34629.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@b = common local_unnamed_addr global i64 0, align 8
+@a = common local_unnamed_addr global i8 0, align 1
+
+; Function Attrs: norecurse nounwind uwtable
+define void @c() local_unnamed_addr #0 {
+; CHECK-LABEL: c:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq {{.*}}(%rip), %rax
+; CHECK-NEXT: leaq (%rax,%rax,4), %rcx
+; CHECK-NEXT: negq %rcx
+; CHECK-NEXT: leaq (%rax,%rax,8), %rax
+; CHECK-NEXT: leaq (%rax,%rax,4), %rax
+; CHECK-NEXT: testq %rax, %rcx
+; CHECK-NEXT: je .LBB0_2
+; CHECK-NEXT: # %bb.1: # %if.then
+; CHECK-NEXT: movb $0, {{.*}}(%rip)
+; CHECK-NEXT: .LBB0_2: # %if.end
+; CHECK-NEXT: retq
+entry:
+ %0 = load i64, i64* @b, align 8, !tbaa !2
+ %mul = mul nsw i64 %0, -5
+ %mul1 = mul nsw i64 %0, 45
+ %and = and i64 %mul, %mul1
+ %tobool = icmp eq i64 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ store i8 0, i8* @a, align 1, !tbaa !6
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 "}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"long", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!4, !4, i64 0}
diff --git a/test/CodeGen/X86/pr34634.ll b/test/CodeGen/X86/pr34634.ll
new file mode 100644
index 000000000000..9ed78a28d4d9
--- /dev/null
+++ b/test/CodeGen/X86/pr34634.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = common local_unnamed_addr global [1 x [10 x i32]] zeroinitializer, align 16
+@c = common local_unnamed_addr global i32 0, align 4
+@b = common local_unnamed_addr global [1 x [7 x i32]] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @fn1() local_unnamed_addr #0 {
+; CHECK-LABEL: fn1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movslq {{.*}}(%rip), %rax
+; CHECK-NEXT: leaq (%rax,%rax,4), %rcx
+; CHECK-NEXT: leaq (,%rax,4), %rdx
+; CHECK-NEXT: movl a(%rdx,%rcx,8), %ecx
+; CHECK-NEXT: leaq (%rax,%rax,8), %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,2), %rdx
+; CHECK-NEXT: addq %rax, %rdx
+; CHECK-NEXT: movl %ecx, b(%rdx,%rax,4)
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* @c, align 4, !tbaa !2
+ %idxprom = sext i32 %0 to i64
+ %arrayidx2 = getelementptr inbounds [1 x [10 x i32]], [1 x [10 x i32]]* @a, i64 0, i64 %idxprom, i64 %idxprom
+ %1 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+ %arrayidx6 = getelementptr inbounds [1 x [7 x i32]], [1 x [7 x i32]]* @b, i64 0, i64 %idxprom, i64 %idxprom
+ store i32 %1, i32* %arrayidx6, align 4, !tbaa !2
+ ret void
+}
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @main() local_unnamed_addr #0 {
+; CHECK-LABEL: main:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movslq {{.*}}(%rip), %rax
+; CHECK-NEXT: leaq (%rax,%rax,4), %rcx
+; CHECK-NEXT: leaq (,%rax,4), %rdx
+; CHECK-NEXT: movl a(%rdx,%rcx,8), %ecx
+; CHECK-NEXT: leaq (%rax,%rax,8), %rdx
+; CHECK-NEXT: leaq (%rdx,%rdx,2), %rdx
+; CHECK-NEXT: addq %rax, %rdx
+; CHECK-NEXT: movl %ecx, b(%rdx,%rax,4)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = load i32, i32* @c, align 4, !tbaa !2
+ %idxprom.i = sext i32 %0 to i64
+ %arrayidx2.i = getelementptr inbounds [1 x [10 x i32]], [1 x [10 x i32]]* @a, i64 0, i64 %idxprom.i, i64 %idxprom.i
+ %1 = load i32, i32* %arrayidx2.i, align 4, !tbaa !2
+ %arrayidx6.i = getelementptr inbounds [1 x [7 x i32]], [1 x [7 x i32]]* @b, i64 0, i64 %idxprom.i, i64 %idxprom.i
+ store i32 %1, i32* %arrayidx6.i, align 4, !tbaa !2
+ ret i32 0
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0 "}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/pr34653.ll b/test/CodeGen/X86/pr34653.ll
new file mode 100644
index 000000000000..990cd9ac8b27
--- /dev/null
+++ b/test/CodeGen/X86/pr34653.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+avx512f -o - | FileCheck %s
+
+declare fastcc <38 x double> @test()
+
+define void @pr34653() {
+; CHECK-LABEL: pr34653:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: andq $-512, %rsp # imm = 0xFE00
+; CHECK-NEXT: subq $2048, %rsp # imm = 0x800
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: callq test
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: vmovaps %xmm0, %xmm1
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm2
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovaps %xmm3, %xmm4
+; CHECK-NEXT: vmovaps %xmm2, %xmm5
+; CHECK-NEXT: vmovaps %xmm5, %xmm6
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm7
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm8
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm9
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm10
+; CHECK-NEXT: vextractf32x4 $3, %zmm10, %xmm11
+; CHECK-NEXT: vmovaps %xmm11, %xmm12
+; CHECK-NEXT: vextractf32x4 $2, %zmm10, %xmm13
+; CHECK-NEXT: vmovaps %xmm13, %xmm14
+; CHECK-NEXT: vmovaps %xmm10, %xmm15
+; CHECK-NEXT: vmovaps %xmm15, %xmm2
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $3, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vextractf32x4 $2, %zmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
+; CHECK-NEXT: # kill: def %ymm10 killed %ymm10 killed %zmm10
+; CHECK-NEXT: vextractf128 $1, %ymm10, %xmm10
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm10, %xmm0
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: def %ymm9 killed %ymm9 killed %zmm9
+; CHECK-NEXT: vextractf128 $1, %ymm9, %xmm9
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm9, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: def %ymm8 killed %ymm8 killed %zmm8
+; CHECK-NEXT: vextractf128 $1, %ymm8, %xmm8
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm8, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: # kill: def %ymm7 killed %ymm7 killed %zmm7
+; CHECK-NEXT: vextractf128 $1, %ymm7, %xmm7
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps %xmm7, %xmm0
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd {{[0-9]+}}(%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd %xmm8, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm13, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm1, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm14, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm2, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm4, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm9, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm10, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm15, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm11, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm3, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm6, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm5, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm12, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm0, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: vmovsd %xmm7, {{[0-9]+}}(%rsp) # 8-byte Spill
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %v = call fastcc <38 x double> @test()
+ %v.0 = extractelement <38 x double> %v, i32 0
+ ret void
+}
+
diff --git a/test/CodeGen/X86/pr34657.ll b/test/CodeGen/X86/pr34657.ll
new file mode 100644
index 000000000000..58c97f656354
--- /dev/null
+++ b/test/CodeGen/X86/pr34657.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw -o - | FileCheck %s
+
+define <112 x i8> @pr34657() local_unnamed_addr {
+; CHECK-LABEL: pr34657
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vmovups (%rax), %xmm0
+; CHECK-NEXT: vmovups (%rax), %ymm1
+; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovups (%rax), %zmm2
+; CHECK-NEXT: vmovaps %ymm1, 64(%rdi)
+; CHECK-NEXT: vmovaps %zmm2, (%rdi)
+; CHECK-NEXT: vextractf32x4 $2, %zmm0, 96(%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %wide.vec51 = load <112 x i8>, <112 x i8>* undef, align 2
+ ret <112 x i8> %wide.vec51
+}
diff --git a/test/CodeGen/X86/pr34855.ll b/test/CodeGen/X86/pr34855.ll
new file mode 100644
index 000000000000..746d1ff56cca
--- /dev/null
+++ b/test/CodeGen/X86/pr34855.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+define void @PR34855(<2 x i32> *%p0, <2 x i32> *%p1, <2 x i32> *%p2) {
+; X86-LABEL: PR34855:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movsd %xmm0, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: PR34855:
+; X64: # %bb.0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: movq %rax, (%rdx)
+; X64-NEXT: retq
+ %tmp = load <2 x i32>, <2 x i32>* %p0, align 8
+ %tmp1 = load <2 x i32>, <2 x i32>* %p1, align 8
+ %mul = mul <2 x i32> zeroinitializer, %tmp1
+ %mul1 = mul <2 x i32> <i32 -8190, i32 -8190>, %mul
+ %mul2 = mul <2 x i32> <i32 3, i32 3>, %mul1
+ %shr = ashr <2 x i32> %tmp, %mul2
+ store <2 x i32> %shr, <2 x i32>* %p2, align 8
+ ret void
+}
diff --git a/test/CodeGen/X86/pr3522.ll b/test/CodeGen/X86/pr3522.ll
index 9e048d59d4ee..d7a332b1fed7 100644
--- a/test/CodeGen/X86/pr3522.ll
+++ b/test/CodeGen/X86/pr3522.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -stats 2>&1 | not grep "instructions sunk"
+; RUN: llc < %s -stats 2>&1 | not grep "instructions sunk"
; PR3522
target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/pr35272.ll b/test/CodeGen/X86/pr35272.ll
new file mode 100644
index 000000000000..0df1d7cb83ce
--- /dev/null
+++ b/test/CodeGen/X86/pr35272.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s
+
+define <2 x i48> @PR35272(<2 x i64> %a0, <2 x i48> %a1, <2 x i48> %a2) {
+; CHECK-LABEL: PR35272:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpblendmq %xmm1, %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %1 = icmp eq <2 x i64> %a0, zeroinitializer
+ %2 = select <2 x i1> %1, <2 x i48> %a1, <2 x i48> %a2
+ ret <2 x i48> %2
+}
diff --git a/test/CodeGen/X86/pr35399.ll b/test/CodeGen/X86/pr35399.ll
new file mode 100644
index 000000000000..9b4b029b5171
--- /dev/null
+++ b/test/CodeGen/X86/pr35399.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=lzcnt | FileCheck %s
+
+; Make sure we emit opposite setcc instructions.
+define i64 @pr35399(i64, i8*, i8*) {
+; CHECK-LABEL: pr35399:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lzcntq %rdi, %rax
+; CHECK-NEXT: setae (%rsi)
+; CHECK-NEXT: setb (%rdx)
+; CHECK-NEXT: retq
+ %4 = tail call i64 @llvm.ctlz.i64(i64 %0, i1 false)
+ %5 = icmp ne i64 %0, 0
+ %6 = zext i1 %5 to i8
+ store i8 %6, i8* %1, align 1
+ %7 = xor i1 %5, true
+ %8 = zext i1 %7 to i8
+ store i8 %8, i8* %2, align 1
+ ret i64 %4
+}
+
+declare i64 @llvm.ctlz.i64(i64, i1)
diff --git a/test/CodeGen/X86/pr35443.ll b/test/CodeGen/X86/pr35443.ll
new file mode 100644
index 000000000000..ac2c05adb892
--- /dev/null
+++ b/test/CodeGen/X86/pr35443.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s
+
+@ac = external local_unnamed_addr global [20 x i8], align 16
+@ai3 = external local_unnamed_addr global [20 x i32], align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @pr35443() {
+; CHECK-LABEL: pr35443:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movzbl ac+{{.*}}(%rip), %eax
+; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpsubq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpmovqd %ymm0, ai3+{{.*}}(%rip)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+entry:
+ %wide.masked.load66 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([20 x i8], [20 x i8]* @ac, i64 0, i64 4) to <4 x i8>*), i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> undef)
+ %0 = zext <4 x i8> %wide.masked.load66 to <4 x i64>
+ %1 = sub <4 x i64> zeroinitializer, %0
+ %predphi = shufflevector <4 x i64> %1, <4 x i64> undef, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ %2 = trunc <4 x i64> %predphi to <4 x i32>
+ %3 = add <4 x i32> zeroinitializer, %2
+ store <4 x i32> %3, <4 x i32>* bitcast (i32* getelementptr inbounds ([20 x i32], [20 x i32]* @ai3, i64 0, i64 4) to <4 x i32>*), align 16
+ ret void
+}
+
+; Function Attrs: argmemonly nounwind readonly
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
diff --git a/test/CodeGen/X86/pr35636.ll b/test/CodeGen/X86/pr35636.ll
new file mode 100644
index 000000000000..70ff8d83e7e3
--- /dev/null
+++ b/test/CodeGen/X86/pr35636.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver1 | FileCheck %s
+
+define void @_Z15uint64_to_asciimPc(i64 %arg) {
+; CHECK-LABEL: _Z15uint64_to_asciimPc:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: movabsq $811296384146066817, %rax # imm = 0xB424DC35095CD81
+; CHECK-NEXT: movq %rdi, %rdx
+; CHECK-NEXT: mulxq %rax, %rax, %rcx
+; CHECK-NEXT: shrq $42, %rcx
+; CHECK-NEXT: imulq $281474977, %rcx, %rax # imm = 0x10C6F7A1
+; CHECK-NEXT: shrq $20, %rax
+; CHECK-NEXT: leal 5(%rax,%rax,4), %eax
+; CHECK-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF
+; CHECK-NEXT: leal (%rax,%rax,4), %eax
+; CHECK-NEXT: shrl $26, %eax
+; CHECK-NEXT: orb $48, %al
+; CHECK-NEXT: movb %al, (%rax)
+; CHECK-NEXT: retq
+bb:
+ %tmp = udiv i64 %arg, 100000000000000
+ %tmp1 = mul nuw nsw i64 %tmp, 281474977
+ %tmp2 = lshr i64 %tmp1, 20
+ %tmp3 = trunc i64 %tmp2 to i32
+ %tmp4 = add nuw nsw i32 %tmp3, 1
+ %tmp5 = and i32 %tmp4, 268435455
+ %tmp6 = mul nuw nsw i32 %tmp5, 5
+ %tmp7 = and i32 %tmp6, 134217727
+ %tmp8 = mul nuw nsw i32 %tmp7, 5
+ %tmp9 = lshr i32 %tmp8, 26
+ %tmp10 = trunc i32 %tmp9 to i8
+ %tmp11 = or i8 %tmp10, 48
+ store i8 %tmp11, i8* undef, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
index 259d55b030e5..7da7c299791e 100644
--- a/test/CodeGen/X86/pr5145.ll
+++ b/test/CodeGen/X86/pr5145.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
@sc8 = external global i8
define void @atomic_maxmin_i8() {
diff --git a/test/CodeGen/X86/pr7882.ll b/test/CodeGen/X86/pr7882.ll
index 88404dbe125e..13cece8a3366 100644
--- a/test/CodeGen/X86/pr7882.ll
+++ b/test/CodeGen/X86/pr7882.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin -pre-RA-sched=fast \
+; RUN: llc < %s -mtriple=i686-apple-darwin -pre-RA-sched=fast \
; RUN: | FileCheck %s
; make sure scheduler honors the flags clobber. PR 7882.
diff --git a/test/CodeGen/X86/pr9743.ll b/test/CodeGen/X86/pr9743.ll
index 6597c235330c..73b3c7f835c5 100644
--- a/test/CodeGen/X86/pr9743.ll
+++ b/test/CodeGen/X86/pr9743.ll
@@ -6,12 +6,9 @@ define void @f() {
; CHECK: .cfi_startproc
; CHECK-NEXT: pushq
-; CHECK-NEXT: :
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: :
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: :
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: ret
diff --git a/test/CodeGen/X86/pre-coalesce.mir b/test/CodeGen/X86/pre-coalesce.mir
index 17d447dd097b..fb9429bc1484 100644
--- a/test/CodeGen/X86/pre-coalesce.mir
+++ b/test/CodeGen/X86/pre-coalesce.mir
@@ -40,7 +40,7 @@
---
# Check A = B and B = A copies will not exist in the loop at the same time.
# CHECK: name: foo
-# CHECK: [[L1:bb.3.while.body]]:
+# CHECK: [[L1:bb.3]].{{[a-zA-Z0-9.]+}}:
# CHECK: %[[REGA:.*]] = COPY %[[REGB:.*]]
# CHECK-NOT: %[[REGB]] = COPY %[[REGA]]
# CHECK: JNE_1 %[[L1]]
@@ -83,15 +83,15 @@ frameInfo:
hasMustTailInVarArgFunc: false
body: |
bb.0.entry:
- %0 = MOV64rm %rip, 1, _, @b, _ :: (dereferenceable load 8 from @b)
- %12 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.t0)
+ %0 = MOV64rm %rip, 1, %noreg, @b, %noreg :: (dereferenceable load 8 from @b)
+ %12 = MOV8rm %0, 1, %noreg, 0, %noreg :: (load 1 from %ir.t0)
TEST8rr %12, %12, implicit-def %eflags
- %11 = MOV32rm %rip, 1, _, @a, _ :: (dereferenceable load 4 from @a)
- JNE_1 %bb.1.while.body.preheader, implicit killed %eflags
+ %11 = MOV32rm %rip, 1, %noreg, @a, %noreg :: (dereferenceable load 4 from @a)
+ JNE_1 %bb.1, implicit killed %eflags
bb.4:
%10 = COPY %11
- JMP_1 %bb.3.while.end
+ JMP_1 %bb.3
bb.1.while.body.preheader:
@@ -101,12 +101,12 @@ body: |
%10 = SHL32ri %10, 5, implicit-def dead %eflags
%10 = ADD32rr %10, %11, implicit-def dead %eflags
%10 = ADD32rr %10, %8, implicit-def dead %eflags
- MOV32mr %rip, 1, _, @a, _, %10 :: (store 4 into @a)
- %12 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.t0)
+ MOV32mr %rip, 1, %noreg, @a, %noreg, %10 :: (store 4 into @a)
+ %12 = MOV8rm %0, 1, %noreg, 0, %noreg :: (load 1 from %ir.t0)
TEST8rr %12, %12, implicit-def %eflags
%11 = COPY %10
- JNE_1 %bb.2.while.body, implicit killed %eflags
- JMP_1 %bb.3.while.end
+ JNE_1 %bb.2, implicit killed %eflags
+ JMP_1 %bb.3
bb.3.while.end:
%eax = COPY %10
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index d6571acbbb7e..17a9ac994a79 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
-; RUN: llc < %s -march=x86 -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -march=x86 -mcpu=slm | FileCheck %s -check-prefix=SLM
-; RUN: llc < %s -march=x86 -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -march=x86 -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
+; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
; rdar://10538297
diff --git a/test/CodeGen/X86/prolog-push-seq.ll b/test/CodeGen/X86/prolog-push-seq.ll
index f23791aef922..99095104d0f4 100644
--- a/test/CodeGen/X86/prolog-push-seq.ll
+++ b/test/CodeGen/X86/prolog-push-seq.ll
@@ -16,4 +16,4 @@ define fastcc void @foo(i32 %a, i32 %b) #0 {
ret void
}
-attributes #0 = { nounwind optsize "no-frame-pointer-elim-non-leaf"} \ No newline at end of file
+attributes #0 = { nounwind optsize "no-frame-pointer-elim-non-leaf"}
diff --git a/test/CodeGen/X86/prologue-epilogue-remarks.mir b/test/CodeGen/X86/prologue-epilogue-remarks.mir
new file mode 100644
index 000000000000..a57305d80bf8
--- /dev/null
+++ b/test/CodeGen/X86/prologue-epilogue-remarks.mir
@@ -0,0 +1,58 @@
+# RUN: llc -mtriple=x86_64-unknown-unknown -run-pass=prologepilog -pass-remarks-output=%t -pass-remarks-analysis=prologepilog -o /dev/null %s 2>&1
+# RUN: cat %t | FileCheck %s
+...
+---
+name: fun0
+stack:
+ - { id: 0, type: default, offset: 0, size: 8, alignment: 4 }
+# --- !Analysis
+# CHECK: Pass: prologepilog
+# CHECK-NEXT: Name: StackSize
+# CHECK-NEXT: Function: fun0
+# CHECK-NEXT: Args:
+# CHECK-NEXT: - NumStackBytes: '8'
+# CHECK-NEXT: - String: ' stack bytes in function'
+# CHECK-NEXT: ...
+
+constants:
+body: |
+ bb.0:
+ RETQ
+
+...
+---
+name: fun1
+stack:
+ - { id: 0, type: default, offset: 0, size: 19, alignment: 4 }
+# --- !Analysis
+# CHECK: Pass: prologepilog
+# CHECK-NEXT: Name: StackSize
+# CHECK-NEXT: Function: fun1
+# CHECK-NEXT: Args:
+# CHECK-NEXT: - NumStackBytes: '20'
+# CHECK-NEXT: - String: ' stack bytes in function'
+# CHECK-NEXT: ...
+constants:
+body: |
+ bb.0:
+ RETQ
+
+...
+---
+name: fun2
+stack:
+ - { id: 0, type: default, offset: 0, size: 1024, alignment: 8 }
+# --- !Analysis
+# CHECK: Pass: prologepilog
+# CHECK-NEXT: Name: StackSize
+# CHECK-NEXT: Function: fun2
+# CHECK-NEXT: Args:
+# CHECK-NEXT: - NumStackBytes: '1024'
+# CHECK-NEXT: - String: ' stack bytes in function'
+# CHECK-NEXT: ...
+constants:
+body: |
+ bb.0:
+ RETQ
+
+...
diff --git a/test/CodeGen/X86/promote-trunc.ll b/test/CodeGen/X86/promote-trunc.ll
index a20557a1fef2..d42108e5b508 100644
--- a/test/CodeGen/X86/promote-trunc.ll
+++ b/test/CodeGen/X86/promote-trunc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
define<4 x i8> @func_8_64() {
%F = load <4 x i64>, <4 x i64>* undef
diff --git a/test/CodeGen/X86/promote-vec3.ll b/test/CodeGen/X86/promote-vec3.ll
index 42aeeb14739d..085e14ecb3b3 100644
--- a/test/CodeGen/X86/promote-vec3.ll
+++ b/test/CodeGen/X86/promote-vec3.ll
@@ -7,7 +7,7 @@
define <3 x i16> @zext_i8(<3 x i8>) {
; SSE3-LABEL: zext_i8:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -19,13 +19,13 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; SSE3-NEXT: pextrw $2, %xmm0, %ecx
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE3-NEXT: movd %xmm0, %eax
-; SSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SSE3-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; SSE3-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; SSE3-NEXT: # kill: def %ax killed %ax killed %eax
+; SSE3-NEXT: # kill: def %dx killed %dx killed %edx
+; SSE3-NEXT: # kill: def %cx killed %cx killed %ecx
; SSE3-NEXT: retl
;
; SSE41-LABEL: zext_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0
; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
@@ -33,13 +33,13 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: pextrw $2, %xmm0, %edx
; SSE41-NEXT: pextrw $4, %xmm0, %ecx
-; SSE41-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SSE41-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; SSE41-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; SSE41-NEXT: # kill: def %ax killed %ax killed %eax
+; SSE41-NEXT: # kill: def %dx killed %dx killed %edx
+; SSE41-NEXT: # kill: def %cx killed %cx killed %ecx
; SSE41-NEXT: retl
;
; AVX-32-LABEL: zext_i8:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -47,13 +47,13 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; AVX-32-NEXT: vmovd %xmm0, %eax
; AVX-32-NEXT: vpextrw $2, %xmm0, %edx
; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX-32-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; AVX-32-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; AVX-32-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX-32-NEXT: # kill: def %dx killed %dx killed %edx
+; AVX-32-NEXT: # kill: def %cx killed %cx killed %ecx
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: zext_i8:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -61,9 +61,9 @@ define <3 x i16> @zext_i8(<3 x i8>) {
; AVX-64-NEXT: vmovd %xmm0, %eax
; AVX-64-NEXT: vpextrw $2, %xmm0, %edx
; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX-64-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; AVX-64-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; AVX-64-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX-64-NEXT: # kill: def %dx killed %dx killed %edx
+; AVX-64-NEXT: # kill: def %cx killed %cx killed %ecx
; AVX-64-NEXT: retq
%2 = zext <3 x i8> %0 to <3 x i16>
ret <3 x i16> %2
@@ -71,7 +71,7 @@ define <3 x i16> @zext_i8(<3 x i8>) {
define <3 x i16> @sext_i8(<3 x i8>) {
; SSE3-LABEL: sext_i8:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -85,13 +85,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $2, %xmm0, %edx
; SSE3-NEXT: pextrw $4, %xmm0, %ecx
-; SSE3-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SSE3-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; SSE3-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; SSE3-NEXT: # kill: def %ax killed %ax killed %eax
+; SSE3-NEXT: # kill: def %dx killed %dx killed %edx
+; SSE3-NEXT: # kill: def %cx killed %cx killed %ecx
; SSE3-NEXT: retl
;
; SSE41-LABEL: sext_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
@@ -100,13 +100,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: pextrw $2, %xmm0, %edx
; SSE41-NEXT: pextrw $4, %xmm0, %ecx
-; SSE41-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SSE41-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; SSE41-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; SSE41-NEXT: # kill: def %ax killed %ax killed %eax
+; SSE41-NEXT: # kill: def %dx killed %dx killed %edx
+; SSE41-NEXT: # kill: def %cx killed %cx killed %ecx
; SSE41-NEXT: retl
;
; AVX-32-LABEL: sext_i8:
-; AVX-32: # BB#0:
+; AVX-32: # %bb.0:
; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
@@ -115,13 +115,13 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; AVX-32-NEXT: vmovd %xmm0, %eax
; AVX-32-NEXT: vpextrw $2, %xmm0, %edx
; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-32-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX-32-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; AVX-32-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; AVX-32-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX-32-NEXT: # kill: def %dx killed %dx killed %edx
+; AVX-32-NEXT: # kill: def %cx killed %cx killed %ecx
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: sext_i8:
-; AVX-64: # BB#0:
+; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovd %edi, %xmm0
; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
@@ -130,9 +130,9 @@ define <3 x i16> @sext_i8(<3 x i8>) {
; AVX-64-NEXT: vmovd %xmm0, %eax
; AVX-64-NEXT: vpextrw $2, %xmm0, %edx
; AVX-64-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX-64-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX-64-NEXT: # kill: %DX<def> %DX<kill> %EDX<kill>
-; AVX-64-NEXT: # kill: %CX<def> %CX<kill> %ECX<kill>
+; AVX-64-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX-64-NEXT: # kill: def %dx killed %dx killed %edx
+; AVX-64-NEXT: # kill: def %cx killed %cx killed %ecx
; AVX-64-NEXT: retq
%2 = sext <3 x i8> %0 to <3 x i16>
ret <3 x i16> %2
diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll
index 38cdc14b380f..37dfc881a59a 100644
--- a/test/CodeGen/X86/promote.ll
+++ b/test/CodeGen/X86/promote.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mcpu=corei7 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i8:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/pseudo_cmov_lower2.ll b/test/CodeGen/X86/pseudo_cmov_lower2.ll
index 38712a96b2bf..1a61b0b97000 100644
--- a/test/CodeGen/X86/pseudo_cmov_lower2.ll
+++ b/test/CodeGen/X86/pseudo_cmov_lower2.ll
@@ -51,7 +51,7 @@ entry:
; CHECK-LABEL: foo3:
; CHECK: js
; CHECK-NOT: js
-; CHECK-LABEL: # BB#1:
+; CHECK-LABEL: # %bb.1:
; CHECK-DAG: movapd %xmm2, %xmm1
; CHECK-DAG: movapd %xmm2, %xmm0
; CHECK-LABEL:.LBB2_2:
@@ -81,7 +81,7 @@ entry:
; CHECK-LABEL: foo4:
; CHECK: js
; CHECK-NOT: js
-; CHECK-LABEL: # BB#1:
+; CHECK-LABEL: # %bb.1:
; CHECK-DAG: movapd %xmm2, %xmm1
; CHECK-DAG: movapd %xmm2, %xmm0
; CHECK-LABEL:.LBB3_2:
diff --git a/test/CodeGen/X86/pshufb-mask-comments.ll b/test/CodeGen/X86/pshufb-mask-comments.ll
index 178fe3357d43..0900fdccb49b 100644
--- a/test/CodeGen/X86/pshufb-mask-comments.ll
+++ b/test/CodeGen/X86/pshufb-mask-comments.ll
@@ -5,7 +5,7 @@
define <16 x i8> @test1(<16 x i8> %V) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,0,0,0,2,0,0,0,0,3,0,0,0,0,4]
; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 0, i8 0, i8 2, i8 0, i8 0, i8 0, i8 0, i8 3, i8 0, i8 0, i8 0, i8 0, i8 4>)
@@ -16,7 +16,7 @@ define <16 x i8> @test1(<16 x i8> %V) {
define <16 x i8> @test2(<16 x i8> %V) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2]
; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 15, i8 0, i8 0, i8 0, i8 0, i8 16, i8 0, i8 0, i8 0, i8 0, i8 17, i8 0, i8 0, i8 0, i8 0, i8 50>)
@@ -27,7 +27,7 @@ define <16 x i8> @test2(<16 x i8> %V) {
define <16 x i8> @test3(<16 x i8> %V) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,0,15,0,2,0,0],zero,xmm0[0,3,0,0],zero,xmm0[0,4]
; CHECK-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %V, <16 x i8> <i8 1, i8 0, i8 0, i8 127, i8 0, i8 2, i8 0, i8 0, i8 128, i8 0, i8 3, i8 0, i8 0, i8 255, i8 0, i8 4>)
@@ -38,7 +38,7 @@ define <16 x i8> @test3(<16 x i8> %V) {
define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1084818905618843912,506097522914230528]
; CHECK-NEXT: movaps %xmm1, (%rdi)
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -53,7 +53,7 @@ define <16 x i8> @test4(<16 x i8> %V, <2 x i64>* %P) {
define <16 x i8> @test5(<16 x i8> %V) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: movq %rax, %xmm1
; CHECK-NEXT: movdqa %xmm1, (%rax)
@@ -74,7 +74,7 @@ define <16 x i8> @test5(<16 x i8> %V) {
define <16 x i8> @test6(<16 x i8> %V, <2 x i64>* %P) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm1 = [217019414673948672,506380106026255364]
; CHECK-NEXT: movaps %xmm1, (%rdi)
; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
diff --git a/test/CodeGen/X86/pshufd-combine-crash.ll b/test/CodeGen/X86/pshufd-combine-crash.ll
index 84c69e32bcc3..3f181b43c781 100644
--- a/test/CodeGen/X86/pshufd-combine-crash.ll
+++ b/test/CodeGen/X86/pshufd-combine-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -debug
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 -debug
; REQUIRES: asserts
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index a1f1e084d330..8642bc596f39 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -1,20 +1,26 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
define <8 x i16> @test1(<8 x i16> %x) nounwind {
; SSE-LABEL: test1:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: ## BB#0: ## %vector.ph
+; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test1:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp slt <8 x i16> %x, zeroinitializer
%1 = xor <8 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
@@ -24,14 +30,19 @@ vector.ph:
define <8 x i16> @test2(<8 x i16> %x) nounwind {
; SSE-LABEL: test2:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
-; AVX: ## BB#0: ## %vector.ph
+; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test2:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp ugt <8 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
%1 = add <8 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
@@ -41,7 +52,7 @@ vector.ph:
define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test3:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
@@ -49,7 +60,7 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: test3:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
@@ -57,11 +68,17 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test3:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpbroadcastw %edi, %xmm1
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <8 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer
@@ -73,14 +90,19 @@ vector.ph:
define <16 x i8> @test4(<16 x i8> %x) nounwind {
; SSE-LABEL: test4:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: ## BB#0: ## %vector.ph
+; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test4:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp slt <16 x i8> %x, zeroinitializer
%1 = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
@@ -90,14 +112,19 @@ vector.ph:
define <16 x i8> @test5(<16 x i8> %x) nounwind {
; SSE-LABEL: test5:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: psubusb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
-; AVX: ## BB#0: ## %vector.ph
+; AVX: # %bb.0: # %vector.ph
; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: test5:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp ugt <16 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
%1 = add <16 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
@@ -107,7 +134,7 @@ vector.ph:
define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test6:
-; SSE2: ## BB#0: ## %vector.ph
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
@@ -116,7 +143,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test6:
-; SSSE3: ## BB#0: ## %vector.ph
+; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: movd %edi, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: pshufb %xmm2, %xmm1
@@ -124,7 +151,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test6:
-; SSE41: ## BB#0: ## %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pshufb %xmm2, %xmm1
@@ -132,7 +159,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test6:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -140,11 +167,17 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test6:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test6:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpbroadcastb %edi, %xmm1
+; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <16 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -156,14 +189,14 @@ vector.ph:
define <16 x i16> @test7(<16 x i16> %x) nounwind {
; SSE-LABEL: test7:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test7:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm2, %xmm1
@@ -174,9 +207,14 @@ define <16 x i16> @test7(<16 x i16> %x) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test7:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp slt <16 x i16> %x, zeroinitializer
%1 = xor <16 x i16> %x, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768>
@@ -186,14 +224,14 @@ vector.ph:
define <16 x i16> @test8(<16 x i16> %x) nounwind {
; SSE-LABEL: test8:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE-NEXT: psubusw %xmm2, %xmm0
; SSE-NEXT: psubusw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test8:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
@@ -210,9 +248,14 @@ define <16 x i16> @test8(<16 x i16> %x) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test8:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp ugt <16 x i16> %x, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>
%1 = add <16 x i16> %x, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767>
@@ -222,7 +265,7 @@ vector.ph:
define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-LABEL: test9:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movd %edi, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
@@ -231,7 +274,7 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: test9:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovd %edi, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
@@ -248,11 +291,17 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test9:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1
; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test9:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpbroadcastw %edi, %ymm1
+; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <16 x i16> undef, i16 %w, i32 0
%broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -264,14 +313,14 @@ vector.ph:
define <32 x i8> @test10(<32 x i8> %x) nounwind {
; SSE-LABEL: test10:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test10:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
@@ -282,9 +331,14 @@ define <32 x i8> @test10(<32 x i8> %x) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test10:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test10:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp slt <32 x i8> %x, zeroinitializer
%1 = xor <32 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
@@ -294,14 +348,14 @@ vector.ph:
define <32 x i8> @test11(<32 x i8> %x) nounwind {
; SSE-LABEL: test11:
-; SSE: ## BB#0: ## %vector.ph
+; SSE: # %bb.0: # %vector.ph
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: psubusb %xmm2, %xmm0
; SSE-NEXT: psubusb %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test11:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
@@ -318,9 +372,14 @@ define <32 x i8> @test11(<32 x i8> %x) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test11:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test11:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: retq
vector.ph:
%0 = icmp ugt <32 x i8> %x, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126>
%1 = add <32 x i8> %x, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127>
@@ -330,7 +389,7 @@ vector.ph:
define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-LABEL: test12:
-; SSE2: ## BB#0: ## %vector.ph
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movd %edi, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
@@ -340,7 +399,7 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test12:
-; SSSE3: ## BB#0: ## %vector.ph
+; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: movd %edi, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm3
; SSSE3-NEXT: pshufb %xmm3, %xmm2
@@ -349,7 +408,7 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test12:
-; SSE41: ## BB#0: ## %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movd %edi, %xmm2
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pshufb %xmm3, %xmm2
@@ -358,7 +417,7 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test12:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vmovd %edi, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -375,11 +434,17 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test12:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vmovd %edi, %xmm1
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test12:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpbroadcastb %edi, %ymm1
+; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
vector.ph:
%0 = insertelement <32 x i8> undef, i8 %w, i32 0
%broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -391,7 +456,7 @@ vector.ph:
define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test13:
-; SSE2: ## BB#0: ## %vector.ph
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -399,20 +464,15 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3]
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT: packssdw %xmm6, %xmm2
; SSE2-NEXT: psubd %xmm1, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
@@ -424,63 +484,59 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test13:
-; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: pxor %xmm3, %xmm3
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: psubd %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm5, %xmm6
; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT: pshufb %xmm5, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
-; SSSE3-NEXT: psubd %xmm1, %xmm3
-; SSSE3-NEXT: pshufb %xmm5, %xmm0
-; SSSE3-NEXT: pshufb %xmm5, %xmm3
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm3
+; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
+; SSSE3-NEXT: packssdw %xmm6, %xmm2
+; SSSE3-NEXT: psubd %xmm1, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test13:
-; SSE41: ## BB#0: ## %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: psubd %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: psubd %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm5, %xmm6
; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: psubd %xmm2, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE41-NEXT: pshufb %xmm1, %xmm6
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE41-NEXT: psubd %xmm2, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: pshufb %xmm1, %xmm4
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; SSE41-NEXT: pandn %xmm3, %xmm0
+; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; SSE41-NEXT: pandn %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test13:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -492,7 +548,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -504,20 +560,29 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test13:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test13:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpcmpnltud %ymm1, %ymm0, %k1
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
vector.ph:
%lhs = zext <8 x i16> %x to <8 x i32>
%cond = icmp ult <8 x i32> %lhs, %y
@@ -529,7 +594,7 @@ vector.ph:
define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2-LABEL: test14:
-; SSE2: ## BB#0: ## %vector.ph
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm6
@@ -581,49 +646,48 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test14:
-; SSSE3: ## BB#0: ## %vector.ph
-; SSSE3-NEXT: pxor %xmm7, %xmm7
-; SSSE3-NEXT: movdqa %xmm0, %xmm11
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
-; SSSE3-NEXT: movdqa %xmm11, %xmm8
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
-; SSSE3-NEXT: movdqa %xmm0, %xmm10
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm4, %xmm9
-; SSSE3-NEXT: pxor %xmm7, %xmm9
-; SSSE3-NEXT: psubd %xmm0, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm7, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm9, %xmm6
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm7, %xmm5
-; SSSE3-NEXT: psubd %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm10, %xmm0
-; SSSE3-NEXT: pxor %xmm7, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufb %xmm9, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm7, %xmm5
-; SSSE3-NEXT: psubd %xmm11, %xmm2
-; SSSE3-NEXT: pxor %xmm7, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm11
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm5, %xmm11
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa %xmm0, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm7
+; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSSE3-NEXT: movdqa %xmm7, %xmm8
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm2, %xmm9
+; SSSE3-NEXT: pxor %xmm0, %xmm9
+; SSSE3-NEXT: psubd %xmm5, %xmm2
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm9, %xmm5
+; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT: pshufb %xmm9, %xmm5
; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm7, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: pxor %xmm0, %xmm6
+; SSSE3-NEXT: psubd %xmm10, %xmm1
+; SSSE3-NEXT: pxor %xmm0, %xmm10
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm10
+; SSSE3-NEXT: pshufb %xmm9, %xmm10
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm0, %xmm5
+; SSSE3-NEXT: psubd %xmm7, %xmm4
+; SSSE3-NEXT: pxor %xmm0, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm5, %xmm7
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
-; SSSE3-NEXT: psubd %xmm8, %xmm1
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm0, %xmm6
+; SSSE3-NEXT: pxor %xmm8, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT: pshufb %xmm5, %xmm0
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
+; SSSE3-NEXT: psubd %xmm8, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm5, %xmm4
; SSSE3-NEXT: pand %xmm5, %xmm3
@@ -636,7 +700,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test14:
-; SSE41: ## BB#0: ## %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -683,13 +747,11 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm1
; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pandn %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test14:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -705,7 +767,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm10, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm11
+; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm11
; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm6, %xmm9, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
@@ -713,7 +775,7 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm3
; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm4
; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
@@ -727,27 +789,26 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpandn %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test14:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpacksswb %xmm6, %xmm5, %xmm5
+; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
@@ -760,10 +821,18 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpandn %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test14:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpcmpnltud %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpsubd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
vector.ph:
%rhs = zext <16 x i8> %x to <16 x i32>
%cond = icmp ult <16 x i32> %y, %rhs
@@ -775,40 +844,33 @@ vector.ph:
define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test15:
-; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE2-NEXT: psubd %xmm1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test15:
-; SSSE3: ## BB#0: ## %vector.ph
+; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -819,24 +881,22 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT: pshufb %xmm2, %xmm4
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSSE3-NEXT: psubd %xmm1, %xmm3
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test15:
-; SSE41: ## BB#0: ## %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -846,15 +906,13 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm4
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: packssdw %xmm4, %xmm5
; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
@@ -862,7 +920,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test15:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -874,7 +932,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -886,20 +944,29 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test15:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test15:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpcmpnleud %ymm1, %ymm0, %k1
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
vector.ph:
%lhs = zext <8 x i16> %x to <8 x i32>
%cond = icmp ugt <8 x i32> %lhs, %y
@@ -911,40 +978,33 @@ vector.ph:
define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE2-LABEL: test16:
-; SSE2: ## BB#0: ## %vector.ph
-; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2: # %bb.0: # %vector.ph
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psubd %xmm2, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm2
; SSE2-NEXT: pxor %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE2-NEXT: psubd %xmm1, %xmm3
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: psubd %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: packssdw %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test16:
-; SSSE3: ## BB#0: ## %vector.ph
+; SSSE3: # %bb.0: # %vector.ph
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -955,24 +1015,22 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm4, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pxor %xmm4, %xmm2
; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
-; SSSE3-NEXT: pshufb %xmm2, %xmm4
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: packssdw %xmm5, %xmm4
; SSSE3-NEXT: psubd %xmm1, %xmm3
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: pshufb %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: test16:
-; SSE41: ## BB#0: ## %vector.ph
+; SSE41: # %bb.0: # %vector.ph
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -982,15 +1040,13 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm4, %xmm5
; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pxor %xmm4, %xmm6
+; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm4, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm4
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
+; SSE41-NEXT: packssdw %xmm4, %xmm5
; SSE41-NEXT: psubd %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: pshufb %xmm1, %xmm3
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
@@ -998,7 +1054,7 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test16:
-; AVX1: ## BB#0: ## %vector.ph
+; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1010,7 +1066,7 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -1022,20 +1078,29 @@ define <8 x i16> @test16(<8 x i16> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test16:
-; AVX2: ## BB#0: ## %vector.ph
+; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test16:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpcmpltud %ymm0, %ymm1, %k1
+; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z}
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
vector.ph:
%lhs = zext <8 x i16> %x to <8 x i32>
%cond = icmp ult <8 x i32> %y, %lhs
@@ -1044,3 +1109,1268 @@ vector.ph:
%res = select <8 x i1> %cond, <8 x i16> %truncsub, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
+
+define <8 x i16> @psubus_8i16_max(<8 x i16> %x, <8 x i16> %y) nounwind {
+; SSE2-LABEL: psubus_8i16_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: por %xmm3, %xmm2
+; SSE2-NEXT: psubw %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_8i16_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pand %xmm2, %xmm3
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: por %xmm3, %xmm2
+; SSSE3-NEXT: psubw %xmm1, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_8i16_max:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: psubusw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: psubus_8i16_max:
+; AVX: # %bb.0: # %vector.ph
+; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: psubus_8i16_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+vector.ph:
+ %cmp = icmp ult <8 x i16> %x, %y
+ %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x
+ %res = sub <8 x i16> %max, %y
+ ret <8 x i16> %res
+}
+
+define <16 x i8> @psubus_16i8_max(<16 x i8> %x, <16 x i8> %y) nounwind {
+; SSE-LABEL: psubus_16i8_max:
+; SSE: # %bb.0: # %vector.ph
+; SSE-NEXT: psubusb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: psubus_16i8_max:
+; AVX: # %bb.0: # %vector.ph
+; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: psubus_16i8_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+vector.ph:
+ %cmp = icmp ult <16 x i8> %x, %y
+ %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x
+ %res = sub <16 x i8> %max, %y
+ ret <16 x i8> %res
+}
+
+define <16 x i16> @psubus_16i16_max(<16 x i16> %x, <16 x i16> %y) nounwind {
+; SSE2-LABEL: psubus_16i16_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtw %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtw %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm4, %xmm6
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm0, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
+; SSE2-NEXT: psubw %xmm2, %xmm5
+; SSE2-NEXT: psubw %xmm3, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_16i16_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtw %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtw %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pand %xmm4, %xmm6
+; SSSE3-NEXT: pandn %xmm1, %xmm4
+; SSSE3-NEXT: por %xmm6, %xmm4
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm5, %xmm1
+; SSSE3-NEXT: pandn %xmm0, %xmm5
+; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: psubw %xmm2, %xmm5
+; SSSE3-NEXT: psubw %xmm3, %xmm4
+; SSSE3-NEXT: movdqa %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_16i16_max:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: psubusw %xmm2, %xmm0
+; SSE41-NEXT: psubusw %xmm3, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_16i16_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_16i16_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_16i16_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+vector.ph:
+ %cmp = icmp ult <16 x i16> %x, %y
+ %max = select <16 x i1> %cmp, <16 x i16> %y, <16 x i16> %x
+ %res = sub <16 x i16> %max, %y
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @psubus_32i16_max(<32 x i16> %x, <32 x i16> %y) nounwind {
+; SSE2-LABEL: psubus_32i16_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa %xmm3, %xmm11
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: movdqa %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: movdqa %xmm8, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm10, %xmm12
+; SSE2-NEXT: pxor %xmm3, %xmm12
+; SSE2-NEXT: movdqa %xmm6, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm12, %xmm2
+; SSE2-NEXT: movdqa %xmm11, %xmm12
+; SSE2-NEXT: pxor %xmm3, %xmm12
+; SSE2-NEXT: pxor %xmm7, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm12, %xmm3
+; SSE2-NEXT: movdqa %xmm7, %xmm12
+; SSE2-NEXT: pand %xmm3, %xmm12
+; SSE2-NEXT: pandn %xmm11, %xmm3
+; SSE2-NEXT: por %xmm12, %xmm3
+; SSE2-NEXT: movdqa %xmm6, %xmm11
+; SSE2-NEXT: pand %xmm2, %xmm11
+; SSE2-NEXT: pandn %xmm10, %xmm2
+; SSE2-NEXT: por %xmm11, %xmm2
+; SSE2-NEXT: movdqa %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm1, %xmm10
+; SSE2-NEXT: pandn %xmm9, %xmm1
+; SSE2-NEXT: por %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm0, %xmm9
+; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: por %xmm9, %xmm0
+; SSE2-NEXT: psubw %xmm4, %xmm0
+; SSE2-NEXT: psubw %xmm5, %xmm1
+; SSE2-NEXT: psubw %xmm6, %xmm2
+; SSE2-NEXT: psubw %xmm7, %xmm3
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_32i16_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa %xmm3, %xmm11
+; SSSE3-NEXT: movdqa %xmm2, %xmm10
+; SSSE3-NEXT: movdqa %xmm1, %xmm9
+; SSSE3-NEXT: movdqa %xmm0, %xmm8
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSSE3-NEXT: movdqa %xmm8, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: movdqa %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pcmpgtw %xmm1, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
+; SSSE3-NEXT: pcmpgtw %xmm2, %xmm1
+; SSSE3-NEXT: movdqa %xmm10, %xmm12
+; SSSE3-NEXT: pxor %xmm3, %xmm12
+; SSSE3-NEXT: movdqa %xmm6, %xmm2
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pcmpgtw %xmm12, %xmm2
+; SSSE3-NEXT: movdqa %xmm11, %xmm12
+; SSSE3-NEXT: pxor %xmm3, %xmm12
+; SSSE3-NEXT: pxor %xmm7, %xmm3
+; SSSE3-NEXT: pcmpgtw %xmm12, %xmm3
+; SSSE3-NEXT: movdqa %xmm7, %xmm12
+; SSSE3-NEXT: pand %xmm3, %xmm12
+; SSSE3-NEXT: pandn %xmm11, %xmm3
+; SSSE3-NEXT: por %xmm12, %xmm3
+; SSSE3-NEXT: movdqa %xmm6, %xmm11
+; SSSE3-NEXT: pand %xmm2, %xmm11
+; SSSE3-NEXT: pandn %xmm10, %xmm2
+; SSSE3-NEXT: por %xmm11, %xmm2
+; SSSE3-NEXT: movdqa %xmm5, %xmm10
+; SSSE3-NEXT: pand %xmm1, %xmm10
+; SSSE3-NEXT: pandn %xmm9, %xmm1
+; SSSE3-NEXT: por %xmm10, %xmm1
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: pand %xmm0, %xmm9
+; SSSE3-NEXT: pandn %xmm8, %xmm0
+; SSSE3-NEXT: por %xmm9, %xmm0
+; SSSE3-NEXT: psubw %xmm4, %xmm0
+; SSSE3-NEXT: psubw %xmm5, %xmm1
+; SSSE3-NEXT: psubw %xmm6, %xmm2
+; SSSE3-NEXT: psubw %xmm7, %xmm3
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_32i16_max:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: psubusw %xmm4, %xmm0
+; SSE41-NEXT: psubusw %xmm5, %xmm1
+; SSE41-NEXT: psubusw %xmm6, %xmm2
+; SSE41-NEXT: psubusw %xmm7, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_32i16_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsubusw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpsubusw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vpsubusw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_32i16_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusw %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_32i16_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+vector.ph:
+ %cmp = icmp ult <32 x i16> %x, %y
+ %max = select <32 x i1> %cmp, <32 x i16> %y, <32 x i16> %x
+ %res = sub <32 x i16> %max, %y
+ ret <32 x i16> %res
+}
+
+define <64 x i8> @psubus_64i8_max(<64 x i8> %x, <64 x i8> %y) nounwind {
+; SSE-LABEL: psubus_64i8_max:
+; SSE: # %bb.0: # %vector.ph
+; SSE-NEXT: psubusb %xmm4, %xmm0
+; SSE-NEXT: psubusb %xmm5, %xmm1
+; SSE-NEXT: psubusb %xmm6, %xmm2
+; SSE-NEXT: psubusb %xmm7, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: psubus_64i8_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsubusb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpsubusb %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpsubusb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vpsubusb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_64i8_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubusb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_64i8_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+vector.ph:
+ %cmp = icmp ult <64 x i8> %x, %y
+ %max = select <64 x i1> %cmp, <64 x i8> %y, <64 x i8> %x
+ %res = sub <64 x i8> %max, %y
+ ret <64 x i8> %res
+}
+
+define <32 x i8> @psubus_32i8_max(<32 x i8> %x, <32 x i8> %y) nounwind {
+; SSE-LABEL: psubus_32i8_max:
+; SSE: # %bb.0: # %vector.ph
+; SSE-NEXT: psubusb %xmm2, %xmm0
+; SSE-NEXT: psubusb %xmm3, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: psubus_32i8_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubusb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_32i8_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_32i8_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+vector.ph:
+ %cmp = icmp ult <32 x i8> %x, %y
+ %max = select <32 x i1> %cmp, <32 x i8> %y, <32 x i8> %x
+ %res = sub <32 x i8> %max, %y
+ ret <32 x i8> %res
+}
+
+define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
+; SSE2-LABEL: psubus_8i32_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm5, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm4, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm2, %xmm6
+; SSE2-NEXT: pslld $16, %xmm6
+; SSE2-NEXT: psrad $16, %xmm6
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm6, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_8i32_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: pxor %xmm5, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm5, %xmm6
+; SSSE3-NEXT: por %xmm4, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm6, %xmm5
+; SSSE3-NEXT: pandn %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm1, %xmm4
+; SSSE3-NEXT: pand %xmm0, %xmm4
+; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm6
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm6
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_8i32_max:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; SSE41-NEXT: pminud %xmm3, %xmm2
+; SSE41-NEXT: pminud %xmm3, %xmm1
+; SSE41-NEXT: packusdw %xmm2, %xmm1
+; SSE41-NEXT: psubusw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_8i32_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_8i32_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_8i32_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+vector.ph:
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %lhs, %y
+ %max = select <8 x i1> %cond, <8 x i32> %y, <8 x i32> %lhs
+ %sub = sub <8 x i32> %max, %y
+ %res = trunc <8 x i32> %sub to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
+; SSE2-LABEL: psubus_8i64_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: pxor %xmm5, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm5, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm11
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm5, %xmm12
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm12
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm13, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSE2-NEXT: por %xmm5, %xmm12
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm10, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm5, %xmm13
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm13
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm13
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
+; SSE2-NEXT: por %xmm9, %xmm6
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSE2-NEXT: pand %xmm14, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSE2-NEXT: por %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm9, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pand %xmm13, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm13
+; SSE2-NEXT: por %xmm5, %xmm13
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm12, %xmm5
+; SSE2-NEXT: pandn %xmm8, %xmm12
+; SSE2-NEXT: por %xmm5, %xmm12
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm11, %xmm5
+; SSE2-NEXT: pandn %xmm0, %xmm11
+; SSE2-NEXT: por %xmm5, %xmm11
+; SSE2-NEXT: psubq %xmm3, %xmm11
+; SSE2-NEXT: psubq %xmm4, %xmm12
+; SSE2-NEXT: psubq %xmm1, %xmm13
+; SSE2-NEXT: psubq %xmm2, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_8i64_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: pxor %xmm5, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm10
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; SSSE3-NEXT: movdqa %xmm10, %xmm9
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSSE3-NEXT: movdqa %xmm0, %xmm8
+; SSSE3-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm7
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm5, %xmm11
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm11
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pand %xmm12, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm11
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm8, %xmm7
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm5, %xmm12
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm12
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pand %xmm13, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
+; SSSE3-NEXT: por %xmm5, %xmm12
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm10, %xmm7
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm5, %xmm13
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm13
+; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm13[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSSE3-NEXT: pand %xmm14, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm13
+; SSSE3-NEXT: movdqa %xmm2, %xmm7
+; SSSE3-NEXT: pxor %xmm6, %xmm7
+; SSSE3-NEXT: por %xmm9, %xmm6
+; SSSE3-NEXT: movdqa %xmm7, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm5[0,0,2,2]
+; SSSE3-NEXT: pcmpeqd %xmm7, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
+; SSSE3-NEXT: pand %xmm14, %xmm7
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; SSSE3-NEXT: por %xmm7, %xmm6
+; SSSE3-NEXT: movdqa %xmm2, %xmm5
+; SSSE3-NEXT: pand %xmm6, %xmm5
+; SSSE3-NEXT: pandn %xmm9, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm6
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pand %xmm13, %xmm5
+; SSSE3-NEXT: pandn %xmm10, %xmm13
+; SSSE3-NEXT: por %xmm5, %xmm13
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
+; SSSE3-NEXT: pand %xmm12, %xmm5
+; SSSE3-NEXT: pandn %xmm8, %xmm12
+; SSSE3-NEXT: por %xmm5, %xmm12
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: pand %xmm11, %xmm5
+; SSSE3-NEXT: pandn %xmm0, %xmm11
+; SSSE3-NEXT: por %xmm5, %xmm11
+; SSSE3-NEXT: psubq %xmm3, %xmm11
+; SSSE3-NEXT: psubq %xmm4, %xmm12
+; SSSE3-NEXT: psubq %xmm1, %xmm13
+; SSSE3-NEXT: psubq %xmm2, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_8i64_max:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm12 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm13 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: movdqa %xmm2, %xmm6
+; SSE41-NEXT: pxor %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm10, %xmm7
+; SSE41-NEXT: por %xmm0, %xmm7
+; SSE41-NEXT: movdqa %xmm6, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT: pand %xmm8, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
+; SSE41-NEXT: por %xmm6, %xmm8
+; SSE41-NEXT: movdqa %xmm1, %xmm5
+; SSE41-NEXT: pxor %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm13, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm9, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,3,3]
+; SSE41-NEXT: por %xmm5, %xmm9
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm0, %xmm5
+; SSE41-NEXT: movdqa %xmm12, %xmm6
+; SSE41-NEXT: por %xmm0, %xmm6
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE41-NEXT: pand %xmm14, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE41-NEXT: por %xmm5, %xmm6
+; SSE41-NEXT: movdqa %xmm4, %xmm5
+; SSE41-NEXT: pxor %xmm0, %xmm5
+; SSE41-NEXT: por %xmm11, %xmm0
+; SSE41-NEXT: movdqa %xmm5, %xmm7
+; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
+; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm7[0,0,2,2]
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE41-NEXT: pand %xmm14, %xmm5
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
+; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm11
+; SSE41-NEXT: movdqa %xmm6, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm12
+; SSE41-NEXT: movdqa %xmm9, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm13
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10
+; SSE41-NEXT: psubq %xmm2, %xmm10
+; SSE41-NEXT: psubq %xmm1, %xmm13
+; SSE41-NEXT: psubq %xmm3, %xmm12
+; SSE41-NEXT: psubq %xmm4, %xmm11
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0],xmm0[1,2,3],xmm11[4],xmm0[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm12 = xmm12[0],xmm0[1,2,3],xmm12[4],xmm0[5,6,7]
+; SSE41-NEXT: packusdw %xmm11, %xmm12
+; SSE41-NEXT: pblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2,3],xmm13[4],xmm0[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0],xmm0[1,2,3],xmm10[4],xmm0[5,6,7]
+; SSE41-NEXT: packusdw %xmm10, %xmm13
+; SSE41-NEXT: packusdw %xmm12, %xmm13
+; SSE41-NEXT: movdqa %xmm13, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_8i64_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm8
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm9
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm7, %xmm10, %xmm5
+; AVX1-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpxor %xmm7, %xmm2, %xmm5
+; AVX1-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpxor %xmm7, %xmm4, %xmm5
+; AVX1-NEXT: vpor %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpor %xmm7, %xmm6, %xmm5
+; AVX1-NEXT: vpxor %xmm7, %xmm1, %xmm6
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm9, %ymm0
+; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm8, %ymm3
+; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm10, %xmm3, %xmm3
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_8i64_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
+; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6
+; AVX2-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
+; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4
+; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3
+; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_8i64_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovusqw %zmm1, %xmm1
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+vector.ph:
+ %lhs = zext <8 x i16> %x to <8 x i64>
+ %cond = icmp ult <8 x i64> %lhs, %y
+ %max = select <8 x i1> %cond, <8 x i64> %y, <8 x i64> %lhs
+ %sub = sub <8 x i64> %max, %y
+ %res = trunc <8 x i64> %sub to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
+; SSE2-LABEL: psubus_16i32_max:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm8, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm5, %xmm12
+; SSE2-NEXT: pxor %xmm6, %xmm12
+; SSE2-NEXT: movdqa %xmm10, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm12
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: pxor %xmm6, %xmm7
+; SSE2-NEXT: por %xmm11, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pandn %xmm11, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pandn %xmm9, %xmm0
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm12, %xmm6
+; SSE2-NEXT: pandn %xmm10, %xmm12
+; SSE2-NEXT: por %xmm6, %xmm12
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm1, %xmm6
+; SSE2-NEXT: pandn %xmm8, %xmm1
+; SSE2-NEXT: por %xmm6, %xmm1
+; SSE2-NEXT: psubd %xmm4, %xmm1
+; SSE2-NEXT: psubd %xmm5, %xmm12
+; SSE2-NEXT: psubd %xmm2, %xmm0
+; SSE2-NEXT: psubd %xmm3, %xmm7
+; SSE2-NEXT: pslld $16, %xmm7
+; SSE2-NEXT: psrad $16, %xmm7
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm7, %xmm0
+; SSE2-NEXT: pslld $16, %xmm12
+; SSE2-NEXT: psrad $16, %xmm12
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: packssdw %xmm12, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_16i32_max:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa %xmm1, %xmm8
+; SSSE3-NEXT: movdqa %xmm0, %xmm9
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm11
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; SSSE3-NEXT: movdqa %xmm8, %xmm10
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm6, %xmm1
+; SSSE3-NEXT: movdqa %xmm8, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm5, %xmm12
+; SSSE3-NEXT: pxor %xmm6, %xmm12
+; SSSE3-NEXT: movdqa %xmm10, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm0, %xmm12
+; SSSE3-NEXT: movdqa %xmm2, %xmm0
+; SSSE3-NEXT: pxor %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm9, %xmm7
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm7
+; SSSE3-NEXT: pxor %xmm6, %xmm7
+; SSSE3-NEXT: por %xmm11, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm3, %xmm6
+; SSSE3-NEXT: pand %xmm7, %xmm6
+; SSSE3-NEXT: pandn %xmm11, %xmm7
+; SSSE3-NEXT: por %xmm6, %xmm7
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pand %xmm0, %xmm6
+; SSSE3-NEXT: pandn %xmm9, %xmm0
+; SSSE3-NEXT: por %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pand %xmm12, %xmm6
+; SSSE3-NEXT: pandn %xmm10, %xmm12
+; SSSE3-NEXT: por %xmm6, %xmm12
+; SSSE3-NEXT: movdqa %xmm4, %xmm6
+; SSSE3-NEXT: pand %xmm1, %xmm6
+; SSSE3-NEXT: pandn %xmm8, %xmm1
+; SSSE3-NEXT: por %xmm6, %xmm1
+; SSSE3-NEXT: psubd %xmm4, %xmm1
+; SSSE3-NEXT: psubd %xmm5, %xmm12
+; SSSE3-NEXT: psubd %xmm2, %xmm0
+; SSSE3-NEXT: psubd %xmm3, %xmm7
+; SSSE3-NEXT: pslld $16, %xmm7
+; SSSE3-NEXT: psrad $16, %xmm7
+; SSSE3-NEXT: pslld $16, %xmm0
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: packssdw %xmm7, %xmm0
+; SSSE3-NEXT: pslld $16, %xmm12
+; SSSE3-NEXT: psrad $16, %xmm12
+; SSSE3-NEXT: pslld $16, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: packssdw %xmm12, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_16i32_max:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: pmaxud %xmm2, %xmm0
+; SSE41-NEXT: pmaxud %xmm3, %xmm7
+; SSE41-NEXT: pmaxud %xmm4, %xmm1
+; SSE41-NEXT: pmaxud %xmm5, %xmm6
+; SSE41-NEXT: psubd %xmm5, %xmm6
+; SSE41-NEXT: psubd %xmm4, %xmm1
+; SSE41-NEXT: psubd %xmm3, %xmm7
+; SSE41-NEXT: psubd %xmm2, %xmm0
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2],xmm2[3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm7, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2],xmm2[3],xmm6[4],xmm2[5],xmm6[6],xmm2[7]
+; SSE41-NEXT: packusdw %xmm6, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_16i32_max:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubusw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_16i32_max:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpsubusw %xmm1, %xmm3, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpsubusw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_16i32_max:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
+; AVX512-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
+vector.ph:
+ %lhs = zext <16 x i16> %x to <16 x i32>
+ %cond = icmp ult <16 x i32> %lhs, %y
+ %max = select <16 x i1> %cond, <16 x i32> %y, <16 x i32> %lhs
+ %sub = sub <16 x i32> %max, %y
+ %res = trunc <16 x i32> %sub to <16 x i16>
+ ret <16 x i16> %res
+}
+
+define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwind {
+; SSE2-LABEL: psubus_i16_i32_max_swapped:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm2, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm2, %xmm4
+; SSE2-NEXT: pslld $16, %xmm4
+; SSE2-NEXT: psrad $16, %xmm4
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_i16_i32_max_swapped:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: movdqa %xmm3, %xmm5
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm1, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm0
+; SSSE3-NEXT: por %xmm4, %xmm0
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm5, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm5
+; SSSE3-NEXT: pandn %xmm2, %xmm4
+; SSSE3-NEXT: por %xmm5, %xmm4
+; SSSE3-NEXT: pand %xmm0, %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm0
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: psubd %xmm1, %xmm0
+; SSSE3-NEXT: psubd %xmm2, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_i16_i32_max_swapped:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; SSE41-NEXT: pminud %xmm3, %xmm2
+; SSE41-NEXT: pminud %xmm3, %xmm1
+; SSE41-NEXT: packusdw %xmm2, %xmm1
+; SSE41-NEXT: psubusw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_i16_i32_max_swapped:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_i16_i32_max_swapped:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_i16_i32_max_swapped:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+vector.ph:
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %y, %lhs
+ %max = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
+ %sub = sub <8 x i32> %max, %y
+ %res = trunc <8 x i32> %sub to <8 x i16>
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
+; SSE2-LABEL: psubus_i16_i32_min:
+; SSE2: # %bb.0: # %vector.ph
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm4
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: por %xmm4, %xmm6
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pand %xmm5, %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm5
+; SSE2-NEXT: por %xmm2, %xmm5
+; SSE2-NEXT: psubd %xmm5, %xmm0
+; SSE2-NEXT: psubd %xmm6, %xmm3
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: psubus_i16_i32_min:
+; SSSE3: # %bb.0: # %vector.ph
+; SSSE3-NEXT: pxor %xmm4, %xmm4
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm4, %xmm5
+; SSSE3-NEXT: movdqa %xmm0, %xmm6
+; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: movdqa %xmm2, %xmm6
+; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: por %xmm3, %xmm4
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm6, %xmm4
+; SSSE3-NEXT: pandn %xmm2, %xmm6
+; SSSE3-NEXT: por %xmm4, %xmm6
+; SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSSE3-NEXT: pand %xmm5, %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm5
+; SSSE3-NEXT: por %xmm2, %xmm5
+; SSSE3-NEXT: psubd %xmm5, %xmm0
+; SSSE3-NEXT: psubd %xmm6, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
+; SSSE3-NEXT: pshufb %xmm1, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: psubus_i16_i32_min:
+; SSE41: # %bb.0: # %vector.ph
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; SSE41-NEXT: pminud %xmm3, %xmm2
+; SSE41-NEXT: pminud %xmm3, %xmm1
+; SSE41-NEXT: packusdw %xmm2, %xmm1
+; SSE41-NEXT: psubusw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: psubus_i16_i32_min:
+; AVX1: # %bb.0: # %vector.ph
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: psubus_i16_i32_min:
+; AVX2: # %bb.0: # %vector.ph
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: psubus_i16_i32_min:
+; AVX512: # %bb.0: # %vector.ph
+; AVX512-NEXT: vpmovusdw %ymm1, %xmm1
+; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+vector.ph:
+ %lhs = zext <8 x i16> %x to <8 x i32>
+ %cond = icmp ult <8 x i32> %lhs, %y
+ %min = select <8 x i1> %cond, <8 x i32> %lhs, <8 x i32> %y
+ %sub = sub <8 x i32> %lhs, %min
+ %res = trunc <8 x i32> %sub to <8 x i16>
+ ret <8 x i16> %res
+}
diff --git a/test/CodeGen/X86/push-cfi.ll b/test/CodeGen/X86/push-cfi.ll
index 5428f12ad1cc..91e579a8391b 100644
--- a/test/CodeGen/X86/push-cfi.ll
+++ b/test/CodeGen/X86/push-cfi.ll
@@ -13,16 +13,12 @@ declare void @empty()
; CHECK-LABEL: test1_nofp:
; LINUX: .cfi_escape 0x2e, 0x10
; LINUX-NEXT: pushl $4
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $3
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $2
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $1
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
; LINUX-NEXT: addl $16, %esp
@@ -70,16 +66,12 @@ cleanup:
; CHECK-LABEL: test2_nofp:
; LINUX-NOT: .cfi_escape
; LINUX: pushl $4
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $3
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $2
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $1
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
; LINUX-NEXT: addl $28, %esp
@@ -185,16 +177,12 @@ cleanup:
; CHECK-LABEL: test5_nofp:
; LINUX: .cfi_escape 0x2e, 0x10
; LINUX-NEXT: pushl $4
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $3
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $2
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: pushl $1
-; LINUX-NEXT: Lcfi{{[0-9]+}}:
; LINUX-NEXT: .cfi_adjust_cfa_offset 4
; LINUX-NEXT: call
; LINUX-NEXT: addl $16, %esp
diff --git a/test/CodeGen/X86/rd-mod-wr-eflags.ll b/test/CodeGen/X86/rd-mod-wr-eflags.ll
index 972372151bcf..c49d5c91f61e 100644
--- a/test/CodeGen/X86/rd-mod-wr-eflags.ll
+++ b/test/CodeGen/X86/rd-mod-wr-eflags.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
%struct.obj = type { i64 }
diff --git a/test/CodeGen/X86/rdpmc.ll b/test/CodeGen/X86/rdpmc.ll
index 7f1ca469c0b6..8c2e0711218e 100644
--- a/test/CodeGen/X86/rdpmc.ll
+++ b/test/CodeGen/X86/rdpmc.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86-64
-; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86-64
+; RUN: llc < %s -mtriple=i686-- -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; Verify that we correctly lower the "Read Performance-Monitoring Counters"
; x86 builtin.
diff --git a/test/CodeGen/X86/rdrand-schedule.ll b/test/CodeGen/X86/rdrand-schedule.ll
new file mode 100644
index 000000000000..183aec59f9d8
--- /dev/null
+++ b/test/CodeGen/X86/rdrand-schedule.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+rdrnd | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GOLDMONT
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=IVY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+declare {i16, i32} @llvm.x86.rdrand.16()
+declare {i32, i32} @llvm.x86.rdrand.32()
+declare {i64, i32} @llvm.x86.rdrand.64()
+
+define i16 @test_rdrand_16(i16* %random_val) {
+; GENERIC-LABEL: test_rdrand_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdrandw %ax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_rdrand_16:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: rdrandw %ax # sched: [100:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_rdrand_16:
+; IVY: # %bb.0:
+; IVY-NEXT: rdrandw %ax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rdrand_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdrandw %ax # sched: [1:5.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rdrand_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rdrandw %ax # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdrand_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdrandw %ax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdrand_16:
+; SKX: # %bb.0:
+; SKX-NEXT: rdrandw %ax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rdrand_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdrandw %ax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %call = call {i16, i32} @llvm.x86.rdrand.16()
+ %randval = extractvalue {i16, i32} %call, 0
+ ret i16 %randval
+}
+
+define i32 @test_rdrand_32(i32* %random_val) {
+; GENERIC-LABEL: test_rdrand_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdrandl %eax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_rdrand_32:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: rdrandl %eax # sched: [100:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_rdrand_32:
+; IVY: # %bb.0:
+; IVY-NEXT: rdrandl %eax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rdrand_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdrandl %eax # sched: [1:5.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rdrand_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rdrandl %eax # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdrand_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdrandl %eax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdrand_32:
+; SKX: # %bb.0:
+; SKX-NEXT: rdrandl %eax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rdrand_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdrandl %eax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %call = call {i32, i32} @llvm.x86.rdrand.32()
+ %randval = extractvalue {i32, i32} %call, 0
+ ret i32 %randval
+}
+
+define i64 @test_rdrand_64(i64* %random_val) {
+; GENERIC-LABEL: test_rdrand_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdrandq %rax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_rdrand_64:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: rdrandq %rax # sched: [100:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; IVY-LABEL: test_rdrand_64:
+; IVY: # %bb.0:
+; IVY-NEXT: rdrandq %rax # sched: [100:0.33]
+; IVY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rdrand_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: rdrandq %rax # sched: [1:5.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rdrand_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rdrandq %rax # sched: [9:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdrand_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdrandq %rax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdrand_64:
+; SKX: # %bb.0:
+; SKX-NEXT: rdrandq %rax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rdrand_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdrandq %rax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %call = call {i64, i32} @llvm.x86.rdrand.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ ret i64 %randval
+}
diff --git a/test/CodeGen/X86/rdrand-x86_64.ll b/test/CodeGen/X86/rdrand-x86_64.ll
index 06f1136087bb..88c49c03d7d2 100644
--- a/test/CodeGen/X86/rdrand-x86_64.ll
+++ b/test/CodeGen/X86/rdrand-x86_64.ll
@@ -5,7 +5,7 @@ declare {i64, i32} @llvm.x86.rdrand.64()
define i32 @_rdrand64_step(i64* %random_val) {
; CHECK-LABEL: _rdrand64_step:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: rdrandq %rcx
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: cmovael %ecx, %eax
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
index 0638e0095282..e3982cc0bc4c 100644
--- a/test/CodeGen/X86/rdrand.ll
+++ b/test/CodeGen/X86/rdrand.ll
@@ -7,7 +7,7 @@ declare {i32, i32} @llvm.x86.rdrand.32()
define i32 @_rdrand16_step(i16* %random_val) {
; X86-LABEL: _rdrand16_step:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: rdrandw %ax
; X86-NEXT: movzwl %ax, %edx
@@ -17,7 +17,7 @@ define i32 @_rdrand16_step(i16* %random_val) {
; X86-NEXT: retl
;
; X64-LABEL: _rdrand16_step:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rdrandw %ax
; X64-NEXT: movzwl %ax, %ecx
; X64-NEXT: movl $1, %eax
@@ -33,7 +33,7 @@ define i32 @_rdrand16_step(i16* %random_val) {
define i32 @_rdrand32_step(i32* %random_val) {
; X86-LABEL: _rdrand32_step:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: rdrandl %edx
; X86-NEXT: movl $1, %eax
@@ -42,7 +42,7 @@ define i32 @_rdrand32_step(i32* %random_val) {
; X86-NEXT: retl
;
; X64-LABEL: _rdrand32_step:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rdrandl %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovael %ecx, %eax
@@ -58,14 +58,14 @@ define i32 @_rdrand32_step(i32* %random_val) {
; Check that MachineCSE doesn't eliminate duplicate rdrand instructions.
define i32 @CSE() nounwind {
; X86-LABEL: CSE:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: rdrandl %ecx
; X86-NEXT: rdrandl %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: CSE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rdrandl %ecx
; X64-NEXT: rdrandl %eax
; X64-NEXT: addl %ecx, %eax
@@ -81,11 +81,11 @@ define i32 @CSE() nounwind {
; Check that MachineLICM doesn't hoist rdrand instructions.
define void @loop(i32* %p, i32 %n) nounwind {
; X86-LABEL: loop:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: je .LBB3_3
-; X86-NEXT: # BB#1: # %while.body.preheader
+; X86-NEXT: # %bb.1: # %while.body.preheader
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: .p2align 4, 0x90
; X86-NEXT: .LBB3_2: # %while.body
@@ -93,13 +93,13 @@ define void @loop(i32* %p, i32 %n) nounwind {
; X86-NEXT: rdrandl %edx
; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: leal 4(%ecx), %ecx
-; X86-NEXT: decl %eax
+; X86-NEXT: addl $-1, %eax
; X86-NEXT: jne .LBB3_2
; X86-NEXT: .LBB3_3: # %while.end
; X86-NEXT: retl
;
; X64-LABEL: loop:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: testl %esi, %esi
; X64-NEXT: je .LBB3_2
; X64-NEXT: .p2align 4, 0x90
@@ -108,7 +108,7 @@ define void @loop(i32* %p, i32 %n) nounwind {
; X64-NEXT: rdrandl %eax
; X64-NEXT: movl %eax, (%rdi)
; X64-NEXT: leaq 4(%rdi), %rdi
-; X64-NEXT: decl %esi
+; X64-NEXT: addl $-1, %esi
; X64-NEXT: jne .LBB3_1
; X64-NEXT: .LBB3_2: # %while.end
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/rdseed-schedule.ll b/test/CodeGen/X86/rdseed-schedule.ll
new file mode 100644
index 000000000000..0fc06c4babfb
--- /dev/null
+++ b/test/CodeGen/X86/rdseed-schedule.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+rdseed | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GOLDMONT
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+declare {i16, i32} @llvm.x86.rdseed.16()
+declare {i32, i32} @llvm.x86.rdseed.32()
+declare {i64, i32} @llvm.x86.rdseed.64()
+
+define i16 @test_rdseed_16(i16* %random_val) {
+; GENERIC-LABEL: test_rdseed_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdseedw %ax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_rdseed_16:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: rdseedw %ax # sched: [100:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; BROADWELL-LABEL: test_rdseed_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rdseedw %ax # sched: [100:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdseed_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdseedw %ax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdseed_16:
+; SKX: # %bb.0:
+; SKX-NEXT: rdseedw %ax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rdseed_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdseedw %ax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %call = call {i16, i32} @llvm.x86.rdseed.16()
+ %randval = extractvalue {i16, i32} %call, 0
+ ret i16 %randval
+}
+
+define i32 @test_rdseed_32(i32* %random_val) {
+; GENERIC-LABEL: test_rdseed_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdseedl %eax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_rdseed_32:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: rdseedl %eax # sched: [100:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; BROADWELL-LABEL: test_rdseed_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rdseedl %eax # sched: [100:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdseed_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdseedl %eax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdseed_32:
+; SKX: # %bb.0:
+; SKX-NEXT: rdseedl %eax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rdseed_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdseedl %eax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %call = call {i32, i32} @llvm.x86.rdseed.32()
+ %randval = extractvalue {i32, i32} %call, 0
+ ret i32 %randval
+}
+
+define i64 @test_rdseed_64(i64* %random_val) {
+; GENERIC-LABEL: test_rdseed_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rdseedq %rax # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_rdseed_64:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: rdseedq %rax # sched: [100:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; BROADWELL-LABEL: test_rdseed_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: rdseedq %rax # sched: [100:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdseed_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: rdseedq %rax # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdseed_64:
+; SKX: # %bb.0:
+; SKX-NEXT: rdseedq %rax # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_rdseed_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: rdseedq %rax # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %call = call {i64, i32} @llvm.x86.rdseed.64()
+ %randval = extractvalue {i64, i32} %call, 0
+ ret i64 %randval
+}
diff --git a/test/CodeGen/X86/rdseed-x86_64.ll b/test/CodeGen/X86/rdseed-x86_64.ll
index b0d9748dd6ae..0708138ab798 100644
--- a/test/CodeGen/X86/rdseed-x86_64.ll
+++ b/test/CodeGen/X86/rdseed-x86_64.ll
@@ -5,7 +5,7 @@ declare {i64, i32} @llvm.x86.rdseed.64()
define i32 @_rdseed64_step(i64* %random_val) {
; CHECK-LABEL: _rdseed64_step:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: rdseedq %rcx
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: cmovael %ecx, %eax
diff --git a/test/CodeGen/X86/rdseed.ll b/test/CodeGen/X86/rdseed.ll
index b22e3e7ceac0..1e0d113977ca 100644
--- a/test/CodeGen/X86/rdseed.ll
+++ b/test/CodeGen/X86/rdseed.ll
@@ -7,7 +7,7 @@ declare {i32, i32} @llvm.x86.rdseed.32()
define i32 @_rdseed16_step(i16* %random_val) {
; X86-LABEL: _rdseed16_step:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: rdseedw %ax
; X86-NEXT: movzwl %ax, %edx
@@ -17,7 +17,7 @@ define i32 @_rdseed16_step(i16* %random_val) {
; X86-NEXT: retl
;
; X64-LABEL: _rdseed16_step:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rdseedw %ax
; X64-NEXT: movzwl %ax, %ecx
; X64-NEXT: movl $1, %eax
@@ -33,7 +33,7 @@ define i32 @_rdseed16_step(i16* %random_val) {
define i32 @_rdseed32_step(i32* %random_val) {
; X86-LABEL: _rdseed32_step:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: rdseedl %edx
; X86-NEXT: movl $1, %eax
@@ -42,7 +42,7 @@ define i32 @_rdseed32_step(i32* %random_val) {
; X86-NEXT: retl
;
; X64-LABEL: _rdseed32_step:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rdseedl %ecx
; X64-NEXT: movl $1, %eax
; X64-NEXT: cmovael %ecx, %eax
diff --git a/test/CodeGen/X86/rdtsc.ll b/test/CodeGen/X86/rdtsc.ll
index dba614ad104e..96ad1aba3c50 100644
--- a/test/CodeGen/X86/rdtsc.ll
+++ b/test/CodeGen/X86/rdtsc.ll
@@ -1,47 +1,67 @@
-; RUN: llc < %s -march=x86-64 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=generic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=generic | FileCheck %s --check-prefix=X64
; Verify that we correctly lower ISD::READCYCLECOUNTER.
define i64 @test_builtin_readcyclecounter() {
+; X86-LABEL: test_builtin_readcyclecounter:
+; X86: # %bb.0:
+; X86-NEXT: rdtsc
+; X86-NEXT: retl
+;
+; X64-LABEL: test_builtin_readcyclecounter:
+; X64: # %bb.0:
+; X64-NEXT: rdtsc
+; X64-NEXT: shlq $32, %rdx
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: retq
%1 = tail call i64 @llvm.readcyclecounter()
ret i64 %1
}
-; CHECK-LABEL: test_builtin_readcyclecounter
-; CHECK: rdtsc
-; X86-NOT: shlq
-; X86-NOT: or
-; CHECK-NOT: mov
-; CHECK: ret
-
; Verify that we correctly lower the Read Cycle Counter GCC x86 builtins
; (i.e. RDTSC and RDTSCP).
define i64 @test_builtin_rdtsc() {
+; X86-LABEL: test_builtin_rdtsc:
+; X86: # %bb.0:
+; X86-NEXT: rdtsc
+; X86-NEXT: retl
+;
+; X64-LABEL: test_builtin_rdtsc:
+; X64: # %bb.0:
+; X64-NEXT: rdtsc
+; X64-NEXT: shlq $32, %rdx
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: retq
%1 = tail call i64 @llvm.x86.rdtsc()
ret i64 %1
}
-; CHECK-LABEL: test_builtin_rdtsc
-; CHECK: rdtsc
-; X86-NOT: shlq
-; X86-NOT: or
-; CHECK-NOT: mov
-; CHECK: ret
-
define i64 @test_builtin_rdtscp(i8* %A) {
+; X86-LABEL: test_builtin_rdtscp:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: rdtscp
+; X86-NEXT: movl %ecx, (%esi)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_builtin_rdtscp:
+; X64: # %bb.0:
+; X64-NEXT: rdtscp
+; X64-NEXT: movl %ecx, (%rdi)
+; X64-NEXT: shlq $32, %rdx
+; X64-NEXT: orq %rdx, %rax
+; X64-NEXT: retq
%1 = tail call i64 @llvm.x86.rdtscp(i8* %A)
ret i64 %1
}
-; CHECK-LABEL: test_builtin_rdtscp
-; CHECK: rdtscp
-; X86-NOT: shlq
-; CHECK: movl %ecx, (%{{[a-z0-9]+}})
-; X86-NOT: shlq
-; CHECK: ret
-
declare i64 @llvm.readcyclecounter()
declare i64 @llvm.x86.rdtscp(i8*)
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 9102e68f231b..8dbe7ba8d8d0 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -19,60 +19,66 @@
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: divss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_no_estimate:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_no_estimate:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vdivss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_no_estimate:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_no_estimate:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_estimate:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; HASWELL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_estimate:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; HASWELL-NO-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
-; AVX512-LABEL: f32_no_estimate:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; KNL-LABEL: f32_no_estimate:
+; KNL: # %bb.0:
+; KNL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; KNL-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_no_estimate:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vdivss %xmm0, %xmm1, %xmm0 # sched: [11:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
define float @f32_one_step(float %x) #1 {
; SSE-LABEL: f32_one_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -83,7 +89,7 @@ define float @f32_one_step(float %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -93,14 +99,14 @@ define float @f32_one_step(float %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
@@ -110,24 +116,24 @@ define float @f32_one_step(float %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -136,19 +142,26 @@ define float @f32_one_step(float %x) #1 {
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
-; AVX512-LABEL: f32_one_step:
-; AVX512: # BB#0:
-; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: retq # sched: [1:1.00]
+; KNL-LABEL: f32_one_step:
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_one_step:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
define float @f32_two_step(float %x) #2 {
; SSE-LABEL: f32_two_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
@@ -165,7 +178,7 @@ define float @f32_two_step(float %x) #2 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_two_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
@@ -179,7 +192,7 @@ define float @f32_two_step(float %x) #2 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
@@ -190,7 +203,7 @@ define float @f32_two_step(float %x) #2 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_two_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
@@ -204,10 +217,10 @@ define float @f32_two_step(float %x) #2 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_two_step:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -215,21 +228,21 @@ define float @f32_two_step(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
@@ -242,76 +255,93 @@ define float @f32_two_step(float %x) #2 {
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
-; AVX512-LABEL: f32_two_step:
-; AVX512: # BB#0:
-; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT: retq # sched: [1:1.00]
+; KNL-LABEL: f32_two_step:
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_two_step:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1.0, %x
ret float %div
}
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: divps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_no_estimate:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_no_estimate:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_no_estimate:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_no_estimate:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
+; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_no_estimate:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1]
; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT: retq
;
-; AVX512-LABEL: v4f32_no_estimate:
-; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [4:0.50]
-; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [12:1.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; KNL-LABEL: v4f32_no_estimate:
+; KNL: # %bb.0:
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [13:1.00]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: v4f32_no_estimate:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT: vdivps %xmm0, %xmm1, %xmm0 # sched: [11:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -322,7 +352,7 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -332,14 +362,14 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
@@ -349,25 +379,25 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1]
@@ -377,26 +407,26 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v4f32_one_step:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
@@ -413,7 +443,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_two_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -427,7 +457,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_two_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
@@ -438,7 +468,7 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
@@ -452,10 +482,10 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_two_step:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -463,21 +493,21 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1]
@@ -491,33 +521,33 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v4f32_two_step:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_two_step:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <4 x float> %div
}
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: divps %xmm0, %xmm3
@@ -527,53 +557,59 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_no_estimate:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_no_estimate:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_no_estimate:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [38:38.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_estimate:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
-; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [12:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
+; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [29:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_estimate:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0
; HASWELL-NO-FMA-NEXT: retq
;
-; AVX512-LABEL: v8f32_no_estimate:
-; AVX512: # BB#0:
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [19:2.00]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; KNL-LABEL: v8f32_no_estimate:
+; KNL: # %bb.0:
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [21:2.00]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: v8f32_no_estimate:
+; SKX: # %bb.0:
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT: vdivps %ymm0, %ymm1, %ymm0 # sched: [11:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -591,7 +627,7 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_one_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -601,14 +637,14 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -618,25 +654,25 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
@@ -646,26 +682,26 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v8f32_one_step:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_one_step:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: rcpps %xmm0, %xmm3
; SSE-NEXT: movaps %xmm0, %xmm4
@@ -695,7 +731,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_two_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -709,7 +745,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_two_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
@@ -720,7 +756,7 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
@@ -734,10 +770,10 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -745,21 +781,21 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1
; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1]
@@ -773,26 +809,26 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: retq
;
; KNL-LABEL: v8f32_two_step:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_two_step:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index e6070e41a2b2..204d7dffd536 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -13,59 +13,65 @@
define float @f32_no_step_2(float %x) #3 {
; SSE-LABEL: f32_no_step_2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: mulss {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_no_step_2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_no_step_2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_no_step_2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_no_step_2:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_no_step_2:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_no_step_2:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
-;
-; AVX512-LABEL: f32_no_step_2:
-; AVX512: # BB#0:
-; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: f32_no_step_2:
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_no_step_2:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 1234.0, %x
ret float %div
}
define float @f32_one_step_2(float %x) #1 {
; SSE-LABEL: f32_one_step_2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -77,7 +83,7 @@ define float @f32_one_step_2(float %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -88,7 +94,7 @@ define float @f32_one_step_2(float %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
@@ -96,7 +102,7 @@ define float @f32_one_step_2(float %x) #1 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
@@ -107,49 +113,57 @@ define float @f32_one_step_2(float %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step_2:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
-;
-; AVX512-LABEL: f32_one_step_2:
-; AVX512: # BB#0:
-; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: f32_one_step_2:
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_one_step_2:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 3456.0, %x
ret float %div
}
define float @f32_one_step_2_divs(float %x) #1 {
; SSE-LABEL: f32_one_step_2_divs:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -162,7 +176,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_one_step_2_divs:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -174,7 +188,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_one_step_2_divs:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
@@ -183,7 +197,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_one_step_2_divs:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
@@ -195,46 +209,55 @@ define float @f32_one_step_2_divs(float %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_one_step_2_divs:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_one_step_2_divs:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
-;
-; AVX512-LABEL: f32_one_step_2_divs:
-; AVX512: # BB#0:
-; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
-; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: f32_one_step_2_divs:
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; KNL-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
+; KNL-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_one_step_2_divs:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; SKX-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 3456.0, %x
%div2 = fdiv fast float %div, %x
ret float %div2
@@ -242,7 +265,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
define float @f32_two_step_2(float %x) #2 {
; SSE-LABEL: f32_two_step_2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulss %xmm2, %xmm3
@@ -260,7 +283,7 @@ define float @f32_two_step_2(float %x) #2 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: f32_two_step_2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
@@ -275,7 +298,7 @@ define float @f32_two_step_2(float %x) #2 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: f32_two_step_2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
@@ -287,7 +310,7 @@ define float @f32_two_step_2(float %x) #2 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: f32_two_step_2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
@@ -302,10 +325,10 @@ define float @f32_two_step_2(float %x) #2 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: f32_two_step_2:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -313,26 +336,26 @@ define float @f32_two_step_2(float %x) #2 {
; SANDY-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: f32_two_step_2:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -340,27 +363,39 @@ define float @f32_two_step_2(float %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
-;
-; AVX512-LABEL: f32_two_step_2:
-; AVX512: # BB#0:
-; AVX512-NEXT: vrcp14ss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
-; AVX512-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3
-; AVX512-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0
-; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; AVX512-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
+;
+; KNL-LABEL: f32_two_step_2:
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [5:1.00]
+; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: f32_two_step_2:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast float 6789.0, %x
ret float %div
}
define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -372,7 +407,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -383,7 +418,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
@@ -391,7 +426,7 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
@@ -402,59 +437,59 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step2:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step2:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_one_step2:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step2:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step_2_divs:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -467,7 +502,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -479,7 +514,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
; FMA-RECIP-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
@@ -488,7 +523,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
@@ -500,57 +535,57 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_one_step_2_divs:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm2, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_one_step_2_divs:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [11:0.50]
; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_one_step_2_divs:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
-; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
+; SKX-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
%div2 = fdiv fast <4 x float> %div, %x
ret <4 x float> %div2
@@ -558,7 +593,7 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm3
@@ -576,7 +611,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v4f32_two_step2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -591,7 +626,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v4f32_two_step2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
@@ -603,7 +638,7 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v4f32_two_step2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm1 # sched: [2:1.00]
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [2:1.00]
@@ -618,10 +653,10 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v4f32_two_step2:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [6:0.50]
; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -629,26 +664,26 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SANDY-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v4f32_two_step2:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
-; HASWELL-NO-FMA: # BB#0:
+; HASWELL-NO-FMA: # %bb.0:
; HASWELL-NO-FMA-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm1, %xmm0, %xmm2 # sched: [5:0.50]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [4:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] sched: [6:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %xmm2, %xmm3, %xmm2 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm2, %xmm1, %xmm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm2, %xmm1, %xmm1 # sched: [3:1.00]
@@ -656,39 +691,39 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; HASWELL-NO-FMA-NEXT: vsubps %xmm0, %xmm3, %xmm0 # sched: [3:1.00]
; HASWELL-NO-FMA-NEXT: vmulps %xmm0, %xmm1, %xmm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v4f32_two_step2:
-; KNL: # BB#0:
+; KNL: # %bb.0:
; KNL-NEXT: vrcpps %xmm0, %xmm1 # sched: [5:1.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
; KNL-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [11:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v4f32_two_step2:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm1
-; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [4:0.50]
-; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3
-; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3
-; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0
-; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
+; SKX-NEXT: vmovaps %xmm1, %xmm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
ret <4 x float> %div
}
define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm1, %xmm4
; SSE-NEXT: mulps %xmm4, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -708,7 +743,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_one_step2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -719,7 +754,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
@@ -727,7 +762,7 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -738,59 +773,59 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step2:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step2:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
-; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_one_step2:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_one_step2:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_one_step_2_divs:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -812,7 +847,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -824,7 +859,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
; FMA-RECIP-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
@@ -833,7 +868,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_one_step_2_divs:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [2:2.00]
@@ -845,57 +880,57 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_one_step_2_divs:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_one_step_2_divs:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
+; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
-; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm2, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_one_step_2_divs:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [12:0.50]
+; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_one_step_2_divs:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [9:1.00]
-; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
+; SKX-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
%div2 = fdiv fast <8 x float> %div, %x
ret <8 x float> %div2
@@ -903,7 +938,7 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SSE-LABEL: v8f32_two_step2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: rcpps %xmm1, %xmm3
; SSE-NEXT: movaps %xmm1, %xmm4
@@ -935,7 +970,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_two_step2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1
; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2
; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -950,7 +985,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_two_step2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1
; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; FMA-RECIP-NEXT: vmovaps %ymm1, %ymm3
@@ -962,7 +997,7 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_two_step2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [5:1.00]
; BTVER2-NEXT: vrcpps %ymm0, %ymm1 # sched: [2:2.00]
; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [2:2.00]
@@ -977,10 +1012,10 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_two_step2:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [4:0.50]
+; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00] sched: [7:0.50]
; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
@@ -988,116 +1023,116 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_two_step2:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; HASWELL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
-; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:1.00]
-; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm2 # sched: [5:0.50]
+; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm2, %ymm3, %ymm2 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm2, %ymm1, %ymm2 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm2, %ymm1, %ymm1 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm1, %ymm0, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vsubps %ymm0, %ymm3, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps %ymm0, %ymm1, %ymm0 # sched: [5:0.50]
; HASWELL-NO-FMA-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_two_step2:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [7:2.00]
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm1 # sched: [11:2.00]
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
; KNL-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [5:0.50]
+; KNL-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [5:0.50]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_two_step2:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm1
-; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [5:1.00]
-; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3
-; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3
-; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0
-; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
+; SKX-NEXT: vmovaps %ymm1, %ymm3 # sched: [1:0.33]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
+; SKX-NEXT: vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_no_step:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_no_step:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_no_step:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step:
-; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_no_step:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_no_step:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
ret <8 x float> %div
}
define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
; SSE-LABEL: v8f32_no_step2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpps %xmm1, %xmm1
; SSE-NEXT: rcpps %xmm0, %xmm0
; SSE-NEXT: mulps {{.*}}(%rip), %xmm0
@@ -1105,52 +1140,52 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
; SSE-NEXT: retq
;
; AVX-RECIP-LABEL: v8f32_no_step2:
-; AVX-RECIP: # BB#0:
+; AVX-RECIP: # %bb.0:
; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT: retq
;
; FMA-RECIP-LABEL: v8f32_no_step2:
-; FMA-RECIP: # BB#0:
+; FMA-RECIP: # %bb.0:
; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT: retq
;
; BTVER2-LABEL: v8f32_no_step2:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps %ymm0, %ymm0 # sched: [2:2.00]
; BTVER2-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: v8f32_no_step2:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
+; SANDY-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: v8f32_no_step2:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
;
; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
-; HASWELL-NO-FMA: # BB#0:
-; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; HASWELL-NO-FMA-NEXT: retq # sched: [1:1.00]
+; HASWELL-NO-FMA: # %bb.0:
+; HASWELL-NO-FMA-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; HASWELL-NO-FMA-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; HASWELL-NO-FMA-NEXT: retq # sched: [7:1.00]
;
; KNL-LABEL: v8f32_no_step2:
-; KNL: # BB#0:
-; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [7:2.00]
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; KNL-NEXT: retq # sched: [1:1.00]
+; KNL: # %bb.0:
+; KNL-NEXT: vrcpps %ymm0, %ymm0 # sched: [11:2.00]
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [12:0.50]
+; KNL-NEXT: retq # sched: [7:1.00]
;
; SKX-LABEL: v8f32_no_step2:
-; SKX: # BB#0:
-; SKX-NEXT: vrcp14ps %ymm0, %ymm0
-; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [9:1.00]
-; SKX-NEXT: retq # sched: [1:1.00]
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
%div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
ret <8 x float> %div
}
diff --git a/test/CodeGen/X86/recip-pic.ll b/test/CodeGen/X86/recip-pic.ll
index 7a0d03d6072e..b3e363ea5d13 100644
--- a/test/CodeGen/X86/recip-pic.ll
+++ b/test/CodeGen/X86/recip-pic.ll
@@ -3,13 +3,11 @@
define fastcc float @foo(float %x) unnamed_addr #0 {
; CHECK-LABEL: foo:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: calll .L0$pb
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_adjust_cfa_offset 4
; CHECK-NEXT: .L0$pb:
; CHECK-NEXT: popl %eax
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_adjust_cfa_offset -4
; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
diff --git a/test/CodeGen/X86/reduce-trunc-shl.ll b/test/CodeGen/X86/reduce-trunc-shl.ll
index 0638e9e3f6cd..90fc2822de50 100644
--- a/test/CodeGen/X86/reduce-trunc-shl.ll
+++ b/test/CodeGen/X86/reduce-trunc-shl.ll
@@ -4,7 +4,7 @@
define void @trunc_shl_7_v4i32_v4i64(<4 x i32> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; SSE2-LABEL: trunc_shl_7_v4i32_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rsi), %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
; SSE2-NEXT: pslld $7, %xmm0
@@ -12,7 +12,7 @@ define void @trunc_shl_7_v4i32_v4i64(<4 x i32> addrspace(1)* %out, <4 x i64> add
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_7_v4i32_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpslld $7, %xmm0, %xmm0
@@ -28,7 +28,7 @@ define void @trunc_shl_7_v4i32_v4i64(<4 x i32> addrspace(1)* %out, <4 x i64> add
define <8 x i16> @trunc_shl_v8i16_v8i32(<8 x i32> %a) {
; SSE2-LABEL: trunc_shl_v8i16_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pslld $17, %xmm0
; SSE2-NEXT: pslld $17, %xmm1
; SSE2-NEXT: pslld $16, %xmm1
@@ -39,11 +39,11 @@ define <8 x i16> @trunc_shl_v8i16_v8i32(<8 x i32> %a) {
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_v8i16_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $17, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
%shl = shl <8 x i32> %a, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17>
@@ -53,14 +53,14 @@ define <8 x i16> @trunc_shl_v8i16_v8i32(<8 x i32> %a) {
define void @trunc_shl_31_i32_i64(i32* %out, i64* %in) {
; SSE2-LABEL: trunc_shl_31_i32_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movl (%rsi), %eax
; SSE2-NEXT: shll $31, %eax
; SSE2-NEXT: movl %eax, (%rdi)
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_31_i32_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movl (%rsi), %eax
; AVX2-NEXT: shll $31, %eax
; AVX2-NEXT: movl %eax, (%rdi)
@@ -74,12 +74,12 @@ define void @trunc_shl_31_i32_i64(i32* %out, i64* %in) {
define void @trunc_shl_32_i32_i64(i32* %out, i64* %in) {
; SSE2-LABEL: trunc_shl_32_i32_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movl $0, (%rdi)
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_32_i32_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movl $0, (%rdi)
; AVX2-NEXT: retq
%val = load i64, i64* %in
@@ -91,14 +91,14 @@ define void @trunc_shl_32_i32_i64(i32* %out, i64* %in) {
define void @trunc_shl_15_i16_i64(i16* %out, i64* %in) {
; SSE2-LABEL: trunc_shl_15_i16_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzwl (%rsi), %eax
; SSE2-NEXT: shlw $15, %ax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_15_i16_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movzwl (%rsi), %eax
; AVX2-NEXT: shlw $15, %ax
; AVX2-NEXT: movw %ax, (%rdi)
@@ -112,12 +112,12 @@ define void @trunc_shl_15_i16_i64(i16* %out, i64* %in) {
define void @trunc_shl_16_i16_i64(i16* %out, i64* %in) {
; SSE2-LABEL: trunc_shl_16_i16_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movw $0, (%rdi)
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_16_i16_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movw $0, (%rdi)
; AVX2-NEXT: retq
%val = load i64, i64* %in
@@ -129,14 +129,14 @@ define void @trunc_shl_16_i16_i64(i16* %out, i64* %in) {
define void @trunc_shl_7_i8_i64(i8* %out, i64* %in) {
; SSE2-LABEL: trunc_shl_7_i8_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movb (%rsi), %al
; SSE2-NEXT: shlb $7, %al
; SSE2-NEXT: movb %al, (%rdi)
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_7_i8_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movb (%rsi), %al
; AVX2-NEXT: shlb $7, %al
; AVX2-NEXT: movb %al, (%rdi)
@@ -150,12 +150,12 @@ define void @trunc_shl_7_i8_i64(i8* %out, i64* %in) {
define void @trunc_shl_8_i8_i64(i8* %out, i64* %in) {
; SSE2-LABEL: trunc_shl_8_i8_i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movb $0, (%rdi)
; SSE2-NEXT: retq
;
; AVX2-LABEL: trunc_shl_8_i8_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movb $0, (%rdi)
; AVX2-NEXT: retq
%val = load i64, i64* %in
diff --git a/test/CodeGen/X86/regpressure.ll b/test/CodeGen/X86/regpressure.ll
index 8f352b8fbb5a..eabcbe95b773 100644
--- a/test/CodeGen/X86/regpressure.ll
+++ b/test/CodeGen/X86/regpressure.ll
@@ -2,7 +2,7 @@
;; Both functions in this testcase should codegen to the same function, and
;; neither of them should require spilling anything to the stack.
-; RUN: llc < %s -march=x86 -stats 2>&1 | \
+; RUN: llc < %s -mtriple=i686-- -stats 2>&1 | \
; RUN: not grep "Number of register spills"
;; This can be compiled to use three registers if the loads are not
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index 7b138f02eb4a..672baa5c1bdc 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -3,7 +3,7 @@
define i32 @test1(i32 %X) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
; CHECK-NEXT: movl %ecx, %eax
@@ -25,7 +25,7 @@ define i32 @test1(i32 %X) {
define i32 @test2(i32 %X) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: sarl $31, %ecx
@@ -40,7 +40,7 @@ define i32 @test2(i32 %X) {
define i32 @test3(i32 %X) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl $-2139062143, %edx # imm = 0x80808081
; CHECK-NEXT: movl %ecx, %eax
@@ -58,7 +58,7 @@ define i32 @test3(i32 %X) {
define i32 @test4(i32 %X) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: retl
%tmp1 = urem i32 %X, 256
@@ -67,7 +67,7 @@ define i32 @test4(i32 %X) {
define i32 @test5(i32 %X) nounwind readnone {
; CHECK-LABEL: test5:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $41, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: idivl {{[0-9]+}}(%esp)
diff --git a/test/CodeGen/X86/rem_crash.ll b/test/CodeGen/X86/rem_crash.ll
index a5529a769a0b..05a613c8adb8 100644
--- a/test/CodeGen/X86/rem_crash.ll
+++ b/test/CodeGen/X86/rem_crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=i686--
+; RUN: llc < %s -mtriple=x86_64--
define i8 @test_minsize_uu8(i8 %x) minsize optsize {
entry:
diff --git a/test/CodeGen/X86/remat-phys-dead.ll b/test/CodeGen/X86/remat-phys-dead.ll
index 6cdcd28eacd8..90bbe20a8838 100644
--- a/test/CodeGen/X86/remat-phys-dead.ll
+++ b/test/CodeGen/X86/remat-phys-dead.ll
@@ -4,12 +4,12 @@
; We need to make sure that rematerialization into a physical register marks the
; super- or sub-register as dead after this rematerialization since only the
; original register is actually used later. Largely irrelevant for a trivial
-; example like this, since EAX is never used again, but easy to test.
+; example like this, since eax is never used again, but easy to test.
define i8 @test_remat() {
ret i8 0
; CHECK: REGISTER COALESCING
-; CHECK: Remat: %EAX<def,dead> = MOV32r0 %EFLAGS<imp-def,dead>, %AL<imp-def>
+; CHECK: Remat: dead %eax = MOV32r0 implicit-def dead %eflags, implicit-def %al
}
; On the other hand, if it's already the correct width, we really shouldn't be
@@ -18,6 +18,6 @@ define i8 @test_remat() {
define i32 @test_remat32() {
ret i32 0
; CHECK: REGISTER COALESCING
-; CHECK: Remat: %EAX<def> = MOV32r0 %EFLAGS<imp-def,dead>
+; CHECK: Remat: %eax = MOV32r0 implicit-def dead %eflags
}
diff --git a/test/CodeGen/X86/replace-load-and-with-bzhi.ll b/test/CodeGen/X86/replace-load-and-with-bzhi.ll
new file mode 100644
index 000000000000..9684d06b134e
--- /dev/null
+++ b/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+bmi2 | FileCheck %s -check-prefix=CHECK32
+
+@fill_table32 = internal unnamed_addr constant [32 x i32] [i32 0, i32 1, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047, i32 4095, i32 8191, i32 16383, i32 32767, i32 65535, i32 131071, i32 262143, i32 524287, i32 1048575, i32 2097151, i32 4194303, i32 8388607, i32 16777215, i32 33554431, i32 67108863, i32 134217727, i32 268435455, i32 536870911, i32 1073741823, i32 2147483647], align 16
+@fill_table32_partial = internal unnamed_addr constant [17 x i32] [i32 0, i32 1, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 1023, i32 2047, i32 4095, i32 8191, i32 16383, i32 32767, i32 65535], align 16
+@fill_table64 = internal unnamed_addr constant [64 x i64] [i64 0, i64 1, i64 3, i64 7, i64 15, i64 31, i64 63, i64 127, i64 255, i64 511, i64 1023, i64 2047, i64 4095, i64 8191, i64 16383, i64 32767, i64 65535, i64 131071, i64 262143, i64 524287, i64 1048575, i64 2097151, i64 4194303, i64 8388607, i64 16777215, i64 33554431, i64 67108863, i64 134217727, i64 268435455, i64 536870911, i64 1073741823, i64 2147483647, i64 4294967295, i64 8589934591, i64 17179869183, i64 34359738367, i64 68719476735, i64 137438953471, i64 274877906943, i64 549755813887, i64 1099511627775, i64 2199023255551, i64 4398046511103, i64 8796093022207, i64 17592186044415, i64 35184372088831, i64 70368744177663, i64 140737488355327, i64 281474976710655, i64 562949953421311, i64 1125899906842623, i64 2251799813685247, i64 4503599627370495, i64 9007199254740991, i64 18014398509481983, i64 36028797018963967, i64 72057594037927935, i64 144115188075855871, i64 288230376151711743, i64 576460752303423487, i64 1152921504606846975, i64 2305843009213693951, i64 4611686018427387903, i64 9223372036854775807], align 16
+@fill_table64_partial = internal unnamed_addr constant [51 x i64] [i64 0, i64 1, i64 3, i64 7, i64 15, i64 31, i64 63, i64 127, i64 255, i64 511, i64 1023, i64 2047, i64 4095, i64 8191, i64 16383, i64 32767, i64 65535, i64 131071, i64 262143, i64 524287, i64 1048575, i64 2097151, i64 4194303, i64 8388607, i64 16777215, i64 33554431, i64 67108863, i64 134217727, i64 268435455, i64 536870911, i64 1073741823, i64 2147483647, i64 4294967295, i64 8589934591, i64 17179869183, i64 34359738367, i64 68719476735, i64 137438953471, i64 274877906943, i64 549755813887, i64 1099511627775, i64 2199023255551, i64 4398046511103, i64 8796093022207, i64 17592186044415, i64 35184372088831, i64 70368744177663, i64 140737488355327, i64 281474976710655, i64 562949953421311, i64 1125899906842623], align 16
+
+define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
+; CHECK-LABEL: f32_bzhi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
+;
+; CHECK32-LABEL: f32_bzhi:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
+entry:
+ %idxprom = sext i32 %y to i64
+ %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @fill_table32, i64 0, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %and = and i32 %0, %x
+ ret i32 %and
+}
+
+define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
+; CHECK-LABEL: f32_bzhi_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bzhil %esi, %edi, %eax
+; CHECK-NEXT: retq
+;
+; CHECK32-LABEL: f32_bzhi_partial:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: retl
+entry:
+ %idxprom = sext i32 %y to i64
+ %arrayidx = getelementptr inbounds [17 x i32], [17 x i32]* @fill_table32_partial, i64 0, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %and = and i32 %0, %x
+ ret i32 %and
+}
+
+define i64 @f64_bzhi(i64 %x, i64 %y) local_unnamed_addr {
+; CHECK-LABEL: f64_bzhi:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+;
+; CHECK32-LABEL: f64_bzhi:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: movl fill_table64+4(,%eax,8), %edx
+; CHECK32-NEXT: movl fill_table64(,%eax,8), %eax
+; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT: retl
+entry:
+ %arrayidx = getelementptr inbounds [64 x i64], [64 x i64]* @fill_table64, i64 0, i64 %y
+ %0 = load i64, i64* %arrayidx, align 8
+ %and = and i64 %0, %x
+ ret i64 %and
+}
+
+define i64 @f64_bzhi_partial(i64 %x, i64 %y) local_unnamed_addr {
+; CHECK-LABEL: f64_bzhi_partial:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bzhiq %rsi, %rdi, %rax
+; CHECK-NEXT: retq
+;
+; CHECK32-LABEL: f64_bzhi_partial:
+; CHECK32: # %bb.0: # %entry
+; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: movl fill_table64_partial+4(,%eax,8), %edx
+; CHECK32-NEXT: movl fill_table64_partial(,%eax,8), %eax
+; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT: andl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT: retl
+entry:
+ %arrayidx = getelementptr inbounds [51 x i64], [51 x i64]* @fill_table64_partial, i64 0, i64 %y
+ %0 = load i64, i64* %arrayidx, align 8
+ %and = and i64 %0, %x
+ ret i64 %and
+}
+
diff --git a/test/CodeGen/X86/ret-addr.ll b/test/CodeGen/X86/ret-addr.ll
index b7b57ab3b842..cf164cc567a1 100644
--- a/test/CodeGen/X86/ret-addr.ll
+++ b/test/CodeGen/X86/ret-addr.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -disable-fp-elim -march=x86 | not grep xor
-; RUN: llc < %s -disable-fp-elim -march=x86-64 | not grep xor
+; RUN: llc < %s -disable-fp-elim -mtriple=i686-- | not grep xor
+; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-- | not grep xor
define i8* @h() nounwind readnone optsize {
entry:
diff --git a/test/CodeGen/X86/ret-i64-0.ll b/test/CodeGen/X86/ret-i64-0.ll
index bca0f056b90d..be82129de21f 100644
--- a/test/CodeGen/X86/ret-i64-0.ll
+++ b/test/CodeGen/X86/ret-i64-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep xor | count 2
+; RUN: llc < %s -mtriple=i686-- | grep xor | count 2
define i64 @foo() nounwind {
ret i64 0
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 65c3ac0cc447..6a9e59193aa3 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -6,7 +6,7 @@
define void @t1() nounwind {
; CHECK-LABEL: t1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: callq _return_v1di
; CHECK-NEXT: movq _g_v1di@{{.*}}(%rip), %rcx
@@ -23,7 +23,7 @@ declare <1 x i64> @return_v1di()
define <1 x i64> @t2() nounwind {
; CHECK-LABEL: t2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retq
ret <1 x i64> <i64 1>
@@ -31,7 +31,7 @@ define <1 x i64> @t2() nounwind {
define <2 x i32> @t3() nounwind {
; CHECK-LABEL: t3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: movq %rax, %xmm0
; CHECK-NEXT: retq
@@ -40,7 +40,7 @@ define <2 x i32> @t3() nounwind {
define double @t4() nounwind {
; CHECK-LABEL: t4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: movd %eax, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/rip-rel-address.ll b/test/CodeGen/X86/rip-rel-address.ll
index b49d597d9f05..5a2f2627d3b6 100644
--- a/test/CodeGen/X86/rip-rel-address.ll
+++ b/test/CodeGen/X86/rip-rel-address.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -relocation-model=pic -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=PIC64
+; RUN: llc < %s -relocation-model=pic -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=PIC64
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=static | FileCheck %s -check-prefix=STATIC64
; Use %rip-relative addressing even in static mode on x86-64, because
diff --git a/test/CodeGen/X86/rot16.ll b/test/CodeGen/X86/rot16.ll
index 6d7c702afc40..481163e31261 100644
--- a/test/CodeGen/X86/rot16.ll
+++ b/test/CodeGen/X86/rot16.ll
@@ -1,85 +1,163 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
-define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: foo:
-; CHECK: rolw %cl
- %0 = shl i16 %x, %z
- %1 = sub i16 16, %z
- %2 = lshr i16 %x, %1
- %3 = or i16 %2, %0
- ret i16 %3
+define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: foo:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: rolw %cl, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldw %cl, %di, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %t0 = shl i16 %x, %z
+ %t1 = sub i16 16, %z
+ %t2 = lshr i16 %x, %t1
+ %t3 = or i16 %t2, %t0
+ ret i16 %t3
}
-define i16 @bar(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: bar:
-; CHECK: shldw %cl
- %0 = shl i16 %y, %z
- %1 = sub i16 16, %z
- %2 = lshr i16 %x, %1
- %3 = or i16 %2, %0
- ret i16 %3
+define i16 @bar(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: bar:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shldw %cl, %dx, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: bar:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldw %cl, %di, %si
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+ %t0 = shl i16 %y, %z
+ %t1 = sub i16 16, %z
+ %t2 = lshr i16 %x, %t1
+ %t3 = or i16 %t2, %t0
+ ret i16 %t3
}
-define i16 @un(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: un:
-; CHECK: rorw %cl
- %0 = lshr i16 %x, %z
- %1 = sub i16 16, %z
- %2 = shl i16 %x, %1
- %3 = or i16 %2, %0
- ret i16 %3
+define i16 @un(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: un:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: rorw %cl, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: un:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdw %cl, %di, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %t0 = lshr i16 %x, %z
+ %t1 = sub i16 16, %z
+ %t2 = shl i16 %x, %t1
+ %t3 = or i16 %t2, %t0
+ ret i16 %t3
}
-define i16 @bu(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: bu:
-; CHECK: shrdw
- %0 = lshr i16 %y, %z
- %1 = sub i16 16, %z
- %2 = shl i16 %x, %1
- %3 = or i16 %2, %0
- ret i16 %3
+define i16 @bu(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: bu:
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shrdw %cl, %dx, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: bu:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdw %cl, %di, %si
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+ %t0 = lshr i16 %y, %z
+ %t1 = sub i16 16, %z
+ %t2 = shl i16 %x, %t1
+ %t3 = or i16 %t2, %t0
+ ret i16 %t3
}
-define i16 @xfoo(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xfoo:
-; CHECK: rolw $5
- %0 = lshr i16 %x, 11
- %1 = shl i16 %x, 5
- %2 = or i16 %0, %1
- ret i16 %2
+define i16 @xfoo(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: xfoo:
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: rolw $5, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: xfoo:
+; X64: # %bb.0:
+; X64-NEXT: rolw $5, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %t0 = lshr i16 %x, 11
+ %t1 = shl i16 %x, 5
+ %t2 = or i16 %t0, %t1
+ ret i16 %t2
}
-define i16 @xbar(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xbar:
-; CHECK: shldw $5
- %0 = shl i16 %y, 5
- %1 = lshr i16 %x, 11
- %2 = or i16 %0, %1
- ret i16 %2
+define i16 @xbar(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: xbar:
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shldw $5, %cx, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: xbar:
+; X64: # %bb.0:
+; X64-NEXT: shldw $5, %di, %si
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
+ %t0 = shl i16 %y, 5
+ %t1 = lshr i16 %x, 11
+ %t2 = or i16 %t0, %t1
+ ret i16 %t2
}
-define i16 @xun(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xun:
-; CHECK: rolw $11
- %0 = lshr i16 %x, 5
- %1 = shl i16 %x, 11
- %2 = or i16 %0, %1
- ret i16 %2
+define i16 @xun(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: xun:
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: rolw $11, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: xun:
+; X64: # %bb.0:
+; X64-NEXT: rolw $11, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %t0 = lshr i16 %x, 5
+ %t1 = shl i16 %x, 11
+ %t2 = or i16 %t0, %t1
+ ret i16 %t2
}
-define i16 @xbu(i16 %x, i16 %y, i16 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xbu:
-; CHECK: shldw $11
- %0 = lshr i16 %y, 5
- %1 = shl i16 %x, 11
- %2 = or i16 %0, %1
- ret i16 %2
+define i16 @xbu(i16 %x, i16 %y, i16 %z) nounwind {
+; X32-LABEL: xbu:
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shldw $11, %cx, %ax
+; X32-NEXT: retl
+;
+; X64-LABEL: xbu:
+; X64: # %bb.0:
+; X64-NEXT: shldw $11, %si, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
+ %t0 = lshr i16 %y, 5
+ %t1 = shl i16 %x, 11
+ %t2 = or i16 %t0, %t1
+ ret i16 %t2
}
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index 79ecbe0514d0..bd5329168c55 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -1,11 +1,16 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
-; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7 | FileCheck %s --check-prefix=ALL --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-- -mcpu=corei7-avx | FileCheck %s --check-prefix=ALL --check-prefix=SHLD
+; RUN: llc < %s -mtriple=i686-- -mcpu=core-avx2 | FileCheck %s --check-prefix=ALL --check-prefix=BMI2
define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
+; ALL-LABEL: foo:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movb {{[0-9]+}}(%esp), %cl
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT: roll %cl, %eax
+; ALL-NEXT: retl
entry:
-; CHECK-LABEL: foo:
-; CHECK: roll %cl
%0 = shl i32 %x, %z
%1 = sub i32 32, %z
%2 = lshr i32 %x, %1
@@ -14,9 +19,14 @@ entry:
}
define i32 @bar(i32 %x, i32 %y, i32 %z) nounwind readnone {
+; ALL-LABEL: bar:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movb {{[0-9]+}}(%esp), %cl
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %edx
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT: shldl %cl, %edx, %eax
+; ALL-NEXT: retl
entry:
-; CHECK-LABEL: bar:
-; CHECK: shldl %cl
%0 = shl i32 %y, %z
%1 = sub i32 32, %z
%2 = lshr i32 %x, %1
@@ -25,9 +35,13 @@ entry:
}
define i32 @un(i32 %x, i32 %y, i32 %z) nounwind readnone {
+; ALL-LABEL: un:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movb {{[0-9]+}}(%esp), %cl
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT: rorl %cl, %eax
+; ALL-NEXT: retl
entry:
-; CHECK-LABEL: un:
-; CHECK: rorl %cl
%0 = lshr i32 %x, %z
%1 = sub i32 32, %z
%2 = shl i32 %x, %1
@@ -36,9 +50,14 @@ entry:
}
define i32 @bu(i32 %x, i32 %y, i32 %z) nounwind readnone {
+; ALL-LABEL: bu:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movb {{[0-9]+}}(%esp), %cl
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %edx
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT: shrdl %cl, %edx, %eax
+; ALL-NEXT: retl
entry:
-; CHECK-LABEL: bu:
-; CHECK: shrdl %cl
%0 = lshr i32 %y, %z
%1 = sub i32 32, %z
%2 = shl i32 %x, %1
@@ -47,13 +66,23 @@ entry:
}
define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xfoo:
-; CHECK: roll $7
+; X86-LABEL: xfoo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: roll $7, %eax
+; X86-NEXT: retl
+;
; SHLD-LABEL: xfoo:
-; SHLD: shldl $7
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: shldl $7, %eax, %eax
+; SHLD-NEXT: retl
+;
; BMI2-LABEL: xfoo:
-; BMI2: rorxl $25
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: rorxl $25, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+entry:
%0 = lshr i32 %x, 25
%1 = shl i32 %x, 7
%2 = or i32 %0, %1
@@ -61,13 +90,26 @@ entry:
}
define i32 @xfoop(i32* %p) nounwind readnone {
-entry:
-; CHECK-LABEL: xfoop:
-; CHECK: roll $7
+; X86-LABEL: xfoop:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: roll $7, %eax
+; X86-NEXT: retl
+;
; SHLD-LABEL: xfoop:
-; SHLD: shldl $7
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: movl (%eax), %eax
+; SHLD-NEXT: shldl $7, %eax, %eax
+; SHLD-NEXT: retl
+;
; BMI2-LABEL: xfoop:
-; BMI2: rorxl $25
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: rorxl $25, (%eax), %eax
+; BMI2-NEXT: retl
+entry:
%x = load i32, i32* %p
%a = lshr i32 %x, 25
%b = shl i32 %x, 7
@@ -76,9 +118,13 @@ entry:
}
define i32 @xbar(i32 %x, i32 %y, i32 %z) nounwind readnone {
+; ALL-LABEL: xbar:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT: shldl $7, %ecx, %eax
+; ALL-NEXT: retl
entry:
-; CHECK-LABEL: xbar:
-; CHECK: shldl $7
%0 = shl i32 %y, 7
%1 = lshr i32 %x, 25
%2 = or i32 %0, %1
@@ -86,13 +132,23 @@ entry:
}
define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xun:
-; CHECK: roll $25
+; X86-LABEL: xun:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: roll $25, %eax
+; X86-NEXT: retl
+;
; SHLD-LABEL: xun:
-; SHLD: shldl $25
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: shldl $25, %eax, %eax
+; SHLD-NEXT: retl
+;
; BMI2-LABEL: xun:
-; BMI2: rorxl $7
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: rorxl $7, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+entry:
%0 = lshr i32 %x, 7
%1 = shl i32 %x, 25
%2 = or i32 %0, %1
@@ -100,13 +156,28 @@ entry:
}
define i32 @xunp(i32* %p) nounwind readnone {
+; X86-LABEL: xunp:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: roll $25, %eax
+; X86-NEXT: retl
+;
+; SHLD-LABEL: xunp:
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: movl (%eax), %eax
+; SHLD-NEXT: shldl $25, %eax, %eax
+; SHLD-NEXT: retl
+;
+; BMI2-LABEL: xunp:
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: rorxl $7, (%eax), %eax
+; BMI2-NEXT: retl
entry:
-; CHECK-LABEL: xunp:
-; CHECK: roll $25
; shld-label: xunp:
; shld: shldl $25
-; BMI2-LABEL: xunp:
-; BMI2: rorxl $7
%x = load i32, i32* %p
%a = lshr i32 %x, 7
%b = shl i32 %x, 25
@@ -115,9 +186,13 @@ entry:
}
define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone {
+; ALL-LABEL: xbu:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; ALL-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ALL-NEXT: shldl $25, %ecx, %eax
+; ALL-NEXT: retl
entry:
-; CHECK-LABEL: xbu:
-; CHECK: shldl $25
%0 = lshr i32 %y, 7
%1 = shl i32 %x, 25
%2 = or i32 %0, %1
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index 976acbb01675..e8f090cff996 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -1,11 +1,16 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
-; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx | FileCheck %s --check-prefix=ALL --check-prefix=SHLD
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=core-avx2 | FileCheck %s --check-prefix=ALL --check-prefix=BMI2
define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
+; ALL-LABEL: foo:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movl %edx, %ecx
+; ALL-NEXT: rolq %cl, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: retq
entry:
-; CHECK-LABEL: foo:
-; CHECK: rolq %cl
%0 = shl i64 %x, %z
%1 = sub i64 64, %z
%2 = lshr i64 %x, %1
@@ -14,9 +19,13 @@ entry:
}
define i64 @bar(i64 %x, i64 %y, i64 %z) nounwind readnone {
+; ALL-LABEL: bar:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movl %edx, %ecx
+; ALL-NEXT: shldq %cl, %rdi, %rsi
+; ALL-NEXT: movq %rsi, %rax
+; ALL-NEXT: retq
entry:
-; CHECK-LABEL: bar:
-; CHECK: shldq %cl
%0 = shl i64 %y, %z
%1 = sub i64 64, %z
%2 = lshr i64 %x, %1
@@ -25,9 +34,13 @@ entry:
}
define i64 @un(i64 %x, i64 %y, i64 %z) nounwind readnone {
+; ALL-LABEL: un:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movl %edx, %ecx
+; ALL-NEXT: rorq %cl, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: retq
entry:
-; CHECK-LABEL: un:
-; CHECK: rorq %cl
%0 = lshr i64 %x, %z
%1 = sub i64 64, %z
%2 = shl i64 %x, %1
@@ -36,9 +49,13 @@ entry:
}
define i64 @bu(i64 %x, i64 %y, i64 %z) nounwind readnone {
+; ALL-LABEL: bu:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: movl %edx, %ecx
+; ALL-NEXT: shrdq %cl, %rdi, %rsi
+; ALL-NEXT: movq %rsi, %rax
+; ALL-NEXT: retq
entry:
-; CHECK-LABEL: bu:
-; CHECK: shrdq %cl
%0 = lshr i64 %y, %z
%1 = sub i64 64, %z
%2 = shl i64 %x, %1
@@ -47,13 +64,23 @@ entry:
}
define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xfoo:
-; CHECK: rolq $7
+; X64-LABEL: xfoo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: rolq $7, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+;
; SHLD-LABEL: xfoo:
-; SHLD: shldq $7
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: shldq $7, %rdi, %rdi
+; SHLD-NEXT: movq %rdi, %rax
+; SHLD-NEXT: retq
+;
; BMI2-LABEL: xfoo:
-; BMI2: rorxq $57
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: rorxq $57, %rdi, %rax
+; BMI2-NEXT: retq
+entry:
%0 = lshr i64 %x, 57
%1 = shl i64 %x, 7
%2 = or i64 %0, %1
@@ -61,13 +88,23 @@ entry:
}
define i64 @xfoop(i64* %p) nounwind readnone {
-entry:
-; CHECK-LABEL: xfoop:
-; CHECK: rolq $7
+; X64-LABEL: xfoop:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: rolq $7, %rax
+; X64-NEXT: retq
+;
; SHLD-LABEL: xfoop:
-; SHLD: shldq $7
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: movq (%rdi), %rax
+; SHLD-NEXT: shldq $7, %rax, %rax
+; SHLD-NEXT: retq
+;
; BMI2-LABEL: xfoop:
-; BMI2: rorxq $57
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: rorxq $57, (%rdi), %rax
+; BMI2-NEXT: retq
+entry:
%x = load i64, i64* %p
%a = lshr i64 %x, 57
%b = shl i64 %x, 7
@@ -76,9 +113,12 @@ entry:
}
define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
+; ALL-LABEL: xbar:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: shrdq $57, %rsi, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: retq
entry:
-; CHECK-LABEL: xbar:
-; CHECK: shrdq $57
%0 = shl i64 %y, 7
%1 = lshr i64 %x, 57
%2 = or i64 %0, %1
@@ -86,13 +126,23 @@ entry:
}
define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
-entry:
-; CHECK-LABEL: xun:
-; CHECK: rolq $57
+; X64-LABEL: xun:
+; X64: # %bb.0: # %entry
+; X64-NEXT: rolq $57, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+;
; SHLD-LABEL: xun:
-; SHLD: shldq $57
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: shldq $57, %rdi, %rdi
+; SHLD-NEXT: movq %rdi, %rax
+; SHLD-NEXT: retq
+;
; BMI2-LABEL: xun:
-; BMI2: rorxq $7
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: rorxq $7, %rdi, %rax
+; BMI2-NEXT: retq
+entry:
%0 = lshr i64 %x, 7
%1 = shl i64 %x, 57
%2 = or i64 %0, %1
@@ -100,13 +150,23 @@ entry:
}
define i64 @xunp(i64* %p) nounwind readnone {
-entry:
-; CHECK-LABEL: xunp:
-; CHECK: rolq $57
+; X64-LABEL: xunp:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: rolq $57, %rax
+; X64-NEXT: retq
+;
; SHLD-LABEL: xunp:
-; SHLD: shldq $57
+; SHLD: # %bb.0: # %entry
+; SHLD-NEXT: movq (%rdi), %rax
+; SHLD-NEXT: shldq $57, %rax, %rax
+; SHLD-NEXT: retq
+;
; BMI2-LABEL: xunp:
-; BMI2: rorxq $7
+; BMI2: # %bb.0: # %entry
+; BMI2-NEXT: rorxq $7, (%rdi), %rax
+; BMI2-NEXT: retq
+entry:
%x = load i64, i64* %p
%a = lshr i64 %x, 7
%b = shl i64 %x, 57
@@ -115,9 +175,12 @@ entry:
}
define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
+; ALL-LABEL: xbu:
+; ALL: # %bb.0: # %entry
+; ALL-NEXT: shldq $57, %rsi, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: retq
entry:
-; CHECK-LABEL: xbu:
-; CHECK: shldq $57
%0 = lshr i64 %y, 7
%1 = shl i64 %x, 57
%2 = or i64 %0, %1
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 4be3a4c2391b..c31ce2a82d39 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -4,7 +4,7 @@
define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-LABEL: rotl64:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: pushl %ebx
; 32-NEXT: pushl %edi
; 32-NEXT: pushl %esi
@@ -17,7 +17,7 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: shldl %cl, %esi, %edx
; 32-NEXT: testb $32, %cl
; 32-NEXT: je .LBB0_2
-; 32-NEXT: # BB#1:
+; 32-NEXT: # %bb.1:
; 32-NEXT: movl %eax, %edx
; 32-NEXT: xorl %eax, %eax
; 32-NEXT: .LBB0_2:
@@ -29,7 +29,7 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: shrdl %cl, %edi, %esi
; 32-NEXT: testb $32, %ch
; 32-NEXT: je .LBB0_4
-; 32-NEXT: # BB#3:
+; 32-NEXT: # %bb.3:
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB0_4:
@@ -41,7 +41,7 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotl64:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rolq %cl, %rdi
; 64-NEXT: movq %rdi, %rax
@@ -57,7 +57,7 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind {
define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-LABEL: rotr64:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: pushl %ebx
; 32-NEXT: pushl %edi
; 32-NEXT: pushl %esi
@@ -70,7 +70,7 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: shrdl %cl, %esi, %eax
; 32-NEXT: testb $32, %cl
; 32-NEXT: je .LBB1_2
-; 32-NEXT: # BB#1:
+; 32-NEXT: # %bb.1:
; 32-NEXT: movl %edx, %eax
; 32-NEXT: xorl %edx, %edx
; 32-NEXT: .LBB1_2:
@@ -82,7 +82,7 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: shldl %cl, %edi, %esi
; 32-NEXT: testb $32, %ch
; 32-NEXT: je .LBB1_4
-; 32-NEXT: # BB#3:
+; 32-NEXT: # %bb.3:
; 32-NEXT: movl %ebx, %esi
; 32-NEXT: xorl %ebx, %ebx
; 32-NEXT: .LBB1_4:
@@ -94,7 +94,7 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotr64:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rorq %cl, %rdi
; 64-NEXT: movq %rdi, %rax
@@ -110,7 +110,7 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind {
define i64 @rotli64(i64 %A) nounwind {
; 32-LABEL: rotli64:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-NEXT: movl %ecx, %edx
@@ -119,7 +119,7 @@ define i64 @rotli64(i64 %A) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotli64:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolq $5, %rdi
; 64-NEXT: movq %rdi, %rax
; 64-NEXT: retq
@@ -131,7 +131,7 @@ define i64 @rotli64(i64 %A) nounwind {
define i64 @rotri64(i64 %A) nounwind {
; 32-LABEL: rotri64:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %edx
; 32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-NEXT: movl %ecx, %eax
@@ -140,7 +140,7 @@ define i64 @rotri64(i64 %A) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotri64:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolq $59, %rdi
; 64-NEXT: movq %rdi, %rax
; 64-NEXT: retq
@@ -152,7 +152,7 @@ define i64 @rotri64(i64 %A) nounwind {
define i64 @rotl1_64(i64 %A) nounwind {
; 32-LABEL: rotl1_64:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-NEXT: movl %ecx, %edx
@@ -161,7 +161,7 @@ define i64 @rotl1_64(i64 %A) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotl1_64:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolq %rdi
; 64-NEXT: movq %rdi, %rax
; 64-NEXT: retq
@@ -173,7 +173,7 @@ define i64 @rotl1_64(i64 %A) nounwind {
define i64 @rotr1_64(i64 %A) nounwind {
; 32-LABEL: rotr1_64:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %edx
; 32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; 32-NEXT: movl %ecx, %eax
@@ -182,7 +182,7 @@ define i64 @rotr1_64(i64 %A) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotr1_64:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorq %rdi
; 64-NEXT: movq %rdi, %rax
; 64-NEXT: retq
@@ -194,14 +194,14 @@ define i64 @rotr1_64(i64 %A) nounwind {
define i32 @rotl32(i32 %A, i8 %Amt) nounwind {
; 32-LABEL: rotl32:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %cl
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: roll %cl, %eax
; 32-NEXT: retl
;
; 64-LABEL: rotl32:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: roll %cl, %edi
; 64-NEXT: movl %edi, %eax
@@ -217,14 +217,14 @@ define i32 @rotl32(i32 %A, i8 %Amt) nounwind {
define i32 @rotr32(i32 %A, i8 %Amt) nounwind {
; 32-LABEL: rotr32:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %cl
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorl %cl, %eax
; 32-NEXT: retl
;
; 64-LABEL: rotr32:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rorl %cl, %edi
; 64-NEXT: movl %edi, %eax
@@ -240,13 +240,13 @@ define i32 @rotr32(i32 %A, i8 %Amt) nounwind {
define i32 @rotli32(i32 %A) nounwind {
; 32-LABEL: rotli32:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: roll $5, %eax
; 32-NEXT: retl
;
; 64-LABEL: rotli32:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: roll $5, %edi
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -258,13 +258,13 @@ define i32 @rotli32(i32 %A) nounwind {
define i32 @rotri32(i32 %A) nounwind {
; 32-LABEL: rotri32:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: roll $27, %eax
; 32-NEXT: retl
;
; 64-LABEL: rotri32:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: roll $27, %edi
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -276,13 +276,13 @@ define i32 @rotri32(i32 %A) nounwind {
define i32 @rotl1_32(i32 %A) nounwind {
; 32-LABEL: rotl1_32:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: roll %eax
; 32-NEXT: retl
;
; 64-LABEL: rotl1_32:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: roll %edi
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -294,13 +294,13 @@ define i32 @rotl1_32(i32 %A) nounwind {
define i32 @rotr1_32(i32 %A) nounwind {
; 32-LABEL: rotr1_32:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorl %eax
; 32-NEXT: retl
;
; 64-LABEL: rotr1_32:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorl %edi
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -312,14 +312,14 @@ define i32 @rotr1_32(i32 %A) nounwind {
define i16 @rotl16(i16 %A, i8 %Amt) nounwind {
; 32-LABEL: rotl16:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %cl
; 32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rolw %cl, %ax
; 32-NEXT: retl
;
; 64-LABEL: rotl16:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rolw %cl, %di
; 64-NEXT: movl %edi, %eax
@@ -335,14 +335,14 @@ define i16 @rotl16(i16 %A, i8 %Amt) nounwind {
define i16 @rotr16(i16 %A, i8 %Amt) nounwind {
; 32-LABEL: rotr16:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %cl
; 32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorw %cl, %ax
; 32-NEXT: retl
;
; 64-LABEL: rotr16:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rorw %cl, %di
; 64-NEXT: movl %edi, %eax
@@ -358,13 +358,13 @@ define i16 @rotr16(i16 %A, i8 %Amt) nounwind {
define i16 @rotli16(i16 %A) nounwind {
; 32-LABEL: rotli16:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rolw $5, %ax
; 32-NEXT: retl
;
; 64-LABEL: rotli16:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolw $5, %di
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -376,13 +376,13 @@ define i16 @rotli16(i16 %A) nounwind {
define i16 @rotri16(i16 %A) nounwind {
; 32-LABEL: rotri16:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rolw $11, %ax
; 32-NEXT: retl
;
; 64-LABEL: rotri16:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolw $11, %di
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -394,13 +394,13 @@ define i16 @rotri16(i16 %A) nounwind {
define i16 @rotl1_16(i16 %A) nounwind {
; 32-LABEL: rotl1_16:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rolw %ax
; 32-NEXT: retl
;
; 64-LABEL: rotl1_16:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolw %di
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -412,13 +412,13 @@ define i16 @rotl1_16(i16 %A) nounwind {
define i16 @rotr1_16(i16 %A) nounwind {
; 32-LABEL: rotr1_16:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorw %ax
; 32-NEXT: retl
;
; 64-LABEL: rotr1_16:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorw %di
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -430,14 +430,14 @@ define i16 @rotr1_16(i16 %A) nounwind {
define i8 @rotl8(i8 %A, i8 %Amt) nounwind {
; 32-LABEL: rotl8:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %cl
; 32-NEXT: movb {{[0-9]+}}(%esp), %al
; 32-NEXT: rolb %cl, %al
; 32-NEXT: retl
;
; 64-LABEL: rotl8:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rolb %cl, %dil
; 64-NEXT: movl %edi, %eax
@@ -451,14 +451,14 @@ define i8 @rotl8(i8 %A, i8 %Amt) nounwind {
define i8 @rotr8(i8 %A, i8 %Amt) nounwind {
; 32-LABEL: rotr8:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %cl
; 32-NEXT: movb {{[0-9]+}}(%esp), %al
; 32-NEXT: rorb %cl, %al
; 32-NEXT: retl
;
; 64-LABEL: rotr8:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: movl %esi, %ecx
; 64-NEXT: rorb %cl, %dil
; 64-NEXT: movl %edi, %eax
@@ -472,13 +472,13 @@ define i8 @rotr8(i8 %A, i8 %Amt) nounwind {
define i8 @rotli8(i8 %A) nounwind {
; 32-LABEL: rotli8:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %al
; 32-NEXT: rolb $5, %al
; 32-NEXT: retl
;
; 64-LABEL: rotli8:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolb $5, %dil
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -490,13 +490,13 @@ define i8 @rotli8(i8 %A) nounwind {
define i8 @rotri8(i8 %A) nounwind {
; 32-LABEL: rotri8:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %al
; 32-NEXT: rolb $3, %al
; 32-NEXT: retl
;
; 64-LABEL: rotri8:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolb $3, %dil
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -508,13 +508,13 @@ define i8 @rotri8(i8 %A) nounwind {
define i8 @rotl1_8(i8 %A) nounwind {
; 32-LABEL: rotl1_8:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %al
; 32-NEXT: rolb %al
; 32-NEXT: retl
;
; 64-LABEL: rotl1_8:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rolb %dil
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -526,13 +526,13 @@ define i8 @rotl1_8(i8 %A) nounwind {
define i8 @rotr1_8(i8 %A) nounwind {
; 32-LABEL: rotr1_8:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movb {{[0-9]+}}(%esp), %al
; 32-NEXT: rorb %al
; 32-NEXT: retl
;
; 64-LABEL: rotr1_8:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorb %dil
; 64-NEXT: movl %edi, %eax
; 64-NEXT: retq
@@ -544,7 +544,7 @@ define i8 @rotr1_8(i8 %A) nounwind {
define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-LABEL: rotr1_64_mem:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: pushl %esi
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: movl (%eax), %ecx
@@ -558,7 +558,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
; 32-NEXT: retl
;
; 64-LABEL: rotr1_64_mem:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorq (%rdi)
; 64-NEXT: retq
@@ -572,13 +572,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind {
define void @rotr1_32_mem(i32* %Aptr) nounwind {
; 32-LABEL: rotr1_32_mem:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorl (%eax)
; 32-NEXT: retl
;
; 64-LABEL: rotr1_32_mem:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorl (%rdi)
; 64-NEXT: retq
%A = load i32, i32 *%Aptr
@@ -591,13 +591,13 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind {
define void @rotr1_16_mem(i16* %Aptr) nounwind {
; 32-LABEL: rotr1_16_mem:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorw (%eax)
; 32-NEXT: retl
;
; 64-LABEL: rotr1_16_mem:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorw (%rdi)
; 64-NEXT: retq
%A = load i16, i16 *%Aptr
@@ -610,13 +610,13 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind {
define void @rotr1_8_mem(i8* %Aptr) nounwind {
; 32-LABEL: rotr1_8_mem:
-; 32: # BB#0:
+; 32: # %bb.0:
; 32-NEXT: movl {{[0-9]+}}(%esp), %eax
; 32-NEXT: rorb (%eax)
; 32-NEXT: retl
;
; 64-LABEL: rotr1_8_mem:
-; 64: # BB#0:
+; 64: # %bb.0:
; 64-NEXT: rorb (%rdi)
; 64-NEXT: retq
%A = load i8, i8 *%Aptr
@@ -626,3 +626,55 @@ define void @rotr1_8_mem(i8* %Aptr) nounwind {
store i8 %D, i8* %Aptr
ret void
}
+
+define i64 @truncated_rot(i64 %x, i32 %amt) nounwind {
+; 32-LABEL: truncated_rot:
+; 32: # %bb.0: # %entry
+; 32-NEXT: pushl %ebx
+; 32-NEXT: pushl %edi
+; 32-NEXT: pushl %esi
+; 32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; 32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; 32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; 32-NEXT: movl %esi, %eax
+; 32-NEXT: shll %cl, %eax
+; 32-NEXT: testb $32, %cl
+; 32-NEXT: movl $0, %ebx
+; 32-NEXT: jne .LBB28_2
+; 32-NEXT: # %bb.1: # %entry
+; 32-NEXT: movl %eax, %ebx
+; 32-NEXT: .LBB28_2: # %entry
+; 32-NEXT: movl $64, %edx
+; 32-NEXT: subl %ecx, %edx
+; 32-NEXT: movl %edi, %eax
+; 32-NEXT: movl %edx, %ecx
+; 32-NEXT: shrl %cl, %eax
+; 32-NEXT: shrdl %cl, %edi, %esi
+; 32-NEXT: testb $32, %dl
+; 32-NEXT: jne .LBB28_4
+; 32-NEXT: # %bb.3: # %entry
+; 32-NEXT: movl %esi, %eax
+; 32-NEXT: .LBB28_4: # %entry
+; 32-NEXT: orl %ebx, %eax
+; 32-NEXT: xorl %edx, %edx
+; 32-NEXT: popl %esi
+; 32-NEXT: popl %edi
+; 32-NEXT: popl %ebx
+; 32-NEXT: retl
+;
+; 64-LABEL: truncated_rot:
+; 64: # %bb.0: # %entry
+; 64-NEXT: movl %esi, %ecx
+; 64-NEXT: rolq %cl, %rdi
+; 64-NEXT: movl %edi, %eax
+; 64-NEXT: retq
+entry:
+ %sh_prom = zext i32 %amt to i64
+ %shl = shl i64 %x, %sh_prom
+ %sub = sub nsw i32 64, %amt
+ %sh_prom1 = zext i32 %sub to i64
+ %shr = lshr i64 %x, %sh_prom1
+ %or = or i64 %shr, %shl
+ %and = and i64 %or, 4294967295
+ ret i64 %and
+}
diff --git a/test/CodeGen/X86/rotate2.ll b/test/CodeGen/X86/rotate2.ll
index 2316c708507a..73f11ab1f363 100644
--- a/test/CodeGen/X86/rotate2.ll
+++ b/test/CodeGen/X86/rotate2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | grep rol | count 2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7 | grep rol | count 2
define i64 @test1(i64 %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll
index c7117be91ab4..79822999dcad 100644
--- a/test/CodeGen/X86/rotate4.ll
+++ b/test/CodeGen/X86/rotate4.ll
@@ -6,7 +6,7 @@
define i32 @rotate_left_32(i32 %a, i32 %b) {
; CHECK-LABEL: rotate_left_32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: roll %cl, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -22,7 +22,7 @@ define i32 @rotate_left_32(i32 %a, i32 %b) {
define i32 @rotate_right_32(i32 %a, i32 %b) {
; CHECK-LABEL: rotate_right_32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: rorl %cl, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -38,7 +38,7 @@ define i32 @rotate_right_32(i32 %a, i32 %b) {
define i64 @rotate_left_64(i64 %a, i64 %b) {
; CHECK-LABEL: rotate_left_64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: rolq %cl, %rdi
; CHECK-NEXT: movq %rdi, %rax
@@ -54,7 +54,7 @@ define i64 @rotate_left_64(i64 %a, i64 %b) {
define i64 @rotate_right_64(i64 %a, i64 %b) {
; CHECK-LABEL: rotate_right_64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: rorq %cl, %rdi
; CHECK-NEXT: movq %rdi, %rax
@@ -72,7 +72,7 @@ define i64 @rotate_right_64(i64 %a, i64 %b) {
define void @rotate_left_m32(i32 *%pa, i32 %b) {
; CHECK-LABEL: rotate_left_m32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: roll %cl, (%rdi)
; CHECK-NEXT: retq
@@ -89,7 +89,7 @@ define void @rotate_left_m32(i32 *%pa, i32 %b) {
define void @rotate_right_m32(i32 *%pa, i32 %b) {
; CHECK-LABEL: rotate_right_m32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: rorl %cl, (%rdi)
; CHECK-NEXT: retq
@@ -106,7 +106,7 @@ define void @rotate_right_m32(i32 *%pa, i32 %b) {
define void @rotate_left_m64(i64 *%pa, i64 %b) {
; CHECK-LABEL: rotate_left_m64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: rolq %cl, (%rdi)
; CHECK-NEXT: retq
@@ -123,7 +123,7 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) {
define void @rotate_right_m64(i64 *%pa, i64 %b) {
; CHECK-LABEL: rotate_right_m64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: rorq %cl, (%rdi)
; CHECK-NEXT: retq
@@ -138,3 +138,146 @@ define void @rotate_right_m64(i64 *%pa, i64 %b) {
ret void
}
+; The next 8 tests include masks of the narrow width shift amounts that should be eliminated.
+; These patterns are produced by instcombine after r310509.
+
+define i8 @rotate_left_8(i8 %x, i32 %amount) {
+; CHECK-LABEL: rotate_left_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolb %cl, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %amt = trunc i32 %amount to i8
+ %sub = sub i8 0, %amt
+ %maskamt = and i8 %amt, 7
+ %masksub = and i8 %sub, 7
+ %shl = shl i8 %x, %maskamt
+ %shr = lshr i8 %x, %masksub
+ %or = or i8 %shl, %shr
+ ret i8 %or
+}
+
+define i8 @rotate_right_8(i8 %x, i32 %amount) {
+; CHECK-LABEL: rotate_right_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorb %cl, %dil
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %amt = trunc i32 %amount to i8
+ %sub = sub i8 0, %amt
+ %maskamt = and i8 %amt, 7
+ %masksub = and i8 %sub, 7
+ %shr = lshr i8 %x, %maskamt
+ %shl = shl i8 %x, %masksub
+ %or = or i8 %shr, %shl
+ ret i8 %or
+}
+
+define i16 @rotate_left_16(i16 %x, i32 %amount) {
+; CHECK-LABEL: rotate_left_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolw %cl, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %amt = trunc i32 %amount to i16
+ %sub = sub i16 0, %amt
+ %maskamt = and i16 %amt, 15
+ %masksub = and i16 %sub, 15
+ %shl = shl i16 %x, %maskamt
+ %shr = lshr i16 %x, %masksub
+ %or = or i16 %shl, %shr
+ ret i16 %or
+}
+
+define i16 @rotate_right_16(i16 %x, i32 %amount) {
+; CHECK-LABEL: rotate_right_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorw %cl, %di
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+ %amt = trunc i32 %amount to i16
+ %sub = sub i16 0, %amt
+ %maskamt = and i16 %amt, 15
+ %masksub = and i16 %sub, 15
+ %shr = lshr i16 %x, %maskamt
+ %shl = shl i16 %x, %masksub
+ %or = or i16 %shr, %shl
+ ret i16 %or
+}
+
+define void @rotate_left_m8(i8* %p, i32 %amount) {
+; CHECK-LABEL: rotate_left_m8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolb %cl, (%rdi)
+; CHECK-NEXT: retq
+ %x = load i8, i8* %p, align 1
+ %amt = trunc i32 %amount to i8
+ %sub = sub i8 0, %amt
+ %maskamt = and i8 %amt, 7
+ %masksub = and i8 %sub, 7
+ %shl = shl i8 %x, %maskamt
+ %shr = lshr i8 %x, %masksub
+ %or = or i8 %shl, %shr
+ store i8 %or, i8* %p, align 1
+ ret void
+}
+
+define void @rotate_right_m8(i8* %p, i32 %amount) {
+; CHECK-LABEL: rotate_right_m8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorb %cl, (%rdi)
+; CHECK-NEXT: retq
+ %x = load i8, i8* %p, align 1
+ %amt = trunc i32 %amount to i8
+ %sub = sub i8 0, %amt
+ %maskamt = and i8 %amt, 7
+ %masksub = and i8 %sub, 7
+ %shl = shl i8 %x, %masksub
+ %shr = lshr i8 %x, %maskamt
+ %or = or i8 %shl, %shr
+ store i8 %or, i8* %p, align 1
+ ret void
+}
+
+define void @rotate_left_m16(i16* %p, i32 %amount) {
+; CHECK-LABEL: rotate_left_m16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rolw %cl, (%rdi)
+; CHECK-NEXT: retq
+ %x = load i16, i16* %p, align 1
+ %amt = trunc i32 %amount to i16
+ %sub = sub i16 0, %amt
+ %maskamt = and i16 %amt, 15
+ %masksub = and i16 %sub, 15
+ %shl = shl i16 %x, %maskamt
+ %shr = lshr i16 %x, %masksub
+ %or = or i16 %shl, %shr
+ store i16 %or, i16* %p, align 1
+ ret void
+}
+
+define void @rotate_right_m16(i16* %p, i32 %amount) {
+; CHECK-LABEL: rotate_right_m16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: rorw %cl, (%rdi)
+; CHECK-NEXT: retq
+ %x = load i16, i16* %p, align 1
+ %amt = trunc i32 %amount to i16
+ %sub = sub i16 0, %amt
+ %maskamt = and i16 %amt, 15
+ %masksub = and i16 %sub, 15
+ %shl = shl i16 %x, %masksub
+ %shr = lshr i16 %x, %maskamt
+ %or = or i16 %shl, %shr
+ store i16 %or, i16* %p, align 1
+ ret void
+}
+
diff --git a/test/CodeGen/X86/rotate_vec.ll b/test/CodeGen/X86/rotate_vec.ll
index 8fb000bae827..ed0c4717ea80 100644
--- a/test/CodeGen/X86/rotate_vec.ll
+++ b/test/CodeGen/X86/rotate_vec.ll
@@ -3,7 +3,7 @@
define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) {
; CHECK-LABEL: rot_v4i32_splat:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotd $31, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
@@ -14,7 +14,7 @@ define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) {
define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) {
; CHECK-LABEL: rot_v4i32_non_splat:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
@@ -25,7 +25,7 @@ define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) {
define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) {
; CHECK-LABEL: rot_v4i32_splat_2masks:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotd $31, %xmm0, %xmm0
; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -40,7 +40,7 @@ define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) {
define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) {
; CHECK-LABEL: rot_v4i32_non_splat_2masks:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/rounding-ops.ll b/test/CodeGen/X86/rounding-ops.ll
index 15a11d1d6a96..6e84635da29b 100644
--- a/test/CodeGen/X86/rounding-ops.ll
+++ b/test/CodeGen/X86/rounding-ops.ll
@@ -1,132 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck -check-prefix=CHECK-SSE %s
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx512f | FileCheck -check-prefix=CHECK-AVX512 %s
define float @test1(float %x) nounwind {
- %call = tail call float @floorf(float %x) nounwind readnone
- ret float %call
-
; CHECK-SSE-LABEL: test1:
-; CHECK-SSE: roundss $9
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundss $9, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test1:
-; CHECK-AVX: vroundss $9
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test1:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscaless $9, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call float @floorf(float %x) nounwind readnone
+ ret float %call
}
declare float @floorf(float) nounwind readnone
define double @test2(double %x) nounwind {
- %call = tail call double @floor(double %x) nounwind readnone
- ret double %call
-
; CHECK-SSE-LABEL: test2:
-; CHECK-SSE: roundsd $9
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundsd $9, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test2:
-; CHECK-AVX: vroundsd $9
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test2:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscalesd $9, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call double @floor(double %x) nounwind readnone
+ ret double %call
}
declare double @floor(double) nounwind readnone
define float @test3(float %x) nounwind {
- %call = tail call float @nearbyintf(float %x) nounwind readnone
- ret float %call
-
; CHECK-SSE-LABEL: test3:
-; CHECK-SSE: roundss $12
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundss $12, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test3:
-; CHECK-AVX: vroundss $12
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test3:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscaless $12, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call float @nearbyintf(float %x) nounwind readnone
+ ret float %call
}
declare float @nearbyintf(float) nounwind readnone
define double @test4(double %x) nounwind {
- %call = tail call double @nearbyint(double %x) nounwind readnone
- ret double %call
-
; CHECK-SSE-LABEL: test4:
-; CHECK-SSE: roundsd $12
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundsd $12, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test4:
-; CHECK-AVX: vroundsd $12
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test4:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscalesd $12, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call double @nearbyint(double %x) nounwind readnone
+ ret double %call
}
declare double @nearbyint(double) nounwind readnone
define float @test5(float %x) nounwind {
- %call = tail call float @ceilf(float %x) nounwind readnone
- ret float %call
-
; CHECK-SSE-LABEL: test5:
-; CHECK-SSE: roundss $10
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundss $10, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test5:
-; CHECK-AVX: vroundss $10
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test5:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscaless $10, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call float @ceilf(float %x) nounwind readnone
+ ret float %call
}
declare float @ceilf(float) nounwind readnone
define double @test6(double %x) nounwind {
- %call = tail call double @ceil(double %x) nounwind readnone
- ret double %call
-
; CHECK-SSE-LABEL: test6:
-; CHECK-SSE: roundsd $10
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundsd $10, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test6:
-; CHECK-AVX: vroundsd $10
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test6:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscalesd $10, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call double @ceil(double %x) nounwind readnone
+ ret double %call
}
declare double @ceil(double) nounwind readnone
define float @test7(float %x) nounwind {
- %call = tail call float @rintf(float %x) nounwind readnone
- ret float %call
-
; CHECK-SSE-LABEL: test7:
-; CHECK-SSE: roundss $4
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundss $4, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test7:
-; CHECK-AVX: vroundss $4
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test7:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscaless $4, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call float @rintf(float %x) nounwind readnone
+ ret float %call
}
declare float @rintf(float) nounwind readnone
define double @test8(double %x) nounwind {
- %call = tail call double @rint(double %x) nounwind readnone
- ret double %call
-
; CHECK-SSE-LABEL: test8:
-; CHECK-SSE: roundsd $4
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundsd $4, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test8:
-; CHECK-AVX: vroundsd $4
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test8:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscalesd $4, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call double @rint(double %x) nounwind readnone
+ ret double %call
}
declare double @rint(double) nounwind readnone
define float @test9(float %x) nounwind {
- %call = tail call float @truncf(float %x) nounwind readnone
- ret float %call
-
; CHECK-SSE-LABEL: test9:
-; CHECK-SSE: roundss $11
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundss $11, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test9:
-; CHECK-AVX: vroundss $11
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test9:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscaless $11, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call float @truncf(float %x) nounwind readnone
+ ret float %call
}
declare float @truncf(float) nounwind readnone
define double @test10(double %x) nounwind {
- %call = tail call double @trunc(double %x) nounwind readnone
- ret double %call
-
; CHECK-SSE-LABEL: test10:
-; CHECK-SSE: roundsd $11
-
+; CHECK-SSE: ## %bb.0:
+; CHECK-SSE-NEXT: roundsd $11, %xmm0, %xmm0
+; CHECK-SSE-NEXT: retq
+;
; CHECK-AVX-LABEL: test10:
-; CHECK-AVX: vroundsd $11
+; CHECK-AVX: ## %bb.0:
+; CHECK-AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
+; CHECK-AVX-NEXT: retq
+;
+; CHECK-AVX512-LABEL: test10:
+; CHECK-AVX512: ## %bb.0:
+; CHECK-AVX512-NEXT: vrndscalesd $11, %xmm0, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: retq
+ %call = tail call double @trunc(double %x) nounwind readnone
+ ret double %call
}
declare double @trunc(double) nounwind readnone
diff --git a/test/CodeGen/X86/rrlist-livereg-corrutpion.ll b/test/CodeGen/X86/rrlist-livereg-corrutpion.ll
index 7191e0453a66..e293bd606d0b 100644
--- a/test/CodeGen/X86/rrlist-livereg-corrutpion.ll
+++ b/test/CodeGen/X86/rrlist-livereg-corrutpion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK-LABEL: test
define i64 @test(i64 %a, i256 %b, i1 %c) {
diff --git a/test/CodeGen/X86/rtm-schedule.ll b/test/CodeGen/X86/rtm-schedule.ll
new file mode 100644
index 000000000000..c29eb5459910
--- /dev/null
+++ b/test/CodeGen/X86/rtm-schedule.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+rtm | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=SKL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=cannonlake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=CNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=icelake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE --check-prefix=ICL
+
+define i32 @test_xbegin() nounwind uwtable {
+; GENERIC-LABEL: test_xbegin:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xbegin .LBB0_2 # sched: [100:0.33]
+; GENERIC-NEXT: # %bb.1:
+; GENERIC-NEXT: movl $-1, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: .LBB0_2:
+; GENERIC-NEXT: # XABORT DEF # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKYLAKE-LABEL: test_xbegin:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xbegin .LBB0_2 # sched: [100:0.25]
+; SKYLAKE-NEXT: # %bb.1:
+; SKYLAKE-NEXT: movl $-1, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+; SKYLAKE-NEXT: .LBB0_2:
+; SKYLAKE-NEXT: # XABORT DEF # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+ %1 = tail call i32 @llvm.x86.xbegin() nounwind
+ ret i32 %1
+}
+declare i32 @llvm.x86.xbegin() nounwind
+
+define void @test_xend() nounwind uwtable {
+; GENERIC-LABEL: test_xend:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xend # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKYLAKE-LABEL: test_xend:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xend # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+ tail call void @llvm.x86.xend() nounwind
+ ret void
+}
+declare void @llvm.x86.xend() nounwind
+
+define void @test_xabort() nounwind uwtable {
+; GENERIC-LABEL: test_xabort:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xabort $2 # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SKYLAKE-LABEL: test_xabort:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: xabort $2 # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+ tail call void @llvm.x86.xabort(i8 2)
+ ret void
+}
+declare void @llvm.x86.xabort(i8) nounwind
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index a8562677c7bf..771e2344c008 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -9,18 +9,18 @@ declare void @f1()
define i32 @test_xbegin() nounwind uwtable {
; X86-LABEL: test_xbegin:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: xbegin .LBB0_2
-; X86-NEXT: # BB#1: # %entry
+; X86-NEXT: # %bb.1: # %entry
; X86-NEXT: movl $-1, %eax
; X86: .LBB0_2: # %entry
; X86-NEXT: # XABORT DEF
; X86-NEXT: retl
;
; X64-LABEL: test_xbegin:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xbegin .LBB0_2
-; X64-NEXT: # BB#1: # %entry
+; X64-NEXT: # %bb.1: # %entry
; X64-NEXT: movl $-1, %eax
; X64: .LBB0_2: # %entry
; X64-NEXT: # XABORT DEF
@@ -32,12 +32,12 @@ entry:
define void @test_xend() nounwind uwtable {
; X86-LABEL: test_xend:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: xend
; X86-NEXT: retl
;
; X64-LABEL: test_xend:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xend
; X64-NEXT: retq
entry:
@@ -47,12 +47,12 @@ entry:
define void @test_xabort() nounwind uwtable {
; X86-LABEL: test_xabort:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: xabort $2
; X86-NEXT: retl
;
; X64-LABEL: test_xabort:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: xabort $2
; X64-NEXT: retq
entry:
@@ -62,15 +62,14 @@ entry:
define void @f2(i32 %x) nounwind uwtable {
; X86-LABEL: f2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: xabort $1
; X86-NEXT: calll f1
; X86-NEXT: retl
;
; X64-LABEL: f2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rax
-; X64-NEXT: .Lcfi0:
; X64-NEXT: .cfi_def_cfa_offset 16
; X64-NEXT: movl %edi, {{[0-9]+}}(%rsp)
; X64-NEXT: xabort $1
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 6a565a5c76f0..3524c4aab1d7 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -9,7 +9,7 @@
define i32 @sad_16i8() nounwind {
; SSE2-LABEL: sad_16i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm1, %xmm1
@@ -22,7 +22,7 @@ define i32 @sad_16i8() nounwind {
; SSE2-NEXT: paddd %xmm3, %xmm1
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB0_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: paddd %xmm0, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -34,20 +34,19 @@ define i32 @sad_16i8() nounwind {
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_16i8:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB0_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
; AVX2-NEXT: vmovdqu a+1024(%rax), %xmm2
; AVX2-NEXT: vpsadbw b+1024(%rax), %xmm2, %xmm2
-; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB0_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -59,52 +58,50 @@ define i32 @sad_16i8() nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_16i8:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB0_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512F-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
-; AVX512F-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB0_1
-; AVX512F-NEXT: # BB#2: # %middle.block
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: # %bb.2: # %middle.block
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_16i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB0_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqu a+1024(%rax), %xmm1
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB0_1
-; AVX512BW-NEXT: # BB#2: # %middle.block
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: # %bb.2: # %middle.block
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
@@ -148,7 +145,7 @@ middle.block:
define i32 @sad_32i8() nounwind {
; SSE2-LABEL: sad_32i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pxor %xmm13, %xmm13
@@ -264,7 +261,7 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: paddd %xmm8, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm15, %xmm6
; SSE2-NEXT: paddd %xmm0, %xmm3
; SSE2-NEXT: paddd %xmm6, %xmm3
@@ -280,10 +277,10 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_32i8:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB1_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -292,7 +289,7 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB1_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -306,56 +303,54 @@ define i32 @sad_32i8() nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_32i8:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB1_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
-; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB1_1
-; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_32i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB1_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512BW-NEXT: vmovdqa a+1024(%rax), %ymm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %ymm2, %ymm2
-; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB1_1
-; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
@@ -401,7 +396,7 @@ middle.block:
define i32 @sad_avx64i8() nounwind {
; SSE2-LABEL: sad_avx64i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: subq $200, %rsp
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
@@ -658,7 +653,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm3, %xmm8
@@ -683,16 +678,16 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_avx64i8:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
-; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
-; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6
-; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5
-; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7
; AVX2-NEXT: .p2align 4, 0x90
; AVX2-NEXT: .LBB2_1: # %vector.body
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -741,7 +736,7 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB2_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2
@@ -759,12 +754,12 @@ define i32 @sad_avx64i8() nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_avx64i8:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: .p2align 4, 0x90
; AVX512F-NEXT: .LBB2_1: # %vector.body
; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
@@ -790,46 +785,46 @@ define i32 @sad_avx64i8() nounwind {
; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB2_1
-; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_avx64i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: .p2align 4, 0x90
; AVX512BW-NEXT: .LBB2_1: # %vector.body
; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512BW-NEXT: vmovdqu8 a+1024(%rax), %zmm2
+; AVX512BW-NEXT: vmovdqa64 a+1024(%rax), %zmm2
; AVX512BW-NEXT: vpsadbw b+1024(%rax), %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB2_1
-; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddd %zmm0, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,0,1,0,1,0,1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
; AVX512BW-NEXT: vzeroupper
@@ -877,7 +872,7 @@ middle.block:
define i32 @sad_2i8() nounwind {
; SSE2-LABEL: sad_2i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: movl $65535, %ecx # imm = 0xFFFF
@@ -893,14 +888,14 @@ define i32 @sad_2i8() nounwind {
; SSE2-NEXT: paddq %xmm2, %xmm0
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB3_1
-; SSE2-NEXT: # BB#2: # %middle.block
+; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT: paddq %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_2i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -915,14 +910,14 @@ define i32 @sad_2i8() nounwind {
; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX2-NEXT: addq $4, %rax
; AVX2-NEXT: jne .LBB3_1
-; AVX2-NEXT: # BB#2: # %middle.block
+; AVX2-NEXT: # %bb.2: # %middle.block
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX2-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_2i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -937,14 +932,14 @@ define i32 @sad_2i8() nounwind {
; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: addq $4, %rax
; AVX512F-NEXT: jne .LBB3_1
-; AVX512F-NEXT: # BB#2: # %middle.block
+; AVX512F-NEXT: # %bb.2: # %middle.block
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_2i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -959,7 +954,7 @@ define i32 @sad_2i8() nounwind {
; AVX512BW-NEXT: vpaddq %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: addq $4, %rax
; AVX512BW-NEXT: jne .LBB3_1
-; AVX512BW-NEXT: # BB#2: # %middle.block
+; AVX512BW-NEXT: # %bb.2: # %middle.block
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX512BW-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, %eax
@@ -997,7 +992,7 @@ middle.block:
define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_4i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -1005,7 +1000,7 @@ define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* noca
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_4i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -1013,7 +1008,7 @@ define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* noca
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_4i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -1021,7 +1016,7 @@ define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* noca
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_4i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -1045,7 +1040,7 @@ define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* noca
define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_8i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -1053,7 +1048,7 @@ define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* noca
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_8i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -1061,7 +1056,7 @@ define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* noca
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_8i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -1069,7 +1064,7 @@ define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* noca
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_8i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -1095,7 +1090,7 @@ define i32 @sad_nonloop_8i8(<8 x i8>* nocapture readonly %p, i64, <8 x i8>* noca
define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu (%rdx), %xmm1
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -1105,7 +1100,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %xmm0
; AVX2-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1114,7 +1109,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
; AVX512F-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1123,7 +1118,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; AVX512BW-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1152,7 +1147,7 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
; SSE2-LABEL: sad_nonloop_32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
; SSE2-NEXT: movdqu 16(%rdi), %xmm12
; SSE2-NEXT: pxor %xmm1, %xmm1
@@ -1249,7 +1244,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad_nonloop_32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -1261,7 +1256,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad_nonloop_32i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -1273,7 +1268,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sad_nonloop_32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT: vpsadbw (%rdx), %ymm0, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
diff --git a/test/CodeGen/X86/sad_variations.ll b/test/CodeGen/X86/sad_variations.ll
index 04fda5ed8774..cea86091a2bb 100644
--- a/test/CodeGen/X86/sad_variations.ll
+++ b/test/CodeGen/X86/sad_variations.ll
@@ -5,7 +5,7 @@
define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
; SSE2-LABEL: sad8_32bit_icmp_sge:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -13,7 +13,7 @@ define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_sge:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -21,7 +21,7 @@ define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture reado
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_sge:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -55,7 +55,7 @@ for.body: ; preds = %entry
define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
; SSE2-LABEL: sad8_32bit_icmp_sgt:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -63,7 +63,7 @@ define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_sgt:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -71,7 +71,7 @@ define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture reado
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_sgt:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -104,7 +104,7 @@ for.body: ; preds = %entry
define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
; SSE2-LABEL: sad8_32bit_icmp_sle:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -112,7 +112,7 @@ define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_sle:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -120,7 +120,7 @@ define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture reado
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_sle:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -153,7 +153,7 @@ for.body: ; preds = %entry
define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
; SSE2-LABEL: sad8_32bit_icmp_slt:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -161,7 +161,7 @@ define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture reado
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_32bit_icmp_slt:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -169,7 +169,7 @@ define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture reado
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_32bit_icmp_slt:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -202,7 +202,7 @@ for.body: ; preds = %entry
define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -210,7 +210,7 @@ define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -218,7 +218,7 @@ define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -251,7 +251,7 @@ for.body: ; preds = %entry
define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -259,7 +259,7 @@ define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -267,7 +267,7 @@ define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -300,7 +300,7 @@ for.body: ; preds = %entry
define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: psadbw %xmm0, %xmm1
@@ -308,7 +308,7 @@ define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* noca
; SSE2-NEXT: retq
;
; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
@@ -316,7 +316,7 @@ define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* noca
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512F-NEXT: vpsadbw %xmm0, %xmm1, %xmm0
diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll
index 8570fe7fe7ba..7e6272998f35 100644
--- a/test/CodeGen/X86/sandybridge-loads.ll
+++ b/test/CodeGen/X86/sandybridge-loads.ll
@@ -3,7 +3,7 @@
define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
; CHECK-LABEL: wideloads:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %xmm0
; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; CHECK-NEXT: vmovaps (%rsi), %ymm1
@@ -28,7 +28,7 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
; CHECK-LABEL: widestores:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps (%rdi), %ymm0
; CHECK-NEXT: vmovaps (%rsi), %ymm1
; CHECK-NEXT: vmovaps %ymm0, (%rsi)
diff --git a/test/CodeGen/X86/sar_fold.ll b/test/CodeGen/X86/sar_fold.ll
index bd0d0c7057d3..195d0745b3ff 100644
--- a/test/CodeGen/X86/sar_fold.ll
+++ b/test/CodeGen/X86/sar_fold.ll
@@ -2,7 +2,7 @@
define i32 @shl16sar15(i32 %a) #0 {
; CHECK-LABEL: shl16sar15:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movswl {{[0-9]+}}(%esp), %eax
%1 = shl i32 %a, 16
%2 = ashr exact i32 %1, 15
@@ -11,7 +11,7 @@ define i32 @shl16sar15(i32 %a) #0 {
define i32 @shl16sar17(i32 %a) #0 {
; CHECK-LABEL: shl16sar17:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movswl {{[0-9]+}}(%esp), %eax
%1 = shl i32 %a, 16
%2 = ashr exact i32 %1, 17
@@ -20,7 +20,7 @@ define i32 @shl16sar17(i32 %a) #0 {
define i32 @shl24sar23(i32 %a) #0 {
; CHECK-LABEL: shl24sar23:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsbl {{[0-9]+}}(%esp), %eax
%1 = shl i32 %a, 24
%2 = ashr exact i32 %1, 23
@@ -29,7 +29,7 @@ define i32 @shl24sar23(i32 %a) #0 {
define i32 @shl24sar25(i32 %a) #0 {
; CHECK-LABEL: shl24sar25:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsbl {{[0-9]+}}(%esp), %eax
%1 = shl i32 %a, 24
%2 = ashr exact i32 %1, 25
diff --git a/test/CodeGen/X86/sar_fold64.ll b/test/CodeGen/X86/sar_fold64.ll
index 66ad8c3f40fa..2c6229a0dec6 100644
--- a/test/CodeGen/X86/sar_fold64.ll
+++ b/test/CodeGen/X86/sar_fold64.ll
@@ -3,10 +3,10 @@
define i32 @shl48sar47(i64 %a) #0 {
; CHECK-LABEL: shl48sar47:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movswq %di, %rax
; CHECK-NEXT: addl %eax, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: # kill: def %eax killed %eax killed %rax
; CHECK-NEXT: retq
%1 = shl i64 %a, 48
%2 = ashr exact i64 %1, 47
@@ -16,10 +16,10 @@ define i32 @shl48sar47(i64 %a) #0 {
define i32 @shl48sar49(i64 %a) #0 {
; CHECK-LABEL: shl48sar49:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movswq %di, %rax
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: # kill: def %eax killed %eax killed %rax
; CHECK-NEXT: retq
%1 = shl i64 %a, 48
%2 = ashr exact i64 %1, 49
@@ -29,10 +29,10 @@ define i32 @shl48sar49(i64 %a) #0 {
define i32 @shl56sar55(i64 %a) #0 {
; CHECK-LABEL: shl56sar55:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsbq %dil, %rax
; CHECK-NEXT: addl %eax, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: # kill: def %eax killed %eax killed %rax
; CHECK-NEXT: retq
%1 = shl i64 %a, 56
%2 = ashr exact i64 %1, 55
@@ -42,10 +42,10 @@ define i32 @shl56sar55(i64 %a) #0 {
define i32 @shl56sar57(i64 %a) #0 {
; CHECK-LABEL: shl56sar57:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsbq %dil, %rax
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT: # kill: def %eax killed %eax killed %rax
; CHECK-NEXT: retq
%1 = shl i64 %a, 56
%2 = ashr exact i64 %1, 57
@@ -55,7 +55,7 @@ define i32 @shl56sar57(i64 %a) #0 {
define i8 @all_sign_bit_ashr(i8 %x) {
; CHECK-LABEL: all_sign_bit_ashr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andb $1, %dil
; CHECK-NEXT: negb %dil
; CHECK-NEXT: movl %edi, %eax
@@ -68,7 +68,7 @@ define i8 @all_sign_bit_ashr(i8 %x) {
define <4 x i32> @all_sign_bit_ashr_vec(<4 x i32> %x) {
; CHECK-LABEL: all_sign_bit_ashr_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: psubd %xmm0, %xmm1
diff --git a/test/CodeGen/X86/sbb.ll b/test/CodeGen/X86/sbb.ll
index b6e8ebf6ed06..bd4a62f21699 100644
--- a/test/CodeGen/X86/sbb.ll
+++ b/test/CodeGen/X86/sbb.ll
@@ -7,7 +7,7 @@
define i8 @i8_select_0_or_neg1(i8 %x) {
; CHECK-LABEL: i8_select_0_or_neg1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: negb %dil
; CHECK-NEXT: sbbb %al, %al
; CHECK-NEXT: retq
@@ -20,7 +20,7 @@ define i8 @i8_select_0_or_neg1(i8 %x) {
define i16 @i16_select_0_or_neg1_as_math(i16 %x) {
; CHECK-LABEL: i16_select_0_or_neg1_as_math:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: negw %di
; CHECK-NEXT: sbbw %ax, %ax
; CHECK-NEXT: retq
@@ -34,7 +34,7 @@ define i16 @i16_select_0_or_neg1_as_math(i16 %x) {
define i32 @i32_select_0_or_neg1_commuted(i32 %x) {
; CHECK-LABEL: i32_select_0_or_neg1_commuted:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: negl %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
@@ -47,7 +47,7 @@ define i32 @i32_select_0_or_neg1_commuted(i32 %x) {
define i64 @i64_select_0_or_neg1_commuted_as_math(i64 %x) {
; CHECK-LABEL: i64_select_0_or_neg1_commuted_as_math:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: negq %rdi
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: retq
@@ -61,7 +61,7 @@ define i64 @i64_select_0_or_neg1_commuted_as_math(i64 %x) {
define i64 @i64_select_neg1_or_0(i64 %x) {
; CHECK-LABEL: i64_select_neg1_or_0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpq $1, %rdi
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: retq
@@ -74,7 +74,7 @@ define i64 @i64_select_neg1_or_0(i64 %x) {
define i32 @i32_select_neg1_or_0_as_math(i32 %x) {
; CHECK-LABEL: i32_select_neg1_or_0_as_math:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl $1, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
@@ -88,7 +88,7 @@ define i32 @i32_select_neg1_or_0_as_math(i32 %x) {
define i16 @i16_select_neg1_or_0_commuted(i16 %x) {
; CHECK-LABEL: i16_select_neg1_or_0_commuted:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpw $1, %di
; CHECK-NEXT: sbbw %ax, %ax
; CHECK-NEXT: retq
@@ -101,7 +101,7 @@ define i16 @i16_select_neg1_or_0_commuted(i16 %x) {
define i8 @i8_select_neg1_or_0_commuted_as_math(i8 %x) {
; CHECK-LABEL: i8_select_neg1_or_0_commuted_as_math:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpb $1, %dil
; CHECK-NEXT: sbbb %al, %al
; CHECK-NEXT: retq
@@ -115,7 +115,7 @@ define i8 @i8_select_neg1_or_0_commuted_as_math(i8 %x) {
define i32 @ult_select_neg1_or_0(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ult_select_neg1_or_0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
@@ -129,11 +129,9 @@ define i32 @ult_select_neg1_or_0(i32 %x, i32 %y) nounwind {
define i32 @ugt_select_neg1_or_0(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ugt_select_neg1_or_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: cmpl %edi, %esi
-; CHECK-NEXT: movl $-1, %eax
-; CHECK-NEXT: cmovbel %ecx, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp ugt i32 %y, %x
%ext = sext i1 %cmp to i32
@@ -145,7 +143,7 @@ define i32 @ugt_select_neg1_or_0(i32 %x, i32 %y) nounwind {
define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: uge_select_0_or_neg1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
@@ -160,7 +158,7 @@ define i32 @uge_select_0_or_neg1(i32 %x, i32 %y) nounwind {
define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: ule_select_0_or_neg1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
@@ -175,7 +173,7 @@ define i32 @ule_select_0_or_neg1(i32 %x, i32 %y) nounwind {
define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: uge_select_0_or_neg1_sub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: retq
@@ -190,7 +188,7 @@ define i32 @uge_select_0_or_neg1_sub(i32 %x, i32 %y) nounwind {
define i64 @ugt_select_neg1_or_0_sub(i64 %x, i64 %y) nounwind {
; CHECK-LABEL: ugt_select_neg1_or_0_sub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpq %rdi, %rsi
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: retq
@@ -205,7 +203,7 @@ define i64 @ugt_select_neg1_or_0_sub(i64 %x, i64 %y) nounwind {
define i16 @ult_select_neg1_or_0_sub(i16 %x, i16 %y) nounwind {
; CHECK-LABEL: ult_select_neg1_or_0_sub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: cmpw %di, %si
; CHECK-NEXT: sbbw %ax, %ax
; CHECK-NEXT: retq
@@ -222,7 +220,7 @@ define i16 @ult_select_neg1_or_0_sub(i16 %x, i16 %y) nounwind {
define void @PR33560(i8 %x, i64 %y) {
; CHECK-LABEL: PR33560:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: negb %dil
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: cmpq %rsi, %rax
diff --git a/test/CodeGen/X86/scalar-extract.ll b/test/CodeGen/X86/scalar-extract.ll
index b8ef5e74c436..dd1b9a55eafc 100644
--- a/test/CodeGen/X86/scalar-extract.ll
+++ b/test/CodeGen/X86/scalar-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx -o %t
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx -o %t
; RUN: not grep movq %t
; Check that widening doesn't introduce a mmx register in this case when
diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll
index c99d3494b8ee..66cc628ad5e6 100644
--- a/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -11,7 +11,7 @@
define float @u32_to_f(i32 %a) nounwind {
; AVX512_32-LABEL: u32_to_f:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %eax
; AVX512_32-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512_32-NEXT: vmovss %xmm0, (%esp)
@@ -20,12 +20,12 @@ define float @u32_to_f(i32 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: u32_to_f:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: u32_to_f:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %eax
; SSE2_32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2_32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -39,13 +39,13 @@ define float @u32_to_f(i32 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: u32_to_f:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: movl %edi, %eax
; SSE2_64-NEXT: cvtsi2ssq %rax, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: u32_to_f:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -63,7 +63,7 @@ define float @u32_to_f(i32 %a) nounwind {
define float @s32_to_f(i32 %a) nounwind {
; AVX512_32-LABEL: s32_to_f:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %eax
; AVX512_32-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; AVX512_32-NEXT: vmovss %xmm0, (%esp)
@@ -72,12 +72,12 @@ define float @s32_to_f(i32 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: s32_to_f:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: s32_to_f:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %eax
; SSE2_32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
; SSE2_32-NEXT: movss %xmm0, (%esp)
@@ -86,12 +86,12 @@ define float @s32_to_f(i32 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: s32_to_f:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: cvtsi2ssl %edi, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: s32_to_f:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %eax
; X87-NEXT: movl {{[0-9]+}}(%esp), %eax
; X87-NEXT: movl %eax, (%esp)
@@ -104,7 +104,7 @@ define float @s32_to_f(i32 %a) nounwind {
define double @u32_to_d(i32 %a) nounwind {
; AVX512_32-LABEL: u32_to_d:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -117,12 +117,12 @@ define double @u32_to_d(i32 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: u32_to_d:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: u32_to_d:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -138,13 +138,13 @@ define double @u32_to_d(i32 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: u32_to_d:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: movl %edi, %eax
; SSE2_64-NEXT: cvtsi2sdq %rax, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: u32_to_d:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -162,7 +162,7 @@ define double @u32_to_d(i32 %a) nounwind {
define double @s32_to_d(i32 %a) nounwind {
; AVX512_32-LABEL: s32_to_d:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -175,12 +175,12 @@ define double @s32_to_d(i32 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: s32_to_d:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: s32_to_d:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -193,12 +193,12 @@ define double @s32_to_d(i32 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: s32_to_d:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: cvtsi2sdl %edi, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: s32_to_d:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %eax
; X87-NEXT: movl {{[0-9]+}}(%esp), %eax
; X87-NEXT: movl %eax, (%esp)
@@ -211,7 +211,7 @@ define double @s32_to_d(i32 %a) nounwind {
define x86_fp80 @u32_to_x(i32 %a) nounwind {
; AVX512_32-LABEL: u32_to_x:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -227,7 +227,7 @@ define x86_fp80 @u32_to_x(i32 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: u32_to_x:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512_64-NEXT: vmovd %edi, %xmm1
; AVX512_64-NEXT: vpor %xmm0, %xmm1, %xmm1
@@ -237,7 +237,7 @@ define x86_fp80 @u32_to_x(i32 %a) nounwind {
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: u32_to_x:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -253,14 +253,14 @@ define x86_fp80 @u32_to_x(i32 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: u32_to_x:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: movl %edi, %eax
; SSE2_64-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE2_64-NEXT: fildll -{{[0-9]+}}(%rsp)
; SSE2_64-NEXT: retq
;
; X87-LABEL: u32_to_x:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -278,7 +278,7 @@ define x86_fp80 @u32_to_x(i32 %a) nounwind {
define x86_fp80 @s32_to_x(i32 %a) nounwind {
; CHECK32-LABEL: s32_to_x:
-; CHECK32: # BB#0:
+; CHECK32: # %bb.0:
; CHECK32-NEXT: pushl %eax
; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT: movl %eax, (%esp)
@@ -287,7 +287,7 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind {
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: s32_to_x:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; CHECK64-NEXT: fildl -{{[0-9]+}}(%rsp)
; CHECK64-NEXT: retq
@@ -297,7 +297,7 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind {
define float @u64_to_f(i64 %a) nounwind {
; AVX512_32-LABEL: u64_to_f:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -318,12 +318,12 @@ define float @u64_to_f(i64 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: u64_to_f:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtusi2ssq %rdi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: u64_to_f:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -344,10 +344,10 @@ define float @u64_to_f(i64 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: u64_to_f:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: testq %rdi, %rdi
; SSE2_64-NEXT: js .LBB6_1
-; SSE2_64-NEXT: # BB#2:
+; SSE2_64-NEXT: # %bb.2:
; SSE2_64-NEXT: cvtsi2ssq %rdi, %xmm0
; SSE2_64-NEXT: retq
; SSE2_64-NEXT: .LBB6_1:
@@ -360,7 +360,7 @@ define float @u64_to_f(i64 %a) nounwind {
; SSE2_64-NEXT: retq
;
; X87-LABEL: u64_to_f:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -385,7 +385,7 @@ define float @u64_to_f(i64 %a) nounwind {
define float @s64_to_f(i64 %a) nounwind {
; AVX512_32-LABEL: s64_to_f:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %eax
; AVX512_32-NEXT: fildll {{[0-9]+}}(%esp)
; AVX512_32-NEXT: fstps (%esp)
@@ -394,12 +394,12 @@ define float @s64_to_f(i64 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: s64_to_f:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: s64_to_f:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %eax
; SSE2_32-NEXT: fildll {{[0-9]+}}(%esp)
; SSE2_32-NEXT: fstps (%esp)
@@ -408,12 +408,12 @@ define float @s64_to_f(i64 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: s64_to_f:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: cvtsi2ssq %rdi, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: s64_to_f:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: fildll {{[0-9]+}}(%esp)
; X87-NEXT: retl
%r = sitofp i64 %a to float
@@ -422,7 +422,7 @@ define float @s64_to_f(i64 %a) nounwind {
define float @s64_to_f_2(i64 %a) nounwind {
; AVX512_32-LABEL: s64_to_f_2:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -442,13 +442,13 @@ define float @s64_to_f_2(i64 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: s64_to_f_2:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: addq $5, %rdi
; AVX512_64-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: s64_to_f_2:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -469,13 +469,13 @@ define float @s64_to_f_2(i64 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: s64_to_f_2:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: addq $5, %rdi
; SSE2_64-NEXT: cvtsi2ssq %rdi, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: s64_to_f_2:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -497,13 +497,13 @@ define float @s64_to_f_2(i64 %a) nounwind {
define double @u64_to_d(i64 %a) nounwind {
; AVX512_32-LABEL: u64_to_d:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
; AVX512_32-NEXT: subl $8, %esp
-; AVX512_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512_32-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512_32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512_32-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512_32-NEXT: vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
; AVX512_32-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX512_32-NEXT: vmovlpd %xmm0, (%esp)
@@ -513,18 +513,18 @@ define double @u64_to_d(i64 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: u64_to_d:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtusi2sdq %rdi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: u64_to_d:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
; SSE2_32-NEXT: subl $8, %esp
-; SSE2_32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2_32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2_32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE2_32-NEXT: subpd {{\.LCPI.*}}, %xmm0
; SSE2_32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2_32-NEXT: addpd %xmm0, %xmm1
@@ -535,7 +535,7 @@ define double @u64_to_d(i64 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: u64_to_d:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: movq %rdi, %xmm1
; SSE2_64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE2_64-NEXT: subpd {{.*}}(%rip), %xmm1
@@ -544,7 +544,7 @@ define double @u64_to_d(i64 %a) nounwind {
; SSE2_64-NEXT: retq
;
; X87-LABEL: u64_to_d:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -569,7 +569,7 @@ define double @u64_to_d(i64 %a) nounwind {
define double @s64_to_d(i64 %a) nounwind {
; AVX512_32-LABEL: s64_to_d:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -582,12 +582,12 @@ define double @s64_to_d(i64 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: s64_to_d:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: s64_to_d:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -600,12 +600,12 @@ define double @s64_to_d(i64 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: s64_to_d:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: cvtsi2sdq %rdi, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: s64_to_d:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: fildll {{[0-9]+}}(%esp)
; X87-NEXT: retl
%r = sitofp i64 %a to double
@@ -614,7 +614,7 @@ define double @s64_to_d(i64 %a) nounwind {
define double @s64_to_d_2(i64 %a) nounwind {
; AVX512_32-LABEL: s64_to_d_2:
-; AVX512_32: # BB#0:
+; AVX512_32: # %bb.0:
; AVX512_32-NEXT: pushl %ebp
; AVX512_32-NEXT: movl %esp, %ebp
; AVX512_32-NEXT: andl $-8, %esp
@@ -634,13 +634,13 @@ define double @s64_to_d_2(i64 %a) nounwind {
; AVX512_32-NEXT: retl
;
; AVX512_64-LABEL: s64_to_d_2:
-; AVX512_64: # BB#0:
+; AVX512_64: # %bb.0:
; AVX512_64-NEXT: addq $5, %rdi
; AVX512_64-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
; AVX512_64-NEXT: retq
;
; SSE2_32-LABEL: s64_to_d_2:
-; SSE2_32: # BB#0:
+; SSE2_32: # %bb.0:
; SSE2_32-NEXT: pushl %ebp
; SSE2_32-NEXT: movl %esp, %ebp
; SSE2_32-NEXT: andl $-8, %esp
@@ -661,13 +661,13 @@ define double @s64_to_d_2(i64 %a) nounwind {
; SSE2_32-NEXT: retl
;
; SSE2_64-LABEL: s64_to_d_2:
-; SSE2_64: # BB#0:
+; SSE2_64: # %bb.0:
; SSE2_64-NEXT: addq $5, %rdi
; SSE2_64-NEXT: cvtsi2sdq %rdi, %xmm0
; SSE2_64-NEXT: retq
;
; X87-LABEL: s64_to_d_2:
-; X87: # BB#0:
+; X87: # %bb.0:
; X87-NEXT: pushl %ebp
; X87-NEXT: movl %esp, %ebp
; X87-NEXT: andl $-8, %esp
@@ -689,7 +689,7 @@ define double @s64_to_d_2(i64 %a) nounwind {
define x86_fp80 @u64_to_x(i64 %a) nounwind {
; CHECK32-LABEL: u64_to_x:
-; CHECK32: # BB#0:
+; CHECK32: # %bb.0:
; CHECK32-NEXT: pushl %ebp
; CHECK32-NEXT: movl %esp, %ebp
; CHECK32-NEXT: andl $-8, %esp
@@ -708,7 +708,7 @@ define x86_fp80 @u64_to_x(i64 %a) nounwind {
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: u64_to_x:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK64-NEXT: xorl %eax, %eax
; CHECK64-NEXT: testq %rdi, %rdi
@@ -722,12 +722,12 @@ define x86_fp80 @u64_to_x(i64 %a) nounwind {
define x86_fp80 @s64_to_x(i64 %a) nounwind {
; CHECK32-LABEL: s64_to_x:
-; CHECK32: # BB#0:
+; CHECK32: # %bb.0:
; CHECK32-NEXT: fildll {{[0-9]+}}(%esp)
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: s64_to_x:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK64-NEXT: fildll -{{[0-9]+}}(%rsp)
; CHECK64-NEXT: retq
diff --git a/test/CodeGen/X86/scalar_sse_minmax.ll b/test/CodeGen/X86/scalar_sse_minmax.ll
index 5ca3f85ce029..6a619f7f2c88 100644
--- a/test/CodeGen/X86/scalar_sse_minmax.ll
+++ b/test/CodeGen/X86/scalar_sse_minmax.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+sse2 | FileCheck %s
define float @min1(float %x, float %y) {
; CHECK-LABEL: min1
diff --git a/test/CodeGen/X86/scalar_widen_div.ll b/test/CodeGen/X86/scalar_widen_div.ll
index 1671f8f89108..13e01b23ed38 100644
--- a/test/CodeGen/X86/scalar_widen_div.ll
+++ b/test/CodeGen/X86/scalar_widen_div.ll
@@ -1,14 +1,34 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s
; Verify when widening a divide/remainder operation, we only generate a
; divide/rem per element since divide/remainder can trap.
; CHECK: vectorDiv
define void @vectorDiv (<2 x i32> addrspace(1)* %nsource, <2 x i32> addrspace(1)* %dsource, <2 x i32> addrspace(1)* %qdest) nounwind {
-; CHECK: idivq
-; CHECK: idivq
-; CHECK-NOT: idivl
-; CHECK: ret
+; CHECK-LABEL: vectorDiv:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rdx, %r8
+; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movslq -{{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: pmovsxdq (%rdi,%rcx,8), %xmm0
+; CHECK-NEXT: pmovsxdq (%rsi,%rcx,8), %xmm1
+; CHECK-NEXT: pextrq $1, %xmm0, %rax
+; CHECK-NEXT: pextrq $1, %xmm1, %rsi
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: movq %rax, %xmm2
+; CHECK-NEXT: movq %xmm0, %rax
+; CHECK-NEXT: movq %xmm1, %rsi
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rsi
+; CHECK-NEXT: movq %rax, %xmm0
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: movq %xmm0, (%r8,%rcx,8)
+; CHECK-NEXT: retq
entry:
%nsource.addr = alloca <2 x i32> addrspace(1)*, align 4
%dsource.addr = alloca <2 x i32> addrspace(1)*, align 4
@@ -35,117 +55,310 @@ entry:
; CHECK: test_char_div
define <3 x i8> @test_char_div(<3 x i8> %num, <3 x i8> %div) {
-; CHECK: idivb
-; CHECK: idivb
-; CHECK: idivb
-; CHECK-NOT: idivb
-; CHECK: ret
+; CHECK-LABEL: test_char_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: idivb %cl
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: idivb %r8b
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: cbtw
+; CHECK-NEXT: idivb %r9b
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: retq
%div.r = sdiv <3 x i8> %num, %div
ret <3 x i8> %div.r
}
; CHECK: test_uchar_div
define <3 x i8> @test_uchar_div(<3 x i8> %num, <3 x i8> %div) {
-; CHECK: divb
-; CHECK: divb
-; CHECK: divb
-; CHECK-NOT: divb
-; CHECK: ret
+; CHECK-LABEL: test_uchar_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
+; CHECK-NEXT: divb %cl
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movzbl %sil, %eax
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
+; CHECK-NEXT: divb %r8b
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: movzbl %dl, %eax
+; CHECK-NEXT: # kill: def %eax killed %eax def %ax
+; CHECK-NEXT: divb %r9b
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movl %esi, %edx
+; CHECK-NEXT: retq
%div.r = udiv <3 x i8> %num, %div
ret <3 x i8> %div.r
}
; CHECK: test_short_div
define <5 x i16> @test_short_div(<5 x i16> %num, <5 x i16> %div) {
-; CHECK: idivw
-; CHECK: idivw
-; CHECK: idivw
-; CHECK: idivw
-; CHECK: idivw
-; CHECK-NOT: idivw
-; CHECK: ret
+; CHECK-LABEL: test_short_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextrw $4, %xmm0, %eax
+; CHECK-NEXT: pextrw $4, %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %eax, %r8d
+; CHECK-NEXT: pextrw $3, %xmm0, %eax
+; CHECK-NEXT: pextrw $3, %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %eax, %r9d
+; CHECK-NEXT: pextrw $2, %xmm0, %eax
+; CHECK-NEXT: pextrw $2, %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %eax, %edi
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-NEXT: pextrw $1, %xmm1, %esi
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %si
+; CHECK-NEXT: # kill: def %ax killed %ax def %eax
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pinsrw $1, %eax, %xmm0
+; CHECK-NEXT: pinsrw $2, %edi, %xmm0
+; CHECK-NEXT: pinsrw $3, %r9d, %xmm0
+; CHECK-NEXT: pinsrw $4, %r8d, %xmm0
+; CHECK-NEXT: retq
%div.r = sdiv <5 x i16> %num, %div
ret <5 x i16> %div.r
}
; CHECK: test_ushort_div
define <4 x i16> @test_ushort_div(<4 x i16> %num, <4 x i16> %div) {
-; CHECK: divl
-; CHECK: divl
-; CHECK: divl
-; CHECK: divl
-; CHECK-NOT: divl
-; CHECK: ret
+; CHECK-LABEL: test_ushort_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pxor %xmm2, %xmm2
+; CHECK-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: movd %eax, %xmm2
+; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: pinsrd $2, %eax, %xmm2
+; CHECK-NEXT: pextrd $3, %xmm0, %eax
+; CHECK-NEXT: pextrd $3, %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: pinsrd $3, %eax, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%div.r = udiv <4 x i16> %num, %div
ret <4 x i16> %div.r
}
; CHECK: test_uint_div
define <3 x i32> @test_uint_div(<3 x i32> %num, <3 x i32> %div) {
-; CHECK: divl
-; CHECK: divl
-; CHECK: divl
-; CHECK-NOT: divl
-; CHECK: ret
+; CHECK-LABEL: test_uint_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %esi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %esi
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %edi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %edi
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: pinsrd $1, %esi, %xmm0
+; CHECK-NEXT: pinsrd $2, %ecx, %xmm0
+; CHECK-NEXT: retq
%div.r = udiv <3 x i32> %num, %div
ret <3 x i32> %div.r
}
; CHECK: test_long_div
define <3 x i64> @test_long_div(<3 x i64> %num, <3 x i64> %div) {
-; CHECK: idivq
-; CHECK: idivq
-; CHECK: idivq
-; CHECK-NOT: idivq
-; CHECK: ret
+; CHECK-LABEL: test_long_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdx, %r10
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %rcx
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %r8
+; CHECK-NEXT: movq %rax, %rsi
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: cqto
+; CHECK-NEXT: idivq %r9
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: retq
%div.r = sdiv <3 x i64> %num, %div
ret <3 x i64> %div.r
}
; CHECK: test_ulong_div
define <3 x i64> @test_ulong_div(<3 x i64> %num, <3 x i64> %div) {
-; CHECK: divq
-; CHECK: divq
-; CHECK: divq
-; CHECK-NOT: divq
-; CHECK: ret
+; CHECK-LABEL: test_ulong_div:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdx, %r10
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: divq %rcx
+; CHECK-NEXT: movq %rax, %rcx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: divq %r8
+; CHECK-NEXT: movq %rax, %rsi
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %r10, %rax
+; CHECK-NEXT: divq %r9
+; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: movq %rdi, %rcx
+; CHECK-NEXT: retq
%div.r = udiv <3 x i64> %num, %div
ret <3 x i64> %div.r
}
; CHECK: test_char_rem
define <4 x i8> @test_char_rem(<4 x i8> %num, <4 x i8> %rem) {
-; CHECK: idivl
-; CHECK: idivl
-; CHECK: idivl
-; CHECK: idivl
-; CHECK-NOT: idivl
-; CHECK: ret
+; CHECK-LABEL: test_char_rem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pslld $24, %xmm1
+; CHECK-NEXT: psrad $24, %xmm1
+; CHECK-NEXT: pslld $24, %xmm0
+; CHECK-NEXT: psrad $24, %xmm0
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %esi
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: pinsrd $2, %edx, %xmm2
+; CHECK-NEXT: pextrd $3, %xmm0, %eax
+; CHECK-NEXT: pextrd $3, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: pinsrd $3, %edx, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%rem.r = srem <4 x i8> %num, %rem
ret <4 x i8> %rem.r
}
; CHECK: test_short_rem
define <5 x i16> @test_short_rem(<5 x i16> %num, <5 x i16> %rem) {
-; CHECK: idivw
-; CHECK: idivw
-; CHECK: idivw
-; CHECK: idivw
-; CHECK: idivw
-; CHECK-NOT: idivw
-; CHECK: ret
+; CHECK-LABEL: test_short_rem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextrw $4, %xmm0, %eax
+; CHECK-NEXT: pextrw $4, %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %edx, %r8d
+; CHECK-NEXT: pextrw $3, %xmm0, %eax
+; CHECK-NEXT: pextrw $3, %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %edx, %r9d
+; CHECK-NEXT: pextrw $2, %xmm0, %eax
+; CHECK-NEXT: pextrw $2, %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %cx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: pextrw $1, %xmm0, %eax
+; CHECK-NEXT: pextrw $1, %xmm1, %esi
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: cwtd
+; CHECK-NEXT: idivw %si
+; CHECK-NEXT: # kill: def %dx killed %dx def %edx
+; CHECK-NEXT: movd %ecx, %xmm0
+; CHECK-NEXT: pinsrw $1, %edx, %xmm0
+; CHECK-NEXT: pinsrw $2, %edi, %xmm0
+; CHECK-NEXT: pinsrw $3, %r9d, %xmm0
+; CHECK-NEXT: pinsrw $4, %r8d, %xmm0
+; CHECK-NEXT: retq
%rem.r = srem <5 x i16> %num, %rem
ret <5 x i16> %rem.r
}
; CHECK: test_uint_rem
define <4 x i32> @test_uint_rem(<4 x i32> %num, <4 x i32> %rem) {
-; CHECK: idivl
-; CHECK: idivl
-; CHECK: idivl
-; CHECK: idivl
-; CHECK-NOT: idivl
-; CHECK: ret
+; CHECK-LABEL: test_uint_rem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %esi
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %esi
+; CHECK-NEXT: movd %edx, %xmm2
+; CHECK-NEXT: pinsrd $1, %ecx, %xmm2
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: pinsrd $2, %edx, %xmm2
+; CHECK-NEXT: pextrd $3, %xmm0, %eax
+; CHECK-NEXT: pextrd $3, %xmm1, %ecx
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %ecx
+; CHECK-NEXT: pinsrd $3, %edx, %xmm2
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
%rem.r = srem <4 x i32> %num, %rem
ret <4 x i32> %rem.r
}
@@ -153,33 +366,84 @@ define <4 x i32> @test_uint_rem(<4 x i32> %num, <4 x i32> %rem) {
; CHECK: test_ulong_rem
define <5 x i64> @test_ulong_rem(<5 x i64> %num, <5 x i64> %rem) {
-; CHECK: divq
-; CHECK: divq
-; CHECK: divq
-; CHECK: divq
-; CHECK: divq
-; CHECK-NOT: divq
-; CHECK: ret
+; CHECK-LABEL: test_ulong_rem:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divq {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: divq {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %r8, %rax
+; CHECK-NEXT: divq {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, %xmm0
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: divq {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, %xmm2
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: movq %r9, %rax
+; CHECK-NEXT: divq {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdx, 32(%rdi)
+; CHECK-NEXT: movdqa %xmm2, 16(%rdi)
+; CHECK-NEXT: movdqa %xmm1, (%rdi)
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
%rem.r = urem <5 x i64> %num, %rem
ret <5 x i64> %rem.r
}
; CHECK: test_int_div
define void @test_int_div(<3 x i32>* %dest, <3 x i32>* %old, i32 %n) {
-; CHECK: idivl
-; CHECK: idivl
-; CHECK: idivl
-; CHECK-NOT: idivl
-; CHECK: ret
+; CHECK-LABEL: test_int_div:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movl %edx, %r9d
+; CHECK-NEXT: testl %r9d, %r9d
+; CHECK-NEXT: jle .LBB12_3
+; CHECK-NEXT: # %bb.1: # %bb.nph
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: .LBB12_2: # %for.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movdqa (%rdi,%rcx), %xmm0
+; CHECK-NEXT: movdqa (%rsi,%rcx), %xmm1
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: pextrd $1, %xmm1, %r8d
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %r8d
+; CHECK-NEXT: movl %eax, %r8d
+; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: movd %xmm1, %r10d
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %r10d
+; CHECK-NEXT: movd %eax, %xmm2
+; CHECK-NEXT: pinsrd $1, %r8d, %xmm2
+; CHECK-NEXT: pextrd $2, %xmm0, %eax
+; CHECK-NEXT: pextrd $2, %xmm1, %r8d
+; CHECK-NEXT: cltd
+; CHECK-NEXT: idivl %r8d
+; CHECK-NEXT: pinsrd $2, %eax, %xmm2
+; CHECK-NEXT: movl %eax, 8(%rdi,%rcx)
+; CHECK-NEXT: movq %xmm2, (%rdi,%rcx)
+; CHECK-NEXT: addq $16, %rcx
+; CHECK-NEXT: decl %r9d
+; CHECK-NEXT: jne .LBB12_2
+; CHECK-NEXT: .LBB12_3: # %for.end
+; CHECK-NEXT: retq
entry:
%cmp13 = icmp sgt i32 %n, 0
br i1 %cmp13, label %bb.nph, label %for.end
-bb.nph:
+bb.nph:
br label %for.body
for.body:
- %i.014 = phi i32 [ 0, %bb.nph ], [ %inc, %for.body ]
+ %i.014 = phi i32 [ 0, %bb.nph ], [ %inc, %for.body ]
%arrayidx11 = getelementptr <3 x i32>, <3 x i32>* %dest, i32 %i.014
%tmp4 = load <3 x i32>, <3 x i32>* %arrayidx11 ; <<3 x i32>> [#uses=1]
%arrayidx7 = getelementptr inbounds <3 x i32>, <3 x i32>* %old, i32 %i.014
@@ -187,7 +451,7 @@ for.body:
%div = sdiv <3 x i32> %tmp4, %tmp8
store <3 x i32> %div, <3 x i32>* %arrayidx11
%inc = add nsw i32 %i.014, 1
- %exitcond = icmp eq i32 %inc, %n
+ %exitcond = icmp eq i32 %inc, %n
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
diff --git a/test/CodeGen/X86/scalarize-bitcast.ll b/test/CodeGen/X86/scalarize-bitcast.ll
index 60650f46302f..25cfaa2f353d 100644
--- a/test/CodeGen/X86/scalarize-bitcast.ll
+++ b/test/CodeGen/X86/scalarize-bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s
; PR3886
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/scatter-schedule.ll b/test/CodeGen/X86/scatter-schedule.ll
new file mode 100644
index 000000000000..c7e6628ab2dc
--- /dev/null
+++ b/test/CodeGen/X86/scatter-schedule.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=skx < %s | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks the order of scatter operations after split.
+; The right order is "from LSB to MSB"; otherwise the semantics are broken.
+
+define void @test(i64 %x272, <16 x i32*> %x335, <16 x i32> %x270) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorw %k0, %k0, %k2
+; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k2}
+; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0
+; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %x270, <16 x i32*> %x335, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret void
+}
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32, <16 x i1> )
diff --git a/test/CodeGen/X86/schedule-x86_32.ll b/test/CodeGen/X86/schedule-x86_32.ll
new file mode 100644
index 000000000000..7a60301bd6ec
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_32.ll
@@ -0,0 +1,2332 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define i8 @test_aaa(i8 %a0) optsize {
+; GENERIC-LABEL: test_aaa:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aaa
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aaa:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aaa # sched: [13:6.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aaa:
+; SLM: # %bb.0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aaa # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aaa:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aaa # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aaa:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aaa # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aaa:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aaa # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aaa:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aaa # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aaa:
+; SKX: # %bb.0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aaa # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aaa:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aaa # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aaa:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aaa # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aaa", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define i8 @test_aad(i16 %a0) optsize {
+; GENERIC-LABEL: test_aad:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aad
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aad:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aad # sched: [7:3.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aad:
+; SLM: # %bb.0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aad # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aad:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aad # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aad:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aad # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aad:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aad # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aad:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aad # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aad:
+; SKX: # %bb.0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aad # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aad:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aad # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aad:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aad # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aad", "=r,r"(i16 %a0) nounwind
+ ret i8 %1
+}
+
+define i16 @test_aam(i8 %a0) optsize {
+; GENERIC-LABEL: test_aam:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aam
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aam:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aam # sched: [21:10.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aam:
+; SLM: # %bb.0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aam # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aam:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aam # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aam:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aam # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aam:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aam # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aam:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aam # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aam:
+; SKX: # %bb.0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aam # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aam:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aam # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aam:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aam # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i16 asm "aam", "=r,r"(i8 %a0) nounwind
+ ret i16 %1
+}
+
+define i8 @test_aas(i8 %a0) optsize {
+; GENERIC-LABEL: test_aas:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: aas
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_aas:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: aas # sched: [13:6.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_aas:
+; SLM: # %bb.0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: aas # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_aas:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: aas # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_aas:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: aas # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_aas:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: aas # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_aas:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: aas # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_aas:
+; SKX: # %bb.0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: aas # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_aas:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: aas # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_aas:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: aas # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "aas", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define void @test_arpl(i16 %a0, i16 *%a1) optsize {
+; GENERIC-LABEL: test_arpl:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: arpl %ax, (%ecx)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_arpl:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: arpl %ax, (%ecx) # sched: [23:11.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_arpl:
+; SLM: # %bb.0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: arpl %ax, (%ecx) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_arpl:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: arpl %ax, (%ecx) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_arpl:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: arpl %ax, (%ecx) # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_arpl:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: arpl %ax, (%ecx) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_arpl:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: arpl %ax, (%ecx) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_arpl:
+; SKX: # %bb.0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: arpl %ax, (%ecx) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_arpl:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: arpl %ax, (%ecx) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_arpl:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: arpl %ax, (%ecx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "arpl $0, $1", "r,*m"(i16 %a0, i16 *%a1)
+ ret void
+}
+
+define void @test_bound(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3) optsize {
+; GENERIC-LABEL: test_bound:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pushl %esi
+; GENERIC-NEXT: .cfi_def_cfa_offset 8
+; GENERIC-NEXT: .cfi_offset %esi, -8
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %esi
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bound (%esi), %ax
+; GENERIC-NEXT: bound (%edx), %ecx
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: popl %esi
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_bound:
+; ATOM: # %bb.0:
+; ATOM-NEXT: pushl %esi # sched: [1:1.00]
+; ATOM-NEXT: .cfi_def_cfa_offset 8
+; ATOM-NEXT: .cfi_offset %esi, -8
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bound (%esi), %ax # sched: [11:5.50]
+; ATOM-NEXT: bound (%edx), %ecx # sched: [11:5.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: popl %esi # sched: [1:1.00]
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_bound:
+; SLM: # %bb.0:
+; SLM-NEXT: pushl %esi # sched: [1:1.00]
+; SLM-NEXT: .cfi_def_cfa_offset 8
+; SLM-NEXT: .cfi_offset %esi, -8
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: bound (%esi), %ax # sched: [100:1.00]
+; SLM-NEXT: bound (%edx), %ecx # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: popl %esi # sched: [3:1.00]
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bound:
+; SANDY: # %bb.0:
+; SANDY-NEXT: pushl %esi # sched: [5:1.00]
+; SANDY-NEXT: .cfi_def_cfa_offset 8
+; SANDY-NEXT: .cfi_offset %esi, -8
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bound (%esi), %ax # sched: [100:0.33]
+; SANDY-NEXT: bound (%edx), %ecx # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: popl %esi # sched: [6:0.50]
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_bound:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: pushl %esi # sched: [2:1.00]
+; HASWELL-NEXT: .cfi_def_cfa_offset 8
+; HASWELL-NEXT: .cfi_offset %esi, -8
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bound (%esi), %ax # sched: [1:?]
+; HASWELL-NEXT: bound (%edx), %ecx # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: popl %esi # sched: [6:0.50]
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bound:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: pushl %esi # sched: [2:1.00]
+; BROADWELL-NEXT: .cfi_def_cfa_offset 8
+; BROADWELL-NEXT: .cfi_offset %esi, -8
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bound (%esi), %ax # sched: [100:0.25]
+; BROADWELL-NEXT: bound (%edx), %ecx # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: popl %esi # sched: [6:0.50]
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_bound:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: pushl %esi # sched: [2:1.00]
+; SKYLAKE-NEXT: .cfi_def_cfa_offset 8
+; SKYLAKE-NEXT: .cfi_offset %esi, -8
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bound (%esi), %ax # sched: [100:0.25]
+; SKYLAKE-NEXT: bound (%edx), %ecx # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: popl %esi # sched: [6:0.50]
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_bound:
+; SKX: # %bb.0:
+; SKX-NEXT: pushl %esi # sched: [2:1.00]
+; SKX-NEXT: .cfi_def_cfa_offset 8
+; SKX-NEXT: .cfi_offset %esi, -8
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: bound (%esi), %ax # sched: [100:0.25]
+; SKX-NEXT: bound (%edx), %ecx # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: popl %esi # sched: [6:0.50]
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_bound:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: pushl %esi # sched: [1:1.00]
+; BTVER2-NEXT: .cfi_def_cfa_offset 8
+; BTVER2-NEXT: .cfi_offset %esi, -8
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bound (%esi), %ax # sched: [100:0.17]
+; BTVER2-NEXT: bound (%edx), %ecx # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: popl %esi # sched: [5:1.00]
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bound:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: pushl %esi # sched: [1:0.50]
+; ZNVER1-NEXT: .cfi_def_cfa_offset 8
+; ZNVER1-NEXT: .cfi_offset %esi, -8
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %esi # sched: [8:0.50]
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bound (%esi), %ax # sched: [100:?]
+; ZNVER1-NEXT: bound (%edx), %ecx # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: popl %esi # sched: [8:0.50]
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "bound $1, $0 \0A\09 bound $3, $2", "r,*m,r,*m"(i16 %a0, i16 *%a1, i32 %a2, i32 *%a3)
+ ret void
+}
+
+define i8 @test_daa(i8 %a0) optsize {
+; GENERIC-LABEL: test_daa:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: daa
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_daa:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: daa # sched: [18:9.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_daa:
+; SLM: # %bb.0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: daa # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_daa:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: daa # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_daa:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: daa # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_daa:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: daa # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_daa:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: daa # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_daa:
+; SKX: # %bb.0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: daa # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_daa:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: daa # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_daa:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: daa # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "daa", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define i8 @test_das(i8 %a0) optsize {
+; GENERIC-LABEL: test_das:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movb {{[0-9]+}}(%esp), %al
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: das
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_das:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: das # sched: [20:10.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_das:
+; SLM: # %bb.0:
+; SLM-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: das # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_das:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: das # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_das:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: das # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_das:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: das # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_das:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: das # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_das:
+; SKX: # %bb.0:
+; SKX-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: das # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_das:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: das # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_das:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movb {{[0-9]+}}(%esp), %al # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: das # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "das", "=r,r"(i8 %a0) nounwind
+ ret i8 %1
+}
+
+define void @test_dec16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_dec16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: decw %ax
+; GENERIC-NEXT: decw (%ecx)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_dec16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: decw %ax # sched: [1:0.50]
+; ATOM-NEXT: decw (%ecx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_dec16:
+; SLM: # %bb.0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: decw %ax # sched: [1:0.50]
+; SLM-NEXT: decw (%ecx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dec16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: decw %ax # sched: [1:0.33]
+; SANDY-NEXT: decw (%ecx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_dec16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: decw %ax # sched: [1:0.25]
+; HASWELL-NEXT: decw (%ecx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dec16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: decw %ax # sched: [1:0.25]
+; BROADWELL-NEXT: decw (%ecx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_dec16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: decw %ax # sched: [1:0.25]
+; SKYLAKE-NEXT: decw (%ecx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_dec16:
+; SKX: # %bb.0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: decw %ax # sched: [1:0.25]
+; SKX-NEXT: decw (%ecx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_dec16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: decw %ax # sched: [1:0.50]
+; BTVER2-NEXT: decw (%ecx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dec16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: decw %ax # sched: [1:0.25]
+; ZNVER1-NEXT: decw (%ecx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm "decw $0 \0A\09 decw $1", "r,*m"(i16 %a0, i16* %a1) nounwind
+ ret void
+}
+define void @test_dec32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_dec32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: decl %eax
+; GENERIC-NEXT: decl (%ecx)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_dec32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: decl %eax # sched: [1:0.50]
+; ATOM-NEXT: decl (%ecx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_dec32:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: decl %eax # sched: [1:0.50]
+; SLM-NEXT: decl (%ecx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dec32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: decl %eax # sched: [1:0.33]
+; SANDY-NEXT: decl (%ecx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_dec32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: decl %eax # sched: [1:0.25]
+; HASWELL-NEXT: decl (%ecx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dec32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: decl %eax # sched: [1:0.25]
+; BROADWELL-NEXT: decl (%ecx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_dec32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: decl %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: decl (%ecx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_dec32:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: decl %eax # sched: [1:0.25]
+; SKX-NEXT: decl (%ecx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_dec32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: decl %eax # sched: [1:0.50]
+; BTVER2-NEXT: decl (%ecx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dec32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: decl %eax # sched: [1:0.25]
+; ZNVER1-NEXT: decl (%ecx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm "decl $0 \0A\09 decl $1", "r,*m"(i32 %a0, i32* %a1) nounwind
+ ret void
+}
+
+define void @test_inc16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_inc16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: incw %ax
+; GENERIC-NEXT: incw (%ecx)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_inc16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: incw %ax # sched: [1:0.50]
+; ATOM-NEXT: incw (%ecx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_inc16:
+; SLM: # %bb.0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: incw %ax # sched: [1:0.50]
+; SLM-NEXT: incw (%ecx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_inc16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: incw %ax # sched: [1:0.33]
+; SANDY-NEXT: incw (%ecx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_inc16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: incw %ax # sched: [1:0.25]
+; HASWELL-NEXT: incw (%ecx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inc16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: incw %ax # sched: [1:0.25]
+; BROADWELL-NEXT: incw (%ecx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_inc16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: incw %ax # sched: [1:0.25]
+; SKYLAKE-NEXT: incw (%ecx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_inc16:
+; SKX: # %bb.0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: incw %ax # sched: [1:0.25]
+; SKX-NEXT: incw (%ecx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_inc16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: incw %ax # sched: [1:0.50]
+; BTVER2-NEXT: incw (%ecx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_inc16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: incw %ax # sched: [1:0.25]
+; ZNVER1-NEXT: incw (%ecx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm "incw $0 \0A\09 incw $1", "r,*m"(i16 %a0, i16* %a1) nounwind
+ ret void
+}
+define void @test_inc32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_inc32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: incl %eax
+; GENERIC-NEXT: incl (%ecx)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_inc32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: incl %eax # sched: [1:0.50]
+; ATOM-NEXT: incl (%ecx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_inc32:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: incl %eax # sched: [1:0.50]
+; SLM-NEXT: incl (%ecx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_inc32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: incl %eax # sched: [1:0.33]
+; SANDY-NEXT: incl (%ecx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_inc32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: incl %eax # sched: [1:0.25]
+; HASWELL-NEXT: incl (%ecx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inc32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: incl %eax # sched: [1:0.25]
+; BROADWELL-NEXT: incl (%ecx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_inc32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: incl %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: incl (%ecx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_inc32:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: incl %eax # sched: [1:0.25]
+; SKX-NEXT: incl (%ecx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_inc32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: incl %eax # sched: [1:0.50]
+; BTVER2-NEXT: incl (%ecx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_inc32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: incl %eax # sched: [1:0.25]
+; ZNVER1-NEXT: incl (%ecx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm "incl $0 \0A\09 incl $1", "r,*m"(i32 %a0, i32* %a1) nounwind
+ ret void
+}
+
+define void @test_into() optsize {
+; GENERIC-LABEL: test_into:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: into
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_into:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: into
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_into:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: into # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_into:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: into # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_into:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: into # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_into:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: into # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_into:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: into # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_into:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: into # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_into:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: into # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_into:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: into # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "into", ""()
+ ret void
+}
+
+define void @test_jcxz_jecxz() optsize {
+; GENERIC-LABEL: test_jcxz_jecxz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: JXTGT:
+; GENERIC-NEXT: jcxz JXTGT
+; GENERIC-NEXT: jecxz JXTGT
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_jcxz_jecxz:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: JXTGT:
+; ATOM-NEXT: jcxz JXTGT # sched: [4:2.00]
+; ATOM-NEXT: jecxz JXTGT # sched: [4:2.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_jcxz_jecxz:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: JXTGT:
+; SLM-NEXT: jcxz JXTGT # sched: [1:1.00]
+; SLM-NEXT: jecxz JXTGT # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_jcxz_jecxz:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: JXTGT:
+; SANDY-NEXT: jcxz JXTGT # sched: [1:1.00]
+; SANDY-NEXT: jecxz JXTGT # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_jcxz_jecxz:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: JXTGT:
+; HASWELL-NEXT: jcxz JXTGT # sched: [1:0.50]
+; HASWELL-NEXT: jecxz JXTGT # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_jcxz_jecxz:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: JXTGT:
+; BROADWELL-NEXT: jcxz JXTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jecxz JXTGT # sched: [1:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_jcxz_jecxz:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: JXTGT:
+; SKYLAKE-NEXT: jcxz JXTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jecxz JXTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_jcxz_jecxz:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: JXTGT:
+; SKX-NEXT: jcxz JXTGT # sched: [1:0.50]
+; SKX-NEXT: jecxz JXTGT # sched: [1:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_jcxz_jecxz:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: JXTGT:
+; BTVER2-NEXT: jcxz JXTGT # sched: [1:0.50]
+; BTVER2-NEXT: jecxz JXTGT # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_jcxz_jecxz:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: JXTGT:
+; ZNVER1-NEXT: jcxz JXTGT # sched: [1:0.50]
+; ZNVER1-NEXT: jecxz JXTGT # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "JXTGT: \0A\09 jcxz JXTGT \0A\09 jecxz JXTGT", ""()
+ ret void
+}
+
+define void @test_leave() optsize {
+; GENERIC-LABEL: test_leave:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: leave
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_leave:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: leave # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_leave:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: leave # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_leave:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: leave # sched: [1:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_leave:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: leave # sched: [1:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_leave:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: leave # sched: [1:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_leave:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: leave # sched: [1:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_leave:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: leave # sched: [1:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_leave:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: leave # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_leave:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: leave # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm "leave", ""() nounwind
+ ret void
+}
+
+define void @test_pop_push() optsize {
+; GENERIC-LABEL: test_pop_push:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popl %ds
+; GENERIC-NEXT: popl %es
+; GENERIC-NEXT: popl %ss
+; GENERIC-NEXT: popl %fs
+; GENERIC-NEXT: popl %gs
+; GENERIC-NEXT: pushl %cs
+; GENERIC-NEXT: pushl %ds
+; GENERIC-NEXT: pushl %es
+; GENERIC-NEXT: pushl %ss
+; GENERIC-NEXT: pushl %fs
+; GENERIC-NEXT: pushl %gs
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_pop_push:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popl %ds # sched: [29:14.50]
+; ATOM-NEXT: popl %es # sched: [29:14.50]
+; ATOM-NEXT: popl %ss # sched: [48:24.00]
+; ATOM-NEXT: popl %fs # sched: [29:14.50]
+; ATOM-NEXT: popl %gs # sched: [29:14.50]
+; ATOM-NEXT: pushl %cs # sched: [2:1.00]
+; ATOM-NEXT: pushl %ds # sched: [2:1.00]
+; ATOM-NEXT: pushl %es # sched: [2:1.00]
+; ATOM-NEXT: pushl %ss # sched: [2:1.00]
+; ATOM-NEXT: pushl %fs # sched: [2:1.00]
+; ATOM-NEXT: pushl %gs # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_pop_push:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: popl %ds # sched: [100:1.00]
+; SLM-NEXT: popl %es # sched: [100:1.00]
+; SLM-NEXT: popl %ss # sched: [100:1.00]
+; SLM-NEXT: popl %fs # sched: [100:1.00]
+; SLM-NEXT: popl %gs # sched: [100:1.00]
+; SLM-NEXT: pushl %cs # sched: [100:1.00]
+; SLM-NEXT: pushl %ds # sched: [100:1.00]
+; SLM-NEXT: pushl %es # sched: [100:1.00]
+; SLM-NEXT: pushl %ss # sched: [100:1.00]
+; SLM-NEXT: pushl %fs # sched: [100:1.00]
+; SLM-NEXT: pushl %gs # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pop_push:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popl %ds # sched: [100:0.33]
+; SANDY-NEXT: popl %es # sched: [100:0.33]
+; SANDY-NEXT: popl %ss # sched: [100:0.33]
+; SANDY-NEXT: popl %fs # sched: [100:0.33]
+; SANDY-NEXT: popl %gs # sched: [100:0.33]
+; SANDY-NEXT: pushl %cs # sched: [100:0.33]
+; SANDY-NEXT: pushl %ds # sched: [100:0.33]
+; SANDY-NEXT: pushl %es # sched: [100:0.33]
+; SANDY-NEXT: pushl %ss # sched: [100:0.33]
+; SANDY-NEXT: pushl %fs # sched: [100:0.33]
+; SANDY-NEXT: pushl %gs # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pop_push:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popl %ds # sched: [100:0.25]
+; HASWELL-NEXT: popl %es # sched: [100:0.25]
+; HASWELL-NEXT: popl %ss # sched: [100:0.25]
+; HASWELL-NEXT: popl %fs # sched: [100:0.25]
+; HASWELL-NEXT: popl %gs # sched: [100:0.25]
+; HASWELL-NEXT: pushl %cs # sched: [100:0.25]
+; HASWELL-NEXT: pushl %ds # sched: [100:0.25]
+; HASWELL-NEXT: pushl %es # sched: [100:0.25]
+; HASWELL-NEXT: pushl %ss # sched: [100:0.25]
+; HASWELL-NEXT: pushl %fs # sched: [100:0.25]
+; HASWELL-NEXT: pushl %gs # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pop_push:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popl %ds # sched: [100:0.25]
+; BROADWELL-NEXT: popl %es # sched: [100:0.25]
+; BROADWELL-NEXT: popl %ss # sched: [100:0.25]
+; BROADWELL-NEXT: popl %fs # sched: [100:0.25]
+; BROADWELL-NEXT: popl %gs # sched: [100:0.25]
+; BROADWELL-NEXT: pushl %cs # sched: [100:0.25]
+; BROADWELL-NEXT: pushl %ds # sched: [100:0.25]
+; BROADWELL-NEXT: pushl %es # sched: [100:0.25]
+; BROADWELL-NEXT: pushl %ss # sched: [100:0.25]
+; BROADWELL-NEXT: pushl %fs # sched: [100:0.25]
+; BROADWELL-NEXT: pushl %gs # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_pop_push:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popl %ds # sched: [100:0.25]
+; SKYLAKE-NEXT: popl %es # sched: [100:0.25]
+; SKYLAKE-NEXT: popl %ss # sched: [100:0.25]
+; SKYLAKE-NEXT: popl %fs # sched: [100:0.25]
+; SKYLAKE-NEXT: popl %gs # sched: [100:0.25]
+; SKYLAKE-NEXT: pushl %cs # sched: [100:0.25]
+; SKYLAKE-NEXT: pushl %ds # sched: [100:0.25]
+; SKYLAKE-NEXT: pushl %es # sched: [100:0.25]
+; SKYLAKE-NEXT: pushl %ss # sched: [100:0.25]
+; SKYLAKE-NEXT: pushl %fs # sched: [100:0.25]
+; SKYLAKE-NEXT: pushl %gs # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_pop_push:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: popl %ds # sched: [100:0.25]
+; SKX-NEXT: popl %es # sched: [100:0.25]
+; SKX-NEXT: popl %ss # sched: [100:0.25]
+; SKX-NEXT: popl %fs # sched: [100:0.25]
+; SKX-NEXT: popl %gs # sched: [100:0.25]
+; SKX-NEXT: pushl %cs # sched: [100:0.25]
+; SKX-NEXT: pushl %ds # sched: [100:0.25]
+; SKX-NEXT: pushl %es # sched: [100:0.25]
+; SKX-NEXT: pushl %ss # sched: [100:0.25]
+; SKX-NEXT: pushl %fs # sched: [100:0.25]
+; SKX-NEXT: pushl %gs # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_pop_push:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popl %ds # sched: [100:0.17]
+; BTVER2-NEXT: popl %es # sched: [100:0.17]
+; BTVER2-NEXT: popl %ss # sched: [100:0.17]
+; BTVER2-NEXT: popl %fs # sched: [100:0.17]
+; BTVER2-NEXT: popl %gs # sched: [100:0.17]
+; BTVER2-NEXT: pushl %cs # sched: [100:0.17]
+; BTVER2-NEXT: pushl %ds # sched: [100:0.17]
+; BTVER2-NEXT: pushl %es # sched: [100:0.17]
+; BTVER2-NEXT: pushl %ss # sched: [100:0.17]
+; BTVER2-NEXT: pushl %fs # sched: [100:0.17]
+; BTVER2-NEXT: pushl %gs # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pop_push:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popl %ds # sched: [100:?]
+; ZNVER1-NEXT: popl %es # sched: [100:?]
+; ZNVER1-NEXT: popl %ss # sched: [100:?]
+; ZNVER1-NEXT: popl %fs # sched: [100:?]
+; ZNVER1-NEXT: popl %gs # sched: [100:?]
+; ZNVER1-NEXT: pushl %cs # sched: [100:?]
+; ZNVER1-NEXT: pushl %ds # sched: [100:?]
+; ZNVER1-NEXT: pushl %es # sched: [100:?]
+; ZNVER1-NEXT: pushl %ss # sched: [100:?]
+; ZNVER1-NEXT: pushl %fs # sched: [100:?]
+; ZNVER1-NEXT: pushl %gs # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "pop %DS \0A\09 pop %ES \0A\09 pop %SS \0A\09 pop %FS \0A\09 pop %GS \0A\09 push %CS \0A\09 push %DS \0A\09 push %ES \0A\09 push %SS \0A\09 push %FS \0A\09 push %GS", ""()
+ ret void
+}
+define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
+; GENERIC-LABEL: test_pop_push_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popw %ax
+; GENERIC-NEXT: popw (%ecx)
+; GENERIC-NEXT: pushw %ax
+; GENERIC-NEXT: pushw (%ecx)
+; GENERIC-NEXT: pushw $4095 # imm = 0xFFF
+; GENERIC-NEXT: pushw $7
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_pop_push_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popw %ax # sched: [2:1.00]
+; ATOM-NEXT: popw (%ecx) # sched: [3:1.50]
+; ATOM-NEXT: pushw %ax # sched: [1:1.00]
+; ATOM-NEXT: pushw (%ecx) # sched: [2:1.00]
+; ATOM-NEXT: pushw $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: pushw $7 # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_pop_push_16:
+; SLM: # %bb.0:
+; SLM-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: popw %ax # sched: [3:1.00]
+; SLM-NEXT: popw (%ecx) # sched: [1:1.00]
+; SLM-NEXT: pushw %ax # sched: [1:1.00]
+; SLM-NEXT: pushw (%ecx) # sched: [1:1.00]
+; SLM-NEXT: pushw $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [1:1.00]
+; SLM-NEXT: pushw $7 # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pop_push_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popw %ax # sched: [6:0.50]
+; SANDY-NEXT: popw (%ecx) # sched: [6:0.50]
+; SANDY-NEXT: pushw %ax # sched: [5:1.00]
+; SANDY-NEXT: pushw (%ecx) # sched: [5:1.00]
+; SANDY-NEXT: pushw $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [1:1.00]
+; SANDY-NEXT: pushw $7 # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pop_push_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popw %ax # sched: [6:0.50]
+; HASWELL-NEXT: popw (%ecx) # sched: [1:1.00]
+; HASWELL-NEXT: pushw %ax # sched: [2:1.00]
+; HASWELL-NEXT: pushw (%ecx) # sched: [1:1.00]
+; HASWELL-NEXT: pushw $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:1.00]
+; HASWELL-NEXT: pushw $7 # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pop_push_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popw %ax # sched: [6:0.50]
+; BROADWELL-NEXT: popw (%ecx) # sched: [6:0.50]
+; BROADWELL-NEXT: pushw %ax # sched: [2:1.00]
+; BROADWELL-NEXT: pushw (%ecx) # sched: [2:1.00]
+; BROADWELL-NEXT: pushw $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [1:1.00]
+; BROADWELL-NEXT: pushw $7 # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_pop_push_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popw %ax # sched: [6:0.50]
+; SKYLAKE-NEXT: popw (%ecx) # sched: [6:0.50]
+; SKYLAKE-NEXT: pushw %ax # sched: [2:1.00]
+; SKYLAKE-NEXT: pushw (%ecx) # sched: [2:1.00]
+; SKYLAKE-NEXT: pushw $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [1:1.00]
+; SKYLAKE-NEXT: pushw $7 # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_pop_push_16:
+; SKX: # %bb.0:
+; SKX-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: popw %ax # sched: [6:0.50]
+; SKX-NEXT: popw (%ecx) # sched: [6:0.50]
+; SKX-NEXT: pushw %ax # sched: [2:1.00]
+; SKX-NEXT: pushw (%ecx) # sched: [2:1.00]
+; SKX-NEXT: pushw $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [1:1.00]
+; SKX-NEXT: pushw $7 # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_pop_push_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [4:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popw %ax # sched: [5:1.00]
+; BTVER2-NEXT: popw (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: pushw %ax # sched: [1:1.00]
+; BTVER2-NEXT: pushw (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: pushw $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [1:1.00]
+; BTVER2-NEXT: pushw $7 # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pop_push_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movzwl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popw %ax # sched: [8:0.50]
+; ZNVER1-NEXT: popw (%ecx) # sched: [5:0.50]
+; ZNVER1-NEXT: pushw %ax # sched: [1:0.50]
+; ZNVER1-NEXT: pushw (%ecx) # sched: [4:0.50]
+; ZNVER1-NEXT: pushw $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [1:0.50]
+; ZNVER1-NEXT: pushw $7 # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = call i16 asm sideeffect "popw $0 \0A\09 popw $2 \0A\09 pushw $1 \0A\09 pushw $2 \0A\09 pushw $3 \0A\09 pushw $4", "=r,r,*m,i,i"(i16 %a0, i16 *%a1, i16 4095, i8 7)
+ ret i16 %1
+}
+define i32 @test_pop_push_32(i32 %a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_pop_push_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popl %eax
+; GENERIC-NEXT: popl (%ecx)
+; GENERIC-NEXT: pushl %eax
+; GENERIC-NEXT: pushl (%ecx)
+; GENERIC-NEXT: pushl $4095 # imm = 0xFFF
+; GENERIC-NEXT: pushl $7
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_pop_push_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popl %eax # sched: [1:1.00]
+; ATOM-NEXT: popl (%ecx) # sched: [3:1.50]
+; ATOM-NEXT: pushl %eax # sched: [1:1.00]
+; ATOM-NEXT: pushl (%ecx) # sched: [2:1.00]
+; ATOM-NEXT: pushl $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: pushl $7 # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_pop_push_32:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: popl %eax # sched: [3:1.00]
+; SLM-NEXT: popl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: pushl %eax # sched: [1:1.00]
+; SLM-NEXT: pushl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: pushl $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [1:1.00]
+; SLM-NEXT: pushl $7 # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pop_push_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popl %eax # sched: [6:0.50]
+; SANDY-NEXT: popl (%ecx) # sched: [6:0.50]
+; SANDY-NEXT: pushl %eax # sched: [5:1.00]
+; SANDY-NEXT: pushl (%ecx) # sched: [5:1.00]
+; SANDY-NEXT: pushl $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [1:1.00]
+; SANDY-NEXT: pushl $7 # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_pop_push_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popl %eax # sched: [6:0.50]
+; HASWELL-NEXT: popl (%ecx) # sched: [1:1.00]
+; HASWELL-NEXT: pushl %eax # sched: [2:1.00]
+; HASWELL-NEXT: pushl (%ecx) # sched: [1:1.00]
+; HASWELL-NEXT: pushl $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:1.00]
+; HASWELL-NEXT: pushl $7 # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pop_push_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popl %eax # sched: [6:0.50]
+; BROADWELL-NEXT: popl (%ecx) # sched: [6:0.50]
+; BROADWELL-NEXT: pushl %eax # sched: [2:1.00]
+; BROADWELL-NEXT: pushl (%ecx) # sched: [2:1.00]
+; BROADWELL-NEXT: pushl $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [1:1.00]
+; BROADWELL-NEXT: pushl $7 # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_pop_push_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popl %eax # sched: [6:0.50]
+; SKYLAKE-NEXT: popl (%ecx) # sched: [6:0.50]
+; SKYLAKE-NEXT: pushl %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: pushl (%ecx) # sched: [2:1.00]
+; SKYLAKE-NEXT: pushl $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [1:1.00]
+; SKYLAKE-NEXT: pushl $7 # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_pop_push_32:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: popl %eax # sched: [6:0.50]
+; SKX-NEXT: popl (%ecx) # sched: [6:0.50]
+; SKX-NEXT: pushl %eax # sched: [2:1.00]
+; SKX-NEXT: pushl (%ecx) # sched: [2:1.00]
+; SKX-NEXT: pushl $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [1:1.00]
+; SKX-NEXT: pushl $7 # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_pop_push_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popl %eax # sched: [5:1.00]
+; BTVER2-NEXT: popl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: pushl %eax # sched: [1:1.00]
+; BTVER2-NEXT: pushl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: pushl $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [1:1.00]
+; BTVER2-NEXT: pushl $7 # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pop_push_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popl %eax # sched: [8:0.50]
+; ZNVER1-NEXT: popl (%ecx) # sched: [1:0.50]
+; ZNVER1-NEXT: pushl %eax # sched: [1:0.50]
+; ZNVER1-NEXT: pushl (%ecx) # sched: [4:0.50]
+; ZNVER1-NEXT: pushl $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [1:0.50]
+; ZNVER1-NEXT: pushl $7 # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = call i32 asm sideeffect "popl $0 \0A\09 popl $2 \0A\09 pushl $1 \0A\09 pushl $2 \0A\09 pushl $3 \0A\09 pushl $4", "=r,r,*m,i,i"(i32 %a0, i32 *%a1, i32 4095, i8 7)
+ ret i32 %1
+}
+
+define void @test_popa_popf_pusha_pushf() optsize {
+; GENERIC-LABEL: test_popa_popf_pusha_pushf:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popal
+; GENERIC-NEXT: popfl
+; GENERIC-NEXT: pushal
+; GENERIC-NEXT: pushfl
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_popa_popf_pusha_pushf:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popal # sched: [9:4.50]
+; ATOM-NEXT: popfl # sched: [26:13.00]
+; ATOM-NEXT: pushal # sched: [8:4.00]
+; ATOM-NEXT: pushfl # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_popa_popf_pusha_pushf:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: popal # sched: [3:1.00]
+; SLM-NEXT: popfl # sched: [3:1.00]
+; SLM-NEXT: pushal # sched: [1:1.00]
+; SLM-NEXT: pushfl # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_popa_popf_pusha_pushf:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popal # sched: [4:0.50]
+; SANDY-NEXT: popfl # sched: [4:0.50]
+; SANDY-NEXT: pushal # sched: [1:1.00]
+; SANDY-NEXT: pushfl # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_popa_popf_pusha_pushf:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popal # sched: [1:?]
+; HASWELL-NEXT: popfl # sched: [5:0.50]
+; HASWELL-NEXT: pushal # sched: [1:?]
+; HASWELL-NEXT: pushfl # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_popa_popf_pusha_pushf:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popal # sched: [5:0.50]
+; BROADWELL-NEXT: popfl # sched: [5:0.50]
+; BROADWELL-NEXT: pushal # sched: [1:1.00]
+; BROADWELL-NEXT: pushfl # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_popa_popf_pusha_pushf:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popal # sched: [5:0.50]
+; SKYLAKE-NEXT: popfl # sched: [5:0.50]
+; SKYLAKE-NEXT: pushal # sched: [1:1.00]
+; SKYLAKE-NEXT: pushfl # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_popa_popf_pusha_pushf:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: popal # sched: [5:0.50]
+; SKX-NEXT: popfl # sched: [5:0.50]
+; SKX-NEXT: pushal # sched: [1:1.00]
+; SKX-NEXT: pushfl # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_popa_popf_pusha_pushf:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popal # sched: [5:1.00]
+; BTVER2-NEXT: popfl # sched: [5:1.00]
+; BTVER2-NEXT: pushal # sched: [1:1.00]
+; BTVER2-NEXT: pushfl # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_popa_popf_pusha_pushf:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popal # sched: [100:?]
+; ZNVER1-NEXT: popfl # sched: [100:?]
+; ZNVER1-NEXT: pushal # sched: [8:0.50]
+; ZNVER1-NEXT: pushfl # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "popa \0A\09 popf \0A\09 pusha \0A\09 pushf", ""()
+ ret void
+}
+
+define void @test_ret() optsize {
+; GENERIC-LABEL: test_ret:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: retl
+; GENERIC-NEXT: retl $4095 # imm = 0xFFF
+; GENERIC-NEXT: lretl
+; GENERIC-NEXT: lretl $4095 # imm = 0xFFF
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_ret:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+; ATOM-NEXT: retl $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: lretl # sched: [79:39.50]
+; ATOM-NEXT: lretl $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [79:39.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_ret:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: retl # sched: [4:1.00]
+; SLM-NEXT: retl $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [4:1.00]
+; SLM-NEXT: lretl # sched: [4:1.00]
+; SLM-NEXT: lretl $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ret:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+; SANDY-NEXT: retl $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: lretl # sched: [5:1.00]
+; SANDY-NEXT: lretl $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_ret:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+; HASWELL-NEXT: retl $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:2.00]
+; HASWELL-NEXT: lretl # sched: [6:0.50]
+; HASWELL-NEXT: lretl $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ret:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+; BROADWELL-NEXT: retl $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: lretl # sched: [6:0.50]
+; BROADWELL-NEXT: lretl $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_ret:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+; SKYLAKE-NEXT: retl $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: lretl # sched: [6:0.50]
+; SKYLAKE-NEXT: lretl $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_ret:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: retl # sched: [6:0.50]
+; SKX-NEXT: retl $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: lretl # sched: [6:0.50]
+; SKX-NEXT: lretl $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_ret:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+; BTVER2-NEXT: retl $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: lretl # sched: [4:1.00]
+; BTVER2-NEXT: lretl $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ret:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+; ZNVER1-NEXT: retl $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: lretl # sched: [1:0.50]
+; ZNVER1-NEXT: lretl $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ call void asm sideeffect "ret \0A\09 ret $0 \0A\09 lret \0A\09 lret $0", "i"(i16 4095)
+ ret void
+}
+
+define i8 @test_salc() optsize {
+; GENERIC-LABEL: test_salc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: salc
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_salc:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: salc # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_salc:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: salc # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_salc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: salc # sched: [1:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_salc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: salc # sched: [1:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_salc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: salc # sched: [1:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_salc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: salc # sched: [1:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_salc:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: salc # sched: [1:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_salc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: salc # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_salc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: salc # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ %1 = tail call i8 asm "salc", "=r"() nounwind
+ ret i8 %1
+}
+
+define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_xchg_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xchgl %eax, %eax
+; GENERIC-NEXT: xchgl %ecx, %eax
+; GENERIC-NEXT: xchgl %eax, (%edx)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_xchg_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xchgl %eax, %eax # sched: [2:1.00]
+; ATOM-NEXT: xchgl %ecx, %eax # sched: [2:1.00]
+; ATOM-NEXT: xchgl %eax, (%edx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_xchg_32:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: xchgl %eax, %eax # sched: [1:0.50]
+; SLM-NEXT: xchgl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: xchgl %eax, (%edx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xchg_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xchgl %eax, %eax # sched: [1:0.33]
+; SANDY-NEXT: xchgl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: xchgl %eax, (%edx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_xchg_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xchgl %eax, %eax # sched: [1:0.25]
+; HASWELL-NEXT: xchgl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: xchgl %eax, (%edx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xchg_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xchgl %eax, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: xchgl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: xchgl %eax, (%edx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_xchg_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xchgl %eax, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgl %eax, (%edx) # sched: [10:1.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_xchg_32:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: xchgl %eax, %eax # sched: [1:0.25]
+; SKX-NEXT: xchgl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: xchgl %eax, (%edx) # sched: [10:1.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_xchg_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xchgl %eax, %eax # sched: [1:0.50]
+; BTVER2-NEXT: xchgl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: xchgl %eax, (%edx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xchg_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xchgl %eax, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: xchgl %ecx, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: xchgl %eax, (%edx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm "xchg %EAX, $0 \0A\09 xchg $1, $0 \0A\09 xchg $2, $0", "r,r,*m"(i32 %a0, i32 %a1, i32 *%a2) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/schedule-x86_64.ll b/test/CodeGen/X86/schedule-x86_64.ll
new file mode 100644
index 000000000000..38874dd6d1ab
--- /dev/null
+++ b/test/CodeGen/X86/schedule-x86_64.ll
@@ -0,0 +1,17197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define void @test_adc_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_adc_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: adcb $7, %al # sched: [3:1.00]
+; GENERIC-NEXT: adcb $7, %dil # sched: [2:0.67]
+; GENERIC-NEXT: adcb $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcb %dil, %dil # sched: [2:0.67]
+; GENERIC-NEXT: adcb %dil, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcb (%rsi), %dil # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_adc_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: adcb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: adcb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: adcb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: adcb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_adc_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: adcb $7, %al # sched: [1:0.50]
+; SLM-NEXT: adcb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: adcb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: adcb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_adc_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: adcb $7, %al # sched: [3:1.00]
+; SANDY-NEXT: adcb $7, %dil # sched: [2:0.67]
+; SANDY-NEXT: adcb $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcb %dil, %dil # sched: [2:0.67]
+; SANDY-NEXT: adcb %dil, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcb (%rsi), %dil # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_adc_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: adcb $7, %al # sched: [2:0.50]
+; HASWELL-NEXT: adcb $7, %dil # sched: [2:0.50]
+; HASWELL-NEXT: adcb $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcb %dil, %dil # sched: [2:0.50]
+; HASWELL-NEXT: adcb %dil, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcb (%rsi), %dil # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_adc_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: adcb $7, %al # sched: [2:0.50]
+; BROADWELL-NEXT: adcb $7, %dil # sched: [2:0.50]
+; BROADWELL-NEXT: adcb $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcb %dil, %dil # sched: [1:0.50]
+; BROADWELL-NEXT: adcb %dil, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_adc_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: adcb $7, %al # sched: [2:0.50]
+; SKYLAKE-NEXT: adcb $7, %dil # sched: [2:0.50]
+; SKYLAKE-NEXT: adcb $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcb %dil, %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: adcb %dil, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_adc_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: adcb $7, %al # sched: [2:0.50]
+; SKX-NEXT: adcb $7, %dil # sched: [2:0.50]
+; SKX-NEXT: adcb $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcb %dil, %dil # sched: [1:0.50]
+; SKX-NEXT: adcb %dil, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_adc_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: adcb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: adcb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: adcb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: adcb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_adc_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: adcb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: adcb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: adcb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: adcb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "adcb $2, %AL \0A\09 adcb $2, $0 \0A\09 adcb $2, $1 \0A\09 adcb $0, $0 \0A\09 adcb $0, $1 \0A\09 adcb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_adc_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_adc_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: adcw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: adcw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [2:0.67]
+; GENERIC-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [9:1.00]
+; GENERIC-NEXT: adcw $7, %di # sched: [2:0.67]
+; GENERIC-NEXT: adcw $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcw %di, %di # sched: [2:0.67]
+; GENERIC-NEXT: adcw %di, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcw (%rsi), %di # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_adc_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: adcw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: adcw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: adcw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: adcw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: adcw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_adc_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: adcw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: adcw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: adcw $7, %di # sched: [1:0.50]
+; SLM-NEXT: adcw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcw %di, %di # sched: [1:0.50]
+; SLM-NEXT: adcw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_adc_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: adcw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: adcw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [2:0.67]
+; SANDY-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [9:1.00]
+; SANDY-NEXT: adcw $7, %di # sched: [2:0.67]
+; SANDY-NEXT: adcw $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcw %di, %di # sched: [2:0.67]
+; SANDY-NEXT: adcw %di, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcw (%rsi), %di # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_adc_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: adcw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: adcw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [2:0.50]
+; HASWELL-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [9:1.00]
+; HASWELL-NEXT: adcw $7, %di # sched: [2:0.50]
+; HASWELL-NEXT: adcw $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcw %di, %di # sched: [2:0.50]
+; HASWELL-NEXT: adcw %di, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcw (%rsi), %di # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_adc_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: adcw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: adcw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.50]
+; BROADWELL-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [8:1.00]
+; BROADWELL-NEXT: adcw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: adcw $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcw %di, %di # sched: [1:0.50]
+; BROADWELL-NEXT: adcw %di, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_adc_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: adcw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: adcw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.50]
+; SKYLAKE-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [8:1.00]
+; SKYLAKE-NEXT: adcw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: adcw $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcw %di, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: adcw %di, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_adc_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: adcw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: adcw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.50]
+; SKX-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [8:1.00]
+; SKX-NEXT: adcw $7, %di # sched: [1:0.50]
+; SKX-NEXT: adcw $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcw %di, %di # sched: [1:0.50]
+; SKX-NEXT: adcw %di, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_adc_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: adcw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: adcw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: adcw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: adcw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: adcw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_adc_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: adcw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: adcw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: adcw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: adcw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: adcw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: adcw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "adcw $2, %AX \0A\09 adcw $2, $0 \0A\09 adcw $2, $1 \0A\09 adcw $3, $0 \0A\09 adcw $3, $1 \0A\09 adcw $0, $0 \0A\09 adcw $0, $1 \0A\09 adcw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_adc_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_adc_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [2:0.67]
+; GENERIC-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [9:1.00]
+; GENERIC-NEXT: adcl $7, %edi # sched: [2:0.67]
+; GENERIC-NEXT: adcl $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcl %edi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: adcl %edi, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcl (%rsi), %edi # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_adc_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: adcl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: adcl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: adcl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_adc_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: adcl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: adcl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: adcl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_adc_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [2:0.67]
+; SANDY-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [9:1.00]
+; SANDY-NEXT: adcl $7, %edi # sched: [2:0.67]
+; SANDY-NEXT: adcl $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcl %edi, %edi # sched: [2:0.67]
+; SANDY-NEXT: adcl %edi, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcl (%rsi), %edi # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_adc_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [2:0.50]
+; HASWELL-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [9:1.00]
+; HASWELL-NEXT: adcl $7, %edi # sched: [2:0.50]
+; HASWELL-NEXT: adcl $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcl %edi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: adcl %edi, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcl (%rsi), %edi # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_adc_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.50]
+; BROADWELL-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [8:1.00]
+; BROADWELL-NEXT: adcl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: adcl $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcl %edi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: adcl %edi, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_adc_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.50]
+; SKYLAKE-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [8:1.00]
+; SKYLAKE-NEXT: adcl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: adcl $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcl %edi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: adcl %edi, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_adc_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.50]
+; SKX-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [8:1.00]
+; SKX-NEXT: adcl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: adcl $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcl %edi, %edi # sched: [1:0.50]
+; SKX-NEXT: adcl %edi, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_adc_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: adcl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: adcl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: adcl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_adc_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: adcl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: adcl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: adcl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: adcl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: adcl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: adcl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "adcl $2, %EAX \0A\09 adcl $2, $0 \0A\09 adcl $2, $1 \0A\09 adcl $3, $0 \0A\09 adcl $3, $1 \0A\09 adcl $0, $0 \0A\09 adcl $0, $1 \0A\09 adcl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_adc_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_adc_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [2:0.67]
+; GENERIC-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [9:1.00]
+; GENERIC-NEXT: adcq $7, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: adcq $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcq %rdi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: adcq %rdi, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: adcq (%rsi), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_adc_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: adcq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: adcq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: adcq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: adcq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_adc_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: adcq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: adcq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: adcq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: adcq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_adc_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [2:0.67]
+; SANDY-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [9:1.00]
+; SANDY-NEXT: adcq $7, %rdi # sched: [2:0.67]
+; SANDY-NEXT: adcq $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcq %rdi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: adcq %rdi, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: adcq (%rsi), %rdi # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_adc_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [2:0.50]
+; HASWELL-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [9:1.00]
+; HASWELL-NEXT: adcq $7, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: adcq $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcq %rdi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: adcq %rdi, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: adcq (%rsi), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_adc_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.50]
+; BROADWELL-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [8:1.00]
+; BROADWELL-NEXT: adcq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: adcq $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcq %rdi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: adcq %rdi, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: adcq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_adc_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.50]
+; SKYLAKE-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [8:1.00]
+; SKYLAKE-NEXT: adcq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: adcq $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcq %rdi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: adcq %rdi, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: adcq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_adc_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.50]
+; SKX-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [8:1.00]
+; SKX-NEXT: adcq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: adcq $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcq %rdi, %rdi # sched: [1:0.50]
+; SKX-NEXT: adcq %rdi, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: adcq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_adc_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: adcq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: adcq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: adcq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: adcq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_adc_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: adcq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: adcq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: adcq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: adcq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: adcq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: adcq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: adcq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "adcq $2, %RAX \0A\09 adcq $2, $0 \0A\09 adcq $2, $1 \0A\09 adcq $3, $0 \0A\09 adcq $3, $1 \0A\09 adcq $0, $0 \0A\09 adcq $0, $1 \0A\09 adcq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
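+; The ADD tests below repeat the same imm/reg/mem operand permutations as the
+; ADC tests above, at 8/16/32/64-bit widths, against the same set of CPU
+; scheduling models.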
+define void @test_add_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_add_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: addb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: addb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: addb $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: addb %dil, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addb (%rsi), %dil # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_add_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: addb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: addb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: addb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: addb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_add_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: addb $7, %al # sched: [1:0.50]
+; SLM-NEXT: addb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: addb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: addb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_add_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: addb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: addb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: addb $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: addb %dil, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addb (%rsi), %dil # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_add_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: addb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: addb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: addb $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: addb %dil, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addb (%rsi), %dil # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_add_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: addb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: addb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: addb $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: addb %dil, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_add_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: addb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: addb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: addb $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: addb %dil, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_add_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: addb $7, %al # sched: [1:0.25]
+; SKX-NEXT: addb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: addb $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: addb %dil, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_add_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: addb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: addb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: addb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: addb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_add_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: addb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: addb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: addb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: addb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "addb $2, %AL \0A\09 addb $2, $0 \0A\09 addb $2, $1 \0A\09 addb $0, $0 \0A\09 addb $0, $1 \0A\09 addb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_add_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_add_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: addw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: addw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: addw $7, %di # sched: [1:0.33]
+; GENERIC-NEXT: addw $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: addw %di, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addw (%rsi), %di # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_add_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: addw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: addw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: addw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: addw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: addw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_add_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: addw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: addw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: addw $7, %di # sched: [1:0.50]
+; SLM-NEXT: addw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addw %di, %di # sched: [1:0.50]
+; SLM-NEXT: addw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_add_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: addw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: addw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: addw $7, %di # sched: [1:0.33]
+; SANDY-NEXT: addw $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: addw %di, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addw (%rsi), %di # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_add_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: addw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: addw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: addw $7, %di # sched: [1:0.25]
+; HASWELL-NEXT: addw $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: addw %di, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addw (%rsi), %di # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_add_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: addw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: addw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: addw $7, %di # sched: [1:0.25]
+; BROADWELL-NEXT: addw $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: addw %di, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_add_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: addw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: addw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: addw $7, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: addw $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: addw %di, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_add_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: addw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: addw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: addw $7, %di # sched: [1:0.25]
+; SKX-NEXT: addw $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addw %di, %di # sched: [1:0.25]
+; SKX-NEXT: addw %di, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_add_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: addw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: addw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: addw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: addw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: addw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_add_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: addw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: addw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: addw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: addw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: addw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: addw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "addw $2, %AX \0A\09 addw $2, $0 \0A\09 addw $2, $1 \0A\09 addw $3, $0 \0A\09 addw $3, $1 \0A\09 addw $0, $0 \0A\09 addw $0, $1 \0A\09 addw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_add_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_add_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: addl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: addl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: addl $7, %edi # sched: [1:0.33]
+; GENERIC-NEXT: addl $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: addl %edi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addl (%rsi), %edi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_add_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: addl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: addl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: addl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: addl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: addl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_add_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: addl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: addl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: addl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: addl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: addl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_add_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: addl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: addl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: addl $7, %edi # sched: [1:0.33]
+; SANDY-NEXT: addl $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: addl %edi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addl (%rsi), %edi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_add_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: addl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: addl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: addl $7, %edi # sched: [1:0.25]
+; HASWELL-NEXT: addl $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: addl %edi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addl (%rsi), %edi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_add_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: addl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: addl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: addl $7, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: addl $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: addl %edi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_add_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: addl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: addl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: addl $7, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: addl $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: addl %edi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_add_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: addl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: addl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: addl $7, %edi # sched: [1:0.25]
+; SKX-NEXT: addl $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: addl %edi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_add_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: addl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: addl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: addl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: addl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: addl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_add_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: addl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: addl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: addl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: addl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: addl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: addl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "addl $2, %EAX \0A\09 addl $2, $0 \0A\09 addl $2, $1 \0A\09 addl $3, $0 \0A\09 addl $3, $1 \0A\09 addl $0, $0 \0A\09 addl $0, $1 \0A\09 addl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_add_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_add_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: addq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: addq $7, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: addq $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: addq %rdi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: addq (%rsi), %rdi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_add_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: addq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: addq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: addq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: addq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: addq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_add_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: addq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: addq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: addq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: addq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: addq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_add_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: addq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: addq $7, %rdi # sched: [1:0.33]
+; SANDY-NEXT: addq $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: addq %rdi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: addq (%rsi), %rdi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_add_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: addq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: addq $7, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: addq $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: addq %rdi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: addq (%rsi), %rdi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_add_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: addq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: addq $7, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: addq $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: addq %rdi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: addq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_add_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: addq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: addq $7, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: addq $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: addq %rdi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: addq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_add_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: addq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: addq $7, %rdi # sched: [1:0.25]
+; SKX-NEXT: addq $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: addq %rdi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: addq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_add_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: addq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: addq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: addq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: addq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: addq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_add_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: addq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: addq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: addq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: addq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: addq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: addq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: addq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "addq $2, %RAX \0A\09 addq $2, $0 \0A\09 addq $2, $1 \0A\09 addq $3, $0 \0A\09 addq $3, $1 \0A\09 addq $0, $0 \0A\09 addq $0, $1 \0A\09 addq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
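+; The AND tests below use the same imm/reg/mem operand permutations and CPU
+; coverage as the ADC/ADD tests above.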
+define void @test_and_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_and_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: andb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: andb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: andb $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: andb %dil, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andb (%rsi), %dil # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_and_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: andb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: andb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: andb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: andb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_and_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: andb $7, %al # sched: [1:0.50]
+; SLM-NEXT: andb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: andb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: andb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_and_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: andb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: andb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: andb $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: andb %dil, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andb (%rsi), %dil # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_and_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: andb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: andb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: andb $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: andb %dil, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andb (%rsi), %dil # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_and_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: andb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: andb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: andb $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: andb %dil, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_and_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: andb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: andb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: andb $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: andb %dil, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_and_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: andb $7, %al # sched: [1:0.25]
+; SKX-NEXT: andb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: andb $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: andb %dil, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_and_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: andb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: andb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: andb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: andb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_and_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: andb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: andb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: andb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: andb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "andb $2, %AL \0A\09 andb $2, $0 \0A\09 andb $2, $1 \0A\09 andb $0, $0 \0A\09 andb $0, $1 \0A\09 andb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_and_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_and_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: andw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: andw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: andw $7, %di # sched: [1:0.33]
+; GENERIC-NEXT: andw $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: andw %di, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andw (%rsi), %di # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_and_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: andw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: andw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: andw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: andw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: andw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_and_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: andw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: andw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: andw $7, %di # sched: [1:0.50]
+; SLM-NEXT: andw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andw %di, %di # sched: [1:0.50]
+; SLM-NEXT: andw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_and_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: andw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: andw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: andw $7, %di # sched: [1:0.33]
+; SANDY-NEXT: andw $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: andw %di, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andw (%rsi), %di # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_and_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: andw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: andw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: andw $7, %di # sched: [1:0.25]
+; HASWELL-NEXT: andw $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: andw %di, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andw (%rsi), %di # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_and_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: andw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: andw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: andw $7, %di # sched: [1:0.25]
+; BROADWELL-NEXT: andw $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: andw %di, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_and_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: andw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: andw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: andw $7, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: andw $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: andw %di, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_and_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: andw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: andw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: andw $7, %di # sched: [1:0.25]
+; SKX-NEXT: andw $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andw %di, %di # sched: [1:0.25]
+; SKX-NEXT: andw %di, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_and_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: andw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: andw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: andw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: andw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: andw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_and_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: andw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: andw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: andw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: andw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: andw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: andw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "andw $2, %AX \0A\09 andw $2, $0 \0A\09 andw $2, $1 \0A\09 andw $3, $0 \0A\09 andw $3, $1 \0A\09 andw $0, $0 \0A\09 andw $0, $1 \0A\09 andw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_and_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_and_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: andl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: andl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: andl $7, %edi # sched: [1:0.33]
+; GENERIC-NEXT: andl $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: andl %edi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andl (%rsi), %edi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_and_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: andl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: andl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: andl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: andl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: andl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_and_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: andl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: andl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: andl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: andl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: andl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_and_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: andl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: andl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: andl $7, %edi # sched: [1:0.33]
+; SANDY-NEXT: andl $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: andl %edi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andl (%rsi), %edi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_and_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: andl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: andl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: andl $7, %edi # sched: [1:0.25]
+; HASWELL-NEXT: andl $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: andl %edi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andl (%rsi), %edi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_and_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: andl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: andl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: andl $7, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: andl $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: andl %edi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_and_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: andl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: andl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: andl $7, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: andl $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: andl %edi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_and_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: andl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: andl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: andl $7, %edi # sched: [1:0.25]
+; SKX-NEXT: andl $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: andl %edi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_and_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: andl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: andl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: andl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: andl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: andl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_and_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: andl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: andl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: andl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: andl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: andl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: andl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "andl $2, %EAX \0A\09 andl $2, $0 \0A\09 andl $2, $1 \0A\09 andl $3, $0 \0A\09 andl $3, $1 \0A\09 andl $0, $0 \0A\09 andl $0, $1 \0A\09 andl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_and_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_and_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: andq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: andq $7, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: andq $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: andq %rdi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: andq (%rsi), %rdi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_and_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: andq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: andq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: andq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: andq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: andq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_and_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: andq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: andq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: andq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: andq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: andq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_and_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: andq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: andq $7, %rdi # sched: [1:0.33]
+; SANDY-NEXT: andq $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: andq %rdi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: andq (%rsi), %rdi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_and_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: andq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: andq $7, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: andq $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: andq %rdi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: andq (%rsi), %rdi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_and_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: andq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: andq $7, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: andq $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: andq %rdi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: andq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_and_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: andq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: andq $7, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: andq $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: andq %rdi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: andq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_and_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: andq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: andq $7, %rdi # sched: [1:0.25]
+; SKX-NEXT: andq $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: andq %rdi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: andq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_and_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: andq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: andq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: andq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: andq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: andq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_and_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: andq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: andq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: andq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: andq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: andq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: andq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: andq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "andq $2, %RAX \0A\09 andq $2, $0 \0A\09 andq $2, $1 \0A\09 andq $3, $0 \0A\09 andq $3, $1 \0A\09 andq $0, $0 \0A\09 andq $0, $1 \0A\09 andq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
+define i16 @test_bsf16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsf16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfw %di, %ax # sched: [3:1.00]
+; GENERIC-NEXT: bsfw (%rsi), %cx # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfw %di, %ax # sched: [16:8.00]
+; ATOM-NEXT: bsfw (%rsi), %cx # sched: [16:8.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: def %ax killed %ax killed %eax
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfw %di, %ax # sched: [1:1.00]
+; SLM-NEXT: bsfw (%rsi), %cx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: def %ax killed %ax killed %eax
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfw %di, %ax # sched: [3:1.00]
+; SANDY-NEXT: bsfw (%rsi), %cx # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: def %ax killed %ax killed %eax
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfw %di, %ax # sched: [3:1.00]
+; HASWELL-NEXT: bsfw (%rsi), %cx # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bsf16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfw %di, %ax # sched: [3:1.00]
+; BROADWELL-NEXT: bsfw (%rsi), %cx # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfw %di, %ax # sched: [3:1.00]
+; SKYLAKE-NEXT: bsfw (%rsi), %cx # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfw %di, %ax # sched: [3:1.00]
+; SKX-NEXT: bsfw (%rsi), %cx # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfw %di, %ax # sched: [1:0.50]
+; BTVER2-NEXT: bsfw (%rsi), %cx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfw %di, %ax # sched: [3:0.25]
+; ZNVER1-NEXT: bsfw (%rsi), %cx # sched: [7:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsf32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsf32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfl %edi, %eax # sched: [3:1.00]
+; GENERIC-NEXT: bsfl (%rsi), %ecx # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfl %edi, %eax # sched: [16:8.00]
+; ATOM-NEXT: bsfl (%rsi), %ecx # sched: [16:8.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfl %edi, %eax # sched: [1:1.00]
+; SLM-NEXT: bsfl (%rsi), %ecx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfl %edi, %eax # sched: [3:1.00]
+; SANDY-NEXT: bsfl (%rsi), %ecx # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfl %edi, %eax # sched: [3:1.00]
+; HASWELL-NEXT: bsfl (%rsi), %ecx # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bsf32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfl %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: bsfl (%rsi), %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfl %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: bsfl (%rsi), %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfl %edi, %eax # sched: [3:1.00]
+; SKX-NEXT: bsfl (%rsi), %ecx # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT: bsfl (%rsi), %ecx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfl %edi, %eax # sched: [3:0.25]
+; ZNVER1-NEXT: bsfl (%rsi), %ecx # sched: [7:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsf64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsf64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsfq %rdi, %rax # sched: [3:1.00]
+; GENERIC-NEXT: bsfq (%rsi), %rcx # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsf64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsfq %rdi, %rax # sched: [16:8.00]
+; ATOM-NEXT: bsfq (%rsi), %rcx # sched: [16:8.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsf64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsfq %rdi, %rax # sched: [1:1.00]
+; SLM-NEXT: bsfq (%rsi), %rcx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsf64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsfq %rdi, %rax # sched: [3:1.00]
+; SANDY-NEXT: bsfq (%rsi), %rcx # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsf64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsfq %rdi, %rax # sched: [3:1.00]
+; HASWELL-NEXT: bsfq (%rsi), %rcx # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bsf64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsfq %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: bsfq (%rsi), %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsf64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsfq %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: bsfq (%rsi), %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsf64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsfq %rdi, %rax # sched: [3:1.00]
+; SKX-NEXT: bsfq (%rsi), %rcx # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsf64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsfq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT: bsfq (%rsi), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsf64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsfq %rdi, %rax # sched: [3:0.25]
+; ZNVER1-NEXT: bsfq (%rsi), %rcx # sched: [7:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsf $2, $0 \0A\09 bsf $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i16 @test_bsr16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_bsr16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrw %di, %ax # sched: [3:1.00]
+; GENERIC-NEXT: bsrw (%rsi), %cx # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrw %di, %ax # sched: [16:8.00]
+; ATOM-NEXT: bsrw (%rsi), %cx # sched: [16:8.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: # kill: def %ax killed %ax killed %eax
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrw %di, %ax # sched: [1:1.00]
+; SLM-NEXT: bsrw (%rsi), %cx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: # kill: def %ax killed %ax killed %eax
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrw %di, %ax # sched: [3:1.00]
+; SANDY-NEXT: bsrw (%rsi), %cx # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: # kill: def %ax killed %ax killed %eax
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrw %di, %ax # sched: [3:1.00]
+; HASWELL-NEXT: bsrw (%rsi), %cx # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bsr16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrw %di, %ax # sched: [3:1.00]
+; BROADWELL-NEXT: bsrw (%rsi), %cx # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrw %di, %ax # sched: [3:1.00]
+; SKYLAKE-NEXT: bsrw (%rsi), %cx # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrw %di, %ax # sched: [3:1.00]
+; SKX-NEXT: bsrw (%rsi), %cx # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrw %di, %ax # sched: [1:0.50]
+; BTVER2-NEXT: bsrw (%rsi), %cx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrw %di, %ax # sched: [3:0.25]
+; ZNVER1-NEXT: bsrw (%rsi), %cx # sched: [7:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i16, i16 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i16 %a0, i16* %a1)
+ %2 = extractvalue { i16, i16 } %1, 0
+ %3 = extractvalue { i16, i16 } %1, 1
+ %4 = or i16 %2, %3
+ ret i16 %4
+}
+define i32 @test_bsr32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_bsr32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrl %edi, %eax # sched: [3:1.00]
+; GENERIC-NEXT: bsrl (%rsi), %ecx # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrl %edi, %eax # sched: [16:8.00]
+; ATOM-NEXT: bsrl (%rsi), %ecx # sched: [16:8.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrl %edi, %eax # sched: [1:1.00]
+; SLM-NEXT: bsrl (%rsi), %ecx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrl %edi, %eax # sched: [3:1.00]
+; SANDY-NEXT: bsrl (%rsi), %ecx # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orl %ecx, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrl %edi, %eax # sched: [3:1.00]
+; HASWELL-NEXT: bsrl (%rsi), %ecx # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bsr32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrl %edi, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: bsrl (%rsi), %ecx # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrl %edi, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: bsrl (%rsi), %ecx # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrl %edi, %eax # sched: [3:1.00]
+; SKX-NEXT: bsrl (%rsi), %ecx # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT: bsrl (%rsi), %ecx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrl %edi, %eax # sched: [3:0.25]
+; ZNVER1-NEXT: bsrl (%rsi), %ecx # sched: [7:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i32, i32 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32 %a0, i32* %a1)
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = or i32 %2, %3
+ ret i32 %4
+}
+define i64 @test_bsr64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_bsr64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: bsrq %rdi, %rax # sched: [3:1.00]
+; GENERIC-NEXT: bsrq (%rsi), %rcx # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bsr64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: bsrq %rdi, %rax # sched: [16:8.00]
+; ATOM-NEXT: bsrq (%rsi), %rcx # sched: [16:8.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bsr64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: bsrq %rdi, %rax # sched: [1:1.00]
+; SLM-NEXT: bsrq (%rsi), %rcx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bsr64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: bsrq %rdi, %rax # sched: [3:1.00]
+; SANDY-NEXT: bsrq (%rsi), %rcx # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bsr64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: bsrq %rdi, %rax # sched: [3:1.00]
+; HASWELL-NEXT: bsrq (%rsi), %rcx # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bsr64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: bsrq %rdi, %rax # sched: [3:1.00]
+; BROADWELL-NEXT: bsrq (%rsi), %rcx # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bsr64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: bsrq %rdi, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: bsrq (%rsi), %rcx # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bsr64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: bsrq %rdi, %rax # sched: [3:1.00]
+; SKX-NEXT: bsrq (%rsi), %rcx # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bsr64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: bsrq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT: bsrq (%rsi), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bsr64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: bsrq %rdi, %rax # sched: [3:0.25]
+; ZNVER1-NEXT: bsrq (%rsi), %rcx # sched: [7:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "bsr $2, $0 \0A\09 bsr $3, $1", "=r,=r,r,*m,~{dirflag},~{fpsr},~{flags}"(i64 %a0, i64* %a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define i32 @test_bswap32(i32 %a0) optsize {
+; GENERIC-LABEL: test_bswap32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bswapl %edi # sched: [2:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: bswapl %edi # sched: [1:1.00]
+; ATOM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap32:
+; SLM: # %bb.0:
+; SLM-NEXT: bswapl %edi # sched: [1:0.50]
+; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: bswapl %edi # sched: [2:1.00]
+; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bswap32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: bswapl %edi # sched: [2:0.50]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: bswapl %edi # sched: [2:0.50]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap32:
+; SKX: # %bb.0:
+; SKX-NEXT: bswapl %edi # sched: [2:0.50]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: bswapl %edi # sched: [1:0.50]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: bswapl %edi # sched: [1:1.00]
+; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i32 asm "bswap $0", "=r,0"(i32 %a0) nounwind
+ ret i32 %1
+}
+define i64 @test_bswap64(i64 %a0) optsize {
+; GENERIC-LABEL: test_bswap64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bswapq %rdi # sched: [2:1.00]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bswap64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: bswapq %rdi # sched: [1:1.00]
+; ATOM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bswap64:
+; SLM: # %bb.0:
+; SLM-NEXT: bswapq %rdi # sched: [1:0.50]
+; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bswap64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: bswapq %rdi # sched: [2:1.00]
+; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bswap64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bswap64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: bswapq %rdi # sched: [2:0.50]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bswap64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bswap64:
+; SKX: # %bb.0:
+; SKX-NEXT: bswapq %rdi # sched: [2:0.50]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bswap64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: bswapq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bswap64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: bswapq %rdi # sched: [1:1.00]
+; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = tail call i64 asm "bswap $0", "=r,0"(i64 %a0) nounwind
+ ret i64 %1
+}
+
+define void @test_bt_btc_btr_bts_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_bt_btc_btr_bts_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: btw %si, %di # sched: [1:0.50]
+; GENERIC-NEXT: btcw %si, %di # sched: [1:0.50]
+; GENERIC-NEXT: btrw %si, %di # sched: [1:0.50]
+; GENERIC-NEXT: btsw %si, %di # sched: [1:0.50]
+; GENERIC-NEXT: btw %si, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btcw %si, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btrw %si, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btsw %si, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: btcw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: btrw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: btsw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: btw $7, (%rdx) # sched: [6:0.50]
+; GENERIC-NEXT: btcw $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: btrw $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: btsw $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bt_btc_btr_bts_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: btw %si, %di # sched: [1:1.00]
+; ATOM-NEXT: btcw %si, %di # sched: [1:1.00]
+; ATOM-NEXT: btrw %si, %di # sched: [1:1.00]
+; ATOM-NEXT: btsw %si, %di # sched: [1:1.00]
+; ATOM-NEXT: btw %si, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: btcw %si, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btrw %si, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btsw %si, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: btcw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: btrw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: btsw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: btw $7, (%rdx) # sched: [1:0.50]
+; ATOM-NEXT: btcw $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: btrw $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: btsw $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bt_btc_btr_bts_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: btw %si, %di # sched: [1:0.50]
+; SLM-NEXT: btcw %si, %di # sched: [1:0.50]
+; SLM-NEXT: btrw %si, %di # sched: [1:0.50]
+; SLM-NEXT: btsw %si, %di # sched: [1:0.50]
+; SLM-NEXT: btw %si, (%rdx) # sched: [4:1.00]
+; SLM-NEXT: btcw %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btrw %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btsw %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btw $7, %di # sched: [1:0.50]
+; SLM-NEXT: btcw $7, %di # sched: [1:0.50]
+; SLM-NEXT: btrw $7, %di # sched: [1:0.50]
+; SLM-NEXT: btsw $7, %di # sched: [1:0.50]
+; SLM-NEXT: btw $7, (%rdx) # sched: [1:0.50]
+; SLM-NEXT: btcw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btrw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btsw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bt_btc_btr_bts_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: btw %si, %di # sched: [1:0.50]
+; SANDY-NEXT: btcw %si, %di # sched: [1:0.50]
+; SANDY-NEXT: btrw %si, %di # sched: [1:0.50]
+; SANDY-NEXT: btsw %si, %di # sched: [1:0.50]
+; SANDY-NEXT: btw %si, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btcw %si, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btrw %si, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btsw %si, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: btcw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: btrw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: btsw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: btw $7, (%rdx) # sched: [6:0.50]
+; SANDY-NEXT: btcw $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: btrw $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: btsw $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bt_btc_btr_bts_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: btw %si, %di # sched: [1:0.50]
+; HASWELL-NEXT: btcw %si, %di # sched: [1:0.50]
+; HASWELL-NEXT: btrw %si, %di # sched: [1:0.50]
+; HASWELL-NEXT: btsw %si, %di # sched: [1:0.50]
+; HASWELL-NEXT: btw %si, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btcw %si, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btrw %si, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btsw %si, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: btcw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: btrw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: btsw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: btw $7, (%rdx) # sched: [6:0.50]
+; HASWELL-NEXT: btcw $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: btrw $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: btsw $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bt_btc_btr_bts_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: btw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btcw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btrw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btsw %si, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btw %si, (%rdx) # sched: [6:0.50]
+; BROADWELL-NEXT: btcw %si, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btrw %si, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btsw %si, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btcw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btrw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btsw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: btw $7, (%rdx) # sched: [6:0.50]
+; BROADWELL-NEXT: btcw $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btrw $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btsw $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bt_btc_btr_bts_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: btw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btcw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btrw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btsw %si, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btw %si, (%rdx) # sched: [6:0.50]
+; SKYLAKE-NEXT: btcw %si, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btrw %si, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btsw %si, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btcw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btrw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btsw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: btw $7, (%rdx) # sched: [6:0.50]
+; SKYLAKE-NEXT: btcw $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btrw $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btsw $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bt_btc_btr_bts_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: btw %si, %di # sched: [1:0.50]
+; SKX-NEXT: btcw %si, %di # sched: [1:0.50]
+; SKX-NEXT: btrw %si, %di # sched: [1:0.50]
+; SKX-NEXT: btsw %si, %di # sched: [1:0.50]
+; SKX-NEXT: btw %si, (%rdx) # sched: [6:0.50]
+; SKX-NEXT: btcw %si, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btrw %si, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btsw %si, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btw $7, %di # sched: [1:0.50]
+; SKX-NEXT: btcw $7, %di # sched: [1:0.50]
+; SKX-NEXT: btrw $7, %di # sched: [1:0.50]
+; SKX-NEXT: btsw $7, %di # sched: [1:0.50]
+; SKX-NEXT: btw $7, (%rdx) # sched: [6:0.50]
+; SKX-NEXT: btcw $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btrw $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btsw $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bt_btc_btr_bts_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: btw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: btcw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: btrw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: btsw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: btw %si, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btcw %si, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btrw %si, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btsw %si, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: btcw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: btrw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: btsw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: btw $7, (%rdx) # sched: [1:0.50]
+; BTVER2-NEXT: btcw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btrw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btsw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bt_btc_btr_bts_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: btw %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: btcw %si, %di # sched: [2:0.25]
+; ZNVER1-NEXT: btrw %si, %di # sched: [2:0.25]
+; ZNVER1-NEXT: btsw %si, %di # sched: [2:0.25]
+; ZNVER1-NEXT: btw %si, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: btcw %si, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btrw %si, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btsw %si, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: btcw $7, %di # sched: [2:0.25]
+; ZNVER1-NEXT: btrw $7, %di # sched: [2:0.25]
+; ZNVER1-NEXT: btsw $7, %di # sched: [2:0.25]
+; ZNVER1-NEXT: btw $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: btcw $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btrw $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btsw $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "btw $1, $0 \0A\09 btcw $1, $0 \0A\09 btrw $1, $0 \0A\09 btsw $1, $0 \0A\09 btw $1, $2 \0A\09 btcw $1, $2 \0A\09 btrw $1, $2 \0A\09 btsw $1, $2 \0A\09 btw $3, $0 \0A\09 btcw $3, $0 \0A\09 btrw $3, $0 \0A\09 btsw $3, $0 \0A\09 btw $3, $2 \0A\09 btcw $3, $2 \0A\09 btrw $3, $2 \0A\09 btsw $3, $2", "r,r,*m,i"(i16 %a0, i16 %a1, i16 *%a2, i8 7)
+ ret void
+}
+define void @test_bt_btc_btr_bts_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_bt_btc_btr_bts_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: btl %esi, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btl %esi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btcl %esi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btrl %esi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btsl %esi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btcl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btrl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btsl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: btl $7, (%rdx) # sched: [6:0.50]
+; GENERIC-NEXT: btcl $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: btrl $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: btsl $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bt_btc_btr_bts_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: btl %esi, %edi # sched: [1:1.00]
+; ATOM-NEXT: btcl %esi, %edi # sched: [1:1.00]
+; ATOM-NEXT: btrl %esi, %edi # sched: [1:1.00]
+; ATOM-NEXT: btsl %esi, %edi # sched: [1:1.00]
+; ATOM-NEXT: btl %esi, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: btcl %esi, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btrl %esi, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btsl %esi, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: btcl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: btrl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: btsl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: btl $7, (%rdx) # sched: [1:0.50]
+; ATOM-NEXT: btcl $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: btrl $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: btsl $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bt_btc_btr_bts_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: btl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: btl %esi, (%rdx) # sched: [4:1.00]
+; SLM-NEXT: btcl %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btrl %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btsl %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: btcl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: btrl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: btsl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: btl $7, (%rdx) # sched: [1:0.50]
+; SLM-NEXT: btcl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btrl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btsl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bt_btc_btr_bts_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: btl %esi, %edi # sched: [1:0.50]
+; SANDY-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; SANDY-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; SANDY-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; SANDY-NEXT: btl %esi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btcl %esi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btrl %esi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btsl %esi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: btcl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: btrl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: btsl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: btl $7, (%rdx) # sched: [6:0.50]
+; SANDY-NEXT: btcl $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: btrl $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: btsl $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bt_btc_btr_bts_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: btl %esi, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btl %esi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btcl %esi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btrl %esi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btsl %esi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btl $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btcl $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btrl $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btsl $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: btl $7, (%rdx) # sched: [6:0.50]
+; HASWELL-NEXT: btcl $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: btrl $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: btsl $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bt_btc_btr_bts_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: btl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btl %esi, (%rdx) # sched: [6:0.50]
+; BROADWELL-NEXT: btcl %esi, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btrl %esi, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btsl %esi, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btcl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btrl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btsl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: btl $7, (%rdx) # sched: [6:0.50]
+; BROADWELL-NEXT: btcl $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btrl $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btsl $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bt_btc_btr_bts_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: btl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btl %esi, (%rdx) # sched: [6:0.50]
+; SKYLAKE-NEXT: btcl %esi, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btrl %esi, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btsl %esi, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btcl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btrl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btsl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: btl $7, (%rdx) # sched: [6:0.50]
+; SKYLAKE-NEXT: btcl $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btrl $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btsl $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bt_btc_btr_bts_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: btl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; SKX-NEXT: btl %esi, (%rdx) # sched: [6:0.50]
+; SKX-NEXT: btcl %esi, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btrl %esi, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btsl %esi, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: btcl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: btrl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: btsl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: btl $7, (%rdx) # sched: [6:0.50]
+; SKX-NEXT: btcl $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btrl $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btsl $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bt_btc_btr_bts_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: btl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btcl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btrl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btsl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btl %esi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btcl %esi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btrl %esi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btsl %esi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btcl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btrl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btsl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: btl $7, (%rdx) # sched: [1:0.50]
+; BTVER2-NEXT: btcl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btrl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btsl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bt_btc_btr_bts_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: btl %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: btcl %esi, %edi # sched: [2:0.25]
+; ZNVER1-NEXT: btrl %esi, %edi # sched: [2:0.25]
+; ZNVER1-NEXT: btsl %esi, %edi # sched: [2:0.25]
+; ZNVER1-NEXT: btl %esi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: btcl %esi, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btrl %esi, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btsl %esi, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: btcl $7, %edi # sched: [2:0.25]
+; ZNVER1-NEXT: btrl $7, %edi # sched: [2:0.25]
+; ZNVER1-NEXT: btsl $7, %edi # sched: [2:0.25]
+; ZNVER1-NEXT: btl $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: btcl $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btrl $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btsl $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "btl $1, $0 \0A\09 btcl $1, $0 \0A\09 btrl $1, $0 \0A\09 btsl $1, $0 \0A\09 btl $1, $2 \0A\09 btcl $1, $2 \0A\09 btrl $1, $2 \0A\09 btsl $1, $2 \0A\09 btl $3, $0 \0A\09 btcl $3, $0 \0A\09 btrl $3, $0 \0A\09 btsl $3, $0 \0A\09 btl $3, $2 \0A\09 btcl $3, $2 \0A\09 btrl $3, $2 \0A\09 btsl $3, $2", "r,r,*m,i"(i32 %a0, i32 %a1, i32 *%a2, i8 7)
+ ret void
+}
+define void @test_bt_btc_btr_bts_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_bt_btc_btr_bts_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btq %rsi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btcq %rsi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btrq %rsi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btsq %rsi, (%rdx) # sched: [9:1.00]
+; GENERIC-NEXT: btq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: btq $7, (%rdx) # sched: [6:0.50]
+; GENERIC-NEXT: btcq $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: btrq $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: btsq $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_bt_btc_btr_bts_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: btq %rsi, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btcq %rsi, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btrq %rsi, %rdi
+; ATOM-NEXT: btsq %rsi, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btq %rsi, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: btcq %rsi, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btrq %rsi, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btsq %rsi, (%rdx) # sched: [11:5.50]
+; ATOM-NEXT: btq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btcq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btrq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btsq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: btq $7, (%rdx) # sched: [1:0.50]
+; ATOM-NEXT: btcq $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: btrq $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: btsq $7, (%rdx) # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_bt_btc_btr_bts_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: btq %rsi, (%rdx) # sched: [4:1.00]
+; SLM-NEXT: btcq %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btrq %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btsq %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: btq $7, (%rdx) # sched: [1:0.50]
+; SLM-NEXT: btcq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btrq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: btsq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_bt_btc_btr_bts_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btq %rsi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btcq %rsi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btrq %rsi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btsq %rsi, (%rdx) # sched: [9:1.00]
+; SANDY-NEXT: btq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: btq $7, (%rdx) # sched: [6:0.50]
+; SANDY-NEXT: btcq $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: btrq $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: btsq $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_bt_btc_btr_bts_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btq %rsi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btcq %rsi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btrq %rsi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btsq %rsi, (%rdx) # sched: [1:?]
+; HASWELL-NEXT: btq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: btq $7, (%rdx) # sched: [6:0.50]
+; HASWELL-NEXT: btcq $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: btrq $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: btsq $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_bt_btc_btr_bts_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btq %rsi, (%rdx) # sched: [6:0.50]
+; BROADWELL-NEXT: btcq %rsi, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btrq %rsi, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btsq %rsi, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: btq $7, (%rdx) # sched: [6:0.50]
+; BROADWELL-NEXT: btcq $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btrq $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: btsq $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_bt_btc_btr_bts_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btq %rsi, (%rdx) # sched: [6:0.50]
+; SKYLAKE-NEXT: btcq %rsi, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btrq %rsi, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btsq %rsi, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: btq $7, (%rdx) # sched: [6:0.50]
+; SKYLAKE-NEXT: btcq $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btrq $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: btsq $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_bt_btc_btr_bts_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; SKX-NEXT: btq %rsi, (%rdx) # sched: [6:0.50]
+; SKX-NEXT: btcq %rsi, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btrq %rsi, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btsq %rsi, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: btq $7, (%rdx) # sched: [6:0.50]
+; SKX-NEXT: btcq $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btrq $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: btsq $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_bt_btc_btr_bts_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: btq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btcq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btrq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btsq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btq %rsi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btcq %rsi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btrq %rsi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btsq %rsi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btcq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btrq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btsq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: btq $7, (%rdx) # sched: [1:0.50]
+; BTVER2-NEXT: btcq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btrq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: btsq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_bt_btc_btr_bts_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: btq %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: btcq %rsi, %rdi # sched: [2:0.25]
+; ZNVER1-NEXT: btrq %rsi, %rdi # sched: [2:0.25]
+; ZNVER1-NEXT: btsq %rsi, %rdi # sched: [2:0.25]
+; ZNVER1-NEXT: btq %rsi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: btcq %rsi, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btrq %rsi, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btsq %rsi, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: btcq $7, %rdi # sched: [2:0.25]
+; ZNVER1-NEXT: btrq $7, %rdi # sched: [2:0.25]
+; ZNVER1-NEXT: btsq $7, %rdi # sched: [2:0.25]
+; ZNVER1-NEXT: btq $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: btcq $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btrq $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: btsq $7, (%rdx) # sched: [6:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "btq $1, $0 \0A\09 btcq $1, $0 \0A\09 btrq $1, $0 \0A\09 btsq $1, $0 \0A\09 btq $1, $2 \0A\09 btcq $1, $2 \0A\09 btrq $1, $2 \0A\09 btsq $1, $2 \0A\09 btq $3, $0 \0A\09 btcq $3, $0 \0A\09 btrq $3, $0 \0A\09 btsq $3, $0 \0A\09 btq $3, $2 \0A\09 btcq $3, $2 \0A\09 btrq $3, $2 \0A\09 btsq $3, $2", "r,r,*m,i"(i64 %a0, i64 %a1, i64 *%a2, i8 7)
+ ret void
+}
+
+; TODO - test_call
+
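+; Note: the inline asm in the test below uses the Intel-style mnemonics
+; cbw/cdq/cdqe/cqo/cwd/cwde, while the CHECK lines match the AT&T spellings the
+; assembler prints for the same instructions (cbtw/cltd/cltq/cqto/cwtd/cwtl
+; respectively), so the two sets of names intentionally differ.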
+define void @test_cbw_cdq_cdqe_cqo_cwd_cwde() optsize {
+; GENERIC-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cbtw # sched: [1:0.33]
+; GENERIC-NEXT: cltd # sched: [1:0.50]
+; GENERIC-NEXT: cltq # sched: [1:0.50]
+; GENERIC-NEXT: cqto # sched: [1:0.50]
+; GENERIC-NEXT: cwtd # sched: [2:1.00]
+; GENERIC-NEXT: cwtl # sched: [1:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cbtw # sched: [4:2.00]
+; ATOM-NEXT: cltd # sched: [4:2.00]
+; ATOM-NEXT: cltq # sched: [4:2.00]
+; ATOM-NEXT: cqto # sched: [4:2.00]
+; ATOM-NEXT: cwtd # sched: [4:2.00]
+; ATOM-NEXT: cwtl # sched: [4:2.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cbtw # sched: [1:0.50]
+; SLM-NEXT: cltd # sched: [1:0.50]
+; SLM-NEXT: cltq # sched: [1:0.50]
+; SLM-NEXT: cqto # sched: [1:0.50]
+; SLM-NEXT: cwtd # sched: [1:0.50]
+; SLM-NEXT: cwtl # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cbtw # sched: [1:0.33]
+; SANDY-NEXT: cltd # sched: [1:0.50]
+; SANDY-NEXT: cltq # sched: [1:0.50]
+; SANDY-NEXT: cqto # sched: [1:0.50]
+; SANDY-NEXT: cwtd # sched: [2:1.00]
+; SANDY-NEXT: cwtl # sched: [1:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cbtw # sched: [1:0.25]
+; HASWELL-NEXT: cltd # sched: [1:0.50]
+; HASWELL-NEXT: cltq # sched: [1:0.50]
+; HASWELL-NEXT: cqto # sched: [1:0.50]
+; HASWELL-NEXT: cwtd # sched: [2:0.50]
+; HASWELL-NEXT: cwtl # sched: [1:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cbtw # sched: [1:0.25]
+; BROADWELL-NEXT: cltd # sched: [1:0.50]
+; BROADWELL-NEXT: cltq # sched: [1:0.50]
+; BROADWELL-NEXT: cqto # sched: [1:0.50]
+; BROADWELL-NEXT: cwtd # sched: [2:0.50]
+; BROADWELL-NEXT: cwtl # sched: [1:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cbtw # sched: [1:0.25]
+; SKYLAKE-NEXT: cltd # sched: [1:0.50]
+; SKYLAKE-NEXT: cltq # sched: [1:0.50]
+; SKYLAKE-NEXT: cqto # sched: [1:0.50]
+; SKYLAKE-NEXT: cwtd # sched: [2:0.50]
+; SKYLAKE-NEXT: cwtl # sched: [1:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cbtw # sched: [1:0.25]
+; SKX-NEXT: cltd # sched: [1:0.50]
+; SKX-NEXT: cltq # sched: [1:0.50]
+; SKX-NEXT: cqto # sched: [1:0.50]
+; SKX-NEXT: cwtd # sched: [2:0.50]
+; SKX-NEXT: cwtl # sched: [1:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cbtw # sched: [1:0.50]
+; BTVER2-NEXT: cltd # sched: [1:0.50]
+; BTVER2-NEXT: cltq # sched: [1:0.50]
+; BTVER2-NEXT: cqto # sched: [1:0.50]
+; BTVER2-NEXT: cwtd # sched: [1:0.50]
+; BTVER2-NEXT: cwtl # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cbw_cdq_cdqe_cqo_cwd_cwde:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cbtw # sched: [1:0.25]
+; ZNVER1-NEXT: cltd # sched: [1:0.25]
+; ZNVER1-NEXT: cltq # sched: [1:0.25]
+; ZNVER1-NEXT: cqto # sched: [1:0.25]
+; ZNVER1-NEXT: cwtd # sched: [1:0.25]
+; ZNVER1-NEXT: cwtl # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cbw \0A\09 cdq \0A\09 cdqe \0A\09 cqo \0A\09 cwd \0A\09 cwde", ""() nounwind
+ ret void
+}
+
+define void @test_clc_cld_cmc() optsize {
+; GENERIC-LABEL: test_clc_cld_cmc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: clc # sched: [1:0.33]
+; GENERIC-NEXT: cld # sched: [1:0.33]
+; GENERIC-NEXT: cmc # sched: [1:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_clc_cld_cmc:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: clc # sched: [1:0.50]
+; ATOM-NEXT: cld # sched: [3:1.50]
+; ATOM-NEXT: cmc # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_clc_cld_cmc:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: clc # sched: [1:0.50]
+; SLM-NEXT: cld # sched: [1:0.50]
+; SLM-NEXT: cmc # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_clc_cld_cmc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: clc # sched: [1:0.33]
+; SANDY-NEXT: cld # sched: [1:0.33]
+; SANDY-NEXT: cmc # sched: [1:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_clc_cld_cmc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: clc # sched: [1:0.25]
+; HASWELL-NEXT: cld # sched: [3:1.00]
+; HASWELL-NEXT: cmc # sched: [1:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_clc_cld_cmc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: clc # sched: [1:0.25]
+; BROADWELL-NEXT: cld # sched: [3:1.00]
+; BROADWELL-NEXT: cmc # sched: [1:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_clc_cld_cmc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: clc # sched: [1:0.25]
+; SKYLAKE-NEXT: cld # sched: [3:1.00]
+; SKYLAKE-NEXT: cmc # sched: [1:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_clc_cld_cmc:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: clc # sched: [1:0.25]
+; SKX-NEXT: cld # sched: [3:1.00]
+; SKX-NEXT: cmc # sched: [1:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_clc_cld_cmc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: clc # sched: [1:0.50]
+; BTVER2-NEXT: cld # sched: [1:0.50]
+; BTVER2-NEXT: cmc # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_clc_cld_cmc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: clc # sched: [1:0.25]
+; ZNVER1-NEXT: cld # sched: [1:0.25]
+; ZNVER1-NEXT: cmc # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "clc \0A\09 cld \0A\09 cmc", ""() nounwind
+ ret void
+}
+
+define void @test_cmp_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_cmp_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: cmpb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: cmpb $7, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: cmpb %dil, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpb (%rsi), %dil # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmp_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: cmpb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: cmpb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: cmpb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmp_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpb $7, %al # sched: [1:0.50]
+; SLM-NEXT: cmpb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: cmpb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: cmpb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmp_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: cmpb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: cmpb $7, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: cmpb %dil, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpb (%rsi), %dil # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmp_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: cmpb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: cmpb $7, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: cmpb %dil, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpb (%rsi), %dil # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmp_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: cmpb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: cmpb $7, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: cmpb %dil, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmp_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpb $7, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpb %dil, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmp_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpb $7, %al # sched: [1:0.25]
+; SKX-NEXT: cmpb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: cmpb $7, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: cmpb %dil, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmp_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: cmpb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: cmpb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: cmpb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmp_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: cmpb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: cmpb $7, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: cmpb %dil, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpb $2, %AL \0A\09 cmpb $2, $0 \0A\09 cmpb $2, $1 \0A\09 cmpb $0, $0 \0A\09 cmpb $0, $1 \0A\09 cmpb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_cmp_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_cmp_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: cmpw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [6:0.50]
+; GENERIC-NEXT: cmpw $7, %di # sched: [1:0.33]
+; GENERIC-NEXT: cmpw $7, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: cmpw %di, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpw (%rsi), %di # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmp_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: cmpw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: cmpw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: cmpw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: cmpw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmp_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: cmpw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: cmpw $7, %di # sched: [1:0.50]
+; SLM-NEXT: cmpw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpw %di, %di # sched: [1:0.50]
+; SLM-NEXT: cmpw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmp_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: cmpw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [6:0.50]
+; SANDY-NEXT: cmpw $7, %di # sched: [1:0.33]
+; SANDY-NEXT: cmpw $7, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: cmpw %di, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpw (%rsi), %di # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmp_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: cmpw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [6:0.50]
+; HASWELL-NEXT: cmpw $7, %di # sched: [1:0.25]
+; HASWELL-NEXT: cmpw $7, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: cmpw %di, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpw (%rsi), %di # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmp_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: cmpw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: cmpw $7, %di # sched: [1:0.25]
+; BROADWELL-NEXT: cmpw $7, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: cmpw %di, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmp_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpw $7, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpw $7, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpw %di, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmp_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: cmpw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: cmpw $7, %di # sched: [1:0.25]
+; SKX-NEXT: cmpw $7, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpw %di, %di # sched: [1:0.25]
+; SKX-NEXT: cmpw %di, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmp_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: cmpw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: cmpw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmpw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: cmpw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmp_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: cmpw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: cmpw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:1.00]
+; ZNVER1-NEXT: cmpw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmpw $7, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: cmpw %di, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpw $2, %AX \0A\09 cmpw $2, $0 \0A\09 cmpw $2, $1 \0A\09 cmpw $3, $0 \0A\09 cmpw $3, $1 \0A\09 cmpw $0, $0 \0A\09 cmpw $0, $1 \0A\09 cmpw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_cmp_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_cmp_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [6:0.50]
+; GENERIC-NEXT: cmpl $7, %edi # sched: [1:0.33]
+; GENERIC-NEXT: cmpl $7, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: cmpl %edi, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpl (%rsi), %edi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmp_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: cmpl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmpl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: cmpl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmp_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: cmpl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: cmpl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: cmpl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmp_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [6:0.50]
+; SANDY-NEXT: cmpl $7, %edi # sched: [1:0.33]
+; SANDY-NEXT: cmpl $7, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: cmpl %edi, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpl (%rsi), %edi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmp_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [6:0.50]
+; HASWELL-NEXT: cmpl $7, %edi # sched: [1:0.25]
+; HASWELL-NEXT: cmpl $7, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: cmpl %edi, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpl (%rsi), %edi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmp_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: cmpl $7, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: cmpl $7, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: cmpl %edi, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmp_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpl $7, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpl $7, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpl %edi, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmp_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: cmpl $7, %edi # sched: [1:0.25]
+; SKX-NEXT: cmpl $7, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: cmpl %edi, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmp_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: cmpl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmpl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: cmpl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmp_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: cmpl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: cmpl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:1.00]
+; ZNVER1-NEXT: cmpl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmpl $7, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: cmpl %edi, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpl $2, %EAX \0A\09 cmpl $2, $0 \0A\09 cmpl $2, $1 \0A\09 cmpl $3, $0 \0A\09 cmpl $3, $1 \0A\09 cmpl $0, $0 \0A\09 cmpl $0, $1 \0A\09 cmpl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_cmp_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_cmp_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [6:0.50]
+; GENERIC-NEXT: cmpq $7, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: cmpq $7, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: cmpq %rdi, (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: cmpq (%rsi), %rdi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmp_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: cmpq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmpq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: cmpq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: cmpq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmp_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: cmpq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmpq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: cmpq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: cmpq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmp_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [6:0.50]
+; SANDY-NEXT: cmpq $7, %rdi # sched: [1:0.33]
+; SANDY-NEXT: cmpq $7, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: cmpq %rdi, (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: cmpq (%rsi), %rdi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmp_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [6:0.50]
+; HASWELL-NEXT: cmpq $7, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: cmpq $7, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: cmpq %rdi, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: cmpq (%rsi), %rdi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmp_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: cmpq $7, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: cmpq $7, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: cmpq %rdi, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: cmpq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmp_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpq $7, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpq $7, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: cmpq %rdi, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: cmpq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmp_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: cmpq $7, %rdi # sched: [1:0.25]
+; SKX-NEXT: cmpq $7, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: cmpq %rdi, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: cmpq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmp_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: cmpq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmpq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: cmpq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmp_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: cmpq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: cmpq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:1.00]
+; ZNVER1-NEXT: cmpq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmpq $7, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: cmpq %rdi, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: cmpq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpq $2, %RAX \0A\09 cmpq $2, $0 \0A\09 cmpq $2, $1 \0A\09 cmpq $3, $0 \0A\09 cmpq $3, $1 \0A\09 cmpq $0, $0 \0A\09 cmpq $0, $1 \0A\09 cmpq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
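+; Note: the asm string in the test below uses the implicit-operand string forms
+; (cmpsb/cmpsw/cmpsl/cmpsq); the CHECK lines match the explicit-operand spelling
+; the AsmPrinter emits for them, e.g. "cmpsb %es:(%rdi), (%rsi)".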
+define void @test_cmps() optsize {
+; GENERIC-LABEL: test_cmps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [8:1.00]
+; GENERIC-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [8:1.00]
+; GENERIC-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [8:1.00]
+; GENERIC-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmps:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [3:1.50]
+; ATOM-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [3:1.50]
+; ATOM-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [3:1.50]
+; ATOM-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmps:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [100:1.00]
+; SLM-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [100:1.00]
+; SLM-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [100:1.00]
+; SLM-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmps:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [8:1.00]
+; SANDY-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [8:1.00]
+; SANDY-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [8:1.00]
+; SANDY-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmps:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [4:1.00]
+; HASWELL-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [4:1.00]
+; HASWELL-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [4:1.00]
+; HASWELL-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [4:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [100:0.25]
+; BROADWELL-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [100:0.25]
+; BROADWELL-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [100:0.25]
+; BROADWELL-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKYLAKE-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKYLAKE-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKYLAKE-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmps:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKX-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKX-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKX-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmps:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [100:0.17]
+; BTVER2-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [100:0.17]
+; BTVER2-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [100:0.17]
+; BTVER2-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmps:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpsb %es:(%rdi), (%rsi) # sched: [100:?]
+; ZNVER1-NEXT: cmpsw %es:(%rdi), (%rsi) # sched: [100:?]
+; ZNVER1-NEXT: cmpsl %es:(%rdi), (%rsi) # sched: [100:?]
+; ZNVER1-NEXT: cmpsq %es:(%rdi), (%rsi) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "cmpsb \0A\09 cmpsw \0A\09 cmpsl \0A\09 cmpsq", ""()
+ ret void
+}
+
+define void @test_cmpxchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
+; GENERIC-LABEL: test_cmpxchg_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpxchgb %dil, %sil # sched: [5:1.33]
+; GENERIC-NEXT: cmpxchgb %dil, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmpxchg_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpxchgb %dil, %sil # sched: [9:4.50]
+; ATOM-NEXT: cmpxchgb %dil, (%rdx) # sched: [6:3.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmpxchg_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpxchgb %dil, %sil # sched: [1:0.50]
+; SLM-NEXT: cmpxchgb %dil, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmpxchg_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpxchgb %dil, %sil # sched: [5:1.33]
+; SANDY-NEXT: cmpxchgb %dil, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmpxchg_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpxchgb %dil, %sil # sched: [5:1.25]
+; HASWELL-NEXT: cmpxchgb %dil, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpxchg_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpxchgb %dil, %sil # sched: [5:1.25]
+; BROADWELL-NEXT: cmpxchgb %dil, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpxchg_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpxchgb %dil, %sil # sched: [5:1.25]
+; SKYLAKE-NEXT: cmpxchgb %dil, (%rdx) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpxchg_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpxchgb %dil, %sil # sched: [5:1.25]
+; SKX-NEXT: cmpxchgb %dil, (%rdx) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmpxchg_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpxchgb %dil, %sil # sched: [1:0.50]
+; BTVER2-NEXT: cmpxchgb %dil, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpxchg_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpxchgb %dil, %sil # sched: [1:0.25]
+; ZNVER1-NEXT: cmpxchgb %dil, (%rdx) # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpxchgb $0, $1 \0a\09 cmpxchgb $0, $2", "r,r,*m"(i8 %a0, i8 %a1, i8 *%a2) nounwind
+ ret void
+}
+define void @test_cmpxchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_cmpxchg_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpxchgw %di, %si # sched: [5:1.33]
+; GENERIC-NEXT: cmpxchgw %di, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmpxchg_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpxchgw %di, %si # sched: [15:7.50]
+; ATOM-NEXT: cmpxchgw %di, (%rdx) # sched: [14:7.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmpxchg_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpxchgw %di, %si # sched: [1:0.50]
+; SLM-NEXT: cmpxchgw %di, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmpxchg_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpxchgw %di, %si # sched: [5:1.33]
+; SANDY-NEXT: cmpxchgw %di, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmpxchg_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpxchgw %di, %si # sched: [5:1.25]
+; HASWELL-NEXT: cmpxchgw %di, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpxchg_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpxchgw %di, %si # sched: [5:1.25]
+; BROADWELL-NEXT: cmpxchgw %di, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpxchg_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpxchgw %di, %si # sched: [5:1.25]
+; SKYLAKE-NEXT: cmpxchgw %di, (%rdx) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpxchg_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpxchgw %di, %si # sched: [5:1.25]
+; SKX-NEXT: cmpxchgw %di, (%rdx) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmpxchg_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpxchgw %di, %si # sched: [1:0.50]
+; BTVER2-NEXT: cmpxchgw %di, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpxchg_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpxchgw %di, %si # sched: [1:0.25]
+; ZNVER1-NEXT: cmpxchgw %di, (%rdx) # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpxchgw $0, $1 \0a\09 cmpxchgw $0, $2", "r,r,*m"(i16 %a0, i16 %a1, i16 *%a2) nounwind
+ ret void
+}
+define void @test_cmpxchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_cmpxchg_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpxchgl %edi, %esi # sched: [5:1.33]
+; GENERIC-NEXT: cmpxchgl %edi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmpxchg_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpxchgl %edi, %esi # sched: [15:7.50]
+; ATOM-NEXT: cmpxchgl %edi, (%rdx) # sched: [14:7.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmpxchg_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpxchgl %edi, %esi # sched: [1:0.50]
+; SLM-NEXT: cmpxchgl %edi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmpxchg_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpxchgl %edi, %esi # sched: [5:1.33]
+; SANDY-NEXT: cmpxchgl %edi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmpxchg_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpxchgl %edi, %esi # sched: [5:1.25]
+; HASWELL-NEXT: cmpxchgl %edi, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpxchg_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpxchgl %edi, %esi # sched: [5:1.25]
+; BROADWELL-NEXT: cmpxchgl %edi, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpxchg_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpxchgl %edi, %esi # sched: [5:1.25]
+; SKYLAKE-NEXT: cmpxchgl %edi, (%rdx) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpxchg_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpxchgl %edi, %esi # sched: [5:1.25]
+; SKX-NEXT: cmpxchgl %edi, (%rdx) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmpxchg_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpxchgl %edi, %esi # sched: [1:0.50]
+; BTVER2-NEXT: cmpxchgl %edi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpxchg_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpxchgl %edi, %esi # sched: [1:0.25]
+; ZNVER1-NEXT: cmpxchgl %edi, (%rdx) # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpxchgl $0, $1 \0a\09 cmpxchgl $0, $2", "r,r,*m"(i32 %a0, i32 %a1, i32 *%a2) nounwind
+ ret void
+}
+define void @test_cmpxchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_cmpxchg_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpxchgq %rdi, %rsi # sched: [5:1.33]
+; GENERIC-NEXT: cmpxchgq %rdi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmpxchg_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpxchgq %rdi, %rsi # sched: [15:7.50]
+; ATOM-NEXT: cmpxchgq %rdi, (%rdx) # sched: [14:7.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmpxchg_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpxchgq %rdi, %rsi # sched: [1:0.50]
+; SLM-NEXT: cmpxchgq %rdi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmpxchg_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpxchgq %rdi, %rsi # sched: [5:1.33]
+; SANDY-NEXT: cmpxchgq %rdi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmpxchg_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpxchgq %rdi, %rsi # sched: [5:1.25]
+; HASWELL-NEXT: cmpxchgq %rdi, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpxchg_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpxchgq %rdi, %rsi # sched: [5:1.25]
+; BROADWELL-NEXT: cmpxchgq %rdi, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpxchg_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpxchgq %rdi, %rsi # sched: [5:1.25]
+; SKYLAKE-NEXT: cmpxchgq %rdi, (%rdx) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpxchg_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpxchgq %rdi, %rsi # sched: [5:1.25]
+; SKX-NEXT: cmpxchgq %rdi, (%rdx) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmpxchg_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpxchgq %rdi, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: cmpxchgq %rdi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpxchg_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpxchgq %rdi, %rsi # sched: [1:0.25]
+; ZNVER1-NEXT: cmpxchgq %rdi, (%rdx) # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpxchgq $0, $1 \0a\09 cmpxchgq $0, $2", "r,r,*m"(i64 %a0, i64 %a1, i64 *%a2) nounwind
+ ret void
+}
+define void @test_cmpxchg8b_cmpxchg16b(i8 *%a0) optsize {
+; GENERIC-LABEL: test_cmpxchg8b_cmpxchg16b:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cmpxchg8b (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: cmpxchg16b (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cmpxchg8b_cmpxchg16b:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cmpxchg8b (%rdi) # sched: [18:9.00]
+; ATOM-NEXT: cmpxchg16b (%rdi) # sched: [22:11.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cmpxchg8b_cmpxchg16b:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cmpxchg8b (%rdi) # sched: [4:2.00]
+; SLM-NEXT: cmpxchg16b (%rdi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cmpxchg8b_cmpxchg16b:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cmpxchg8b (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: cmpxchg16b (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cmpxchg8b_cmpxchg16b:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cmpxchg8b (%rdi) # sched: [17:2.75]
+; HASWELL-NEXT: cmpxchg16b (%rdi) # sched: [22:4.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpxchg8b_cmpxchg16b:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cmpxchg8b (%rdi) # sched: [16:2.75]
+; BROADWELL-NEXT: cmpxchg16b (%rdi) # sched: [21:4.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpxchg8b_cmpxchg16b:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cmpxchg8b (%rdi) # sched: [16:2.75]
+; SKYLAKE-NEXT: cmpxchg16b (%rdi) # sched: [23:4.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpxchg8b_cmpxchg16b:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cmpxchg8b (%rdi) # sched: [16:2.75]
+; SKX-NEXT: cmpxchg16b (%rdi) # sched: [23:4.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cmpxchg8b_cmpxchg16b:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cmpxchg8b (%rdi) # sched: [4:1.00]
+; BTVER2-NEXT: cmpxchg16b (%rdi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cmpxchg8b_cmpxchg16b:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cmpxchg8b (%rdi) # sched: [1:0.50]
+; ZNVER1-NEXT: cmpxchg16b (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cmpxchg8b $0 \0a\09 cmpxchg16b $0", "*m"(i8 *%a0) nounwind
+ ret void
+}
+
+define void @test_cpuid() optsize {
+; GENERIC-LABEL: test_cpuid:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: cpuid # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_cpuid:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: cpuid # sched: [121:60.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_cpuid:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: cpuid # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_cpuid:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: cpuid # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_cpuid:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: cpuid # sched: [18:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cpuid:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: cpuid # sched: [18:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cpuid:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: cpuid # sched: [18:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cpuid:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: cpuid # sched: [18:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_cpuid:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: cpuid # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_cpuid:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: cpuid # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "cpuid", ""() nounwind
+ ret void
+}
+
+define void @test_dec8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_dec8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: decb %dil # sched: [1:0.33]
+; GENERIC-NEXT: decb (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_dec8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: decb %dil # sched: [1:0.50]
+; ATOM-NEXT: decb (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_dec8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: decb %dil # sched: [1:0.50]
+; SLM-NEXT: decb (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dec8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: decb %dil # sched: [1:0.33]
+; SANDY-NEXT: decb (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_dec8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: decb %dil # sched: [1:0.25]
+; HASWELL-NEXT: decb (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dec8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: decb %dil # sched: [1:0.25]
+; BROADWELL-NEXT: decb (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dec8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: decb %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: decb (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dec8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: decb %dil # sched: [1:0.25]
+; SKX-NEXT: decb (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_dec8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: decb %dil # sched: [1:0.50]
+; BTVER2-NEXT: decb (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dec8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: decb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: decb (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "decb $0 \0A\09 decb $1", "r,*m"(i8 %a0, i8* %a1) nounwind
+ ret void
+}
+define void @test_dec16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_dec16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: decw %di # sched: [1:0.33]
+; GENERIC-NEXT: decw (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_dec16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: decw %di # sched: [1:0.50]
+; ATOM-NEXT: decw (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_dec16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: decw %di # sched: [1:0.50]
+; SLM-NEXT: decw (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dec16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: decw %di # sched: [1:0.33]
+; SANDY-NEXT: decw (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_dec16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: decw %di # sched: [1:0.25]
+; HASWELL-NEXT: decw (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dec16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: decw %di # sched: [1:0.25]
+; BROADWELL-NEXT: decw (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dec16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: decw %di # sched: [1:0.25]
+; SKYLAKE-NEXT: decw (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dec16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: decw %di # sched: [1:0.25]
+; SKX-NEXT: decw (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_dec16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: decw %di # sched: [1:0.50]
+; BTVER2-NEXT: decw (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dec16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: decw %di # sched: [1:0.25]
+; ZNVER1-NEXT: decw (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "decw $0 \0A\09 decw $1", "r,*m"(i16 %a0, i16* %a1) nounwind
+ ret void
+}
+define void @test_dec32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_dec32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: decl %edi # sched: [1:0.33]
+; GENERIC-NEXT: decl (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_dec32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: decl %edi # sched: [1:0.50]
+; ATOM-NEXT: decl (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_dec32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: decl %edi # sched: [1:0.50]
+; SLM-NEXT: decl (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dec32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: decl %edi # sched: [1:0.33]
+; SANDY-NEXT: decl (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_dec32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: decl %edi # sched: [1:0.25]
+; HASWELL-NEXT: decl (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dec32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: decl %edi # sched: [1:0.25]
+; BROADWELL-NEXT: decl (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dec32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: decl %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: decl (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dec32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: decl %edi # sched: [1:0.25]
+; SKX-NEXT: decl (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_dec32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: decl %edi # sched: [1:0.50]
+; BTVER2-NEXT: decl (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dec32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: decl %edi # sched: [1:0.25]
+; ZNVER1-NEXT: decl (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "decl $0 \0A\09 decl $1", "r,*m"(i32 %a0, i32* %a1) nounwind
+ ret void
+}
+define void @test_dec64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_dec64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: decq %rdi # sched: [1:0.33]
+; GENERIC-NEXT: decq (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_dec64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: decq %rdi # sched: [1:0.50]
+; ATOM-NEXT: decq (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_dec64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: decq %rdi # sched: [1:0.50]
+; SLM-NEXT: decq (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_dec64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: decq %rdi # sched: [1:0.33]
+; SANDY-NEXT: decq (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_dec64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: decq %rdi # sched: [1:0.25]
+; HASWELL-NEXT: decq (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dec64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: decq %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: decq (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dec64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: decq %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: decq (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dec64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: decq %rdi # sched: [1:0.25]
+; SKX-NEXT: decq (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_dec64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: decq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: decq (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_dec64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: decq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: decq (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "decq $0 \0A\09 decq $1", "r,*m"(i64 %a0, i64* %a1) nounwind
+ ret void
+}
+
+define void @test_div(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) optsize {
+; GENERIC-LABEL: test_div:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: divb %dil # sched: [25:10.00]
+; GENERIC-NEXT: divb (%r8) # sched: [29:10.00]
+; GENERIC-NEXT: divw %si # sched: [25:10.00]
+; GENERIC-NEXT: divw (%r9) # sched: [29:10.00]
+; GENERIC-NEXT: divl %edx # sched: [25:10.00]
+; GENERIC-NEXT: divl (%rax) # sched: [29:10.00]
+; GENERIC-NEXT: divq %rcx # sched: [25:10.00]
+; GENERIC-NEXT: divq (%r10) # sched: [29:10.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_div:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [1:1.00]
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: divb %dil # sched: [50:25.00]
+; ATOM-NEXT: divb (%r8) # sched: [68:34.00]
+; ATOM-NEXT: divw %si # sched: [50:25.00]
+; ATOM-NEXT: divw (%r9) # sched: [50:25.00]
+; ATOM-NEXT: divl %edx # sched: [50:25.00]
+; ATOM-NEXT: divl (%rax) # sched: [50:25.00]
+; ATOM-NEXT: divq %rcx # sched: [130:65.00]
+; ATOM-NEXT: divq (%r10) # sched: [130:65.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_div:
+; SLM: # %bb.0:
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [3:1.00]
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: divb %dil # sched: [25:25.00]
+; SLM-NEXT: divb (%r8) # sched: [29:25.00]
+; SLM-NEXT: divw %si # sched: [25:25.00]
+; SLM-NEXT: divw (%r9) # sched: [29:25.00]
+; SLM-NEXT: divl %edx # sched: [25:25.00]
+; SLM-NEXT: divl (%rax) # sched: [29:25.00]
+; SLM-NEXT: divq %rcx # sched: [25:25.00]
+; SLM-NEXT: divq (%r10) # sched: [29:25.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_div:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: divb %dil # sched: [25:10.00]
+; SANDY-NEXT: divb (%r8) # sched: [29:10.00]
+; SANDY-NEXT: divw %si # sched: [25:10.00]
+; SANDY-NEXT: divw (%r9) # sched: [29:10.00]
+; SANDY-NEXT: divl %edx # sched: [25:10.00]
+; SANDY-NEXT: divl (%rax) # sched: [29:10.00]
+; SANDY-NEXT: divq %rcx # sched: [25:10.00]
+; SANDY-NEXT: divq (%r10) # sched: [29:10.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_div:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: divb %dil # sched: [22:1.00]
+; HASWELL-NEXT: divb (%r8) # sched: [29:10.00]
+; HASWELL-NEXT: divw %si # sched: [98:8.00]
+; HASWELL-NEXT: divw (%r9) # sched: [29:10.00]
+; HASWELL-NEXT: divl %edx # sched: [98:8.00]
+; HASWELL-NEXT: divl (%rax) # sched: [29:10.00]
+; HASWELL-NEXT: divq %rcx # sched: [98:8.00]
+; HASWELL-NEXT: divq (%r10) # sched: [29:10.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_div:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: divb %dil # sched: [25:10.00]
+; BROADWELL-NEXT: divb (%r8) # sched: [34:2.00]
+; BROADWELL-NEXT: divw %si # sched: [80:8.00]
+; BROADWELL-NEXT: divw (%r9) # sched: [34:2.00]
+; BROADWELL-NEXT: divl %edx # sched: [80:8.00]
+; BROADWELL-NEXT: divl (%rax) # sched: [34:2.00]
+; BROADWELL-NEXT: divq %rcx # sched: [80:8.00]
+; BROADWELL-NEXT: divq (%r10) # sched: [34:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_div:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: divb %dil # sched: [25:10.00]
+; SKYLAKE-NEXT: divb (%r8) # sched: [29:10.00]
+; SKYLAKE-NEXT: divw %si # sched: [76:8.00]
+; SKYLAKE-NEXT: divw (%r9) # sched: [29:10.00]
+; SKYLAKE-NEXT: divl %edx # sched: [76:8.00]
+; SKYLAKE-NEXT: divl (%rax) # sched: [29:10.00]
+; SKYLAKE-NEXT: divq %rcx # sched: [76:8.00]
+; SKYLAKE-NEXT: divq (%r10) # sched: [29:10.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_div:
+; SKX: # %bb.0:
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: divb %dil # sched: [25:10.00]
+; SKX-NEXT: divb (%r8) # sched: [29:10.00]
+; SKX-NEXT: divw %si # sched: [76:8.00]
+; SKX-NEXT: divw (%r9) # sched: [29:10.00]
+; SKX-NEXT: divl %edx # sched: [76:8.00]
+; SKX-NEXT: divl (%rax) # sched: [29:10.00]
+; SKX-NEXT: divq %rcx # sched: [76:8.00]
+; SKX-NEXT: divq (%r10) # sched: [29:10.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_div:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: divb %dil # sched: [25:25.00]
+; BTVER2-NEXT: divb (%r8) # sched: [41:25.00]
+; BTVER2-NEXT: divw %si # sched: [25:25.00]
+; BTVER2-NEXT: divw (%r9) # sched: [41:25.00]
+; BTVER2-NEXT: divl %edx # sched: [25:25.00]
+; BTVER2-NEXT: divl (%rax) # sched: [41:25.00]
+; BTVER2-NEXT: divq %rcx # sched: [25:25.00]
+; BTVER2-NEXT: divq (%r10) # sched: [41:25.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_div:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [8:0.50]
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: divb %dil # sched: [15:1.00]
+; ZNVER1-NEXT: divb (%r8) # sched: [45:41.00]
+; ZNVER1-NEXT: divw %si # sched: [17:1.00]
+; ZNVER1-NEXT: divw (%r9) # sched: [45:41.00]
+; ZNVER1-NEXT: divl %edx # sched: [25:1.00]
+; ZNVER1-NEXT: divl (%rax) # sched: [45:41.00]
+; ZNVER1-NEXT: divq %rcx # sched: [41:1.00]
+; ZNVER1-NEXT: divq (%r10) # sched: [45:41.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "divb $0 \0A\09 divb $4 \0A\09 divw $1 \0A\09 divw $5 \0A\09 divl $2 \0A\09 divl $6 \0A\09 divq $3 \0A\09 divq $7", "r,r,r,r,*m,*m,*m,*m"(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) nounwind
+ ret void
+}
+
+define void @test_enter() optsize {
+; GENERIC-LABEL: test_enter:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: enter $7, $4095 # imm = 0xFFF
+; GENERIC-NEXT: # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_enter:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: enter $7, $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [32:16.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_enter:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: enter $7, $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_enter:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: enter $7, $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_enter:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: enter $7, $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_enter:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: enter $7, $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_enter:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: enter $7, $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_enter:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: enter $7, $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_enter:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: enter $7, $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_enter:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: enter $7, $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "enter $0, $1", "i,i"(i8 7, i16 4095) nounwind
+ ret void
+}
+
+define void @test_idiv(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) optsize {
+; GENERIC-LABEL: test_idiv:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: idivb %dil # sched: [25:10.00]
+; GENERIC-NEXT: idivb (%r8) # sched: [29:10.00]
+; GENERIC-NEXT: idivw %si # sched: [25:10.00]
+; GENERIC-NEXT: idivw (%r9) # sched: [29:10.00]
+; GENERIC-NEXT: idivl %edx # sched: [25:10.00]
+; GENERIC-NEXT: idivl (%rax) # sched: [29:10.00]
+; GENERIC-NEXT: idivq %rcx # sched: [25:10.00]
+; GENERIC-NEXT: idivq (%r10) # sched: [29:10.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_idiv:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [1:1.00]
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: idivb %dil # sched: [62:31.00]
+; ATOM-NEXT: idivb (%r8) # sched: [62:31.00]
+; ATOM-NEXT: idivw %si # sched: [62:31.00]
+; ATOM-NEXT: idivw (%r9) # sched: [62:31.00]
+; ATOM-NEXT: idivl %edx # sched: [62:31.00]
+; ATOM-NEXT: idivl (%rax) # sched: [62:31.00]
+; ATOM-NEXT: idivq %rcx # sched: [130:65.00]
+; ATOM-NEXT: idivq (%r10) # sched: [130:65.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_idiv:
+; SLM: # %bb.0:
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [3:1.00]
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: idivb %dil # sched: [25:25.00]
+; SLM-NEXT: idivb (%r8) # sched: [29:25.00]
+; SLM-NEXT: idivw %si # sched: [25:25.00]
+; SLM-NEXT: idivw (%r9) # sched: [29:25.00]
+; SLM-NEXT: idivl %edx # sched: [25:25.00]
+; SLM-NEXT: idivl (%rax) # sched: [29:25.00]
+; SLM-NEXT: idivq %rcx # sched: [25:25.00]
+; SLM-NEXT: idivq (%r10) # sched: [29:25.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_idiv:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: idivb %dil # sched: [25:10.00]
+; SANDY-NEXT: idivb (%r8) # sched: [29:10.00]
+; SANDY-NEXT: idivw %si # sched: [25:10.00]
+; SANDY-NEXT: idivw (%r9) # sched: [29:10.00]
+; SANDY-NEXT: idivl %edx # sched: [25:10.00]
+; SANDY-NEXT: idivl (%rax) # sched: [29:10.00]
+; SANDY-NEXT: idivq %rcx # sched: [25:10.00]
+; SANDY-NEXT: idivq (%r10) # sched: [29:10.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_idiv:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: idivb %dil # sched: [23:1.00]
+; HASWELL-NEXT: idivb (%r8) # sched: [29:10.00]
+; HASWELL-NEXT: idivw %si # sched: [112:16.50]
+; HASWELL-NEXT: idivw (%r9) # sched: [29:10.00]
+; HASWELL-NEXT: idivl %edx # sched: [112:16.50]
+; HASWELL-NEXT: idivl (%rax) # sched: [29:10.00]
+; HASWELL-NEXT: idivq %rcx # sched: [112:16.50]
+; HASWELL-NEXT: idivq (%r10) # sched: [29:10.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_idiv:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: idivb %dil # sched: [25:10.00]
+; BROADWELL-NEXT: idivb (%r8) # sched: [35:2.00]
+; BROADWELL-NEXT: idivw %si # sched: [25:10.00]
+; BROADWELL-NEXT: idivw (%r9) # sched: [35:2.00]
+; BROADWELL-NEXT: idivl %edx # sched: [25:10.00]
+; BROADWELL-NEXT: idivl (%rax) # sched: [35:2.00]
+; BROADWELL-NEXT: idivq %rcx # sched: [25:10.00]
+; BROADWELL-NEXT: idivq (%r10) # sched: [35:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_idiv:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: idivb %dil # sched: [25:10.00]
+; SKYLAKE-NEXT: idivb (%r8) # sched: [28:4.00]
+; SKYLAKE-NEXT: idivw %si # sched: [102:16.50]
+; SKYLAKE-NEXT: idivw (%r9) # sched: [28:4.00]
+; SKYLAKE-NEXT: idivl %edx # sched: [102:16.50]
+; SKYLAKE-NEXT: idivl (%rax) # sched: [28:4.00]
+; SKYLAKE-NEXT: idivq %rcx # sched: [102:16.50]
+; SKYLAKE-NEXT: idivq (%r10) # sched: [28:4.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_idiv:
+; SKX: # %bb.0:
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: idivb %dil # sched: [25:10.00]
+; SKX-NEXT: idivb (%r8) # sched: [28:4.00]
+; SKX-NEXT: idivw %si # sched: [102:16.50]
+; SKX-NEXT: idivw (%r9) # sched: [28:4.00]
+; SKX-NEXT: idivl %edx # sched: [102:16.50]
+; SKX-NEXT: idivl (%rax) # sched: [28:4.00]
+; SKX-NEXT: idivq %rcx # sched: [102:16.50]
+; SKX-NEXT: idivq (%r10) # sched: [28:4.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_idiv:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: idivb %dil # sched: [25:25.00]
+; BTVER2-NEXT: idivb (%r8) # sched: [41:25.00]
+; BTVER2-NEXT: idivw %si # sched: [25:25.00]
+; BTVER2-NEXT: idivw (%r9) # sched: [41:25.00]
+; BTVER2-NEXT: idivl %edx # sched: [25:25.00]
+; BTVER2-NEXT: idivl (%rax) # sched: [41:25.00]
+; BTVER2-NEXT: idivq %rcx # sched: [25:25.00]
+; BTVER2-NEXT: idivq (%r10) # sched: [41:25.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_idiv:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [8:0.50]
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: idivb %dil # sched: [15:1.00]
+; ZNVER1-NEXT: idivb (%r8) # sched: [45:41.00]
+; ZNVER1-NEXT: idivw %si # sched: [17:1.00]
+; ZNVER1-NEXT: idivw (%r9) # sched: [45:41.00]
+; ZNVER1-NEXT: idivl %edx # sched: [25:1.00]
+; ZNVER1-NEXT: idivl (%rax) # sched: [45:41.00]
+; ZNVER1-NEXT: idivq %rcx # sched: [41:1.00]
+; ZNVER1-NEXT: idivq (%r10) # sched: [45:41.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "idivb $0 \0A\09 idivb $4 \0A\09 idivw $1 \0A\09 idivw $5 \0A\09 idivl $2 \0A\09 idivl $6 \0A\09 idivq $3 \0A\09 idivq $7", "r,r,r,r,*m,*m,*m,*m"(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) nounwind
+ ret void
+}
+
+define void @test_imul_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_imul_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: imulb %dil # sched: [3:1.00]
+; GENERIC-NEXT: imulb (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_imul_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: imulb %dil # sched: [7:3.50]
+; ATOM-NEXT: imulb (%rsi) # sched: [7:3.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_imul_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: imulb %dil # sched: [3:1.00]
+; SLM-NEXT: imulb (%rsi) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_imul_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: imulb %dil # sched: [3:1.00]
+; SANDY-NEXT: imulb (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_imul_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: imulb %dil # sched: [3:1.00]
+; HASWELL-NEXT: imulb (%rsi) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_imul_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: imulb %dil # sched: [3:1.00]
+; BROADWELL-NEXT: imulb (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_imul_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: imulb %dil # sched: [3:1.00]
+; SKYLAKE-NEXT: imulb (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_imul_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: imulb %dil # sched: [3:1.00]
+; SKX-NEXT: imulb (%rsi) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_imul_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: imulb %dil # sched: [3:1.00]
+; BTVER2-NEXT: imulb (%rsi) # sched: [6:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_imul_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: imulb %dil # sched: [4:1.00]
+; ZNVER1-NEXT: imulb (%rsi) # sched: [8:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "imulb $0 \0A\09 imulb $1", "r,*m"(i8 %a0, i8* %a1) nounwind
+ ret void
+}
+define void @test_imul_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_imul_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: imulw %di # sched: [3:1.00]
+; GENERIC-NEXT: imulw (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: imulw %di, %di # sched: [3:1.00]
+; GENERIC-NEXT: imulw (%rsi), %di # sched: [7:1.00]
+; GENERIC-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [3:1.00]
+; GENERIC-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; GENERIC-NEXT: imulw $7, (%rsi), %di # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_imul_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: imulw %di # sched: [6:3.00]
+; ATOM-NEXT: imulw (%rsi) # sched: [8:4.00]
+; ATOM-NEXT: imulw %di, %di # sched: [6:3.00]
+; ATOM-NEXT: imulw (%rsi), %di # sched: [7:3.50]
+; ATOM-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [6:3.00]
+; ATOM-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [7:3.50]
+; ATOM-NEXT: imulw $7, %di, %di # sched: [6:3.00]
+; ATOM-NEXT: imulw $7, (%rsi), %di # sched: [7:3.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_imul_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: imulw %di # sched: [3:1.00]
+; SLM-NEXT: imulw (%rsi) # sched: [6:1.00]
+; SLM-NEXT: imulw %di, %di # sched: [3:1.00]
+; SLM-NEXT: imulw (%rsi), %di # sched: [6:1.00]
+; SLM-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [3:1.00]
+; SLM-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; SLM-NEXT: # sched: [6:1.00]
+; SLM-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; SLM-NEXT: imulw $7, (%rsi), %di # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_imul_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: imulw %di # sched: [3:1.00]
+; SANDY-NEXT: imulw (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: imulw %di, %di # sched: [3:1.00]
+; SANDY-NEXT: imulw (%rsi), %di # sched: [7:1.00]
+; SANDY-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [3:1.00]
+; SANDY-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; SANDY-NEXT: imulw $7, (%rsi), %di # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_imul_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: imulw %di # sched: [4:1.00]
+; HASWELL-NEXT: imulw (%rsi) # sched: [8:1.00]
+; HASWELL-NEXT: imulw %di, %di # sched: [3:1.00]
+; HASWELL-NEXT: imulw (%rsi), %di # sched: [8:1.00]
+; HASWELL-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [3:1.00]
+; HASWELL-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [8:1.00]
+; HASWELL-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; HASWELL-NEXT: imulw $7, (%rsi), %di # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_imul_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: imulw %di # sched: [4:1.00]
+; BROADWELL-NEXT: imulw (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: imulw %di, %di # sched: [3:1.00]
+; BROADWELL-NEXT: imulw (%rsi), %di # sched: [4:1.00]
+; BROADWELL-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [3:1.00]
+; BROADWELL-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [4:1.00]
+; BROADWELL-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; BROADWELL-NEXT: imulw $7, (%rsi), %di # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_imul_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: imulw %di # sched: [4:1.00]
+; SKYLAKE-NEXT: imulw (%rsi) # sched: [3:1.00]
+; SKYLAKE-NEXT: imulw %di, %di # sched: [3:1.00]
+; SKYLAKE-NEXT: imulw (%rsi), %di # sched: [4:1.00]
+; SKYLAKE-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [3:1.00]
+; SKYLAKE-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [4:1.00]
+; SKYLAKE-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; SKYLAKE-NEXT: imulw $7, (%rsi), %di # sched: [4:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_imul_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: imulw %di # sched: [4:1.00]
+; SKX-NEXT: imulw (%rsi) # sched: [8:1.00]
+; SKX-NEXT: imulw %di, %di # sched: [3:1.00]
+; SKX-NEXT: imulw (%rsi), %di # sched: [4:1.00]
+; SKX-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [3:1.00]
+; SKX-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; SKX-NEXT: # sched: [4:1.00]
+; SKX-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; SKX-NEXT: imulw $7, (%rsi), %di # sched: [4:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_imul_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: imulw %di # sched: [3:1.00]
+; BTVER2-NEXT: imulw (%rsi) # sched: [6:1.00]
+; BTVER2-NEXT: imulw %di, %di # sched: [3:1.00]
+; BTVER2-NEXT: imulw (%rsi), %di # sched: [6:1.00]
+; BTVER2-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [3:1.00]
+; BTVER2-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [6:1.00]
+; BTVER2-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; BTVER2-NEXT: imulw $7, (%rsi), %di # sched: [6:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_imul_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: imulw %di # sched: [3:1.00]
+; ZNVER1-NEXT: imulw (%rsi) # sched: [8:1.00]
+; ZNVER1-NEXT: imulw %di, %di # sched: [3:1.00]
+; ZNVER1-NEXT: imulw (%rsi), %di # sched: [3:1.00]
+; ZNVER1-NEXT: imulw $511, %di, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [3:1.00]
+; ZNVER1-NEXT: imulw $511, (%rsi), %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [3:1.00]
+; ZNVER1-NEXT: imulw $7, %di, %di # sched: [3:1.00]
+; ZNVER1-NEXT: imulw $7, (%rsi), %di # sched: [3:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "imulw $0 \0A\09 imulw $1 \0A\09 imulw $0, $0 \0A\09 imulw $1, $0 \0A\09 imulw $2, $0, $0 \0A\09 imulw $2, $1, $0 \0A\09 imulw $3, $0, $0 \0A\09 imulw $3, $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_imul_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_imul_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: imull %edi # sched: [3:1.00]
+; GENERIC-NEXT: imull (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: imull %edi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: imull (%rsi), %edi # sched: [7:1.00]
+; GENERIC-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [3:1.00]
+; GENERIC-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: imull $7, (%rsi), %edi # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_imul_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: imull %edi # sched: [5:5.00]
+; ATOM-NEXT: imull (%rsi) # sched: [7:3.50]
+; ATOM-NEXT: imull %edi, %edi # sched: [5:5.00]
+; ATOM-NEXT: imull (%rsi), %edi # sched: [5:5.00]
+; ATOM-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [5:5.00]
+; ATOM-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [5:5.00]
+; ATOM-NEXT: imull $7, %edi, %edi # sched: [5:5.00]
+; ATOM-NEXT: imull $7, (%rsi), %edi # sched: [5:5.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_imul_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: imull %edi # sched: [3:1.00]
+; SLM-NEXT: imull (%rsi) # sched: [6:1.00]
+; SLM-NEXT: imull %edi, %edi # sched: [3:1.00]
+; SLM-NEXT: imull (%rsi), %edi # sched: [6:1.00]
+; SLM-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [3:1.00]
+; SLM-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [6:1.00]
+; SLM-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; SLM-NEXT: imull $7, (%rsi), %edi # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_imul_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: imull %edi # sched: [3:1.00]
+; SANDY-NEXT: imull (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: imull %edi, %edi # sched: [3:1.00]
+; SANDY-NEXT: imull (%rsi), %edi # sched: [7:1.00]
+; SANDY-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [3:1.00]
+; SANDY-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; SANDY-NEXT: imull $7, (%rsi), %edi # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_imul_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: imull %edi # sched: [4:1.00]
+; HASWELL-NEXT: imull (%rsi) # sched: [8:1.00]
+; HASWELL-NEXT: imull %edi, %edi # sched: [3:1.00]
+; HASWELL-NEXT: imull (%rsi), %edi # sched: [8:1.00]
+; HASWELL-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [3:1.00]
+; HASWELL-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [8:1.00]
+; HASWELL-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; HASWELL-NEXT: imull $7, (%rsi), %edi # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_imul_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: imull %edi # sched: [4:1.00]
+; BROADWELL-NEXT: imull (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: imull %edi, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: imull (%rsi), %edi # sched: [4:1.00]
+; BROADWELL-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [3:1.00]
+; BROADWELL-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [4:1.00]
+; BROADWELL-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: imull $7, (%rsi), %edi # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_imul_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: imull %edi # sched: [5:1.00]
+; SKYLAKE-NEXT: imull (%rsi) # sched: [3:1.00]
+; SKYLAKE-NEXT: imull %edi, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: imull (%rsi), %edi # sched: [5:1.00]
+; SKYLAKE-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [3:1.00]
+; SKYLAKE-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [5:1.00]
+; SKYLAKE-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: imull $7, (%rsi), %edi # sched: [5:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_imul_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: imull %edi # sched: [4:1.00]
+; SKX-NEXT: imull (%rsi) # sched: [8:1.00]
+; SKX-NEXT: imull %edi, %edi # sched: [3:1.00]
+; SKX-NEXT: imull (%rsi), %edi # sched: [4:1.00]
+; SKX-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [3:1.00]
+; SKX-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [4:1.00]
+; SKX-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; SKX-NEXT: imull $7, (%rsi), %edi # sched: [4:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_imul_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: imull %edi # sched: [3:1.00]
+; BTVER2-NEXT: imull (%rsi) # sched: [6:1.00]
+; BTVER2-NEXT: imull %edi, %edi # sched: [3:1.00]
+; BTVER2-NEXT: imull (%rsi), %edi # sched: [6:1.00]
+; BTVER2-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [3:1.00]
+; BTVER2-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [6:1.00]
+; BTVER2-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; BTVER2-NEXT: imull $7, (%rsi), %edi # sched: [6:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_imul_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: imull %edi # sched: [3:1.00]
+; ZNVER1-NEXT: imull (%rsi) # sched: [8:1.00]
+; ZNVER1-NEXT: imull %edi, %edi # sched: [3:1.00]
+; ZNVER1-NEXT: imull (%rsi), %edi # sched: [3:1.00]
+; ZNVER1-NEXT: imull $665536, %edi, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [3:1.00]
+; ZNVER1-NEXT: imull $665536, (%rsi), %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [3:1.00]
+; ZNVER1-NEXT: imull $7, %edi, %edi # sched: [3:1.00]
+; ZNVER1-NEXT: imull $7, (%rsi), %edi # sched: [3:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "imull $0 \0A\09 imull $1 \0A\09 imull $0, $0 \0A\09 imull $1, $0 \0A\09 imull $2, $0, $0 \0A\09 imull $2, $1, $0 \0A\09 imull $3, $0, $0 \0A\09 imull $3, $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_imul_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_imul_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: imulq %rdi # sched: [3:1.00]
+; GENERIC-NEXT: imulq (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: imulq (%rsi), %rdi # sched: [7:1.00]
+; GENERIC-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [3:1.00]
+; GENERIC-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: imulq $7, (%rsi), %rdi # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_imul_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: imulq %rdi # sched: [12:6.00]
+; ATOM-NEXT: imulq (%rsi) # sched: [12:6.00]
+; ATOM-NEXT: imulq %rdi, %rdi # sched: [12:6.00]
+; ATOM-NEXT: imulq (%rsi), %rdi # sched: [12:6.00]
+; ATOM-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [14:7.00]
+; ATOM-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [14:7.00]
+; ATOM-NEXT: imulq $7, %rdi, %rdi # sched: [14:7.00]
+; ATOM-NEXT: imulq $7, (%rsi), %rdi # sched: [14:7.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_imul_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: imulq %rdi # sched: [3:1.00]
+; SLM-NEXT: imulq (%rsi) # sched: [6:1.00]
+; SLM-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; SLM-NEXT: imulq (%rsi), %rdi # sched: [6:1.00]
+; SLM-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [3:1.00]
+; SLM-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [6:1.00]
+; SLM-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; SLM-NEXT: imulq $7, (%rsi), %rdi # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_imul_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: imulq %rdi # sched: [3:1.00]
+; SANDY-NEXT: imulq (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: imulq (%rsi), %rdi # sched: [7:1.00]
+; SANDY-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [3:1.00]
+; SANDY-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; SANDY-NEXT: imulq $7, (%rsi), %rdi # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_imul_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: imulq %rdi # sched: [4:1.00]
+; HASWELL-NEXT: imulq (%rsi) # sched: [8:1.00]
+; HASWELL-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: imulq (%rsi), %rdi # sched: [8:1.00]
+; HASWELL-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [3:1.00]
+; HASWELL-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [8:1.00]
+; HASWELL-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: imulq $7, (%rsi), %rdi # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_imul_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: imulq %rdi # sched: [4:1.00]
+; BROADWELL-NEXT: imulq (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: imulq (%rsi), %rdi # sched: [4:1.00]
+; BROADWELL-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [3:1.00]
+; BROADWELL-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [4:1.00]
+; BROADWELL-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: imulq $7, (%rsi), %rdi # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_imul_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: imulq %rdi # sched: [4:1.00]
+; SKYLAKE-NEXT: imulq (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: imulq (%rsi), %rdi # sched: [4:1.00]
+; SKYLAKE-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [3:1.00]
+; SKYLAKE-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [4:1.00]
+; SKYLAKE-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: imulq $7, (%rsi), %rdi # sched: [4:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_imul_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: imulq %rdi # sched: [4:1.00]
+; SKX-NEXT: imulq (%rsi) # sched: [8:1.00]
+; SKX-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; SKX-NEXT: imulq (%rsi), %rdi # sched: [4:1.00]
+; SKX-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [3:1.00]
+; SKX-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [4:1.00]
+; SKX-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; SKX-NEXT: imulq $7, (%rsi), %rdi # sched: [4:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_imul_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: imulq %rdi # sched: [3:1.00]
+; BTVER2-NEXT: imulq (%rsi) # sched: [6:1.00]
+; BTVER2-NEXT: imulq %rdi, %rdi # sched: [3:1.00]
+; BTVER2-NEXT: imulq (%rsi), %rdi # sched: [6:1.00]
+; BTVER2-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [3:1.00]
+; BTVER2-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [6:1.00]
+; BTVER2-NEXT: imulq $7, %rdi, %rdi # sched: [3:1.00]
+; BTVER2-NEXT: imulq $7, (%rsi), %rdi # sched: [6:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_imul_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: imulq %rdi # sched: [4:1.00]
+; ZNVER1-NEXT: imulq (%rsi) # sched: [9:1.00]
+; ZNVER1-NEXT: imulq %rdi, %rdi # sched: [4:1.00]
+; ZNVER1-NEXT: imulq (%rsi), %rdi # sched: [4:1.00]
+; ZNVER1-NEXT: imulq $665536, %rdi, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [4:1.00]
+; ZNVER1-NEXT: imulq $665536, (%rsi), %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [4:1.00]
+; ZNVER1-NEXT: imulq $7, %rdi, %rdi # sched: [4:1.00]
+; ZNVER1-NEXT: imulq $7, (%rsi), %rdi # sched: [4:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "imulq $0 \0A\09 imulq $1 \0A\09 imulq $0, $0 \0A\09 imulq $1, $0 \0A\09 imulq $2, $0, $0 \0A\09 imulq $2, $1, $0 \0A\09 imulq $3, $0, $0 \0A\09 imulq $3, $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
+define void @test_in() optsize {
+; GENERIC-LABEL: test_in:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: inb $7, %al # sched: [100:0.33]
+; GENERIC-NEXT: inw $7, %ax # sched: [100:0.33]
+; GENERIC-NEXT: inl $7, %eax # sched: [100:0.33]
+; GENERIC-NEXT: inb %dx, %al # sched: [100:0.33]
+; GENERIC-NEXT: inw %dx, %ax # sched: [100:0.33]
+; GENERIC-NEXT: inl %dx, %eax # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_in:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: inb $7, %al # sched: [92:46.00]
+; ATOM-NEXT: inw $7, %ax # sched: [92:46.00]
+; ATOM-NEXT: inl $7, %eax # sched: [92:46.00]
+; ATOM-NEXT: inb %dx, %al # sched: [94:47.00]
+; ATOM-NEXT: inw %dx, %ax # sched: [94:47.00]
+; ATOM-NEXT: inl %dx, %eax # sched: [94:47.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_in:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: inb $7, %al # sched: [100:1.00]
+; SLM-NEXT: inw $7, %ax # sched: [100:1.00]
+; SLM-NEXT: inl $7, %eax # sched: [100:1.00]
+; SLM-NEXT: inb %dx, %al # sched: [100:1.00]
+; SLM-NEXT: inw %dx, %ax # sched: [100:1.00]
+; SLM-NEXT: inl %dx, %eax # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_in:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: inb $7, %al # sched: [100:0.33]
+; SANDY-NEXT: inw $7, %ax # sched: [100:0.33]
+; SANDY-NEXT: inl $7, %eax # sched: [100:0.33]
+; SANDY-NEXT: inb %dx, %al # sched: [100:0.33]
+; SANDY-NEXT: inw %dx, %ax # sched: [100:0.33]
+; SANDY-NEXT: inl %dx, %eax # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_in:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: inb $7, %al # sched: [35:5.00]
+; HASWELL-NEXT: inw $7, %ax # sched: [35:5.00]
+; HASWELL-NEXT: inl $7, %eax # sched: [35:5.00]
+; HASWELL-NEXT: inb %dx, %al # sched: [35:5.00]
+; HASWELL-NEXT: inw %dx, %ax # sched: [35:5.00]
+; HASWELL-NEXT: inl %dx, %eax # sched: [35:5.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_in:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: inb $7, %al # sched: [34:5.00]
+; BROADWELL-NEXT: inw $7, %ax # sched: [34:5.00]
+; BROADWELL-NEXT: inl $7, %eax # sched: [34:5.00]
+; BROADWELL-NEXT: inb %dx, %al # sched: [34:5.00]
+; BROADWELL-NEXT: inw %dx, %ax # sched: [34:5.00]
+; BROADWELL-NEXT: inl %dx, %eax # sched: [34:5.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_in:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: inb $7, %al # sched: [35:5.00]
+; SKYLAKE-NEXT: inw $7, %ax # sched: [35:5.00]
+; SKYLAKE-NEXT: inl $7, %eax # sched: [35:5.00]
+; SKYLAKE-NEXT: inb %dx, %al # sched: [35:5.00]
+; SKYLAKE-NEXT: inw %dx, %ax # sched: [35:5.00]
+; SKYLAKE-NEXT: inl %dx, %eax # sched: [35:5.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_in:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: inb $7, %al # sched: [35:5.00]
+; SKX-NEXT: inw $7, %ax # sched: [35:5.00]
+; SKX-NEXT: inl $7, %eax # sched: [35:5.00]
+; SKX-NEXT: inb %dx, %al # sched: [35:5.00]
+; SKX-NEXT: inw %dx, %ax # sched: [35:5.00]
+; SKX-NEXT: inl %dx, %eax # sched: [35:5.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_in:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: inb $7, %al # sched: [100:0.17]
+; BTVER2-NEXT: inw $7, %ax # sched: [100:0.17]
+; BTVER2-NEXT: inl $7, %eax # sched: [100:0.17]
+; BTVER2-NEXT: inb %dx, %al # sched: [100:0.17]
+; BTVER2-NEXT: inw %dx, %ax # sched: [100:0.17]
+; BTVER2-NEXT: inl %dx, %eax # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_in:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: inb $7, %al # sched: [100:?]
+; ZNVER1-NEXT: inw $7, %ax # sched: [100:?]
+; ZNVER1-NEXT: inl $7, %eax # sched: [100:?]
+; ZNVER1-NEXT: inb %dx, %al # sched: [100:?]
+; ZNVER1-NEXT: inw %dx, %ax # sched: [100:?]
+; ZNVER1-NEXT: inl %dx, %eax # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "inb $0, %AL \0A\09 inw $0, %AX \0A\09 inl $0, %EAX \0A\09 inb %DX, %AL \0A\09 inw %DX, %AX \0A\09 inl %DX, %EAX", "i"(i8 7) nounwind
+ ret void
+}
+
+define void @test_inc8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_inc8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: incb %dil # sched: [1:0.33]
+; GENERIC-NEXT: incb (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_inc8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: incb %dil # sched: [1:0.50]
+; ATOM-NEXT: incb (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_inc8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: incb %dil # sched: [1:0.50]
+; SLM-NEXT: incb (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_inc8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: incb %dil # sched: [1:0.33]
+; SANDY-NEXT: incb (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_inc8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: incb %dil # sched: [1:0.25]
+; HASWELL-NEXT: incb (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inc8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: incb %dil # sched: [1:0.25]
+; BROADWELL-NEXT: incb (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_inc8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: incb %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: incb (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_inc8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: incb %dil # sched: [1:0.25]
+; SKX-NEXT: incb (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_inc8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: incb %dil # sched: [1:0.50]
+; BTVER2-NEXT: incb (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_inc8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: incb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: incb (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "incb $0 \0A\09 incb $1", "r,*m"(i8 %a0, i8* %a1) nounwind
+ ret void
+}
+define void @test_inc16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_inc16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: incw %di # sched: [1:0.33]
+; GENERIC-NEXT: incw (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_inc16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: incw %di # sched: [1:0.50]
+; ATOM-NEXT: incw (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_inc16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: incw %di # sched: [1:0.50]
+; SLM-NEXT: incw (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_inc16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: incw %di # sched: [1:0.33]
+; SANDY-NEXT: incw (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_inc16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: incw %di # sched: [1:0.25]
+; HASWELL-NEXT: incw (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inc16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: incw %di # sched: [1:0.25]
+; BROADWELL-NEXT: incw (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_inc16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: incw %di # sched: [1:0.25]
+; SKYLAKE-NEXT: incw (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_inc16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: incw %di # sched: [1:0.25]
+; SKX-NEXT: incw (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_inc16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: incw %di # sched: [1:0.50]
+; BTVER2-NEXT: incw (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_inc16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: incw %di # sched: [1:0.25]
+; ZNVER1-NEXT: incw (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "incw $0 \0A\09 incw $1", "r,*m"(i16 %a0, i16* %a1) nounwind
+ ret void
+}
+define void @test_inc32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_inc32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: incl %edi # sched: [1:0.33]
+; GENERIC-NEXT: incl (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_inc32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: incl %edi # sched: [1:0.50]
+; ATOM-NEXT: incl (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_inc32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: incl %edi # sched: [1:0.50]
+; SLM-NEXT: incl (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_inc32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: incl %edi # sched: [1:0.33]
+; SANDY-NEXT: incl (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_inc32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: incl %edi # sched: [1:0.25]
+; HASWELL-NEXT: incl (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inc32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: incl %edi # sched: [1:0.25]
+; BROADWELL-NEXT: incl (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_inc32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: incl %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: incl (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_inc32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: incl %edi # sched: [1:0.25]
+; SKX-NEXT: incl (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_inc32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: incl %edi # sched: [1:0.50]
+; BTVER2-NEXT: incl (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_inc32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: incl %edi # sched: [1:0.25]
+; ZNVER1-NEXT: incl (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "incl $0 \0A\09 incl $1", "r,*m"(i32 %a0, i32* %a1) nounwind
+ ret void
+}
+define void @test_inc64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_inc64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: incq %rdi # sched: [1:0.33]
+; GENERIC-NEXT: incq (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_inc64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: incq %rdi # sched: [1:0.50]
+; ATOM-NEXT: incq (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_inc64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: incq %rdi # sched: [1:0.50]
+; SLM-NEXT: incq (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_inc64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: incq %rdi # sched: [1:0.33]
+; SANDY-NEXT: incq (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_inc64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: incq %rdi # sched: [1:0.25]
+; HASWELL-NEXT: incq (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_inc64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: incq %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: incq (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_inc64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: incq %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: incq (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_inc64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: incq %rdi # sched: [1:0.25]
+; SKX-NEXT: incq (%rsi) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_inc64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: incq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: incq (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_inc64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: incq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: incq (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "incq $0 \0A\09 incq $1", "r,*m"(i64 %a0, i64* %a1) nounwind
+ ret void
+}
+
+define void @test_ins() optsize {
+; GENERIC-LABEL: test_ins:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: insb %dx, %es:(%rdi) # sched: [100:0.33]
+; GENERIC-NEXT: insw %dx, %es:(%rdi) # sched: [100:0.33]
+; GENERIC-NEXT: insl %dx, %es:(%rdi) # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_ins:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: insb %dx, %es:(%rdi) # sched: [59:29.50]
+; ATOM-NEXT: insw %dx, %es:(%rdi) # sched: [59:29.50]
+; ATOM-NEXT: insl %dx, %es:(%rdi) # sched: [59:29.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_ins:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: insb %dx, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: insw %dx, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: insl %dx, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ins:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: insb %dx, %es:(%rdi) # sched: [100:0.33]
+; SANDY-NEXT: insw %dx, %es:(%rdi) # sched: [100:0.33]
+; SANDY-NEXT: insl %dx, %es:(%rdi) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_ins:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: insb %dx, %es:(%rdi) # sched: [21:1.25]
+; HASWELL-NEXT: insw %dx, %es:(%rdi) # sched: [21:1.25]
+; HASWELL-NEXT: insl %dx, %es:(%rdi) # sched: [21:1.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ins:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: insb %dx, %es:(%rdi) # sched: [20:1.25]
+; BROADWELL-NEXT: insw %dx, %es:(%rdi) # sched: [20:1.25]
+; BROADWELL-NEXT: insl %dx, %es:(%rdi) # sched: [20:1.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ins:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: insb %dx, %es:(%rdi) # sched: [20:1.25]
+; SKYLAKE-NEXT: insw %dx, %es:(%rdi) # sched: [20:1.25]
+; SKYLAKE-NEXT: insl %dx, %es:(%rdi) # sched: [20:1.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ins:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: insb %dx, %es:(%rdi) # sched: [20:1.25]
+; SKX-NEXT: insw %dx, %es:(%rdi) # sched: [20:1.25]
+; SKX-NEXT: insl %dx, %es:(%rdi) # sched: [20:1.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_ins:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: insb %dx, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: insw %dx, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: insl %dx, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ins:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: insb %dx, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: insw %dx, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: insl %dx, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "insb \0A\09 insw \0A\09 insl", ""()
+ ret void
+}
+
+define void @test_int() optsize {
+; GENERIC-LABEL: test_int:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: int $7 # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_int:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: int $7 # sched: [127:63.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_int:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: int $7 # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_int:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: int $7 # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_int:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: int $7 # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_int:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: int $7 # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_int:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: int $7 # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_int:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: int $7 # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_int:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: int $7 # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_int:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: int $7 # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "int $0", "i"(i8 7)
+ ret void
+}
+
+define void @test_invlpg_invlpga(i8 *%a0) optsize {
+; GENERIC-LABEL: test_invlpg_invlpga:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: invlpg (%rdi) # sched: [100:0.33]
+; GENERIC-NEXT: invlpga %ecx, %rax # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_invlpg_invlpga:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: invlpg (%rdi) # sched: [71:35.50]
+; ATOM-NEXT: invlpga %ecx, %rax # sched: [71:35.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_invlpg_invlpga:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: invlpg (%rdi) # sched: [100:1.00]
+; SLM-NEXT: invlpga %ecx, %rax # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_invlpg_invlpga:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: invlpg (%rdi) # sched: [100:0.33]
+; SANDY-NEXT: invlpga %ecx, %rax # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_invlpg_invlpga:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: invlpg (%rdi) # sched: [100:0.25]
+; HASWELL-NEXT: invlpga %ecx, %rax # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_invlpg_invlpga:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: invlpg (%rdi) # sched: [100:0.25]
+; BROADWELL-NEXT: invlpga %ecx, %rax # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_invlpg_invlpga:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: invlpg (%rdi) # sched: [100:0.25]
+; SKYLAKE-NEXT: invlpga %ecx, %rax # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_invlpg_invlpga:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: invlpg (%rdi) # sched: [100:0.25]
+; SKX-NEXT: invlpga %ecx, %rax # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_invlpg_invlpga:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: invlpg (%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: invlpga %ecx, %rax # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_invlpg_invlpga:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: invlpg (%rdi) # sched: [100:?]
+; ZNVER1-NEXT: invlpga %ecx, %rax # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm sideeffect "invlpg $0 \0A\09 invlpga %ecx, %rax", "*m"(i8 *%a0) nounwind
+ ret void
+}
+
+define void @test_jcc() optsize {
+; GENERIC-LABEL: test_jcc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: JCCTGT:
+; GENERIC-NEXT: jo JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jno JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jb JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jb JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jb JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jae JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jae JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jae JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: je JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: je JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jne JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jne JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jbe JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jbe JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: ja JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: ja JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: js JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jns JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jp JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jp JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jnp JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jnp JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jl JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jl JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jge JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jge JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jle JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jle JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jg JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: jg JCCTGT # sched: [1:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_jcc:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: JCCTGT:
+; ATOM-NEXT: jo JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jno JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jb JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jb JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jb JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jae JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jae JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jae JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: je JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: je JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jne JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jne JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jbe JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jbe JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: ja JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: ja JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: js JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jns JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jp JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jp JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jnp JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jnp JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jl JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jl JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jge JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jge JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jle JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jle JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jg JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: jg JCCTGT # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_jcc:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: JCCTGT:
+; SLM-NEXT: jo JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jno JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jb JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jb JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jb JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jae JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jae JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jae JCCTGT # sched: [1:1.00]
+; SLM-NEXT: je JCCTGT # sched: [1:1.00]
+; SLM-NEXT: je JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jne JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jne JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jbe JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jbe JCCTGT # sched: [1:1.00]
+; SLM-NEXT: ja JCCTGT # sched: [1:1.00]
+; SLM-NEXT: ja JCCTGT # sched: [1:1.00]
+; SLM-NEXT: js JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jns JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jp JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jp JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jnp JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jnp JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jl JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jl JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jge JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jge JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jle JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jle JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jg JCCTGT # sched: [1:1.00]
+; SLM-NEXT: jg JCCTGT # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_jcc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: JCCTGT:
+; SANDY-NEXT: jo JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jno JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jb JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jb JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jb JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jae JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jae JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jae JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: je JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: je JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jne JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jne JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jbe JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jbe JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: ja JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: ja JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: js JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jns JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jp JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jp JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jnp JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jnp JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jl JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jl JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jge JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jge JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jle JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jle JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jg JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: jg JCCTGT # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_jcc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: JCCTGT:
+; HASWELL-NEXT: jo JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jno JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jb JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jb JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jb JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jae JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jae JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jae JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: je JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: je JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jne JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jne JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jbe JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jbe JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: ja JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: ja JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: js JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jns JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jp JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jp JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jnp JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jnp JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jl JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jl JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jge JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jge JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jle JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jle JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jg JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: jg JCCTGT # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_jcc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: JCCTGT:
+; BROADWELL-NEXT: jo JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jno JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jb JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jb JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jb JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jae JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jae JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jae JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: je JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: je JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jne JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jne JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jbe JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jbe JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: ja JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: ja JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: js JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jns JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jp JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jp JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jnp JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jnp JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jl JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jl JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jge JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jge JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jle JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jle JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jg JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jg JCCTGT # sched: [1:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_jcc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: JCCTGT:
+; SKYLAKE-NEXT: jo JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jno JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jb JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jb JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jb JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jae JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jae JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jae JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: je JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: je JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jne JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jne JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jbe JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jbe JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: ja JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: ja JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: js JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jns JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jp JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jp JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jnp JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jnp JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jl JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jl JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jge JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jge JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jle JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jle JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jg JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jg JCCTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_jcc:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: JCCTGT:
+; SKX-NEXT: jo JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jno JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jb JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jb JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jb JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jae JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jae JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jae JCCTGT # sched: [1:0.50]
+; SKX-NEXT: je JCCTGT # sched: [1:0.50]
+; SKX-NEXT: je JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jne JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jne JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jbe JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jbe JCCTGT # sched: [1:0.50]
+; SKX-NEXT: ja JCCTGT # sched: [1:0.50]
+; SKX-NEXT: ja JCCTGT # sched: [1:0.50]
+; SKX-NEXT: js JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jns JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jp JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jp JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jnp JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jnp JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jl JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jl JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jge JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jge JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jle JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jle JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jg JCCTGT # sched: [1:0.50]
+; SKX-NEXT: jg JCCTGT # sched: [1:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_jcc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: JCCTGT:
+; BTVER2-NEXT: jo JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jno JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jb JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jb JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jb JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jae JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jae JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jae JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: je JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: je JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jne JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jne JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jbe JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jbe JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: ja JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: ja JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: js JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jns JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jp JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jp JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jnp JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jnp JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jl JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jl JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jge JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jge JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jle JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jle JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jg JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: jg JCCTGT # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_jcc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: JCCTGT:
+; ZNVER1-NEXT: jo JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jno JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jb JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jb JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jb JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jae JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jae JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jae JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: je JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: je JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jne JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jne JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jbe JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jbe JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: ja JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: ja JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: js JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jns JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jp JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jp JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jnp JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jnp JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jl JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jl JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jge JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jge JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jle JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jle JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jg JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jg JCCTGT # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "JCCTGT: \0A\09 jo JCCTGT \0A\09 jno JCCTGT \0A\09 jb JCCTGT \0A\09 jc JCCTGT \0A\09 jnae JCCTGT \0A\09 jnb JCCTGT \0A\09 jnc JCCTGT \0A\09 jae JCCTGT \0A\09 jz JCCTGT \0A\09 je JCCTGT \0A\09 jnz JCCTGT \0A\09 jne JCCTGT \0A\09 jbe JCCTGT \0A\09 jna JCCTGT \0A\09 jnbe JCCTGT \0A\09 ja JCCTGT \0A\09 js JCCTGT \0A\09 jns JCCTGT \0A\09 jp JCCTGT \0A\09 jpe JCCTGT \0A\09 jnp JCCTGT \0A\09 jpo JCCTGT \0A\09 jl JCCTGT \0A\09 jnge JCCTGT \0A\09 jnl JCCTGT \0A\09 jge JCCTGT \0A\09 jle JCCTGT \0A\09 jng JCCTGT \0A\09 jnle JCCTGT \0A\09 jg JCCTGT", ""()
+ ret void
+}
+
+define void @test_jecxz_jrcxz() optsize {
+; GENERIC-LABEL: test_jecxz_jrcxz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: JXTGT:
+; GENERIC-NEXT: jecxz JXTGT # sched: [1:1.00]
+; GENERIC-NEXT: jrcxz JXTGT # sched: [2:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_jecxz_jrcxz:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: JXTGT:
+; ATOM-NEXT: jecxz JXTGT # sched: [4:2.00]
+; ATOM-NEXT: jrcxz JXTGT # sched: [4:2.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_jecxz_jrcxz:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: JXTGT:
+; SLM-NEXT: jecxz JXTGT # sched: [1:1.00]
+; SLM-NEXT: jrcxz JXTGT # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_jecxz_jrcxz:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: JXTGT:
+; SANDY-NEXT: jecxz JXTGT # sched: [1:1.00]
+; SANDY-NEXT: jrcxz JXTGT # sched: [2:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_jecxz_jrcxz:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: JXTGT:
+; HASWELL-NEXT: jecxz JXTGT # sched: [1:0.50]
+; HASWELL-NEXT: jrcxz JXTGT # sched: [2:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_jecxz_jrcxz:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: JXTGT:
+; BROADWELL-NEXT: jecxz JXTGT # sched: [1:0.50]
+; BROADWELL-NEXT: jrcxz JXTGT # sched: [2:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_jecxz_jrcxz:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: JXTGT:
+; SKYLAKE-NEXT: jecxz JXTGT # sched: [1:0.50]
+; SKYLAKE-NEXT: jrcxz JXTGT # sched: [2:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_jecxz_jrcxz:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: JXTGT:
+; SKX-NEXT: jecxz JXTGT # sched: [1:0.50]
+; SKX-NEXT: jrcxz JXTGT # sched: [2:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_jecxz_jrcxz:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: JXTGT:
+; BTVER2-NEXT: jecxz JXTGT # sched: [1:0.50]
+; BTVER2-NEXT: jrcxz JXTGT # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_jecxz_jrcxz:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: JXTGT:
+; ZNVER1-NEXT: jecxz JXTGT # sched: [1:0.25]
+; ZNVER1-NEXT: jrcxz JXTGT # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "JXTGT: \0A\09 jecxz JXTGT \0A\09 jrcxz JXTGT", ""()
+ ret void
+}
+
+; TODO - test_jmp
+
+define void @test_lahf_sahf() optsize {
+; GENERIC-LABEL: test_lahf_sahf:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: lahf # sched: [1:0.50]
+; GENERIC-NEXT: sahf # sched: [1:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lahf_sahf:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: lahf # sched: [1:0.50]
+; ATOM-NEXT: sahf # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lahf_sahf:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: lahf # sched: [1:0.50]
+; SLM-NEXT: sahf # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lahf_sahf:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: lahf # sched: [1:0.50]
+; SANDY-NEXT: sahf # sched: [1:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lahf_sahf:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: lahf # sched: [1:0.25]
+; HASWELL-NEXT: sahf # sched: [1:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lahf_sahf:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: lahf # sched: [1:0.25]
+; BROADWELL-NEXT: sahf # sched: [1:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lahf_sahf:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: lahf # sched: [1:0.25]
+; SKYLAKE-NEXT: sahf # sched: [1:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_lahf_sahf:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: lahf # sched: [1:0.25]
+; SKX-NEXT: sahf # sched: [1:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lahf_sahf:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: lahf # sched: [1:0.50]
+; BTVER2-NEXT: sahf # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lahf_sahf:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: lahf # sched: [100:?]
+; ZNVER1-NEXT: sahf # sched: [2:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "lahf \0A\09 sahf", ""() nounwind
+ ret void
+}
+
+; TODO - test_lds
+; TODO - test_les
+; TODO - test_lfs
+; TODO - test_lgs
+; TODO - test_lss
+
+; TODO - test_lea
+
+define void @test_leave() optsize {
+; GENERIC-LABEL: test_leave:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: leave # sched: [3:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_leave:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: leave # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_leave:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: leave # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_leave:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: leave # sched: [3:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_leave:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: leave # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_leave:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: leave # sched: [7:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_leave:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: leave # sched: [7:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_leave:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: leave # sched: [7:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_leave:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: leave # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_leave:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: leave # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "leave", ""() nounwind
+ ret void
+}
+
+define void @test_lods() optsize {
+; GENERIC-LABEL: test_lods:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: lodsb (%rsi), %al # sched: [7:0.67]
+; GENERIC-NEXT: lodsw (%rsi), %ax # sched: [7:0.67]
+; GENERIC-NEXT: lodsl (%rsi), %eax # sched: [6:0.50]
+; GENERIC-NEXT: lodsq (%rsi), %rax # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_lods:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: lodsb (%rsi), %al # sched: [2:1.00]
+; ATOM-NEXT: lodsw (%rsi), %ax # sched: [2:1.00]
+; ATOM-NEXT: lodsl (%rsi), %eax # sched: [2:1.00]
+; ATOM-NEXT: lodsq (%rsi), %rax # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_lods:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: lodsb (%rsi), %al # sched: [100:1.00]
+; SLM-NEXT: lodsw (%rsi), %ax # sched: [100:1.00]
+; SLM-NEXT: lodsl (%rsi), %eax # sched: [100:1.00]
+; SLM-NEXT: lodsq (%rsi), %rax # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_lods:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: lodsb (%rsi), %al # sched: [7:0.67]
+; SANDY-NEXT: lodsw (%rsi), %ax # sched: [7:0.67]
+; SANDY-NEXT: lodsl (%rsi), %eax # sched: [6:0.50]
+; SANDY-NEXT: lodsq (%rsi), %rax # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_lods:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: lodsb (%rsi), %al # sched: [1:0.50]
+; HASWELL-NEXT: lodsw (%rsi), %ax # sched: [1:0.50]
+; HASWELL-NEXT: lodsl (%rsi), %eax # sched: [1:0.50]
+; HASWELL-NEXT: lodsq (%rsi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lods:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: lodsb (%rsi), %al # sched: [100:0.25]
+; BROADWELL-NEXT: lodsw (%rsi), %ax # sched: [100:0.25]
+; BROADWELL-NEXT: lodsl (%rsi), %eax # sched: [100:0.25]
+; BROADWELL-NEXT: lodsq (%rsi), %rax # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lods:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: lodsb (%rsi), %al # sched: [100:0.25]
+; SKYLAKE-NEXT: lodsw (%rsi), %ax # sched: [100:0.25]
+; SKYLAKE-NEXT: lodsl (%rsi), %eax # sched: [100:0.25]
+; SKYLAKE-NEXT: lodsq (%rsi), %rax # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_lods:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: lodsb (%rsi), %al # sched: [100:0.25]
+; SKX-NEXT: lodsw (%rsi), %ax # sched: [100:0.25]
+; SKX-NEXT: lodsl (%rsi), %eax # sched: [100:0.25]
+; SKX-NEXT: lodsq (%rsi), %rax # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_lods:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: lodsb (%rsi), %al # sched: [100:0.17]
+; BTVER2-NEXT: lodsw (%rsi), %ax # sched: [100:0.17]
+; BTVER2-NEXT: lodsl (%rsi), %eax # sched: [100:0.17]
+; BTVER2-NEXT: lodsq (%rsi), %rax # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_lods:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: lodsb (%rsi), %al # sched: [100:?]
+; ZNVER1-NEXT: lodsw (%rsi), %ax # sched: [100:?]
+; ZNVER1-NEXT: lodsl (%rsi), %eax # sched: [100:?]
+; ZNVER1-NEXT: lodsq (%rsi), %rax # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "lodsb \0A\09 lodsw \0A\09 lodsl \0A\09 lodsq", ""()
+ ret void
+}
+
+define void @test_loop() optsize {
+; GENERIC-LABEL: test_loop:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: LTGT:
+; GENERIC-NEXT: loop LTGT # sched: [1:1.00]
+; GENERIC-NEXT: loope LTGT # sched: [1:1.00]
+; GENERIC-NEXT: loopne LTGT # sched: [1:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_loop:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: LTGT:
+; ATOM-NEXT: loop LTGT # sched: [18:9.00]
+; ATOM-NEXT: loope LTGT # sched: [8:4.00]
+; ATOM-NEXT: loopne LTGT # sched: [17:8.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_loop:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: LTGT:
+; SLM-NEXT: loop LTGT # sched: [1:1.00]
+; SLM-NEXT: loope LTGT # sched: [1:1.00]
+; SLM-NEXT: loopne LTGT # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_loop:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: LTGT:
+; SANDY-NEXT: loop LTGT # sched: [1:1.00]
+; SANDY-NEXT: loope LTGT # sched: [1:1.00]
+; SANDY-NEXT: loopne LTGT # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_loop:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: LTGT:
+; HASWELL-NEXT: loop LTGT # sched: [7:2.00]
+; HASWELL-NEXT: loope LTGT # sched: [7:2.00]
+; HASWELL-NEXT: loopne LTGT # sched: [7:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_loop:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: LTGT:
+; BROADWELL-NEXT: loop LTGT # sched: [7:2.00]
+; BROADWELL-NEXT: loope LTGT # sched: [7:2.00]
+; BROADWELL-NEXT: loopne LTGT # sched: [7:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_loop:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: LTGT:
+; SKYLAKE-NEXT: loop LTGT # sched: [7:2.00]
+; SKYLAKE-NEXT: loope LTGT # sched: [7:2.00]
+; SKYLAKE-NEXT: loopne LTGT # sched: [7:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_loop:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: LTGT:
+; SKX-NEXT: loop LTGT # sched: [7:2.00]
+; SKX-NEXT: loope LTGT # sched: [7:2.00]
+; SKX-NEXT: loopne LTGT # sched: [7:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_loop:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: LTGT:
+; BTVER2-NEXT: loop LTGT # sched: [1:0.50]
+; BTVER2-NEXT: loope LTGT # sched: [1:0.50]
+; BTVER2-NEXT: loopne LTGT # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_loop:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: LTGT:
+; ZNVER1-NEXT: loop LTGT # sched: [1:0.50]
+; ZNVER1-NEXT: loope LTGT # sched: [1:0.50]
+; ZNVER1-NEXT: loopne LTGT # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "LTGT: \0A\09 loop LTGT \0A\09 loope LTGT \0A\09 loopne LTGT", ""()
+ ret void
+}
+
+; TODO - test_mov
+
+define void @test_movnti(i32 %a0, i32 *%a1, i64 %a2, i64 *%a3) optsize {
+; GENERIC-LABEL: test_movnti:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: movntil %edi, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: movntiq %rdx, (%rcx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movnti:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movnti:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; SLM-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movnti:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: movntil %edi, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: movntiq %rdx, (%rcx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movnti:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movnti:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movnti:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movnti:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movnti:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: movntil %edi, (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: movntiq %rdx, (%rcx) # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movnti:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: movntil %edi, (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: movntiq %rdx, (%rcx) # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "movnti $0, $1 \0A\09 movnti $2, $3", "r,*m,r,*m"(i32 %a0, i32 *%a1, i64 %a2, i64 *%a3)
+ ret void
+}
+
+define void @test_movs() optsize {
+; GENERIC-LABEL: test_movs:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: movsb (%rsi), %es:(%rdi) # sched: [8:1.00]
+; GENERIC-NEXT: movsw (%rsi), %es:(%rdi) # sched: [8:1.00]
+; GENERIC-NEXT: movsl (%rsi), %es:(%rdi) # sched: [8:1.00]
+; GENERIC-NEXT: movsq (%rsi), %es:(%rdi) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movs:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: movsb (%rsi), %es:(%rdi) # sched: [3:1.50]
+; ATOM-NEXT: movsw (%rsi), %es:(%rdi) # sched: [3:1.50]
+; ATOM-NEXT: movsl (%rsi), %es:(%rdi) # sched: [3:1.50]
+; ATOM-NEXT: movsq (%rsi), %es:(%rdi) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movs:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: movsb (%rsi), %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: movsw (%rsi), %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: movsl (%rsi), %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: movsq (%rsi), %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movs:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: movsb (%rsi), %es:(%rdi) # sched: [8:1.00]
+; SANDY-NEXT: movsw (%rsi), %es:(%rdi) # sched: [8:1.00]
+; SANDY-NEXT: movsl (%rsi), %es:(%rdi) # sched: [8:1.00]
+; SANDY-NEXT: movsq (%rsi), %es:(%rdi) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movs:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: movsb (%rsi), %es:(%rdi) # sched: [4:1.00]
+; HASWELL-NEXT: movsw (%rsi), %es:(%rdi) # sched: [4:1.00]
+; HASWELL-NEXT: movsl (%rsi), %es:(%rdi) # sched: [4:1.00]
+; HASWELL-NEXT: movsq (%rsi), %es:(%rdi) # sched: [4:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movs:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: movsb (%rsi), %es:(%rdi) # sched: [100:0.25]
+; BROADWELL-NEXT: movsw (%rsi), %es:(%rdi) # sched: [100:0.25]
+; BROADWELL-NEXT: movsl (%rsi), %es:(%rdi) # sched: [100:0.25]
+; BROADWELL-NEXT: movsq (%rsi), %es:(%rdi) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movs:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: movsb (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKYLAKE-NEXT: movsw (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKYLAKE-NEXT: movsl (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKYLAKE-NEXT: movsq (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movs:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: movsb (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKX-NEXT: movsw (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKX-NEXT: movsl (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKX-NEXT: movsq (%rsi), %es:(%rdi) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movs:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: movsb (%rsi), %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: movsw (%rsi), %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: movsl (%rsi), %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: movsq (%rsi), %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movs:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: movsb (%rsi), %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: movsw (%rsi), %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: movsl (%rsi), %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: movsq (%rsi), %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "movsb \0A\09 movsw \0A\09 movsl \0A\09 movsq", ""()
+ ret void
+}
+
+; TODO - test_movsx
+; TODO - test_movzx
+
+define i64 @test_movslq(i32 %a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_movslq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: movslq %edi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_movslq:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: movslq %edi, %rax # sched: [1:1.00]
+; ATOM-NEXT: movslq (%rsi), %rcx # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_movslq:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: movslq %edi, %rax # sched: [1:0.50]
+; SLM-NEXT: movslq (%rsi), %rcx # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_movslq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: movslq %edi, %rax # sched: [1:0.33]
+; SANDY-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: orq %rcx, %rax # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_movslq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: movslq %edi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movslq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: movslq %edi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movslq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: movslq %edi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movslq:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: movslq %edi, %rax # sched: [1:0.25]
+; SKX-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_movslq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: movslq %edi, %rax # sched: [1:0.50]
+; BTVER2-NEXT: movslq (%rsi), %rcx # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_movslq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: movslq %edi, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: movslq (%rsi), %rcx # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call { i64, i64 } asm sideeffect "movslq $2, $0 \0A\09 movslq $3, $1", "=r,=r,r,*m"(i32 %a0, i32 *%a1)
+ %2 = extractvalue { i64, i64 } %1, 0
+ %3 = extractvalue { i64, i64 } %1, 1
+ %4 = or i64 %2, %3
+ ret i64 %4
+}
+
+define void @test_mul(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) optsize {
+; GENERIC-LABEL: test_mul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: mulb %dil # sched: [3:1.00]
+; GENERIC-NEXT: mulb (%r8) # sched: [8:1.00]
+; GENERIC-NEXT: mulw %si # sched: [4:1.00]
+; GENERIC-NEXT: mulw (%r9) # sched: [9:1.00]
+; GENERIC-NEXT: mull %edx # sched: [4:1.00]
+; GENERIC-NEXT: mull (%rax) # sched: [9:1.00]
+; GENERIC-NEXT: mulq %rcx # sched: [4:1.00]
+; GENERIC-NEXT: mulq (%r10) # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_mul:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [1:1.00]
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: mulb %dil # sched: [7:3.50]
+; ATOM-NEXT: mulb (%r8) # sched: [7:3.50]
+; ATOM-NEXT: mulw %si # sched: [7:3.50]
+; ATOM-NEXT: mulw (%r9) # sched: [8:4.00]
+; ATOM-NEXT: mull %edx # sched: [6:3.00]
+; ATOM-NEXT: mull (%rax) # sched: [7:3.50]
+; ATOM-NEXT: mulq %rcx # sched: [12:6.00]
+; ATOM-NEXT: mulq (%r10) # sched: [12:6.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_mul:
+; SLM: # %bb.0:
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [3:1.00]
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: mulb %dil # sched: [3:1.00]
+; SLM-NEXT: mulb (%r8) # sched: [6:1.00]
+; SLM-NEXT: mulw %si # sched: [3:1.00]
+; SLM-NEXT: mulw (%r9) # sched: [6:1.00]
+; SLM-NEXT: mull %edx # sched: [3:1.00]
+; SLM-NEXT: mull (%rax) # sched: [6:1.00]
+; SLM-NEXT: mulq %rcx # sched: [3:1.00]
+; SLM-NEXT: mulq (%r10) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_mul:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: mulb %dil # sched: [3:1.00]
+; SANDY-NEXT: mulb (%r8) # sched: [8:1.00]
+; SANDY-NEXT: mulw %si # sched: [4:1.00]
+; SANDY-NEXT: mulw (%r9) # sched: [9:1.00]
+; SANDY-NEXT: mull %edx # sched: [4:1.00]
+; SANDY-NEXT: mull (%rax) # sched: [9:1.00]
+; SANDY-NEXT: mulq %rcx # sched: [4:1.00]
+; SANDY-NEXT: mulq (%r10) # sched: [9:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_mul:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: mulb %dil # sched: [3:1.00]
+; HASWELL-NEXT: mulb (%r8) # sched: [8:1.00]
+; HASWELL-NEXT: mulw %si # sched: [4:1.00]
+; HASWELL-NEXT: mulw (%r9) # sched: [8:1.00]
+; HASWELL-NEXT: mull %edx # sched: [4:1.00]
+; HASWELL-NEXT: mull (%rax) # sched: [8:1.00]
+; HASWELL-NEXT: mulq %rcx # sched: [4:1.00]
+; HASWELL-NEXT: mulq (%r10) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mul:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: mulb %dil # sched: [3:1.00]
+; BROADWELL-NEXT: mulb (%r8) # sched: [8:1.00]
+; BROADWELL-NEXT: mulw %si # sched: [4:1.00]
+; BROADWELL-NEXT: mulw (%r9) # sched: [8:1.00]
+; BROADWELL-NEXT: mull %edx # sched: [4:1.00]
+; BROADWELL-NEXT: mull (%rax) # sched: [8:1.00]
+; BROADWELL-NEXT: mulq %rcx # sched: [4:1.00]
+; BROADWELL-NEXT: mulq (%r10) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mul:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: mulb %dil # sched: [3:1.00]
+; SKYLAKE-NEXT: mulb (%r8) # sched: [8:1.00]
+; SKYLAKE-NEXT: mulw %si # sched: [4:1.00]
+; SKYLAKE-NEXT: mulw (%r9) # sched: [8:1.00]
+; SKYLAKE-NEXT: mull %edx # sched: [5:1.00]
+; SKYLAKE-NEXT: mull (%rax) # sched: [8:1.00]
+; SKYLAKE-NEXT: mulq %rcx # sched: [4:1.00]
+; SKYLAKE-NEXT: mulq (%r10) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mul:
+; SKX: # %bb.0:
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: mulb %dil # sched: [3:1.00]
+; SKX-NEXT: mulb (%r8) # sched: [8:1.00]
+; SKX-NEXT: mulw %si # sched: [4:1.00]
+; SKX-NEXT: mulw (%r9) # sched: [8:1.00]
+; SKX-NEXT: mull %edx # sched: [4:1.00]
+; SKX-NEXT: mull (%rax) # sched: [8:1.00]
+; SKX-NEXT: mulq %rcx # sched: [4:1.00]
+; SKX-NEXT: mulq (%r10) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_mul:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: mulb %dil # sched: [3:1.00]
+; BTVER2-NEXT: mulb (%r8) # sched: [6:1.00]
+; BTVER2-NEXT: mulw %si # sched: [3:1.00]
+; BTVER2-NEXT: mulw (%r9) # sched: [6:1.00]
+; BTVER2-NEXT: mull %edx # sched: [3:1.00]
+; BTVER2-NEXT: mull (%rax) # sched: [6:1.00]
+; BTVER2-NEXT: mulq %rcx # sched: [3:1.00]
+; BTVER2-NEXT: mulq (%r10) # sched: [6:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mul:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [8:0.50]
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: mulb %dil # sched: [4:1.00]
+; ZNVER1-NEXT: mulb (%r8) # sched: [8:1.00]
+; ZNVER1-NEXT: mulw %si # sched: [3:1.00]
+; ZNVER1-NEXT: mulw (%r9) # sched: [8:1.00]
+; ZNVER1-NEXT: mull %edx # sched: [3:1.00]
+; ZNVER1-NEXT: mull (%rax) # sched: [8:1.00]
+; ZNVER1-NEXT: mulq %rcx # sched: [4:1.00]
+; ZNVER1-NEXT: mulq (%r10) # sched: [9:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "mulb $0 \0A\09 mulb $4 \0A\09 mulw $1 \0A\09 mulw $5 \0A\09 mull $2 \0A\09 mull $6 \0A\09 mulq $3 \0A\09 mulq $7", "r,r,r,r,*m,*m,*m,*m"(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) nounwind
+ ret void
+}
+
+define void @test_neg(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) optsize {
+; GENERIC-LABEL: test_neg:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: negb %dil # sched: [1:0.33]
+; GENERIC-NEXT: negb (%r8) # sched: [7:1.00]
+; GENERIC-NEXT: negw %si # sched: [1:0.33]
+; GENERIC-NEXT: negw (%r9) # sched: [7:1.00]
+; GENERIC-NEXT: negl %edx # sched: [1:0.33]
+; GENERIC-NEXT: negl (%rax) # sched: [7:1.00]
+; GENERIC-NEXT: negq %rcx # sched: [1:0.33]
+; GENERIC-NEXT: negq (%r10) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_neg:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [1:1.00]
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: negb %dil # sched: [1:0.50]
+; ATOM-NEXT: negb (%r8) # sched: [1:1.00]
+; ATOM-NEXT: negw %si # sched: [1:0.50]
+; ATOM-NEXT: negw (%r9) # sched: [1:1.00]
+; ATOM-NEXT: negl %edx # sched: [1:0.50]
+; ATOM-NEXT: negl (%rax) # sched: [1:1.00]
+; ATOM-NEXT: negq %rcx # sched: [1:0.50]
+; ATOM-NEXT: negq (%r10) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_neg:
+; SLM: # %bb.0:
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [3:1.00]
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: negb %dil # sched: [1:0.50]
+; SLM-NEXT: negb (%r8) # sched: [4:2.00]
+; SLM-NEXT: negw %si # sched: [1:0.50]
+; SLM-NEXT: negw (%r9) # sched: [4:2.00]
+; SLM-NEXT: negl %edx # sched: [1:0.50]
+; SLM-NEXT: negl (%rax) # sched: [4:2.00]
+; SLM-NEXT: negq %rcx # sched: [1:0.50]
+; SLM-NEXT: negq (%r10) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_neg:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: negb %dil # sched: [1:0.33]
+; SANDY-NEXT: negb (%r8) # sched: [7:1.00]
+; SANDY-NEXT: negw %si # sched: [1:0.33]
+; SANDY-NEXT: negw (%r9) # sched: [7:1.00]
+; SANDY-NEXT: negl %edx # sched: [1:0.33]
+; SANDY-NEXT: negl (%rax) # sched: [7:1.00]
+; SANDY-NEXT: negq %rcx # sched: [1:0.33]
+; SANDY-NEXT: negq (%r10) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_neg:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: negb %dil # sched: [1:0.25]
+; HASWELL-NEXT: negb (%r8) # sched: [7:1.00]
+; HASWELL-NEXT: negw %si # sched: [1:0.25]
+; HASWELL-NEXT: negw (%r9) # sched: [7:1.00]
+; HASWELL-NEXT: negl %edx # sched: [1:0.25]
+; HASWELL-NEXT: negl (%rax) # sched: [7:1.00]
+; HASWELL-NEXT: negq %rcx # sched: [1:0.25]
+; HASWELL-NEXT: negq (%r10) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_neg:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: negb %dil # sched: [1:0.25]
+; BROADWELL-NEXT: negb (%r8) # sched: [6:1.00]
+; BROADWELL-NEXT: negw %si # sched: [1:0.25]
+; BROADWELL-NEXT: negw (%r9) # sched: [6:1.00]
+; BROADWELL-NEXT: negl %edx # sched: [1:0.25]
+; BROADWELL-NEXT: negl (%rax) # sched: [6:1.00]
+; BROADWELL-NEXT: negq %rcx # sched: [1:0.25]
+; BROADWELL-NEXT: negq (%r10) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_neg:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: negb %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: negb (%r8) # sched: [6:1.00]
+; SKYLAKE-NEXT: negw %si # sched: [1:0.25]
+; SKYLAKE-NEXT: negw (%r9) # sched: [6:1.00]
+; SKYLAKE-NEXT: negl %edx # sched: [1:0.25]
+; SKYLAKE-NEXT: negl (%rax) # sched: [6:1.00]
+; SKYLAKE-NEXT: negq %rcx # sched: [1:0.25]
+; SKYLAKE-NEXT: negq (%r10) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_neg:
+; SKX: # %bb.0:
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: negb %dil # sched: [1:0.25]
+; SKX-NEXT: negb (%r8) # sched: [6:1.00]
+; SKX-NEXT: negw %si # sched: [1:0.25]
+; SKX-NEXT: negw (%r9) # sched: [6:1.00]
+; SKX-NEXT: negl %edx # sched: [1:0.25]
+; SKX-NEXT: negl (%rax) # sched: [6:1.00]
+; SKX-NEXT: negq %rcx # sched: [1:0.25]
+; SKX-NEXT: negq (%r10) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_neg:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: negb %dil # sched: [1:0.50]
+; BTVER2-NEXT: negb (%r8) # sched: [4:1.00]
+; BTVER2-NEXT: negw %si # sched: [1:0.50]
+; BTVER2-NEXT: negw (%r9) # sched: [4:1.00]
+; BTVER2-NEXT: negl %edx # sched: [1:0.50]
+; BTVER2-NEXT: negl (%rax) # sched: [4:1.00]
+; BTVER2-NEXT: negq %rcx # sched: [1:0.50]
+; BTVER2-NEXT: negq (%r10) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_neg:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [8:0.50]
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: negb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: negb (%r8) # sched: [5:0.50]
+; ZNVER1-NEXT: negw %si # sched: [1:0.25]
+; ZNVER1-NEXT: negw (%r9) # sched: [5:0.50]
+; ZNVER1-NEXT: negl %edx # sched: [1:0.25]
+; ZNVER1-NEXT: negl (%rax) # sched: [5:0.50]
+; ZNVER1-NEXT: negq %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: negq (%r10) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "negb $0 \0A\09 negb $4 \0A\09 negw $1 \0A\09 negw $5 \0A\09 negl $2 \0A\09 negl $6 \0A\09 negq $3 \0A\09 negq $7", "r,r,r,r,*m,*m,*m,*m"(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) nounwind
+ ret void
+}
+
+define void @test_nop(i16 %a0, i32 %a1, i64 %a2, i16 *%p0, i32 *%p1, i64 *%p2) optsize {
+; GENERIC-LABEL: test_nop:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: nop # sched: [1:?]
+; GENERIC-NEXT: nopw %di # sched: [1:?]
+; GENERIC-NEXT: nopw (%rcx) # sched: [1:?]
+; GENERIC-NEXT: nopl %esi # sched: [1:?]
+; GENERIC-NEXT: nopl (%r8) # sched: [1:?]
+; GENERIC-NEXT: nopq %rdx # sched: [1:?]
+; GENERIC-NEXT: nopq (%r9) # sched: [1:?]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_nop:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nopw %di # sched: [1:0.50]
+; ATOM-NEXT: nopw (%rcx) # sched: [1:0.50]
+; ATOM-NEXT: nopl %esi # sched: [1:0.50]
+; ATOM-NEXT: nopl (%r8) # sched: [1:0.50]
+; ATOM-NEXT: nopq %rdx # sched: [1:0.50]
+; ATOM-NEXT: nopq (%r9) # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_nop:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: nop # sched: [1:?]
+; SLM-NEXT: nopw %di # sched: [1:?]
+; SLM-NEXT: nopw (%rcx) # sched: [1:?]
+; SLM-NEXT: nopl %esi # sched: [1:?]
+; SLM-NEXT: nopl (%r8) # sched: [1:?]
+; SLM-NEXT: nopq %rdx # sched: [1:?]
+; SLM-NEXT: nopq (%r9) # sched: [1:?]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_nop:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: nop # sched: [1:?]
+; SANDY-NEXT: nopw %di # sched: [1:?]
+; SANDY-NEXT: nopw (%rcx) # sched: [1:?]
+; SANDY-NEXT: nopl %esi # sched: [1:?]
+; SANDY-NEXT: nopl (%r8) # sched: [1:?]
+; SANDY-NEXT: nopq %rdx # sched: [1:?]
+; SANDY-NEXT: nopq (%r9) # sched: [1:?]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_nop:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: nop # sched: [1:0.25]
+; HASWELL-NEXT: nopw %di # sched: [1:0.25]
+; HASWELL-NEXT: nopw (%rcx) # sched: [1:0.25]
+; HASWELL-NEXT: nopl %esi # sched: [1:0.25]
+; HASWELL-NEXT: nopl (%r8) # sched: [1:0.25]
+; HASWELL-NEXT: nopq %rdx # sched: [1:0.25]
+; HASWELL-NEXT: nopq (%r9) # sched: [1:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_nop:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: nop # sched: [1:0.25]
+; BROADWELL-NEXT: nopw %di # sched: [1:0.25]
+; BROADWELL-NEXT: nopw (%rcx) # sched: [1:0.25]
+; BROADWELL-NEXT: nopl %esi # sched: [1:0.25]
+; BROADWELL-NEXT: nopl (%r8) # sched: [1:0.25]
+; BROADWELL-NEXT: nopq %rdx # sched: [1:0.25]
+; BROADWELL-NEXT: nopq (%r9) # sched: [1:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_nop:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: nop # sched: [1:0.25]
+; SKYLAKE-NEXT: nopw %di # sched: [1:0.25]
+; SKYLAKE-NEXT: nopw (%rcx) # sched: [1:0.25]
+; SKYLAKE-NEXT: nopl %esi # sched: [1:0.25]
+; SKYLAKE-NEXT: nopl (%r8) # sched: [1:0.25]
+; SKYLAKE-NEXT: nopq %rdx # sched: [1:0.25]
+; SKYLAKE-NEXT: nopq (%r9) # sched: [1:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_nop:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: nop # sched: [1:0.25]
+; SKX-NEXT: nopw %di # sched: [1:0.25]
+; SKX-NEXT: nopw (%rcx) # sched: [1:0.25]
+; SKX-NEXT: nopl %esi # sched: [1:0.25]
+; SKX-NEXT: nopl (%r8) # sched: [1:0.25]
+; SKX-NEXT: nopq %rdx # sched: [1:0.25]
+; SKX-NEXT: nopq (%r9) # sched: [1:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_nop:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: nop # sched: [1:?]
+; BTVER2-NEXT: nopw %di # sched: [1:?]
+; BTVER2-NEXT: nopw (%rcx) # sched: [1:?]
+; BTVER2-NEXT: nopl %esi # sched: [1:?]
+; BTVER2-NEXT: nopl (%r8) # sched: [1:?]
+; BTVER2-NEXT: nopq %rdx # sched: [1:?]
+; BTVER2-NEXT: nopq (%r9) # sched: [1:?]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_nop:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: nop # sched: [1:?]
+; ZNVER1-NEXT: nopw %di # sched: [1:?]
+; ZNVER1-NEXT: nopw (%rcx) # sched: [1:?]
+; ZNVER1-NEXT: nopl %esi # sched: [1:?]
+; ZNVER1-NEXT: nopl (%r8) # sched: [1:?]
+; ZNVER1-NEXT: nopq %rdx # sched: [1:?]
+; ZNVER1-NEXT: nopq (%r9) # sched: [1:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "nop \0A\09 nopw $0 \0A\09 nopw $3 \0A\09 nopl $1 \0A\09 nopl $4 \0A\09 nopq $2 \0A\09 nopq $5", "r,r,r,*m,*m,*m"(i16 %a0, i32 %a1, i64 %a2, i16 *%p0, i32 *%p1, i64 *%p2) nounwind
+ ret void
+}
+
+define void @test_not(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) optsize {
+; GENERIC-LABEL: test_not:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; GENERIC-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: notb %dil # sched: [1:0.33]
+; GENERIC-NEXT: notb (%r8) # sched: [7:1.00]
+; GENERIC-NEXT: notw %si # sched: [1:0.33]
+; GENERIC-NEXT: notw (%r9) # sched: [7:1.00]
+; GENERIC-NEXT: notl %edx # sched: [1:0.33]
+; GENERIC-NEXT: notl (%rax) # sched: [7:1.00]
+; GENERIC-NEXT: notq %rcx # sched: [1:0.33]
+; GENERIC-NEXT: notq (%r10) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_not:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [1:1.00]
+; ATOM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: notb %dil # sched: [1:0.50]
+; ATOM-NEXT: notb (%r8) # sched: [1:1.00]
+; ATOM-NEXT: notw %si # sched: [1:0.50]
+; ATOM-NEXT: notw (%r9) # sched: [1:1.00]
+; ATOM-NEXT: notl %edx # sched: [1:0.50]
+; ATOM-NEXT: notl (%rax) # sched: [1:1.00]
+; ATOM-NEXT: notq %rcx # sched: [1:0.50]
+; ATOM-NEXT: notq (%r10) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_not:
+; SLM: # %bb.0:
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [3:1.00]
+; SLM-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: notb %dil # sched: [1:0.50]
+; SLM-NEXT: notb (%r8) # sched: [4:2.00]
+; SLM-NEXT: notw %si # sched: [1:0.50]
+; SLM-NEXT: notw (%r9) # sched: [4:2.00]
+; SLM-NEXT: notl %edx # sched: [1:0.50]
+; SLM-NEXT: notl (%rax) # sched: [4:2.00]
+; SLM-NEXT: notq %rcx # sched: [1:0.50]
+; SLM-NEXT: notq (%r10) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_not:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SANDY-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: notb %dil # sched: [1:0.33]
+; SANDY-NEXT: notb (%r8) # sched: [7:1.00]
+; SANDY-NEXT: notw %si # sched: [1:0.33]
+; SANDY-NEXT: notw (%r9) # sched: [7:1.00]
+; SANDY-NEXT: notl %edx # sched: [1:0.33]
+; SANDY-NEXT: notl (%rax) # sched: [7:1.00]
+; SANDY-NEXT: notq %rcx # sched: [1:0.33]
+; SANDY-NEXT: notq (%r10) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_not:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; HASWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: notb %dil # sched: [1:0.25]
+; HASWELL-NEXT: notb (%r8) # sched: [7:1.00]
+; HASWELL-NEXT: notw %si # sched: [1:0.25]
+; HASWELL-NEXT: notw (%r9) # sched: [7:1.00]
+; HASWELL-NEXT: notl %edx # sched: [1:0.25]
+; HASWELL-NEXT: notl (%rax) # sched: [7:1.00]
+; HASWELL-NEXT: notq %rcx # sched: [1:0.25]
+; HASWELL-NEXT: notq (%r10) # sched: [7:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_not:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; BROADWELL-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: notb %dil # sched: [1:0.25]
+; BROADWELL-NEXT: notb (%r8) # sched: [6:1.00]
+; BROADWELL-NEXT: notw %si # sched: [1:0.25]
+; BROADWELL-NEXT: notw (%r9) # sched: [6:1.00]
+; BROADWELL-NEXT: notl %edx # sched: [1:0.25]
+; BROADWELL-NEXT: notl (%rax) # sched: [6:1.00]
+; BROADWELL-NEXT: notq %rcx # sched: [1:0.25]
+; BROADWELL-NEXT: notq (%r10) # sched: [6:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_not:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKYLAKE-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: notb %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: notb (%r8) # sched: [6:1.00]
+; SKYLAKE-NEXT: notw %si # sched: [1:0.25]
+; SKYLAKE-NEXT: notw (%r9) # sched: [6:1.00]
+; SKYLAKE-NEXT: notl %edx # sched: [1:0.25]
+; SKYLAKE-NEXT: notl (%rax) # sched: [6:1.00]
+; SKYLAKE-NEXT: notq %rcx # sched: [1:0.25]
+; SKYLAKE-NEXT: notq (%r10) # sched: [6:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_not:
+; SKX: # %bb.0:
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:0.50]
+; SKX-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: notb %dil # sched: [1:0.25]
+; SKX-NEXT: notb (%r8) # sched: [6:1.00]
+; SKX-NEXT: notw %si # sched: [1:0.25]
+; SKX-NEXT: notw (%r9) # sched: [6:1.00]
+; SKX-NEXT: notl %edx # sched: [1:0.25]
+; SKX-NEXT: notl (%rax) # sched: [6:1.00]
+; SKX-NEXT: notq %rcx # sched: [1:0.25]
+; SKX-NEXT: notq (%r10) # sched: [6:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_not:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [5:1.00]
+; BTVER2-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: notb %dil # sched: [1:0.50]
+; BTVER2-NEXT: notb (%r8) # sched: [4:1.00]
+; BTVER2-NEXT: notw %si # sched: [1:0.50]
+; BTVER2-NEXT: notw (%r9) # sched: [4:1.00]
+; BTVER2-NEXT: notl %edx # sched: [1:0.50]
+; BTVER2-NEXT: notl (%rax) # sched: [4:1.00]
+; BTVER2-NEXT: notq %rcx # sched: [1:0.50]
+; BTVER2-NEXT: notq (%r10) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_not:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %r10 # sched: [8:0.50]
+; ZNVER1-NEXT: movq {{[0-9]+}}(%rsp), %rax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: notb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: notb (%r8) # sched: [5:0.50]
+; ZNVER1-NEXT: notw %si # sched: [1:0.25]
+; ZNVER1-NEXT: notw (%r9) # sched: [5:0.50]
+; ZNVER1-NEXT: notl %edx # sched: [1:0.25]
+; ZNVER1-NEXT: notl (%rax) # sched: [5:0.50]
+; ZNVER1-NEXT: notq %rcx # sched: [1:0.25]
+; ZNVER1-NEXT: notq (%r10) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "notb $0 \0A\09 notb $4 \0A\09 notw $1 \0A\09 notw $5 \0A\09 notl $2 \0A\09 notl $6 \0A\09 notq $3 \0A\09 notq $7", "r,r,r,r,*m,*m,*m,*m"(i8 %a0, i16 %a1, i32 %a2, i64 %a3, i8 *%p0, i16 *%p1, i32 *%p2, i64 *%p3) nounwind
+ ret void
+}
+
+define void @test_or_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_or_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: orb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: orb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: orb $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: orb %dil, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orb (%rsi), %dil # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_or_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: orb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: orb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: orb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: orb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_or_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: orb $7, %al # sched: [1:0.50]
+; SLM-NEXT: orb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: orb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: orb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_or_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: orb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: orb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: orb $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: orb %dil, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orb (%rsi), %dil # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_or_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: orb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: orb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: orb $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: orb %dil, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orb (%rsi), %dil # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_or_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: orb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: orb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: orb $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: orb %dil, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_or_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: orb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: orb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: orb $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: orb %dil, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_or_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: orb $7, %al # sched: [1:0.25]
+; SKX-NEXT: orb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: orb $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: orb %dil, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_or_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: orb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: orb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: orb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: orb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_or_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: orb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: orb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: orb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: orb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "orb $2, %AL \0A\09 orb $2, $0 \0A\09 orb $2, $1 \0A\09 orb $0, $0 \0A\09 orb $0, $1 \0A\09 orb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_or_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_or_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: orw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: orw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: orw $7, %di # sched: [1:0.33]
+; GENERIC-NEXT: orw $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: orw %di, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orw (%rsi), %di # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_or_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: orw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: orw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: orw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: orw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: orw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_or_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: orw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: orw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: orw $7, %di # sched: [1:0.50]
+; SLM-NEXT: orw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orw %di, %di # sched: [1:0.50]
+; SLM-NEXT: orw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_or_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: orw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: orw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: orw $7, %di # sched: [1:0.33]
+; SANDY-NEXT: orw $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: orw %di, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orw (%rsi), %di # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_or_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: orw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: orw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: orw $7, %di # sched: [1:0.25]
+; HASWELL-NEXT: orw $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: orw %di, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orw (%rsi), %di # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_or_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: orw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: orw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: orw $7, %di # sched: [1:0.25]
+; BROADWELL-NEXT: orw $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: orw %di, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_or_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: orw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: orw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: orw $7, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: orw $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: orw %di, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_or_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: orw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: orw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: orw $7, %di # sched: [1:0.25]
+; SKX-NEXT: orw $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orw %di, %di # sched: [1:0.25]
+; SKX-NEXT: orw %di, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_or_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: orw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: orw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: orw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: orw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: orw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_or_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: orw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: orw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: orw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: orw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: orw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: orw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "orw $2, %AX \0A\09 orw $2, $0 \0A\09 orw $2, $1 \0A\09 orw $3, $0 \0A\09 orw $3, $1 \0A\09 orw $0, $0 \0A\09 orw $0, $1 \0A\09 orw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_or_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_or_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: orl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: orl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: orl $7, %edi # sched: [1:0.33]
+; GENERIC-NEXT: orl $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: orl %edi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orl (%rsi), %edi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_or_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: orl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: orl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: orl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: orl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: orl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_or_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: orl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: orl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: orl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: orl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: orl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_or_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: orl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: orl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: orl $7, %edi # sched: [1:0.33]
+; SANDY-NEXT: orl $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: orl %edi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orl (%rsi), %edi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_or_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: orl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: orl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: orl $7, %edi # sched: [1:0.25]
+; HASWELL-NEXT: orl $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: orl %edi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orl (%rsi), %edi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_or_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: orl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: orl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: orl $7, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: orl $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: orl %edi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_or_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: orl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: orl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: orl $7, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: orl $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: orl %edi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_or_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: orl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: orl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: orl $7, %edi # sched: [1:0.25]
+; SKX-NEXT: orl $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: orl %edi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_or_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: orl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: orl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: orl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: orl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: orl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_or_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: orl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: orl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: orl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: orl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: orl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: orl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "orl $2, %EAX \0A\09 orl $2, $0 \0A\09 orl $2, $1 \0A\09 orl $3, $0 \0A\09 orl $3, $1 \0A\09 orl $0, $0 \0A\09 orl $0, $1 \0A\09 orl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_or_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_or_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: orq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: orq $7, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: orq $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: orq %rdi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: orq (%rsi), %rdi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_or_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: orq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: orq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: orq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: orq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: orq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_or_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: orq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: orq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: orq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: orq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: orq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_or_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: orq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: orq $7, %rdi # sched: [1:0.33]
+; SANDY-NEXT: orq $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: orq %rdi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: orq (%rsi), %rdi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_or_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: orq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: orq $7, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: orq $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: orq %rdi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: orq (%rsi), %rdi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_or_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: orq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: orq $7, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: orq $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: orq %rdi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: orq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_or_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: orq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: orq $7, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: orq $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: orq %rdi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: orq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_or_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: orq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: orq $7, %rdi # sched: [1:0.25]
+; SKX-NEXT: orq $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: orq %rdi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: orq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_or_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: orq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: orq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: orq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: orq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: orq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_or_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: orq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: orq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: orq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: orq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: orq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: orq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: orq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "orq $2, %RAX \0A\09 orq $2, $0 \0A\09 orq $2, $1 \0A\09 orq $3, $0 \0A\09 orq $3, $1 \0A\09 orq $0, $0 \0A\09 orq $0, $1 \0A\09 orq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
+define void @test_out() optsize {
+; GENERIC-LABEL: test_out:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: outb %al, $7 # sched: [4:1.33]
+; GENERIC-NEXT: outw %ax, $7 # sched: [100:0.33]
+; GENERIC-NEXT: outl %eax, $7 # sched: [4:1.33]
+; GENERIC-NEXT: outb %al, %dx # sched: [3:1.00]
+; GENERIC-NEXT: outw %ax, %dx # sched: [100:0.33]
+; GENERIC-NEXT: outl %eax, %dx # sched: [3:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_out:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: outb %al, $7 # sched: [72:36.00]
+; ATOM-NEXT: outw %ax, $7 # sched: [72:36.00]
+; ATOM-NEXT: outl %eax, $7 # sched: [72:36.00]
+; ATOM-NEXT: outb %al, %dx # sched: [68:34.00]
+; ATOM-NEXT: outw %ax, %dx # sched: [68:34.00]
+; ATOM-NEXT: outl %eax, %dx # sched: [68:34.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_out:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: outb %al, $7 # sched: [100:1.00]
+; SLM-NEXT: outw %ax, $7 # sched: [100:1.00]
+; SLM-NEXT: outl %eax, $7 # sched: [100:1.00]
+; SLM-NEXT: outb %al, %dx # sched: [100:1.00]
+; SLM-NEXT: outw %ax, %dx # sched: [100:1.00]
+; SLM-NEXT: outl %eax, %dx # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_out:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: outb %al, $7 # sched: [4:1.33]
+; SANDY-NEXT: outw %ax, $7 # sched: [100:0.33]
+; SANDY-NEXT: outl %eax, $7 # sched: [4:1.33]
+; SANDY-NEXT: outb %al, %dx # sched: [3:1.00]
+; SANDY-NEXT: outw %ax, %dx # sched: [100:0.33]
+; SANDY-NEXT: outl %eax, %dx # sched: [3:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_out:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: outb %al, $7 # sched: [36:5.00]
+; HASWELL-NEXT: outw %ax, $7 # sched: [36:5.00]
+; HASWELL-NEXT: outl %eax, $7 # sched: [36:5.00]
+; HASWELL-NEXT: outb %al, %dx # sched: [36:5.00]
+; HASWELL-NEXT: outw %ax, %dx # sched: [36:5.00]
+; HASWELL-NEXT: outl %eax, %dx # sched: [36:5.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_out:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: outb %al, $7 # sched: [35:5.00]
+; BROADWELL-NEXT: outw %ax, $7 # sched: [35:5.00]
+; BROADWELL-NEXT: outl %eax, $7 # sched: [35:5.00]
+; BROADWELL-NEXT: outb %al, %dx # sched: [35:5.00]
+; BROADWELL-NEXT: outw %ax, %dx # sched: [35:5.00]
+; BROADWELL-NEXT: outl %eax, %dx # sched: [35:5.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_out:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: outb %al, $7 # sched: [35:5.00]
+; SKYLAKE-NEXT: outw %ax, $7 # sched: [35:5.00]
+; SKYLAKE-NEXT: outl %eax, $7 # sched: [35:5.00]
+; SKYLAKE-NEXT: outb %al, %dx # sched: [35:5.00]
+; SKYLAKE-NEXT: outw %ax, %dx # sched: [35:5.00]
+; SKYLAKE-NEXT: outl %eax, %dx # sched: [35:5.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_out:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: outb %al, $7 # sched: [35:5.00]
+; SKX-NEXT: outw %ax, $7 # sched: [35:5.00]
+; SKX-NEXT: outl %eax, $7 # sched: [35:5.00]
+; SKX-NEXT: outb %al, %dx # sched: [35:5.00]
+; SKX-NEXT: outw %ax, %dx # sched: [35:5.00]
+; SKX-NEXT: outl %eax, %dx # sched: [35:5.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_out:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: outb %al, $7 # sched: [100:0.17]
+; BTVER2-NEXT: outw %ax, $7 # sched: [100:0.17]
+; BTVER2-NEXT: outl %eax, $7 # sched: [100:0.17]
+; BTVER2-NEXT: outb %al, %dx # sched: [100:0.17]
+; BTVER2-NEXT: outw %ax, %dx # sched: [100:0.17]
+; BTVER2-NEXT: outl %eax, %dx # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_out:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: outb %al, $7 # sched: [100:?]
+; ZNVER1-NEXT: outw %ax, $7 # sched: [100:?]
+; ZNVER1-NEXT: outl %eax, $7 # sched: [100:?]
+; ZNVER1-NEXT: outb %al, %dx # sched: [100:?]
+; ZNVER1-NEXT: outw %ax, %dx # sched: [100:?]
+; ZNVER1-NEXT: outl %eax, %dx # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "outb %AL, $0 \0A\09 outw %AX, $0 \0A\09 outl %EAX, $0 \0A\09 outb %AL, %DX \0A\09 outw %AX, %DX \0A\09 outl %EAX, %DX", "i"(i8 7) nounwind
+ ret void
+}
+
+define void @test_outs() optsize {
+; GENERIC-LABEL: test_outs:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: outsb (%rsi), %dx # sched: [100:0.33]
+; GENERIC-NEXT: outsw (%rsi), %dx # sched: [100:0.33]
+; GENERIC-NEXT: outsl (%rsi), %dx # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_outs:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: outsb (%rsi), %dx # sched: [74:37.00]
+; ATOM-NEXT: outsw (%rsi), %dx # sched: [74:37.00]
+; ATOM-NEXT: outsl (%rsi), %dx # sched: [74:37.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_outs:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: outsb (%rsi), %dx # sched: [100:1.00]
+; SLM-NEXT: outsw (%rsi), %dx # sched: [100:1.00]
+; SLM-NEXT: outsl (%rsi), %dx # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_outs:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: outsb (%rsi), %dx # sched: [100:0.33]
+; SANDY-NEXT: outsw (%rsi), %dx # sched: [100:0.33]
+; SANDY-NEXT: outsl (%rsi), %dx # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_outs:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: outsb (%rsi), %dx # sched: [100:0.25]
+; HASWELL-NEXT: outsw (%rsi), %dx # sched: [100:0.25]
+; HASWELL-NEXT: outsl (%rsi), %dx # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_outs:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: outsb (%rsi), %dx # sched: [100:0.25]
+; BROADWELL-NEXT: outsw (%rsi), %dx # sched: [100:0.25]
+; BROADWELL-NEXT: outsl (%rsi), %dx # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_outs:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: outsb (%rsi), %dx # sched: [100:0.25]
+; SKYLAKE-NEXT: outsw (%rsi), %dx # sched: [100:0.25]
+; SKYLAKE-NEXT: outsl (%rsi), %dx # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_outs:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: outsb (%rsi), %dx # sched: [100:0.25]
+; SKX-NEXT: outsw (%rsi), %dx # sched: [100:0.25]
+; SKX-NEXT: outsl (%rsi), %dx # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_outs:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: outsb (%rsi), %dx # sched: [100:0.17]
+; BTVER2-NEXT: outsw (%rsi), %dx # sched: [100:0.17]
+; BTVER2-NEXT: outsl (%rsi), %dx # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_outs:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: outsb (%rsi), %dx # sched: [100:?]
+; ZNVER1-NEXT: outsw (%rsi), %dx # sched: [100:?]
+; ZNVER1-NEXT: outsl (%rsi), %dx # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "outsb \0A\09 outsw \0A\09 outsl", ""()
+ ret void
+}
+
+define void @test_pause() optsize {
+; GENERIC-LABEL: test_pause:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: pause # sched: [4:1.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pause:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: pause # sched: [17:8.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pause:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: pause # sched: [1:?]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pause:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: pause # sched: [4:1.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pause:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: pause # sched: [5:1.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pause:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: pause # sched: [5:1.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pause:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: pause # sched: [4:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pause:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: pause # sched: [140:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pause:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: pause # sched: [1:?]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pause:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: pause # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "pause", ""()
+ ret void
+}
+
+define void @test_pop_push() optsize {
+; GENERIC-LABEL: test_pop_push:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popq %fs # sched: [100:0.33]
+; GENERIC-NEXT: popq %gs # sched: [100:0.33]
+; GENERIC-NEXT: pushq %fs # sched: [3:1.00]
+; GENERIC-NEXT: pushq %gs # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pop_push:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popq %fs # sched: [29:14.50]
+; ATOM-NEXT: popq %gs # sched: [29:14.50]
+; ATOM-NEXT: pushq %fs # sched: [2:1.00]
+; ATOM-NEXT: pushq %gs # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pop_push:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: popq %fs # sched: [100:1.00]
+; SLM-NEXT: popq %gs # sched: [100:1.00]
+; SLM-NEXT: pushq %fs # sched: [100:1.00]
+; SLM-NEXT: pushq %gs # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pop_push:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popq %fs # sched: [100:0.33]
+; SANDY-NEXT: popq %gs # sched: [100:0.33]
+; SANDY-NEXT: pushq %fs # sched: [3:1.00]
+; SANDY-NEXT: pushq %gs # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pop_push:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popq %fs # sched: [100:0.25]
+; HASWELL-NEXT: popq %gs # sched: [100:0.25]
+; HASWELL-NEXT: pushq %fs # sched: [100:0.25]
+; HASWELL-NEXT: pushq %gs # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pop_push:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popq %fs # sched: [100:0.25]
+; BROADWELL-NEXT: popq %gs # sched: [100:0.25]
+; BROADWELL-NEXT: pushq %fs # sched: [100:0.25]
+; BROADWELL-NEXT: pushq %gs # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pop_push:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popq %fs # sched: [100:0.25]
+; SKYLAKE-NEXT: popq %gs # sched: [100:0.25]
+; SKYLAKE-NEXT: pushq %fs # sched: [100:0.25]
+; SKYLAKE-NEXT: pushq %gs # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pop_push:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: popq %fs # sched: [100:0.25]
+; SKX-NEXT: popq %gs # sched: [100:0.25]
+; SKX-NEXT: pushq %fs # sched: [100:0.25]
+; SKX-NEXT: pushq %gs # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pop_push:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popq %fs # sched: [100:0.17]
+; BTVER2-NEXT: popq %gs # sched: [100:0.17]
+; BTVER2-NEXT: pushq %fs # sched: [100:0.17]
+; BTVER2-NEXT: pushq %gs # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pop_push:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popq %fs # sched: [100:?]
+; ZNVER1-NEXT: popq %gs # sched: [100:?]
+; ZNVER1-NEXT: pushq %fs # sched: [100:?]
+; ZNVER1-NEXT: pushq %gs # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "pop %FS \0A\09 pop %GS \0A\09 push %FS \0A\09 push %GS", ""()
+ ret void
+}
+define i16 @test_pop_push_16(i16 %a0, i16 *%a1) optsize {
+; GENERIC-LABEL: test_pop_push_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popw %ax # sched: [6:0.50]
+; GENERIC-NEXT: popw (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: pushw %di # sched: [5:1.00]
+; GENERIC-NEXT: pushw (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: pushw $4095 # imm = 0xFFF
+; GENERIC-NEXT: # sched: [1:1.00]
+; GENERIC-NEXT: pushw $7 # sched: [1:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pop_push_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popw %ax # sched: [2:1.00]
+; ATOM-NEXT: popw (%rsi) # sched: [3:1.50]
+; ATOM-NEXT: pushw %di # sched: [1:1.00]
+; ATOM-NEXT: pushw (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: pushw $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: pushw $7 # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pop_push_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: popw %ax # sched: [3:1.00]
+; SLM-NEXT: popw (%rsi) # sched: [1:1.00]
+; SLM-NEXT: pushw %di # sched: [1:1.00]
+; SLM-NEXT: pushw (%rsi) # sched: [1:1.00]
+; SLM-NEXT: pushw $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [1:1.00]
+; SLM-NEXT: pushw $7 # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pop_push_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popw %ax # sched: [6:0.50]
+; SANDY-NEXT: popw (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: pushw %di # sched: [5:1.00]
+; SANDY-NEXT: pushw (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: pushw $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [1:1.00]
+; SANDY-NEXT: pushw $7 # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pop_push_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popw %ax # sched: [6:0.50]
+; HASWELL-NEXT: popw (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: pushw %di # sched: [2:1.00]
+; HASWELL-NEXT: pushw (%rsi) # sched: [1:1.00]
+; HASWELL-NEXT: pushw $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:1.00]
+; HASWELL-NEXT: pushw $7 # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pop_push_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popw %ax # sched: [6:0.50]
+; BROADWELL-NEXT: popw (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: pushw %di # sched: [2:1.00]
+; BROADWELL-NEXT: pushw (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: pushw $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [1:1.00]
+; BROADWELL-NEXT: pushw $7 # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pop_push_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popw %ax # sched: [6:0.50]
+; SKYLAKE-NEXT: popw (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: pushw %di # sched: [2:1.00]
+; SKYLAKE-NEXT: pushw (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: pushw $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [1:1.00]
+; SKYLAKE-NEXT: pushw $7 # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pop_push_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: popw %ax # sched: [6:0.50]
+; SKX-NEXT: popw (%rsi) # sched: [6:0.50]
+; SKX-NEXT: pushw %di # sched: [2:1.00]
+; SKX-NEXT: pushw (%rsi) # sched: [2:1.00]
+; SKX-NEXT: pushw $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [1:1.00]
+; SKX-NEXT: pushw $7 # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pop_push_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popw %ax # sched: [5:1.00]
+; BTVER2-NEXT: popw (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: pushw %di # sched: [1:1.00]
+; BTVER2-NEXT: pushw (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: pushw $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [1:1.00]
+; BTVER2-NEXT: pushw $7 # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pop_push_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popw %ax # sched: [8:0.50]
+; ZNVER1-NEXT: popw (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: pushw %di # sched: [1:0.50]
+; ZNVER1-NEXT: pushw (%rsi) # sched: [4:0.50]
+; ZNVER1-NEXT: pushw $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [1:0.50]
+; ZNVER1-NEXT: pushw $7 # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call i16 asm sideeffect "popw $0 \0A\09 popw $2 \0A\09 pushw $1 \0A\09 pushw $2 \0A\09 pushw $3 \0A\09 pushw $4", "=r,r,*m,i,i"(i16 %a0, i16 *%a1, i16 4095, i8 7)
+ ret i16 %1
+}
+define i64 @test_pop_push_64(i64 %a0, i64 *%a1) optsize {
+; GENERIC-LABEL: test_pop_push_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popq %rax # sched: [6:0.50]
+; GENERIC-NEXT: popq (%rsi) # sched: [6:0.50]
+; GENERIC-NEXT: pushq %rdi # sched: [5:1.00]
+; GENERIC-NEXT: pushq (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: pushq $4095 # imm = 0xFFF
+; GENERIC-NEXT: # sched: [1:1.00]
+; GENERIC-NEXT: pushq $7 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_pop_push_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popq %rax # sched: [1:1.00]
+; ATOM-NEXT: popq (%rsi) # sched: [3:1.50]
+; ATOM-NEXT: pushq %rdi # sched: [1:1.00]
+; ATOM-NEXT: pushq (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: pushq $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: pushq $7 # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_pop_push_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: popq %rax # sched: [3:1.00]
+; SLM-NEXT: popq (%rsi) # sched: [1:1.00]
+; SLM-NEXT: pushq %rdi # sched: [1:1.00]
+; SLM-NEXT: pushq (%rsi) # sched: [1:1.00]
+; SLM-NEXT: pushq $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [1:1.00]
+; SLM-NEXT: pushq $7 # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pop_push_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popq %rax # sched: [6:0.50]
+; SANDY-NEXT: popq (%rsi) # sched: [6:0.50]
+; SANDY-NEXT: pushq %rdi # sched: [5:1.00]
+; SANDY-NEXT: pushq (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: pushq $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [1:1.00]
+; SANDY-NEXT: pushq $7 # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pop_push_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popq %rax # sched: [6:0.50]
+; HASWELL-NEXT: popq (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: pushq %rdi # sched: [2:1.00]
+; HASWELL-NEXT: pushq (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: pushq $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:1.00]
+; HASWELL-NEXT: pushq $7 # sched: [2:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pop_push_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popq %rax # sched: [6:0.50]
+; BROADWELL-NEXT: popq (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: pushq %rdi # sched: [2:1.00]
+; BROADWELL-NEXT: pushq (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: pushq $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [1:1.00]
+; BROADWELL-NEXT: pushq $7 # sched: [2:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pop_push_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popq %rax # sched: [6:0.50]
+; SKYLAKE-NEXT: popq (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: pushq %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: pushq (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: pushq $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [1:1.00]
+; SKYLAKE-NEXT: pushq $7 # sched: [2:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pop_push_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: popq %rax # sched: [6:0.50]
+; SKX-NEXT: popq (%rsi) # sched: [6:0.50]
+; SKX-NEXT: pushq %rdi # sched: [2:1.00]
+; SKX-NEXT: pushq (%rsi) # sched: [2:1.00]
+; SKX-NEXT: pushq $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [1:1.00]
+; SKX-NEXT: pushq $7 # sched: [2:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pop_push_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popq %rax # sched: [5:1.00]
+; BTVER2-NEXT: popq (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: pushq %rdi # sched: [1:1.00]
+; BTVER2-NEXT: pushq (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: pushq $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [1:1.00]
+; BTVER2-NEXT: pushq $7 # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pop_push_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popq %rax # sched: [8:0.50]
+; ZNVER1-NEXT: popq (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: pushq %rdi # sched: [1:0.50]
+; ZNVER1-NEXT: pushq (%rsi) # sched: [1:0.50]
+; ZNVER1-NEXT: pushq $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [1:0.50]
+; ZNVER1-NEXT: pushq $7 # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = call i64 asm sideeffect "popq $0 \0A\09 popq $2 \0A\09 pushq $1 \0A\09 pushq $2 \0A\09 pushq $3 \0A\09 pushq $4", "=r,r,*m,i,i"(i64 %a0, i64 *%a1, i64 4095, i8 7)
+ ret i64 %1
+}
+
+define void @test_popf_pushf() optsize {
+; GENERIC-LABEL: test_popf_pushf:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: popfq # sched: [4:0.50]
+; GENERIC-NEXT: pushfq # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_popf_pushf:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: popfq # sched: [26:13.00]
+; ATOM-NEXT: pushfq # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_popf_pushf:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: popfq # sched: [3:1.00]
+; SLM-NEXT: pushfq # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_popf_pushf:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: popfq # sched: [4:0.50]
+; SANDY-NEXT: pushfq # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_popf_pushf:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: popfq # sched: [5:0.50]
+; HASWELL-NEXT: pushfq # sched: [5:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_popf_pushf:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: popfq # sched: [22:4.25]
+; BROADWELL-NEXT: pushfq # sched: [5:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_popf_pushf:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: popfq # sched: [5:0.50]
+; SKYLAKE-NEXT: pushfq # sched: [5:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_popf_pushf:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: popfq # sched: [5:0.50]
+; SKX-NEXT: pushfq # sched: [5:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_popf_pushf:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: popfq # sched: [5:1.00]
+; BTVER2-NEXT: pushfq # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_popf_pushf:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: popfq # sched: [8:0.50]
+; ZNVER1-NEXT: pushfq # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "popf \0A\09 pushf", ""()
+ ret void
+}
+
+define void @test_rcl_rcr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
+; GENERIC-LABEL: test_rcl_rcr_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rclb %dil # sched: [1:0.50]
+; GENERIC-NEXT: rcrb %dil # sched: [1:0.50]
+; GENERIC-NEXT: rclb (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrb (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rclb $7, %dil # sched: [1:0.50]
+; GENERIC-NEXT: rcrb $7, %dil # sched: [1:0.50]
+; GENERIC-NEXT: rclb $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrb $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rclb %cl, %dil # sched: [1:0.50]
+; GENERIC-NEXT: rcrb %cl, %dil # sched: [1:0.50]
+; GENERIC-NEXT: rclb %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrb %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rcl_rcr_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rclb %dil # sched: [1:1.00]
+; ATOM-NEXT: rcrb %dil # sched: [1:1.00]
+; ATOM-NEXT: rclb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rclb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: rcrb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: rclb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rclb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: rcrb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: rclb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rcl_rcr_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rclb %dil # sched: [1:1.00]
+; SLM-NEXT: rcrb %dil # sched: [1:1.00]
+; SLM-NEXT: rclb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rclb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: rcrb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: rclb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rclb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: rcrb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: rclb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rcl_rcr_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rclb %dil # sched: [1:0.50]
+; SANDY-NEXT: rcrb %dil # sched: [1:0.50]
+; SANDY-NEXT: rclb (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrb (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rclb $7, %dil # sched: [1:0.50]
+; SANDY-NEXT: rcrb $7, %dil # sched: [1:0.50]
+; SANDY-NEXT: rclb $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrb $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rclb %cl, %dil # sched: [1:0.50]
+; SANDY-NEXT: rcrb %cl, %dil # sched: [1:0.50]
+; SANDY-NEXT: rclb %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrb %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rcl_rcr_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rclb %dil # sched: [3:0.75]
+; HASWELL-NEXT: rcrb %dil # sched: [3:0.75]
+; HASWELL-NEXT: rclb (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrb (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rclb $7, %dil # sched: [3:0.75]
+; HASWELL-NEXT: rcrb $7, %dil # sched: [3:0.75]
+; HASWELL-NEXT: rclb $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrb $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rclb %cl, %dil # sched: [11:2.25]
+; HASWELL-NEXT: rcrb %cl, %dil # sched: [14:2.50]
+; HASWELL-NEXT: rclb %cl, (%rdx) # sched: [16:2.00]
+; HASWELL-NEXT: rcrb %cl, (%rdx) # sched: [19:2.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcl_rcr_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rclb %dil # sched: [3:0.75]
+; BROADWELL-NEXT: rcrb %dil # sched: [3:0.75]
+; BROADWELL-NEXT: rclb (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrb (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rclb $7, %dil # sched: [3:0.75]
+; BROADWELL-NEXT: rcrb $7, %dil # sched: [3:0.75]
+; BROADWELL-NEXT: rclb $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrb $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rclb %cl, %dil # sched: [11:2.25]
+; BROADWELL-NEXT: rcrb %cl, %dil # sched: [14:2.50]
+; BROADWELL-NEXT: rclb %cl, (%rdx) # sched: [15:2.00]
+; BROADWELL-NEXT: rcrb %cl, (%rdx) # sched: [18:2.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcl_rcr_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rclb %dil # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrb %dil # sched: [3:0.75]
+; SKYLAKE-NEXT: rclb (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrb (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rclb $7, %dil # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrb $7, %dil # sched: [3:0.75]
+; SKYLAKE-NEXT: rclb $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrb $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rclb %cl, %dil # sched: [11:2.50]
+; SKYLAKE-NEXT: rcrb %cl, %dil # sched: [14:2.50]
+; SKYLAKE-NEXT: rclb %cl, (%rdx) # sched: [15:2.50]
+; SKYLAKE-NEXT: rcrb %cl, (%rdx) # sched: [18:2.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcl_rcr_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rclb %dil # sched: [3:0.75]
+; SKX-NEXT: rcrb %dil # sched: [3:0.75]
+; SKX-NEXT: rclb (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrb (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rclb $7, %dil # sched: [3:0.75]
+; SKX-NEXT: rcrb $7, %dil # sched: [3:0.75]
+; SKX-NEXT: rclb $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrb $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rclb %cl, %dil # sched: [11:2.50]
+; SKX-NEXT: rcrb %cl, %dil # sched: [14:2.50]
+; SKX-NEXT: rclb %cl, (%rdx) # sched: [15:2.50]
+; SKX-NEXT: rcrb %cl, (%rdx) # sched: [18:2.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rcl_rcr_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rclb %dil # sched: [1:0.50]
+; BTVER2-NEXT: rcrb %dil # sched: [1:0.50]
+; BTVER2-NEXT: rclb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rclb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rcrb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rclb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rclb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rcrb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rclb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcl_rcr_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rclb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rcrb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rclb (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrb (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rclb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rcrb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rclb $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrb $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rclb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rcrb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rclb %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrb %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rclb $0 \0A\09 rcrb $0 \0A\09 rclb $2 \0A\09 rcrb $2 \0A\09 rclb $3, $0 \0A\09 rcrb $3, $0 \0A\09 rclb $3, $2 \0A\09 rcrb $3, $2 \0A\09 rclb %CL, $0 \0A\09 rcrb %CL, $0 \0A\09 rclb %CL, $2 \0A\09 rcrb %CL, $2", "r,r,*m,i"(i8 %a0, i8 %a1, i8 *%a2, i8 7)
+ ret void
+}
+define void @test_rcl_rcr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_rcl_rcr_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rclw %di # sched: [1:0.50]
+; GENERIC-NEXT: rcrw %di # sched: [1:0.50]
+; GENERIC-NEXT: rclw (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrw (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rclw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: rcrw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: rclw $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrw $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rclw %cl, %di # sched: [1:0.50]
+; GENERIC-NEXT: rcrw %cl, %di # sched: [1:0.50]
+; GENERIC-NEXT: rclw %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrw %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rcl_rcr_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rclw %di # sched: [1:1.00]
+; ATOM-NEXT: rcrw %di # sched: [1:1.00]
+; ATOM-NEXT: rclw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rclw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: rcrw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: rclw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rclw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: rcrw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: rclw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rcl_rcr_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rclw %di # sched: [1:1.00]
+; SLM-NEXT: rcrw %di # sched: [1:1.00]
+; SLM-NEXT: rclw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rclw $7, %di # sched: [1:1.00]
+; SLM-NEXT: rcrw $7, %di # sched: [1:1.00]
+; SLM-NEXT: rclw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rclw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: rcrw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: rclw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rcl_rcr_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rclw %di # sched: [1:0.50]
+; SANDY-NEXT: rcrw %di # sched: [1:0.50]
+; SANDY-NEXT: rclw (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrw (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rclw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: rcrw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: rclw $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrw $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rclw %cl, %di # sched: [1:0.50]
+; SANDY-NEXT: rcrw %cl, %di # sched: [1:0.50]
+; SANDY-NEXT: rclw %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrw %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rcl_rcr_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rclw %di # sched: [3:0.75]
+; HASWELL-NEXT: rcrw %di # sched: [3:0.75]
+; HASWELL-NEXT: rclw (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrw (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rclw $7, %di # sched: [3:0.75]
+; HASWELL-NEXT: rcrw $7, %di # sched: [3:0.75]
+; HASWELL-NEXT: rclw $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrw $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rclw %cl, %di # sched: [11:2.00]
+; HASWELL-NEXT: rcrw %cl, %di # sched: [11:2.00]
+; HASWELL-NEXT: rclw %cl, (%rdx) # sched: [16:2.00]
+; HASWELL-NEXT: rcrw %cl, (%rdx) # sched: [19:2.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcl_rcr_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rclw %di # sched: [3:0.75]
+; BROADWELL-NEXT: rcrw %di # sched: [3:0.75]
+; BROADWELL-NEXT: rclw (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrw (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rclw $7, %di # sched: [3:0.75]
+; BROADWELL-NEXT: rcrw $7, %di # sched: [3:0.75]
+; BROADWELL-NEXT: rclw $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrw $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rclw %cl, %di # sched: [11:2.00]
+; BROADWELL-NEXT: rcrw %cl, %di # sched: [11:2.00]
+; BROADWELL-NEXT: rclw %cl, (%rdx) # sched: [15:2.00]
+; BROADWELL-NEXT: rcrw %cl, (%rdx) # sched: [18:2.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcl_rcr_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rclw %di # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrw %di # sched: [3:0.75]
+; SKYLAKE-NEXT: rclw (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrw (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rclw $7, %di # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrw $7, %di # sched: [3:0.75]
+; SKYLAKE-NEXT: rclw $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrw $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rclw %cl, %di # sched: [11:2.00]
+; SKYLAKE-NEXT: rcrw %cl, %di # sched: [11:2.00]
+; SKYLAKE-NEXT: rclw %cl, (%rdx) # sched: [15:2.50]
+; SKYLAKE-NEXT: rcrw %cl, (%rdx) # sched: [18:2.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcl_rcr_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rclw %di # sched: [3:0.75]
+; SKX-NEXT: rcrw %di # sched: [3:0.75]
+; SKX-NEXT: rclw (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrw (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rclw $7, %di # sched: [3:0.75]
+; SKX-NEXT: rcrw $7, %di # sched: [3:0.75]
+; SKX-NEXT: rclw $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrw $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rclw %cl, %di # sched: [11:2.00]
+; SKX-NEXT: rcrw %cl, %di # sched: [11:2.00]
+; SKX-NEXT: rclw %cl, (%rdx) # sched: [15:2.50]
+; SKX-NEXT: rcrw %cl, (%rdx) # sched: [18:2.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rcl_rcr_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rclw %di # sched: [1:0.50]
+; BTVER2-NEXT: rcrw %di # sched: [1:0.50]
+; BTVER2-NEXT: rclw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rclw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: rcrw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: rclw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rclw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: rcrw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: rclw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcl_rcr_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rclw %di # sched: [1:0.25]
+; ZNVER1-NEXT: rcrw %di # sched: [1:0.25]
+; ZNVER1-NEXT: rclw (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrw (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rclw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rcrw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rclw $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrw $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rclw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rcrw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rclw %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrw %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rclw $0 \0A\09 rcrw $0 \0A\09 rclw $2 \0A\09 rcrw $2 \0A\09 rclw $3, $0 \0A\09 rcrw $3, $0 \0A\09 rclw $3, $2 \0A\09 rcrw $3, $2 \0A\09 rclw %CL, $0 \0A\09 rcrw %CL, $0 \0A\09 rclw %CL, $2 \0A\09 rcrw %CL, $2", "r,r,*m,i"(i16 %a0, i16 %a1, i16 *%a2, i8 7)
+ ret void
+}
+define void @test_rcl_rcr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_rcl_rcr_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rcll %edi # sched: [1:0.50]
+; GENERIC-NEXT: rcrl %edi # sched: [1:0.50]
+; GENERIC-NEXT: rcll (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrl (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcll $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: rcrl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: rcll $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrl $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcll %cl, %edi # sched: [1:0.50]
+; GENERIC-NEXT: rcrl %cl, %edi # sched: [1:0.50]
+; GENERIC-NEXT: rcll %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrl %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rcl_rcr_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rcll %edi # sched: [1:1.00]
+; ATOM-NEXT: rcrl %edi # sched: [1:1.00]
+; ATOM-NEXT: rcll (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrl (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcll $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: rcrl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: rcll $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrl $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcll %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: rcrl %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: rcll %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrl %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rcl_rcr_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rcll %edi # sched: [1:1.00]
+; SLM-NEXT: rcrl %edi # sched: [1:1.00]
+; SLM-NEXT: rcll (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrl (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcll $7, %edi # sched: [1:1.00]
+; SLM-NEXT: rcrl $7, %edi # sched: [1:1.00]
+; SLM-NEXT: rcll $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcll %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: rcrl %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: rcll %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrl %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rcl_rcr_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rcll %edi # sched: [1:0.50]
+; SANDY-NEXT: rcrl %edi # sched: [1:0.50]
+; SANDY-NEXT: rcll (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrl (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcll $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: rcrl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: rcll $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrl $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcll %cl, %edi # sched: [1:0.50]
+; SANDY-NEXT: rcrl %cl, %edi # sched: [1:0.50]
+; SANDY-NEXT: rcll %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrl %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rcl_rcr_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rcll %edi # sched: [3:0.75]
+; HASWELL-NEXT: rcrl %edi # sched: [3:0.75]
+; HASWELL-NEXT: rcll (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrl (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcll $7, %edi # sched: [3:0.75]
+; HASWELL-NEXT: rcrl $7, %edi # sched: [3:0.75]
+; HASWELL-NEXT: rcll $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrl $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcll %cl, %edi # sched: [11:2.00]
+; HASWELL-NEXT: rcrl %cl, %edi # sched: [11:2.00]
+; HASWELL-NEXT: rcll %cl, (%rdx) # sched: [16:2.00]
+; HASWELL-NEXT: rcrl %cl, (%rdx) # sched: [19:2.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcl_rcr_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rcll %edi # sched: [3:0.75]
+; BROADWELL-NEXT: rcrl %edi # sched: [3:0.75]
+; BROADWELL-NEXT: rcll (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrl (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcll $7, %edi # sched: [3:0.75]
+; BROADWELL-NEXT: rcrl $7, %edi # sched: [3:0.75]
+; BROADWELL-NEXT: rcll $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrl $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcll %cl, %edi # sched: [11:2.00]
+; BROADWELL-NEXT: rcrl %cl, %edi # sched: [11:2.00]
+; BROADWELL-NEXT: rcll %cl, (%rdx) # sched: [15:2.00]
+; BROADWELL-NEXT: rcrl %cl, (%rdx) # sched: [18:2.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcl_rcr_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rcll %edi # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrl %edi # sched: [3:0.75]
+; SKYLAKE-NEXT: rcll (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrl (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcll $7, %edi # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrl $7, %edi # sched: [3:0.75]
+; SKYLAKE-NEXT: rcll $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrl $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcll %cl, %edi # sched: [11:2.00]
+; SKYLAKE-NEXT: rcrl %cl, %edi # sched: [11:2.00]
+; SKYLAKE-NEXT: rcll %cl, (%rdx) # sched: [15:2.50]
+; SKYLAKE-NEXT: rcrl %cl, (%rdx) # sched: [18:2.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcl_rcr_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rcll %edi # sched: [3:0.75]
+; SKX-NEXT: rcrl %edi # sched: [3:0.75]
+; SKX-NEXT: rcll (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrl (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcll $7, %edi # sched: [3:0.75]
+; SKX-NEXT: rcrl $7, %edi # sched: [3:0.75]
+; SKX-NEXT: rcll $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrl $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcll %cl, %edi # sched: [11:2.00]
+; SKX-NEXT: rcrl %cl, %edi # sched: [11:2.00]
+; SKX-NEXT: rcll %cl, (%rdx) # sched: [15:2.50]
+; SKX-NEXT: rcrl %cl, (%rdx) # sched: [18:2.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rcl_rcr_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rcll %edi # sched: [1:0.50]
+; BTVER2-NEXT: rcrl %edi # sched: [1:0.50]
+; BTVER2-NEXT: rcll (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrl (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcll $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: rcrl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: rcll $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcll %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: rcrl %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: rcll %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrl %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcl_rcr_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rcll %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rcrl %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rcll (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrl (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcll $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rcrl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rcll $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrl $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcll %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rcrl %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rcll %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrl %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rcll $0 \0A\09 rcrl $0 \0A\09 rcll $2 \0A\09 rcrl $2 \0A\09 rcll $3, $0 \0A\09 rcrl $3, $0 \0A\09 rcll $3, $2 \0A\09 rcrl $3, $2 \0A\09 rcll %CL, $0 \0A\09 rcrl %CL, $0 \0A\09 rcll %CL, $2 \0A\09 rcrl %CL, $2", "r,r,*m,i"(i32 %a0, i32 %a1, i32 *%a2, i8 7)
+ ret void
+}
+define void @test_rcl_rcr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_rcl_rcr_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rclq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rcrq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rclq (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrq (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rclq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rcrq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rclq $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrq $7, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rclq %cl, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rcrq %cl, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rclq %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rcrq %cl, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rcl_rcr_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rclq %rdi # sched: [1:1.00]
+; ATOM-NEXT: rcrq %rdi # sched: [1:1.00]
+; ATOM-NEXT: rclq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rclq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rcrq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rclq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rclq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rcrq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rclq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rcrq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rcl_rcr_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rclq %rdi # sched: [1:1.00]
+; SLM-NEXT: rcrq %rdi # sched: [1:1.00]
+; SLM-NEXT: rclq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rclq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: rcrq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: rclq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rclq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: rcrq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: rclq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rcrq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rcl_rcr_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rclq %rdi # sched: [1:0.50]
+; SANDY-NEXT: rcrq %rdi # sched: [1:0.50]
+; SANDY-NEXT: rclq (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrq (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rclq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: rcrq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: rclq $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrq $7, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rclq %cl, %rdi # sched: [1:0.50]
+; SANDY-NEXT: rcrq %cl, %rdi # sched: [1:0.50]
+; SANDY-NEXT: rclq %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rcrq %cl, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rcl_rcr_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rclq %rdi # sched: [3:0.75]
+; HASWELL-NEXT: rcrq %rdi # sched: [3:0.75]
+; HASWELL-NEXT: rclq (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrq (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rclq $7, %rdi # sched: [3:0.75]
+; HASWELL-NEXT: rcrq $7, %rdi # sched: [3:0.75]
+; HASWELL-NEXT: rclq $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rcrq $7, (%rdx) # sched: [9:0.75]
+; HASWELL-NEXT: rclq %cl, %rdi # sched: [11:2.00]
+; HASWELL-NEXT: rcrq %cl, %rdi # sched: [11:2.00]
+; HASWELL-NEXT: rclq %cl, (%rdx) # sched: [16:2.00]
+; HASWELL-NEXT: rcrq %cl, (%rdx) # sched: [19:2.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcl_rcr_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rclq %rdi # sched: [3:0.75]
+; BROADWELL-NEXT: rcrq %rdi # sched: [3:0.75]
+; BROADWELL-NEXT: rclq (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrq (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rclq $7, %rdi # sched: [3:0.75]
+; BROADWELL-NEXT: rcrq $7, %rdi # sched: [3:0.75]
+; BROADWELL-NEXT: rclq $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rcrq $7, (%rdx) # sched: [8:0.75]
+; BROADWELL-NEXT: rclq %cl, %rdi # sched: [11:2.00]
+; BROADWELL-NEXT: rcrq %cl, %rdi # sched: [11:2.00]
+; BROADWELL-NEXT: rclq %cl, (%rdx) # sched: [15:2.00]
+; BROADWELL-NEXT: rcrq %cl, (%rdx) # sched: [18:2.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcl_rcr_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rclq %rdi # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrq %rdi # sched: [3:0.75]
+; SKYLAKE-NEXT: rclq (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrq (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rclq $7, %rdi # sched: [3:0.75]
+; SKYLAKE-NEXT: rcrq $7, %rdi # sched: [3:0.75]
+; SKYLAKE-NEXT: rclq $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rcrq $7, (%rdx) # sched: [8:0.75]
+; SKYLAKE-NEXT: rclq %cl, %rdi # sched: [11:2.00]
+; SKYLAKE-NEXT: rcrq %cl, %rdi # sched: [11:2.00]
+; SKYLAKE-NEXT: rclq %cl, (%rdx) # sched: [15:2.50]
+; SKYLAKE-NEXT: rcrq %cl, (%rdx) # sched: [18:2.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcl_rcr_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rclq %rdi # sched: [3:0.75]
+; SKX-NEXT: rcrq %rdi # sched: [3:0.75]
+; SKX-NEXT: rclq (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrq (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rclq $7, %rdi # sched: [3:0.75]
+; SKX-NEXT: rcrq $7, %rdi # sched: [3:0.75]
+; SKX-NEXT: rclq $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rcrq $7, (%rdx) # sched: [8:0.75]
+; SKX-NEXT: rclq %cl, %rdi # sched: [11:2.00]
+; SKX-NEXT: rcrq %cl, %rdi # sched: [11:2.00]
+; SKX-NEXT: rclq %cl, (%rdx) # sched: [15:2.50]
+; SKX-NEXT: rcrq %cl, (%rdx) # sched: [18:2.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rcl_rcr_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rclq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rcrq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rclq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rclq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rcrq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rclq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rclq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rcrq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rclq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rcrq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rcl_rcr_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rclq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rcrq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rclq (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrq (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rclq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rcrq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rclq $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrq $7, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rclq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rcrq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rclq %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: rcrq %cl, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rclq $0 \0A\09 rcrq $0 \0A\09 rclq $2 \0A\09 rcrq $2 \0A\09 rclq $3, $0 \0A\09 rcrq $3, $0 \0A\09 rclq $3, $2 \0A\09 rcrq $3, $2 \0A\09 rclq %CL, $0 \0A\09 rcrq %CL, $0 \0A\09 rclq %CL, $2 \0A\09 rcrq %CL, $2", "r,r,*m,i"(i64 %a0, i64 %a1, i64 *%a2, i8 7)
+ ret void
+}
+
+define void @test_rdmsr_wrmsr() optsize {
+; GENERIC-LABEL: test_rdmsr_wrmsr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rdmsr # sched: [100:0.33]
+; GENERIC-NEXT: wrmsr # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rdmsr_wrmsr:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rdmsr # sched: [78:39.00]
+; ATOM-NEXT: wrmsr # sched: [202:101.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rdmsr_wrmsr:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rdmsr # sched: [100:1.00]
+; SLM-NEXT: wrmsr # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rdmsr_wrmsr:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rdmsr # sched: [100:0.33]
+; SANDY-NEXT: wrmsr # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rdmsr_wrmsr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rdmsr # sched: [100:0.25]
+; HASWELL-NEXT: wrmsr # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rdmsr_wrmsr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rdmsr # sched: [100:0.25]
+; BROADWELL-NEXT: wrmsr # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdmsr_wrmsr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rdmsr # sched: [100:0.25]
+; SKYLAKE-NEXT: wrmsr # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdmsr_wrmsr:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rdmsr # sched: [100:0.25]
+; SKX-NEXT: wrmsr # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rdmsr_wrmsr:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rdmsr # sched: [100:0.17]
+; BTVER2-NEXT: wrmsr # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rdmsr_wrmsr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rdmsr # sched: [100:?]
+; ZNVER1-NEXT: wrmsr # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rdmsr \0A\09 wrmsr", ""()
+ ret void
+}
+
+define void @test_rdpmc() optsize {
+; GENERIC-LABEL: test_rdpmc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rdpmc # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rdpmc:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rdpmc # sched: [46:23.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rdpmc:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rdpmc # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rdpmc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rdpmc # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rdpmc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rdpmc # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rdpmc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rdpmc # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdpmc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rdpmc # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdpmc:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rdpmc # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rdpmc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rdpmc # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rdpmc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rdpmc # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rdpmc", ""()
+ ret void
+}
+
+define void @test_rdtsc_rdtscp() optsize {
+; GENERIC-LABEL: test_rdtsc_rdtscp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rdtsc # sched: [100:0.33]
+; GENERIC-NEXT: rdtscp # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rdtsc_rdtscp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rdtsc # sched: [30:15.00]
+; ATOM-NEXT: rdtscp # sched: [30:15.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rdtsc_rdtscp:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rdtsc # sched: [100:1.00]
+; SLM-NEXT: rdtscp # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rdtsc_rdtscp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rdtsc # sched: [100:0.33]
+; SANDY-NEXT: rdtscp # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rdtsc_rdtscp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rdtsc # sched: [18:2.00]
+; HASWELL-NEXT: rdtscp # sched: [18:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rdtsc_rdtscp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rdtsc # sched: [18:2.00]
+; BROADWELL-NEXT: rdtscp # sched: [18:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rdtsc_rdtscp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rdtsc # sched: [18:2.00]
+; SKYLAKE-NEXT: rdtscp # sched: [18:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rdtsc_rdtscp:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rdtsc # sched: [18:2.00]
+; SKX-NEXT: rdtscp # sched: [18:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rdtsc_rdtscp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rdtsc # sched: [100:0.17]
+; BTVER2-NEXT: rdtscp # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rdtsc_rdtscp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rdtsc # sched: [100:?]
+; ZNVER1-NEXT: rdtscp # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rdtsc \0A\09 rdtscp", ""()
+ ret void
+}
+
+define void @test_ret() optsize {
+; GENERIC-LABEL: test_ret:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+; GENERIC-NEXT: retq $4095 # imm = 0xFFF
+; GENERIC-NEXT: # sched: [5:1.00]
+; GENERIC-NEXT: lretl # sched: [5:1.00]
+; GENERIC-NEXT: lretl $4095 # imm = 0xFFF
+; GENERIC-NEXT: # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_ret:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+; ATOM-NEXT: retq $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: lretl # sched: [79:39.50]
+; ATOM-NEXT: lretl $4095 # imm = 0xFFF
+; ATOM-NEXT: # sched: [79:39.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_ret:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: retq # sched: [4:1.00]
+; SLM-NEXT: retq $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [4:1.00]
+; SLM-NEXT: lretl # sched: [4:1.00]
+; SLM-NEXT: lretl $4095 # imm = 0xFFF
+; SLM-NEXT: # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ret:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+; SANDY-NEXT: retq $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: lretl # sched: [5:1.00]
+; SANDY-NEXT: lretl $4095 # imm = 0xFFF
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_ret:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+; HASWELL-NEXT: retq $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:2.00]
+; HASWELL-NEXT: lretl # sched: [6:0.50]
+; HASWELL-NEXT: lretl $4095 # imm = 0xFFF
+; HASWELL-NEXT: # sched: [1:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ret:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+; BROADWELL-NEXT: retq $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: lretl # sched: [6:0.50]
+; BROADWELL-NEXT: lretl $4095 # imm = 0xFFF
+; BROADWELL-NEXT: # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ret:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+; SKYLAKE-NEXT: retq $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: lretl # sched: [6:0.50]
+; SKYLAKE-NEXT: lretl $4095 # imm = 0xFFF
+; SKYLAKE-NEXT: # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ret:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: retq # sched: [7:1.00]
+; SKX-NEXT: retq $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: lretl # sched: [6:0.50]
+; SKX-NEXT: lretl $4095 # imm = 0xFFF
+; SKX-NEXT: # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_ret:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+; BTVER2-NEXT: retq $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: lretl # sched: [4:1.00]
+; BTVER2-NEXT: lretl $4095 # imm = 0xFFF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ret:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+; ZNVER1-NEXT: retq $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: lretl # sched: [1:0.50]
+; ZNVER1-NEXT: lretl $4095 # imm = 0xFFF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "ret \0A\09 ret $0 \0A\09 lret \0A\09 lret $0", "i"(i16 4095)
+ ret void
+}
+
+define void @test_rol_ror_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
+; GENERIC-LABEL: test_rol_ror_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rolb %dil # sched: [1:0.50]
+; GENERIC-NEXT: rorb %dil # sched: [1:0.50]
+; GENERIC-NEXT: rolb (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rorb (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rolb $7, %dil # sched: [2:1.00]
+; GENERIC-NEXT: rorb $7, %dil # sched: [2:1.00]
+; GENERIC-NEXT: rolb $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rorb $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rolb %cl, %dil # sched: [3:1.50]
+; GENERIC-NEXT: rorb %cl, %dil # sched: [3:1.50]
+; GENERIC-NEXT: rolb %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: rorb %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rol_ror_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rolb %dil # sched: [1:1.00]
+; ATOM-NEXT: rorb %dil # sched: [1:1.00]
+; ATOM-NEXT: rolb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rolb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: rorb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: rolb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rolb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: rorb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: rolb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rol_ror_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rolb %dil # sched: [1:1.00]
+; SLM-NEXT: rorb %dil # sched: [1:1.00]
+; SLM-NEXT: rolb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rolb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: rorb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: rolb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rolb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: rorb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: rolb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rol_ror_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rolb %dil # sched: [1:0.50]
+; SANDY-NEXT: rorb %dil # sched: [1:0.50]
+; SANDY-NEXT: rolb (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rorb (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rolb $7, %dil # sched: [2:1.00]
+; SANDY-NEXT: rorb $7, %dil # sched: [2:1.00]
+; SANDY-NEXT: rolb $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rorb $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rolb %cl, %dil # sched: [3:1.50]
+; SANDY-NEXT: rorb %cl, %dil # sched: [3:1.50]
+; SANDY-NEXT: rolb %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: rorb %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rol_ror_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rolb %dil # sched: [2:1.00]
+; HASWELL-NEXT: rorb %dil # sched: [2:1.00]
+; HASWELL-NEXT: rolb (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorb (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rolb $7, %dil # sched: [2:1.00]
+; HASWELL-NEXT: rorb $7, %dil # sched: [2:1.00]
+; HASWELL-NEXT: rolb $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorb $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rolb %cl, %dil # sched: [3:1.00]
+; HASWELL-NEXT: rorb %cl, %dil # sched: [3:1.00]
+; HASWELL-NEXT: rolb %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: rorb %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rol_ror_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rolb %dil # sched: [2:1.00]
+; BROADWELL-NEXT: rorb %dil # sched: [2:1.00]
+; BROADWELL-NEXT: rolb (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorb (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rolb $7, %dil # sched: [2:1.00]
+; BROADWELL-NEXT: rorb $7, %dil # sched: [2:1.00]
+; BROADWELL-NEXT: rolb $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorb $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rolb %cl, %dil # sched: [3:1.00]
+; BROADWELL-NEXT: rorb %cl, %dil # sched: [3:1.00]
+; BROADWELL-NEXT: rolb %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: rorb %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rol_ror_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rolb %dil # sched: [2:1.00]
+; SKYLAKE-NEXT: rorb %dil # sched: [2:1.00]
+; SKYLAKE-NEXT: rolb (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorb (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rolb $7, %dil # sched: [2:1.00]
+; SKYLAKE-NEXT: rorb $7, %dil # sched: [2:1.00]
+; SKYLAKE-NEXT: rolb $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorb $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rolb %cl, %dil # sched: [3:1.50]
+; SKYLAKE-NEXT: rorb %cl, %dil # sched: [3:1.50]
+; SKYLAKE-NEXT: rolb %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: rorb %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rol_ror_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rolb %dil # sched: [2:1.00]
+; SKX-NEXT: rorb %dil # sched: [2:1.00]
+; SKX-NEXT: rolb (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorb (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rolb $7, %dil # sched: [2:1.00]
+; SKX-NEXT: rorb $7, %dil # sched: [2:1.00]
+; SKX-NEXT: rolb $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorb $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rolb %cl, %dil # sched: [3:1.50]
+; SKX-NEXT: rorb %cl, %dil # sched: [3:1.50]
+; SKX-NEXT: rolb %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: rorb %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rol_ror_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rolb %dil # sched: [1:0.50]
+; BTVER2-NEXT: rorb %dil # sched: [1:0.50]
+; BTVER2-NEXT: rolb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rolb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rorb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rolb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rolb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rorb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: rolb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rol_ror_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rolb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rorb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rolb (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorb (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rolb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rorb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rolb $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorb $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rolb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rorb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: rolb %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorb %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rolb $0 \0A\09 rorb $0 \0A\09 rolb $2 \0A\09 rorb $2 \0A\09 rolb $3, $0 \0A\09 rorb $3, $0 \0A\09 rolb $3, $2 \0A\09 rorb $3, $2 \0A\09 rolb %CL, $0 \0A\09 rorb %CL, $0 \0A\09 rolb %CL, $2 \0A\09 rorb %CL, $2", "r,r,*m,i"(i8 %a0, i8 %a1, i8 *%a2, i8 7)
+ ret void
+}
+define void @test_rol_ror_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_rol_ror_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rolw %di # sched: [1:0.50]
+; GENERIC-NEXT: rorw %di # sched: [1:0.50]
+; GENERIC-NEXT: rolw (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rorw (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rolw $7, %di # sched: [2:1.00]
+; GENERIC-NEXT: rorw $7, %di # sched: [2:1.00]
+; GENERIC-NEXT: rolw $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rorw $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rolw %cl, %di # sched: [3:1.50]
+; GENERIC-NEXT: rorw %cl, %di # sched: [3:1.50]
+; GENERIC-NEXT: rolw %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: rorw %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rol_ror_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rolw %di # sched: [1:1.00]
+; ATOM-NEXT: rorw %di # sched: [1:1.00]
+; ATOM-NEXT: rolw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rolw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: rorw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: rolw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rolw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: rorw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: rolw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rol_ror_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rolw %di # sched: [1:1.00]
+; SLM-NEXT: rorw %di # sched: [1:1.00]
+; SLM-NEXT: rolw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rolw $7, %di # sched: [1:1.00]
+; SLM-NEXT: rorw $7, %di # sched: [1:1.00]
+; SLM-NEXT: rolw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rolw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: rorw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: rolw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rol_ror_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rolw %di # sched: [1:0.50]
+; SANDY-NEXT: rorw %di # sched: [1:0.50]
+; SANDY-NEXT: rolw (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rorw (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rolw $7, %di # sched: [2:1.00]
+; SANDY-NEXT: rorw $7, %di # sched: [2:1.00]
+; SANDY-NEXT: rolw $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rorw $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rolw %cl, %di # sched: [3:1.50]
+; SANDY-NEXT: rorw %cl, %di # sched: [3:1.50]
+; SANDY-NEXT: rolw %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: rorw %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rol_ror_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rolw %di # sched: [2:1.00]
+; HASWELL-NEXT: rorw %di # sched: [2:1.00]
+; HASWELL-NEXT: rolw (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorw (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rolw $7, %di # sched: [2:1.00]
+; HASWELL-NEXT: rorw $7, %di # sched: [2:1.00]
+; HASWELL-NEXT: rolw $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorw $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rolw %cl, %di # sched: [3:1.00]
+; HASWELL-NEXT: rorw %cl, %di # sched: [3:1.00]
+; HASWELL-NEXT: rolw %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: rorw %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rol_ror_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rolw %di # sched: [2:1.00]
+; BROADWELL-NEXT: rorw %di # sched: [2:1.00]
+; BROADWELL-NEXT: rolw (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorw (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rolw $7, %di # sched: [2:1.00]
+; BROADWELL-NEXT: rorw $7, %di # sched: [2:1.00]
+; BROADWELL-NEXT: rolw $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorw $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rolw %cl, %di # sched: [3:1.00]
+; BROADWELL-NEXT: rorw %cl, %di # sched: [3:1.00]
+; BROADWELL-NEXT: rolw %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: rorw %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rol_ror_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rolw %di # sched: [2:1.00]
+; SKYLAKE-NEXT: rorw %di # sched: [2:1.00]
+; SKYLAKE-NEXT: rolw (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorw (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rolw $7, %di # sched: [2:1.00]
+; SKYLAKE-NEXT: rorw $7, %di # sched: [2:1.00]
+; SKYLAKE-NEXT: rolw $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorw $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rolw %cl, %di # sched: [3:1.50]
+; SKYLAKE-NEXT: rorw %cl, %di # sched: [3:1.50]
+; SKYLAKE-NEXT: rolw %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: rorw %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rol_ror_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rolw %di # sched: [2:1.00]
+; SKX-NEXT: rorw %di # sched: [2:1.00]
+; SKX-NEXT: rolw (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorw (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rolw $7, %di # sched: [2:1.00]
+; SKX-NEXT: rorw $7, %di # sched: [2:1.00]
+; SKX-NEXT: rolw $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorw $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rolw %cl, %di # sched: [3:1.50]
+; SKX-NEXT: rorw %cl, %di # sched: [3:1.50]
+; SKX-NEXT: rolw %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: rorw %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rol_ror_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rolw %di # sched: [1:0.50]
+; BTVER2-NEXT: rorw %di # sched: [1:0.50]
+; BTVER2-NEXT: rolw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rolw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: rorw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: rolw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rolw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: rorw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: rolw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rol_ror_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rolw %di # sched: [1:0.25]
+; ZNVER1-NEXT: rorw %di # sched: [1:0.25]
+; ZNVER1-NEXT: rolw (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorw (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rolw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rorw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rolw $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorw $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rolw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rorw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: rolw %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorw %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rolw $0 \0A\09 rorw $0 \0A\09 rolw $2 \0A\09 rorw $2 \0A\09 rolw $3, $0 \0A\09 rorw $3, $0 \0A\09 rolw $3, $2 \0A\09 rorw $3, $2 \0A\09 rolw %CL, $0 \0A\09 rorw %CL, $0 \0A\09 rolw %CL, $2 \0A\09 rorw %CL, $2", "r,r,*m,i"(i16 %a0, i16 %a1, i16 *%a2, i8 7)
+ ret void
+}
+define void @test_rol_ror_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_rol_ror_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: roll %edi # sched: [1:0.50]
+; GENERIC-NEXT: rorl %edi # sched: [1:0.50]
+; GENERIC-NEXT: roll (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rorl (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: roll $7, %edi # sched: [2:1.00]
+; GENERIC-NEXT: rorl $7, %edi # sched: [2:1.00]
+; GENERIC-NEXT: roll $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rorl $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: roll %cl, %edi # sched: [3:1.50]
+; GENERIC-NEXT: rorl %cl, %edi # sched: [3:1.50]
+; GENERIC-NEXT: roll %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: rorl %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rol_ror_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: roll %edi # sched: [1:1.00]
+; ATOM-NEXT: rorl %edi # sched: [1:1.00]
+; ATOM-NEXT: roll (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorl (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: roll $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: rorl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: roll $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorl $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: roll %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: rorl %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: roll %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorl %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rol_ror_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: roll %edi # sched: [1:1.00]
+; SLM-NEXT: rorl %edi # sched: [1:1.00]
+; SLM-NEXT: roll (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorl (%rdx) # sched: [4:2.00]
+; SLM-NEXT: roll $7, %edi # sched: [1:1.00]
+; SLM-NEXT: rorl $7, %edi # sched: [1:1.00]
+; SLM-NEXT: roll $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: roll %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: rorl %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: roll %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorl %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rol_ror_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: roll %edi # sched: [1:0.50]
+; SANDY-NEXT: rorl %edi # sched: [1:0.50]
+; SANDY-NEXT: roll (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rorl (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: roll $7, %edi # sched: [2:1.00]
+; SANDY-NEXT: rorl $7, %edi # sched: [2:1.00]
+; SANDY-NEXT: roll $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rorl $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: roll %cl, %edi # sched: [3:1.50]
+; SANDY-NEXT: rorl %cl, %edi # sched: [3:1.50]
+; SANDY-NEXT: roll %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: rorl %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rol_ror_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: roll %edi # sched: [2:1.00]
+; HASWELL-NEXT: rorl %edi # sched: [2:1.00]
+; HASWELL-NEXT: roll (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorl (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: roll $7, %edi # sched: [2:1.00]
+; HASWELL-NEXT: rorl $7, %edi # sched: [2:1.00]
+; HASWELL-NEXT: roll $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorl $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: roll %cl, %edi # sched: [3:1.00]
+; HASWELL-NEXT: rorl %cl, %edi # sched: [3:1.00]
+; HASWELL-NEXT: roll %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: rorl %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rol_ror_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: roll %edi # sched: [2:1.00]
+; BROADWELL-NEXT: rorl %edi # sched: [2:1.00]
+; BROADWELL-NEXT: roll (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorl (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: roll $7, %edi # sched: [2:1.00]
+; BROADWELL-NEXT: rorl $7, %edi # sched: [2:1.00]
+; BROADWELL-NEXT: roll $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorl $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: roll %cl, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: rorl %cl, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: roll %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: rorl %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rol_ror_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: roll %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: rorl %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: roll (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorl (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: roll $7, %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: rorl $7, %edi # sched: [2:1.00]
+; SKYLAKE-NEXT: roll $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorl $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: roll %cl, %edi # sched: [3:1.50]
+; SKYLAKE-NEXT: rorl %cl, %edi # sched: [3:1.50]
+; SKYLAKE-NEXT: roll %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: rorl %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rol_ror_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: roll %edi # sched: [2:1.00]
+; SKX-NEXT: rorl %edi # sched: [2:1.00]
+; SKX-NEXT: roll (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorl (%rdx) # sched: [7:1.00]
+; SKX-NEXT: roll $7, %edi # sched: [2:1.00]
+; SKX-NEXT: rorl $7, %edi # sched: [2:1.00]
+; SKX-NEXT: roll $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorl $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: roll %cl, %edi # sched: [3:1.50]
+; SKX-NEXT: rorl %cl, %edi # sched: [3:1.50]
+; SKX-NEXT: roll %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: rorl %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rol_ror_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: roll %edi # sched: [1:0.50]
+; BTVER2-NEXT: rorl %edi # sched: [1:0.50]
+; BTVER2-NEXT: roll (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorl (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: roll $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: rorl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: roll $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: roll %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: rorl %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: roll %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorl %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rol_ror_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: roll %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rorl %edi # sched: [1:0.25]
+; ZNVER1-NEXT: roll (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorl (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: roll $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rorl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: roll $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorl $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: roll %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: rorl %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: roll %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorl %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "roll $0 \0A\09 rorl $0 \0A\09 roll $2 \0A\09 rorl $2 \0A\09 roll $3, $0 \0A\09 rorl $3, $0 \0A\09 roll $3, $2 \0A\09 rorl $3, $2 \0A\09 roll %CL, $0 \0A\09 rorl %CL, $0 \0A\09 roll %CL, $2 \0A\09 rorl %CL, $2", "r,r,*m,i"(i32 %a0, i32 %a1, i32 *%a2, i8 7)
+ ret void
+}
+define void @test_rol_ror_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_rol_ror_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: rolq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rorq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: rolq (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rorq (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: rolq $7, %rdi # sched: [2:1.00]
+; GENERIC-NEXT: rorq $7, %rdi # sched: [2:1.00]
+; GENERIC-NEXT: rolq $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rorq $7, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: rolq %cl, %rdi # sched: [3:1.50]
+; GENERIC-NEXT: rorq %cl, %rdi # sched: [3:1.50]
+; GENERIC-NEXT: rolq %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: rorq %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_rol_ror_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: rolq %rdi # sched: [1:1.00]
+; ATOM-NEXT: rorq %rdi # sched: [1:1.00]
+; ATOM-NEXT: rolq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rolq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rorq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rolq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rolq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rorq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: rolq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: rorq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_rol_ror_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: rolq %rdi # sched: [1:1.00]
+; SLM-NEXT: rorq %rdi # sched: [1:1.00]
+; SLM-NEXT: rolq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rolq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: rorq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: rolq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rolq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: rorq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: rolq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: rorq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_rol_ror_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: rolq %rdi # sched: [1:0.50]
+; SANDY-NEXT: rorq %rdi # sched: [1:0.50]
+; SANDY-NEXT: rolq (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rorq (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: rolq $7, %rdi # sched: [2:1.00]
+; SANDY-NEXT: rorq $7, %rdi # sched: [2:1.00]
+; SANDY-NEXT: rolq $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rorq $7, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: rolq %cl, %rdi # sched: [3:1.50]
+; SANDY-NEXT: rorq %cl, %rdi # sched: [3:1.50]
+; SANDY-NEXT: rolq %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: rorq %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_rol_ror_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: rolq %rdi # sched: [2:1.00]
+; HASWELL-NEXT: rorq %rdi # sched: [2:1.00]
+; HASWELL-NEXT: rolq (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorq (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rolq $7, %rdi # sched: [2:1.00]
+; HASWELL-NEXT: rorq $7, %rdi # sched: [2:1.00]
+; HASWELL-NEXT: rolq $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rorq $7, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: rolq %cl, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: rorq %cl, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: rolq %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: rorq %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rol_ror_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: rolq %rdi # sched: [2:1.00]
+; BROADWELL-NEXT: rorq %rdi # sched: [2:1.00]
+; BROADWELL-NEXT: rolq (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorq (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rolq $7, %rdi # sched: [2:1.00]
+; BROADWELL-NEXT: rorq $7, %rdi # sched: [2:1.00]
+; BROADWELL-NEXT: rolq $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rorq $7, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: rolq %cl, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: rorq %cl, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: rolq %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: rorq %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rol_ror_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: rolq %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: rorq %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: rolq (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorq (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rolq $7, %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: rorq $7, %rdi # sched: [2:1.00]
+; SKYLAKE-NEXT: rolq $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rorq $7, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: rolq %cl, %rdi # sched: [3:1.50]
+; SKYLAKE-NEXT: rorq %cl, %rdi # sched: [3:1.50]
+; SKYLAKE-NEXT: rolq %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: rorq %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rol_ror_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: rolq %rdi # sched: [2:1.00]
+; SKX-NEXT: rorq %rdi # sched: [2:1.00]
+; SKX-NEXT: rolq (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorq (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rolq $7, %rdi # sched: [2:1.00]
+; SKX-NEXT: rorq $7, %rdi # sched: [2:1.00]
+; SKX-NEXT: rolq $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rorq $7, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: rolq %cl, %rdi # sched: [3:1.50]
+; SKX-NEXT: rorq %cl, %rdi # sched: [3:1.50]
+; SKX-NEXT: rolq %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: rorq %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_rol_ror_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: rolq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rorq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rolq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rolq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rorq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rolq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rolq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rorq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: rolq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: rorq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_rol_ror_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: rolq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rorq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rolq (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorq (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rolq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rorq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rolq $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorq $7, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rolq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rorq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: rolq %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: rorq %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "rolq $0 \0A\09 rorq $0 \0A\09 rolq $2 \0A\09 rorq $2 \0A\09 rolq $3, $0 \0A\09 rorq $3, $0 \0A\09 rolq $3, $2 \0A\09 rorq $3, $2 \0A\09 rolq %CL, $0 \0A\09 rorq %CL, $0 \0A\09 rolq %CL, $2 \0A\09 rorq %CL, $2", "r,r,*m,i"(i64 %a0, i64 %a1, i64 *%a2, i8 7)
+ ret void
+}
+
+define void @test_sar_shl_shr_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
+; GENERIC-LABEL: test_sar_shl_shr_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sarb %dil # sched: [1:0.50]
+; GENERIC-NEXT: shlb %dil # sched: [1:0.50]
+; GENERIC-NEXT: shrb %dil # sched: [1:0.50]
+; GENERIC-NEXT: sarb (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: shlb (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrb (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: sarb $7, %dil # sched: [1:0.50]
+; GENERIC-NEXT: shlb $7, %dil # sched: [1:0.50]
+; GENERIC-NEXT: shrb $7, %dil # sched: [1:0.50]
+; GENERIC-NEXT: sarb $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shlb $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrb $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: sarb %cl, %dil # sched: [3:1.50]
+; GENERIC-NEXT: shlb %cl, %dil # sched: [3:1.50]
+; GENERIC-NEXT: shrb %cl, %dil # sched: [3:1.50]
+; GENERIC-NEXT: sarb %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shlb %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shrb %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sar_shl_shr_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sarb %dil # sched: [1:1.00]
+; ATOM-NEXT: shlb %dil # sched: [1:1.00]
+; ATOM-NEXT: shrb %dil # sched: [1:1.00]
+; ATOM-NEXT: sarb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrb (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: shlb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: shrb $7, %dil # sched: [1:1.00]
+; ATOM-NEXT: sarb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrb $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: shlb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: shrb %cl, %dil # sched: [1:1.00]
+; ATOM-NEXT: sarb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrb %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sar_shl_shr_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sarb %dil # sched: [1:1.00]
+; SLM-NEXT: shlb %dil # sched: [1:1.00]
+; SLM-NEXT: shrb %dil # sched: [1:1.00]
+; SLM-NEXT: sarb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrb (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: shlb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: shrb $7, %dil # sched: [1:1.00]
+; SLM-NEXT: sarb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrb $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: shlb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: shrb %cl, %dil # sched: [1:1.00]
+; SLM-NEXT: sarb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrb %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sar_shl_shr_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sarb %dil # sched: [1:0.50]
+; SANDY-NEXT: shlb %dil # sched: [1:0.50]
+; SANDY-NEXT: shrb %dil # sched: [1:0.50]
+; SANDY-NEXT: sarb (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: shlb (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrb (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: sarb $7, %dil # sched: [1:0.50]
+; SANDY-NEXT: shlb $7, %dil # sched: [1:0.50]
+; SANDY-NEXT: shrb $7, %dil # sched: [1:0.50]
+; SANDY-NEXT: sarb $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shlb $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrb $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: sarb %cl, %dil # sched: [3:1.50]
+; SANDY-NEXT: shlb %cl, %dil # sched: [3:1.50]
+; SANDY-NEXT: shrb %cl, %dil # sched: [3:1.50]
+; SANDY-NEXT: sarb %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shlb %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shrb %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sar_shl_shr_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sarb %dil # sched: [1:0.50]
+; HASWELL-NEXT: shlb %dil # sched: [1:0.50]
+; HASWELL-NEXT: shrb %dil # sched: [1:0.50]
+; HASWELL-NEXT: sarb (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shlb (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrb (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarb $7, %dil # sched: [1:0.50]
+; HASWELL-NEXT: shlb $7, %dil # sched: [1:0.50]
+; HASWELL-NEXT: shrb $7, %dil # sched: [1:0.50]
+; HASWELL-NEXT: sarb $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shlb $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrb $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarb %cl, %dil # sched: [3:1.00]
+; HASWELL-NEXT: shlb %cl, %dil # sched: [3:1.00]
+; HASWELL-NEXT: shrb %cl, %dil # sched: [3:1.00]
+; HASWELL-NEXT: sarb %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shlb %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shrb %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sar_shl_shr_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sarb %dil # sched: [1:0.50]
+; BROADWELL-NEXT: shlb %dil # sched: [1:0.50]
+; BROADWELL-NEXT: shrb %dil # sched: [1:0.50]
+; BROADWELL-NEXT: sarb (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shlb (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrb (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarb $7, %dil # sched: [1:0.50]
+; BROADWELL-NEXT: shlb $7, %dil # sched: [1:0.50]
+; BROADWELL-NEXT: shrb $7, %dil # sched: [1:0.50]
+; BROADWELL-NEXT: sarb $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shlb $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrb $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarb %cl, %dil # sched: [3:1.00]
+; BROADWELL-NEXT: shlb %cl, %dil # sched: [3:1.00]
+; BROADWELL-NEXT: shrb %cl, %dil # sched: [3:1.00]
+; BROADWELL-NEXT: sarb %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shlb %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shrb %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sar_shl_shr_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sarb %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: shlb %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: shrb %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: sarb (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shlb (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrb (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarb $7, %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: shlb $7, %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: shrb $7, %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: sarb $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shlb $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrb $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarb %cl, %dil # sched: [3:1.50]
+; SKYLAKE-NEXT: shlb %cl, %dil # sched: [3:1.50]
+; SKYLAKE-NEXT: shrb %cl, %dil # sched: [3:1.50]
+; SKYLAKE-NEXT: sarb %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shlb %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shrb %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sar_shl_shr_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sarb %dil # sched: [1:0.50]
+; SKX-NEXT: shlb %dil # sched: [1:0.50]
+; SKX-NEXT: shrb %dil # sched: [1:0.50]
+; SKX-NEXT: sarb (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shlb (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrb (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarb $7, %dil # sched: [1:0.50]
+; SKX-NEXT: shlb $7, %dil # sched: [1:0.50]
+; SKX-NEXT: shrb $7, %dil # sched: [1:0.50]
+; SKX-NEXT: sarb $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shlb $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrb $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarb %cl, %dil # sched: [3:1.50]
+; SKX-NEXT: shlb %cl, %dil # sched: [3:1.50]
+; SKX-NEXT: shrb %cl, %dil # sched: [3:1.50]
+; SKX-NEXT: sarb %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shlb %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shrb %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sar_shl_shr_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sarb %dil # sched: [1:0.50]
+; BTVER2-NEXT: shlb %dil # sched: [1:0.50]
+; BTVER2-NEXT: shrb %dil # sched: [1:0.50]
+; BTVER2-NEXT: sarb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrb (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: shlb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: shrb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: sarb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrb $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: shlb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: shrb %cl, %dil # sched: [1:0.50]
+; BTVER2-NEXT: sarb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrb %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sar_shl_shr_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sarb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: shlb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: shrb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sarb (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shlb (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrb (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: shlb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: shrb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sarb $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shlb $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrb $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: shlb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: shrb %cl, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sarb %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shlb %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shrb %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "sarb $0 \0A\09 shlb $0 \0A\09 shrb $0 \0A\09 sarb $2 \0A\09 shlb $2 \0A\09 shrb $2 \0A\09 sarb $3, $0 \0A\09 shlb $3, $0 \0A\09 shrb $3, $0 \0A\09 sarb $3, $2 \0A\09 shlb $3, $2 \0A\09 shrb $3, $2 \0A\09 sarb %CL, $0 \0A\09 shlb %CL, $0 \0A\09 shrb %CL, $0 \0A\09 sarb %CL, $2 \0A\09 shlb %CL, $2 \0A\09 shrb %CL, $2", "r,r,*m,i"(i8 %a0, i8 %a1, i8 *%a2, i8 7)
+ ret void
+}
+define void @test_sar_shl_shr_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_sar_shl_shr_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sarw %di # sched: [1:0.50]
+; GENERIC-NEXT: shlw %di # sched: [1:0.50]
+; GENERIC-NEXT: shrw %di # sched: [1:0.50]
+; GENERIC-NEXT: sarw (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: shlw (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrw (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: sarw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: shlw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: shrw $7, %di # sched: [1:0.50]
+; GENERIC-NEXT: sarw $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shlw $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrw $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: sarw %cl, %di # sched: [3:1.50]
+; GENERIC-NEXT: shlw %cl, %di # sched: [3:1.50]
+; GENERIC-NEXT: shrw %cl, %di # sched: [3:1.50]
+; GENERIC-NEXT: sarw %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shlw %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shrw %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sar_shl_shr_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sarw %di # sched: [1:1.00]
+; ATOM-NEXT: shlw %di # sched: [1:1.00]
+; ATOM-NEXT: shrw %di # sched: [1:1.00]
+; ATOM-NEXT: sarw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrw (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: shlw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: shrw $7, %di # sched: [1:1.00]
+; ATOM-NEXT: sarw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrw $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: shlw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: shrw %cl, %di # sched: [1:1.00]
+; ATOM-NEXT: sarw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrw %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sar_shl_shr_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sarw %di # sched: [1:1.00]
+; SLM-NEXT: shlw %di # sched: [1:1.00]
+; SLM-NEXT: shrw %di # sched: [1:1.00]
+; SLM-NEXT: sarw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrw (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarw $7, %di # sched: [1:1.00]
+; SLM-NEXT: shlw $7, %di # sched: [1:1.00]
+; SLM-NEXT: shrw $7, %di # sched: [1:1.00]
+; SLM-NEXT: sarw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrw $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: shlw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: shrw %cl, %di # sched: [1:1.00]
+; SLM-NEXT: sarw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrw %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sar_shl_shr_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sarw %di # sched: [1:0.50]
+; SANDY-NEXT: shlw %di # sched: [1:0.50]
+; SANDY-NEXT: shrw %di # sched: [1:0.50]
+; SANDY-NEXT: sarw (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: shlw (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrw (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: sarw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: shlw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: shrw $7, %di # sched: [1:0.50]
+; SANDY-NEXT: sarw $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shlw $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrw $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: sarw %cl, %di # sched: [3:1.50]
+; SANDY-NEXT: shlw %cl, %di # sched: [3:1.50]
+; SANDY-NEXT: shrw %cl, %di # sched: [3:1.50]
+; SANDY-NEXT: sarw %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shlw %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shrw %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sar_shl_shr_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sarw %di # sched: [1:0.50]
+; HASWELL-NEXT: shlw %di # sched: [1:0.50]
+; HASWELL-NEXT: shrw %di # sched: [1:0.50]
+; HASWELL-NEXT: sarw (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shlw (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrw (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: shlw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: shrw $7, %di # sched: [1:0.50]
+; HASWELL-NEXT: sarw $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shlw $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrw $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarw %cl, %di # sched: [3:1.00]
+; HASWELL-NEXT: shlw %cl, %di # sched: [3:1.00]
+; HASWELL-NEXT: shrw %cl, %di # sched: [3:1.00]
+; HASWELL-NEXT: sarw %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shlw %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shrw %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sar_shl_shr_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sarw %di # sched: [1:0.50]
+; BROADWELL-NEXT: shlw %di # sched: [1:0.50]
+; BROADWELL-NEXT: shrw %di # sched: [1:0.50]
+; BROADWELL-NEXT: sarw (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shlw (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrw (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: shlw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: shrw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: sarw $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shlw $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrw $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarw %cl, %di # sched: [3:1.00]
+; BROADWELL-NEXT: shlw %cl, %di # sched: [3:1.00]
+; BROADWELL-NEXT: shrw %cl, %di # sched: [3:1.00]
+; BROADWELL-NEXT: sarw %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shlw %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shrw %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sar_shl_shr_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sarw %di # sched: [1:0.50]
+; SKYLAKE-NEXT: shlw %di # sched: [1:0.50]
+; SKYLAKE-NEXT: shrw %di # sched: [1:0.50]
+; SKYLAKE-NEXT: sarw (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shlw (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrw (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: shlw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: shrw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: sarw $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shlw $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrw $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarw %cl, %di # sched: [3:1.50]
+; SKYLAKE-NEXT: shlw %cl, %di # sched: [3:1.50]
+; SKYLAKE-NEXT: shrw %cl, %di # sched: [3:1.50]
+; SKYLAKE-NEXT: sarw %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shlw %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shrw %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sar_shl_shr_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sarw %di # sched: [1:0.50]
+; SKX-NEXT: shlw %di # sched: [1:0.50]
+; SKX-NEXT: shrw %di # sched: [1:0.50]
+; SKX-NEXT: sarw (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shlw (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrw (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarw $7, %di # sched: [1:0.50]
+; SKX-NEXT: shlw $7, %di # sched: [1:0.50]
+; SKX-NEXT: shrw $7, %di # sched: [1:0.50]
+; SKX-NEXT: sarw $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shlw $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrw $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarw %cl, %di # sched: [3:1.50]
+; SKX-NEXT: shlw %cl, %di # sched: [3:1.50]
+; SKX-NEXT: shrw %cl, %di # sched: [3:1.50]
+; SKX-NEXT: sarw %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shlw %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shrw %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sar_shl_shr_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sarw %di # sched: [1:0.50]
+; BTVER2-NEXT: shlw %di # sched: [1:0.50]
+; BTVER2-NEXT: shrw %di # sched: [1:0.50]
+; BTVER2-NEXT: sarw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrw (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: shlw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: shrw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: sarw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrw $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: shlw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: shrw %cl, %di # sched: [1:0.50]
+; BTVER2-NEXT: sarw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrw %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sar_shl_shr_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sarw %di # sched: [1:0.25]
+; ZNVER1-NEXT: shlw %di # sched: [1:0.25]
+; ZNVER1-NEXT: shrw %di # sched: [1:0.25]
+; ZNVER1-NEXT: sarw (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shlw (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrw (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: shlw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: shrw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: sarw $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shlw $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrw $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: shlw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: shrw %cl, %di # sched: [1:0.25]
+; ZNVER1-NEXT: sarw %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shlw %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shrw %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "sarw $0 \0A\09 shlw $0 \0A\09 shrw $0 \0A\09 sarw $2 \0A\09 shlw $2 \0A\09 shrw $2 \0A\09 sarw $3, $0 \0A\09 shlw $3, $0 \0A\09 shrw $3, $0 \0A\09 sarw $3, $2 \0A\09 shlw $3, $2 \0A\09 shrw $3, $2 \0A\09 sarw %CL, $0 \0A\09 shlw %CL, $0 \0A\09 shrw %CL, $0 \0A\09 sarw %CL, $2 \0A\09 shlw %CL, $2 \0A\09 shrw %CL, $2", "r,r,*m,i"(i16 %a0, i16 %a1, i16 *%a2, i8 7)
+ ret void
+}
+define void @test_sar_shl_shr_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_sar_shl_shr_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sarl %edi # sched: [1:0.50]
+; GENERIC-NEXT: shll %edi # sched: [1:0.50]
+; GENERIC-NEXT: shrl %edi # sched: [1:0.50]
+; GENERIC-NEXT: sarl (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: shll (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrl (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: sarl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: shll $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: shrl $7, %edi # sched: [1:0.50]
+; GENERIC-NEXT: sarl $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shll $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrl $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: sarl %cl, %edi # sched: [3:1.50]
+; GENERIC-NEXT: shll %cl, %edi # sched: [3:1.50]
+; GENERIC-NEXT: shrl %cl, %edi # sched: [3:1.50]
+; GENERIC-NEXT: sarl %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shll %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shrl %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sar_shl_shr_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sarl %edi # sched: [1:1.00]
+; ATOM-NEXT: shll %edi # sched: [1:1.00]
+; ATOM-NEXT: shrl %edi # sched: [1:1.00]
+; ATOM-NEXT: sarl (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shll (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrl (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: shll $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: shrl $7, %edi # sched: [1:1.00]
+; ATOM-NEXT: sarl $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shll $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrl $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarl %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: shll %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: shrl %cl, %edi # sched: [1:1.00]
+; ATOM-NEXT: sarl %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shll %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrl %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sar_shl_shr_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sarl %edi # sched: [1:1.00]
+; SLM-NEXT: shll %edi # sched: [1:1.00]
+; SLM-NEXT: shrl %edi # sched: [1:1.00]
+; SLM-NEXT: sarl (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shll (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrl (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarl $7, %edi # sched: [1:1.00]
+; SLM-NEXT: shll $7, %edi # sched: [1:1.00]
+; SLM-NEXT: shrl $7, %edi # sched: [1:1.00]
+; SLM-NEXT: sarl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shll $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrl $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarl %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: shll %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: shrl %cl, %edi # sched: [1:1.00]
+; SLM-NEXT: sarl %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shll %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrl %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sar_shl_shr_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sarl %edi # sched: [1:0.50]
+; SANDY-NEXT: shll %edi # sched: [1:0.50]
+; SANDY-NEXT: shrl %edi # sched: [1:0.50]
+; SANDY-NEXT: sarl (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: shll (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrl (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: sarl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: shll $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: shrl $7, %edi # sched: [1:0.50]
+; SANDY-NEXT: sarl $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shll $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrl $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: sarl %cl, %edi # sched: [3:1.50]
+; SANDY-NEXT: shll %cl, %edi # sched: [3:1.50]
+; SANDY-NEXT: shrl %cl, %edi # sched: [3:1.50]
+; SANDY-NEXT: sarl %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shll %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shrl %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sar_shl_shr_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sarl %edi # sched: [1:0.50]
+; HASWELL-NEXT: shll %edi # sched: [1:0.50]
+; HASWELL-NEXT: shrl %edi # sched: [1:0.50]
+; HASWELL-NEXT: sarl (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shll (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrl (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarl $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: shll $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: shrl $7, %edi # sched: [1:0.50]
+; HASWELL-NEXT: sarl $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shll $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrl $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarl %cl, %edi # sched: [3:1.00]
+; HASWELL-NEXT: shll %cl, %edi # sched: [3:1.00]
+; HASWELL-NEXT: shrl %cl, %edi # sched: [3:1.00]
+; HASWELL-NEXT: sarl %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shll %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shrl %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sar_shl_shr_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sarl %edi # sched: [1:0.50]
+; BROADWELL-NEXT: shll %edi # sched: [1:0.50]
+; BROADWELL-NEXT: shrl %edi # sched: [1:0.50]
+; BROADWELL-NEXT: sarl (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shll (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrl (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: shll $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: shrl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: sarl $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shll $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrl $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarl %cl, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: shll %cl, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: shrl %cl, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: sarl %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shll %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shrl %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sar_shl_shr_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sarl %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: shll %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: shrl %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: sarl (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shll (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrl (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: shll $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: shrl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: sarl $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shll $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrl $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarl %cl, %edi # sched: [3:1.50]
+; SKYLAKE-NEXT: shll %cl, %edi # sched: [3:1.50]
+; SKYLAKE-NEXT: shrl %cl, %edi # sched: [3:1.50]
+; SKYLAKE-NEXT: sarl %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shll %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shrl %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sar_shl_shr_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sarl %edi # sched: [1:0.50]
+; SKX-NEXT: shll %edi # sched: [1:0.50]
+; SKX-NEXT: shrl %edi # sched: [1:0.50]
+; SKX-NEXT: sarl (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shll (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrl (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: shll $7, %edi # sched: [1:0.50]
+; SKX-NEXT: shrl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: sarl $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shll $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrl $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarl %cl, %edi # sched: [3:1.50]
+; SKX-NEXT: shll %cl, %edi # sched: [3:1.50]
+; SKX-NEXT: shrl %cl, %edi # sched: [3:1.50]
+; SKX-NEXT: sarl %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shll %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shrl %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sar_shl_shr_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sarl %edi # sched: [1:0.50]
+; BTVER2-NEXT: shll %edi # sched: [1:0.50]
+; BTVER2-NEXT: shrl %edi # sched: [1:0.50]
+; BTVER2-NEXT: sarl (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shll (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrl (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: shll $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: shrl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: sarl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shll $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrl $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarl %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: shll %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: shrl %cl, %edi # sched: [1:0.50]
+; BTVER2-NEXT: sarl %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shll %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrl %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sar_shl_shr_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sarl %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shll %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shrl %edi # sched: [1:0.25]
+; ZNVER1-NEXT: sarl (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shll (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrl (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shll $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shrl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: sarl $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shll $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrl $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarl %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shll %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shrl %cl, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: sarl %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shll %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shrl %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "sarl $0 \0A\09 shll $0 \0A\09 shrl $0 \0A\09 sarl $2 \0A\09 shll $2 \0A\09 shrl $2 \0A\09 sarl $3, $0 \0A\09 shll $3, $0 \0A\09 shrl $3, $0 \0A\09 sarl $3, $2 \0A\09 shll $3, $2 \0A\09 shrl $3, $2 \0A\09 sarl %CL, $0 \0A\09 shll %CL, $0 \0A\09 shrl %CL, $0 \0A\09 sarl %CL, $2 \0A\09 shll %CL, $2 \0A\09 shrl %CL, $2", "r,r,*m,i"(i32 %a0, i32 %a1, i32 *%a2, i8 7)
+ ret void
+}
+define void @test_sar_shl_shr_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_sar_shl_shr_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sarq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: shlq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: shrq %rdi # sched: [1:0.50]
+; GENERIC-NEXT: sarq (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: shlq (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrq (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; GENERIC-NEXT: sarq $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shlq $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: shrq $7, (%rdx) # sched: [7:1.00]
+; GENERIC-NEXT: sarq %cl, %rdi # sched: [3:1.50]
+; GENERIC-NEXT: shlq %cl, %rdi # sched: [3:1.50]
+; GENERIC-NEXT: shrq %cl, %rdi # sched: [3:1.50]
+; GENERIC-NEXT: sarq %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shlq %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: shrq %cl, (%rdx) # sched: [9:1.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sar_shl_shr_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sarq %rdi # sched: [1:1.00]
+; ATOM-NEXT: shlq %rdi # sched: [1:1.00]
+; ATOM-NEXT: shrq %rdi # sched: [1:1.00]
+; ATOM-NEXT: sarq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrq (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: shlq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: shrq $7, %rdi # sched: [1:1.00]
+; ATOM-NEXT: sarq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrq $7, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: sarq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: shlq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: shrq %cl, %rdi # sched: [1:1.00]
+; ATOM-NEXT: sarq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shlq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: shrq %cl, (%rdx) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sar_shl_shr_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sarq %rdi # sched: [1:1.00]
+; SLM-NEXT: shlq %rdi # sched: [1:1.00]
+; SLM-NEXT: shrq %rdi # sched: [1:1.00]
+; SLM-NEXT: sarq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrq (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: shlq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: shrq $7, %rdi # sched: [1:1.00]
+; SLM-NEXT: sarq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrq $7, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: sarq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: shlq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: shrq %cl, %rdi # sched: [1:1.00]
+; SLM-NEXT: sarq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shlq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrq %cl, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sar_shl_shr_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sarq %rdi # sched: [1:0.50]
+; SANDY-NEXT: shlq %rdi # sched: [1:0.50]
+; SANDY-NEXT: shrq %rdi # sched: [1:0.50]
+; SANDY-NEXT: sarq (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: shlq (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrq (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; SANDY-NEXT: sarq $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shlq $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: shrq $7, (%rdx) # sched: [7:1.00]
+; SANDY-NEXT: sarq %cl, %rdi # sched: [3:1.50]
+; SANDY-NEXT: shlq %cl, %rdi # sched: [3:1.50]
+; SANDY-NEXT: shrq %cl, %rdi # sched: [3:1.50]
+; SANDY-NEXT: sarq %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shlq %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: shrq %cl, (%rdx) # sched: [9:1.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sar_shl_shr_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sarq %rdi # sched: [1:0.50]
+; HASWELL-NEXT: shlq %rdi # sched: [1:0.50]
+; HASWELL-NEXT: shrq %rdi # sched: [1:0.50]
+; HASWELL-NEXT: sarq (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shlq (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrq (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; HASWELL-NEXT: sarq $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shlq $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: shrq $7, (%rdx) # sched: [7:1.00]
+; HASWELL-NEXT: sarq %cl, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: shlq %cl, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: shrq %cl, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: sarq %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shlq %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: shrq %cl, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sar_shl_shr_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sarq %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: shlq %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: shrq %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: sarq (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shlq (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrq (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: sarq $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shlq $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: shrq $7, (%rdx) # sched: [6:1.00]
+; BROADWELL-NEXT: sarq %cl, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: shlq %cl, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: shrq %cl, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: sarq %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shlq %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: shrq %cl, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sar_shl_shr_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sarq %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: shlq %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: shrq %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: sarq (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shlq (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrq (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: sarq $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shlq $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: shrq $7, (%rdx) # sched: [6:1.00]
+; SKYLAKE-NEXT: sarq %cl, %rdi # sched: [3:1.50]
+; SKYLAKE-NEXT: shlq %cl, %rdi # sched: [3:1.50]
+; SKYLAKE-NEXT: shrq %cl, %rdi # sched: [3:1.50]
+; SKYLAKE-NEXT: sarq %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shlq %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: shrq %cl, (%rdx) # sched: [8:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sar_shl_shr_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sarq %rdi # sched: [1:0.50]
+; SKX-NEXT: shlq %rdi # sched: [1:0.50]
+; SKX-NEXT: shrq %rdi # sched: [1:0.50]
+; SKX-NEXT: sarq (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shlq (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrq (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: sarq $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shlq $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: shrq $7, (%rdx) # sched: [6:1.00]
+; SKX-NEXT: sarq %cl, %rdi # sched: [3:1.50]
+; SKX-NEXT: shlq %cl, %rdi # sched: [3:1.50]
+; SKX-NEXT: shrq %cl, %rdi # sched: [3:1.50]
+; SKX-NEXT: sarq %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shlq %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: shrq %cl, (%rdx) # sched: [8:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sar_shl_shr_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sarq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shlq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shrq %rdi # sched: [1:0.50]
+; BTVER2-NEXT: sarq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrq (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shlq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shrq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: sarq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrq $7, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: sarq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shlq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: shrq %cl, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: sarq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shlq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: shrq %cl, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sar_shl_shr_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sarq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shlq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shrq %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: sarq (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shlq (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrq (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shlq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shrq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: sarq $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shlq $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrq $7, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: sarq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shlq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shrq %cl, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: sarq %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shlq %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: shrq %cl, (%rdx) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "sarq $0 \0A\09 shlq $0 \0A\09 shrq $0 \0A\09 sarq $2 \0A\09 shlq $2 \0A\09 shrq $2 \0A\09 sarq $3, $0 \0A\09 shlq $3, $0 \0A\09 shrq $3, $0 \0A\09 sarq $3, $2 \0A\09 shlq $3, $2 \0A\09 shrq $3, $2 \0A\09 sarq %CL, $0 \0A\09 shlq %CL, $0 \0A\09 shrq %CL, $0 \0A\09 sarq %CL, $2 \0A\09 shlq %CL, $2 \0A\09 shrq %CL, $2", "r,r,*m,i"(i64 %a0, i64 %a1, i64 *%a2, i8 7)
+ ret void
+}
+
+define void @test_sbb_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_sbb_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sbbb $7, %al # sched: [3:1.00]
+; GENERIC-NEXT: sbbb $7, %dil # sched: [2:0.67]
+; GENERIC-NEXT: sbbb $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbb %dil, %dil # sched: [2:0.67]
+; GENERIC-NEXT: sbbb %dil, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbb (%rsi), %dil # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sbb_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sbbb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: sbbb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: sbbb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: sbbb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sbb_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sbbb $7, %al # sched: [1:0.50]
+; SLM-NEXT: sbbb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: sbbb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: sbbb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sbb_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sbbb $7, %al # sched: [3:1.00]
+; SANDY-NEXT: sbbb $7, %dil # sched: [2:0.67]
+; SANDY-NEXT: sbbb $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbb %dil, %dil # sched: [2:0.67]
+; SANDY-NEXT: sbbb %dil, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbb (%rsi), %dil # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sbb_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sbbb $7, %al # sched: [2:0.50]
+; HASWELL-NEXT: sbbb $7, %dil # sched: [2:0.50]
+; HASWELL-NEXT: sbbb $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbb %dil, %dil # sched: [2:0.50]
+; HASWELL-NEXT: sbbb %dil, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbb (%rsi), %dil # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sbb_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sbbb $7, %al # sched: [2:0.50]
+; BROADWELL-NEXT: sbbb $7, %dil # sched: [2:0.50]
+; BROADWELL-NEXT: sbbb $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbb %dil, %dil # sched: [1:0.50]
+; BROADWELL-NEXT: sbbb %dil, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sbb_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sbbb $7, %al # sched: [2:0.50]
+; SKYLAKE-NEXT: sbbb $7, %dil # sched: [2:0.50]
+; SKYLAKE-NEXT: sbbb $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbb %dil, %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbb %dil, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sbb_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sbbb $7, %al # sched: [2:0.50]
+; SKX-NEXT: sbbb $7, %dil # sched: [2:0.50]
+; SKX-NEXT: sbbb $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbb %dil, %dil # sched: [1:0.50]
+; SKX-NEXT: sbbb %dil, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sbb_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sbbb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: sbbb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: sbbb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: sbbb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sbb_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sbbb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: sbbb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sbbb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sbbb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "sbbb $2, %AL \0A\09 sbbb $2, $0 \0A\09 sbbb $2, $1 \0A\09 sbbb $0, $0 \0A\09 sbbb $0, $1 \0A\09 sbbb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_sbb_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_sbb_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sbbw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: sbbw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [2:0.67]
+; GENERIC-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [9:1.00]
+; GENERIC-NEXT: sbbw $7, %di # sched: [2:0.67]
+; GENERIC-NEXT: sbbw $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbw %di, %di # sched: [2:0.67]
+; GENERIC-NEXT: sbbw %di, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbw (%rsi), %di # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sbb_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sbbw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: sbbw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: sbbw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: sbbw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: sbbw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sbb_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sbbw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: sbbw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: sbbw $7, %di # sched: [1:0.50]
+; SLM-NEXT: sbbw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbw %di, %di # sched: [1:0.50]
+; SLM-NEXT: sbbw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sbb_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sbbw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: sbbw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [2:0.67]
+; SANDY-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [9:1.00]
+; SANDY-NEXT: sbbw $7, %di # sched: [2:0.67]
+; SANDY-NEXT: sbbw $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbw %di, %di # sched: [2:0.67]
+; SANDY-NEXT: sbbw %di, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbw (%rsi), %di # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sbb_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sbbw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: sbbw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [2:0.50]
+; HASWELL-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [9:1.00]
+; HASWELL-NEXT: sbbw $7, %di # sched: [2:0.50]
+; HASWELL-NEXT: sbbw $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbw %di, %di # sched: [2:0.50]
+; HASWELL-NEXT: sbbw %di, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbw (%rsi), %di # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sbb_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sbbw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: sbbw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.50]
+; BROADWELL-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [8:1.00]
+; BROADWELL-NEXT: sbbw $7, %di # sched: [1:0.50]
+; BROADWELL-NEXT: sbbw $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbw %di, %di # sched: [1:0.50]
+; BROADWELL-NEXT: sbbw %di, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sbb_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sbbw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: sbbw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbw $7, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbw $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbw %di, %di # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbw %di, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sbb_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sbbw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: sbbw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.50]
+; SKX-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [8:1.00]
+; SKX-NEXT: sbbw $7, %di # sched: [1:0.50]
+; SKX-NEXT: sbbw $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbw %di, %di # sched: [1:0.50]
+; SKX-NEXT: sbbw %di, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sbb_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sbbw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: sbbw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: sbbw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: sbbw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: sbbw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sbb_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sbbw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: sbbw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: sbbw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: sbbw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: sbbw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: sbbw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "sbbw $2, %AX \0A\09 sbbw $2, $0 \0A\09 sbbw $2, $1 \0A\09 sbbw $3, $0 \0A\09 sbbw $3, $1 \0A\09 sbbw $0, $0 \0A\09 sbbw $0, $1 \0A\09 sbbw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_sbb_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_sbb_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [2:0.67]
+; GENERIC-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [9:1.00]
+; GENERIC-NEXT: sbbl $7, %edi # sched: [2:0.67]
+; GENERIC-NEXT: sbbl $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbl %edi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: sbbl %edi, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbl (%rsi), %edi # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sbb_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: sbbl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: sbbl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: sbbl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sbb_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: sbbl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: sbbl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: sbbl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sbb_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [2:0.67]
+; SANDY-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [9:1.00]
+; SANDY-NEXT: sbbl $7, %edi # sched: [2:0.67]
+; SANDY-NEXT: sbbl $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbl %edi, %edi # sched: [2:0.67]
+; SANDY-NEXT: sbbl %edi, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbl (%rsi), %edi # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sbb_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [2:0.50]
+; HASWELL-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [9:1.00]
+; HASWELL-NEXT: sbbl $7, %edi # sched: [2:0.50]
+; HASWELL-NEXT: sbbl $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbl %edi, %edi # sched: [2:0.50]
+; HASWELL-NEXT: sbbl %edi, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbl (%rsi), %edi # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sbb_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.50]
+; BROADWELL-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [8:1.00]
+; BROADWELL-NEXT: sbbl $7, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: sbbl $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbl %edi, %edi # sched: [1:0.50]
+; BROADWELL-NEXT: sbbl %edi, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sbb_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbl $7, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbl $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbl %edi, %edi # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbl %edi, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sbb_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.50]
+; SKX-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [8:1.00]
+; SKX-NEXT: sbbl $7, %edi # sched: [1:0.50]
+; SKX-NEXT: sbbl $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbl %edi, %edi # sched: [1:0.50]
+; SKX-NEXT: sbbl %edi, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sbb_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: sbbl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: sbbl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: sbbl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sbb_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sbbl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: sbbl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: sbbl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: sbbl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: sbbl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: sbbl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "sbbl $2, %EAX \0A\09 sbbl $2, $0 \0A\09 sbbl $2, $1 \0A\09 sbbl $3, $0 \0A\09 sbbl $3, $1 \0A\09 sbbl $0, $0 \0A\09 sbbl $0, $1 \0A\09 sbbl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_sbb_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_sbb_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [2:0.67]
+; GENERIC-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [9:1.00]
+; GENERIC-NEXT: sbbq $7, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: sbbq $7, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbq %rdi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: sbbq %rdi, (%rsi) # sched: [9:1.00]
+; GENERIC-NEXT: sbbq (%rsi), %rdi # sched: [7:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sbb_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: sbbq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: sbbq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: sbbq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: sbbq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sbb_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: sbbq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: sbbq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: sbbq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: sbbq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sbb_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [2:0.67]
+; SANDY-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [9:1.00]
+; SANDY-NEXT: sbbq $7, %rdi # sched: [2:0.67]
+; SANDY-NEXT: sbbq $7, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbq %rdi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: sbbq %rdi, (%rsi) # sched: [9:1.00]
+; SANDY-NEXT: sbbq (%rsi), %rdi # sched: [7:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sbb_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [2:0.50]
+; HASWELL-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [9:1.00]
+; HASWELL-NEXT: sbbq $7, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: sbbq $7, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbq %rdi, %rdi # sched: [2:0.50]
+; HASWELL-NEXT: sbbq %rdi, (%rsi) # sched: [9:1.00]
+; HASWELL-NEXT: sbbq (%rsi), %rdi # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sbb_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.50]
+; BROADWELL-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [8:1.00]
+; BROADWELL-NEXT: sbbq $7, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: sbbq $7, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbq %rdi, %rdi # sched: [1:0.50]
+; BROADWELL-NEXT: sbbq %rdi, (%rsi) # sched: [8:1.00]
+; BROADWELL-NEXT: sbbq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sbb_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbq $7, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbq $7, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbq %rdi, %rdi # sched: [1:0.50]
+; SKYLAKE-NEXT: sbbq %rdi, (%rsi) # sched: [8:1.00]
+; SKYLAKE-NEXT: sbbq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sbb_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.50]
+; SKX-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [8:1.00]
+; SKX-NEXT: sbbq $7, %rdi # sched: [1:0.50]
+; SKX-NEXT: sbbq $7, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbq %rdi, %rdi # sched: [1:0.50]
+; SKX-NEXT: sbbq %rdi, (%rsi) # sched: [8:1.00]
+; SKX-NEXT: sbbq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sbb_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: sbbq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: sbbq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: sbbq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: sbbq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sbb_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: sbbq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: sbbq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: sbbq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: sbbq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: sbbq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: sbbq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: sbbq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "sbbq $2, %RAX \0A\09 sbbq $2, $0 \0A\09 sbbq $2, $1 \0A\09 sbbq $3, $0 \0A\09 sbbq $3, $1 \0A\09 sbbq $0, $0 \0A\09 sbbq $0, $1 \0A\09 sbbq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
+define void @test_scas() optsize {
+; GENERIC-LABEL: test_scas:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: scasb %es:(%rdi), %al # sched: [2:0.67]
+; GENERIC-NEXT: scasw %es:(%rdi), %ax # sched: [2:0.67]
+; GENERIC-NEXT: scasl %es:(%rdi), %eax # sched: [2:0.67]
+; GENERIC-NEXT: scasq %es:(%rdi), %rax # sched: [2:0.67]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_scas:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: scasb %es:(%rdi), %al # sched: [2:1.00]
+; ATOM-NEXT: scasw %es:(%rdi), %ax # sched: [2:1.00]
+; ATOM-NEXT: scasl %es:(%rdi), %eax # sched: [2:1.00]
+; ATOM-NEXT: scasq %es:(%rdi), %rax # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_scas:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: scasb %es:(%rdi), %al # sched: [100:1.00]
+; SLM-NEXT: scasw %es:(%rdi), %ax # sched: [100:1.00]
+; SLM-NEXT: scasl %es:(%rdi), %eax # sched: [100:1.00]
+; SLM-NEXT: scasq %es:(%rdi), %rax # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_scas:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: scasb %es:(%rdi), %al # sched: [2:0.67]
+; SANDY-NEXT: scasw %es:(%rdi), %ax # sched: [2:0.67]
+; SANDY-NEXT: scasl %es:(%rdi), %eax # sched: [2:0.67]
+; SANDY-NEXT: scasq %es:(%rdi), %rax # sched: [2:0.67]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_scas:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: scasb %es:(%rdi), %al # sched: [7:0.50]
+; HASWELL-NEXT: scasw %es:(%rdi), %ax # sched: [7:0.50]
+; HASWELL-NEXT: scasl %es:(%rdi), %eax # sched: [7:0.50]
+; HASWELL-NEXT: scasq %es:(%rdi), %rax # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_scas:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: scasb %es:(%rdi), %al # sched: [7:0.50]
+; BROADWELL-NEXT: scasw %es:(%rdi), %ax # sched: [7:0.50]
+; BROADWELL-NEXT: scasl %es:(%rdi), %eax # sched: [7:0.50]
+; BROADWELL-NEXT: scasq %es:(%rdi), %rax # sched: [7:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_scas:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: scasb %es:(%rdi), %al # sched: [7:0.50]
+; SKYLAKE-NEXT: scasw %es:(%rdi), %ax # sched: [7:0.50]
+; SKYLAKE-NEXT: scasl %es:(%rdi), %eax # sched: [7:0.50]
+; SKYLAKE-NEXT: scasq %es:(%rdi), %rax # sched: [7:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_scas:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: scasb %es:(%rdi), %al # sched: [7:0.50]
+; SKX-NEXT: scasw %es:(%rdi), %ax # sched: [7:0.50]
+; SKX-NEXT: scasl %es:(%rdi), %eax # sched: [7:0.50]
+; SKX-NEXT: scasq %es:(%rdi), %rax # sched: [7:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_scas:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: scasb %es:(%rdi), %al # sched: [100:0.17]
+; BTVER2-NEXT: scasw %es:(%rdi), %ax # sched: [100:0.17]
+; BTVER2-NEXT: scasl %es:(%rdi), %eax # sched: [100:0.17]
+; BTVER2-NEXT: scasq %es:(%rdi), %rax # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_scas:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: scasb %es:(%rdi), %al # sched: [100:?]
+; ZNVER1-NEXT: scasw %es:(%rdi), %ax # sched: [100:?]
+; ZNVER1-NEXT: scasl %es:(%rdi), %eax # sched: [100:?]
+; ZNVER1-NEXT: scasq %es:(%rdi), %rax # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "scasb \0A\09 scasw \0A\09 scasl \0A\09 scasq", ""()
+ ret void
+}
+
+define void @test_setcc(i8 %a0, i8 *%a1) optsize {
+; GENERIC-LABEL: test_setcc:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: seto %dil # sched: [1:0.50]
+; GENERIC-NEXT: setno %dil # sched: [1:0.50]
+; GENERIC-NEXT: setb %dil # sched: [1:0.50]
+; GENERIC-NEXT: setae %dil # sched: [1:0.50]
+; GENERIC-NEXT: sete %dil # sched: [1:0.50]
+; GENERIC-NEXT: setne %dil # sched: [1:0.50]
+; GENERIC-NEXT: setbe %dil # sched: [2:1.00]
+; GENERIC-NEXT: seta %dil # sched: [2:1.00]
+; GENERIC-NEXT: sets %dil # sched: [1:0.50]
+; GENERIC-NEXT: setns %dil # sched: [1:0.50]
+; GENERIC-NEXT: setp %dil # sched: [1:0.50]
+; GENERIC-NEXT: setnp %dil # sched: [1:0.50]
+; GENERIC-NEXT: setl %dil # sched: [1:0.50]
+; GENERIC-NEXT: setge %dil # sched: [1:0.50]
+; GENERIC-NEXT: setle %dil # sched: [1:0.50]
+; GENERIC-NEXT: setg %dil # sched: [1:0.50]
+; GENERIC-NEXT: seto (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setno (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setb (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setae (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: sete (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setne (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setbe (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: seta (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: sets (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setns (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setp (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setnp (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setl (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setge (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setle (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: setg (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_setcc:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: seto %dil # sched: [1:0.50]
+; ATOM-NEXT: setno %dil # sched: [1:0.50]
+; ATOM-NEXT: setb %dil # sched: [1:0.50]
+; ATOM-NEXT: setae %dil # sched: [1:0.50]
+; ATOM-NEXT: sete %dil # sched: [1:0.50]
+; ATOM-NEXT: setne %dil # sched: [1:0.50]
+; ATOM-NEXT: setbe %dil # sched: [1:0.50]
+; ATOM-NEXT: seta %dil # sched: [1:0.50]
+; ATOM-NEXT: sets %dil # sched: [1:0.50]
+; ATOM-NEXT: setns %dil # sched: [1:0.50]
+; ATOM-NEXT: setp %dil # sched: [1:0.50]
+; ATOM-NEXT: setnp %dil # sched: [1:0.50]
+; ATOM-NEXT: setl %dil # sched: [1:0.50]
+; ATOM-NEXT: setge %dil # sched: [1:0.50]
+; ATOM-NEXT: setle %dil # sched: [1:0.50]
+; ATOM-NEXT: setg %dil # sched: [1:0.50]
+; ATOM-NEXT: seto (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setno (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setb (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setae (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: sete (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setne (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setbe (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: seta (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: sets (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setns (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setp (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setnp (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setl (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setge (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setle (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: setg (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_setcc:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: seto %dil # sched: [1:0.50]
+; SLM-NEXT: setno %dil # sched: [1:0.50]
+; SLM-NEXT: setb %dil # sched: [1:0.50]
+; SLM-NEXT: setae %dil # sched: [1:0.50]
+; SLM-NEXT: sete %dil # sched: [1:0.50]
+; SLM-NEXT: setne %dil # sched: [1:0.50]
+; SLM-NEXT: setbe %dil # sched: [1:0.50]
+; SLM-NEXT: seta %dil # sched: [1:0.50]
+; SLM-NEXT: sets %dil # sched: [1:0.50]
+; SLM-NEXT: setns %dil # sched: [1:0.50]
+; SLM-NEXT: setp %dil # sched: [1:0.50]
+; SLM-NEXT: setnp %dil # sched: [1:0.50]
+; SLM-NEXT: setl %dil # sched: [1:0.50]
+; SLM-NEXT: setge %dil # sched: [1:0.50]
+; SLM-NEXT: setle %dil # sched: [1:0.50]
+; SLM-NEXT: setg %dil # sched: [1:0.50]
+; SLM-NEXT: seto (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setno (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setb (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setae (%rsi) # sched: [1:1.00]
+; SLM-NEXT: sete (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setne (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setbe (%rsi) # sched: [1:1.00]
+; SLM-NEXT: seta (%rsi) # sched: [1:1.00]
+; SLM-NEXT: sets (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setns (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setp (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setnp (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setl (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setge (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setle (%rsi) # sched: [1:1.00]
+; SLM-NEXT: setg (%rsi) # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_setcc:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: seto %dil # sched: [1:0.50]
+; SANDY-NEXT: setno %dil # sched: [1:0.50]
+; SANDY-NEXT: setb %dil # sched: [1:0.50]
+; SANDY-NEXT: setae %dil # sched: [1:0.50]
+; SANDY-NEXT: sete %dil # sched: [1:0.50]
+; SANDY-NEXT: setne %dil # sched: [1:0.50]
+; SANDY-NEXT: setbe %dil # sched: [2:1.00]
+; SANDY-NEXT: seta %dil # sched: [2:1.00]
+; SANDY-NEXT: sets %dil # sched: [1:0.50]
+; SANDY-NEXT: setns %dil # sched: [1:0.50]
+; SANDY-NEXT: setp %dil # sched: [1:0.50]
+; SANDY-NEXT: setnp %dil # sched: [1:0.50]
+; SANDY-NEXT: setl %dil # sched: [1:0.50]
+; SANDY-NEXT: setge %dil # sched: [1:0.50]
+; SANDY-NEXT: setle %dil # sched: [1:0.50]
+; SANDY-NEXT: setg %dil # sched: [1:0.50]
+; SANDY-NEXT: seto (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setno (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setb (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setae (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: sete (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setne (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setbe (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: seta (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: sets (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setns (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setp (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setnp (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setl (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setge (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setle (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: setg (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_setcc:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: seto %dil # sched: [1:0.50]
+; HASWELL-NEXT: setno %dil # sched: [1:0.50]
+; HASWELL-NEXT: setb %dil # sched: [1:0.50]
+; HASWELL-NEXT: setae %dil # sched: [1:0.50]
+; HASWELL-NEXT: sete %dil # sched: [1:0.50]
+; HASWELL-NEXT: setne %dil # sched: [1:0.50]
+; HASWELL-NEXT: setbe %dil # sched: [2:0.50]
+; HASWELL-NEXT: seta %dil # sched: [2:0.50]
+; HASWELL-NEXT: sets %dil # sched: [1:0.50]
+; HASWELL-NEXT: setns %dil # sched: [1:0.50]
+; HASWELL-NEXT: setp %dil # sched: [1:0.50]
+; HASWELL-NEXT: setnp %dil # sched: [1:0.50]
+; HASWELL-NEXT: setl %dil # sched: [1:0.50]
+; HASWELL-NEXT: setge %dil # sched: [1:0.50]
+; HASWELL-NEXT: setle %dil # sched: [1:0.50]
+; HASWELL-NEXT: setg %dil # sched: [1:0.50]
+; HASWELL-NEXT: seto (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setno (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setb (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setae (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: sete (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setne (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setbe (%rsi) # sched: [3:1.00]
+; HASWELL-NEXT: seta (%rsi) # sched: [3:1.00]
+; HASWELL-NEXT: sets (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setns (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setp (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setnp (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setl (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setge (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setle (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: setg (%rsi) # sched: [2:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_setcc:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: seto %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setno %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setb %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setae %dil # sched: [1:0.50]
+; BROADWELL-NEXT: sete %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setne %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setbe %dil # sched: [2:0.50]
+; BROADWELL-NEXT: seta %dil # sched: [2:0.50]
+; BROADWELL-NEXT: sets %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setns %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setp %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setnp %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setl %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setge %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setle %dil # sched: [1:0.50]
+; BROADWELL-NEXT: setg %dil # sched: [1:0.50]
+; BROADWELL-NEXT: seto (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setno (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setb (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setae (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: sete (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setne (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setbe (%rsi) # sched: [3:1.00]
+; BROADWELL-NEXT: seta (%rsi) # sched: [3:1.00]
+; BROADWELL-NEXT: sets (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setns (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setp (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setnp (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setl (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setge (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setle (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: setg (%rsi) # sched: [2:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_setcc:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: seto %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setno %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setb %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setae %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setne %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setbe %dil # sched: [2:1.00]
+; SKYLAKE-NEXT: seta %dil # sched: [2:1.00]
+; SKYLAKE-NEXT: sets %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setns %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setp %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setnp %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setl %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setge %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setle %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: setg %dil # sched: [1:0.50]
+; SKYLAKE-NEXT: seto (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setno (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setb (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setae (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: sete (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setne (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setbe (%rsi) # sched: [3:1.00]
+; SKYLAKE-NEXT: seta (%rsi) # sched: [3:1.00]
+; SKYLAKE-NEXT: sets (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setns (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setp (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setnp (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setl (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setge (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setle (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: setg (%rsi) # sched: [2:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_setcc:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: seto %dil # sched: [1:0.50]
+; SKX-NEXT: setno %dil # sched: [1:0.50]
+; SKX-NEXT: setb %dil # sched: [1:0.50]
+; SKX-NEXT: setae %dil # sched: [1:0.50]
+; SKX-NEXT: sete %dil # sched: [1:0.50]
+; SKX-NEXT: setne %dil # sched: [1:0.50]
+; SKX-NEXT: setbe %dil # sched: [2:1.00]
+; SKX-NEXT: seta %dil # sched: [2:1.00]
+; SKX-NEXT: sets %dil # sched: [1:0.50]
+; SKX-NEXT: setns %dil # sched: [1:0.50]
+; SKX-NEXT: setp %dil # sched: [1:0.50]
+; SKX-NEXT: setnp %dil # sched: [1:0.50]
+; SKX-NEXT: setl %dil # sched: [1:0.50]
+; SKX-NEXT: setge %dil # sched: [1:0.50]
+; SKX-NEXT: setle %dil # sched: [1:0.50]
+; SKX-NEXT: setg %dil # sched: [1:0.50]
+; SKX-NEXT: seto (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setno (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setb (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setae (%rsi) # sched: [2:1.00]
+; SKX-NEXT: sete (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setne (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setbe (%rsi) # sched: [3:1.00]
+; SKX-NEXT: seta (%rsi) # sched: [3:1.00]
+; SKX-NEXT: sets (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setns (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setp (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setnp (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setl (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setge (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setle (%rsi) # sched: [2:1.00]
+; SKX-NEXT: setg (%rsi) # sched: [2:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_setcc:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: seto %dil # sched: [1:0.50]
+; BTVER2-NEXT: setno %dil # sched: [1:0.50]
+; BTVER2-NEXT: setb %dil # sched: [1:0.50]
+; BTVER2-NEXT: setae %dil # sched: [1:0.50]
+; BTVER2-NEXT: sete %dil # sched: [1:0.50]
+; BTVER2-NEXT: setne %dil # sched: [1:0.50]
+; BTVER2-NEXT: setbe %dil # sched: [1:0.50]
+; BTVER2-NEXT: seta %dil # sched: [1:0.50]
+; BTVER2-NEXT: sets %dil # sched: [1:0.50]
+; BTVER2-NEXT: setns %dil # sched: [1:0.50]
+; BTVER2-NEXT: setp %dil # sched: [1:0.50]
+; BTVER2-NEXT: setnp %dil # sched: [1:0.50]
+; BTVER2-NEXT: setl %dil # sched: [1:0.50]
+; BTVER2-NEXT: setge %dil # sched: [1:0.50]
+; BTVER2-NEXT: setle %dil # sched: [1:0.50]
+; BTVER2-NEXT: setg %dil # sched: [1:0.50]
+; BTVER2-NEXT: seto (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setno (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setb (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setae (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: sete (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setne (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setbe (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: seta (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: sets (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setns (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setp (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setnp (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setl (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setge (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setle (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: setg (%rsi) # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_setcc:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: seto %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setno %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setb %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setae %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sete %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setne %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setbe %dil # sched: [1:0.25]
+; ZNVER1-NEXT: seta %dil # sched: [1:0.25]
+; ZNVER1-NEXT: sets %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setns %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setp %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setnp %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setl %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setge %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setle %dil # sched: [1:0.25]
+; ZNVER1-NEXT: setg %dil # sched: [1:0.25]
+; ZNVER1-NEXT: seto (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setno (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setb (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setae (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: sete (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setne (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setbe (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: seta (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: sets (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setns (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setp (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setnp (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setl (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setge (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setle (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: setg (%rsi) # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "seto $0 \0A\09 setno $0 \0A\09 setb $0 \0A\09 setnb $0 \0A\09 setz $0 \0A\09 setnz $0 \0A\09 setbe $0 \0A\09 setnbe $0 \0A\09 sets $0 \0A\09 setns $0 \0A\09 setp $0 \0A\09 setnp $0 \0A\09 setl $0 \0A\09 setnl $0 \0A\09 setle $0 \0A\09 setnle $0 \0A\09 seto $1 \0A\09 setno $1 \0A\09 setb $1 \0A\09 setnb $1 \0A\09 setz $1 \0A\09 setnz $1 \0A\09 setbe $1 \0A\09 setnbe $1 \0A\09 sets $1 \0A\09 setns $1 \0A\09 setp $1 \0A\09 setnp $1 \0A\09 setl $1 \0A\09 setnl $1 \0A\09 setle $1 \0A\09 setnle $1", "r,*m"(i8 %a0, i8 *%a1)
+ ret void
+}
+
+define void @test_shld_shrd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_shld_shrd_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: shldw %cl, %si, %di # sched: [4:1.50]
+; GENERIC-NEXT: shrdw %cl, %si, %di # sched: [4:1.50]
+; GENERIC-NEXT: shldw %cl, %si, (%rdx) # sched: [10:1.50]
+; GENERIC-NEXT: shrdw %cl, %si, (%rdx) # sched: [10:1.50]
+; GENERIC-NEXT: shldw $7, %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: shrdw $7, %si, %di # sched: [2:0.67]
+; GENERIC-NEXT: shldw $7, %si, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: shrdw $7, %si, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_shld_shrd_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: shldw %cl, %si, %di # sched: [6:3.00]
+; ATOM-NEXT: shrdw %cl, %si, %di # sched: [6:3.00]
+; ATOM-NEXT: shldw %cl, %si, (%rdx) # sched: [6:3.00]
+; ATOM-NEXT: shrdw %cl, %si, (%rdx) # sched: [6:3.00]
+; ATOM-NEXT: shldw $7, %si, %di # sched: [6:3.00]
+; ATOM-NEXT: shrdw $7, %si, %di # sched: [6:3.00]
+; ATOM-NEXT: shldw $7, %si, (%rdx) # sched: [6:3.00]
+; ATOM-NEXT: shrdw $7, %si, (%rdx) # sched: [6:3.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_shld_shrd_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: shldw %cl, %si, %di # sched: [1:1.00]
+; SLM-NEXT: shrdw %cl, %si, %di # sched: [1:1.00]
+; SLM-NEXT: shldw %cl, %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrdw %cl, %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shldw $7, %si, %di # sched: [1:1.00]
+; SLM-NEXT: shrdw $7, %si, %di # sched: [1:1.00]
+; SLM-NEXT: shldw $7, %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrdw $7, %si, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_shld_shrd_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: shldw %cl, %si, %di # sched: [4:1.50]
+; SANDY-NEXT: shrdw %cl, %si, %di # sched: [4:1.50]
+; SANDY-NEXT: shldw %cl, %si, (%rdx) # sched: [10:1.50]
+; SANDY-NEXT: shrdw %cl, %si, (%rdx) # sched: [10:1.50]
+; SANDY-NEXT: shldw $7, %si, %di # sched: [2:0.67]
+; SANDY-NEXT: shrdw $7, %si, %di # sched: [2:0.67]
+; SANDY-NEXT: shldw $7, %si, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: shrdw $7, %si, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shld_shrd_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: shldw %cl, %si, %di # sched: [6:1.00]
+; HASWELL-NEXT: shrdw %cl, %si, %di # sched: [6:1.00]
+; HASWELL-NEXT: shldw %cl, %si, (%rdx) # sched: [12:1.00]
+; HASWELL-NEXT: shrdw %cl, %si, (%rdx) # sched: [12:1.00]
+; HASWELL-NEXT: shldw $7, %si, %di # sched: [3:1.00]
+; HASWELL-NEXT: shrdw $7, %si, %di # sched: [3:1.00]
+; HASWELL-NEXT: shldw $7, %si, (%rdx) # sched: [10:1.00]
+; HASWELL-NEXT: shrdw $7, %si, (%rdx) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shld_shrd_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: shldw %cl, %si, %di # sched: [6:1.00]
+; BROADWELL-NEXT: shrdw %cl, %si, %di # sched: [6:1.00]
+; BROADWELL-NEXT: shldw %cl, %si, (%rdx) # sched: [11:1.00]
+; BROADWELL-NEXT: shrdw %cl, %si, (%rdx) # sched: [11:1.00]
+; BROADWELL-NEXT: shldw $7, %si, %di # sched: [3:1.00]
+; BROADWELL-NEXT: shrdw $7, %si, %di # sched: [3:1.00]
+; BROADWELL-NEXT: shldw $7, %si, (%rdx) # sched: [9:1.00]
+; BROADWELL-NEXT: shrdw $7, %si, (%rdx) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shld_shrd_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: shldw %cl, %si, %di # sched: [6:1.00]
+; SKYLAKE-NEXT: shrdw %cl, %si, %di # sched: [6:1.00]
+; SKYLAKE-NEXT: shldw %cl, %si, (%rdx) # sched: [11:1.00]
+; SKYLAKE-NEXT: shrdw %cl, %si, (%rdx) # sched: [11:1.00]
+; SKYLAKE-NEXT: shldw $7, %si, %di # sched: [3:1.00]
+; SKYLAKE-NEXT: shrdw $7, %si, %di # sched: [3:1.00]
+; SKYLAKE-NEXT: shldw $7, %si, (%rdx) # sched: [9:1.00]
+; SKYLAKE-NEXT: shrdw $7, %si, (%rdx) # sched: [9:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shld_shrd_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: shldw %cl, %si, %di # sched: [6:1.00]
+; SKX-NEXT: shrdw %cl, %si, %di # sched: [6:1.00]
+; SKX-NEXT: shldw %cl, %si, (%rdx) # sched: [11:1.00]
+; SKX-NEXT: shrdw %cl, %si, (%rdx) # sched: [11:1.00]
+; SKX-NEXT: shldw $7, %si, %di # sched: [3:1.00]
+; SKX-NEXT: shrdw $7, %si, %di # sched: [3:1.00]
+; SKX-NEXT: shldw $7, %si, (%rdx) # sched: [9:1.00]
+; SKX-NEXT: shrdw $7, %si, (%rdx) # sched: [9:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_shld_shrd_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: shldw %cl, %si, %di # sched: [4:4.00]
+; BTVER2-NEXT: shrdw %cl, %si, %di # sched: [4:4.00]
+; BTVER2-NEXT: shldw %cl, %si, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdw %cl, %si, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shldw $7, %si, %di # sched: [3:3.00]
+; BTVER2-NEXT: shrdw $7, %si, %di # sched: [3:3.00]
+; BTVER2-NEXT: shldw $7, %si, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdw $7, %si, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shld_shrd_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: shldw %cl, %si, %di # sched: [100:?]
+; ZNVER1-NEXT: shrdw %cl, %si, %di # sched: [100:?]
+; ZNVER1-NEXT: shldw %cl, %si, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: shrdw %cl, %si, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: shldw $7, %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: shrdw $7, %si, %di # sched: [1:0.25]
+; ZNVER1-NEXT: shldw $7, %si, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrdw $7, %si, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "shld $1, $0 \0A\09 shrd $1, $0 \0A\09 shld $1, $2 \0A\09 shrd $1, $2 \0A\09 shld $3, $1, $0 \0A\09 shrd $3, $1, $0 \0A\09 shld $3, $1, $2 \0A\09 shrd $3, $1, $2", "r,r,*m,i"(i16 %a0, i16 %a1, i16 *%a2, i8 7)
+ ret void
+}
+define void @test_shld_shrd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_shld_shrd_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: shldl %cl, %esi, %edi # sched: [4:1.50]
+; GENERIC-NEXT: shrdl %cl, %esi, %edi # sched: [4:1.50]
+; GENERIC-NEXT: shldl %cl, %esi, (%rdx) # sched: [10:1.50]
+; GENERIC-NEXT: shrdl %cl, %esi, (%rdx) # sched: [10:1.50]
+; GENERIC-NEXT: shldl $7, %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: shrdl $7, %esi, %edi # sched: [2:0.67]
+; GENERIC-NEXT: shldl $7, %esi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: shrdl $7, %esi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_shld_shrd_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: shldl %cl, %esi, %edi # sched: [2:1.00]
+; ATOM-NEXT: shrdl %cl, %esi, %edi # sched: [2:1.00]
+; ATOM-NEXT: shldl %cl, %esi, (%rdx) # sched: [4:2.00]
+; ATOM-NEXT: shrdl %cl, %esi, (%rdx) # sched: [4:2.00]
+; ATOM-NEXT: shldl $7, %esi, %edi # sched: [2:1.00]
+; ATOM-NEXT: shrdl $7, %esi, %edi # sched: [2:1.00]
+; ATOM-NEXT: shldl $7, %esi, (%rdx) # sched: [4:2.00]
+; ATOM-NEXT: shrdl $7, %esi, (%rdx) # sched: [4:2.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_shld_shrd_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: shldl %cl, %esi, %edi # sched: [1:1.00]
+; SLM-NEXT: shrdl %cl, %esi, %edi # sched: [1:1.00]
+; SLM-NEXT: shldl %cl, %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrdl %cl, %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shldl $7, %esi, %edi # sched: [1:1.00]
+; SLM-NEXT: shrdl $7, %esi, %edi # sched: [1:1.00]
+; SLM-NEXT: shldl $7, %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrdl $7, %esi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_shld_shrd_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: shldl %cl, %esi, %edi # sched: [4:1.50]
+; SANDY-NEXT: shrdl %cl, %esi, %edi # sched: [4:1.50]
+; SANDY-NEXT: shldl %cl, %esi, (%rdx) # sched: [10:1.50]
+; SANDY-NEXT: shrdl %cl, %esi, (%rdx) # sched: [10:1.50]
+; SANDY-NEXT: shldl $7, %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: shrdl $7, %esi, %edi # sched: [2:0.67]
+; SANDY-NEXT: shldl $7, %esi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: shrdl $7, %esi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shld_shrd_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: shldl %cl, %esi, %edi # sched: [6:1.00]
+; HASWELL-NEXT: shrdl %cl, %esi, %edi # sched: [6:1.00]
+; HASWELL-NEXT: shldl %cl, %esi, (%rdx) # sched: [12:1.00]
+; HASWELL-NEXT: shrdl %cl, %esi, (%rdx) # sched: [12:1.00]
+; HASWELL-NEXT: shldl $7, %esi, %edi # sched: [3:1.00]
+; HASWELL-NEXT: shrdl $7, %esi, %edi # sched: [3:1.00]
+; HASWELL-NEXT: shldl $7, %esi, (%rdx) # sched: [10:1.00]
+; HASWELL-NEXT: shrdl $7, %esi, (%rdx) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shld_shrd_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: shldl %cl, %esi, %edi # sched: [6:1.00]
+; BROADWELL-NEXT: shrdl %cl, %esi, %edi # sched: [6:1.00]
+; BROADWELL-NEXT: shldl %cl, %esi, (%rdx) # sched: [11:1.00]
+; BROADWELL-NEXT: shrdl %cl, %esi, (%rdx) # sched: [11:1.00]
+; BROADWELL-NEXT: shldl $7, %esi, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: shrdl $7, %esi, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: shldl $7, %esi, (%rdx) # sched: [9:1.00]
+; BROADWELL-NEXT: shrdl $7, %esi, (%rdx) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shld_shrd_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: shldl %cl, %esi, %edi # sched: [6:1.00]
+; SKYLAKE-NEXT: shrdl %cl, %esi, %edi # sched: [6:1.00]
+; SKYLAKE-NEXT: shldl %cl, %esi, (%rdx) # sched: [11:1.00]
+; SKYLAKE-NEXT: shrdl %cl, %esi, (%rdx) # sched: [11:1.00]
+; SKYLAKE-NEXT: shldl $7, %esi, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: shrdl $7, %esi, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: shldl $7, %esi, (%rdx) # sched: [9:1.00]
+; SKYLAKE-NEXT: shrdl $7, %esi, (%rdx) # sched: [9:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shld_shrd_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: shldl %cl, %esi, %edi # sched: [6:1.00]
+; SKX-NEXT: shrdl %cl, %esi, %edi # sched: [6:1.00]
+; SKX-NEXT: shldl %cl, %esi, (%rdx) # sched: [11:1.00]
+; SKX-NEXT: shrdl %cl, %esi, (%rdx) # sched: [11:1.00]
+; SKX-NEXT: shldl $7, %esi, %edi # sched: [3:1.00]
+; SKX-NEXT: shrdl $7, %esi, %edi # sched: [3:1.00]
+; SKX-NEXT: shldl $7, %esi, (%rdx) # sched: [9:1.00]
+; SKX-NEXT: shrdl $7, %esi, (%rdx) # sched: [9:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_shld_shrd_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: shldl %cl, %esi, %edi # sched: [4:4.00]
+; BTVER2-NEXT: shrdl %cl, %esi, %edi # sched: [4:4.00]
+; BTVER2-NEXT: shldl %cl, %esi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdl %cl, %esi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shldl $7, %esi, %edi # sched: [3:3.00]
+; BTVER2-NEXT: shrdl $7, %esi, %edi # sched: [3:3.00]
+; BTVER2-NEXT: shldl $7, %esi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdl $7, %esi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shld_shrd_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: shldl %cl, %esi, %edi # sched: [100:?]
+; ZNVER1-NEXT: shrdl %cl, %esi, %edi # sched: [100:?]
+; ZNVER1-NEXT: shldl %cl, %esi, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: shrdl %cl, %esi, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: shldl $7, %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shrdl $7, %esi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: shldl $7, %esi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrdl $7, %esi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "shld $1, $0 \0A\09 shrd $1, $0 \0A\09 shld $1, $2 \0A\09 shrd $1, $2 \0A\09 shld $3, $1, $0 \0A\09 shrd $3, $1, $0 \0A\09 shld $3, $1, $2 \0A\09 shrd $3, $1, $2", "r,r,*m,i"(i32 %a0, i32 %a1, i32 *%a2, i8 7)
+ ret void
+}
+define void @test_shld_shrd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_shld_shrd_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; GENERIC-NEXT: shldq %cl, %rsi, (%rdx) # sched: [10:1.50]
+; GENERIC-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [10:1.50]
+; GENERIC-NEXT: shldq $7, %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: shrdq $7, %rsi, %rdi # sched: [2:0.67]
+; GENERIC-NEXT: shldq $7, %rsi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: shrdq $7, %rsi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_shld_shrd_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: shldq %cl, %rsi, %rdi # sched: [8:4.00]
+; ATOM-NEXT: shrdq %cl, %rsi, %rdi # sched: [8:4.00]
+; ATOM-NEXT: shldq %cl, %rsi, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: shldq $7, %rsi, %rdi # sched: [9:4.50]
+; ATOM-NEXT: shrdq $7, %rsi, %rdi # sched: [9:4.50]
+; ATOM-NEXT: shldq $7, %rsi, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: shrdq $7, %rsi, (%rdx) # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_shld_shrd_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: shldq %cl, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: shrdq %cl, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: shldq %cl, %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shldq $7, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: shrdq $7, %rsi, %rdi # sched: [1:1.00]
+; SLM-NEXT: shldq $7, %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: shrdq $7, %rsi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_shld_shrd_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: shldq %cl, %rsi, %rdi # sched: [4:1.50]
+; SANDY-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:1.50]
+; SANDY-NEXT: shldq %cl, %rsi, (%rdx) # sched: [10:1.50]
+; SANDY-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [10:1.50]
+; SANDY-NEXT: shldq $7, %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: shrdq $7, %rsi, %rdi # sched: [2:0.67]
+; SANDY-NEXT: shldq $7, %rsi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: shrdq $7, %rsi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_shld_shrd_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; HASWELL-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; HASWELL-NEXT: shldq %cl, %rsi, (%rdx) # sched: [12:1.00]
+; HASWELL-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [12:1.00]
+; HASWELL-NEXT: shldq $7, %rsi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: shrdq $7, %rsi, %rdi # sched: [3:1.00]
+; HASWELL-NEXT: shldq $7, %rsi, (%rdx) # sched: [10:1.00]
+; HASWELL-NEXT: shrdq $7, %rsi, (%rdx) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shld_shrd_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; BROADWELL-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; BROADWELL-NEXT: shldq %cl, %rsi, (%rdx) # sched: [11:1.00]
+; BROADWELL-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [11:1.00]
+; BROADWELL-NEXT: shldq $7, %rsi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: shrdq $7, %rsi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: shldq $7, %rsi, (%rdx) # sched: [9:1.00]
+; BROADWELL-NEXT: shrdq $7, %rsi, (%rdx) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shld_shrd_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKYLAKE-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKYLAKE-NEXT: shldq %cl, %rsi, (%rdx) # sched: [11:1.00]
+; SKYLAKE-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [11:1.00]
+; SKYLAKE-NEXT: shldq $7, %rsi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: shrdq $7, %rsi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: shldq $7, %rsi, (%rdx) # sched: [9:1.00]
+; SKYLAKE-NEXT: shrdq $7, %rsi, (%rdx) # sched: [9:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shld_shrd_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: shldq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKX-NEXT: shrdq %cl, %rsi, %rdi # sched: [6:1.00]
+; SKX-NEXT: shldq %cl, %rsi, (%rdx) # sched: [11:1.00]
+; SKX-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [11:1.00]
+; SKX-NEXT: shldq $7, %rsi, %rdi # sched: [3:1.00]
+; SKX-NEXT: shrdq $7, %rsi, %rdi # sched: [3:1.00]
+; SKX-NEXT: shldq $7, %rsi, (%rdx) # sched: [9:1.00]
+; SKX-NEXT: shrdq $7, %rsi, (%rdx) # sched: [9:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_shld_shrd_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: shldq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT: shrdq %cl, %rsi, %rdi # sched: [4:4.00]
+; BTVER2-NEXT: shldq %cl, %rsi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shldq $7, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT: shrdq $7, %rsi, %rdi # sched: [3:3.00]
+; BTVER2-NEXT: shldq $7, %rsi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: shrdq $7, %rsi, (%rdx) # sched: [9:11.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_shld_shrd_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: shldq %cl, %rsi, %rdi # sched: [100:?]
+; ZNVER1-NEXT: shrdq %cl, %rsi, %rdi # sched: [100:?]
+; ZNVER1-NEXT: shldq %cl, %rsi, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: shrdq %cl, %rsi, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: shldq $7, %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shrdq $7, %rsi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: shldq $7, %rsi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: shrdq $7, %rsi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "shld $1, $0 \0A\09 shrd $1, $0 \0A\09 shld $1, $2 \0A\09 shrd $1, $2 \0A\09 shld $3, $1, $0 \0A\09 shrd $3, $1, $0 \0A\09 shld $3, $1, $2 \0A\09 shrd $3, $1, $2", "r,r,*m,i"(i64 %a0, i64 %a1, i64 *%a2, i8 7)
+ ret void
+}
+
+define void @test_stc_std() optsize {
+; GENERIC-LABEL: test_stc_std:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: stc # sched: [1:0.33]
+; GENERIC-NEXT: std # sched: [1:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_stc_std:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: stc # sched: [1:0.50]
+; ATOM-NEXT: std # sched: [21:10.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_stc_std:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: stc # sched: [1:0.50]
+; SLM-NEXT: std # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_stc_std:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: stc # sched: [1:0.33]
+; SANDY-NEXT: std # sched: [1:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_stc_std:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: stc # sched: [1:0.25]
+; HASWELL-NEXT: std # sched: [6:1.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_stc_std:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: stc # sched: [1:0.25]
+; BROADWELL-NEXT: std # sched: [6:1.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_stc_std:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: stc # sched: [1:0.25]
+; SKYLAKE-NEXT: std # sched: [6:1.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_stc_std:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: stc # sched: [1:0.25]
+; SKX-NEXT: std # sched: [6:1.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_stc_std:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: stc # sched: [1:0.50]
+; BTVER2-NEXT: std # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_stc_std:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: stc # sched: [1:0.25]
+; ZNVER1-NEXT: std # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "stc \0A\09 std", ""()
+ ret void
+}
+
+define void @test_stos() optsize {
+; GENERIC-LABEL: test_stos:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: stosb %al, %es:(%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: stosw %ax, %es:(%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: stosl %eax, %es:(%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: stosq %rax, %es:(%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_stos:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: stosb %al, %es:(%rdi) # sched: [1:0.50]
+; ATOM-NEXT: stosw %ax, %es:(%rdi) # sched: [1:0.50]
+; ATOM-NEXT: stosl %eax, %es:(%rdi) # sched: [1:0.50]
+; ATOM-NEXT: stosq %rax, %es:(%rdi) # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_stos:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: stosb %al, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: stosw %ax, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: stosl %eax, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: stosq %rax, %es:(%rdi) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_stos:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: stosb %al, %es:(%rdi) # sched: [5:1.00]
+; SANDY-NEXT: stosw %ax, %es:(%rdi) # sched: [5:1.00]
+; SANDY-NEXT: stosl %eax, %es:(%rdi) # sched: [5:1.00]
+; SANDY-NEXT: stosq %rax, %es:(%rdi) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_stos:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: stosb %al, %es:(%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: stosw %ax, %es:(%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: stosl %eax, %es:(%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: stosq %rax, %es:(%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_stos:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: stosb %al, %es:(%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: stosw %ax, %es:(%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: stosl %eax, %es:(%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: stosq %rax, %es:(%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_stos:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: stosb %al, %es:(%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: stosw %ax, %es:(%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: stosl %eax, %es:(%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: stosq %rax, %es:(%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_stos:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: stosb %al, %es:(%rdi) # sched: [2:1.00]
+; SKX-NEXT: stosw %ax, %es:(%rdi) # sched: [2:1.00]
+; SKX-NEXT: stosl %eax, %es:(%rdi) # sched: [2:1.00]
+; SKX-NEXT: stosq %rax, %es:(%rdi) # sched: [2:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_stos:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: stosb %al, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: stosw %ax, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: stosl %eax, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: stosq %rax, %es:(%rdi) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_stos:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: stosb %al, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: stosw %ax, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: stosl %eax, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: stosq %rax, %es:(%rdi) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "stosb \0A\09 stosw \0A\09 stosl \0A\09 stosq", ""()
+ ret void
+}
+
+define void @test_sub_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_sub_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: subb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: subb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: subb $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: subb %dil, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subb (%rsi), %dil # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sub_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: subb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: subb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: subb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: subb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sub_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: subb $7, %al # sched: [1:0.50]
+; SLM-NEXT: subb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: subb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: subb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sub_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: subb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: subb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: subb $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: subb %dil, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subb (%rsi), %dil # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sub_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: subb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: subb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: subb $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: subb %dil, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subb (%rsi), %dil # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sub_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: subb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: subb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: subb $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: subb %dil, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sub_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: subb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: subb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: subb $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: subb %dil, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sub_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: subb $7, %al # sched: [1:0.25]
+; SKX-NEXT: subb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: subb $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: subb %dil, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sub_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: subb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: subb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: subb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: subb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sub_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: subb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: subb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: subb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: subb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "subb $2, %AL \0A\09 subb $2, $0 \0A\09 subb $2, $1 \0A\09 subb $0, $0 \0A\09 subb $0, $1 \0A\09 subb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_sub_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_sub_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: subw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: subw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: subw $7, %di # sched: [1:0.33]
+; GENERIC-NEXT: subw $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: subw %di, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subw (%rsi), %di # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sub_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: subw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: subw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: subw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: subw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: subw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sub_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: subw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: subw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: subw $7, %di # sched: [1:0.50]
+; SLM-NEXT: subw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subw %di, %di # sched: [1:0.50]
+; SLM-NEXT: subw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sub_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: subw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: subw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: subw $7, %di # sched: [1:0.33]
+; SANDY-NEXT: subw $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: subw %di, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subw (%rsi), %di # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sub_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: subw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: subw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: subw $7, %di # sched: [1:0.25]
+; HASWELL-NEXT: subw $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: subw %di, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subw (%rsi), %di # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sub_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: subw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: subw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: subw $7, %di # sched: [1:0.25]
+; BROADWELL-NEXT: subw $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: subw %di, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sub_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: subw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: subw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: subw $7, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: subw $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: subw %di, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sub_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: subw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: subw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: subw $7, %di # sched: [1:0.25]
+; SKX-NEXT: subw $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subw %di, %di # sched: [1:0.25]
+; SKX-NEXT: subw %di, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sub_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: subw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: subw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: subw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: subw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: subw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sub_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: subw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: subw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: subw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: subw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: subw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: subw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "subw $2, %AX \0A\09 subw $2, $0 \0A\09 subw $2, $1 \0A\09 subw $3, $0 \0A\09 subw $3, $1 \0A\09 subw $0, $0 \0A\09 subw $0, $1 \0A\09 subw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_sub_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_sub_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: subl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: subl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: subl $7, %edi # sched: [1:0.33]
+; GENERIC-NEXT: subl $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: subl %edi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subl (%rsi), %edi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sub_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: subl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: subl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: subl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: subl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: subl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sub_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: subl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: subl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: subl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: subl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: subl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sub_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: subl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: subl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: subl $7, %edi # sched: [1:0.33]
+; SANDY-NEXT: subl $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: subl %edi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subl (%rsi), %edi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sub_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: subl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: subl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: subl $7, %edi # sched: [1:0.25]
+; HASWELL-NEXT: subl $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: subl %edi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subl (%rsi), %edi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sub_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: subl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: subl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: subl $7, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: subl $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: subl %edi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sub_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: subl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: subl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: subl $7, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: subl $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: subl %edi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sub_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: subl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: subl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: subl $7, %edi # sched: [1:0.25]
+; SKX-NEXT: subl $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: subl %edi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sub_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: subl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: subl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: subl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: subl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: subl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sub_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: subl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: subl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: subl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: subl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: subl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: subl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "subl $2, %EAX \0A\09 subl $2, $0 \0A\09 subl $2, $1 \0A\09 subl $3, $0 \0A\09 subl $3, $1 \0A\09 subl $0, $0 \0A\09 subl $0, $1 \0A\09 subl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_sub_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_sub_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: subq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: subq $7, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: subq $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: subq %rdi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: subq (%rsi), %rdi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_sub_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: subq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: subq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: subq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: subq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: subq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_sub_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: subq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: subq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: subq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: subq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: subq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_sub_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: subq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: subq $7, %rdi # sched: [1:0.33]
+; SANDY-NEXT: subq $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: subq %rdi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: subq (%rsi), %rdi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_sub_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: subq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: subq $7, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: subq $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: subq %rdi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: subq (%rsi), %rdi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sub_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: subq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: subq $7, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: subq $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: subq %rdi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: subq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sub_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: subq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: subq $7, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: subq $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: subq %rdi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: subq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sub_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: subq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: subq $7, %rdi # sched: [1:0.25]
+; SKX-NEXT: subq $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: subq %rdi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: subq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_sub_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: subq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: subq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: subq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: subq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: subq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_sub_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: subq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: subq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: subq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: subq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: subq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: subq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: subq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "subq $2, %RAX \0A\09 subq $2, $0 \0A\09 subq $2, $1 \0A\09 subq $3, $0 \0A\09 subq $3, $1 \0A\09 subq $0, $0 \0A\09 subq $0, $1 \0A\09 subq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+
+define void @test_test_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_test_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: testb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: testb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: testb $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: testb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: testb %dil, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_test_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: testb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: testb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: testb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: testb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: testb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_test_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: testb $7, %al # sched: [1:0.50]
+; SLM-NEXT: testb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: testb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: testb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: testb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_test_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: testb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: testb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: testb $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: testb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: testb %dil, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_test_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: testb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: testb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: testb $7, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: testb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: testb %dil, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_test_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: testb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: testb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: testb $7, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: testb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: testb %dil, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_test_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: testb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: testb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: testb $7, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: testb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: testb %dil, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_test_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: testb $7, %al # sched: [1:0.25]
+; SKX-NEXT: testb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: testb $7, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: testb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: testb %dil, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_test_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: testb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: testb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: testb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: testb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: testb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_test_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: testb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: testb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: testb $7, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: testb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: testb %dil, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "testb $2, %AL \0A\09 testb $2, $0 \0A\09 testb $2, $1 \0A\09 testb $0, $0 \0A\09 testb $0, $1", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_test_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_test_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: testw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [5:1.00]
+; GENERIC-NEXT: testw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: testw %di, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_test_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: testw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: testw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: testw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: testw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_test_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: testw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: testw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: testw %di, %di # sched: [1:0.50]
+; SLM-NEXT: testw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_test_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: testw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: testw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: testw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: testw %di, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_test_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: testw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: testw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [6:1.00]
+; HASWELL-NEXT: testw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: testw %di, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_test_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: testw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: testw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: testw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: testw %di, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_test_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: testw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: testw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: testw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: testw %di, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_test_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: testw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: testw %di, %di # sched: [1:0.25]
+; SKX-NEXT: testw %di, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_test_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: testw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: testw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: testw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: testw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_test_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: testw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: testw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: testw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:1.00]
+; ZNVER1-NEXT: testw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: testw %di, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "testw $2, %AX \0A\09 testw $2, $0 \0A\09 testw $2, $1 \0A\09 testw $0, $0 \0A\09 testw $0, $1", "r,*m,i"(i16 %a0, i16* %a1, i16 511) nounwind
+ ret void
+}
+define void @test_test_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_test_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: testl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [5:1.00]
+; GENERIC-NEXT: testl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: testl %edi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_test_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: testl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: testl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: testl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: testl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_test_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: testl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: testl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: testl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: testl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_test_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: testl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: testl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: testl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: testl %edi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_test_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: testl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: testl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [6:1.00]
+; HASWELL-NEXT: testl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: testl %edi, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_test_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: testl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: testl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: testl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: testl %edi, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_test_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: testl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: testl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: testl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: testl %edi, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_test_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: testl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: testl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: testl %edi, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_test_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: testl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: testl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: testl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: testl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_test_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: testl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: testl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: testl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:1.00]
+; ZNVER1-NEXT: testl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: testl %edi, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "testl $2, %EAX \0A\09 testl $2, $0 \0A\09 testl $2, $1 \0A\09 testl $0, $0 \0A\09 testl $0, $1", "r,*m,i"(i32 %a0, i32* %a1, i32 665536) nounwind
+ ret void
+}
+define void @test_test_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_test_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: testq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [5:1.00]
+; GENERIC-NEXT: testq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: testq %rdi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_test_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: testq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: testq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: testq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_test_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: testq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: testq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: testq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_test_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: testq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [5:1.00]
+; SANDY-NEXT: testq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: testq %rdi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_test_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: testq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [6:1.00]
+; HASWELL-NEXT: testq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: testq %rdi, (%rsi) # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_test_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: testq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: testq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: testq %rdi, (%rsi) # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_test_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: testq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: testq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: testq %rdi, (%rsi) # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_test_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: testq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: testq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: testq %rdi, (%rsi) # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_test_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: testq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: testq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: testq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_test_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: testq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: testq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: testq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:1.00]
+; ZNVER1-NEXT: testq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: testq %rdi, (%rsi) # sched: [5:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "testq $2, %RAX \0A\09 testq $2, $0 \0A\09 testq $2, $1 \0A\09 testq $0, $0 \0A\09 testq $0, $1", "r,*m,i"(i64 %a0, i64* %a1, i32 665536) nounwind
+ ret void
+}
+
+; TODO: ud0, ud1
+define void @test_ud2() optsize {
+; GENERIC-LABEL: test_ud2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: ud2 # sched: [100:0.33]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_ud2:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: ud2
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_ud2:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: ud2 # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ud2:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: ud2 # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_ud2:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: ud2 # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ud2:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: ud2 # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ud2:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: ud2 # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ud2:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: ud2 # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_ud2:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: ud2 # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ud2:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: ud2 # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "ud2", ""()
+ ret void
+}
+
+define void @test_xadd_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
+; GENERIC-LABEL: test_xadd_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xaddb %dil, %sil # sched: [3:1.00]
+; GENERIC-NEXT: xaddb %dil, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xadd_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xaddb %dil, %sil # sched: [2:1.00]
+; ATOM-NEXT: xaddb %dil, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xadd_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xaddb %dil, %sil # sched: [1:0.50]
+; SLM-NEXT: xaddb %dil, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xadd_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xaddb %dil, %sil # sched: [3:1.00]
+; SANDY-NEXT: xaddb %dil, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xadd_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xaddb %dil, %sil # sched: [3:0.75]
+; HASWELL-NEXT: xaddb %dil, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xadd_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xaddb %dil, %sil # sched: [3:0.75]
+; BROADWELL-NEXT: xaddb %dil, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xadd_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xaddb %dil, %sil # sched: [3:0.75]
+; SKYLAKE-NEXT: xaddb %dil, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xadd_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xaddb %dil, %sil # sched: [3:0.75]
+; SKX-NEXT: xaddb %dil, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xadd_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xaddb %dil, %sil # sched: [1:0.50]
+; BTVER2-NEXT: xaddb %dil, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xadd_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xaddb %dil, %sil # sched: [1:0.25]
+; ZNVER1-NEXT: xaddb %dil, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xaddb $0, $1 \0A\09 xaddb $0, $2", "r,r,*m"(i8 %a0, i8 %a1, i8 *%a2) nounwind
+ ret void
+}
+define void @test_xadd_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_xadd_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xaddw %di, %si # sched: [3:1.00]
+; GENERIC-NEXT: xaddw %di, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xadd_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xaddw %di, %si # sched: [2:1.00]
+; ATOM-NEXT: xaddw %di, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xadd_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xaddw %di, %si # sched: [1:0.50]
+; SLM-NEXT: xaddw %di, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xadd_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xaddw %di, %si # sched: [3:1.00]
+; SANDY-NEXT: xaddw %di, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xadd_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xaddw %di, %si # sched: [3:0.75]
+; HASWELL-NEXT: xaddw %di, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xadd_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xaddw %di, %si # sched: [3:0.75]
+; BROADWELL-NEXT: xaddw %di, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xadd_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xaddw %di, %si # sched: [3:0.75]
+; SKYLAKE-NEXT: xaddw %di, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xadd_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xaddw %di, %si # sched: [3:0.75]
+; SKX-NEXT: xaddw %di, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xadd_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xaddw %di, %si # sched: [1:0.50]
+; BTVER2-NEXT: xaddw %di, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xadd_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xaddw %di, %si # sched: [1:0.25]
+; ZNVER1-NEXT: xaddw %di, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xaddw $0, $1 \0A\09 xaddw $0, $2", "r,r,*m"(i16 %a0, i16 %a1, i16 *%a2) nounwind
+ ret void
+}
+define void @test_xadd_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_xadd_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xaddl %edi, %esi # sched: [3:1.00]
+; GENERIC-NEXT: xaddl %edi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xadd_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xaddl %edi, %esi # sched: [2:1.00]
+; ATOM-NEXT: xaddl %edi, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xadd_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xaddl %edi, %esi # sched: [1:0.50]
+; SLM-NEXT: xaddl %edi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xadd_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xaddl %edi, %esi # sched: [3:1.00]
+; SANDY-NEXT: xaddl %edi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xadd_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xaddl %edi, %esi # sched: [3:0.75]
+; HASWELL-NEXT: xaddl %edi, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xadd_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xaddl %edi, %esi # sched: [3:0.75]
+; BROADWELL-NEXT: xaddl %edi, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xadd_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xaddl %edi, %esi # sched: [3:0.75]
+; SKYLAKE-NEXT: xaddl %edi, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xadd_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xaddl %edi, %esi # sched: [3:0.75]
+; SKX-NEXT: xaddl %edi, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xadd_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xaddl %edi, %esi # sched: [1:0.50]
+; BTVER2-NEXT: xaddl %edi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xadd_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xaddl %edi, %esi # sched: [1:0.25]
+; ZNVER1-NEXT: xaddl %edi, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xaddl $0, $1 \0A\09 xaddl $0, $2", "r,r,*m"(i32 %a0, i32 %a1, i32 *%a2) nounwind
+ ret void
+}
+define void @test_xadd_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_xadd_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xaddq %rdi, %rsi # sched: [3:1.00]
+; GENERIC-NEXT: xaddq %rdi, (%rdx) # sched: [8:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xadd_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xaddq %rdi, %rsi # sched: [2:1.00]
+; ATOM-NEXT: xaddq %rdi, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xadd_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xaddq %rdi, %rsi # sched: [1:0.50]
+; SLM-NEXT: xaddq %rdi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xadd_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xaddq %rdi, %rsi # sched: [3:1.00]
+; SANDY-NEXT: xaddq %rdi, (%rdx) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xadd_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xaddq %rdi, %rsi # sched: [3:0.75]
+; HASWELL-NEXT: xaddq %rdi, (%rdx) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xadd_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xaddq %rdi, %rsi # sched: [3:0.75]
+; BROADWELL-NEXT: xaddq %rdi, (%rdx) # sched: [7:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xadd_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xaddq %rdi, %rsi # sched: [3:0.75]
+; SKYLAKE-NEXT: xaddq %rdi, (%rdx) # sched: [7:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xadd_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xaddq %rdi, %rsi # sched: [3:0.75]
+; SKX-NEXT: xaddq %rdi, (%rdx) # sched: [7:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xadd_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xaddq %rdi, %rsi # sched: [1:0.50]
+; BTVER2-NEXT: xaddq %rdi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xadd_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xaddq %rdi, %rsi # sched: [1:0.25]
+; ZNVER1-NEXT: xaddq %rdi, (%rdx) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xaddq $0, $1 \0A\09 xaddq $0, $2", "r,r,*m"(i64 %a0, i64 %a1, i64 *%a2) nounwind
+ ret void
+}
+
+define void @test_xchg_8(i8 %a0, i8 %a1, i8 *%a2) optsize {
+; GENERIC-LABEL: test_xchg_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xchgb %sil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: xchgb %dil, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xchg_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xchgb %sil, %dil # sched: [2:1.00]
+; ATOM-NEXT: xchgb %dil, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xchg_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xchgb %sil, %dil # sched: [1:0.50]
+; SLM-NEXT: xchgb %dil, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xchg_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xchgb %sil, %dil # sched: [1:0.33]
+; SANDY-NEXT: xchgb %dil, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xchg_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xchgb %sil, %dil # sched: [3:0.75]
+; HASWELL-NEXT: xchgb %dil, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xchg_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xchgb %sil, %dil # sched: [3:0.75]
+; BROADWELL-NEXT: xchgb %dil, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xchg_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xchgb %sil, %dil # sched: [3:0.75]
+; SKYLAKE-NEXT: xchgb %dil, (%rdx) # sched: [10:1.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xchg_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xchgb %sil, %dil # sched: [3:0.75]
+; SKX-NEXT: xchgb %dil, (%rdx) # sched: [10:1.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xchg_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xchgb %sil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: xchgb %dil, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xchg_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xchgb %sil, %dil # sched: [1:0.50]
+; ZNVER1-NEXT: xchgb %dil, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xchg $1, $0 \0A\09 xchg $2, $0", "r,r,*m"(i8 %a0, i8 %a1, i8 *%a2) nounwind
+ ret void
+}
+define void @test_xchg_16(i16 %a0, i16 %a1, i16 *%a2) optsize {
+; GENERIC-LABEL: test_xchg_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xchgw %di, %ax # sched: [1:0.33]
+; GENERIC-NEXT: xchgw %si, %di # sched: [1:0.33]
+; GENERIC-NEXT: xchgw %di, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xchg_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xchgw %di, %ax # sched: [2:1.00]
+; ATOM-NEXT: xchgw %si, %di # sched: [2:1.00]
+; ATOM-NEXT: xchgw %di, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xchg_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xchgw %di, %ax # sched: [1:0.50]
+; SLM-NEXT: xchgw %si, %di # sched: [1:0.50]
+; SLM-NEXT: xchgw %di, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xchg_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xchgw %di, %ax # sched: [1:0.33]
+; SANDY-NEXT: xchgw %si, %di # sched: [1:0.33]
+; SANDY-NEXT: xchgw %di, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xchg_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xchgw %di, %ax # sched: [1:0.25]
+; HASWELL-NEXT: xchgw %si, %di # sched: [1:0.25]
+; HASWELL-NEXT: xchgw %di, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xchg_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xchgw %di, %ax # sched: [1:0.25]
+; BROADWELL-NEXT: xchgw %si, %di # sched: [1:0.25]
+; BROADWELL-NEXT: xchgw %di, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xchg_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xchgw %di, %ax # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgw %si, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgw %di, (%rdx) # sched: [10:1.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xchg_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xchgw %di, %ax # sched: [1:0.25]
+; SKX-NEXT: xchgw %si, %di # sched: [1:0.25]
+; SKX-NEXT: xchgw %di, (%rdx) # sched: [10:1.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xchg_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xchgw %di, %ax # sched: [1:0.50]
+; BTVER2-NEXT: xchgw %si, %di # sched: [1:0.50]
+; BTVER2-NEXT: xchgw %di, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xchg_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xchgw %di, %ax # sched: [1:0.50]
+; ZNVER1-NEXT: xchgw %si, %di # sched: [1:0.50]
+; ZNVER1-NEXT: xchgw %di, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xchg %AX, $0 \0A\09 xchg $1, $0 \0A\09 xchg $2, $0", "r,r,*m"(i16 %a0, i16 %a1, i16 *%a2) nounwind
+ ret void
+}
+define void @test_xchg_32(i32 %a0, i32 %a1, i32 *%a2) optsize {
+; GENERIC-LABEL: test_xchg_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xchgl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: xchgl %esi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: xchgl %edi, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xchg_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xchgl %edi, %eax # sched: [2:1.00]
+; ATOM-NEXT: xchgl %esi, %edi # sched: [2:1.00]
+; ATOM-NEXT: xchgl %edi, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xchg_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xchgl %edi, %eax # sched: [1:0.50]
+; SLM-NEXT: xchgl %esi, %edi # sched: [1:0.50]
+; SLM-NEXT: xchgl %edi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xchg_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xchgl %edi, %eax # sched: [1:0.33]
+; SANDY-NEXT: xchgl %esi, %edi # sched: [1:0.33]
+; SANDY-NEXT: xchgl %edi, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xchg_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xchgl %edi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: xchgl %esi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: xchgl %edi, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xchg_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xchgl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: xchgl %esi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: xchgl %edi, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xchg_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xchgl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgl %esi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgl %edi, (%rdx) # sched: [10:1.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xchg_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xchgl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: xchgl %esi, %edi # sched: [1:0.25]
+; SKX-NEXT: xchgl %edi, (%rdx) # sched: [10:1.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xchg_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xchgl %edi, %eax # sched: [1:0.50]
+; BTVER2-NEXT: xchgl %esi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: xchgl %edi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xchg_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xchgl %edi, %eax # sched: [1:0.50]
+; ZNVER1-NEXT: xchgl %esi, %edi # sched: [1:0.50]
+; ZNVER1-NEXT: xchgl %edi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xchg %EAX, $0 \0A\09 xchg $1, $0 \0A\09 xchg $2, $0", "r,r,*m"(i32 %a0, i32 %a1, i32 *%a2) nounwind
+ ret void
+}
+define void @test_xchg_64(i64 %a0, i64 %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_xchg_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xchgq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: xchgq %rsi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: xchgq %rdi, (%rdx) # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xchg_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xchgq %rdi, %rax # sched: [2:1.00]
+; ATOM-NEXT: xchgq %rsi, %rdi # sched: [2:1.00]
+; ATOM-NEXT: xchgq %rdi, (%rdx) # sched: [3:1.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xchg_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xchgq %rdi, %rax # sched: [1:0.50]
+; SLM-NEXT: xchgq %rsi, %rdi # sched: [1:0.50]
+; SLM-NEXT: xchgq %rdi, (%rdx) # sched: [4:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xchg_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xchgq %rdi, %rax # sched: [1:0.33]
+; SANDY-NEXT: xchgq %rsi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: xchgq %rdi, (%rdx) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xchg_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xchgq %rdi, %rax # sched: [1:0.25]
+; HASWELL-NEXT: xchgq %rsi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: xchgq %rdi, (%rdx) # sched: [9:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xchg_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xchgq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: xchgq %rsi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: xchgq %rdi, (%rdx) # sched: [8:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xchg_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xchgq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgq %rsi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: xchgq %rdi, (%rdx) # sched: [10:1.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xchg_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xchgq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: xchgq %rsi, %rdi # sched: [1:0.25]
+; SKX-NEXT: xchgq %rdi, (%rdx) # sched: [10:1.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xchg_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xchgq %rdi, %rax # sched: [1:0.50]
+; BTVER2-NEXT: xchgq %rsi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: xchgq %rdi, (%rdx) # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xchg_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xchgq %rdi, %rax # sched: [1:0.50]
+; ZNVER1-NEXT: xchgq %rsi, %rdi # sched: [1:0.50]
+; ZNVER1-NEXT: xchgq %rdi, (%rdx) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xchg %RAX, $0 \0A\09 xchg $1, $0 \0A\09 xchg $2, $0", "r,r,*m"(i64 %a0, i64 %a1, i64 *%a2) nounwind
+ ret void
+}
+
+define void @test_xlat() optsize {
+; GENERIC-LABEL: test_xlat:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xlatb # sched: [4:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xlat:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xlatb # sched: [6:3.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xlat:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xlatb # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xlat:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xlatb # sched: [4:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xlat:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xlatb # sched: [7:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xlat:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xlatb # sched: [5:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xlat:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xlatb # sched: [5:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xlat:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xlatb # sched: [5:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xlat:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xlatb # sched: [5:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xlat:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xlatb # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xlat", ""() nounwind
+ ret void
+}
+
+define void @test_xor_8(i8 %a0, i8* %a1) optsize {
+; GENERIC-LABEL: test_xor_8:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xorb $7, %al # sched: [1:0.33]
+; GENERIC-NEXT: xorb $7, %dil # sched: [1:0.33]
+; GENERIC-NEXT: xorb $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorb %dil, %dil # sched: [1:0.33]
+; GENERIC-NEXT: xorb %dil, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorb (%rsi), %dil # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xor_8:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xorb $7, %al # sched: [1:0.50]
+; ATOM-NEXT: xorb $7, %dil # sched: [1:0.50]
+; ATOM-NEXT: xorb $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorb %dil, %dil # sched: [1:0.50]
+; ATOM-NEXT: xorb %dil, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorb (%rsi), %dil # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xor_8:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xorb $7, %al # sched: [1:0.50]
+; SLM-NEXT: xorb $7, %dil # sched: [1:0.50]
+; SLM-NEXT: xorb $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorb %dil, %dil # sched: [1:0.50]
+; SLM-NEXT: xorb %dil, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorb (%rsi), %dil # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xor_8:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xorb $7, %al # sched: [1:0.33]
+; SANDY-NEXT: xorb $7, %dil # sched: [1:0.33]
+; SANDY-NEXT: xorb $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorb %dil, %dil # sched: [1:0.33]
+; SANDY-NEXT: xorb %dil, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorb (%rsi), %dil # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xor_8:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xorb $7, %al # sched: [1:0.25]
+; HASWELL-NEXT: xorb $7, %dil # sched: [1:0.25]
+; HASWELL-NEXT: xorb $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorb %dil, %dil # sched: [1:0.25]
+; HASWELL-NEXT: xorb %dil, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorb (%rsi), %dil # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xor_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xorb $7, %al # sched: [1:0.25]
+; BROADWELL-NEXT: xorb $7, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: xorb $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorb %dil, %dil # sched: [1:0.25]
+; BROADWELL-NEXT: xorb %dil, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorb (%rsi), %dil # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xor_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xorb $7, %al # sched: [1:0.25]
+; SKYLAKE-NEXT: xorb $7, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: xorb $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorb %dil, %dil # sched: [1:0.25]
+; SKYLAKE-NEXT: xorb %dil, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorb (%rsi), %dil # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xor_8:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xorb $7, %al # sched: [1:0.25]
+; SKX-NEXT: xorb $7, %dil # sched: [1:0.25]
+; SKX-NEXT: xorb $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorb %dil, %dil # sched: [1:0.25]
+; SKX-NEXT: xorb %dil, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorb (%rsi), %dil # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xor_8:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xorb $7, %al # sched: [1:0.50]
+; BTVER2-NEXT: xorb $7, %dil # sched: [1:0.50]
+; BTVER2-NEXT: xorb $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorb %dil, %dil # sched: [1:0.50]
+; BTVER2-NEXT: xorb %dil, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorb (%rsi), %dil # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xor_8:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xorb $7, %al # sched: [1:0.25]
+; ZNVER1-NEXT: xorb $7, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: xorb $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorb %dil, %dil # sched: [1:0.25]
+; ZNVER1-NEXT: xorb %dil, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorb (%rsi), %dil # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xorb $2, %AL \0A\09 xorb $2, $0 \0A\09 xorb $2, $1 \0A\09 xorb $0, $0 \0A\09 xorb $0, $1 \0A\09 xorb $1, $0", "r,*m,i"(i8 %a0, i8* %a1, i8 7) nounwind
+ ret void
+}
+define void @test_xor_16(i16 %a0, i16* %a1) optsize {
+; GENERIC-LABEL: test_xor_16:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xorw $511, %ax # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: xorw $511, %di # imm = 0x1FF
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: xorw $7, %di # sched: [1:0.33]
+; GENERIC-NEXT: xorw $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorw %di, %di # sched: [1:0.33]
+; GENERIC-NEXT: xorw %di, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorw (%rsi), %di # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xor_16:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xorw $511, %ax # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: xorw $511, %di # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: xorw $7, %di # sched: [1:0.50]
+; ATOM-NEXT: xorw $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorw %di, %di # sched: [1:0.50]
+; ATOM-NEXT: xorw %di, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorw (%rsi), %di # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xor_16:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xorw $511, %ax # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: xorw $511, %di # imm = 0x1FF
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: xorw $7, %di # sched: [1:0.50]
+; SLM-NEXT: xorw $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorw %di, %di # sched: [1:0.50]
+; SLM-NEXT: xorw %di, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorw (%rsi), %di # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xor_16:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xorw $511, %ax # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: xorw $511, %di # imm = 0x1FF
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: xorw $7, %di # sched: [1:0.33]
+; SANDY-NEXT: xorw $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorw %di, %di # sched: [1:0.33]
+; SANDY-NEXT: xorw %di, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorw (%rsi), %di # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xor_16:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xorw $511, %ax # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: xorw $511, %di # imm = 0x1FF
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: xorw $7, %di # sched: [1:0.25]
+; HASWELL-NEXT: xorw $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorw %di, %di # sched: [1:0.25]
+; HASWELL-NEXT: xorw %di, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorw (%rsi), %di # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xor_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xorw $511, %ax # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: xorw $511, %di # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: xorw $7, %di # sched: [1:0.25]
+; BROADWELL-NEXT: xorw $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorw %di, %di # sched: [1:0.25]
+; BROADWELL-NEXT: xorw %di, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorw (%rsi), %di # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xor_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xorw $511, %ax # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: xorw $511, %di # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: xorw $7, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: xorw $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorw %di, %di # sched: [1:0.25]
+; SKYLAKE-NEXT: xorw %di, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorw (%rsi), %di # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xor_16:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xorw $511, %ax # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: xorw $511, %di # imm = 0x1FF
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: xorw $7, %di # sched: [1:0.25]
+; SKX-NEXT: xorw $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorw %di, %di # sched: [1:0.25]
+; SKX-NEXT: xorw %di, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorw (%rsi), %di # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xor_16:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xorw $511, %ax # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: xorw $511, %di # imm = 0x1FF
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: xorw $7, %di # sched: [1:0.50]
+; BTVER2-NEXT: xorw $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorw %di, %di # sched: [1:0.50]
+; BTVER2-NEXT: xorw %di, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorw (%rsi), %di # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xor_16:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xorw $511, %ax # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: xorw $511, %di # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: xorw $511, (%rsi) # imm = 0x1FF
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: xorw $7, %di # sched: [1:0.25]
+; ZNVER1-NEXT: xorw $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorw %di, %di # sched: [1:0.25]
+; ZNVER1-NEXT: xorw %di, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorw (%rsi), %di # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xorw $2, %AX \0A\09 xorw $2, $0 \0A\09 xorw $2, $1 \0A\09 xorw $3, $0 \0A\09 xorw $3, $1 \0A\09 xorw $0, $0 \0A\09 xorw $0, $1 \0A\09 xorw $1, $0", "r,*m,i,i"(i16 %a0, i16* %a1, i16 511, i8 7) nounwind
+ ret void
+}
+define void @test_xor_32(i32 %a0, i32* %a1) optsize {
+; GENERIC-LABEL: test_xor_32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: xorl $7, %edi # sched: [1:0.33]
+; GENERIC-NEXT: xorl $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorl %edi, %edi # sched: [1:0.33]
+; GENERIC-NEXT: xorl %edi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorl (%rsi), %edi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xor_32:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: xorl $7, %edi # sched: [1:0.50]
+; ATOM-NEXT: xorl $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorl %edi, %edi # sched: [1:0.50]
+; ATOM-NEXT: xorl %edi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorl (%rsi), %edi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xor_32:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: xorl $7, %edi # sched: [1:0.50]
+; SLM-NEXT: xorl $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorl %edi, %edi # sched: [1:0.50]
+; SLM-NEXT: xorl %edi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorl (%rsi), %edi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xor_32:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: xorl $7, %edi # sched: [1:0.33]
+; SANDY-NEXT: xorl $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorl %edi, %edi # sched: [1:0.33]
+; SANDY-NEXT: xorl %edi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorl (%rsi), %edi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xor_32:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: xorl $7, %edi # sched: [1:0.25]
+; HASWELL-NEXT: xorl $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorl %edi, %edi # sched: [1:0.25]
+; HASWELL-NEXT: xorl %edi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorl (%rsi), %edi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xor_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: xorl $7, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: xorl $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorl %edi, %edi # sched: [1:0.25]
+; BROADWELL-NEXT: xorl %edi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorl (%rsi), %edi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xor_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: xorl $7, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: xorl $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorl %edi, %edi # sched: [1:0.25]
+; SKYLAKE-NEXT: xorl %edi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorl (%rsi), %edi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xor_32:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: xorl $7, %edi # sched: [1:0.25]
+; SKX-NEXT: xorl $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorl %edi, %edi # sched: [1:0.25]
+; SKX-NEXT: xorl %edi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorl (%rsi), %edi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xor_32:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: xorl $7, %edi # sched: [1:0.50]
+; BTVER2-NEXT: xorl $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorl %edi, %edi # sched: [1:0.50]
+; BTVER2-NEXT: xorl %edi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorl (%rsi), %edi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xor_32:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xorl $665536, %eax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: xorl $665536, %edi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: xorl $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: xorl $7, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: xorl $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorl %edi, %edi # sched: [1:0.25]
+; ZNVER1-NEXT: xorl %edi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorl (%rsi), %edi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xorl $2, %EAX \0A\09 xorl $2, $0 \0A\09 xorl $2, $1 \0A\09 xorl $3, $0 \0A\09 xorl $3, $1 \0A\09 xorl $0, $0 \0A\09 xorl $0, $1 \0A\09 xorl $1, $0", "r,*m,i,i"(i32 %a0, i32* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
+define void @test_xor_64(i64 %a0, i64* %a1) optsize {
+; GENERIC-LABEL: test_xor_64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; GENERIC-NEXT: # sched: [7:1.00]
+; GENERIC-NEXT: xorq $7, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: xorq $7, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorq %rdi, %rdi # sched: [1:0.33]
+; GENERIC-NEXT: xorq %rdi, (%rsi) # sched: [7:1.00]
+; GENERIC-NEXT: xorq (%rsi), %rdi # sched: [6:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_xor_64:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:0.50]
+; ATOM-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; ATOM-NEXT: # sched: [1:1.00]
+; ATOM-NEXT: xorq $7, %rdi # sched: [1:0.50]
+; ATOM-NEXT: xorq $7, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorq %rdi, %rdi # sched: [1:0.50]
+; ATOM-NEXT: xorq %rdi, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: xorq (%rsi), %rdi # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_xor_64:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; SLM-NEXT: # sched: [1:0.50]
+; SLM-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; SLM-NEXT: # sched: [4:2.00]
+; SLM-NEXT: xorq $7, %rdi # sched: [1:0.50]
+; SLM-NEXT: xorq $7, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorq %rdi, %rdi # sched: [1:0.50]
+; SLM-NEXT: xorq %rdi, (%rsi) # sched: [4:2.00]
+; SLM-NEXT: xorq (%rsi), %rdi # sched: [4:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_xor_64:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; SANDY-NEXT: # sched: [1:0.33]
+; SANDY-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; SANDY-NEXT: # sched: [7:1.00]
+; SANDY-NEXT: xorq $7, %rdi # sched: [1:0.33]
+; SANDY-NEXT: xorq $7, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorq %rdi, %rdi # sched: [1:0.33]
+; SANDY-NEXT: xorq %rdi, (%rsi) # sched: [7:1.00]
+; SANDY-NEXT: xorq (%rsi), %rdi # sched: [6:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_xor_64:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [1:0.25]
+; HASWELL-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; HASWELL-NEXT: # sched: [7:1.00]
+; HASWELL-NEXT: xorq $7, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: xorq $7, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorq %rdi, %rdi # sched: [1:0.25]
+; HASWELL-NEXT: xorq %rdi, (%rsi) # sched: [7:1.00]
+; HASWELL-NEXT: xorq (%rsi), %rdi # sched: [6:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xor_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [1:0.25]
+; BROADWELL-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; BROADWELL-NEXT: # sched: [6:1.00]
+; BROADWELL-NEXT: xorq $7, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: xorq $7, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorq %rdi, %rdi # sched: [1:0.25]
+; BROADWELL-NEXT: xorq %rdi, (%rsi) # sched: [6:1.00]
+; BROADWELL-NEXT: xorq (%rsi), %rdi # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xor_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [1:0.25]
+; SKYLAKE-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; SKYLAKE-NEXT: # sched: [6:1.00]
+; SKYLAKE-NEXT: xorq $7, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: xorq $7, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorq %rdi, %rdi # sched: [1:0.25]
+; SKYLAKE-NEXT: xorq %rdi, (%rsi) # sched: [6:1.00]
+; SKYLAKE-NEXT: xorq (%rsi), %rdi # sched: [6:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xor_64:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; SKX-NEXT: # sched: [1:0.25]
+; SKX-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; SKX-NEXT: # sched: [6:1.00]
+; SKX-NEXT: xorq $7, %rdi # sched: [1:0.25]
+; SKX-NEXT: xorq $7, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorq %rdi, %rdi # sched: [1:0.25]
+; SKX-NEXT: xorq %rdi, (%rsi) # sched: [6:1.00]
+; SKX-NEXT: xorq (%rsi), %rdi # sched: [6:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_xor_64:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [1:0.50]
+; BTVER2-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; BTVER2-NEXT: # sched: [4:1.00]
+; BTVER2-NEXT: xorq $7, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: xorq $7, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorq %rdi, %rdi # sched: [1:0.50]
+; BTVER2-NEXT: xorq %rdi, (%rsi) # sched: [4:1.00]
+; BTVER2-NEXT: xorq (%rsi), %rdi # sched: [4:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_xor_64:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: xorq $665536, %rax # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: xorq $665536, %rdi # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [1:0.25]
+; ZNVER1-NEXT: xorq $665536, (%rsi) # imm = 0xA27C0
+; ZNVER1-NEXT: # sched: [5:0.50]
+; ZNVER1-NEXT: xorq $7, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: xorq $7, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorq %rdi, %rdi # sched: [1:0.25]
+; ZNVER1-NEXT: xorq %rdi, (%rsi) # sched: [5:0.50]
+; ZNVER1-NEXT: xorq (%rsi), %rdi # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void asm "xorq $2, %RAX \0A\09 xorq $2, $0 \0A\09 xorq $2, $1 \0A\09 xorq $3, $0 \0A\09 xorq $3, $1 \0A\09 xorq $0, $0 \0A\09 xorq $0, $1 \0A\09 xorq $1, $0", "r,*m,i,i"(i64 %a0, i64* %a1, i32 665536, i8 7) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/scheduler-backtracking.ll b/test/CodeGen/X86/scheduler-backtracking.ll
index 98471ee90d53..d62f07fa0f7d 100644
--- a/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/test/CodeGen/X86/scheduler-backtracking.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-ilp | FileCheck %s
-; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-hybrid | FileCheck %s
-; RUN: llc -march=x86-64 < %s -pre-RA-sched=source | FileCheck %s
-; RUN: llc -march=x86-64 < %s -pre-RA-sched=list-burr | FileCheck %s
-; RUN: llc -march=x86-64 < %s -pre-RA-sched=linearize | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-ilp | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-hybrid | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=source | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=list-burr | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s -pre-RA-sched=linearize | FileCheck %s
; PR22304 https://llvm.org/bugs/show_bug.cgi?id=22304
; Tests checking backtracking in source scheduler. llc used to crash on them.
diff --git a/test/CodeGen/X86/sdiv-exact.ll b/test/CodeGen/X86/sdiv-exact.ll
index a6ace5bc31c1..6efe867b9d49 100644
--- a/test/CodeGen/X86/sdiv-exact.ll
+++ b/test/CodeGen/X86/sdiv-exact.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -mattr=+sse2 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -mattr=+sse2 < %s | FileCheck %s
define i32 @test1(i32 %x) {
%div = sdiv exact i32 %x, 25
diff --git a/test/CodeGen/X86/sdiv-pow2.ll b/test/CodeGen/X86/sdiv-pow2.ll
index e89f76931e18..d3042f6ca4ba 100644
--- a/test/CodeGen/X86/sdiv-pow2.ll
+++ b/test/CodeGen/X86/sdiv-pow2.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
; No attributes, should not use idiv
define i32 @test1(i32 inreg %x) {
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index a0cd1824629a..a4861efac0e0 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -636,8 +636,27 @@ define void @test_nostack() #0 {
; X64-DFlyBSD-NOT: callq __morestack
}
+define void @test_nosplitstck() {
+ ret void
+}
+
attributes #0 = { "split-stack" }
; X64-Linux-Large: .rodata
; X64-Linux-Large-NEXT: __morestack_addr:
; X64-Linux-Large-NEXT: .quad __morestack
+
+; X32-Linux: .section ".note.GNU-split-stack","",@progbits
+; X32-Linux: .section ".note.GNU-no-split-stack","",@progbits
+
+; X64-Linux: .section ".note.GNU-split-stack","",@progbits
+; X64-Linux: .section ".note.GNU-no-split-stack","",@progbits
+
+; X64-FreeBSD: .section ".note.GNU-split-stack","",@progbits
+; X64-FreeBSD: .section ".note.GNU-no-split-stack","",@progbits
+
+; X32-DFlyBSD: .section ".note.GNU-split-stack","",@progbits
+; X32-DFlyBSD: .section ".note.GNU-no-split-stack","",@progbits
+
+; X64-DFlyBSD: .section ".note.GNU-split-stack","",@progbits
+; X64-DFlyBSD: .section ".note.GNU-no-split-stack","",@progbits
diff --git a/test/CodeGen/X86/select-mmx.ll b/test/CodeGen/X86/select-mmx.ll
index 9e6382faaa59..d452237e6e9d 100644
--- a/test/CodeGen/X86/select-mmx.ll
+++ b/test/CodeGen/X86/select-mmx.ll
@@ -13,7 +13,7 @@
define i64 @test47(i64 %arg) {
;
; X64-LABEL: test47:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: movl $7, %ecx
@@ -24,14 +24,11 @@ define i64 @test47(i64 %arg) {
; X64-NEXT: retq
;
; I32-LABEL: test47:
-; I32: # BB#0:
+; I32: # %bb.0:
; I32-NEXT: pushl %ebp
-; I32-NEXT: .Lcfi0:
; I32-NEXT: .cfi_def_cfa_offset 8
-; I32-NEXT: .Lcfi1:
; I32-NEXT: .cfi_offset %ebp, -8
; I32-NEXT: movl %esp, %ebp
-; I32-NEXT: .Lcfi2:
; I32-NEXT: .cfi_def_cfa_register %ebp
; I32-NEXT: andl $-8, %esp
; I32-NEXT: subl $16, %esp
@@ -39,7 +36,7 @@ define i64 @test47(i64 %arg) {
; I32-NEXT: orl 12(%ebp), %eax
; I32-NEXT: movl $7, %eax
; I32-NEXT: je .LBB0_2
-; I32-NEXT: # BB#1:
+; I32-NEXT: # %bb.1:
; I32-NEXT: xorl %eax, %eax
; I32-NEXT: .LBB0_2:
; I32-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -70,7 +67,7 @@ define i64 @test47(i64 %arg) {
define i64 @test49(i64 %arg, i64 %x, i64 %y) {
;
; X64-LABEL: test49:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: cmovneq %rdx, %rsi
; X64-NEXT: movd %rsi, %mm0
@@ -79,21 +76,18 @@ define i64 @test49(i64 %arg, i64 %x, i64 %y) {
; X64-NEXT: retq
;
; I32-LABEL: test49:
-; I32: # BB#0:
+; I32: # %bb.0:
; I32-NEXT: pushl %ebp
-; I32-NEXT: .Lcfi3:
; I32-NEXT: .cfi_def_cfa_offset 8
-; I32-NEXT: .Lcfi4:
; I32-NEXT: .cfi_offset %ebp, -8
; I32-NEXT: movl %esp, %ebp
-; I32-NEXT: .Lcfi5:
; I32-NEXT: .cfi_def_cfa_register %ebp
; I32-NEXT: andl $-8, %esp
; I32-NEXT: subl $8, %esp
; I32-NEXT: movl 8(%ebp), %eax
; I32-NEXT: orl 12(%ebp), %eax
; I32-NEXT: je .LBB1_1
-; I32-NEXT: # BB#2:
+; I32-NEXT: # %bb.2:
; I32-NEXT: leal 24(%ebp), %eax
; I32-NEXT: jmp .LBB1_3
; I32-NEXT: .LBB1_1:
diff --git a/test/CodeGen/X86/select-with-and-or.ll b/test/CodeGen/X86/select-with-and-or.ll
index 45e4384d0fa1..f710a5ce4099 100644
--- a/test/CodeGen/X86/select-with-and-or.ll
+++ b/test/CodeGen/X86/select-with-and-or.ll
@@ -3,7 +3,7 @@
define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -14,7 +14,7 @@ define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vorps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -25,7 +25,7 @@ define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -36,7 +36,7 @@ define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vorps %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -47,7 +47,7 @@ define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%f = fcmp ult <4 x float> %a, %b
@@ -57,7 +57,7 @@ define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%not.f = fcmp oge <4 x float> %a, %b
@@ -67,7 +67,7 @@ define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandps (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -81,7 +81,7 @@ define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) {
define <2 x double> @test1f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test1f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vandpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -92,7 +92,7 @@ define <2 x double> @test1f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
define <2 x double> @test2f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test2f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmplepd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vorpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -103,7 +103,7 @@ define <2 x double> @test2f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
define <2 x double> @test3f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test3f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnltpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vandpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -114,7 +114,7 @@ define <2 x double> @test3f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
define <2 x double> @test4f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test4f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnlepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vorpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -125,7 +125,7 @@ define <2 x double> @test4f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
define <2 x double> @test5f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test5f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpnlepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%f = fcmp ugt <2 x double> %a, %b
@@ -135,7 +135,7 @@ define <2 x double> @test5f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
define <2 x double> @test6f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: test6f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%f = fcmp ule <2 x double> %a, %b
@@ -145,7 +145,7 @@ define <2 x double> @test6f(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
define <2 x double> @test7f(<2 x double> %a, <2 x double> %b, <2 x double>* %p) {
; CHECK-LABEL: test7f:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vandpd (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index ec15d1a9520a..d3a8d9d2af45 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -8,20 +8,19 @@
define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; CHECK-LABEL: test1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: addq $8, %rdi
; CHECK-NEXT: addq $8, %rsi
; CHECK-NEXT: testb $1, %dl
; CHECK-NEXT: cmovneq %rdi, %rsi
; CHECK-NEXT: movl (%rsi), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test1:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: testb $1, %cl
; MCU-NEXT: jne .LBB0_1
-; MCU-NEXT: # BB#2:
+; MCU-NEXT: # %bb.2:
; MCU-NEXT: addl $8, %edx
; MCU-NEXT: movl %edx, %eax
; MCU-NEXT: movl (%eax), %eax
@@ -39,44 +38,56 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
; PR2139
define i32 @test2() nounwind {
-; CHECK-LABEL: test2:
-; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq _return_false
-; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: testb $1, %al
-; CHECK-NEXT: movw $-480, %ax ## imm = 0xFE20
-; CHECK-NEXT: cmovnew %cx, %ax
-; CHECK-NEXT: cwtl
-; CHECK-NEXT: shll $3, %eax
-; CHECK-NEXT: cmpl $32768, %eax ## imm = 0x8000
-; CHECK-NEXT: jge LBB1_1
-; CHECK-NEXT: ## BB#2: ## %bb91
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: popq %rcx
-; CHECK-NEXT: retq
-; CHECK-NEXT: LBB1_1: ## %bb90
-; CHECK-NEXT: ## -- End function
+; GENERIC-LABEL: test2:
+; GENERIC: ## %bb.0: ## %entry
+; GENERIC-NEXT: pushq %rax
+; GENERIC-NEXT: callq _return_false
+; GENERIC-NEXT: xorl %ecx, %ecx
+; GENERIC-NEXT: testb $1, %al
+; GENERIC-NEXT: movl $-480, %eax ## imm = 0xFE20
+; GENERIC-NEXT: cmovnel %ecx, %eax
+; GENERIC-NEXT: shll $3, %eax
+; GENERIC-NEXT: cmpl $32768, %eax ## imm = 0x8000
+; GENERIC-NEXT: jge LBB1_1
+; GENERIC-NEXT: ## %bb.2: ## %bb91
+; GENERIC-NEXT: xorl %eax, %eax
+; GENERIC-NEXT: popq %rcx
+; GENERIC-NEXT: retq
+; GENERIC-NEXT: LBB1_1: ## %bb90
+;
+; ATOM-LABEL: test2:
+; ATOM: ## %bb.0: ## %entry
+; ATOM-NEXT: pushq %rax
+; ATOM-NEXT: callq _return_false
+; ATOM-NEXT: xorl %ecx, %ecx
+; ATOM-NEXT: movl $-480, %edx ## imm = 0xFE20
+; ATOM-NEXT: testb $1, %al
+; ATOM-NEXT: cmovnel %ecx, %edx
+; ATOM-NEXT: shll $3, %edx
+; ATOM-NEXT: cmpl $32768, %edx ## imm = 0x8000
+; ATOM-NEXT: jge LBB1_1
+; ATOM-NEXT: ## %bb.2: ## %bb91
+; ATOM-NEXT: xorl %eax, %eax
+; ATOM-NEXT: popq %rcx
+; ATOM-NEXT: retq
+; ATOM-NEXT: LBB1_1: ## %bb90
;
; MCU-LABEL: test2:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: calll return_false
+; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: testb $1, %al
-; MCU-NEXT: jne .LBB1_1
-; MCU-NEXT: # BB#2: # %entry
-; MCU-NEXT: movw $-480, %ax # imm = 0xFE20
-; MCU-NEXT: jmp .LBB1_3
-; MCU-NEXT: .LBB1_1:
-; MCU-NEXT: xorl %eax, %eax
-; MCU-NEXT: .LBB1_3: # %entry
-; MCU-NEXT: cwtl
-; MCU-NEXT: shll $3, %eax
-; MCU-NEXT: cmpl $32768, %eax # imm = 0x8000
-; MCU-NEXT: jge .LBB1_4
-; MCU-NEXT: # BB#5: # %bb91
+; MCU-NEXT: jne .LBB1_2
+; MCU-NEXT: # %bb.1: # %entry
+; MCU-NEXT: movl $-480, %ecx # imm = 0xFE20
+; MCU-NEXT: .LBB1_2: # %entry
+; MCU-NEXT: shll $3, %ecx
+; MCU-NEXT: cmpl $32768, %ecx # imm = 0x8000
+; MCU-NEXT: jge .LBB1_3
+; MCU-NEXT: # %bb.4: # %bb91
; MCU-NEXT: xorl %eax, %eax
; MCU-NEXT: retl
-; MCU-NEXT: .LBB1_4: # %bb90
+; MCU-NEXT: .LBB1_3: # %bb90
entry:
%tmp73 = tail call i1 @return_false()
%g.0 = select i1 %tmp73, i16 0, i16 -480
@@ -95,17 +106,16 @@ declare i1 @return_false()
;; Select between two floating point constants.
define float @test3(i32 %x) nounwind readnone {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: sete %al
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test3:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: testl %eax, %eax
; MCU-NEXT: sete %cl
@@ -119,24 +129,23 @@ entry:
define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
; CHECK-LABEL: test4:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: ucomisd %xmm0, %xmm1
; CHECK-NEXT: seta %al
; CHECK-NEXT: movsbl (%rdi,%rax,4), %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test4:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: movl %eax, %ecx
; MCU-NEXT: fldl {{[0-9]+}}(%esp)
; MCU-NEXT: flds {{\.LCPI.*}}
; MCU-NEXT: fucompp
; MCU-NEXT: fnstsw %ax
; MCU-NEXT: xorl %edx, %edx
-; MCU-NEXT: # kill: %AH<def> %AH<kill> %AX<kill>
+; MCU-NEXT: # kill: def %ah killed %ah killed %ax
; MCU-NEXT: sahf
; MCU-NEXT: seta %dl
; MCU-NEXT: movb (%ecx,%edx,4), %al
@@ -151,34 +160,29 @@ entry:
define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
; CHECK-LABEL: test5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: jne LBB4_2
-; CHECK-NEXT: ## BB#1:
+; CHECK-NEXT: ## %bb.1:
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: LBB4_2:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: movd %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test5:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: pushl %esi
-; MCU-NEXT: andb $1, %al
+; MCU-NEXT: movl {{[0-9]+}}(%esp), %esi
+; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB4_2
-; MCU-NEXT: # BB#1:
-; MCU-NEXT: movw {{[0-9]+}}(%esp), %dx
+; MCU-NEXT: # %bb.1:
+; MCU-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; MCU-NEXT: .LBB4_2:
-; MCU-NEXT: movl {{[0-9]+}}(%esp), %esi
-; MCU-NEXT: testb %al, %al
-; MCU-NEXT: jne .LBB4_4
-; MCU-NEXT: # BB#3:
-; MCU-NEXT: movw {{[0-9]+}}(%esp), %cx
-; MCU-NEXT: .LBB4_4:
-; MCU-NEXT: movw %dx, (%esi)
; MCU-NEXT: movw %cx, 2(%esi)
+; MCU-NEXT: movw %dx, (%esi)
; MCU-NEXT: popl %esi
; MCU-NEXT: retl
%x = select i1 %c, <2 x i16> %a, <2 x i16> %b
@@ -189,10 +193,10 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
; Verify that the fmul gets sunk into the one part of the diamond where it is needed.
define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-LABEL: test6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je LBB5_1
-; CHECK-NEXT: ## BB#2:
+; CHECK-NEXT: ## %bb.2:
; CHECK-NEXT: movaps (%rsi), %xmm0
; CHECK-NEXT: movaps %xmm0, (%rsi)
; CHECK-NEXT: retq
@@ -201,10 +205,9 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK-NEXT: mulps %xmm0, %xmm0
; CHECK-NEXT: movaps %xmm0, (%rsi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test6:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: pushl %eax
; MCU-NEXT: flds 12(%edx)
; MCU-NEXT: fstps (%esp) # 4-byte Folded Spill
@@ -224,7 +227,7 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; MCU-NEXT: testl %eax, %eax
; MCU-NEXT: flds (%edx)
; MCU-NEXT: je .LBB5_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: fstp %st(1)
; MCU-NEXT: fstp %st(3)
; MCU-NEXT: fstp %st(1)
@@ -265,7 +268,7 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
; Select with fp80's
define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK-LABEL: test7:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: setns %al
@@ -273,10 +276,9 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
; CHECK-NEXT: fldt (%rax,%rcx)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test7:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: testl %eax, %eax
; MCU-NEXT: setns %cl
@@ -291,153 +293,138 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
; widening select v6i32 and then a sub
define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) nounwind {
; GENERIC-LABEL: test8:
-; GENERIC: ## BB#0:
-; GENERIC-NEXT: andb $1, %dil
+; GENERIC: ## %bb.0:
+; GENERIC-NEXT: testb $1, %dil
; GENERIC-NEXT: jne LBB7_1
-; GENERIC-NEXT: ## BB#2:
-; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; GENERIC-NEXT: ## %bb.2:
; GENERIC-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; GENERIC-NEXT: jmp LBB7_3
-; GENERIC-NEXT: LBB7_1:
; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; GENERIC-NEXT: LBB7_3:
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; GENERIC-NEXT: testb %dil, %dil
-; GENERIC-NEXT: jne LBB7_4
-; GENERIC-NEXT: ## BB#5:
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; GENERIC-NEXT: jmp LBB7_3
+; GENERIC-NEXT: LBB7_1:
+; GENERIC-NEXT: movd %r9d, %xmm0
+; GENERIC-NEXT: movd %r8d, %xmm1
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; GENERIC-NEXT: movd %ecx, %xmm2
+; GENERIC-NEXT: movd %edx, %xmm0
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; GENERIC-NEXT: jmp LBB7_6
-; GENERIC-NEXT: LBB7_4:
-; GENERIC-NEXT: movd %r9d, %xmm1
-; GENERIC-NEXT: movd %r8d, %xmm2
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; GENERIC-NEXT: movd %ecx, %xmm3
-; GENERIC-NEXT: movd %edx, %xmm1
-; GENERIC-NEXT: LBB7_6:
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; GENERIC-NEXT: LBB7_3:
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; GENERIC-NEXT: pcmpeqd %xmm2, %xmm2
-; GENERIC-NEXT: paddd %xmm2, %xmm1
; GENERIC-NEXT: paddd %xmm2, %xmm0
-; GENERIC-NEXT: movq %xmm0, 16(%rsi)
-; GENERIC-NEXT: movdqa %xmm1, (%rsi)
+; GENERIC-NEXT: paddd %xmm2, %xmm1
+; GENERIC-NEXT: movq %xmm1, 16(%rsi)
+; GENERIC-NEXT: movdqa %xmm0, (%rsi)
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test8:
-; ATOM: ## BB#0:
-; ATOM-NEXT: andb $1, %dil
+; ATOM: ## %bb.0:
+; ATOM-NEXT: testb $1, %dil
; ATOM-NEXT: jne LBB7_1
-; ATOM-NEXT: ## BB#2:
-; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ATOM-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ATOM-NEXT: jmp LBB7_3
-; ATOM-NEXT: LBB7_1:
-; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ATOM-NEXT: ## %bb.2:
; ATOM-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ATOM-NEXT: LBB7_3:
-; ATOM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ATOM-NEXT: testb %dil, %dil
-; ATOM-NEXT: jne LBB7_4
-; ATOM-NEXT: ## BB#5:
; ATOM-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; ATOM-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; ATOM-NEXT: jmp LBB7_6
-; ATOM-NEXT: LBB7_4:
-; ATOM-NEXT: movd %r9d, %xmm1
+; ATOM-NEXT: jmp LBB7_3
+; ATOM-NEXT: LBB7_1:
+; ATOM-NEXT: movd %r9d, %xmm0
; ATOM-NEXT: movd %r8d, %xmm2
-; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; ATOM-NEXT: movd %ecx, %xmm3
-; ATOM-NEXT: movd %edx, %xmm1
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; ATOM-NEXT: LBB7_6:
+; ATOM-NEXT: movd %edx, %xmm0
+; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ATOM-NEXT: LBB7_3:
+; ATOM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; ATOM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; ATOM-NEXT: pcmpeqd %xmm2, %xmm2
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; ATOM-NEXT: paddd %xmm2, %xmm0
; ATOM-NEXT: paddd %xmm2, %xmm1
-; ATOM-NEXT: movq %xmm0, 16(%rsi)
-; ATOM-NEXT: movdqa %xmm1, (%rsi)
+; ATOM-NEXT: movdqa %xmm0, (%rsi)
+; ATOM-NEXT: movq %xmm1, 16(%rsi)
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test8:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: pushl %ebp
; MCU-NEXT: pushl %ebx
; MCU-NEXT: pushl %edi
; MCU-NEXT: pushl %esi
-; MCU-NEXT: andb $1, %al
+; MCU-NEXT: testb $1, %al
; MCU-NEXT: jne .LBB7_1
-; MCU-NEXT: # BB#2:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; MCU-NEXT: movl (%ecx), %ecx
+; MCU-NEXT: # %bb.2:
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
+; MCU-NEXT: movl (%eax), %eax
; MCU-NEXT: je .LBB7_5
; MCU-NEXT: .LBB7_4:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
-; MCU-NEXT: movl (%esi), %esi
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT: movl (%ecx), %ecx
; MCU-NEXT: je .LBB7_8
; MCU-NEXT: .LBB7_7:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
-; MCU-NEXT: movl (%edi), %edi
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
+; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: je .LBB7_11
; MCU-NEXT: .LBB7_10:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
-; MCU-NEXT: movl (%ebx), %ebx
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
+; MCU-NEXT: movl (%edi), %edi
; MCU-NEXT: je .LBB7_14
; MCU-NEXT: .LBB7_13:
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; MCU-NEXT: movl (%ebx), %ebx
+; MCU-NEXT: je .LBB7_17
+; MCU-NEXT: .LBB7_16:
; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
-; MCU-NEXT: jmp .LBB7_15
+; MCU-NEXT: jmp .LBB7_18
; MCU-NEXT: .LBB7_1:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; MCU-NEXT: movl (%ecx), %ecx
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
+; MCU-NEXT: movl (%eax), %eax
; MCU-NEXT: jne .LBB7_4
; MCU-NEXT: .LBB7_5:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
-; MCU-NEXT: movl (%esi), %esi
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT: movl (%ecx), %ecx
; MCU-NEXT: jne .LBB7_7
; MCU-NEXT: .LBB7_8:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
-; MCU-NEXT: movl (%edi), %edi
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %esi
+; MCU-NEXT: movl (%esi), %esi
; MCU-NEXT: jne .LBB7_10
; MCU-NEXT: .LBB7_11:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
-; MCU-NEXT: movl (%ebx), %ebx
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %edi
+; MCU-NEXT: movl (%edi), %edi
; MCU-NEXT: jne .LBB7_13
; MCU-NEXT: .LBB7_14:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
-; MCU-NEXT: .LBB7_15:
-; MCU-NEXT: movl (%ebp), %ebp
-; MCU-NEXT: testb %al, %al
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebx
+; MCU-NEXT: movl (%ebx), %ebx
; MCU-NEXT: jne .LBB7_16
-; MCU-NEXT: # BB#17:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
-; MCU-NEXT: jmp .LBB7_18
-; MCU-NEXT: .LBB7_16:
-; MCU-NEXT: leal {{[0-9]+}}(%esp), %eax
+; MCU-NEXT: .LBB7_17:
+; MCU-NEXT: leal {{[0-9]+}}(%esp), %ebp
; MCU-NEXT: .LBB7_18:
-; MCU-NEXT: movl (%eax), %eax
-; MCU-NEXT: decl %eax
+; MCU-NEXT: movl (%ebp), %ebp
; MCU-NEXT: decl %ebp
; MCU-NEXT: decl %ebx
; MCU-NEXT: decl %edi
; MCU-NEXT: decl %esi
; MCU-NEXT: decl %ecx
-; MCU-NEXT: movl %ecx, 20(%edx)
-; MCU-NEXT: movl %esi, 16(%edx)
-; MCU-NEXT: movl %edi, 12(%edx)
-; MCU-NEXT: movl %ebx, 8(%edx)
-; MCU-NEXT: movl %ebp, 4(%edx)
-; MCU-NEXT: movl %eax, (%edx)
+; MCU-NEXT: decl %eax
+; MCU-NEXT: movl %eax, 20(%edx)
+; MCU-NEXT: movl %ecx, 16(%edx)
+; MCU-NEXT: movl %esi, 12(%edx)
+; MCU-NEXT: movl %edi, 8(%edx)
+; MCU-NEXT: movl %ebx, 4(%edx)
+; MCU-NEXT: movl %ebp, (%edx)
; MCU-NEXT: popl %esi
; MCU-NEXT: popl %edi
; MCU-NEXT: popl %ebx
@@ -454,28 +441,26 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-LABEL: test9:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpq $1, %rdi
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpq $1, %rdi
; ATOM-NEXT: sbbq %rax, %rax
; ATOM-NEXT: orq %rsi, %rax
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: orl %edx, %eax
; MCU-NEXT: jne .LBB8_1
-; MCU-NEXT: # BB#2:
+; MCU-NEXT: # %bb.2:
; MCU-NEXT: movl $-1, %eax
; MCU-NEXT: movl $-1, %edx
; MCU-NEXT: retl
@@ -491,30 +476,28 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
;; Same as test9
define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-LABEL: test9a:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpq $1, %rdi
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9a:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpq $1, %rdi
; ATOM-NEXT: sbbq %rax, %rax
; ATOM-NEXT: orq %rsi, %rax
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9a:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: orl %edx, %eax
; MCU-NEXT: movl $-1, %eax
; MCU-NEXT: movl $-1, %edx
; MCU-NEXT: je .LBB9_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: movl {{[0-9]+}}(%esp), %edx
; MCU-NEXT: .LBB9_2:
@@ -526,31 +509,28 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; GENERIC-LABEL: test9b:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpq $1, %rdi
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: orq %rsi, %rax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test9b:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpq $1, %rdi
; ATOM-NEXT: sbbq %rax, %rax
; ATOM-NEXT: orq %rsi, %rax
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test9b:
-; MCU: # BB#0:
-; MCU-NEXT: orl %edx, %eax
-; MCU-NEXT: movl $-1, %edx
-; MCU-NEXT: je .LBB10_2
-; MCU-NEXT: # BB#1:
+; MCU: # %bb.0:
+; MCU-NEXT: movl %edx, %ecx
; MCU-NEXT: xorl %edx, %edx
-; MCU-NEXT: .LBB10_2:
+; MCU-NEXT: orl %ecx, %eax
+; MCU-NEXT: sete %dl
+; MCU-NEXT: negl %edx
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: orl %edx, %eax
; MCU-NEXT: orl {{[0-9]+}}(%esp), %edx
@@ -563,31 +543,21 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
;; Select between -1 and 1.
define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
-; GENERIC-LABEL: test10:
-; GENERIC: ## BB#0:
-; GENERIC-NEXT: cmpq $1, %rdi
-; GENERIC-NEXT: sbbq %rax, %rax
-; GENERIC-NEXT: orq $1, %rax
-; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
-;
-; ATOM-LABEL: test10:
-; ATOM: ## BB#0:
-; ATOM-NEXT: cmpq $1, %rdi
-; ATOM-NEXT: sbbq %rax, %rax
-; ATOM-NEXT: orq $1, %rax
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
+; CHECK-LABEL: test10:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: leaq -1(%rax,%rax), %rax
+; CHECK-NEXT: retq
;
; MCU-LABEL: test10:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: orl %edx, %eax
; MCU-NEXT: movl $-1, %eax
; MCU-NEXT: movl $-1, %edx
; MCU-NEXT: je .LBB11_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: xorl %edx, %edx
; MCU-NEXT: movl $1, %eax
; MCU-NEXT: .LBB11_2:
@@ -599,19 +569,18 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-LABEL: test11:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpq $1, %rdi
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: orl %edx, %eax
; MCU-NEXT: je .LBB12_1
-; MCU-NEXT: # BB#2:
+; MCU-NEXT: # %bb.2:
; MCU-NEXT: movl $-1, %eax
; MCU-NEXT: movl $-1, %edx
; MCU-NEXT: retl
@@ -626,21 +595,20 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
; CHECK-LABEL: test11a:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpq $1, %rdi
; CHECK-NEXT: sbbq %rax, %rax
; CHECK-NEXT: notq %rax
; CHECK-NEXT: orq %rsi, %rax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
;
; MCU-LABEL: test11a:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: orl %edx, %eax
; MCU-NEXT: movl $-1, %eax
; MCU-NEXT: movl $-1, %edx
; MCU-NEXT: jne .LBB13_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl {{[0-9]+}}(%esp), %eax
; MCU-NEXT: movl {{[0-9]+}}(%esp), %edx
; MCU-NEXT: .LBB13_2:
@@ -655,27 +623,25 @@ declare noalias i8* @_Znam(i64) noredzone
define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; GENERIC-LABEL: test12:
-; GENERIC: ## BB#0: ## %entry
+; GENERIC: ## %bb.0: ## %entry
; GENERIC-NEXT: movl $4, %ecx
; GENERIC-NEXT: movq %rdi, %rax
; GENERIC-NEXT: mulq %rcx
; GENERIC-NEXT: movq $-1, %rdi
; GENERIC-NEXT: cmovnoq %rax, %rdi
; GENERIC-NEXT: jmp __Znam ## TAILCALL
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test12:
-; ATOM: ## BB#0: ## %entry
+; ATOM: ## %bb.0: ## %entry
; ATOM-NEXT: movq %rdi, %rax
; ATOM-NEXT: movl $4, %ecx
; ATOM-NEXT: mulq %rcx
; ATOM-NEXT: movq $-1, %rdi
; ATOM-NEXT: cmovnoq %rax, %rdi
; ATOM-NEXT: jmp __Znam ## TAILCALL
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test12:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: pushl %ebp
; MCU-NEXT: pushl %ebx
; MCU-NEXT: pushl %edi
@@ -697,7 +663,7 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
; MCU-NEXT: movl $-1, %eax
; MCU-NEXT: movl $-1, %edx
; MCU-NEXT: jne .LBB14_2
-; MCU-NEXT: # BB#1: # %entry
+; MCU-NEXT: # %bb.1: # %entry
; MCU-NEXT: movl %esi, %eax
; MCU-NEXT: movl %edi, %edx
; MCU-NEXT: .LBB14_2: # %entry
@@ -719,14 +685,13 @@ declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
define i32 @test13(i32 %a, i32 %b) nounwind {
; GENERIC-LABEL: test13:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpl %esi, %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test13:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpl %esi, %edi
; ATOM-NEXT: sbbl %eax, %eax
; ATOM-NEXT: nop
@@ -734,10 +699,9 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test13:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: cmpl %edx, %eax
; MCU-NEXT: sbbl %eax, %eax
; MCU-NEXT: retl
@@ -747,29 +711,21 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
}
define i32 @test14(i32 %a, i32 %b) nounwind {
-; GENERIC-LABEL: test14:
-; GENERIC: ## BB#0:
-; GENERIC-NEXT: cmpl %esi, %edi
-; GENERIC-NEXT: sbbl %eax, %eax
-; GENERIC-NEXT: notl %eax
-; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
-;
-; ATOM-LABEL: test14:
-; ATOM: ## BB#0:
-; ATOM-NEXT: cmpl %esi, %edi
-; ATOM-NEXT: sbbl %eax, %eax
-; ATOM-NEXT: notl %eax
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
+; CHECK-LABEL: test14:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl %esi, %edi
+; CHECK-NEXT: setae %al
+; CHECK-NEXT: negl %eax
+; CHECK-NEXT: retq
;
; MCU-LABEL: test14:
-; MCU: # BB#0:
+; MCU: # %bb.0:
+; MCU-NEXT: xorl %ecx, %ecx
; MCU-NEXT: cmpl %edx, %eax
-; MCU-NEXT: sbbl %eax, %eax
-; MCU-NEXT: notl %eax
+; MCU-NEXT: setae %cl
+; MCU-NEXT: negl %ecx
+; MCU-NEXT: movl %ecx, %eax
; MCU-NEXT: retl
%c = icmp uge i32 %a, %b
%d = sext i1 %c to i32
@@ -779,14 +735,13 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
; rdar://10961709
define i32 @test15(i32 %x) nounwind {
; GENERIC-LABEL: test15:
-; GENERIC: ## BB#0: ## %entry
+; GENERIC: ## %bb.0: ## %entry
; GENERIC-NEXT: negl %edi
; GENERIC-NEXT: sbbl %eax, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test15:
-; ATOM: ## BB#0: ## %entry
+; ATOM: ## %bb.0: ## %entry
; ATOM-NEXT: negl %edi
; ATOM-NEXT: sbbl %eax, %eax
; ATOM-NEXT: nop
@@ -794,10 +749,9 @@ define i32 @test15(i32 %x) nounwind {
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test15:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: negl %eax
; MCU-NEXT: sbbl %eax, %eax
; MCU-NEXT: retl
@@ -809,13 +763,13 @@ entry:
define i64 @test16(i64 %x) nounwind uwtable readnone ssp {
; GENERIC-LABEL: test16:
-; GENERIC: ## BB#0: ## %entry
+; GENERIC: ## %bb.0: ## %entry
; GENERIC-NEXT: negq %rdi
; GENERIC-NEXT: sbbq %rax, %rax
; GENERIC-NEXT: retq
;
; ATOM-LABEL: test16:
-; ATOM: ## BB#0: ## %entry
+; ATOM: ## %bb.0: ## %entry
; ATOM-NEXT: negq %rdi
; ATOM-NEXT: sbbq %rax, %rax
; ATOM-NEXT: nop
@@ -825,13 +779,12 @@ define i64 @test16(i64 %x) nounwind uwtable readnone ssp {
; ATOM-NEXT: retq
;
; MCU-LABEL: test16:
-; MCU: # BB#0: # %entry
-; MCU-NEXT: orl %edx, %eax
-; MCU-NEXT: movl $-1, %eax
-; MCU-NEXT: jne .LBB18_2
-; MCU-NEXT: # BB#1: # %entry
+; MCU: # %bb.0: # %entry
+; MCU-NEXT: movl %eax, %ecx
; MCU-NEXT: xorl %eax, %eax
-; MCU-NEXT: .LBB18_2: # %entry
+; MCU-NEXT: orl %edx, %ecx
+; MCU-NEXT: setne %al
+; MCU-NEXT: negl %eax
; MCU-NEXT: movl %eax, %edx
; MCU-NEXT: retl
entry:
@@ -842,27 +795,28 @@ entry:
define i16 @test17(i16 %x) nounwind {
; GENERIC-LABEL: test17:
-; GENERIC: ## BB#0: ## %entry
+; GENERIC: ## %bb.0: ## %entry
; GENERIC-NEXT: negw %di
-; GENERIC-NEXT: sbbw %ax, %ax
+; GENERIC-NEXT: sbbl %eax, %eax
+; GENERIC-NEXT: ## kill: def %ax killed %ax killed %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test17:
-; ATOM: ## BB#0: ## %entry
+; ATOM: ## %bb.0: ## %entry
; ATOM-NEXT: negw %di
-; ATOM-NEXT: sbbw %ax, %ax
+; ATOM-NEXT: sbbl %eax, %eax
+; ATOM-NEXT: ## kill: def %ax killed %ax killed %eax
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test17:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: negw %ax
-; MCU-NEXT: sbbw %ax, %ax
+; MCU-NEXT: sbbl %eax, %eax
+; MCU-NEXT: # kill: def %ax killed %ax killed %eax
; MCU-NEXT: retl
entry:
%cmp = icmp ne i16 %x, 0
@@ -872,28 +826,26 @@ entry:
define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
; GENERIC-LABEL: test18:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpl $15, %edi
; GENERIC-NEXT: cmovgel %edx, %esi
; GENERIC-NEXT: movl %esi, %eax
; GENERIC-NEXT: retq
-; GENERIC-NEXT: ## -- End function
;
; ATOM-LABEL: test18:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpl $15, %edi
; ATOM-NEXT: cmovgel %edx, %esi
; ATOM-NEXT: movl %esi, %eax
; ATOM-NEXT: nop
; ATOM-NEXT: nop
; ATOM-NEXT: retq
-; ATOM-NEXT: ## -- End function
;
; MCU-LABEL: test18:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: cmpl $15, %eax
; MCU-NEXT: jl .LBB20_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl %ecx, %edx
; MCU-NEXT: .LBB20_2:
; MCU-NEXT: movl %edx, %eax
@@ -905,7 +857,7 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
; CHECK-LABEL: trunc_select_miscompile:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: orb $2, %sil
; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: shll %cl, %edi
@@ -913,7 +865,7 @@ define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
; CHECK-NEXT: retq
;
; MCU-LABEL: trunc_select_miscompile:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: orb $2, %dl
; MCU-NEXT: movl %edx, %ecx
; MCU-NEXT: shll %cl, %eax
@@ -926,45 +878,45 @@ define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
; reproducer for pr29002
define void @clamp_i8(i32 %src, i8* %dst) {
; GENERIC-LABEL: clamp_i8:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpl $127, %edi
; GENERIC-NEXT: movl $127, %eax
; GENERIC-NEXT: cmovlel %edi, %eax
; GENERIC-NEXT: cmpl $-128, %eax
; GENERIC-NEXT: movb $-128, %cl
; GENERIC-NEXT: jl LBB22_2
-; GENERIC-NEXT: ## BB#1:
+; GENERIC-NEXT: ## %bb.1:
; GENERIC-NEXT: movl %eax, %ecx
; GENERIC-NEXT: LBB22_2:
; GENERIC-NEXT: movb %cl, (%rsi)
; GENERIC-NEXT: retq
;
; ATOM-LABEL: clamp_i8:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpl $127, %edi
; ATOM-NEXT: movl $127, %eax
; ATOM-NEXT: cmovlel %edi, %eax
; ATOM-NEXT: movb $-128, %cl
; ATOM-NEXT: cmpl $-128, %eax
; ATOM-NEXT: jl LBB22_2
-; ATOM-NEXT: ## BB#1:
+; ATOM-NEXT: ## %bb.1:
; ATOM-NEXT: movl %eax, %ecx
; ATOM-NEXT: LBB22_2:
; ATOM-NEXT: movb %cl, (%rsi)
; ATOM-NEXT: retq
;
; MCU-LABEL: clamp_i8:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: cmpl $127, %eax
; MCU-NEXT: movl $127, %ecx
; MCU-NEXT: jg .LBB22_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl %eax, %ecx
; MCU-NEXT: .LBB22_2:
; MCU-NEXT: cmpl $-128, %ecx
; MCU-NEXT: movb $-128, %al
; MCU-NEXT: jl .LBB22_4
-; MCU-NEXT: # BB#3:
+; MCU-NEXT: # %bb.3:
; MCU-NEXT: movl %ecx, %eax
; MCU-NEXT: .LBB22_4:
; MCU-NEXT: movb %al, (%edx)
@@ -981,7 +933,7 @@ define void @clamp_i8(i32 %src, i8* %dst) {
; reproducer for pr29002
define void @clamp(i32 %src, i16* %dst) {
; GENERIC-LABEL: clamp:
-; GENERIC: ## BB#0:
+; GENERIC: ## %bb.0:
; GENERIC-NEXT: cmpl $32767, %edi ## imm = 0x7FFF
; GENERIC-NEXT: movl $32767, %eax ## imm = 0x7FFF
; GENERIC-NEXT: cmovlel %edi, %eax
@@ -992,7 +944,7 @@ define void @clamp(i32 %src, i16* %dst) {
; GENERIC-NEXT: retq
;
; ATOM-LABEL: clamp:
-; ATOM: ## BB#0:
+; ATOM: ## %bb.0:
; ATOM-NEXT: cmpl $32767, %edi ## imm = 0x7FFF
; ATOM-NEXT: movl $32767, %eax ## imm = 0x7FFF
; ATOM-NEXT: cmovlel %edi, %eax
@@ -1003,17 +955,17 @@ define void @clamp(i32 %src, i16* %dst) {
; ATOM-NEXT: retq
;
; MCU-LABEL: clamp:
-; MCU: # BB#0:
+; MCU: # %bb.0:
; MCU-NEXT: cmpl $32767, %eax # imm = 0x7FFF
; MCU-NEXT: movl $32767, %ecx # imm = 0x7FFF
; MCU-NEXT: jg .LBB23_2
-; MCU-NEXT: # BB#1:
+; MCU-NEXT: # %bb.1:
; MCU-NEXT: movl %eax, %ecx
; MCU-NEXT: .LBB23_2:
; MCU-NEXT: cmpl $-32768, %ecx # imm = 0x8000
; MCU-NEXT: movw $-32768, %ax # imm = 0x8000
; MCU-NEXT: jl .LBB23_4
-; MCU-NEXT: # BB#3:
+; MCU-NEXT: # %bb.3:
; MCU-NEXT: movl %ecx, %eax
; MCU-NEXT: .LBB23_4:
; MCU-NEXT: movw %ax, (%edx)
@@ -1035,7 +987,7 @@ define void @test19() {
; that code path, it can be deleted.
;
; CHECK-LABEL: test19:
-; CHECK: ## BB#0: ## %BB
+; CHECK: ## %bb.0: ## %BB
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: movb $1, %cl
; CHECK-NEXT: .p2align 4, 0x90
@@ -1043,7 +995,7 @@ define void @test19() {
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne LBB24_1
-; CHECK-NEXT: ## BB#2: ## %CF250
+; CHECK-NEXT: ## %bb.2: ## %CF250
; CHECK-NEXT: ## in Loop: Header=BB24_1 Depth=1
; CHECK-NEXT: jne LBB24_1
; CHECK-NEXT: .p2align 4, 0x90
@@ -1052,11 +1004,11 @@ define void @test19() {
; CHECK-NEXT: cmpl %eax, %eax
; CHECK-NEXT: ucomiss %xmm0, %xmm0
; CHECK-NEXT: jp LBB24_3
-; CHECK-NEXT: ## BB#4: ## %CF244
+; CHECK-NEXT: ## %bb.4: ## %CF244
; CHECK-NEXT: retq
;
; MCU-LABEL: test19:
-; MCU: # BB#0: # %BB
+; MCU: # %bb.0: # %BB
; MCU-NEXT: movl $-1, %ecx
; MCU-NEXT: movb $1, %al
; MCU-NEXT: .p2align 4, 0x90
@@ -1064,10 +1016,10 @@ define void @test19() {
; MCU-NEXT: # =>This Inner Loop Header: Depth=1
; MCU-NEXT: testb %al, %al
; MCU-NEXT: jne .LBB24_1
-; MCU-NEXT: # BB#2: # %CF250
+; MCU-NEXT: # %bb.2: # %CF250
; MCU-NEXT: # in Loop: Header=BB24_1 Depth=1
; MCU-NEXT: jne .LBB24_1
-; MCU-NEXT: # BB#3: # %CF242.preheader
+; MCU-NEXT: # %bb.3: # %CF242.preheader
; MCU-NEXT: fldz
; MCU-NEXT: .p2align 4, 0x90
; MCU-NEXT: .LBB24_4: # %CF242
@@ -1075,10 +1027,10 @@ define void @test19() {
; MCU-NEXT: cmpl %eax, %ecx
; MCU-NEXT: fucom %st(0)
; MCU-NEXT: fnstsw %ax
-; MCU-NEXT: # kill: %AH<def> %AH<kill> %AX<kill>
+; MCU-NEXT: # kill: def %ah killed %ah killed %ax
; MCU-NEXT: sahf
; MCU-NEXT: jp .LBB24_4
-; MCU-NEXT: # BB#5: # %CF244
+; MCU-NEXT: # %bb.5: # %CF244
; MCU-NEXT: fstp %st(0)
; MCU-NEXT: retl
BB:
@@ -1107,7 +1059,7 @@ CF244:
define i16 @select_xor_1(i16 %A, i8 %cond) {
; CHECK-LABEL: select_xor_1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: xorl $43, %eax
; CHECK-NEXT: testb $1, %sil
@@ -1116,12 +1068,12 @@ define i16 @select_xor_1(i16 %A, i8 %cond) {
; CHECK-NEXT: retq
;
; MCU-LABEL: select_xor_1:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: andl $1, %edx
; MCU-NEXT: negl %edx
; MCU-NEXT: andl $43, %edx
; MCU-NEXT: xorl %edx, %eax
-; MCU-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; MCU-NEXT: # kill: def %ax killed %ax killed %eax
; MCU-NEXT: retl
entry:
%and = and i8 %cond, 1
@@ -1133,7 +1085,7 @@ entry:
define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) {
; CHECK-LABEL: select_xor_2:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: xorl %edi, %esi
; CHECK-NEXT: testb $1, %dl
; CHECK-NEXT: cmovel %edi, %esi
@@ -1141,7 +1093,7 @@ define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) {
; CHECK-NEXT: retq
;
; MCU-LABEL: select_xor_2:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: andl $1, %ecx
; MCU-NEXT: negl %ecx
; MCU-NEXT: andl %edx, %ecx
@@ -1157,7 +1109,7 @@ entry:
define i32 @select_or(i32 %A, i32 %B, i8 %cond) {
; CHECK-LABEL: select_or:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: orl %edi, %esi
; CHECK-NEXT: testb $1, %dl
; CHECK-NEXT: cmovel %edi, %esi
@@ -1165,7 +1117,7 @@ define i32 @select_or(i32 %A, i32 %B, i8 %cond) {
; CHECK-NEXT: retq
;
; MCU-LABEL: select_or:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: andl $1, %ecx
; MCU-NEXT: negl %ecx
; MCU-NEXT: andl %edx, %ecx
@@ -1181,7 +1133,7 @@ entry:
define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) {
; CHECK-LABEL: select_or_1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: orl %edi, %esi
; CHECK-NEXT: testb $1, %dl
; CHECK-NEXT: cmovel %edi, %esi
@@ -1189,7 +1141,7 @@ define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) {
; CHECK-NEXT: retq
;
; MCU-LABEL: select_or_1:
-; MCU: # BB#0: # %entry
+; MCU: # %bb.0: # %entry
; MCU-NEXT: andl $1, %ecx
; MCU-NEXT: negl %ecx
; MCU-NEXT: andl %edx, %ecx
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index 0eb9bf46ffd1..d78f94db71ab 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -8,7 +8,7 @@
define i32 @select_0_or_1(i1 %cond) {
; CHECK-LABEL: select_0_or_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: notb %dil
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: andl $1, %eax
@@ -19,7 +19,7 @@ define i32 @select_0_or_1(i1 %cond) {
define i32 @select_0_or_1_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_0_or_1_zeroext:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorb $1, %dil
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: retq
@@ -29,7 +29,7 @@ define i32 @select_0_or_1_zeroext(i1 zeroext %cond) {
define i32 @select_0_or_1_signext(i1 signext %cond) {
; CHECK-LABEL: select_0_or_1_signext:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: notb %dil
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: andl $1, %eax
@@ -42,7 +42,7 @@ define i32 @select_0_or_1_signext(i1 signext %cond) {
define i32 @select_1_or_0(i1 %cond) {
; CHECK-LABEL: select_1_or_0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
@@ -52,8 +52,8 @@ define i32 @select_1_or_0(i1 %cond) {
define i32 @select_1_or_0_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_1_or_0_zeroext:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 1, i32 0
ret i32 %sel
@@ -61,9 +61,9 @@ define i32 @select_1_or_0_zeroext(i1 zeroext %cond) {
define i32 @select_1_or_0_signext(i1 signext %cond) {
; CHECK-LABEL: select_1_or_0_signext:
-; CHECK: # BB#0:
-; CHECK-NEXT: andb $1, %dil
-; CHECK-NEXT: movzbl %dil, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 1, i32 0
ret i32 %sel
@@ -73,8 +73,8 @@ define i32 @select_1_or_0_signext(i1 signext %cond) {
define i32 @select_0_or_neg1(i1 %cond) {
; CHECK-LABEL: select_0_or_neg1:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: leal -1(%rdi), %eax
; CHECK-NEXT: retq
@@ -84,9 +84,9 @@ define i32 @select_0_or_neg1(i1 %cond) {
define i32 @select_0_or_neg1_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_0_or_neg1_zeroext:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: decl %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal -1(%rdi), %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 0, i32 -1
ret i32 %sel
@@ -94,10 +94,9 @@ define i32 @select_0_or_neg1_zeroext(i1 zeroext %cond) {
define i32 @select_0_or_neg1_signext(i1 signext %cond) {
; CHECK-LABEL: select_0_or_neg1_signext:
-; CHECK: # BB#0:
-; CHECK-NEXT: andb $1, %dil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: decl %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: notl %edi
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 0, i32 -1
ret i32 %sel
@@ -107,7 +106,7 @@ define i32 @select_0_or_neg1_signext(i1 signext %cond) {
define i32 @select_neg1_or_0(i1 %cond) {
; CHECK-LABEL: select_neg1_or_0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: negl %edi
; CHECK-NEXT: movl %edi, %eax
@@ -118,9 +117,9 @@ define i32 @select_neg1_or_0(i1 %cond) {
define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_neg1_or_0_zeroext:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: negl %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: negl %edi
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 -1, i32 0
ret i32 %sel
@@ -128,8 +127,8 @@ define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) {
define i32 @select_neg1_or_0_signext(i1 signext %cond) {
; CHECK-LABEL: select_neg1_or_0_signext:
-; CHECK: # BB#0:
-; CHECK-NEXT: movsbl %dil, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 -1, i32 0
ret i32 %sel
@@ -139,8 +138,8 @@ define i32 @select_neg1_or_0_signext(i1 signext %cond) {
define i32 @select_Cplus1_C(i1 %cond) {
; CHECK-LABEL: select_Cplus1_C:
-; CHECK: # BB#0:
-; CHECK-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: leal 41(%rdi), %eax
; CHECK-NEXT: retq
@@ -150,9 +149,9 @@ define i32 @select_Cplus1_C(i1 %cond) {
define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_Cplus1_C_zeroext:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: addl $41, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal 41(%rdi), %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 42, i32 41
ret i32 %sel
@@ -160,10 +159,9 @@ define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
define i32 @select_Cplus1_C_signext(i1 signext %cond) {
; CHECK-LABEL: select_Cplus1_C_signext:
-; CHECK: # BB#0:
-; CHECK-NEXT: andb $1, %dil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: addl $41, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl $41, %eax
+; CHECK-NEXT: subl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 42, i32 41
ret i32 %sel
@@ -173,7 +171,7 @@ define i32 @select_Cplus1_C_signext(i1 signext %cond) {
define i32 @select_C_Cplus1(i1 %cond) {
; CHECK-LABEL: select_C_Cplus1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl $42, %eax
; CHECK-NEXT: subl %edi, %eax
@@ -184,10 +182,9 @@ define i32 @select_C_Cplus1(i1 %cond) {
define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_C_Cplus1_zeroext:
-; CHECK: # BB#0:
-; CHECK-NEXT: movzbl %dil, %ecx
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $42, %eax
-; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: subl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 41, i32 42
ret i32 %sel
@@ -195,11 +192,10 @@ define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
define i32 @select_C_Cplus1_signext(i1 signext %cond) {
; CHECK-LABEL: select_C_Cplus1_signext:
-; CHECK: # BB#0:
-; CHECK-NEXT: andb $1, %dil
-; CHECK-NEXT: movzbl %dil, %ecx
+; CHECK: # %bb.0:
+; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: movl $42, %eax
-; CHECK-NEXT: subl %ecx, %eax
+; CHECK-NEXT: subl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 41, i32 42
ret i32 %sel
@@ -210,11 +206,10 @@ define i32 @select_C_Cplus1_signext(i1 signext %cond) {
define i32 @select_lea_2(i1 zeroext %cond) {
; CHECK-LABEL: select_lea_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movl $-1, %ecx
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorb $1, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: leal -1(%rax,%rax), %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 -1, i32 1
ret i32 %sel
@@ -222,11 +217,10 @@ define i32 @select_lea_2(i1 zeroext %cond) {
define i64 @select_lea_3(i1 zeroext %cond) {
; CHECK-LABEL: select_lea_3:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movl $1, %ecx
-; CHECK-NEXT: movq $-2, %rax
-; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorb $1, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: leaq -2(%rax,%rax,2), %rax
; CHECK-NEXT: retq
%sel = select i1 %cond, i64 -2, i64 1
ret i64 %sel
@@ -234,11 +228,10 @@ define i64 @select_lea_3(i1 zeroext %cond) {
define i32 @select_lea_5(i1 zeroext %cond) {
; CHECK-LABEL: select_lea_5:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movl $-2, %ecx
-; CHECK-NEXT: movl $3, %eax
-; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorb $1, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: leal -2(%rax,%rax,4), %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 -2, i32 3
ret i32 %sel
@@ -246,29 +239,99 @@ define i32 @select_lea_5(i1 zeroext %cond) {
define i64 @select_lea_9(i1 zeroext %cond) {
; CHECK-LABEL: select_lea_9:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movl $2, %ecx
-; CHECK-NEXT: movq $-7, %rax
-; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorb $1, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: leaq -7(%rax,%rax,8), %rax
; CHECK-NEXT: retq
%sel = select i1 %cond, i64 -7, i64 2
ret i64 %sel
}
+; Should this be 'sbb x,x' or 'sbb 0,x' with simpler LEA or add?
+
+define i64 @sel_1_2(i64 %x, i64 %y) {
+; CHECK-LABEL: sel_1_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpq $42, %rdi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: leaq 2(%rsi), %rax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i64 %x, 42
+ %sel = select i1 %cmp, i64 1, i64 2
+ %sub = add i64 %sel, %y
+ ret i64 %sub
+}
+
+; No LEA with 8-bit, but this shouldn't need branches or cmov.
+
+define i8 @sel_1_neg1(i32 %x) {
+; CHECK-LABEL: sel_1_neg1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpl $42, %edi
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: shlb $2, %al
+; CHECK-NEXT: decb %al
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 42
+ %sel = select i1 %cmp, i8 3, i8 -1
+ ret i8 %sel
+}
+
+; We get an LEA for 16-bit because we ignore the high-bits.
+
+define i16 @sel_neg1_1(i32 %x) {
+; CHECK-LABEL: sel_neg1_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $43, %edi
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: leal -1(,%rax,4), %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 42
+ %sel = select i1 %cmp, i16 -1, i16 3
+ ret i16 %sel
+}
+
+; If the comparison is available, the predicate can be inverted.
+
+define i32 @sel_1_neg1_32(i32 %x) {
+; CHECK-LABEL: sel_1_neg1_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $42, %edi
+; CHECK-NEXT: setg %al
+; CHECK-NEXT: leal -1(%rax,%rax,8), %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 42
+ %sel = select i1 %cmp, i32 8, i32 -1
+ ret i32 %sel
+}
+
+define i32 @sel_neg1_1_32(i32 %x) {
+; CHECK-LABEL: sel_neg1_1_32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $43, %edi
+; CHECK-NEXT: setl %al
+; CHECK-NEXT: leal -7(%rax,%rax,8), %eax
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 42
+ %sel = select i1 %cmp, i32 -7, i32 2
+ ret i32 %sel
+}
+
; If the constants differ by a large power-of-2, that can be a shift of the difference plus the smaller constant.
; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2
define i8 @select_pow2_diff(i1 zeroext %cond) {
; CHECK-LABEL: select_pow2_diff:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movb $19, %al
-; CHECK-NEXT: jne .LBB22_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: movb $3, %al
-; CHECK-NEXT: .LBB22_2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: shlb $4, %dil
+; CHECK-NEXT: orb $3, %dil
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i8 19, i8 3
ret i8 %sel
@@ -276,11 +339,12 @@ define i8 @select_pow2_diff(i1 zeroext %cond) {
define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
; CHECK-LABEL: select_pow2_diff_invert:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movw $7, %cx
-; CHECK-NEXT: movw $71, %ax
-; CHECK-NEXT: cmovnew %cx, %ax
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorb $1, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: shll $6, %eax
+; CHECK-NEXT: orl $7, %eax
+; CHECK-NEXT: # kill: def %ax killed %ax killed %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i16 7, i16 71
ret i16 %sel
@@ -288,11 +352,10 @@ define i16 @select_pow2_diff_invert(i1 zeroext %cond) {
define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
; CHECK-LABEL: select_pow2_diff_neg:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movl $-9, %ecx
-; CHECK-NEXT: movl $-25, %eax
-; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK: # %bb.0:
+; CHECK-NEXT: shlb $4, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: orl $-25, %eax
; CHECK-NEXT: retq
%sel = select i1 %cond, i32 -9, i32 -25
ret i32 %sel
@@ -300,22 +363,40 @@ define i32 @select_pow2_diff_neg(i1 zeroext %cond) {
define i64 @select_pow2_diff_neg_invert(i1 zeroext %cond) {
; CHECK-LABEL: select_pow2_diff_neg_invert:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
-; CHECK-NEXT: movl $29, %ecx
-; CHECK-NEXT: movq $-99, %rax
-; CHECK-NEXT: cmoveq %rcx, %rax
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorb $1, %dil
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: shlq $7, %rax
+; CHECK-NEXT: addq $-99, %rax
; CHECK-NEXT: retq
%sel = select i1 %cond, i64 -99, i64 29
ret i64 %sel
}
+; This doesn't need a branch, but don't do the wrong thing if subtraction of the constants overflows.
+
+define i8 @sel_67_neg125(i32 %x) {
+; CHECK-LABEL: sel_67_neg125:
+; CHECK: # %bb.0:
+; CHECK-NEXT: cmpl $42, %edi
+; CHECK-NEXT: movb $67, %al
+; CHECK-NEXT: jg .LBB31_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: movb $-125, %al
+; CHECK-NEXT: .LBB31_2:
+; CHECK-NEXT: retq
+ %cmp = icmp sgt i32 %x, 42
+ %sel = select i1 %cmp, i8 67, i8 -125
+ ret i8 %sel
+}
+
+
; In general, select of 2 constants could be:
; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2
define i32 @select_C1_C2(i1 %cond) {
; CHECK-LABEL: select_C1_C2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: movl $421, %ecx # imm = 0x1A5
; CHECK-NEXT: movl $42, %eax
@@ -327,8 +408,8 @@ define i32 @select_C1_C2(i1 %cond) {
define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
; CHECK-LABEL: select_C1_C2_zeroext:
-; CHECK: # BB#0:
-; CHECK-NEXT: testb %dil, %dil
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: movl $421, %ecx # imm = 0x1A5
; CHECK-NEXT: movl $42, %eax
; CHECK-NEXT: cmovnel %ecx, %eax
@@ -339,7 +420,7 @@ define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
define i32 @select_C1_C2_signext(i1 signext %cond) {
; CHECK-LABEL: select_C1_C2_signext:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: movl $421, %ecx # imm = 0x1A5
; CHECK-NEXT: movl $42, %eax
@@ -353,7 +434,7 @@ define i32 @select_C1_C2_signext(i1 signext %cond) {
define i64 @select_2_or_inc(i64 %x) {
; CHECK-LABEL: select_2_or_inc:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: leaq 1(%rdi), %rax
; CHECK-NEXT: cmpq $2, %rdi
; CHECK-NEXT: cmoveq %rdi, %rax
@@ -366,13 +447,13 @@ define i64 @select_2_or_inc(i64 %x) {
define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
; CHECK-LABEL: sel_constants_add_constant_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB30_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: jne .LBB36_1
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12,13,14,15]
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB30_1:
+; CHECK-NEXT: .LBB36_1:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4294967293,14,4,4]
; CHECK-NEXT: retq
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 12, i32 1, i32 0>, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
@@ -382,13 +463,13 @@ define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
; CHECK-LABEL: sel_constants_fmul_constant_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne .LBB31_1
-; CHECK-NEXT: # BB#2:
+; CHECK-NEXT: jne .LBB37_1
+; CHECK-NEXT: # %bb.2:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.188300e+02,3.454000e+01]
; CHECK-NEXT: retq
-; CHECK-NEXT: .LBB31_1:
+; CHECK-NEXT: .LBB37_1:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [-2.040000e+01,3.768000e+01]
; CHECK-NEXT: retq
%sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0>
@@ -401,7 +482,7 @@ define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
define i64 @opaque_constant(i1 %cond, i64 %x) {
; CHECK-LABEL: opaque_constant:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: movl $23, %ecx
; CHECK-NEXT: movq $-4, %rax
diff --git a/test/CodeGen/X86/setcc-combine.ll b/test/CodeGen/X86/setcc-combine.ll
index 38205c660731..a4a8e67d742c 100644
--- a/test/CodeGen/X86/setcc-combine.ll
+++ b/test/CodeGen/X86/setcc-combine.ll
@@ -3,7 +3,7 @@
define i32 @test_eq_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_eq_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm0
@@ -20,7 +20,7 @@ define i32 @test_eq_1(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_ne_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ne_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
@@ -35,7 +35,7 @@ define i32 @test_ne_1(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_le_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_le_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
@@ -48,7 +48,7 @@ define i32 @test_le_1(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_ge_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ge_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm0
@@ -65,7 +65,7 @@ define i32 @test_ge_1(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_lt_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_lt_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
@@ -80,7 +80,7 @@ define i32 @test_lt_1(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_gt_1(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_gt_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %A, %B
@@ -93,7 +93,7 @@ define i32 @test_gt_1(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_eq_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_eq_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
@@ -110,7 +110,7 @@ define i32 @test_eq_2(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_ne_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ne_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
@@ -125,7 +125,7 @@ define i32 @test_ne_2(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_le_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_le_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
@@ -142,7 +142,7 @@ define i32 @test_le_2(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_ge_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_ge_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl $-1, %eax
; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %B, %A
@@ -155,7 +155,7 @@ define i32 @test_ge_2(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_lt_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_lt_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
@@ -170,7 +170,7 @@ define i32 @test_lt_2(<4 x i32> %A, <4 x i32> %B) {
define i32 @test_gt_2(<4 x i32> %A, <4 x i32> %B) {
; CHECK-LABEL: test_gt_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-NEXT: movd %xmm0, %eax
diff --git a/test/CodeGen/X86/setcc-logic.ll b/test/CodeGen/X86/setcc-logic.ll
index 4d1e5ba16540..9933b9cffc51 100644
--- a/test/CodeGen/X86/setcc-logic.ll
+++ b/test/CodeGen/X86/setcc-logic.ll
@@ -3,7 +3,7 @@
define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_bits_clear:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: orl %esi, %edi
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
@@ -15,7 +15,7 @@ define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) nounwind {
define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_sign_bits_clear:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: orl %esi, %edi
; CHECK-NEXT: setns %al
; CHECK-NEXT: retq
@@ -27,7 +27,7 @@ define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) nounwind {
define zeroext i1 @all_bits_set(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_bits_set:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl %esi, %edi
; CHECK-NEXT: cmpl $-1, %edi
; CHECK-NEXT: sete %al
@@ -40,7 +40,7 @@ define zeroext i1 @all_bits_set(i32 %P, i32 %Q) nounwind {
define zeroext i1 @all_sign_bits_set(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_sign_bits_set:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl %esi, %edi
; CHECK-NEXT: shrl $31, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -53,7 +53,7 @@ define zeroext i1 @all_sign_bits_set(i32 %P, i32 %Q) nounwind {
define zeroext i1 @any_bits_set(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_bits_set:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: orl %esi, %edi
; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
@@ -65,7 +65,7 @@ define zeroext i1 @any_bits_set(i32 %P, i32 %Q) nounwind {
define zeroext i1 @any_sign_bits_set(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_sign_bits_set:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: orl %esi, %edi
; CHECK-NEXT: shrl $31, %edi
; CHECK-NEXT: movl %edi, %eax
@@ -78,7 +78,7 @@ define zeroext i1 @any_sign_bits_set(i32 %P, i32 %Q) nounwind {
define zeroext i1 @any_bits_clear(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_bits_clear:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl %esi, %edi
; CHECK-NEXT: cmpl $-1, %edi
; CHECK-NEXT: setne %al
@@ -91,7 +91,7 @@ define zeroext i1 @any_bits_clear(i32 %P, i32 %Q) nounwind {
define zeroext i1 @any_sign_bits_clear(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_sign_bits_clear:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testl %esi, %edi
; CHECK-NEXT: setns %al
; CHECK-NEXT: retq
@@ -104,10 +104,10 @@ define zeroext i1 @any_sign_bits_clear(i32 %P, i32 %Q) nounwind {
; PR3351 - (P == 0) & (Q == 0) -> (P|Q) == 0
define i32 @all_bits_clear_branch(i32* %P, i32* %Q) nounwind {
; CHECK-LABEL: all_bits_clear_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: orq %rsi, %rdi
; CHECK-NEXT: jne .LBB8_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB8_2: # %return
@@ -128,13 +128,13 @@ return:
define i32 @all_sign_bits_clear_branch(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_sign_bits_clear_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: js .LBB9_3
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: js .LBB9_3
-; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: # %bb.2: # %bb1
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB9_3: # %return
@@ -155,13 +155,13 @@ return:
define i32 @all_bits_set_branch(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_bits_set_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpl $-1, %edi
; CHECK-NEXT: jne .LBB10_3
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: cmpl $-1, %esi
; CHECK-NEXT: jne .LBB10_3
-; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: # %bb.2: # %bb1
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB10_3: # %return
@@ -182,13 +182,13 @@ return:
define i32 @all_sign_bits_set_branch(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: all_sign_bits_set_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: jns .LBB11_3
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: jns .LBB11_3
-; CHECK-NEXT: # BB#2: # %bb1
+; CHECK-NEXT: # %bb.2: # %bb1
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB11_3: # %return
@@ -210,10 +210,10 @@ return:
; PR3351 - (P != 0) | (Q != 0) -> (P|Q) != 0
define i32 @any_bits_set_branch(i32* %P, i32* %Q) nounwind {
; CHECK-LABEL: any_bits_set_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: orq %rsi, %rdi
; CHECK-NEXT: je .LBB12_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $4, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB12_2: # %return
@@ -234,13 +234,13 @@ return:
define i32 @any_sign_bits_set_branch(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_sign_bits_set_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: js .LBB13_2
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: js .LBB13_2
-; CHECK-NEXT: # BB#3: # %return
+; CHECK-NEXT: # %bb.3: # %return
; CHECK-NEXT: movl $192, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB13_2: # %bb1
@@ -261,13 +261,13 @@ return:
define i32 @any_bits_clear_branch(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_bits_clear_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpl $-1, %edi
; CHECK-NEXT: jne .LBB14_2
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: cmpl $-1, %esi
; CHECK-NEXT: jne .LBB14_2
-; CHECK-NEXT: # BB#3: # %return
+; CHECK-NEXT: # %bb.3: # %return
; CHECK-NEXT: movl $192, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB14_2: # %bb1
@@ -288,13 +288,13 @@ return:
define i32 @any_sign_bits_clear_branch(i32 %P, i32 %Q) nounwind {
; CHECK-LABEL: any_sign_bits_clear_branch:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: jns .LBB15_2
-; CHECK-NEXT: # BB#1: # %entry
+; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: jns .LBB15_2
-; CHECK-NEXT: # BB#3: # %return
+; CHECK-NEXT: # %bb.3: # %return
; CHECK-NEXT: movl $192, %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB15_2: # %bb1
@@ -315,7 +315,7 @@ return:
define <4 x i1> @all_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: all_bits_clear_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
@@ -328,7 +328,7 @@ define <4 x i1> @all_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @all_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: all_sign_bits_clear_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
@@ -341,7 +341,7 @@ define <4 x i1> @all_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @all_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: all_bits_set_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
@@ -354,7 +354,7 @@ define <4 x i1> @all_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @all_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: all_sign_bits_set_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
@@ -368,7 +368,7 @@ define <4 x i1> @all_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @any_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: any_bits_set_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
@@ -383,7 +383,7 @@ define <4 x i1> @any_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @any_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: any_sign_bits_set_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
@@ -397,7 +397,7 @@ define <4 x i1> @any_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @any_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: any_bits_clear_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
@@ -411,7 +411,7 @@ define <4 x i1> @any_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define <4 x i1> @any_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
; CHECK-LABEL: any_sign_bits_clear_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
@@ -424,7 +424,7 @@ define <4 x i1> @any_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind {
; CHECK-LABEL: ne_neg1_and_ne_zero:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: incq %rdi
; CHECK-NEXT: cmpq $1, %rdi
; CHECK-NEXT: seta %al
@@ -439,7 +439,7 @@ define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind {
define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
; CHECK-LABEL: and_eq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %esi, %edi
; CHECK-NEXT: xorl %ecx, %edx
; CHECK-NEXT: orb %dl, %dil
@@ -453,7 +453,7 @@ define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
define zeroext i1 @or_ne(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
; CHECK-LABEL: or_ne:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorl %esi, %edi
; CHECK-NEXT: xorl %ecx, %edx
; CHECK-NEXT: orb %dl, %dil
@@ -469,7 +469,7 @@ define zeroext i1 @or_ne(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
define <4 x i1> @and_eq_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
; CHECK-LABEL: and_eq_vec:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm3, %xmm2
; CHECK-NEXT: pand %xmm2, %xmm0
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 2628f824ea40..e0390da3069f 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -8,13 +8,13 @@
define <8 x i16> @pr25080(<8 x i32> %a) {
; AVX-LABEL: pr25080:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
@@ -22,16 +22,16 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
; AVX-NEXT: retq
;
; KNL-32-LABEL: pr25080:
-; KNL-32: # BB#0: # %entry
-; KNL-32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1
-; KNL-32-NEXT: vpand %ymm1, %ymm0, %ymm0
-; KNL-32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL-32-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-32: # %bb.0: # %entry
+; KNL-32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
+; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
; KNL-32-NEXT: movb $15, %al
; KNL-32-NEXT: kmovw %eax, %k1
; KNL-32-NEXT: korw %k1, %k0, %k1
-; KNL-32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-32-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-32-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; KNL-32-NEXT: retl
entry:
%0 = trunc <8 x i32> %a to <8 x i23>
@@ -43,7 +43,7 @@ entry:
define void @pr26232(i64 %a, <16 x i1> %b) {
; AVX-LABEL: pr26232:
-; AVX: # BB#0: # %for_loop599.preheader
+; AVX: # %bb.0: # %for_loop599.preheader
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX-NEXT: .p2align 4, 0x90
@@ -57,19 +57,16 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
; AVX-NEXT: vpand %xmm0, %xmm3, %xmm3
; AVX-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX-NEXT: vpand %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm3
; AVX-NEXT: vpmovmskb %xmm3, %eax
; AVX-NEXT: testw %ax, %ax
; AVX-NEXT: jne .LBB1_1
-; AVX-NEXT: # BB#2: # %for_exit600
+; AVX-NEXT: # %bb.2: # %for_exit600
; AVX-NEXT: retq
;
; KNL-32-LABEL: pr26232:
-; KNL-32: # BB#0: # %for_loop599.preheader
+; KNL-32: # %bb.0: # %for_loop599.preheader
; KNL-32-NEXT: pushl %esi
-; KNL-32-NEXT: .Lcfi0:
; KNL-32-NEXT: .cfi_def_cfa_offset 8
-; KNL-32-NEXT: .Lcfi1:
; KNL-32-NEXT: .cfi_offset %esi, -8
; KNL-32-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-32-NEXT: vpslld $31, %zmm0, %zmm0
@@ -90,7 +87,7 @@ define void @pr26232(i64 %a, <16 x i1> %b) {
; KNL-32-NEXT: kmovw %k1, %esi
; KNL-32-NEXT: testw %si, %si
; KNL-32-NEXT: jne .LBB1_1
-; KNL-32-NEXT: # BB#2: # %for_exit600
+; KNL-32-NEXT: # %bb.2: # %for_exit600
; KNL-32-NEXT: popl %esi
; KNL-32-NEXT: retl
allocas:
diff --git a/test/CodeGen/X86/setcc-narrowing.ll b/test/CodeGen/X86/setcc-narrowing.ll
index a4259ddd2318..52f143f8b323 100644
--- a/test/CodeGen/X86/setcc-narrowing.ll
+++ b/test/CodeGen/X86/setcc-narrowing.ll
@@ -1,15 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s
; PR17338
@t1.global = internal global i64 -1, align 8
define i32 @t1() nounwind ssp {
-entry:
; CHECK-LABEL: t1:
-; CHECK: xorl %eax, %eax
-; CHECK-NEXT: cmpl $0, _t1.global
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: ret
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: cmpl $0, _t1.global
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retl
+; CHECK-NEXT: ## -- End function
+entry:
%0 = load i64, i64* @t1.global, align 8
%and = and i64 4294967295, %0
%cmp = icmp sgt i64 %and, 0
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
index 332bf2887fb0..f935db72dcb9 100644
--- a/test/CodeGen/X86/setcc-wide-types.ll
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -6,7 +6,7 @@
define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-LABEL: ne_i128:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
@@ -15,7 +15,7 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-NEXT: retq
;
; AVX2-LABEL: ne_i128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %ecx
; AVX2-NEXT: xorl %eax, %eax
@@ -31,7 +31,7 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-LABEL: eq_i128:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
@@ -40,7 +40,7 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE2-NEXT: retq
;
; AVX2-LABEL: eq_i128:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %ecx
; AVX2-NEXT: xorl %eax, %eax
@@ -56,7 +56,7 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: ne_i256:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm4, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
@@ -81,7 +81,7 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: retq
;
; AVX2-LABEL: ne_i256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %ecx
; AVX2-NEXT: xorl %eax, %eax
@@ -98,7 +98,7 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: eq_i256:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm4, %rax
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
@@ -123,7 +123,7 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
; SSE2-NEXT: retq
;
; AVX2-LABEL: eq_i256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %ecx
; AVX2-NEXT: xorl %eax, %eax
diff --git a/test/CodeGen/X86/setcc.ll b/test/CodeGen/X86/setcc.ll
index fab4f4137251..a1d27d38fc5d 100644
--- a/test/CodeGen/X86/setcc.ll
+++ b/test/CodeGen/X86/setcc.ll
@@ -7,7 +7,7 @@
define zeroext i16 @t1(i16 zeroext %x) nounwind readnone ssp {
; CHECK-LABEL: t1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $26, %edi
; CHECK-NEXT: seta %al
@@ -20,7 +20,7 @@ define zeroext i16 @t1(i16 zeroext %x) nounwind readnone ssp {
define zeroext i16 @t2(i16 zeroext %x) nounwind readnone ssp {
; CHECK-LABEL: t2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $26, %edi
; CHECK-NEXT: setb %al
@@ -33,7 +33,7 @@ define zeroext i16 @t2(i16 zeroext %x) nounwind readnone ssp {
define i64 @t3(i64 %x) nounwind readnone ssp {
; CHECK-LABEL: t3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpq $18, %rdi
; CHECK-NEXT: setb %al
@@ -48,7 +48,7 @@ define i64 @t3(i64 %x) nounwind readnone ssp {
define i32 @t4(i32 %a) {
; CHECK-LABEL: t4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movq _v4@{{.*}}(%rip), %rax
; CHECK-NEXT: cmpl $1, (%rax)
; CHECK-NEXT: movw $1, %ax
@@ -67,7 +67,7 @@ define i32 @t4(i32 %a) {
define i8 @t5(i32 %a) #0 {
; CHECK-LABEL: t5:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: setns %al
; CHECK-NEXT: retq
@@ -79,7 +79,7 @@ define i8 @t5(i32 %a) #0 {
define zeroext i1 @t6(i32 %a) #0 {
; CHECK-LABEL: t6:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: setns %al
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/setoeq.ll b/test/CodeGen/X86/setoeq.ll
index aa2f0af55cc9..5c2f1d5c5da5 100644
--- a/test/CodeGen/X86/setoeq.ll
+++ b/test/CodeGen/X86/setoeq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
define zeroext i8 @t(double %x) nounwind readnone {
entry:
diff --git a/test/CodeGen/X86/setuge.ll b/test/CodeGen/X86/setuge.ll
index 4ca2f1871c0f..96187198ac6f 100644
--- a/test/CodeGen/X86/setuge.ll
+++ b/test/CodeGen/X86/setuge.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep set
+; RUN: llc < %s -mtriple=i686-- | not grep set
declare i1 @llvm.isunordered.f32(float, float)
diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll
index 8c92434db21a..bb8a4bcec8dc 100644
--- a/test/CodeGen/X86/sext-i1.ll
+++ b/test/CodeGen/X86/sext-i1.ll
@@ -7,13 +7,13 @@
define i32 @t1(i32 %x) nounwind readnone ssp {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpl $1, {{[0-9]+}}(%esp)
; X32-NEXT: sbbl %eax, %eax
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpl $1, %edi
; X64-NEXT: sbbl %eax, %eax
; X64-NEXT: retq
@@ -24,13 +24,13 @@ define i32 @t1(i32 %x) nounwind readnone ssp {
define i32 @t2(i32 %x) nounwind readnone ssp {
; X32-LABEL: t2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpl $1, {{[0-9]+}}(%esp)
; X32-NEXT: sbbl %eax, %eax
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpl $1, %edi
; X64-NEXT: sbbl %eax, %eax
; X64-NEXT: retq
@@ -41,7 +41,7 @@ define i32 @t2(i32 %x) nounwind readnone ssp {
define i32 @t3() nounwind readonly {
; X32-LABEL: t3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: cmpl $1, %eax
; X32-NEXT: sbbl %eax, %eax
; X32-NEXT: cmpl %eax, %eax
@@ -50,9 +50,11 @@ define i32 @t3() nounwind readonly {
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0: # %entry
-; X64-NEXT: cmpl $1, %eax
-; X64-NEXT: sbbq %rax, %rax
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: testl %eax, %eax
+; X64-NEXT: sete %al
+; X64-NEXT: negq %rax
; X64-NEXT: cmpq %rax, %rax
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
@@ -74,18 +76,16 @@ if.end:
define i32 @t4(i64 %x) nounwind readnone ssp {
; X32-LABEL: t4:
-; X32: # BB#0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl $-1, %eax
-; X32-NEXT: je .LBB3_2
-; X32-NEXT: # BB#1:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: xorl %eax, %eax
-; X32-NEXT: .LBB3_2:
+; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %al
+; X32-NEXT: negl %eax
; X32-NEXT: retl
;
; X64-LABEL: t4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpq $1, %rdi
; X64-NEXT: sbbl %eax, %eax
; X64-NEXT: retq
@@ -96,14 +96,14 @@ define i32 @t4(i64 %x) nounwind readnone ssp {
define i64 @t5(i32 %x) nounwind readnone ssp {
; X32-LABEL: t5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpl $1, {{[0-9]+}}(%esp)
; X32-NEXT: sbbl %eax, %eax
; X32-NEXT: movl %eax, %edx
; X32-NEXT: retl
;
; X64-LABEL: t5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpl $1, %edi
; X64-NEXT: sbbq %rax, %rax
; X64-NEXT: retq
@@ -116,15 +116,15 @@ define i64 @t5(i32 %x) nounwind readnone ssp {
define i32 @select_0_or_1s(i1 %cond) {
; X32-LABEL: select_0_or_1s:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $1, %eax
; X32-NEXT: decl %eax
; X32-NEXT: retl
;
; X64-LABEL: select_0_or_1s:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: andl $1, %edi
; X64-NEXT: leal -1(%rdi), %eax
; X64-NEXT: retq
@@ -137,15 +137,15 @@ define i32 @select_0_or_1s(i1 %cond) {
define i32 @select_0_or_1s_zeroext(i1 zeroext %cond) {
; X32-LABEL: select_0_or_1s_zeroext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: decl %eax
; X32-NEXT: retl
;
; X64-LABEL: select_0_or_1s_zeroext:
-; X64: # BB#0:
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: decl %eax
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: leal -1(%rdi), %eax
; X64-NEXT: retq
%not = xor i1 %cond, 1
%sext = sext i1 %not to i32
@@ -156,7 +156,7 @@ define i32 @select_0_or_1s_zeroext(i1 zeroext %cond) {
define i32 @select_0_or_1s_signext(i1 signext %cond) {
; X32-LABEL: select_0_or_1s_signext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: andb $1, %al
; X32-NEXT: movzbl %al, %eax
@@ -164,10 +164,9 @@ define i32 @select_0_or_1s_signext(i1 signext %cond) {
; X32-NEXT: retl
;
; X64-LABEL: select_0_or_1s_signext:
-; X64: # BB#0:
-; X64-NEXT: andb $1, %dil
-; X64-NEXT: movzbl %dil, %eax
-; X64-NEXT: decl %eax
+; X64: # %bb.0:
+; X64-NEXT: notl %edi
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
%not = xor i1 %cond, 1
%sext = sext i1 %not to i32
diff --git a/test/CodeGen/X86/sext-load.ll b/test/CodeGen/X86/sext-load.ll
index 2ea6e012192e..4300c8f3a713 100644
--- a/test/CodeGen/X86/sext-load.ll
+++ b/test/CodeGen/X86/sext-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; When doing sign extension, use the sext-load lowering to take advantage of
; x86's sign extension during loads.
diff --git a/test/CodeGen/X86/sext-setcc-self.ll b/test/CodeGen/X86/sext-setcc-self.ll
index 9cbd3d85b381..452b600ffb5e 100644
--- a/test/CodeGen/X86/sext-setcc-self.ll
+++ b/test/CodeGen/X86/sext-setcc-self.ll
@@ -3,7 +3,7 @@
define <4 x i32> @test_ueq(<4 x float> %in) {
; CHECK-LABEL: test_ueq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: retq
%t0 = fcmp ueq <4 x float> %in, %in
@@ -13,7 +13,7 @@ define <4 x i32> @test_ueq(<4 x float> %in) {
define <4 x i32> @test_uge(<4 x float> %in) {
; CHECK-LABEL: test_uge:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: retq
%t0 = fcmp uge <4 x float> %in, %in
@@ -23,7 +23,7 @@ define <4 x i32> @test_uge(<4 x float> %in) {
define <4 x i32> @test_ule(<4 x float> %in) {
; CHECK-LABEL: test_ule:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: retq
%t0 = fcmp ule <4 x float> %in, %in
@@ -33,7 +33,7 @@ define <4 x i32> @test_ule(<4 x float> %in) {
define <4 x i32> @test_one(<4 x float> %in) {
; CHECK-LABEL: test_one:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%t0 = fcmp one <4 x float> %in, %in
@@ -43,7 +43,7 @@ define <4 x i32> @test_one(<4 x float> %in) {
define <4 x i32> @test_ogt(<4 x float> %in) {
; CHECK-LABEL: test_ogt:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%t0 = fcmp ogt <4 x float> %in, %in
@@ -53,7 +53,7 @@ define <4 x i32> @test_ogt(<4 x float> %in) {
define <4 x i32> @test_olt(<4 x float> %in) {
; CHECK-LABEL: test_olt:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
%t0 = fcmp olt <4 x float> %in, %in
diff --git a/test/CodeGen/X86/sext-subreg.ll b/test/CodeGen/X86/sext-subreg.ll
index e0c8ff9b5e08..f96f5b2a7c94 100644
--- a/test/CodeGen/X86/sext-subreg.ll
+++ b/test/CodeGen/X86/sext-subreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; rdar://7529457
define i64 @t(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
diff --git a/test/CodeGen/X86/sha-schedule.ll b/test/CodeGen/X86/sha-schedule.ll
new file mode 100644
index 000000000000..138ff888b924
--- /dev/null
+++ b/test/CodeGen/X86/sha-schedule.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sha | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GOLDMONT
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=cannonlake | FileCheck %s --check-prefix=CHECK --check-prefix=CANNONLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+;
+; SHA1
+;
+
+define <4 x i32> @test_sha1msg1(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_sha1msg1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha1msg1:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [7:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha1msg1:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [10:1.00]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha1msg1:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sha1msg1 %xmm1, %xmm0 # sched: [2:1.00]
+; ZNVER1-NEXT: sha1msg1 (%rdi), %xmm0 # sched: [9:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a2
+ %2 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a0, <4 x i32> %a1)
+ %3 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %2, <4 x i32> %1)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha1msg1(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sha1msg2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_sha1msg2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha1msg2:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [7:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha1msg2:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [10:1.00]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha1msg2:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sha1msg2 %xmm1, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: sha1msg2 (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a2
+ %2 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a0, <4 x i32> %a1)
+ %3 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %2, <4 x i32> %1)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha1msg2(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sha1nexte(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_sha1nexte:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sha1nexte %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: sha1nexte (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha1nexte:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: sha1nexte %xmm1, %xmm0 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha1nexte (%rdi), %xmm0 # sched: [7:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha1nexte:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: sha1nexte %xmm1, %xmm0 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha1nexte (%rdi), %xmm0 # sched: [10:1.00]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha1nexte:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sha1nexte %xmm1, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: sha1nexte (%rdi), %xmm0 # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a2
+ %2 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a0, <4 x i32> %a1)
+ %3 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %2, <4 x i32> %1)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha1nexte(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sha1rnds4(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_sha1rnds4:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha1rnds4:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [7:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha1rnds4:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [10:1.00]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha1rnds4:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sha1rnds4 $3, %xmm1, %xmm0 # sched: [6:1.00]
+; ZNVER1-NEXT: sha1rnds4 $3, (%rdi), %xmm0 # sched: [13:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a2
+ %2 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a0, <4 x i32> %a1, i8 3)
+ %3 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %2, <4 x i32> %1, i8 3)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha1rnds4(<4 x i32>, <4 x i32>, i8)
+
+;
+; SHA256
+;
+
+define <4 x i32> @test_sha256msg1(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_sha256msg1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha256msg1:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [7:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha256msg1:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [10:1.00]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha256msg1:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sha256msg1 %xmm1, %xmm0 # sched: [2:1.00]
+; ZNVER1-NEXT: sha256msg1 (%rdi), %xmm0 # sched: [9:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a2
+ %2 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a0, <4 x i32> %a1)
+ %3 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %2, <4 x i32> %1)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sha256msg2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
+; GENERIC-LABEL: test_sha256msg2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha256msg2:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [7:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha256msg2:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [10:1.00]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha256msg2:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: sha256msg2 %xmm1, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: sha256msg2 (%rdi), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a2
+ %2 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a0, <4 x i32> %a1)
+ %3 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %2, <4 x i32> %1)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha256msg2(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> *%a3) {
+; GENERIC-LABEL: test_sha256rnds2:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
+; GENERIC-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [5:1.00]
+; GENERIC-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [9:1.00]
+; GENERIC-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; GOLDMONT-LABEL: test_sha256rnds2:
+; GOLDMONT: # %bb.0:
+; GOLDMONT-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
+; GOLDMONT-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GOLDMONT-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00]
+; GOLDMONT-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [7:1.00]
+; GOLDMONT-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
+; GOLDMONT-NEXT: retq # sched: [4:1.00]
+;
+; CANNONLAKE-LABEL: test_sha256rnds2:
+; CANNONLAKE: # %bb.0:
+; CANNONLAKE-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.33]
+; CANNONLAKE-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.33]
+; CANNONLAKE-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [5:1.00]
+; CANNONLAKE-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [10:1.00]
+; CANNONLAKE-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.33]
+; CANNONLAKE-NEXT: retq # sched: [7:1.00]
+;
+; ZNVER1-LABEL: test_sha256rnds2:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00]
+; ZNVER1-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [11:1.00]
+; ZNVER1-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <4 x i32>, <4 x i32>* %a3
+ %2 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %3 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %2, <4 x i32> %1, <4 x i32> %a2)
+ ret <4 x i32> %3
+}
+declare <4 x i32> @llvm.x86.sha256rnds2(<4 x i32>, <4 x i32>, <4 x i32>)
diff --git a/test/CodeGen/X86/sha.ll b/test/CodeGen/X86/sha.ll
index eb1966470491..cf428b2a7e84 100644
--- a/test/CodeGen/X86/sha.ll
+++ b/test/CodeGen/X86/sha.ll
@@ -84,9 +84,9 @@ entry:
%0 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
ret <4 x i32> %0
; CHECK: test_sha256rnds2rr
- ; CHECK: movaps %xmm0, [[XMM_TMP1:%xmm[1-9][0-9]?]]
+ ; CHECK: movaps %xmm0, [[xmm_TMP1:%xmm[1-9][0-9]?]]
; CHECK: movaps %xmm2, %xmm0
- ; CHECK: sha256rnds2 %xmm0, %xmm1, [[XMM_TMP1]]
+ ; CHECK: sha256rnds2 %xmm0, %xmm1, [[xmm_TMP1]]
}
define <4 x i32> @test_sha256rnds2rm(<4 x i32> %a, <4 x i32>* %b, <4 x i32> %c) nounwind uwtable {
@@ -95,9 +95,9 @@ entry:
%1 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %0, <4 x i32> %c)
ret <4 x i32> %1
; CHECK: test_sha256rnds2rm
- ; CHECK: movaps %xmm0, [[XMM_TMP2:%xmm[1-9][0-9]?]]
+ ; CHECK: movaps %xmm0, [[xmm_TMP2:%xmm[1-9][0-9]?]]
; CHECK: movaps %xmm1, %xmm0
- ; CHECK: sha256rnds2 %xmm0, (%rdi), [[XMM_TMP2]]
+ ; CHECK: sha256rnds2 %xmm0, (%rdi), [[xmm_TMP2]]
}
declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/shift-and.ll b/test/CodeGen/X86/shift-and.ll
index edd43a35ce56..1e448d39f772 100644
--- a/test/CodeGen/X86/shift-and.ll
+++ b/test/CodeGen/X86/shift-and.ll
@@ -1,14 +1,21 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s --check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
define i32 @t1(i32 %t, i32 %val) nounwind {
; X32-LABEL: t1:
-; X32-NOT: andl
-; X32: shll
-
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shll %cl, %eax
+; X32-NEXT: retl
+;
; X64-LABEL: t1:
-; X64-NOT: andl
-; X64: shll
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: shll %cl, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
%shamt = and i32 %t, 31
%res = shl i32 %val, %shamt
ret i32 %res
@@ -16,12 +23,18 @@ define i32 @t1(i32 %t, i32 %val) nounwind {
define i32 @t2(i32 %t, i32 %val) nounwind {
; X32-LABEL: t2:
-; X32-NOT: andl
-; X32: shll
-
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shll %cl, %eax
+; X32-NEXT: retl
+;
; X64-LABEL: t2:
-; X64-NOT: andl
-; X64: shll
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: shll %cl, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
%shamt = and i32 %t, 63
%res = shl i32 %val, %shamt
ret i32 %res
@@ -31,12 +44,16 @@ define i32 @t2(i32 %t, i32 %val) nounwind {
define void @t3(i16 %t) nounwind {
; X32-LABEL: t3:
-; X32-NOT: andl
-; X32: sarw
-
+; X32: # %bb.0:
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: sarw %cl, X
+; X32-NEXT: retl
+;
; X64-LABEL: t3:
-; X64-NOT: andl
-; X64: sarw
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: sarw %cl, {{.*}}(%rip)
+; X64-NEXT: retq
%shamt = and i16 %t, 31
%tmp = load i16, i16* @X
%tmp1 = ashr i16 %tmp, %shamt
@@ -45,34 +62,155 @@ define void @t3(i16 %t) nounwind {
}
define i64 @t4(i64 %t, i64 %val) nounwind {
+; X32-LABEL: t4:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: shrl %cl, %edx
+; X32-NEXT: shrdl %cl, %esi, %eax
+; X32-NEXT: testb $32, %cl
+; X32-NEXT: je .LBB3_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: .LBB3_2:
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
; X64-LABEL: t4:
-; X64-NOT: and
-; X64: shrq
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: shrq %cl, %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: retq
%shamt = and i64 %t, 63
%res = lshr i64 %val, %shamt
ret i64 %res
}
define i64 @t5(i64 %t, i64 %val) nounwind {
+; X32-LABEL: t5:
+; X32: # %bb.0:
+; X32-NEXT: pushl %esi
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: shrl %cl, %edx
+; X32-NEXT: shrdl %cl, %esi, %eax
+; X32-NEXT: testb $32, %cl
+; X32-NEXT: je .LBB4_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: movl %edx, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: .LBB4_2:
+; X32-NEXT: popl %esi
+; X32-NEXT: retl
+;
; X64-LABEL: t5:
-; X64-NOT: and
-; X64: shrq
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: shrq %cl, %rsi
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: retq
%shamt = and i64 %t, 191
%res = lshr i64 %val, %shamt
ret i64 %res
}
+define void @t5ptr(i64 %t, i64* %ptr) nounwind {
+; X32-LABEL: t5ptr:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl (%eax), %edx
+; X32-NEXT: movl 4(%eax), %edi
+; X32-NEXT: movl %edi, %esi
+; X32-NEXT: shrl %cl, %esi
+; X32-NEXT: shrdl %cl, %edi, %edx
+; X32-NEXT: testb $32, %cl
+; X32-NEXT: je .LBB5_2
+; X32-NEXT: # %bb.1:
+; X32-NEXT: movl %esi, %edx
+; X32-NEXT: xorl %esi, %esi
+; X32-NEXT: .LBB5_2:
+; X32-NEXT: movl %esi, 4(%eax)
+; X32-NEXT: movl %edx, (%eax)
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %edi
+; X32-NEXT: retl
+;
+; X64-LABEL: t5ptr:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %ecx
+; X64-NEXT: shrq %cl, (%rsi)
+; X64-NEXT: retq
+ %shamt = and i64 %t, 191
+ %tmp = load i64, i64* %ptr
+ %tmp1 = lshr i64 %tmp, %shamt
+ store i64 %tmp1, i64* %ptr
+ ret void
+}
+
; rdar://11866926
define i64 @t6(i64 %key, i64* nocapture %val) nounwind {
-entry:
+; X32-LABEL: t6:
+; X32: # %bb.0:
+; X32-NEXT: pushl %edi
+; X32-NEXT: pushl %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: shrdl $3, %eax, %esi
+; X32-NEXT: movl %eax, %edi
+; X32-NEXT: shrl $3, %edi
+; X32-NEXT: movl (%ecx), %eax
+; X32-NEXT: movl 4(%ecx), %edx
+; X32-NEXT: addl $-1, %eax
+; X32-NEXT: adcl $-1, %edx
+; X32-NEXT: andl %esi, %eax
+; X32-NEXT: andl %edi, %edx
+; X32-NEXT: popl %esi
+; X32-NEXT: popl %edi
+; X32-NEXT: retl
+;
; X64-LABEL: t6:
-; X64-NOT: movabsq
-; X64: decq
-; X64: andq
+; X64: # %bb.0:
+; X64-NEXT: shrq $3, %rdi
+; X64-NEXT: movq (%rsi), %rax
+; X64-NEXT: decq %rax
+; X64-NEXT: andq %rdi, %rax
+; X64-NEXT: retq
%shr = lshr i64 %key, 3
- %0 = load i64, i64* %val, align 8
- %sub = add i64 %0, 2305843009213693951
+ %1 = load i64, i64* %val, align 8
+ %sub = add i64 %1, 2305843009213693951
%and = and i64 %sub, %shr
ret i64 %and
}
+
+define i64 @big_mask_constant(i64 %x) nounwind {
+; X32-LABEL: big_mask_constant:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: andl $4, %eax
+; X32-NEXT: shll $25, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: retl
+;
+; X64-LABEL: big_mask_constant:
+; X64: # %bb.0:
+; X64-NEXT: shrq $7, %rdi
+; X64-NEXT: andl $134217728, %edi # imm = 0x8000000
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
+ %and = and i64 %x, 17179869184 ; 0x400000000
+ %sh = lshr i64 %and, 7
+ ret i64 %sh
+}
+
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
index fdeddffdfb0e..07e60e345c56 100644
--- a/test/CodeGen/X86/shift-bmi2.ll
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -1,179 +1,289 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s
define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: shl32:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
+; BMI2-NEXT: shlxl %eax, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: shl32:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlxl %esi, %edi, %eax
+; BMI264-NEXT: retq
%shl = shl i32 %x, %shamt
-; BMI2: shl32
-; BMI2: shlxl
-; BMI2: ret
-; BMI264: shl32
-; BMI264: shlxl
-; BMI264: ret
ret i32 %shl
}
define i32 @shl32i(i32 %x) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: shl32i:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: shll $5, %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: shl32i:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shll $5, %edi
+; BMI264-NEXT: movl %edi, %eax
+; BMI264-NEXT: retq
%shl = shl i32 %x, 5
-; BMI2: shl32i
-; BMI2-NOT: shlxl
-; BMI2: ret
-; BMI264: shl32i
-; BMI264-NOT: shlxl
-; BMI264: ret
ret i32 %shl
}
define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: shl32p:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT: shlxl %ecx, (%eax), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: shl32p:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlxl %esi, (%rdi), %eax
+; BMI264-NEXT: retq
%x = load i32, i32* %p
%shl = shl i32 %x, %shamt
-; BMI2: shl32p
-; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
-; BMI2: ret
-; BMI264: shl32p
-; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
-; BMI264: ret
ret i32 %shl
}
define i32 @shl32pi(i32* %p) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: shl32pi:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: movl (%eax), %eax
+; BMI2-NEXT: shll $5, %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: shl32pi:
+; BMI264: # %bb.0:
+; BMI264-NEXT: movl (%rdi), %eax
+; BMI264-NEXT: shll $5, %eax
+; BMI264-NEXT: retq
%x = load i32, i32* %p
%shl = shl i32 %x, 5
-; BMI2: shl32pi
-; BMI2-NOT: shlxl
-; BMI2: ret
-; BMI264: shl32pi
-; BMI264-NOT: shlxl
-; BMI264: ret
ret i32 %shl
}
define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: shl64:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlxq %rsi, %rdi, %rax
+; BMI264-NEXT: retq
%shl = shl i64 %x, %shamt
-; BMI264: shl64
-; BMI264: shlxq
-; BMI264: ret
ret i64 %shl
}
define i64 @shl64i(i64 %x) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: shl64i:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlq $7, %rdi
+; BMI264-NEXT: movq %rdi, %rax
+; BMI264-NEXT: retq
%shl = shl i64 %x, 7
-; BMI264: shl64i
-; BMI264-NOT: shlxq
-; BMI264: ret
ret i64 %shl
}
define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: shl64p:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlxq %rsi, (%rdi), %rax
+; BMI264-NEXT: retq
%x = load i64, i64* %p
%shl = shl i64 %x, %shamt
-; BMI264: shl64p
-; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
-; BMI264: ret
ret i64 %shl
}
define i64 @shl64pi(i64* %p) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: shl64pi:
+; BMI264: # %bb.0:
+; BMI264-NEXT: movq (%rdi), %rax
+; BMI264-NEXT: shlq $7, %rax
+; BMI264-NEXT: retq
%x = load i64, i64* %p
%shl = shl i64 %x, 7
-; BMI264: shl64pi
-; BMI264-NOT: shlxq
-; BMI264: ret
ret i64 %shl
}
define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: lshr32:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
+; BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: lshr32:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shrxl %esi, %edi, %eax
+; BMI264-NEXT: retq
%shl = lshr i32 %x, %shamt
-; BMI2: lshr32
-; BMI2: shrxl
-; BMI2: ret
-; BMI264: lshr32
-; BMI264: shrxl
-; BMI264: ret
ret i32 %shl
}
define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: lshr32p:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT: shrxl %ecx, (%eax), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: lshr32p:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shrxl %esi, (%rdi), %eax
+; BMI264-NEXT: retq
%x = load i32, i32* %p
%shl = lshr i32 %x, %shamt
-; BMI2: lshr32p
-; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
-; BMI2: ret
-; BMI264: lshr32p
-; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
-; BMI264: ret
ret i32 %shl
}
define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: lshr64:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shrxq %rsi, %rdi, %rax
+; BMI264-NEXT: retq
%shl = lshr i64 %x, %shamt
-; BMI264: lshr64
-; BMI264: shrxq
-; BMI264: ret
ret i64 %shl
}
define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: lshr64p:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shrxq %rsi, (%rdi), %rax
+; BMI264-NEXT: retq
%x = load i64, i64* %p
%shl = lshr i64 %x, %shamt
-; BMI264: lshr64p
-; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
-; BMI264: ret
ret i64 %shl
}
define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: ashr32:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
+; BMI2-NEXT: sarxl %eax, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: ashr32:
+; BMI264: # %bb.0:
+; BMI264-NEXT: sarxl %esi, %edi, %eax
+; BMI264-NEXT: retq
%shl = ashr i32 %x, %shamt
-; BMI2: ashr32
-; BMI2: sarxl
-; BMI2: ret
-; BMI264: ashr32
-; BMI264: sarxl
-; BMI264: ret
ret i32 %shl
}
define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
-entry:
+; BMI2-LABEL: ashr32p:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %cl
+; BMI2-NEXT: sarxl %ecx, (%eax), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: ashr32p:
+; BMI264: # %bb.0:
+; BMI264-NEXT: sarxl %esi, (%rdi), %eax
+; BMI264-NEXT: retq
%x = load i32, i32* %p
%shl = ashr i32 %x, %shamt
-; BMI2: ashr32p
-; Source order scheduling prevents folding, rdar:14208996.
-; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
-; BMI2: ret
-; BMI264: ashr32p
-; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
-; BMI264: ret
ret i32 %shl
}
define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: ashr64:
+; BMI264: # %bb.0:
+; BMI264-NEXT: sarxq %rsi, %rdi, %rax
+; BMI264-NEXT: retq
%shl = ashr i64 %x, %shamt
-; BMI264: ashr64
-; BMI264: sarxq
-; BMI264: ret
ret i64 %shl
}
define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
-entry:
+; BMI264-LABEL: ashr64p:
+; BMI264: # %bb.0:
+; BMI264-NEXT: sarxq %rsi, (%rdi), %rax
+; BMI264-NEXT: retq
%x = load i64, i64* %p
%shl = ashr i64 %x, %shamt
-; BMI264: ashr64p
-; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
-; BMI264: ret
ret i64 %shl
}
+
+define i32 @shl32and(i32 %t, i32 %val) nounwind {
+; BMI2-LABEL: shl32and:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
+; BMI2-NEXT: shlxl %eax, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: shl32and:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlxl %edi, %esi, %eax
+; BMI264-NEXT: retq
+ %shamt = and i32 %t, 31
+ %res = shl i32 %val, %shamt
+ ret i32 %res
+}
+
+define i64 @shl64and(i64 %t, i64 %val) nounwind {
+; BMI264-LABEL: shl64and:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shlxq %rdi, %rsi, %rax
+; BMI264-NEXT: retq
+ %shamt = and i64 %t, 63
+ %res = shl i64 %val, %shamt
+ ret i64 %res
+}
+
+define i32 @lshr32and(i32 %t, i32 %val) nounwind {
+; BMI2-LABEL: lshr32and:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
+; BMI2-NEXT: shrxl %eax, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: lshr32and:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shrxl %edi, %esi, %eax
+; BMI264-NEXT: retq
+ %shamt = and i32 %t, 31
+ %res = lshr i32 %val, %shamt
+ ret i32 %res
+}
+
+define i64 @lshr64and(i64 %t, i64 %val) nounwind {
+; BMI264-LABEL: lshr64and:
+; BMI264: # %bb.0:
+; BMI264-NEXT: shrxq %rdi, %rsi, %rax
+; BMI264-NEXT: retq
+ %shamt = and i64 %t, 63
+ %res = lshr i64 %val, %shamt
+ ret i64 %res
+}
+
+define i32 @ashr32and(i32 %t, i32 %val) nounwind {
+; BMI2-LABEL: ashr32and:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movb {{[0-9]+}}(%esp), %al
+; BMI2-NEXT: sarxl %eax, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; BMI264-LABEL: ashr32and:
+; BMI264: # %bb.0:
+; BMI264-NEXT: sarxl %edi, %esi, %eax
+; BMI264-NEXT: retq
+ %shamt = and i32 %t, 31
+ %res = ashr i32 %val, %shamt
+ ret i32 %res
+}
+
+define i64 @ashr64and(i64 %t, i64 %val) nounwind {
+; BMI264-LABEL: ashr64and:
+; BMI264: # %bb.0:
+; BMI264-NEXT: sarxq %rdi, %rsi, %rax
+; BMI264-NEXT: retq
+ %shamt = and i64 %t, 63
+ %res = ashr i64 %val, %shamt
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/shift-coalesce.ll b/test/CodeGen/X86/shift-coalesce.ll
index dee7d373dcee..7f2c3b5a0752 100644
--- a/test/CodeGen/X86/shift-coalesce.ll
+++ b/test/CodeGen/X86/shift-coalesce.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | \
; RUN: grep "shld.*cl"
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
+; RUN: llc < %s -mtriple=i686-- -x86-asm-syntax=intel | \
; RUN: not grep "mov cl, bl"
; PR687
diff --git a/test/CodeGen/X86/shift-codegen.ll b/test/CodeGen/X86/shift-codegen.ll
index 295a55d86a00..838ec789db5b 100644
--- a/test/CodeGen/X86/shift-codegen.ll
+++ b/test/CodeGen/X86/shift-codegen.ll
@@ -9,7 +9,7 @@ target triple = "i686-apple-darwin8"
define void @fn1() {
; CHECK-LABEL: fn1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl Y, %eax
; CHECK-NEXT: shll $3, %eax
; CHECK-NEXT: orl %eax, X
@@ -24,7 +24,7 @@ define void @fn1() {
define i32 @fn2(i32 %X, i32 %Y) {
; CHECK-LABEL: fn2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shll $3, %eax
; CHECK-NEXT: orl {{[0-9]+}}(%esp), %eax
diff --git a/test/CodeGen/X86/shift-combine.ll b/test/CodeGen/X86/shift-combine.ll
index 6e132f25bf39..0f2966f962b6 100644
--- a/test/CodeGen/X86/shift-combine.ll
+++ b/test/CodeGen/X86/shift-combine.ll
@@ -6,15 +6,15 @@
define i32 @test_lshr_and(i32 %x) {
; X32-LABEL: test_lshr_and:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: andl $12, %eax
; X32-NEXT: movl array(%eax), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_lshr_and:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: shrl $2, %edi
; X64-NEXT: andl $3, %edi
; X64-NEXT: movl array(,%rdi,4), %eax
@@ -28,7 +28,7 @@ define i32 @test_lshr_and(i32 %x) {
define i32* @test_exact1(i32 %a, i32 %b, i32* %x) {
; X32-LABEL: test_exact1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
; X32-NEXT: sarl %eax
@@ -36,7 +36,7 @@ define i32* @test_exact1(i32 %a, i32 %b, i32* %x) {
; X32-NEXT: retl
;
; X64-LABEL: test_exact1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subl %edi, %esi
; X64-NEXT: sarl $3, %esi
; X64-NEXT: movslq %esi, %rax
@@ -50,7 +50,7 @@ define i32* @test_exact1(i32 %a, i32 %b, i32* %x) {
define i32* @test_exact2(i32 %a, i32 %b, i32* %x) {
; X32-LABEL: test_exact2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
; X32-NEXT: sarl %eax
@@ -58,7 +58,7 @@ define i32* @test_exact2(i32 %a, i32 %b, i32* %x) {
; X32-NEXT: retl
;
; X64-LABEL: test_exact2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subl %edi, %esi
; X64-NEXT: sarl $3, %esi
; X64-NEXT: movslq %esi, %rax
@@ -72,14 +72,14 @@ define i32* @test_exact2(i32 %a, i32 %b, i32* %x) {
define i32* @test_exact3(i32 %a, i32 %b, i32* %x) {
; X32-LABEL: test_exact3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_exact3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subl %edi, %esi
; X64-NEXT: sarl $2, %esi
; X64-NEXT: movslq %esi, %rax
@@ -93,7 +93,7 @@ define i32* @test_exact3(i32 %a, i32 %b, i32* %x) {
define i32* @test_exact4(i32 %a, i32 %b, i32* %x) {
; X32-LABEL: test_exact4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shrl %eax
@@ -101,8 +101,8 @@ define i32* @test_exact4(i32 %a, i32 %b, i32* %x) {
; X32-NEXT: retl
;
; X64-LABEL: test_exact4:
-; X64: # BB#0:
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: subl %edi, %esi
; X64-NEXT: shrl $3, %esi
; X64-NEXT: leaq (%rdx,%rsi,4), %rax
@@ -115,7 +115,7 @@ define i32* @test_exact4(i32 %a, i32 %b, i32* %x) {
define i32* @test_exact5(i32 %a, i32 %b, i32* %x) {
; X32-LABEL: test_exact5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shrl %eax
@@ -123,8 +123,8 @@ define i32* @test_exact5(i32 %a, i32 %b, i32* %x) {
; X32-NEXT: retl
;
; X64-LABEL: test_exact5:
-; X64: # BB#0:
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: subl %edi, %esi
; X64-NEXT: shrl $3, %esi
; X64-NEXT: leaq (%rdx,%rsi,4), %rax
@@ -137,15 +137,15 @@ define i32* @test_exact5(i32 %a, i32 %b, i32* %x) {
define i32* @test_exact6(i32 %a, i32 %b, i32* %x) {
; X32-LABEL: test_exact6:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: subl {{[0-9]+}}(%esp), %eax
; X32-NEXT: addl {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_exact6:
-; X64: # BB#0:
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: subl %edi, %esi
; X64-NEXT: leaq (%rsi,%rdx), %rax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/shift-double-x86_64.ll b/test/CodeGen/X86/shift-double-x86_64.ll
index 28f6731e25eb..0d5d9498fda2 100644
--- a/test/CodeGen/X86/shift-double-x86_64.ll
+++ b/test/CodeGen/X86/shift-double-x86_64.ll
@@ -5,7 +5,7 @@
define i64 @test1(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $63, %edx
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shldq %cl, %rsi, %rdi
@@ -21,7 +21,7 @@ define i64 @test1(i64 %hi, i64 %lo, i64 %bits) nounwind {
define i64 @test2(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andl $63, %edx
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shrdq %cl, %rdi, %rsi
@@ -37,7 +37,7 @@ define i64 @test2(i64 %hi, i64 %lo, i64 %bits) nounwind {
define i64 @test3(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shldq %cl, %rsi, %rdi
; CHECK-NEXT: movq %rdi, %rax
@@ -51,7 +51,7 @@ define i64 @test3(i64 %hi, i64 %lo, i64 %bits) nounwind {
define i64 @test4(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shrdq %cl, %rdi, %rsi
; CHECK-NEXT: movq %rsi, %rax
@@ -65,7 +65,7 @@ define i64 @test4(i64 %hi, i64 %lo, i64 %bits) nounwind {
define i64 @test5(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shldq %cl, %rsi, %rdi
; CHECK-NEXT: movq %rdi, %rax
@@ -80,7 +80,7 @@ define i64 @test5(i64 %hi, i64 %lo, i64 %bits) nounwind {
define i64 @test6(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shrdq %cl, %rsi, %rdi
; CHECK-NEXT: movq %rdi, %rax
@@ -95,7 +95,7 @@ define i64 @test6(i64 %hi, i64 %lo, i64 %bits) nounwind {
define i64 @test7(i64 %hi, i64 %lo, i64 %bits) nounwind {
; CHECK-LABEL: test7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: shrdq %cl, %rsi, %rdi
; CHECK-NEXT: movq %rdi, %rax
diff --git a/test/CodeGen/X86/shift-double.ll b/test/CodeGen/X86/shift-double.ll
index 8594c0713298..f7ea2e339c32 100644
--- a/test/CodeGen/X86/shift-double.ll
+++ b/test/CodeGen/X86/shift-double.ll
@@ -1,73 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
; Shift i64 integers on 32-bit target
define i64 @test1(i64 %X, i8 %C) nounwind {
-; CHECK-LABEL: test1:
-; CHECK: # BB#0:
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: shll %cl, %eax
-; CHECK-NEXT: shldl %cl, %esi, %edx
-; CHECK-NEXT: testb $32, %cl
-; CHECK-NEXT: je .LBB0_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: retl
+; X86-LABEL: test1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB0_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB0_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: test1:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shlq %cl, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%shift.upgrd.1 = zext i8 %C to i64 ; <i64> [#uses=1]
%Y = shl i64 %X, %shift.upgrd.1 ; <i64> [#uses=1]
ret i64 %Y
}
define i64 @test2(i64 %X, i8 %C) nounwind {
-; CHECK-LABEL: test2:
-; CHECK: # BB#0:
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, %edx
-; CHECK-NEXT: sarl %cl, %edx
-; CHECK-NEXT: shrdl %cl, %esi, %eax
-; CHECK-NEXT: testb $32, %cl
-; CHECK-NEXT: je .LBB1_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: sarl $31, %esi
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: movl %esi, %edx
-; CHECK-NEXT: .LBB1_2:
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: retl
+; X86-LABEL: test2:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sarl %cl, %edx
+; X86-NEXT: shrdl %cl, %esi, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB1_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: .LBB1_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: sarq %cl, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%shift.upgrd.2 = zext i8 %C to i64 ; <i64> [#uses=1]
%Y = ashr i64 %X, %shift.upgrd.2 ; <i64> [#uses=1]
ret i64 %Y
}
define i64 @test3(i64 %X, i8 %C) nounwind {
-; CHECK-LABEL: test3:
-; CHECK: # BB#0:
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl %esi, %edx
-; CHECK-NEXT: shrl %cl, %edx
-; CHECK-NEXT: shrdl %cl, %esi, %eax
-; CHECK-NEXT: testb $32, %cl
-; CHECK-NEXT: je .LBB2_2
-; CHECK-NEXT: # BB#1:
-; CHECK-NEXT: movl %edx, %eax
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: .LBB2_2:
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: retl
+; X86-LABEL: test3:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: shrdl %cl, %esi, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB2_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: .LBB2_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrq %cl, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%shift.upgrd.3 = zext i8 %C to i64 ; <i64> [#uses=1]
%Y = lshr i64 %X, %shift.upgrd.3 ; <i64> [#uses=1]
ret i64 %Y
@@ -76,13 +98,20 @@ define i64 @test3(i64 %X, i8 %C) nounwind {
; Combine 2xi32/2xi16 shifts into SHLD
define i32 @test4(i32 %A, i32 %B, i8 %C) nounwind {
-; CHECK-LABEL: test4:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shldl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test4:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test4:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%shift.upgrd.4 = zext i8 %C to i32 ; <i32> [#uses=1]
%X = shl i32 %A, %shift.upgrd.4 ; <i32> [#uses=1]
%Cv = sub i8 32, %C ; <i8> [#uses=1]
@@ -93,13 +122,20 @@ define i32 @test4(i32 %A, i32 %B, i8 %C) nounwind {
}
define i16 @test5(i16 %A, i16 %B, i8 %C) nounwind {
-; CHECK-LABEL: test5:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shldw %cl, %dx, %ax
-; CHECK-NEXT: retl
+; X86-LABEL: test5:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shldw %cl, %dx, %ax
+; X86-NEXT: retl
+;
+; X64-LABEL: test5:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldw %cl, %si, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%shift.upgrd.6 = zext i8 %C to i16 ; <i16> [#uses=1]
%X = shl i16 %A, %shift.upgrd.6 ; <i16> [#uses=1]
%Cv = sub i8 16, %C ; <i8> [#uses=1]
@@ -112,13 +148,20 @@ define i16 @test5(i16 %A, i16 %B, i8 %C) nounwind {
; Combine 2xi32/2xi16 shifts into SHRD
define i32 @test6(i32 %A, i32 %B, i8 %C) nounwind {
-; CHECK-LABEL: test6:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test6:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test6:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%shift.upgrd.4 = zext i8 %C to i32 ; <i32> [#uses=1]
%X = lshr i32 %A, %shift.upgrd.4 ; <i32> [#uses=1]
%Cv = sub i8 32, %C ; <i8> [#uses=1]
@@ -129,13 +172,20 @@ define i32 @test6(i32 %A, i32 %B, i8 %C) nounwind {
}
define i16 @test7(i16 %A, i16 %B, i8 %C) nounwind {
-; CHECK-LABEL: test7:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shrdw %cl, %dx, %ax
-; CHECK-NEXT: retl
+; X86-LABEL: test7:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrdw %cl, %dx, %ax
+; X86-NEXT: retl
+;
+; X64-LABEL: test7:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdw %cl, %si, %di
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%shift.upgrd.6 = zext i8 %C to i16 ; <i16> [#uses=1]
%X = lshr i16 %A, %shift.upgrd.6 ; <i16> [#uses=1]
%Cv = sub i8 16, %C ; <i8> [#uses=1]
@@ -148,17 +198,25 @@ define i16 @test7(i16 %A, i16 %B, i8 %C) nounwind {
; Shift i64 integers on 32-bit target by shift value less than 32 (PR14593)
define i64 @test8(i64 %val, i32 %bits) nounwind {
-; CHECK-LABEL: test8:
-; CHECK: # BB#0:
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl %esi, %eax
-; CHECK-NEXT: shll %cl, %eax
-; CHECK-NEXT: shldl %cl, %esi, %edx
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: retl
+; X86-LABEL: test8:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # %bb.0:
+; X64-NEXT: andb $31, %sil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shlq %cl, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%and = and i32 %bits, 31
%sh_prom = zext i32 %and to i64
%shl = shl i64 %val, %sh_prom
@@ -166,14 +224,22 @@ define i64 @test8(i64 %val, i32 %bits) nounwind {
}
define i64 @test9(i64 %val, i32 %bits) nounwind {
-; CHECK-LABEL: test9:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: sarl %cl, %edx
-; CHECK-NEXT: retl
+; X86-LABEL: test9:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: sarl %cl, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test9:
+; X64: # %bb.0:
+; X64-NEXT: andb $31, %sil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: sarq %cl, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%and = and i32 %bits, 31
%sh_prom = zext i32 %and to i64
%ashr = ashr i64 %val, %sh_prom
@@ -181,14 +247,22 @@ define i64 @test9(i64 %val, i32 %bits) nounwind {
}
define i64 @test10(i64 %val, i32 %bits) nounwind {
-; CHECK-LABEL: test10:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: shrl %cl, %edx
-; CHECK-NEXT: retl
+; X86-LABEL: test10:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test10:
+; X64: # %bb.0:
+; X64-NEXT: andb $31, %sil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrq %cl, %rdi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%and = and i32 %bits, 31
%sh_prom = zext i32 %and to i64
%lshr = lshr i64 %val, %sh_prom
@@ -198,15 +272,23 @@ define i64 @test10(i64 %val, i32 %bits) nounwind {
; SHLD/SHRD manual shifts
define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test11:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: andl $31, %ecx
-; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; CHECK-NEXT: shldl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test11:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: # kill: def %cl killed %cl killed %ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test11:
+; X64: # %bb.0:
+; X64-NEXT: andl $31, %edx
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%and = and i32 %bits, 31
%and32 = sub i32 32, %and
%sh_lo = lshr i32 %lo, %and32
@@ -216,15 +298,23 @@ define i32 @test11(i32 %hi, i32 %lo, i32 %bits) nounwind {
}
define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test12:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: andl $31, %ecx
-; CHECK-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test12:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: # kill: def %cl killed %cl killed %ecx
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test12:
+; X64: # %bb.0:
+; X64-NEXT: andl $31, %edx
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdl %cl, %edi, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
%and = and i32 %bits, 31
%and32 = sub i32 32, %and
%sh_lo = shl i32 %hi, %and32
@@ -234,13 +324,20 @@ define i32 @test12(i32 %hi, i32 %lo, i32 %bits) nounwind {
}
define i32 @test13(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test13:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shldl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test13:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test13:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%bits32 = sub i32 32, %bits
%sh_lo = lshr i32 %lo, %bits32
%sh_hi = shl i32 %hi, %bits
@@ -249,13 +346,20 @@ define i32 @test13(i32 %hi, i32 %lo, i32 %bits) nounwind {
}
define i32 @test14(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test14:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test14:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test14:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdl %cl, %edi, %esi
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: retq
%bits32 = sub i32 32, %bits
%sh_lo = shl i32 %hi, %bits32
%sh_hi = lshr i32 %lo, %bits
@@ -264,13 +368,20 @@ define i32 @test14(i32 %hi, i32 %lo, i32 %bits) nounwind {
}
define i32 @test15(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test15:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shldl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test15:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test15:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shldl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%bits32 = xor i32 %bits, 31
%lo2 = lshr i32 %lo, 1
%sh_lo = lshr i32 %lo2, %bits32
@@ -280,13 +391,20 @@ define i32 @test15(i32 %hi, i32 %lo, i32 %bits) nounwind {
}
define i32 @test16(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test16:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test16:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test16:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%bits32 = xor i32 %bits, 31
%lo2 = shl i32 %lo, 1
%sh_lo = shl i32 %lo2, %bits32
@@ -296,13 +414,20 @@ define i32 @test16(i32 %hi, i32 %lo, i32 %bits) nounwind {
}
define i32 @test17(i32 %hi, i32 %lo, i32 %bits) nounwind {
-; CHECK-LABEL: test17:
-; CHECK: # BB#0:
-; CHECK-NEXT: movb {{[0-9]+}}(%esp), %cl
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: shrdl %cl, %edx, %eax
-; CHECK-NEXT: retl
+; X86-LABEL: test17:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test17:
+; X64: # %bb.0:
+; X64-NEXT: movl %edx, %ecx
+; X64-NEXT: shrdl %cl, %esi, %edi
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: retq
%bits32 = xor i32 %bits, 31
%lo2 = add i32 %lo, %lo
%sh_lo = shl i32 %lo2, %bits32
diff --git a/test/CodeGen/X86/shift-folding.ll b/test/CodeGen/X86/shift-folding.ll
index 76cf4a41a6cb..d8cc50cb01d3 100644
--- a/test/CodeGen/X86/shift-folding.ll
+++ b/test/CodeGen/X86/shift-folding.ll
@@ -3,7 +3,7 @@
define i32* @test1(i32* %P, i32 %X) {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: andl $-4, %eax
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
@@ -16,7 +16,7 @@ define i32* @test1(i32* %P, i32 %X) {
define i32* @test2(i32* %P, i32 %X) {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shll $4, %eax
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
@@ -29,7 +29,7 @@ define i32* @test2(i32* %P, i32 %X) {
define i32* @test3(i32* %P, i32 %X) {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: andl $-4, %eax
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
@@ -41,7 +41,7 @@ define i32* @test3(i32* %P, i32 %X) {
define fastcc i32 @test4(i32* %d) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl 3(%ecx), %eax
; CHECK-NEXT: retl
%tmp4 = load i32, i32* %d
@@ -54,7 +54,7 @@ define fastcc i32 @test4(i32* %d) {
define i64 @test5(i16 %i, i32* %arr) {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shrl $11, %eax
diff --git a/test/CodeGen/X86/shift-i128.ll b/test/CodeGen/X86/shift-i128.ll
index 802277ea1121..9c69aab5b3d1 100644
--- a/test/CodeGen/X86/shift-i128.ll
+++ b/test/CodeGen/X86/shift-i128.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=i686--
+; RUN: llc < %s -mtriple=x86_64--
;
; Scalars
diff --git a/test/CodeGen/X86/shift-i256.ll b/test/CodeGen/X86/shift-i256.ll
index 866e7e67fb0a..4fa3303baf04 100644
--- a/test/CodeGen/X86/shift-i256.ll
+++ b/test/CodeGen/X86/shift-i256.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -O0 | FileCheck %s -check-prefix=CHECK-X64
-; RUN: llc < %s -march=x86-64 -O2 | FileCheck %s -check-prefix=CHECK-X64
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -O0 | FileCheck %s -check-prefix=CHECK-X64
+; RUN: llc < %s -mtriple=x86_64-- -O2 | FileCheck %s -check-prefix=CHECK-X64
; CHECK-LABEL: shift1
define void @shift1(i256 %x, i256 %a, i256* nocapture %r) nounwind readnone {
diff --git a/test/CodeGen/X86/shift-one.ll b/test/CodeGen/X86/shift-one.ll
index 1ff02eb53e93..d961eb1451b9 100644
--- a/test/CodeGen/X86/shift-one.ll
+++ b/test/CodeGen/X86/shift-one.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep leal
+; RUN: llc < %s -mtriple=i686-- | not grep leal
@x = external global i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/shift-pair.ll b/test/CodeGen/X86/shift-pair.ll
index 62e51f002f7d..01ebfcd321f6 100644
--- a/test/CodeGen/X86/shift-pair.ll
+++ b/test/CodeGen/X86/shift-pair.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
define i64 @test(i64 %A) {
; CHECK: @test
diff --git a/test/CodeGen/X86/shift-parts.ll b/test/CodeGen/X86/shift-parts.ll
index 0b25a7595f2a..678866979814 100644
--- a/test/CodeGen/X86/shift-parts.ll
+++ b/test/CodeGen/X86/shift-parts.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
; PR4736
%0 = type { i32, i8, [35 x i8] }
diff --git a/test/CodeGen/X86/shift-pcmp.ll b/test/CodeGen/X86/shift-pcmp.ll
index f509da2674bc..e3ca10353cd7 100644
--- a/test/CodeGen/X86/shift-pcmp.ll
+++ b/test/CodeGen/X86/shift-pcmp.ll
@@ -4,13 +4,13 @@
define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: foo:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foo:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -23,13 +23,13 @@ define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
; Don't fail with an assert due to an undef in the buildvector
define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: bar:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: bar:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/shl-anyext.ll b/test/CodeGen/X86/shl-anyext.ll
index 0a5d047d23d3..dbf560c6267e 100644
--- a/test/CodeGen/X86/shl-anyext.ll
+++ b/test/CodeGen/X86/shl-anyext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
; Codegen should be able to use a 32-bit shift instead of a 64-bit shift.
; CHECK: shll $16
diff --git a/test/CodeGen/X86/shl-crash-on-legalize.ll b/test/CodeGen/X86/shl-crash-on-legalize.ll
index 2029bae8c463..22735f07b0a1 100644
--- a/test/CodeGen/X86/shl-crash-on-legalize.ll
+++ b/test/CodeGen/X86/shl-crash-on-legalize.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind uwtable
define i32 @_Z3foov() local_unnamed_addr #0 {
; CHECK-LABEL: _Z3foov:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rax, {{.*}}(%rip)
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/shl-i64.ll b/test/CodeGen/X86/shl-i64.ll
index 849912cc12e9..9326cc2d3ecc 100644
--- a/test/CodeGen/X86/shl-i64.ll
+++ b/test/CodeGen/X86/shl-i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -mattr=+sse2 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -mattr=+sse2 < %s | FileCheck %s
; Make sure that we don't generate an illegal i64 extract after LegalizeType.
; CHECK: shll
diff --git a/test/CodeGen/X86/shl_elim.ll b/test/CodeGen/X86/shl_elim.ll
index 4762b13b516f..ed67a09f06c6 100644
--- a/test/CodeGen/X86/shl_elim.ll
+++ b/test/CodeGen/X86/shl_elim.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
define i32 @test1(i64 %a) nounwind {
%tmp29 = lshr i64 %a, 24 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index 7f35258377ec..32dcf4268253 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -5,10 +5,10 @@ declare void @bar()
define void @test1(i32* nocapture %X) nounwind minsize {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $47, (%rdi)
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%tmp1 = load i32, i32* %X, align 4
@@ -26,10 +26,10 @@ if.end:
define void @test2(i32 %X) nounwind minsize {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $47, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%and = and i32 %X, 255
@@ -46,10 +46,10 @@ if.end:
define void @test3(i32 %X) nounwind minsize {
; CHECK-LABEL: test3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $-1, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%and = and i32 %X, 255
@@ -67,16 +67,16 @@ if.end:
; PR16083
define i1 @test4(i64 %a, i32 %b) {
; CHECK-LABEL: test4:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: je .LBB3_1
-; CHECK-NEXT: # BB#2: # %lor.end
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: # %bb.2: # %lor.end
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
; CHECK-NEXT: .LBB3_1: # %lor.rhs
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
; CHECK-NEXT: retq
entry:
%tobool = icmp ne i32 %b, 0
@@ -97,14 +97,14 @@ lor.end: ; preds = %lor.rhs, %entry
; PR16551
define void @test5(i32 %X) nounwind minsize {
; CHECK-LABEL: test5:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl x+{{.*}}(%rip), %eax
; CHECK-NEXT: shll $16, %eax
; CHECK-NEXT: movzwl x+{{.*}}(%rip), %ecx
; CHECK-NEXT: orl %eax, %ecx
; CHECK-NEXT: cmpl $1, %ecx
; CHECK-NEXT: jne bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%bf.load = load i56, i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
@@ -123,11 +123,11 @@ if.end:
define void @test2_1(i32 %X) nounwind minsize {
; CHECK-LABEL: test2_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: cmpl $256, %eax # imm = 0x100
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%and = and i32 %X, 255
@@ -144,10 +144,10 @@ if.end:
define void @test_sext_i8_icmp_1(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $1, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -164,10 +164,10 @@ if.end:
define void @test_sext_i8_icmp_47(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_47:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $47, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -184,10 +184,10 @@ if.end:
define void @test_sext_i8_icmp_127(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_127:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $127, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -204,10 +204,10 @@ if.end:
define void @test_sext_i8_icmp_neg1(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_neg1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $-1, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -224,10 +224,10 @@ if.end:
define void @test_sext_i8_icmp_neg2(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_neg2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $-2, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -244,10 +244,10 @@ if.end:
define void @test_sext_i8_icmp_neg127(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_neg127:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $-127, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -264,10 +264,10 @@ if.end:
define void @test_sext_i8_icmp_neg128(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_neg128:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpb $-128, %dil
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
@@ -284,11 +284,11 @@ if.end:
define void @test_sext_i8_icmp_255(i8 %x) nounwind minsize {
; CHECK-LABEL: test_sext_i8_icmp_255:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je bar # TAILCALL
-; CHECK-NEXT: # BB#1: # %if.end
+; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: retq
entry:
%sext = sext i8 %x to i32
diff --git a/test/CodeGen/X86/shrink-fp-const1.ll b/test/CodeGen/X86/shrink-fp-const1.ll
index 49b9fa3c4129..ba769c194a52 100644
--- a/test/CodeGen/X86/shrink-fp-const1.ll
+++ b/test/CodeGen/X86/shrink-fp-const1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | not grep cvtss2sd
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | not grep cvtss2sd
; PR1264
define double @foo(double %x) {
diff --git a/test/CodeGen/X86/shrink-fp-const2.ll b/test/CodeGen/X86/shrink-fp-const2.ll
index 3d5203be09a0..b62a69cab18a 100644
--- a/test/CodeGen/X86/shrink-fp-const2.ll
+++ b/test/CodeGen/X86/shrink-fp-const2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep flds
+; RUN: llc < %s -mtriple=i686-- | grep flds
; This should be a flds, not fldt.
define x86_fp80 @test2() nounwind {
entry:
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index d5cd8b0525dd..5700b1df15bd 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
@c = external global i32*, align 8
@@ -11,20 +11,42 @@
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_2xi8:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
-; CHECK-NEXT: movd %ecx, %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movzwl (%edx,%ecx), %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movzwl (%eax,%ecx), %eax
+; X86-NEXT: movd %eax, %xmm1
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-NEXT: movd %ecx, %xmm1
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -49,18 +71,38 @@ entry:
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_4xi8:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_4xi8:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_4xi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -85,21 +127,44 @@ entry:
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_8xi8:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_8xi8:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_8xi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -124,31 +189,64 @@ entry:
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_16xi8:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
-; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: movdqa %xmm0, %xmm3
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; CHECK-NEXT: movdqa %xmm1, %xmm4
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; CHECK-NEXT: pmullw %xmm3, %xmm4
-; CHECK-NEXT: movdqa %xmm4, %xmm3
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; CHECK-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; CHECK-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm3, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_16xi8:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-NEXT: movdqu (%eax,%ecx), %xmm1
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X86-NEXT: movdqa %xmm1, %xmm4
+; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-NEXT: pmullw %xmm3, %xmm4
+; X86-NEXT: movdqa %xmm4, %xmm3
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X86-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X86-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm3, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_16xi8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-NEXT: movdqu (%rsi,%rdx), %xmm1
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT: movdqa %xmm1, %xmm4
+; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-NEXT: pmullw %xmm3, %xmm4
+; X64-NEXT: movdqa %xmm4, %xmm3
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; X64-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm3, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -173,17 +271,36 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_2xi16:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pmulhuw %xmm0, %xmm2
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pmulhuw %xmm0, %xmm2
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -208,17 +325,36 @@ entry:
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_4xi16:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_4xi16:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pmulhuw %xmm0, %xmm2
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT: movdqu %xmm1, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_4xi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pmulhuw %xmm0, %xmm2
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: movdqu %xmm1, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -243,20 +379,42 @@ entry:
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_8xi16:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
-; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pmulhuw %xmm0, %xmm2
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_8xi16:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-NEXT: movdqu (%eax,%ecx), %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pmulhuw %xmm0, %xmm2
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_8xi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-NEXT: movdqu (%rsi,%rdx), %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pmulhuw %xmm0, %xmm2
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -281,30 +439,62 @@ entry:
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_16xi16:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
-; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
-; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; CHECK-NEXT: movdqa %xmm2, %xmm4
-; CHECK-NEXT: pmulhuw %xmm0, %xmm4
-; CHECK-NEXT: pmullw %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; CHECK-NEXT: movdqa %xmm3, %xmm4
-; CHECK-NEXT: pmulhuw %xmm1, %xmm4
-; CHECK-NEXT: pmullw %xmm1, %xmm3
-; CHECK-NEXT: movdqa %xmm3, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_16xi16:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1
+; X86-NEXT: movdqu (%eax,%ecx), %xmm2
+; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3
+; X86-NEXT: movdqa %xmm2, %xmm4
+; X86-NEXT: pmulhuw %xmm0, %xmm4
+; X86-NEXT: pmullw %xmm0, %xmm2
+; X86-NEXT: movdqa %xmm2, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-NEXT: movdqa %xmm3, %xmm4
+; X86-NEXT: pmulhuw %xmm1, %xmm4
+; X86-NEXT: pmullw %xmm1, %xmm3
+; X86-NEXT: movdqa %xmm3, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_16xi16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; X64-NEXT: movdqu (%rsi,%rdx), %xmm2
+; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; X64-NEXT: movdqa %xmm2, %xmm4
+; X64-NEXT: pmulhuw %xmm0, %xmm4
+; X64-NEXT: pmullw %xmm0, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-NEXT: movdqa %xmm3, %xmm4
+; X64-NEXT: pmulhuw %xmm1, %xmm4
+; X64-NEXT: pmullw %xmm1, %xmm3
+; X64-NEXT: movdqa %xmm3, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -329,22 +519,46 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_2xi8_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
-; CHECK-NEXT: movd %ecx, %xmm1
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-NEXT: psraw $8, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-NEXT: psraw $8, %xmm1
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT: psrad $16, %xmm0
-; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_sext:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movzwl (%edx,%ecx), %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movzwl (%eax,%ecx), %eax
+; X86-NEXT: movd %eax, %xmm1
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: psraw $8, %xmm0
+; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: psraw $8, %xmm1
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_sext:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-NEXT: movd %ecx, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: psraw $8, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: psraw $8, %xmm1
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -369,23 +583,48 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_2xi8_sext_zext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rdx), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: movzwl (%rsi,%rdx), %ecx
-; CHECK-NEXT: movd %ecx, %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-NEXT: psraw $8, %xmm0
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pmulhw %xmm0, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_sext_zext:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movzwl (%edx,%ecx), %edx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movzwl (%eax,%ecx), %eax
+; X86-NEXT: movd %eax, %xmm1
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: psraw $8, %xmm0
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pmulhw %xmm0, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_sext_zext:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rdx), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: movzwl (%rsi,%rdx), %ecx
+; X64-NEXT: movd %ecx, %xmm1
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: psraw $8, %xmm0
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pmulhw %xmm0, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -410,17 +649,36 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_2xi16_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: pmulhw %xmm0, %xmm2
-; CHECK-NEXT: pmullw %xmm0, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-NEXT: movq %xmm1, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16_sext:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: pmulhw %xmm0, %xmm2
+; X86-NEXT: pmullw %xmm0, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT: movq %xmm1, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16_sext:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: pmulhw %xmm0, %xmm2
+; X64-NEXT: pmullw %xmm0, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: movq %xmm1, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -445,30 +703,62 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_2xi16_sext_zext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; CHECK-NEXT: psrad $16, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: pxor %xmm2, %xmm2
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
-; CHECK-NEXT: movdqa %xmm1, %xmm2
-; CHECK-NEXT: psrlq $32, %xmm2
-; CHECK-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm0, %xmm3
-; CHECK-NEXT: psrlq $32, %xmm3
-; CHECK-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-NEXT: paddq %xmm2, %xmm3
-; CHECK-NEXT: psllq $32, %xmm3
-; CHECK-NEXT: pmuludq %xmm0, %xmm1
-; CHECK-NEXT: paddq %xmm3, %xmm1
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16_sext_zext:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT: pxor %xmm2, %xmm2
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm2
+; X86-NEXT: pmuludq %xmm0, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: psrlq $32, %xmm3
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: paddq %xmm2, %xmm3
+; X86-NEXT: psllq $32, %xmm3
+; X86-NEXT: pmuludq %xmm0, %xmm1
+; X86-NEXT: paddq %xmm3, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16_sext_zext:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: pxor %xmm2, %xmm2
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm2
+; X64-NEXT: pmuludq %xmm0, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: psrlq $32, %xmm3
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: paddq %xmm2, %xmm3
+; X64-NEXT: psllq $32, %xmm3
+; X64-NEXT: pmuludq %xmm0, %xmm1
+; X64-NEXT: paddq %xmm3, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -493,30 +783,62 @@ entry:
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
-; CHECK-LABEL: mul_16xi16_sext:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movdqu (%rdi,%rdx), %xmm0
-; CHECK-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; CHECK-NEXT: movdqu (%rsi,%rdx), %xmm2
-; CHECK-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; CHECK-NEXT: movdqa %xmm2, %xmm4
-; CHECK-NEXT: pmulhw %xmm0, %xmm4
-; CHECK-NEXT: pmullw %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; CHECK-NEXT: movdqa %xmm3, %xmm4
-; CHECK-NEXT: pmulhw %xmm1, %xmm4
-; CHECK-NEXT: pmullw %xmm1, %xmm3
-; CHECK-NEXT: movdqa %xmm3, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; CHECK-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; CHECK-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
-; CHECK-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_16xi16_sext:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %esi
+; X86-NEXT: .cfi_def_cfa_offset 8
+; X86-NEXT: .cfi_offset %esi, -8
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl c, %esi
+; X86-NEXT: movdqu (%edx,%ecx), %xmm0
+; X86-NEXT: movdqu 16(%edx,%ecx), %xmm1
+; X86-NEXT: movdqu (%eax,%ecx), %xmm2
+; X86-NEXT: movdqu 16(%eax,%ecx), %xmm3
+; X86-NEXT: movdqa %xmm2, %xmm4
+; X86-NEXT: pmulhw %xmm0, %xmm4
+; X86-NEXT: pmullw %xmm0, %xmm2
+; X86-NEXT: movdqa %xmm2, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X86-NEXT: movdqa %xmm3, %xmm4
+; X86-NEXT: pmulhw %xmm1, %xmm4
+; X86-NEXT: pmullw %xmm1, %xmm3
+; X86-NEXT: movdqa %xmm3, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X86-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X86-NEXT: movdqu %xmm3, 48(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm1, 32(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm2, 16(%esi,%ecx,4)
+; X86-NEXT: movdqu %xmm0, (%esi,%ecx,4)
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_16xi16_sext:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movdqu (%rdi,%rdx), %xmm0
+; X64-NEXT: movdqu 16(%rdi,%rdx), %xmm1
+; X64-NEXT: movdqu (%rsi,%rdx), %xmm2
+; X64-NEXT: movdqu 16(%rsi,%rdx), %xmm3
+; X64-NEXT: movdqa %xmm2, %xmm4
+; X64-NEXT: pmulhw %xmm0, %xmm4
+; X64-NEXT: pmullw %xmm0, %xmm2
+; X64-NEXT: movdqa %xmm2, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; X64-NEXT: movdqa %xmm3, %xmm4
+; X64-NEXT: pmulhw %xmm1, %xmm4
+; X64-NEXT: pmullw %xmm1, %xmm3
+; X64-NEXT: movdqa %xmm3, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; X64-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; X64-NEXT: movdqu %xmm3, 48(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm1, 32(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm2, 16(%rax,%rdx,4)
+; X64-NEXT: movdqu %xmm0, (%rax,%rdx,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -540,17 +862,31 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi8_varconst1:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_varconst1:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_varconst1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -570,18 +906,33 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi8_varconst2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-NEXT: psraw $8, %xmm0
-; CHECK-NEXT: pmullw {{.*}}(%rip), %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; CHECK-NEXT: psrad $16, %xmm0
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_varconst2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: psraw $8, %xmm0
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_varconst2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: psraw $8, %xmm0
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -601,20 +952,37 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi8_varconst3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmulhw %xmm1, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_varconst3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmulhw %xmm1, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_varconst3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmulhw %xmm1, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -634,20 +1002,37 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi8_varconst4:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmulhw %xmm1, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_varconst4:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmulhw %xmm1, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_varconst4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmulhw %xmm1, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -667,20 +1052,37 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi8_varconst5:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-NEXT: psraw $8, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmulhw %xmm1, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_varconst5:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: psraw $8, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmulhw %xmm1, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_varconst5:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: psraw $8, %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmulhw %xmm1, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -700,20 +1102,37 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi8_varconst6:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movzwl (%rdi,%rsi), %ecx
-; CHECK-NEXT: movd %ecx, %xmm0
-; CHECK-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-NEXT: psraw $8, %xmm0
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmulhw %xmm1, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi8_varconst6:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movzwl (%ecx,%eax), %ecx
+; X86-NEXT: movd %ecx, %xmm0
+; X86-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-NEXT: psraw $8, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmulhw %xmm1, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi8_varconst6:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movzwl (%rdi,%rsi), %ecx
+; X64-NEXT: movd %ecx, %xmm0
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X64-NEXT: psraw $8, %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmulhw %xmm1, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -733,17 +1152,31 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi16_varconst1:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmulhuw %xmm1, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16_varconst1:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmulhuw %xmm1, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16_varconst1:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmulhuw %xmm1, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -763,17 +1196,31 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi16_varconst2:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmulhw %xmm1, %xmm2
-; CHECK-NEXT: pmullw %xmm1, %xmm0
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16_varconst2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmulhw %xmm1, %xmm2
+; X86-NEXT: pmullw %xmm1, %xmm0
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16_varconst2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmulhw %xmm1, %xmm2
+; X64-NEXT: pmullw %xmm1, %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -793,25 +1240,45 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi16_varconst3:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; CHECK-NEXT: movl $65536, %ecx # imm = 0x10000
-; CHECK-NEXT: movq %rcx, %xmm1
-; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-NEXT: psrlq $32, %xmm0
-; CHECK-NEXT: pmuludq %xmm1, %xmm0
-; CHECK-NEXT: psllq $32, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16_varconst3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16_varconst3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movl $65536, %ecx # imm = 0x10000
+; X64-NEXT: movq %rcx, %xmm1
+; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -831,25 +1298,45 @@ entry:
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
-; CHECK-LABEL: mul_2xi16_varconst4:
-; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movq {{.*}}(%rip), %rax
-; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; CHECK-NEXT: psrad $16, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; CHECK-NEXT: movl $32768, %ecx # imm = 0x8000
-; CHECK-NEXT: movq %rcx, %xmm1
-; CHECK-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-NEXT: psrlq $32, %xmm0
-; CHECK-NEXT: pmuludq %xmm1, %xmm0
-; CHECK-NEXT: psllq $32, %xmm0
-; CHECK-NEXT: paddq %xmm2, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: movq %xmm0, (%rax,%rsi,4)
-; CHECK-NEXT: retq
+; X86-LABEL: mul_2xi16_varconst4:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl c, %edx
+; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-NEXT: movq %xmm0, (%edx,%eax,4)
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_2xi16_varconst4:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movq {{.*}}(%rip), %rax
+; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; X64-NEXT: psrad $16, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT: movl $32768, %ecx # imm = 0x8000
+; X64-NEXT: movq %rcx, %xmm1
+; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: movq %xmm0, (%rax,%rsi,4)
+; X64-NEXT: retq
entry:
%pre = load i32*, i32** @c
%tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
@@ -862,3 +1349,108 @@ entry:
store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
ret void
}
+
+;
+; Illegal Types
+;
+
+define void @PR34947() {
+; X86-LABEL: PR34947:
+; X86: # %bb.0:
+; X86-NEXT: movdqa (%eax), %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X86-NEXT: movd %xmm1, %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: movd %edx, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-NEXT: movd %xmm2, %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: movd %edx, %xmm2
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: movd %edx, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-NEXT: movd %xmm0, %ecx
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl %ecx
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: divl (%eax)
+; X86-NEXT: movd %edx, %xmm0
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X86-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X86-NEXT: pmuludq %xmm2, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X86-NEXT: pmuludq %xmm2, %xmm3
+; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-NEXT: movl $8199, %eax # imm = 0x2007
+; X86-NEXT: movd %eax, %xmm2
+; X86-NEXT: pmuludq %xmm0, %xmm2
+; X86-NEXT: movd %xmm2, (%eax)
+; X86-NEXT: movdqa %xmm1, (%eax)
+; X86-NEXT: retl
+;
+; X64-LABEL: PR34947:
+; X64: # %bb.0:
+; X64-NEXT: movdqa (%rax), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; X64-NEXT: movd %xmm1, %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-NEXT: movd %xmm2, %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: movd %edx, %xmm2
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: movd %edx, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X64-NEXT: movd %xmm0, %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %ecx
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl (%rax)
+; X64-NEXT: movd %edx, %xmm0
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
+; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; X64-NEXT: pmuludq %xmm2, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: pmuludq %xmm2, %xmm3
+; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X64-NEXT: movl $8199, %eax # imm = 0x2007
+; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: pmuludq %xmm0, %xmm2
+; X64-NEXT: movd %xmm2, (%rax)
+; X64-NEXT: movdqa %xmm1, (%rax)
+; X64-NEXT: retq
+ %tmp = load <9 x i32>, <9 x i32>* undef, align 64
+ %rem = urem <9 x i32> zeroinitializer, %tmp
+ %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
+ store <9 x i32> %mul, <9 x i32>* undef, align 64
+ ret void
+}
diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll
index 6701c247e6fc..93bb2a4b1cdb 100644
--- a/test/CodeGen/X86/shrink_vmul_sse.ll
+++ b/test/CodeGen/X86/shrink_vmul_sse.ll
@@ -9,7 +9,7 @@
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
; CHECK-LABEL: mul_2xi8:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
; CHECK-NEXT: pushl %esi
diff --git a/test/CodeGen/X86/shuffle-combine-crash-2.ll b/test/CodeGen/X86/shuffle-combine-crash-2.ll
index ea37d5b48531..c449ec5d3f10 100644
--- a/test/CodeGen/X86/shuffle-combine-crash-2.ll
+++ b/test/CodeGen/X86/shuffle-combine-crash-2.ll
@@ -4,13 +4,13 @@
define <4 x i64> @fold_movsd_zero() {
; X86-LABEL: fold_movsd_zero:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: xorps %xmm1, %xmm1
; X86-NEXT: retl
;
; X64-LABEL: fold_movsd_zero:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/shuffle-of-insert.ll b/test/CodeGen/X86/shuffle-of-insert.ll
new file mode 100644
index 000000000000..16074dced154
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-of-insert.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSEANY --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSEANY --check-prefix=SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+define <4 x i32> @ins_elt_0(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_0:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $0, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_0:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $0, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 0
+ %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shuf
+}
+
+define <4 x i32> @ins_elt_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_1:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $1, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 1
+ %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
+ ret <4 x i32> %shuf
+}
+
+; Verify that the transform still works when the insert element is the 2nd operand to the shuffle.
+
+define <4 x i32> @ins_elt_2_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_2_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_2_commute:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $2, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_2_commute:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $2, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 2
+ %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
+ ret <4 x i32> %shuf
+}
+
+define <4 x i32> @ins_elt_3_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_3_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_3_commute:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $3, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_3_commute:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 3
+ %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+ ret <4 x i32> %shuf
+}
+
+; In the next 4 tests, the shuffle moves the inserted scalar to a different position in the output vector.
+
+define <4 x i32> @ins_elt_0_to_2(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_0_to_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_0_to_2:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $2, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_0_to_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $2, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 0
+ %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
+ ret <4 x i32> %shuf
+}
+
+define <4 x i32> @ins_elt_1_to_0(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_1_to_0:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_1_to_0:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $0, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_1_to_0:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $0, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 1
+ %shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
+ ret <4 x i32> %shuf
+}
+
+define <4 x i32> @ins_elt_2_to_3(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_2_to_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_2_to_3:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $3, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_2_to_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 2
+ %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x i32> %shuf
+}
+
+define <4 x i32> @ins_elt_3_to_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
+; SSE2-LABEL: ins_elt_3_to_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: ins_elt_3_to_1:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pinsrd $1, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: ins_elt_3_to_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %ins = insertelement <4 x i32> %v1, i32 %x, i32 3
+ %shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x i32> %shuf
+}
+
diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
index cbd5c69b1772..bbdff971c2f0 100644
--- a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
+++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
@@ -4,7 +4,7 @@
define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
; AVX2-LABEL: foo2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX2-NEXT: vmovapd %xmm0, (%rdi)
; AVX2-NEXT: retq
@@ -16,9 +16,9 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind {
; AVX2-LABEL: foo4:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-NEXT: retq
%res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
%res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> <i32 2, i32 0, i32 undef, i32 undef>
@@ -28,10 +28,10 @@ define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind {
define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
; AVX2-LABEL: foo8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vmovaps %ymm0, (%rdi)
; AVX2-NEXT: retq
%res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
%res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 undef, i32 undef, i32 5, i32 1, i32 3, i32 7>
@@ -41,8 +41,8 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -51,8 +51,8 @@ define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind {
define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask2:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -61,8 +61,8 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind {
define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind {
; AVX2-LABEL: undef_splatmask3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
@@ -71,11 +71,11 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind {
define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind {
; AVX2-LABEL: undef_splatmask4:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
+; AVX2-NEXT: vmovaps %xmm1, %xmm0
; AVX2-NEXT: retq
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
%res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
@@ -85,7 +85,7 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind {
define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind {
; AVX2-LABEL: undef_splatmask5:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
new file mode 100644
index 000000000000..0f1f818e250d
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
@@ -0,0 +1,907 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+
+define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movq %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movq %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v8i16_to_v4i16_1(<8 x i16>* %L, <4 x i16>* %S) nounwind {
+; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: movq %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSE42-NEXT: movq %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %L
+ %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ store <4 x i16> %strided.vec, <4 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v4i32_to_v2i32_1(<4 x i32>* %L, <2 x i32>* %S) nounwind {
+; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
+; SSE-NEXT: movq %xmm0, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
+; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <4 x i32>, <4 x i32>* %L
+ %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+ store <2 x i32> %strided.vec, <2 x i32>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v4i8_1(<16 x i8>* %L, <4 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movd %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v4i8_2(<16 x i8>* %L, <4 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movd %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v4i8_3(<16 x i8>* %L, <4 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: movd %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {
+; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSE-NEXT: movd %xmm0, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %L
+ %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
+ store <2 x i16> %strided.vec, <2 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {
+; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; SSE-NEXT: movd %xmm0, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %L
+ %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
+ store <2 x i16> %strided.vec, <2 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {
+; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: movd %xmm0, (%rsi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovd %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <8 x i16>, <8 x i16>* %L
+ %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
+ store <2 x i16> %strided.vec, <2 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_1(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_2(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $16, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_3(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrld $24, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_4(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_4:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,3,3]
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_5(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $40, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_5:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $40, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_6(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_6:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $48, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i8_to_v2i8_7(<16 x i8>* %L, <2 x i8>* %S) nounwind {
+; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: movw %ax, (%rsi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa (%rdi), %xmm0
+; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsrlq $56, (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8_7:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlq $56, (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i8>, <16 x i8>* %L
+ %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
+ store <2 x i8> %strided.vec, <2 x i8>* %S
+ ret void
+}
+
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
new file mode 100644
index 000000000000..7cef269ebc2b
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll
@@ -0,0 +1,1156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+
+define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuffle_v32i8_to_v16i8_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ store <16 x i8> %strided.vec, <16 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i16_to_v8i16_1(<16 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX1-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shuffle_v16i16_to_v8i16_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %L
+ %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ store <8 x i16> %strided.vec, <8 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v8i32_to_v4i32_1(<8 x i32>* %L, <4 x i32>* %S) nounwind {
+; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps (%rdi), %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT: vmovaps %xmm0, (%rsi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %ymm0
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %vec = load <8 x i32>, <8 x i32>* %L
+ %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ store <4 x i32> %strided.vec, <4 x i32>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v8i8_1(<32 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,5,5,9,9,13,13,13,13,5,5,12,12,13,13]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v8i8_2(<32 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v8i8_3(<32 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,7,7,11,11,15,15,7,7,15,15,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {
+; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %L
+ %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+ store <4 x i16> %strided.vec, <4 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {
+; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %L
+ %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+ store <4 x i16> %strided.vec, <4 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {
+; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovq %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <16 x i16>, <16 x i16>* %L
+ %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+ store <4 x i16> %strided.vec, <4 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_2(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_4(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_6(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i8_to_v4i8_7(<32 x i8>* %L, <4 x i8>* %S) nounwind {
+; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vmovd %xmm0, (%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT: vmovd %xmm0, (%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,14,14,15,15,14,14,15,15,4,4,5,5,6,6]
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i8>, <32 x i8>* %L
+ %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
+ store <4 x i8> %strided.vec, <4 x i8>* %S
+ ret void
+}
+
diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
new file mode 100644
index 000000000000..7f3431fabedc
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll
@@ -0,0 +1,1178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
+
+define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v32i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
+ store <32 x i8> %strided.vec, <32 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v32i16_to_v16i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v16i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31]
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %L
+ %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ store <16 x i16> %strided.vec, <16 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind {
+; AVX512-LABEL: shuffle_v16i32_to_v8i32_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %vec = load <16 x i32>, <16 x i32>* %L
+ %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ store <8 x i32> %strided.vec, <8 x i32>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v16i8_1(<64 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v16i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+ store <16 x i8> %strided.vec, <16 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v16i8_2(<64 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v16i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+ store <16 x i8> %strided.vec, <16 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v16i8_3(<64 x i8>* %L, <16 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v16i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v16i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v16i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+ store <16 x i8> %strided.vec, <16 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v32i16_to_v8i16_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %L
+ %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+ store <8 x i16> %strided.vec, <8 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v32i16_to_v8i16_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %L
+ %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+ store <8 x i16> %strided.vec, <8 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v32i16_to_v8i16_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v8i16_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <32 x i16>, <32 x i16>* %L
+ %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+ store <8 x i16> %strided.vec, <8 x i16>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,1,9,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_1:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_2(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,2,10,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_2:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,3,11,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_3:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_4(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,4,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_4:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,5,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_5:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_6(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,6,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_6:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
+define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,7,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8_7:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [14,14,15,15,6,6,7,7,4,4,5,5,6,6,7,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %vec = load <64 x i8>, <64 x i8>* %L
+ %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
+ store <8 x i8> %strided.vec, <8 x i8>* %S
+ ret void
+}
+
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 12a8443c31bd..1bfe37b1497e 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -14,7 +14,7 @@
define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -22,43 +22,43 @@ define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
@@ -69,7 +69,7 @@ define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v8i16_to_v8i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -77,43 +77,43 @@ define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: trunc_v8i16_to_v8i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: trunc_v8i16_to_v8i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i16_to_v8i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
@@ -125,7 +125,7 @@ define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -133,41 +133,41 @@ define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -179,7 +179,7 @@ define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -187,41 +187,41 @@ define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE42-NEXT: movq %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: trunc_v4i32_to_v4i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -234,37 +234,37 @@ define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -276,37 +276,37 @@ define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v2i64_to_v2i32:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX-NEXT: vmovlps %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -319,7 +319,7 @@ define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -328,41 +328,41 @@ define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -374,7 +374,7 @@ define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v4i32_to_v4i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -383,41 +383,41 @@ define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: trunc_v4i32_to_v4i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movd %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: trunc_v4i32_to_v4i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i32_to_v4i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -430,41 +430,41 @@ define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_to_v2i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -476,41 +476,41 @@ define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
; SSE-LABEL: trunc_v2i64_to_v2i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: movd %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_v2i64_to_v2i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -523,7 +523,7 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -534,41 +534,41 @@ define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -580,7 +580,7 @@ define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-LABEL: trunc_v2i64_to_v2i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -591,41 +591,41 @@ define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: trunc_v2i64_to_v2i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT: retq
;
; AVX-LABEL: trunc_v2i64_to_v2i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_v2i64_to_v2i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index c84869433546..59a8aa47246c 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -12,7 +12,7 @@
define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -24,7 +24,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -35,36 +35,17 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i8_to_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -73,7 +54,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v16i16_to_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -85,7 +66,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v16i16_to_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -97,7 +78,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v16i16_to_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
@@ -105,7 +86,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@@ -113,7 +94,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@@ -121,8 +102,8 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -135,7 +116,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -147,41 +128,68 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
@@ -192,7 +200,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -204,7 +212,7 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -213,7 +221,7 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
@@ -221,14 +229,14 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@@ -236,7 +244,7 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -249,52 +257,23 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX1-LABEL: shuffle_v8i32_to_v4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vmovaps %xmm0, (%rsi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8i32_to_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
+; AVX-LABEL: shuffle_v8i32_to_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovaps (%rdi), %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT: vmovaps %xmm0, (%rsi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %ymm0
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
%strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
store <4 x i32> %strided.vec, <4 x i32>* %S
@@ -303,7 +282,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -312,15 +291,15 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovaps %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
@@ -328,14 +307,14 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
@@ -343,7 +322,7 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -357,7 +336,7 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v8i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
@@ -369,44 +348,65 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v8i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -417,7 +417,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v8i32_to_v8i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
@@ -429,7 +429,7 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v8i32_to_v8i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -439,7 +439,7 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v8i32_to_v8i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -448,14 +448,14 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -464,7 +464,7 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -478,53 +478,72 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
@@ -535,7 +554,7 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -545,7 +564,7 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -554,7 +573,7 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -563,14 +582,14 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -579,7 +598,7 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -593,53 +612,68 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vmovd %xmm0, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovd %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovaps (%rdi), %ymm0
+; AVX512BWVL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
@@ -650,7 +684,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-LABEL: trunc_v4i64_to_v4i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -660,7 +694,7 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_v4i64_to_v4i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -669,7 +703,7 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_v4i64_to_v4i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -678,14 +712,14 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -694,7 +728,7 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
@@ -710,7 +744,7 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; the resulting BUILD_VECTOR should not be combined to a truncate.
define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-LABEL: negative:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
@@ -721,53 +755,53 @@ define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: negative:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: negative:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: negative:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: negative:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: negative:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
; AVX512BWVL-NEXT: kmovd %eax, %k1
; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512BWVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BWVL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 69155b5cc565..3fa148405f6b 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -10,38 +10,50 @@
define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -52,7 +64,7 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd 32(%rdi), %zmm1
@@ -63,7 +75,7 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovsxwd (%rdi), %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpmovsxwd 32(%rdi), %zmm1
@@ -74,15 +86,15 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -94,12 +106,54 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
-; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30]
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
+; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i16> %strided.vec, <16 x i16>* %S
@@ -108,7 +162,7 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -122,9 +176,12 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-NEXT: vmovaps %ymm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
@@ -135,7 +192,7 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -148,12 +205,81 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
}
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -162,7 +288,7 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -175,12 +301,78 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -189,7 +381,7 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -202,12 +394,72 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,8,12,16,20,24,28,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %xmm2, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -216,7 +468,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
@@ -227,3 +479,197 @@ define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
store <8 x i8> %strided.vec, <8 x i8>* %S
ret void
}
+
+define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
+; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
+; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
+ ret <16 x i8> %res
+}
+
+define <4 x double> @PR34175(<32 x i16>* %p) {
+; AVX512F-LABEL: PR34175:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: PR34175:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: PR34175:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: PR34175:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512BWVL-NEXT: retq
+ %v = load <32 x i16>, <32 x i16>* %p, align 2
+ %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+ %tofp = uitofp <4 x i16> %shuf to <4 x double>
+ ret <4 x double> %tofp
+}
diff --git a/test/CodeGen/X86/sincos-opt.ll b/test/CodeGen/X86/sincos-opt.ll
index e2fd63eab30f..b4330ea58ea5 100644
--- a/test/CodeGen/X86/sincos-opt.ll
+++ b/test/CodeGen/X86/sincos-opt.ll
@@ -3,6 +3,8 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
+; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 | FileCheck %s --check-prefix=GNU_SINCOS
+; RUN: llc < %s -mtriple=x86_64-fuchsia -mcpu=core2 -enable-unsafe-fp-math | FileCheck %s --check-prefix=GNU_SINCOS_FASTMATH
; Combine sin / cos into a single call unless they may write errno (as
; captured by readnone attribute, controlled by clang -fmath-errno
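; For illustration only, and not part of the patch: a minimal sketch of the
; pattern that is eligible for the sincos combine, mirroring test3 below. Both
; calls take the same argument and are marked readnone at the call site, so
; neither can write errno. The function name @sincos_pair is a made-up name
; used only for this sketch.
declare float @sinf(float) readonly
declare float @cosf(float) readonly
define float @sincos_pair(float %x) {
  %s = tail call float @sinf(float %x) readnone
  %c = tail call float @cosf(float %x) readnone
  %r = fadd float %s, %c
  ret float %r
}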
@@ -116,10 +118,10 @@ entry:
; GNU_SINCOS: faddp %st(1)
; GNU_SINCOS_FASTMATH-LABEL: test3:
-; GNU_SINCOS_FASTMATH: fsin
-; GNU_SINCOS_FASTMATH: fcos
+; GNU_SINCOS_FASTMATH: callq sincosl
+; GNU_SINCOS_FASTMATH: fldt 16(%{{[re]}}sp)
+; GNU_SINCOS_FASTMATH: fldt 32(%{{[re]}}sp)
; GNU_SINCOS_FASTMATH: faddp %st(1)
-; GNU_SINCOS_FASTMATH: ret
%call = tail call x86_fp80 @sinl(x86_fp80 %x) readnone
%call1 = tail call x86_fp80 @cosl(x86_fp80 %x) readnone
%add = fadd x86_fp80 %call, %call1
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 8f0e6f1edf66..c6c995f1a568 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,7 +1,7 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; Make sure this testcase codegens to the sin and cos instructions, not calls
-; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN
-; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS
-; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=SAFE
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
declare float @sinf(float) readonly
@@ -9,39 +9,48 @@ declare double @sin(double) readonly
declare x86_fp80 @sinl(x86_fp80) readonly
-; SIN-LABEL: test1:
define float @test1(float %X) {
+; CHECK-LABEL: test1:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll _sinf
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
%Y = call float @sinf(float %X) readonly
ret float %Y
}
-; SIN: {{^[ \t]*fsin$}}
-; SIN-NOT: fsin
-
-; SAFE: test1
-; SAFE-NOT: fsin
-
-; SIN-LABEL: test2:
define double @test2(double %X) {
+; CHECK-LABEL: test2:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: calll _sin
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
%Y = call double @sin(double %X) readonly
ret double %Y
}
-; SIN: {{^[ \t]*fsin$}}
-
-; SIN-NOT: fsin
-
-; SAFE: test2
-; SAFE-NOT: fsin
-; SIN-LABEL: test3:
define x86_fp80 @test3(x86_fp80 %X) {
+; CHECK-LABEL: test3:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subl $28, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpt (%esp)
+; CHECK-NEXT: calll _sinl
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: retl
%Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
ret x86_fp80 %Y
}
-; SIN: {{^[ \t]*fsin$}}
-; SIN-NOT: fsin
-; COS-NOT: fcos
declare float @cosf(float) readonly
declare double @cos(double) readonly
@@ -49,31 +58,44 @@ declare double @cos(double) readonly
declare x86_fp80 @cosl(x86_fp80) readonly
-; SIN-LABEL: test4:
-; COS-LABEL: test3:
define float @test4(float %X) {
+; CHECK-LABEL: test4:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: flds {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstps (%esp)
+; CHECK-NEXT: calll _cosf
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
%Y = call float @cosf(float %X) readonly
ret float %Y
}
-; COS: {{^[ \t]*fcos}}
-
-; SAFE: test4
-; SAFE-NOT: fcos
define double @test5(double %X) {
+; CHECK-LABEL: test5:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subl $12, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpl (%esp)
+; CHECK-NEXT: calll _cos
+; CHECK-NEXT: addl $12, %esp
+; CHECK-NEXT: retl
%Y = call double @cos(double %X) readonly
ret double %Y
}
-; COS: {{^[ \t]*fcos}}
-
-; SAFE: test5
-; SAFE-NOT: fcos
define x86_fp80 @test6(x86_fp80 %X) {
+; CHECK-LABEL: test6:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subl $28, %esp
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT: fstpt (%esp)
+; CHECK-NEXT: calll _cosl
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: retl
%Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
ret x86_fp80 %Y
}
-; COS: {{^[ \t]*fcos}}
-
-; SIN-NOT: fsin
-; COS-NOT: fcos
diff --git a/test/CodeGen/X86/sink-blockfreq.ll b/test/CodeGen/X86/sink-blockfreq.ll
index d0b8972cee50..cad9cf81905c 100644
--- a/test/CodeGen/X86/sink-blockfreq.ll
+++ b/test/CodeGen/X86/sink-blockfreq.ll
@@ -9,7 +9,7 @@
define i32 @sink_freqinfo(i32 %a, i32 %b) nounwind uwtable ssp {
; MSINK_BFI-LABEL: sink_freqinfo
; MSINK_BFI: jl
-; MSINK_BFI-NEXT: ## BB#
+; MSINK_BFI-NEXT: ## %bb.
; MSINK_BFI-NEXT: imull
; MSINK_NOBFI-LABEL: sink_freqinfo
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 972fbdf48cb5..8111aa72b6e7 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -march=x86-64 -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true -schedmodel=false | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -asm-verbose=false -mtriple=x86_64-unknown-linux-gnu -mcpu=nehalem -post-RA-scheduler=true -schedmodel=false | FileCheck %s
; Currently, floating-point selects are lowered to CFG triangles.
; This means that one side of the select is always unconditionally
diff --git a/test/CodeGen/X86/sink-out-of-loop.ll b/test/CodeGen/X86/sink-out-of-loop.ll
index 4bf829a02738..e7b721d36a0d 100644
--- a/test/CodeGen/X86/sink-out-of-loop.ll
+++ b/test/CodeGen/X86/sink-out-of-loop.ll
@@ -68,7 +68,7 @@ loop:
br i1 %exit_cond, label %exit, label %loop
exit:
-; CHECK: BB#2
+; CHECK: %bb.2
; CHECK: imull %eax, %eax
; CHECK: retq
ret i32 %j
diff --git a/test/CodeGen/X86/sjlj-eh.ll b/test/CodeGen/X86/sjlj-eh.ll
index 4d2e4e821f42..9a40b5932d49 100644
--- a/test/CodeGen/X86/sjlj-eh.ll
+++ b/test/CodeGen/X86/sjlj-eh.ll
@@ -1,4 +1,6 @@
; RUN: llc -mtriple i386-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s
+; RUN: llc -mtriple x86_64-windows-gnu -exception-model sjlj -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-X64
+; RUN: llc -mtriple x86_64-linux -exception-model sjlj -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-X64-LINUX
declare void @_Z20function_that_throwsv()
declare i32 @__gxx_personality_sj0(...)
@@ -24,11 +26,11 @@ try.cont:
; struct _Unwind_FunctionContext {
; +00 struct _Unwind_FunctionContext *prev; -64(%ebp)
-; +04 uintptr_t __callsite; -60(%ebp)
-; +08 uintptr_t __buffer[4]; -44(%ebp)
-; +28 __personality_routine __personality; -40(%ebp)
-; +32 uintptr_t __lsda; -36(%ebp)
-; +36 void *__jbuf[]; -32(%ebp)
+; +04 uint32_t __callsite; -60(%ebp)
+; +08 uint32_t __buffer[4]; -56(%ebp)
+; +24 __personality_routine __personality; -40(%ebp)
+; +28 uintptr_t __lsda; -36(%ebp)
+; +32 void *__jbuf[]; -32(%ebp)
; };
@@ -39,9 +41,9 @@ try.cont:
; CHECK: movl $___gxx_personality_sj0, -40(%ebp)
; UFC.__lsda = $LSDA
; CHECK: movl $[[LSDA:GCC_except_table[0-9]+]], -36(%ebp)
-; UFC.__jbuf[0] = $EBP
+; UFC.__jbuf[0] = $ebp
; CHECK: movl %ebp, -32(%ebp)
-; UFC.__jbuf[2] = $ESP
+; UFC.__jbuf[2] = $esp
; CHECK: movl %esp, -24(%ebp)
; UFC.__jbuf[1] = $EIP
; CHECK: movl $[[RESUME:LBB[0-9]+_[0-9]+]], -28(%ebp)
@@ -60,13 +62,74 @@ try.cont:
;
; CHECK: [[RESUME]]:
; CHECK: leal -64(%ebp), %esi
-; assert(UFC.__callsite <= 1);
+; assert(UFC.__callsite < 1);
; CHECK: movl -60(%ebp), %eax
; CHECK: cmpl $1, %eax
-; CHECK: jbe [[CONT:LBB[0-9]+_[0-9]+]]
+; CHECK: jb [[CONT:LBB[0-9]+_[0-9]+]]
; CHECK: ud2
; CHECK: [[CONT]]:
-; *Handlers[--UFC.__callsite]
-; CHECK: subl $1, %eax
+; *Handlers[UFC.__callsite]
; CHECK: jmpl *LJTI
+
+; struct _Unwind_FunctionContext {
+; +00 struct _Unwind_FunctionContext *prev; -312(%rbp)
+; +08 uint32_t __callsite; -304(%rbp)
+; +12 uint32_t __buffer[4]; -300(%rbp)
+; +32 __personality_routine __personality; -280(%rbp)
+; +40 uintptr_t __lsda; -272(%rbp)
+; +48 void *__jbuf[]; -264(%rbp)
+; };
+
+
+; CHECK-X64-LABEL: _Z8functionv:
+; struct _Unwind_FunctionContext UFC;
+;
+; UFC.__personality = __gxx_personality_sj0
+; CHECK-X64: leaq __gxx_personality_sj0(%rip), %rax
+; CHECK-X64: movq %rax, -280(%rbp)
+; UFC.__lsda = $LSDA
+; CHECK-X64: leaq [[LSDA:GCC_except_table[0-9]+]](%rip), %rax
+; CHECK-X64: movq %rax, -272(%rbp)
+; UFC.__jbuf[0] = $rbp
+; CHECK-X64: movq %rbp, -264(%rbp)
+; UFC.__jbuf[2] = $rsp
+; CHECK-X64: movq %rsp, -248(%rbp)
+; UFC.__jbuf[1] = $RIP
+; CHECK-X64: leaq .[[RESUME:LBB[0-9]+_[0-9]+]](%rip), %rax
+; CHECK-X64: movq %rax, -256(%rbp)
+; UFC.__callsite = 1
+; CHECK-X64: movl $1, -304(%rbp)
+; _Unwind_SjLj_Register(&UFC);
+; CHECK-X64: leaq -312(%rbp), %rcx
+; CHECK-X64: callq _Unwind_SjLj_Register
+; function_that_throws();
+; CHECK-X64: callq _Z20function_that_throwsv
+; _Unwind_SjLj_Unregister(&UFC);
+; CHECK-X64: leaq -312(%rbp), %rcx
+; CHECK-X64: callq _Unwind_SjLj_Unregister
+;
+; CHECK-X64: [[RESUME]]:
+; assert(UFC.__callsite < 1);
+; CHECK-X64: movl -304(%rbp), %eax
+; CHECK-X64: cmpl $1, %eax
+; CHECK-X64: jb .[[CONT:LBB[0-9]+_[0-9]+]]
+; CHECK-X64: ud2
+; CHECK-X64: [[CONT]]:
+; *Handlers[UFC.__callsite]
+; CHECK-X64: leaq .[[TABLE:LJTI[0-9]+_[0-9]+]](%rip), %rcx
+; CHECK-X64: movl (%rcx,%rax,4), %eax
+; CHECK-X64: cltq
+; CHECK-X64: addq %rcx, %rax
+; CHECK-X64: jmpq *%rax
+
+; CHECK-X64-LINUX: .[[RESUME:LBB[0-9]+_[0-9]+]]:
+; assert(UFC.__callsite < 1);
+; CHECK-X64-LINUX: movl -120(%rbp), %eax
+; CHECK-X64-LINUX: cmpl $1, %eax
+; CHECK-X64-LINUX: jb .[[CONT:LBB[0-9]+_[0-9]+]]
+; CHECK-X64-LINUX: ud2
+; CHECK-X64-LINUX: [[CONT]]:
+; *Handlers[UFC.__callsite]
+; CHECK-X64-LINUX: leaq .[[TABLE:LJTI[0-9]+_[0-9]+]](%rip), %rcx
+; CHECK-X64-LINUX: jmpq *(%rcx,%rax,8)
diff --git a/test/CodeGen/X86/slow-incdec.ll b/test/CodeGen/X86/slow-incdec.ll
index 1857f61e6c29..5e466f99a38b 100644
--- a/test/CodeGen/X86/slow-incdec.ll
+++ b/test/CodeGen/X86/slow-incdec.ll
@@ -1,80 +1,55 @@
-; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=INCDEC %s
-; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=ADD %s
-
-; check -mattr=-slow-incdec
-; INCDEC-NOT: addl $-1
-; INCDEC: dec
-; INCDEC-NOT: addl $1
-; INCDEC: inc
-
-; check -mattr=+slow-incdec
-; ADD: addl $-1
-; ADD-NOT: dec
-; ADD: addl $1
-; ADD-NOT: inc
-
-; Function Attrs: nounwind readonly
-define i32 @slow_1(i32* nocapture readonly %a, i32 %s) #0 {
-entry:
- %cmp5 = icmp eq i32 %s, 0
- br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond: ; preds = %for.body
- %cmp = icmp eq i32 %dec, 0
- br i1 %cmp, label %for.end.loopexit, label %for.body
-
-for.body: ; preds = %for.body.preheader, %for.cond
- %i.06 = phi i32 [ %dec, %for.cond ], [ %s, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.06
- %0 = load i32, i32* %arrayidx, align 4, !tbaa !1
- %cmp1 = icmp eq i32 %0, 0
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=-slow-incdec < %s | FileCheck -check-prefix=CHECK -check-prefix=INCDEC %s
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+slow-incdec < %s | FileCheck -check-prefix=CHECK -check-prefix=ADD %s
+
+define i32 @inc(i32 %x) {
+; INCDEC-LABEL: inc:
+; INCDEC: # %bb.0:
+; INCDEC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; INCDEC-NEXT: incl %eax
+; INCDEC-NEXT: retl
;
- %dec = add nsw i32 %i.06, -1
- br i1 %cmp1, label %for.end.loopexit, label %for.cond
-
-for.end.loopexit: ; preds = %for.cond, %for.body
- %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
- br label %for.end
-
-for.end: ; preds = %for.end.loopexit, %entry
- %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
- ret i32 %i.0.lcssa
+; ADD-LABEL: inc:
+; ADD: # %bb.0:
+; ADD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ADD-NEXT: addl $1, %eax
+; ADD-NEXT: retl
+ %r = add i32 %x, 1
+ ret i32 %r
}
-; Function Attrs: nounwind readonly
-define i32 @slow_2(i32* nocapture readonly %a, i32 %s) #0 {
-entry:
- %cmp5 = icmp eq i32 %s, 0
- br i1 %cmp5, label %for.end, label %for.body.preheader
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond: ; preds = %for.body
- %cmp = icmp eq i32 %inc, 0
- br i1 %cmp, label %for.end.loopexit, label %for.body
-
-for.body: ; preds = %for.body.preheader, %for.cond
- %i.06 = phi i32 [ %inc, %for.cond ], [ %s, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.06
- %0 = load i32, i32* %arrayidx, align 4, !tbaa !1
- %cmp1 = icmp eq i32 %0, 0
- %inc = add nsw i32 %i.06, 1
- br i1 %cmp1, label %for.end.loopexit, label %for.cond
-
-for.end.loopexit: ; preds = %for.cond, %for.body
- %i.0.lcssa.ph = phi i32 [ 0, %for.cond ], [ %i.06, %for.body ]
- br label %for.end
+define i32 @dec(i32 %x) {
+; INCDEC-LABEL: dec:
+; INCDEC: # %bb.0:
+; INCDEC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; INCDEC-NEXT: decl %eax
+; INCDEC-NEXT: retl
+;
+; ADD-LABEL: dec:
+; ADD: # %bb.0:
+; ADD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; ADD-NEXT: addl $-1, %eax
+; ADD-NEXT: retl
+ %r = add i32 %x, -1
+ ret i32 %r
+}
-for.end: ; preds = %for.end.loopexit, %entry
- %i.0.lcssa = phi i32 [ 0, %entry ], [ %i.0.lcssa.ph, %for.end.loopexit ]
- ret i32 %i.0.lcssa
+define i32 @inc_size(i32 %x) optsize {
+; CHECK-LABEL: inc_size:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: incl %eax
+; CHECK-NEXT: retl
+ %r = add i32 %x, 1
+ ret i32 %r
}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"int", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
+define i32 @dec_size(i32 %x) optsize {
+; CHECK-LABEL: dec_size:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: decl %eax
+; CHECK-NEXT: retl
+ %r = add i32 %x, -1
+ ret i32 %r
+}
diff --git a/test/CodeGen/X86/slow-pmulld.ll b/test/CodeGen/X86/slow-pmulld.ll
index 1de19d2334d4..4d73b11349f5 100644
--- a/test/CodeGen/X86/slow-pmulld.ll
+++ b/test/CodeGen/X86/slow-pmulld.ll
@@ -9,7 +9,7 @@
define <4 x i32> @foo(<4 x i8> %A) {
; CHECK32-LABEL: foo:
-; CHECK32: # BB#0:
+; CHECK32: # %bb.0:
; CHECK32-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
; CHECK32-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; CHECK32-NEXT: movdqa %xmm0, %xmm2
@@ -19,7 +19,7 @@ define <4 x i32> @foo(<4 x i8> %A) {
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: foo:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[4],zero,xmm0[8],zero,xmm0[12],zero,xmm0[u,u,u,u,u,u,u,u]
; CHECK64-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
; CHECK64-NEXT: movdqa %xmm0, %xmm2
@@ -29,13 +29,13 @@ define <4 x i32> @foo(<4 x i8> %A) {
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: foo:
-; SSE4-32: # BB#0:
+; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: foo:
-; SSE4-64: # BB#0:
+; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
@@ -46,25 +46,25 @@ define <4 x i32> @foo(<4 x i8> %A) {
define <4 x i32> @foo_os(<4 x i8> %A) minsize {
; CHECK32-LABEL: foo_os:
-; CHECK32: # BB#0:
+; CHECK32: # %bb.0:
; CHECK32-NEXT: pand {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; CHECK32-NEXT: retl
;
; CHECK64-LABEL: foo_os:
-; CHECK64: # BB#0:
+; CHECK64: # %bb.0:
; CHECK64-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK64-NEXT: pmulld {{.*}}(%rip), %xmm0
; CHECK64-NEXT: retq
;
; SSE4-32-LABEL: foo_os:
-; SSE4-32: # BB#0:
+; SSE4-32: # %bb.0:
; SSE4-32-NEXT: pand {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; SSE4-32-NEXT: retl
;
; SSE4-64-LABEL: foo_os:
-; SSE4-64: # BB#0:
+; SSE4-64: # %bb.0:
; SSE4-64-NEXT: pand {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE4-64-NEXT: retq
diff --git a/test/CodeGen/X86/slow-unaligned-mem.ll b/test/CodeGen/X86/slow-unaligned-mem.ll
index 8251eb324a77..a3a21892339b 100644
--- a/test/CodeGen/X86/slow-unaligned-mem.ll
+++ b/test/CodeGen/X86/slow-unaligned-mem.ll
@@ -64,7 +64,7 @@
define void @store_zeros(i8* %a) {
; SLOW-NOT: not a recognized processor
; SLOW-LABEL: store_zeros:
-; SLOW: # BB#0:
+; SLOW: # %bb.0:
; SLOW-NEXT: movl
; SLOW-NEXT: movl
; SLOW-NEXT: movl
@@ -85,7 +85,7 @@ define void @store_zeros(i8* %a) {
;
; FAST-NOT: not a recognized processor
; FAST-LABEL: store_zeros:
-; FAST: # BB#0:
+; FAST: # %bb.0:
; FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
; FAST-NOT: movl
call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 64, i32 1, i1 false)
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 2b21f4ff84e9..7154a896a359 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
@ok = internal constant [4 x i8] c"%d\0A\00"
@no = internal constant [4 x i8] c"no\0A\00"
diff --git a/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
index 0461ee809efb..ae516c3bf933 100644
--- a/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
+++ b/test/CodeGen/X86/soft-fp-legal-in-HW-reg.ll
@@ -17,7 +17,7 @@ define fp128 @TestSelect(fp128 %a, fp128 %b) {
; CHECK-NEXT callq __subtf3
; CHECK-NEXT testl %ebx, %ebx
; CHECK-NEXT jg .LBB0_2
-; CHECK-NEXT # BB#1:
+; CHECK-NEXT # %bb.1:
; CHECK-NEXT movaps .LCPI0_0(%rip), %xmm0
; CHECK-NEXT .LBB0_2:
; CHECK-NEXT addq $32, %rsp
diff --git a/test/CodeGen/X86/soft-fp.ll b/test/CodeGen/X86/soft-fp.ll
index 138e66c394ba..134b24c73b60 100644
--- a/test/CodeGen/X86/soft-fp.ll
+++ b/test/CodeGen/X86/soft-fp.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse,+soft-float \
+; RUN: llc < %s -mtriple=i686-- -mattr=+mmx,+sse,+soft-float \
; RUN: | FileCheck %s --check-prefix=SOFT1 --check-prefix=CHECK
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2,+soft-float \
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+sse2,+soft-float \
; RUN: | FileCheck %s --check-prefix=SOFT2 --check-prefix=CHECK
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse \
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+sse \
; RUN: | FileCheck %s --check-prefix=SSE1 --check-prefix=CHECK
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 \
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+mmx,+sse2 \
; RUN: | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-gnux32 -mattr=+mmx,+sse2,+soft-float | FileCheck %s
diff --git a/test/CodeGen/X86/splat-for-size.ll b/test/CodeGen/X86/splat-for-size.ll
index a43e7b767322..5a98a00338bf 100644
--- a/test/CodeGen/X86/splat-for-size.ll
+++ b/test/CodeGen/X86/splat-for-size.ll
@@ -8,7 +8,7 @@
; There is no AVX broadcast from double to 128-bit vector because movddup has been around since SSE3 (grrr).
define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
; CHECK-LABEL: splat_v2f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -18,7 +18,7 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 {
define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
; CHECK-LABEL: splat_v4f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd {{.*}}(%rip), %ymm1
; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -28,7 +28,7 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 {
define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
; CHECK-LABEL: splat_v4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -38,7 +38,7 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 {
define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
; CHECK-LABEL: splat_v8f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %ymm1
; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
@@ -50,13 +50,13 @@ define <8 x float> @splat_v8f32(<8 x float> %x) #1 {
; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq.
define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
; AVX-LABEL: splat_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -68,7 +68,7 @@ define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 {
; and then we fake it: use vmovddup to splat 64-bit value.
define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
; AVX-LABEL: splat_v4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0]
; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1
@@ -77,7 +77,7 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -88,13 +88,13 @@ define <4 x i64> @splat_v4i64(<4 x i64> %x) #0 {
; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
; AVX-LABEL: splat_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -105,7 +105,7 @@ define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 {
; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value.
define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
; AVX-LABEL: splat_v8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -114,7 +114,7 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -125,12 +125,12 @@ define <8 x i32> @splat_v8i32(<8 x i32> %x) #0 {
; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
; AVX-LABEL: splat_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -141,7 +141,7 @@ define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 {
; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc?
define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
; AVX-LABEL: splat_v16i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
@@ -150,7 +150,7 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -161,12 +161,12 @@ define <16 x i16> @splat_v16i16(<16 x i16> %x) #0 {
; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
; AVX-LABEL: splat_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb {{.*}}(%rip), %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -177,7 +177,7 @@ define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 {
; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc?
define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
; AVX-LABEL: splat_v32i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
@@ -186,7 +186,7 @@ define <32 x i8> @splat_v32i8(<32 x i8> %x) #0 {
; AVX-NEXT: retq
;
; AVX2-LABEL: splat_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb {{.*}}(%rip), %ymm1
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/split-extend-vector-inreg.ll b/test/CodeGen/X86/split-extend-vector-inreg.ll
index 692cbdb00be6..b477b29ac542 100644
--- a/test/CodeGen/X86/split-extend-vector-inreg.ll
+++ b/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -4,10 +4,10 @@
define <4 x i64> @autogen_SD88863() {
; X32-LABEL: autogen_SD88863:
-; X32: # BB#0: # %BB
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32: # %bb.0: # %BB
+; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
; X32-NEXT: movb $1, %al
; X32-NEXT: .p2align 4, 0x90
@@ -15,14 +15,14 @@ define <4 x i64> @autogen_SD88863() {
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: testb %al, %al
; X32-NEXT: jne .LBB0_1
-; X32-NEXT: # BB#2: # %CF240
+; X32-NEXT: # %bb.2: # %CF240
; X32-NEXT: retl
;
; X64-LABEL: autogen_SD88863:
-; X64: # BB#0: # %BB
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64: # %bb.0: # %BB
+; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
; X64-NEXT: movb $1, %al
; X64-NEXT: .p2align 4, 0x90
@@ -30,7 +30,7 @@ define <4 x i64> @autogen_SD88863() {
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: testb %al, %al
; X64-NEXT: jne .LBB0_1
-; X64-NEXT: # BB#2: # %CF240
+; X64-NEXT: # %bb.2: # %CF240
; X64-NEXT: retq
BB:
%I26 = insertelement <4 x i64> undef, i64 undef, i32 2
diff --git a/test/CodeGen/X86/split-store.ll b/test/CodeGen/X86/split-store.ll
index 6e320efb2b26..64238901d102 100644
--- a/test/CodeGen/X86/split-store.ll
+++ b/test/CodeGen/X86/split-store.ll
@@ -1,10 +1,12 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -force-split-store < %s | FileCheck %s
-; CHECK-LABEL: int32_float_pair
-; CHECK-DAG: movl %edi, (%rsi)
-; CHECK-DAG: movss %xmm0, 4(%rsi)
define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: int32_float_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, (%rsi)
+; CHECK-NEXT: movss %xmm0, 4(%rsi)
+; CHECK-NEXT: retq
%t0 = bitcast float %tmp2 to i32
%t1 = zext i32 %t0 to i64
%t2 = shl nuw i64 %t1, 32
@@ -14,11 +16,12 @@ entry:
ret void
}
-; CHECK-LABEL: float_int32_pair
-; CHECK-DAG: movss %xmm0, (%rsi)
-; CHECK-DAG: movl %edi, 4(%rsi)
define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: float_int32_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movss %xmm0, (%rsi)
+; CHECK-NEXT: movl %edi, 4(%rsi)
+; CHECK-NEXT: retq
%t0 = bitcast float %tmp1 to i32
%t1 = zext i32 %tmp2 to i64
%t2 = shl nuw i64 %t1, 32
@@ -28,12 +31,13 @@ entry:
ret void
}
-; CHECK-LABEL: int16_float_pair
-; CHECK-DAG: movzwl %di, %eax
-; CHECK-DAG: movl %eax, (%rsi)
-; CHECK-DAG: movss %xmm0, 4(%rsi)
define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: int16_float_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzwl %di, %eax
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movss %xmm0, 4(%rsi)
+; CHECK-NEXT: retq
%t0 = bitcast float %tmp2 to i32
%t1 = zext i32 %t0 to i64
%t2 = shl nuw i64 %t1, 32
@@ -43,12 +47,13 @@ entry:
ret void
}
-; CHECK-LABEL: int8_float_pair
-; CHECK-DAG: movzbl %dil, %eax
-; CHECK-DAG: movl %eax, (%rsi)
-; CHECK-DAG: movss %xmm0, 4(%rsi)
define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: int8_float_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movzbl %dil, %eax
+; CHECK-NEXT: movl %eax, (%rsi)
+; CHECK-NEXT: movss %xmm0, 4(%rsi)
+; CHECK-NEXT: retq
%t0 = bitcast float %tmp2 to i32
%t1 = zext i32 %t0 to i64
%t2 = shl nuw i64 %t1, 32
@@ -58,11 +63,12 @@ entry:
ret void
}
-; CHECK-LABEL: int32_int32_pair
-; CHECK: movl %edi, (%rdx)
-; CHECK: movl %esi, 4(%rdx)
define void @int32_int32_pair(i32 %tmp1, i32 %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: int32_int32_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: movl %esi, 4(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i32 %tmp2 to i64
%t2 = shl nuw i64 %t1, 32
%t3 = zext i32 %tmp1 to i64
@@ -71,11 +77,12 @@ entry:
ret void
}
-; CHECK-LABEL: int16_int16_pair
-; CHECK: movw %di, (%rdx)
-; CHECK: movw %si, 2(%rdx)
define void @int16_int16_pair(i16 signext %tmp1, i16 signext %tmp2, i32* %ref.tmp) {
-entry:
+; CHECK-LABEL: int16_int16_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movw %di, (%rdx)
+; CHECK-NEXT: movw %si, 2(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i16 %tmp2 to i32
%t2 = shl nuw i32 %t1, 16
%t3 = zext i16 %tmp1 to i32
@@ -84,11 +91,12 @@ entry:
ret void
}
-; CHECK-LABEL: int8_int8_pair
-; CHECK: movb %dil, (%rdx)
-; CHECK: movb %sil, 1(%rdx)
define void @int8_int8_pair(i8 signext %tmp1, i8 signext %tmp2, i16* %ref.tmp) {
-entry:
+; CHECK-LABEL: int8_int8_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movb %dil, (%rdx)
+; CHECK-NEXT: movb %sil, 1(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i8 %tmp2 to i16
%t2 = shl nuw i16 %t1, 8
%t3 = zext i8 %tmp1 to i16
@@ -97,13 +105,14 @@ entry:
ret void
}
-; CHECK-LABEL: int31_int31_pair
-; CHECK: andl $2147483647, %edi
-; CHECK: movl %edi, (%rdx)
-; CHECK: andl $2147483647, %esi
-; CHECK: movl %esi, 4(%rdx)
define void @int31_int31_pair(i31 %tmp1, i31 %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: int31_int31_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: andl $2147483647, %esi # imm = 0x7FFFFFFF
+; CHECK-NEXT: movl %esi, 4(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i31 %tmp2 to i64
%t2 = shl nuw i64 %t1, 32
%t3 = zext i31 %tmp1 to i64
@@ -112,13 +121,14 @@ entry:
ret void
}
-; CHECK-LABEL: int31_int17_pair
-; CHECK: andl $2147483647, %edi
-; CHECK: movl %edi, (%rdx)
-; CHECK: andl $131071, %esi
-; CHECK: movl %esi, 4(%rdx)
define void @int31_int17_pair(i31 %tmp1, i17 %tmp2, i64* %ref.tmp) {
-entry:
+; CHECK-LABEL: int31_int17_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: andl $131071, %esi # imm = 0x1FFFF
+; CHECK-NEXT: movl %esi, 4(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i17 %tmp2 to i64
%t2 = shl nuw i64 %t1, 32
%t3 = zext i31 %tmp1 to i64
@@ -127,13 +137,14 @@ entry:
ret void
}
-; CHECK-LABEL: int7_int3_pair
-; CHECK: andb $127, %dil
-; CHECK: movb %dil, (%rdx)
-; CHECK: andb $7, %sil
-; CHECK: movb %sil, 1(%rdx)
define void @int7_int3_pair(i7 signext %tmp1, i3 signext %tmp2, i16* %ref.tmp) {
-entry:
+; CHECK-LABEL: int7_int3_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andb $127, %dil
+; CHECK-NEXT: movb %dil, (%rdx)
+; CHECK-NEXT: andb $7, %sil
+; CHECK-NEXT: movb %sil, 1(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i3 %tmp2 to i16
%t2 = shl nuw i16 %t1, 8
%t3 = zext i7 %tmp1 to i16
@@ -142,15 +153,16 @@ entry:
ret void
}
-; CHECK-LABEL: int24_int24_pair
-; CHECK: movw %di, (%rdx)
-; CHECK: shrl $16, %edi
-; CHECK: movb %dil, 2(%rdx)
-; CHECK: movw %si, 4(%rdx)
-; CHECK: shrl $16, %esi
-; CHECK: movb %sil, 6(%rdx)
define void @int24_int24_pair(i24 signext %tmp1, i24 signext %tmp2, i48* %ref.tmp) {
-entry:
+; CHECK-LABEL: int24_int24_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movw %di, (%rdx)
+; CHECK-NEXT: shrl $16, %edi
+; CHECK-NEXT: movb %dil, 2(%rdx)
+; CHECK-NEXT: movw %si, 4(%rdx)
+; CHECK-NEXT: shrl $16, %esi
+; CHECK-NEXT: movb %sil, 6(%rdx)
+; CHECK-NEXT: retq
%t1 = zext i24 %tmp2 to i48
%t2 = shl nuw i48 %t1, 24
%t3 = zext i24 %tmp1 to i48
@@ -160,16 +172,18 @@ entry:
}
; getTypeSizeInBits(i12) != getTypeStoreSizeInBits(i12), so store split doesn't kick in.
-; CHECK-LABEL: int12_int12_pair
-; CHECK: movl %esi, %eax
-; CHECK: shll $12, %eax
-; CHECK: andl $4095, %edi
-; CHECK: orl %eax, %edi
-; CHECK: shrl $4, %esi
-; CHECK: movb %sil, 2(%rdx)
-; CHECK: movw %di, (%rdx)
+
define void @int12_int12_pair(i12 signext %tmp1, i12 signext %tmp2, i24* %ref.tmp) {
-entry:
+; CHECK-LABEL: int12_int12_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: shll $12, %eax
+; CHECK-NEXT: andl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: shrl $4, %esi
+; CHECK-NEXT: movb %sil, 2(%rdx)
+; CHECK-NEXT: movw %di, (%rdx)
+; CHECK-NEXT: retq
%t1 = zext i12 %tmp2 to i24
%t2 = shl nuw i24 %t1, 12
%t3 = zext i12 %tmp1 to i24
@@ -179,16 +193,16 @@ entry:
}
; getTypeSizeInBits(i14) != getTypeStoreSizeInBits(i14), so store split doesn't kick in.
-; CHECK-LABEL: int7_int7_pair
-; CHECK: movzbl %sil, %eax
-; CHECK: shll $7, %eax
-; CHECK: andb $127, %dil
-; CHECK: movzbl %dil, %ecx
-; CHECK: orl %eax, %ecx
-; CHECK: andl $16383, %ecx
-; CHECK: movw %cx, (%rdx)
+
define void @int7_int7_pair(i7 signext %tmp1, i7 signext %tmp2, i14* %ref.tmp) {
-entry:
+; CHECK-LABEL: int7_int7_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: shll $7, %esi
+; CHECK-NEXT: andl $127, %edi
+; CHECK-NEXT: orl %esi, %edi
+; CHECK-NEXT: andl $16383, %edi # imm = 0x3FFF
+; CHECK-NEXT: movw %di, (%rdx)
+; CHECK-NEXT: retq
%t1 = zext i7 %tmp2 to i14
%t2 = shl nuw i14 %t1, 7
%t3 = zext i7 %tmp1 to i14
@@ -198,14 +212,16 @@ entry:
}
; getTypeSizeInBits(i2) != getTypeStoreSizeInBits(i2), so store split doesn't kick in.
-; CHECK-LABEL: int1_int1_pair
-; CHECK: addb %sil, %sil
-; CHECK: andb $1, %dil
-; CHECK: orb %sil, %dil
-; CHECK: andb $3, %dil
-; CHECK: movb %dil, (%rdx)
+
define void @int1_int1_pair(i1 signext %tmp1, i1 signext %tmp2, i2* %ref.tmp) {
-entry:
+; CHECK-LABEL: int1_int1_pair:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addb %sil, %sil
+; CHECK-NEXT: andb $1, %dil
+; CHECK-NEXT: orb %sil, %dil
+; CHECK-NEXT: andb $3, %dil
+; CHECK-NEXT: movb %dil, (%rdx)
+; CHECK-NEXT: retq
%t1 = zext i1 %tmp2 to i2
%t2 = shl nuw i2 %t1, 1
%t3 = zext i1 %tmp1 to i2
@@ -214,10 +230,12 @@ entry:
ret void
}
-; CHECK-LABEL: mbb_int32_float_pair
-; CHECK: movl %edi, (%rsi)
-; CHECK: movss %xmm0, 4(%rsi)
define void @mbb_int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+; CHECK-LABEL: mbb_int32_float_pair:
+; CHECK: # %bb.0: # %next
+; CHECK-NEXT: movl %edi, (%rsi)
+; CHECK-NEXT: movss %xmm0, 4(%rsi)
+; CHECK-NEXT: retq
entry:
%t0 = bitcast float %tmp2 to i32
br label %next
@@ -230,13 +248,18 @@ next:
ret void
}
-; CHECK-LABEL: mbb_int32_float_multi_stores
-; CHECK: movl %edi, (%rsi)
-; CHECK: movss %xmm0, 4(%rsi)
-; CHECK: # %bb2
-; CHECK: movl %edi, (%rdx)
-; CHECK: movss %xmm0, 4(%rdx)
define void @mbb_int32_float_multi_stores(i32 %tmp1, float %tmp2, i64* %ref.tmp, i64* %ref.tmp1, i1 %cmp) {
+; CHECK-LABEL: mbb_int32_float_multi_stores:
+; CHECK: # %bb.0: # %bb1
+; CHECK-NEXT: movl %edi, (%rsi)
+; CHECK-NEXT: movss %xmm0, 4(%rsi)
+; CHECK-NEXT: testb $1, %cl
+; CHECK-NEXT: je .LBB15_2
+; CHECK-NEXT: # %bb.1: # %bb2
+; CHECK-NEXT: movl %edi, (%rdx)
+; CHECK-NEXT: movss %xmm0, 4(%rdx)
+; CHECK-NEXT: .LBB15_2: # %exitbb
+; CHECK-NEXT: retq
entry:
%t0 = bitcast float %tmp2 to i32
br label %bb1
diff --git a/test/CodeGen/X86/split-vector-bitcast.ll b/test/CodeGen/X86/split-vector-bitcast.ll
index 8d80754b9a35..1ad71600324d 100644
--- a/test/CodeGen/X86/split-vector-bitcast.ll
+++ b/test/CodeGen/X86/split-vector-bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=-sse2,+sse | grep addps
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse2,+sse | grep addps
; PR10497 + another isel issue with sse2 disabled
; (This is primarily checking that this construct doesn't crash.)
diff --git a/test/CodeGen/X86/split-vector-rem.ll b/test/CodeGen/X86/split-vector-rem.ll
index 681c6b0beaa0..0e51dbf78bd0 100644
--- a/test/CodeGen/X86/split-vector-rem.ll
+++ b/test/CodeGen/X86/split-vector-rem.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 | grep div | count 16
-; RUN: llc < %s -march=x86-64 | grep fmodf | count 8
+; RUN: llc < %s -mtriple=x86_64-- | grep div | count 16
+; RUN: llc < %s -mtriple=x86_64-- | grep fmodf | count 8
define <8 x i32> @foo(<8 x i32> %t, <8 x i32> %u) {
%m = srem <8 x i32> %t, %u
diff --git a/test/CodeGen/X86/sqrt-fastmath-mir.ll b/test/CodeGen/X86/sqrt-fastmath-mir.ll
index c613ef8ee383..3e4600bfd5d5 100644
--- a/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -5,21 +5,21 @@ declare float @llvm.sqrt.f32(float) #0
define float @foo(float %f) #0 {
; CHECK: {{name: *foo}}
; CHECK: body:
-; CHECK: %0 = COPY %xmm0
-; CHECK: %1 = VRSQRTSSr killed %2, %0
-; CHECK: %3 = VMULSSrr %0, %1
-; CHECK: %4 = VMOVSSrm
-; CHECK: %5 = VFMADD213SSr %1, killed %3, %4
-; CHECK: %6 = VMOVSSrm
-; CHECK: %7 = VMULSSrr %1, %6
-; CHECK: %8 = VMULSSrr killed %7, killed %5
-; CHECK: %9 = VMULSSrr %0, %8
-; CHECK: %10 = VFMADD213SSr %8, %9, %4
-; CHECK: %11 = VMULSSrr %9, %6
-; CHECK: %12 = VMULSSrr killed %11, killed %10
-; CHECK: %14 = FsFLD0SS
-; CHECK: %15 = VCMPSSrr %0, killed %14, 0
-; CHECK: %17 = VANDNPSrr killed %16, killed %13
+; CHECK: %0:fr32 = COPY %xmm0
+; CHECK: %1:fr32 = VRSQRTSSr killed %2, %0
+; CHECK: %3:fr32 = VMULSSrr %0, %1
+; CHECK: %4:fr32 = VMOVSSrm
+; CHECK: %5:fr32 = VFMADD213SSr %1, killed %3, %4
+; CHECK: %6:fr32 = VMOVSSrm
+; CHECK: %7:fr32 = VMULSSrr %1, %6
+; CHECK: %8:fr32 = VMULSSrr killed %7, killed %5
+; CHECK: %9:fr32 = VMULSSrr %0, %8
+; CHECK: %10:fr32 = VFMADD213SSr %8, %9, %4
+; CHECK: %11:fr32 = VMULSSrr %9, %6
+; CHECK: %12:fr32 = VMULSSrr killed %11, killed %10
+; CHECK: %14:fr32 = FsFLD0SS
+; CHECK: %15:fr32 = VCMPSSrr %0, killed %14, 0
+; CHECK: %17:vr128 = VANDNPSrr killed %16, killed %13
; CHECK: %xmm0 = COPY %18
; CHECK: RET 0, %xmm0
%call = tail call float @llvm.sqrt.f32(float %f) #1
@@ -29,18 +29,18 @@ define float @foo(float %f) #0 {
define float @rfoo(float %f) #0 {
; CHECK: {{name: *rfoo}}
; CHECK: body: |
-; CHECK: %0 = COPY %xmm0
-; CHECK: %1 = VRSQRTSSr killed %2, %0
-; CHECK: %3 = VMULSSrr %0, %1
-; CHECK: %4 = VMOVSSrm
-; CHECK: %5 = VFMADD213SSr %1, killed %3, %4
-; CHECK: %6 = VMOVSSrm
-; CHECK: %7 = VMULSSrr %1, %6
-; CHECK: %8 = VMULSSrr killed %7, killed %5
-; CHECK: %9 = VMULSSrr %0, %8
-; CHECK: %10 = VFMADD213SSr %8, killed %9, %4
-; CHECK: %11 = VMULSSrr %8, %6
-; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %0:fr32 = COPY %xmm0
+; CHECK: %1:fr32 = VRSQRTSSr killed %2, %0
+; CHECK: %3:fr32 = VMULSSrr %0, %1
+; CHECK: %4:fr32 = VMOVSSrm
+; CHECK: %5:fr32 = VFMADD213SSr %1, killed %3, %4
+; CHECK: %6:fr32 = VMOVSSrm
+; CHECK: %7:fr32 = VMULSSrr %1, %6
+; CHECK: %8:fr32 = VMULSSrr killed %7, killed %5
+; CHECK: %9:fr32 = VMULSSrr %0, %8
+; CHECK: %10:fr32 = VFMADD213SSr %8, killed %9, %4
+; CHECK: %11:fr32 = VMULSSrr %8, %6
+; CHECK: %12:fr32 = VMULSSrr killed %11, killed %10
; CHECK: %xmm0 = COPY %12
; CHECK: RET 0, %xmm0
%sqrt = tail call float @llvm.sqrt.f32(float %f)
diff --git a/test/CodeGen/X86/sqrt-fastmath-tune.ll b/test/CodeGen/X86/sqrt-fastmath-tune.ll
index afa01b674a65..65befee085c0 100644
--- a/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -12,12 +12,12 @@ declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #0
define float @foo_x1(float %f) #0 {
; SCALAR-EST-LABEL: foo_x1:
-; SCALAR-EST: # BB#0:
+; SCALAR-EST: # %bb.0:
; SCALAR-EST-NEXT: rsqrtss %xmm0
; SCALAR-EST: retq
;
; SCALAR-ACC-LABEL: foo_x1:
-; SCALAR-ACC: # BB#0:
+; SCALAR-ACC: # %bb.0:
; SCALAR-ACC-NEXT: {{^ *v?sqrtss %xmm0}}
; SCALAR-ACC-NEXT: retq
%call = tail call float @llvm.sqrt.f32(float %f) #1
@@ -26,12 +26,12 @@ define float @foo_x1(float %f) #0 {
define <4 x float> @foo_x4(<4 x float> %f) #0 {
; VECTOR-EST-LABEL: foo_x4:
-; VECTOR-EST: # BB#0:
+; VECTOR-EST: # %bb.0:
; VECTOR-EST-NEXT: rsqrtps %xmm0
; VECTOR-EST: retq
;
; VECTOR-ACC-LABEL: foo_x4:
-; VECTOR-ACC: # BB#0:
+; VECTOR-ACC: # %bb.0:
; VECTOR-ACC-NEXT: {{^ *v?sqrtps %xmm0}}
; VECTOR-ACC-NEXT: retq
%call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %f) #1
@@ -40,12 +40,12 @@ define <4 x float> @foo_x4(<4 x float> %f) #0 {
define <8 x float> @foo_x8(<8 x float> %f) #0 {
; VECTOR-EST-LABEL: foo_x8:
-; VECTOR-EST: # BB#0:
+; VECTOR-EST: # %bb.0:
; VECTOR-EST-NEXT: rsqrtps
; VECTOR-EST: retq
;
; VECTOR-ACC-LABEL: foo_x8:
-; VECTOR-ACC: # BB#0:
+; VECTOR-ACC: # %bb.0:
; VECTOR-ACC-NEXT: {{^ *v?sqrtps %[xy]mm0}}
; VECTOR-ACC-NOT: rsqrt
; VECTOR-ACC: retq
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index af2dcc495f53..ede954d92d34 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -12,12 +12,12 @@ declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f64_no_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%call = tail call double @__sqrt_finite(double %d) #2
@@ -28,12 +28,12 @@ define double @finite_f64_no_estimate(double %d) #0 {
define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f64_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%call = tail call double @__sqrt_finite(double %d) #2
@@ -42,12 +42,12 @@ define double @finite_f64_estimate(double %d) #1 {
define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_no_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%call = tail call float @__sqrtf_finite(float %f) #2
@@ -56,7 +56,7 @@ define float @finite_f32_no_estimate(float %f) #0 {
define float @finite_f32_estimate(float %f) #1 {
; SSE-LABEL: finite_f32_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
@@ -71,7 +71,7 @@ define float @finite_f32_estimate(float %f) #1 {
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
@@ -88,7 +88,7 @@ define float @finite_f32_estimate(float %f) #1 {
define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fsqrt
; CHECK-NEXT: retq
@@ -100,7 +100,7 @@ define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fsqrt
; CHECK-NEXT: retq
@@ -110,14 +110,14 @@ define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm1
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: f32_no_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
@@ -129,7 +129,7 @@ define float @f32_no_estimate(float %x) #0 {
define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm2
@@ -141,7 +141,7 @@ define float @f32_estimate(float %x) #1 {
; SSE-NEXT: retq
;
; AVX-LABEL: f32_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm1, %xmm2
; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0
@@ -156,14 +156,14 @@ define float @f32_estimate(float %x) #1 {
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v4f32_no_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtps %xmm0, %xmm0
; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0
@@ -175,7 +175,7 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm2
@@ -187,7 +187,7 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-LABEL: v4f32_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrsqrtps %xmm0, %xmm1
; AVX-NEXT: vmulps %xmm1, %xmm1, %xmm2
; AVX-NEXT: vmulps %xmm2, %xmm0, %xmm0
@@ -202,7 +202,7 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm1, %xmm2
; SSE-NEXT: sqrtps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
@@ -212,7 +212,7 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_no_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtps %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0
@@ -224,7 +224,7 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01]
; SSE-NEXT: movaps %xmm3, %xmm2
@@ -246,7 +246,7 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-NEXT: retq
;
; AVX-LABEL: v8f32_estimate:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrsqrtps %ymm0, %ymm1
; AVX-NEXT: vmulps %ymm1, %ymm1, %ymm2
; AVX-NEXT: vmulps %ymm2, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/sqrt-partial.ll b/test/CodeGen/X86/sqrt-partial.ll
new file mode 100644
index 000000000000..6f0d52490786
--- /dev/null
+++ b/test/CodeGen/X86/sqrt-partial.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; PR31455 - https://bugs.llvm.org/show_bug.cgi?id=31455
+; We have to assume that errno can be set, so we have to make a libcall in that case.
+; But it's better for perf to check that the argument is valid rather than the result of
+; sqrtss/sqrtsd.
+; Note: This is really a test of the -partially-inline-libcalls IR pass (and we have an IR test
+; for that), but we're checking the final asm to make sure that comes out as expected too.
+
+define float @f(float %val) nounwind {
+; CHECK-LABEL: f:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomiss %xmm1, %xmm0
+; CHECK-NEXT: jb .LBB0_2
+; CHECK-NEXT: # %bb.1: # %.split
+; CHECK-NEXT: sqrtss %xmm0, %xmm0
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_2: # %call.sqrt
+; CHECK-NEXT: jmp sqrtf # TAILCALL
+ %res = tail call float @sqrtf(float %val)
+ ret float %res
+}
+
+define double @d(double %val) nounwind {
+; CHECK-LABEL: d:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: ucomisd %xmm1, %xmm0
+; CHECK-NEXT: jb .LBB1_2
+; CHECK-NEXT: # %bb.1: # %.split
+; CHECK-NEXT: sqrtsd %xmm0, %xmm0
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB1_2: # %call.sqrt
+; CHECK-NEXT: jmp sqrt # TAILCALL
+ %res = tail call double @sqrt(double %val)
+ ret double %res
+}
+
+declare float @sqrtf(float)
+declare double @sqrt(double)
+
diff --git a/test/CodeGen/X86/sse-align-1.ll b/test/CodeGen/X86/sse-align-1.ll
index 1a6058c6114c..378100d693ba 100644
--- a/test/CodeGen/X86/sse-align-1.ll
+++ b/test/CodeGen/X86/sse-align-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movap | count 2
+; RUN: llc < %s -mtriple=x86_64-- | grep movap | count 2
define <4 x float> @foo(<4 x float>* %p) nounwind {
%t = load <4 x float>, <4 x float>* %p
diff --git a/test/CodeGen/X86/sse-align-10.ll b/test/CodeGen/X86/sse-align-10.ll
index 1e688a56ad44..48e405d792ed 100644
--- a/test/CodeGen/X86/sse-align-10.ll
+++ b/test/CodeGen/X86/sse-align-10.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
define <2 x i64> @bar(<2 x i64>* %p) nounwind {
; CHECK-LABEL: bar:
diff --git a/test/CodeGen/X86/sse-align-11.ll b/test/CodeGen/X86/sse-align-11.ll
index 9f5d4b40d61a..857ea4508172 100644
--- a/test/CodeGen/X86/sse-align-11.ll
+++ b/test/CodeGen/X86/sse-align-11.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i686-apple-darwin8 | grep movaps
-; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i686-linux-gnu | grep movaps
+; RUN: llc < %s -mcpu=yonah -mtriple=i686-apple-darwin8 | grep movaps
+; RUN: llc < %s -mcpu=yonah -mtriple=i686-linux-gnu | grep movaps
; PR8969 - make 32-bit linux have a 16-byte aligned stack
define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind {
diff --git a/test/CodeGen/X86/sse-align-12.ll b/test/CodeGen/X86/sse-align-12.ll
index 4fbb6e42ccae..15c3cb014aba 100644
--- a/test/CodeGen/X86/sse-align-12.ll
+++ b/test/CodeGen/X86/sse-align-12.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=nehalem | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=nehalem | FileCheck %s
define <4 x float> @a(<4 x float>* %y) nounwind {
; CHECK-LABEL: a:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movups (%rdi), %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
@@ -21,7 +21,7 @@ define <4 x float> @a(<4 x float>* %y) nounwind {
define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
; CHECK-LABEL: b:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movups (%rdi), %xmm1
; CHECK-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT: retq
@@ -39,7 +39,7 @@ define <4 x float> @b(<4 x float>* %y, <4 x float> %z) nounwind {
define <2 x double> @c(<2 x double>* %y) nounwind {
; CHECK-LABEL: c:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movupd (%rdi), %xmm0
; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: retq
@@ -53,8 +53,8 @@ define <2 x double> @c(<2 x double>* %y) nounwind {
define <2 x double> @d(<2 x double>* %y, <2 x double> %z) nounwind {
; CHECK-LABEL: d:
-; CHECK: # BB#0:
-; CHECK-NEXT: movupd (%rdi), %xmm1
+; CHECK: # %bb.0:
+; CHECK-NEXT: movups (%rdi), %xmm1
; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; CHECK-NEXT: retq
%x = load <2 x double>, <2 x double>* %y, align 8
diff --git a/test/CodeGen/X86/sse-align-2.ll b/test/CodeGen/X86/sse-align-2.ll
index 063cc9d2f563..af548be6c051 100644
--- a/test/CodeGen/X86/sse-align-2.ll
+++ b/test/CodeGen/X86/sse-align-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=penryn | FileCheck %s
define <4 x float> @foo(<4 x float>* %p, <4 x float> %x) nounwind {
%t = load <4 x float>, <4 x float>* %p, align 4
diff --git a/test/CodeGen/X86/sse-align-4.ll b/test/CodeGen/X86/sse-align-4.ll
index 4c59934917f3..8edafd96983d 100644
--- a/test/CodeGen/X86/sse-align-4.ll
+++ b/test/CodeGen/X86/sse-align-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movup | count 2
+; RUN: llc < %s -mtriple=x86_64-- | grep movup | count 2
define void @foo(<4 x float>* %p, <4 x float> %x) nounwind {
store <4 x float> %x, <4 x float>* %p, align 4
diff --git a/test/CodeGen/X86/sse-align-5.ll b/test/CodeGen/X86/sse-align-5.ll
index a64b953220d5..4429d457950a 100644
--- a/test/CodeGen/X86/sse-align-5.ll
+++ b/test/CodeGen/X86/sse-align-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movaps | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep movaps | count 1
define <2 x i64> @bar(<2 x i64>* %p) nounwind {
%t = load <2 x i64>, <2 x i64>* %p
diff --git a/test/CodeGen/X86/sse-align-6.ll b/test/CodeGen/X86/sse-align-6.ll
index 01f225101b96..343832c74648 100644
--- a/test/CodeGen/X86/sse-align-6.ll
+++ b/test/CodeGen/X86/sse-align-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movdqu | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep movdqu | count 1
define <2 x i64> @bar(<2 x i64>* %p, <2 x i64> %x) nounwind {
%t = load <2 x i64>, <2 x i64>* %p, align 8
diff --git a/test/CodeGen/X86/sse-align-8.ll b/test/CodeGen/X86/sse-align-8.ll
index cfeff8161c5c..60aa22b7e196 100644
--- a/test/CodeGen/X86/sse-align-8.ll
+++ b/test/CodeGen/X86/sse-align-8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movups | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep movups | count 1
define void @bar(<2 x i64>* %p, <2 x i64> %x) nounwind {
store <2 x i64> %x, <2 x i64>* %p, align 8
diff --git a/test/CodeGen/X86/sse-align-9.ll b/test/CodeGen/X86/sse-align-9.ll
index 182c91c69d93..afec18ef578f 100644
--- a/test/CodeGen/X86/sse-align-9.ll
+++ b/test/CodeGen/X86/sse-align-9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movup | count 2
+; RUN: llc < %s -mtriple=x86_64-- | grep movup | count 2
define <4 x float> @foo(<4 x float>* %p) nounwind {
%t = load <4 x float>, <4 x float>* %p, align 4
diff --git a/test/CodeGen/X86/sse-fcopysign.ll b/test/CodeGen/X86/sse-fcopysign.ll
index 6805334140f2..883fb5290f0e 100644
--- a/test/CodeGen/X86/sse-fcopysign.ll
+++ b/test/CodeGen/X86/sse-fcopysign.ll
@@ -8,7 +8,7 @@
define float @tst1(float %a, float %b) nounwind {
; X32-LABEL: tst1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: subl $8, %esp
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -19,7 +19,7 @@ define float @tst1(float %a, float %b) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: tst1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, %xmm2
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: movaps %xmm2, %xmm1
@@ -30,7 +30,7 @@ define float @tst1(float %a, float %b) nounwind {
define double @tst2(double %a, float %b, float %c) nounwind {
; X32-LABEL: tst2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: subl $16, %esp
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -43,7 +43,7 @@ define double @tst2(double %a, float %b, float %c) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: tst2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addss %xmm2, %xmm1
; X64-NEXT: cvtss2sd %xmm1, %xmm1
; X64-NEXT: jmp copysign # TAILCALL
@@ -62,7 +62,7 @@ declare double @copysign(double, double)
define float @int1(float %a, float %b) nounwind {
; X32-LABEL: int1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: andps {{\.LCPI.*}}, %xmm0
@@ -75,7 +75,7 @@ define float @int1(float %a, float %b) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: int1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andps {{.*}}(%rip), %xmm0
; X64-NEXT: andps {{.*}}(%rip), %xmm1
; X64-NEXT: orps %xmm1, %xmm0
@@ -86,7 +86,7 @@ define float @int1(float %a, float %b) nounwind {
define double @int2(double %a, float %b, float %c) nounwind {
; X32-LABEL: int2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-8, %esp
@@ -105,7 +105,7 @@ define double @int2(double %a, float %b, float %c) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: int2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addss %xmm2, %xmm1
; X64-NEXT: cvtss2sd %xmm1, %xmm1
; X64-NEXT: andps {{.*}}(%rip), %xmm1
@@ -120,13 +120,13 @@ define double @int2(double %a, float %b, float %c) nounwind {
define float @cst1() nounwind {
; X32-LABEL: cst1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: fld1
; X32-NEXT: fchs
; X32-NEXT: retl
;
; X64-LABEL: cst1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: retq
%tmp = tail call float @llvm.copysign.f32( float 1.0, float -2.0 )
@@ -135,13 +135,13 @@ define float @cst1() nounwind {
define double @cst2() nounwind {
; X32-LABEL: cst2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: fldz
; X32-NEXT: fchs
; X32-NEXT: retl
;
; X64-LABEL: cst2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
%tmp1 = fadd float -1.0, -1.0
diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll
index 8b27941571e8..0b6c205fd26b 100644
--- a/test/CodeGen/X86/sse-fsignum.ll
+++ b/test/CodeGen/X86/sse-fsignum.ll
@@ -11,7 +11,7 @@
define void @signum32a(<4 x float>*) {
; AVX-LABEL: signum32a:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
@@ -34,7 +34,7 @@ entry:
define void @signum64a(<2 x double>*) {
; AVX-LABEL: signum64a:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovapd (%rdi), %xmm0
; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2
@@ -63,9 +63,9 @@ entry:
define void @signum32b(<8 x float>*) {
; AVX1-LABEL: signum32b:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
@@ -76,9 +76,9 @@ define void @signum32b(<8 x float>*) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: signum32b:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vcvtdq2ps %ymm2, %ymm2
; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
@@ -89,16 +89,14 @@ define void @signum32b(<8 x float>*) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: signum32b:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
+; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vmovaps %ymm0, (%rdi)
@@ -117,16 +115,16 @@ entry:
define void @signum64b(<4 x double>*) {
; AVX1-LABEL: signum64b:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd (%rdi), %ymm0
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcvtdq2pd %xmm2, %ymm2
; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vsubpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vmovapd %ymm0, (%rdi)
@@ -134,16 +132,16 @@ define void @signum64b(<4 x double>*) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: signum64b:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovapd (%rdi), %ymm0
-; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vcvtdq2pd %xmm2, %ymm2
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vsubpd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovapd %ymm0, (%rdi)
@@ -151,9 +149,9 @@ define void @signum64b(<4 x double>*) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: signum64b:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovapd (%rdi), %ymm0
-; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
; AVX512F-NEXT: vcvtdq2pd %xmm2, %ymm2
@@ -181,9 +179,9 @@ entry:
define void @signum32c(<8 x float>*) {
; AVX-LABEL: signum32c:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovaps (%rdi), %ymm0
-; AVX-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
; AVX-NEXT: vcvtdq2ps %ymm2, %ymm2
; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
@@ -207,9 +205,9 @@ entry:
define void @signum64c(<4 x double>*) {
; AVX1-LABEL: signum64c:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovapd (%rdi), %ymm0
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -223,9 +221,9 @@ define void @signum64c(<4 x double>*) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: signum64c:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovapd (%rdi), %ymm0
-; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubd %ymm0, %ymm2, %ymm0
@@ -237,9 +235,9 @@ define void @signum64c(<4 x double>*) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: signum64c:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovapd (%rdi), %ymm0
-; AVX512F-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsubd %ymm0, %ymm2, %ymm0
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
index aad00e71dda0..753f787e2d93 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel-x86_64.ll
@@ -5,7 +5,7 @@
define <4 x float> @test_mm_cvtsi64_ss(<4 x float> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsi2ssq %rdi, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1)
@@ -15,7 +15,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i64 @test_mm_cvtss_si64(<4 x float> %a0) nounwind {
; X64-LABEL: test_mm_cvtss_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtss2si %xmm0, %rax
; X64-NEXT: retq
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
@@ -25,7 +25,7 @@ declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
define i64 @test_mm_cvttss_si64(<4 x float> %a0) nounwind {
; X64-LABEL: test_mm_cvttss_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttss2si %xmm0, %rax
; X64-NEXT: retq
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0)
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index e468c69db5dd..649a86dc1fc2 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: retq
%res = fadd <4 x float> %a0, %a1
@@ -20,12 +20,12 @@ define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addss %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <4 x float> %a0, i32 0
@@ -37,67 +37,13 @@ define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_and_ps:
-; X32: # BB#0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-16, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, (%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: leal -4(%ebp), %esp
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %ebp
+; X32: # %bb.0:
+; X32-NEXT: andps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_and_ps:
-; X64: # BB#0:
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: andl %eax, %edx
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: andl %r8d, %ecx
-; X64-NEXT: shrq $32, %r8
-; X64-NEXT: shrq $32, %rsi
-; X64-NEXT: shrq $32, %rdi
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl %r8d, %edi
-; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl %eax, %esi
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -108,75 +54,13 @@ define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_andnot_ps:
-; X32: # BB#0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-16, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT: notl %edx
-; X32-NEXT: notl %esi
-; X32-NEXT: notl %ecx
-; X32-NEXT: notl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X32-NEXT: andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: leal -4(%ebp), %esp
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %ebp
+; X32: # %bb.0:
+; X32-NEXT: andnps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_andnot_ps:
-; X64: # BB#0:
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: movq %rcx, %rdx
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: shrq $32, %rsi
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
-; X64-NEXT: notl %eax
-; X64-NEXT: andl %edi, %eax
-; X64-NEXT: shrq $32, %rdi
-; X64-NEXT: notl %ecx
-; X64-NEXT: andl %r8d, %ecx
-; X64-NEXT: shrq $32, %r8
-; X64-NEXT: notl %esi
-; X64-NEXT: notl %edx
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl %r8d, %edx
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; X64-NEXT: andl %edi, %esi
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: andnps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -188,12 +72,12 @@ define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpeqps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpeqps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp oeq <4 x float> %a0, %a1
@@ -204,12 +88,12 @@ define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpeqss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpeqss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
@@ -219,13 +103,13 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpleps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpleps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
@@ -237,13 +121,13 @@ define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpless %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpless %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
@@ -254,13 +138,13 @@ define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
@@ -272,13 +156,13 @@ define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltss %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
@@ -289,12 +173,12 @@ define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpleps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpleps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp ole <4 x float> %a0, %a1
@@ -305,12 +189,12 @@ define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpless %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpless %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
@@ -319,12 +203,12 @@ define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp olt <4 x float> %a0, %a1
@@ -335,12 +219,12 @@ define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
@@ -349,12 +233,12 @@ define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpneqps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpneqps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp une <4 x float> %a0, %a1
@@ -365,12 +249,12 @@ define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpneqss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpneqss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
@@ -379,13 +263,13 @@ define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnleps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnleps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
@@ -397,13 +281,13 @@ define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnless %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnless %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
@@ -414,13 +298,13 @@ define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltps %xmm0, %xmm1
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltps %xmm0, %xmm1
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
@@ -432,13 +316,13 @@ define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltss %xmm0, %xmm1
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltss %xmm0, %xmm1
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
@@ -449,12 +333,12 @@ define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnleps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnleps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp ugt <4 x float> %a0, %a1
@@ -465,12 +349,12 @@ define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnless %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnless %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
@@ -479,12 +363,12 @@ define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp uge <4 x float> %a0, %a1
@@ -495,12 +379,12 @@ define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
@@ -509,12 +393,12 @@ define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpordps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpordps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp ord <4 x float> %a0, %a1
@@ -525,12 +409,12 @@ define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpordss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpordss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
@@ -539,12 +423,12 @@ define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind
define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpunordps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpunordps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp uno <4 x float> %a0, %a1
@@ -555,12 +439,12 @@ define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwi
define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpunordss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpunordss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
@@ -569,7 +453,7 @@ define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwi
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comieq_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
@@ -578,7 +462,7 @@ define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comieq_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
@@ -592,14 +476,14 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comige_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comige_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: setae %al
@@ -611,14 +495,14 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comigt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comigt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -630,14 +514,14 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comile_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comile_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm0, %xmm1
; X64-NEXT: setae %al
@@ -649,14 +533,14 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comilt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comiss %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comilt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comiss %xmm0, %xmm1
; X64-NEXT: seta %al
@@ -668,7 +552,7 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comineq_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: comiss %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
@@ -677,7 +561,7 @@ define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comineq_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: comiss %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
@@ -691,12 +575,12 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvt_ss2si:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvt_ss2si:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtss2si %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
@@ -706,12 +590,12 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsi2ssl %edi, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
@@ -721,7 +605,7 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
@@ -729,7 +613,7 @@ define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = extractelement <4 x float> %a0, i32 0
ret float %res
@@ -737,12 +621,12 @@ define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtss2si %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
@@ -751,12 +635,12 @@ define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttss_si:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
@@ -766,12 +650,12 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttss_si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
@@ -780,12 +664,12 @@ define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: divps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: divps %xmm1, %xmm0
; X64-NEXT: retq
%res = fdiv <4 x float> %a0, %a1
@@ -794,12 +678,12 @@ define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: divss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: divss %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <4 x float> %a0, i32 0
@@ -811,7 +695,7 @@ define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
@@ -821,7 +705,7 @@ define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
@@ -838,7 +722,7 @@ declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
@@ -848,7 +732,7 @@ define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
@@ -864,7 +748,7 @@ define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
@@ -874,7 +758,7 @@ define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
@@ -890,7 +774,7 @@ define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X32-LABEL: test_MM_GET_ROUNDING_MODE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
@@ -900,7 +784,7 @@ define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_GET_ROUNDING_MODE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
@@ -916,7 +800,7 @@ define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
define i32 @test_mm_getcsr() nounwind {
; X32-LABEL: test_mm_getcsr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl %esp, %eax
; X32-NEXT: stmxcsr (%eax)
@@ -925,7 +809,7 @@ define i32 @test_mm_getcsr() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_getcsr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax
@@ -939,13 +823,13 @@ define i32 @test_mm_getcsr() nounwind {
define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <4 x float>*
@@ -955,14 +839,14 @@ define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_ps1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
@@ -976,13 +860,13 @@ define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: retq
%ld = load float, float* %a0, align 1
@@ -995,14 +879,14 @@ define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load1_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load1_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
@@ -1016,7 +900,7 @@ define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadh_pi:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1025,7 +909,7 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadh_pi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: shrq $32, %rax
@@ -1046,7 +930,7 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadl_pi:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1056,7 +940,7 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadl_pi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT: shrq $32, %rax
@@ -1078,14 +962,14 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadr_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadr_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT: retq
@@ -1097,13 +981,13 @@ define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadu_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadu_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <4 x float>*
@@ -1113,12 +997,12 @@ define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: maxps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: maxps %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
@@ -1128,12 +1012,12 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
@@ -1143,12 +1027,12 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: minps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: minps %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
@@ -1158,12 +1042,12 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: minss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: minss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
@@ -1173,12 +1057,12 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_move_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_move_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -1187,12 +1071,12 @@ define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movehl_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movehl_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
@@ -1201,12 +1085,12 @@ define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movelh_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movelh_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -1215,12 +1099,12 @@ define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_movemask_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movmskps %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movemask_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movmskps %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
@@ -1230,12 +1114,12 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mulps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mulps %xmm1, %xmm0
; X64-NEXT: retq
%res = fmul <4 x float> %a0, %a1
@@ -1244,12 +1128,12 @@ define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mulss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mulss %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <4 x float> %a0, i32 0
@@ -1261,67 +1145,13 @@ define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_or_ps:
-; X32: # BB#0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-16, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT: orl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, (%esp)
-; X32-NEXT: orl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: leal -4(%ebp), %esp
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %ebp
+; X32: # %bb.0:
+; X32-NEXT: orps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_or_ps:
-; X64: # BB#0:
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: orl %eax, %edx
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: orl %r8d, %ecx
-; X64-NEXT: shrq $32, %r8
-; X64-NEXT: shrq $32, %rsi
-; X64-NEXT: shrq $32, %rdi
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: orl %r8d, %edi
-; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: orl %eax, %esi
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
@@ -1332,13 +1162,13 @@ define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
define void @test_mm_prefetch(i8* %a0) {
; X32-LABEL: test_mm_prefetch:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: prefetchnta (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_prefetch:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: prefetchnta (%rdi)
; X64-NEXT: retq
call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
@@ -1348,12 +1178,12 @@ declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: rcpps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rcp_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rcpps %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
@@ -1363,12 +1193,12 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: rcpss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rcp_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rcpss %xmm0, %xmm0
; X64-NEXT: retq
%rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
@@ -1386,12 +1216,12 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: rsqrtps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rsqrt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rsqrtps %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
@@ -1401,12 +1231,12 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: rsqrtss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rsqrt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: rsqrtss %xmm0, %xmm0
; X64-NEXT: retq
%rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
@@ -1424,7 +1254,7 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
@@ -1438,7 +1268,7 @@ define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
@@ -1461,7 +1291,7 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
@@ -1475,7 +1305,7 @@ define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
@@ -1497,7 +1327,7 @@ define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
@@ -1511,7 +1341,7 @@ define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
@@ -1533,7 +1363,7 @@ define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_set_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -1544,7 +1374,7 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
@@ -1559,13 +1389,13 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; X32-LABEL: test_mm_set_ps1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ps1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%res0 = insertelement <4 x float> undef, float %a0, i32 0
@@ -1577,7 +1407,7 @@ define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_ROUNDING_MODE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %esp, %ecx
@@ -1591,7 +1421,7 @@ define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_MM_SET_ROUNDING_MODE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: stmxcsr (%rax)
; X64-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
@@ -1613,14 +1443,14 @@ define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X32-LABEL: test_mm_set_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: movaps %xmm1, %xmm0
@@ -1634,13 +1464,13 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; X32-LABEL: test_mm_set1_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%res0 = insertelement <4 x float> undef, float %a0, i32 0
@@ -1652,13 +1482,13 @@ define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
define void @test_mm_setcsr(i32 %a0) nounwind {
; X32-LABEL: test_mm_setcsr:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
; X32-NEXT: ldmxcsr (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setcsr:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT: ldmxcsr (%rax)
@@ -1672,7 +1502,7 @@ define void @test_mm_setcsr(i32 %a0) nounwind {
define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_setr_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
@@ -1683,7 +1513,7 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
@@ -1697,12 +1527,12 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3)
define <4 x float> @test_mm_setzero_ps() {
; X32-LABEL: test_mm_setzero_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setzero_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
ret <4 x float> zeroinitializer
@@ -1710,12 +1540,12 @@ define <4 x float> @test_mm_setzero_ps() {
define void @test_mm_sfence() nounwind {
; X32-LABEL: test_mm_sfence:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: sfence
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sfence:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sfence
; X64-NEXT: retq
call void @llvm.x86.sse.sfence()
@@ -1725,12 +1555,12 @@ declare void @llvm.x86.sse.sfence() nounwind readnone
define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_shuffle_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
@@ -1739,12 +1569,12 @@ define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwin
define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: sqrtps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sqrt_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sqrtps %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
@@ -1754,12 +1584,12 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: sqrtss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sqrt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sqrtss %xmm0, %xmm0
; X64-NEXT: retq
%sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
@@ -1777,13 +1607,13 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <4 x float>*
@@ -1793,14 +1623,14 @@ define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_ps1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -1812,13 +1642,13 @@ define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss %xmm0, (%rdi)
; X64-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
@@ -1828,14 +1658,14 @@ define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store1_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store1_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -1847,7 +1677,7 @@ define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storeh_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -1863,7 +1693,7 @@ define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeh_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rax, (%rdi)
@@ -1877,7 +1707,7 @@ define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storel_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -1893,7 +1723,7 @@ define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storel_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT: movq %rax, (%rdi)
@@ -1907,14 +1737,14 @@ define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storer_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storer_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -1926,13 +1756,13 @@ define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storeu_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeu_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <4 x float>*
@@ -1942,13 +1772,13 @@ define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_stream_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntps %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast float* %a0 to <4 x float>*
@@ -1958,12 +1788,12 @@ define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: subps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subps %xmm1, %xmm0
; X64-NEXT: retq
%res = fsub <4 x float> %a0, %a1
@@ -1972,12 +1802,12 @@ define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: subss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subss %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <4 x float> %a0, i32 0
@@ -1989,7 +1819,7 @@ define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X32-LABEL: test_MM_TRANSPOSE4_PS:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -2019,7 +1849,7 @@ define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x floa
; X32-NEXT: retl
;
; X64-LABEL: test_MM_TRANSPOSE4_PS:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: movaps (%rsi), %xmm1
; X64-NEXT: movaps (%rdx), %xmm2
@@ -2062,7 +1892,7 @@ define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x floa
define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
@@ -2071,7 +1901,7 @@ define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomieq_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
@@ -2085,14 +1915,14 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomige_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: setae %al
@@ -2104,14 +1934,14 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomigt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -2123,14 +1953,14 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomile_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm1
; X64-NEXT: setae %al
@@ -2142,14 +1972,14 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomilt_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomiss %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomilt_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomiss %xmm0, %xmm1
; X64-NEXT: seta %al
@@ -2161,7 +1991,7 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: ucomiss %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
@@ -2170,7 +2000,7 @@ define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomineq_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: ucomiss %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
@@ -2184,23 +2014,23 @@ declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnon
define <4 x float> @test_mm_undefined_ps() {
; X32-LABEL: test_mm_undefined_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <4 x float> undef
}
define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpackhi_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2209,12 +2039,12 @@ define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwi
define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpacklo_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2223,67 +2053,13 @@ define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwi
define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_xor_ps:
-; X32: # BB#0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: pushl %esi
-; X32-NEXT: andl $-16, %esp
-; X32-NEXT: subl $64, %esp
-; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, (%esp)
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: leal -4(%ebp), %esp
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %ebp
+; X32: # %bb.0:
+; X32-NEXT: xorps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_xor_ps:
-; X64: # BB#0:
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
-; X64-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; X64-NEXT: movq %rdx, %rsi
-; X64-NEXT: xorl %eax, %edx
-; X64-NEXT: shrq $32, %rax
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: xorl %r8d, %ecx
-; X64-NEXT: shrq $32, %r8
-; X64-NEXT: shrq $32, %rsi
-; X64-NEXT: shrq $32, %rdi
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: xorl %r8d, %edi
-; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: xorl %eax, %esi
-; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <4 x float> %a0 to <4 x i32>
%arg1 = bitcast <4 x float> %a1 to <4 x i32>
diff --git a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
index 2ecba887f7cb..a65c1d312aa4 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -3,18 +3,18 @@
define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_storeu_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-NEXT: movups %xmm0, (%eax)
; SSE-NEXT: retl
;
; KNL-LABEL: test_x86_sse_storeu_ps:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL-NEXT: vmovups %xmm0, (%eax)
; KNL-NEXT: retl
; CHECK-LABEL: test_x86_sse_storeu_ps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movups %xmm0, (%eax)
; CHECK-NEXT: retl
@@ -26,21 +26,21 @@ declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_add_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x58,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_add_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_add_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x58,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_add_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: addss %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -51,21 +51,21 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_sub_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5c,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_sub_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5c,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_sub_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5c,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_sub_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: subss %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -76,21 +76,21 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_mul_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x59,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_mul_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_mul_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x59,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_mul_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: mulss %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -101,21 +101,21 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_div_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5e,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_div_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5e,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_div_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7e,0x08,0x5e,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse_div_ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: divss %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index 679b1e8b057f..04a4352accaa 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -1,17 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_cmp_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cmpordps %xmm1, %xmm0 ## encoding: [0x0f,0xc2,0xc1,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_cmp_ps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcmpordps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
@@ -22,12 +21,12 @@ declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_cmp_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cmpordss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0xc2,0xc1,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_cmp_ss:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcmpordss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
@@ -38,7 +37,7 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_comieq_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -47,7 +46,7 @@ define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_comieq_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -56,7 +55,7 @@ define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_comieq_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -71,21 +70,21 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_comige_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_comige_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_comige_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -98,21 +97,21 @@ declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_comigt_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_comigt_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_comigt_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -125,21 +124,21 @@ declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_comile_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comiss %xmm0, %xmm1 ## encoding: [0x0f,0x2f,0xc8]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_comile_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_comile_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -152,21 +151,21 @@ declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_comilt_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comiss %xmm0, %xmm1 ## encoding: [0x0f,0x2f,0xc8]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_comilt_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_comilt_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -179,7 +178,7 @@ declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_comineq_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: comiss %xmm1, %xmm0 ## encoding: [0x0f,0x2f,0xc1]
; SSE-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -188,7 +187,7 @@ define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_comineq_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -197,7 +196,7 @@ define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_comineq_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -212,19 +211,19 @@ declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_cvtsi2ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE-NEXT: cvtsi2ssl %eax, %xmm0 ## encoding: [0xf3,0x0f,0x2a,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_cvtsi2ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; AVX2-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x2a,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_cvtsi2ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SKX-NEXT: vcvtsi2ssl %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
@@ -236,17 +235,17 @@ declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_cvtss2si:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtss2si %xmm0, %eax ## encoding: [0xf3,0x0f,0x2d,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_cvtss2si:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2d,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_cvtss2si:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
@@ -257,17 +256,17 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_cvttss2si:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %eax ## encoding: [0xf3,0x0f,0x2c,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_cvttss2si:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2c,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_cvttss2si:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
@@ -278,13 +277,13 @@ declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
define void @test_x86_sse_ldmxcsr(i8* %a0) {
; SSE-LABEL: test_x86_sse_ldmxcsr:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: ldmxcsr (%eax) ## encoding: [0x0f,0xae,0x10]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_ldmxcsr:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vldmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x10]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -297,17 +296,17 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_max_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: maxps %xmm1, %xmm0 ## encoding: [0x0f,0x5f,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_max_ps:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_max_ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -318,17 +317,17 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_max_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: maxss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5f,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_max_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_max_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -339,17 +338,17 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_min_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: minps %xmm1, %xmm0 ## encoding: [0x0f,0x5d,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_min_ps:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vminps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5d,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_min_ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -360,17 +359,17 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_min_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: minss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5d,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_min_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_min_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vminss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -381,12 +380,12 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_movmsk_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movmskps %xmm0, %eax ## encoding: [0x0f,0x50,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_movmsk_ps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vmovmskps %xmm0, %eax ## encoding: [0xc5,0xf8,0x50,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
@@ -398,19 +397,14 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_rcp_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: rcpps %xmm0, %xmm0 ## encoding: [0x0f,0x53,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rcp_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rcp_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rcp_ps:
+; VCHECK: ## %bb.0:
+; VCHECK-NEXT: vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -419,12 +413,12 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_rcp_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x53,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_rcp_ss:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x53,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
@@ -435,19 +429,14 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_rsqrt_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x52,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; AVX2-LABEL: test_x86_sse_rsqrt_ps:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_rsqrt_ps:
-; SKX: ## BB#0:
-; SKX-NEXT: vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
+; VCHECK-LABEL: test_x86_sse_rsqrt_ps:
+; VCHECK: ## %bb.0:
+; VCHECK-NEXT: vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
+; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -456,12 +445,12 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_rsqrt_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x52,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_rsqrt_ss:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x52,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
@@ -472,14 +461,19 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_sqrt_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ps:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ps:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ps:
+; SKX: ## %bb.0:
+; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -488,14 +482,19 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse_sqrt_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse_sqrt_ss:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_sqrt_ss:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_sqrt_ss:
+; SKX: ## %bb.0:
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -504,13 +503,13 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @test_x86_sse_stmxcsr(i8* %a0) {
; SSE-LABEL: test_x86_sse_stmxcsr:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: stmxcsr (%eax) ## encoding: [0x0f,0xae,0x18]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse_stmxcsr:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vstmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x18]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -522,7 +521,7 @@ declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_ucomieq_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -531,7 +530,7 @@ define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_ucomieq_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -540,7 +539,7 @@ define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_ucomieq_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -555,21 +554,21 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_ucomige_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_ucomige_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_ucomige_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -582,21 +581,21 @@ declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_ucomigt_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_ucomigt_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_ucomigt_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -609,21 +608,21 @@ declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_ucomile_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomiss %xmm0, %xmm1 ## encoding: [0x0f,0x2e,0xc8]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_ucomile_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_ucomile_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -636,21 +635,21 @@ declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_ucomilt_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomiss %xmm0, %xmm1 ## encoding: [0x0f,0x2e,0xc8]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_ucomilt_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_ucomilt_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -663,7 +662,7 @@ declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse_ucomineq_ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: ucomiss %xmm1, %xmm0 ## encoding: [0x0f,0x2e,0xc1]
; SSE-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -672,7 +671,7 @@ define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_ucomineq_ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -681,7 +680,7 @@ define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_ucomineq_ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -696,12 +695,12 @@ declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnon
define void @sfence() nounwind {
; SSE-LABEL: sfence:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: sfence ## encoding: [0x0f,0xae,0xf8]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: sfence:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: sfence ## encoding: [0x0f,0xae,0xf8]
; VCHECK-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.sse.sfence()
diff --git a/test/CodeGen/X86/sse-intrinsics-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-x86_64.ll
index 61d0cae9acf1..6f95b8d9ea87 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86_64.ll
@@ -5,21 +5,21 @@
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse_cvtss2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtss2si %xmm0, %rax
; CHECK-NEXT: retq
; SSE-LABEL: test_x86_sse_cvtss2si64:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2d,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_cvtss2si64:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
; AVX2-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_cvtss2si64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
; SKX-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
@@ -30,21 +30,21 @@ declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse_cvtsi642ss:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; CHECK-NEXT: retq
; SSE-LABEL: test_x86_sse_cvtsi642ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtsi2ssq %rdi, %xmm0 ## encoding: [0xf3,0x48,0x0f,0x2a,0xc7]
; SSE-NEXT: retq ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_cvtsi642ss:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
; AVX2-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_cvtsi642ss:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
; SKX-NEXT: retq ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
@@ -55,21 +55,21 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse_cvttss2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttss2si %xmm0, %rax
; CHECK-NEXT: retq
; SSE-LABEL: test_x86_sse_cvttss2si64:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2c,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse_cvttss2si64:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
; AVX2-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse_cvttss2si64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
; SKX-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/sse-load-ret.ll b/test/CodeGen/X86/sse-load-ret.ll
index 8da45a786e7c..a84201a9a11d 100644
--- a/test/CodeGen/X86/sse-load-ret.ll
+++ b/test/CodeGen/X86/sse-load-ret.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep movss
-; RUN: llc < %s -march=x86 -mcpu=yonah | not grep xmm
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | not grep movss
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | not grep xmm
define double @test1(double* %P) {
%X = load double, double* %P ; <double> [#uses=1]
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 2944001ed7e9..f79749169c0a 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -15,7 +15,7 @@
define double @ogt(double %x, double %y) {
; ALL-LABEL: ogt:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: maxsd %xmm1, %xmm0
; ALL-NEXT: retq
%c = fcmp ogt double %x, %y
@@ -25,7 +25,7 @@ define double @ogt(double %x, double %y) {
define double @olt(double %x, double %y) {
; ALL-LABEL: olt:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: minsd %xmm1, %xmm0
; ALL-NEXT: retq
%c = fcmp olt double %x, %y
@@ -35,18 +35,18 @@ define double @olt(double %x, double %y) {
define double @ogt_inverse(double %x, double %y) {
; STRICT-LABEL: ogt_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ogt_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ogt_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -57,18 +57,18 @@ define double @ogt_inverse(double %x, double %y) {
define double @olt_inverse(double %x, double %y) {
; STRICT-LABEL: olt_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: olt_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: olt_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -79,7 +79,7 @@ define double @olt_inverse(double %x, double %y) {
define double @oge(double %x, double %y) {
; STRICT-LABEL: oge:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm1, %xmm2
; STRICT-NEXT: cmplesd %xmm0, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm0
@@ -88,7 +88,7 @@ define double @oge(double %x, double %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: oge:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxsd %xmm1, %xmm0
; RELAX-NEXT: retq
%c = fcmp oge double %x, %y
@@ -98,7 +98,7 @@ define double @oge(double %x, double %y) {
define double @ole(double %x, double %y) {
; STRICT-LABEL: ole:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
; STRICT-NEXT: cmplesd %xmm1, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm0
@@ -108,7 +108,7 @@ define double @ole(double %x, double %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minsd %xmm1, %xmm0
; RELAX-NEXT: retq
%c = fcmp ole double %x, %y
@@ -118,7 +118,7 @@ define double @ole(double %x, double %y) {
define double @oge_inverse(double %x, double %y) {
; STRICT-LABEL: oge_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm1, %xmm2
; STRICT-NEXT: cmplesd %xmm0, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm1
@@ -128,12 +128,12 @@ define double @oge_inverse(double %x, double %y) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: oge_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: oge_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -144,7 +144,7 @@ define double @oge_inverse(double %x, double %y) {
define double @ole_inverse(double %x, double %y) {
; STRICT-LABEL: ole_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
; STRICT-NEXT: cmplesd %xmm1, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm1
@@ -154,12 +154,12 @@ define double @ole_inverse(double %x, double %y) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ole_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ole_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -170,7 +170,7 @@ define double @ole_inverse(double %x, double %y) {
define double @ogt_x(double %x) {
; ALL-LABEL: ogt_x:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: xorpd %xmm1, %xmm1
; ALL-NEXT: maxsd %xmm1, %xmm0
; ALL-NEXT: retq
@@ -181,7 +181,7 @@ define double @ogt_x(double %x) {
define double @olt_x(double %x) {
; ALL-LABEL: olt_x:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: xorpd %xmm1, %xmm1
; ALL-NEXT: minsd %xmm1, %xmm0
; ALL-NEXT: retq
@@ -192,20 +192,20 @@ define double @olt_x(double %x) {
define double @ogt_inverse_x(double %x) {
; STRICT-LABEL: ogt_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ogt_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ogt_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -217,20 +217,20 @@ define double @ogt_inverse_x(double %x) {
define double @olt_inverse_x(double %x) {
; STRICT-LABEL: olt_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: olt_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: olt_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -242,14 +242,14 @@ define double @olt_inverse_x(double %x) {
define double @oge_x(double %x) {
; STRICT-LABEL: oge_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: cmplesd %xmm0, %xmm1
; STRICT-NEXT: andpd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: oge_x:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: xorpd %xmm1, %xmm1
; RELAX-NEXT: maxsd %xmm1, %xmm0
; RELAX-NEXT: retq
@@ -260,7 +260,7 @@ define double @oge_x(double %x) {
define double @ole_x(double %x) {
; STRICT-LABEL: ole_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm2, %xmm2
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmplesd %xmm2, %xmm1
@@ -269,7 +269,7 @@ define double @ole_x(double %x) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole_x:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: xorpd %xmm1, %xmm1
; RELAX-NEXT: minsd %xmm1, %xmm0
; RELAX-NEXT: retq
@@ -280,7 +280,7 @@ define double @ole_x(double %x) {
define double @oge_inverse_x(double %x) {
; STRICT-LABEL: oge_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: cmplesd %xmm0, %xmm1
; STRICT-NEXT: andnpd %xmm0, %xmm1
@@ -288,13 +288,13 @@ define double @oge_inverse_x(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: oge_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: oge_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -306,7 +306,7 @@ define double @oge_inverse_x(double %x) {
define double @ole_inverse_x(double %x) {
; STRICT-LABEL: ole_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm2, %xmm2
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmplesd %xmm2, %xmm1
@@ -315,13 +315,13 @@ define double @ole_inverse_x(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ole_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ole_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -333,7 +333,7 @@ define double @ole_inverse_x(double %x) {
define double @ugt(double %x, double %y) {
; STRICT-LABEL: ugt:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm0
@@ -343,7 +343,7 @@ define double @ugt(double %x, double %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxsd %xmm1, %xmm0
; RELAX-NEXT: retq
%c = fcmp ugt double %x, %y
@@ -353,7 +353,7 @@ define double @ugt(double %x, double %y) {
define double @ult(double %x, double %y) {
; STRICT-LABEL: ult:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm1, %xmm2
; STRICT-NEXT: cmpnlesd %xmm0, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm0
@@ -362,7 +362,7 @@ define double @ult(double %x, double %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ult:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minsd %xmm1, %xmm0
; RELAX-NEXT: retq
%c = fcmp ult double %x, %y
@@ -372,7 +372,7 @@ define double @ult(double %x, double %y) {
define double @ugt_inverse(double %x, double %y) {
; STRICT-LABEL: ugt_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
; STRICT-NEXT: cmpnlesd %xmm1, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm1
@@ -382,12 +382,12 @@ define double @ugt_inverse(double %x, double %y) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ugt_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ugt_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -398,7 +398,7 @@ define double @ugt_inverse(double %x, double %y) {
define double @ult_inverse(double %x, double %y) {
; STRICT-LABEL: ult_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm1, %xmm2
; STRICT-NEXT: cmpnlesd %xmm0, %xmm2
; STRICT-NEXT: andpd %xmm2, %xmm1
@@ -408,12 +408,12 @@ define double @ult_inverse(double %x, double %y) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ult_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ult_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -424,13 +424,13 @@ define double @ult_inverse(double %x, double %y) {
define double @uge(double %x, double %y) {
; STRICT-LABEL: uge:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: uge:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxsd %xmm1, %xmm0
; RELAX-NEXT: retq
%c = fcmp uge double %x, %y
@@ -440,13 +440,13 @@ define double @uge(double %x, double %y) {
define double @ule(double %x, double %y) {
; STRICT-LABEL: ule:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ule:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minsd %xmm1, %xmm0
; RELAX-NEXT: retq
%c = fcmp ule double %x, %y
@@ -456,17 +456,17 @@ define double @ule(double %x, double %y) {
define double @uge_inverse(double %x, double %y) {
; STRICT-LABEL: uge_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: minsd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: uge_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: uge_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -477,17 +477,17 @@ define double @uge_inverse(double %x, double %y) {
define double @ule_inverse(double %x, double %y) {
; STRICT-LABEL: ule_inverse:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: maxsd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ule_inverse:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ule_inverse:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
; FINITE-NEXT: retq
@@ -498,7 +498,7 @@ define double @ule_inverse(double %x, double %y) {
define double @ugt_x(double %x) {
; STRICT-LABEL: ugt_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm2, %xmm2
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
@@ -507,7 +507,7 @@ define double @ugt_x(double %x) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt_x:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: xorpd %xmm1, %xmm1
; RELAX-NEXT: maxsd %xmm1, %xmm0
; RELAX-NEXT: retq
@@ -518,14 +518,14 @@ define double @ugt_x(double %x) {
define double @ult_x(double %x) {
; STRICT-LABEL: ult_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: cmpnlesd %xmm0, %xmm1
; STRICT-NEXT: andpd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ult_x:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: xorpd %xmm1, %xmm1
; RELAX-NEXT: minsd %xmm1, %xmm0
; RELAX-NEXT: retq
@@ -536,7 +536,7 @@ define double @ult_x(double %x) {
define double @ugt_inverse_x(double %x) {
; STRICT-LABEL: ugt_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm2, %xmm2
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
@@ -545,13 +545,13 @@ define double @ugt_inverse_x(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ugt_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ugt_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -563,7 +563,7 @@ define double @ugt_inverse_x(double %x) {
define double @ult_inverse_x(double %x) {
; STRICT-LABEL: ult_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: cmpnlesd %xmm0, %xmm1
; STRICT-NEXT: andnpd %xmm0, %xmm1
@@ -571,13 +571,13 @@ define double @ult_inverse_x(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ult_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ult_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -589,14 +589,14 @@ define double @ult_inverse_x(double %x) {
define double @uge_x(double %x) {
; STRICT-LABEL: uge_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: uge_x:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: xorpd %xmm1, %xmm1
; RELAX-NEXT: maxsd %xmm1, %xmm0
; RELAX-NEXT: retq
@@ -607,14 +607,14 @@ define double @uge_x(double %x) {
define double @ule_x(double %x) {
; STRICT-LABEL: ule_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ule_x:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: xorpd %xmm1, %xmm1
; RELAX-NEXT: minsd %xmm1, %xmm0
; RELAX-NEXT: retq
@@ -625,19 +625,19 @@ define double @ule_x(double %x) {
define double @uge_inverse_x(double %x) {
; STRICT-LABEL: uge_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: minsd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: uge_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: uge_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -649,19 +649,19 @@ define double @uge_inverse_x(double %x) {
define double @ule_inverse_x(double %x) {
; STRICT-LABEL: ule_inverse_x:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: xorpd %xmm1, %xmm1
; STRICT-NEXT: maxsd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ule_inverse_x:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: xorpd %xmm1, %xmm1
; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ule_inverse_x:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: xorpd %xmm1, %xmm1
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -673,7 +673,7 @@ define double @ule_inverse_x(double %x) {
define double @ogt_y(double %x) {
; ALL-LABEL: ogt_y:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: maxsd {{.*}}(%rip), %xmm0
; ALL-NEXT: retq
%c = fcmp ogt double %x, -0.000000e+00
@@ -683,7 +683,7 @@ define double @ogt_y(double %x) {
define double @olt_y(double %x) {
; ALL-LABEL: olt_y:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: minsd {{.*}}(%rip), %xmm0
; ALL-NEXT: retq
%c = fcmp olt double %x, -0.000000e+00
@@ -693,19 +693,19 @@ define double @olt_y(double %x) {
define double @ogt_inverse_y(double %x) {
; STRICT-LABEL: ogt_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ogt_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ogt_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -717,19 +717,19 @@ define double @ogt_inverse_y(double %x) {
define double @olt_inverse_y(double %x) {
; STRICT-LABEL: olt_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: olt_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: olt_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -741,7 +741,7 @@ define double @olt_inverse_y(double %x) {
define double @oge_y(double %x) {
; STRICT-LABEL: oge_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: movapd %xmm1, %xmm2
; STRICT-NEXT: cmplesd %xmm0, %xmm2
@@ -751,7 +751,7 @@ define double @oge_y(double %x) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: oge_y:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxsd {{.*}}(%rip), %xmm0
; RELAX-NEXT: retq
%c = fcmp oge double %x, -0.000000e+00
@@ -761,7 +761,7 @@ define double @oge_y(double %x) {
define double @ole_y(double %x) {
; STRICT-LABEL: ole_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmplesd %xmm2, %xmm1
@@ -772,7 +772,7 @@ define double @ole_y(double %x) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ole_y:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minsd {{.*}}(%rip), %xmm0
; RELAX-NEXT: retq
%c = fcmp ole double %x, -0.000000e+00
@@ -782,7 +782,7 @@ define double @ole_y(double %x) {
define double @oge_inverse_y(double %x) {
; STRICT-LABEL: oge_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; STRICT-NEXT: movapd %xmm2, %xmm1
; STRICT-NEXT: cmplesd %xmm0, %xmm1
@@ -793,12 +793,12 @@ define double @oge_inverse_y(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: oge_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: oge_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -810,7 +810,7 @@ define double @oge_inverse_y(double %x) {
define double @ole_inverse_y(double %x) {
; STRICT-LABEL: ole_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmplesd %xmm2, %xmm1
@@ -821,12 +821,12 @@ define double @ole_inverse_y(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ole_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ole_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -838,7 +838,7 @@ define double @ole_inverse_y(double %x) {
define double @ugt_y(double %x) {
; STRICT-LABEL: ugt_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
@@ -849,7 +849,7 @@ define double @ugt_y(double %x) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ugt_y:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxsd {{.*}}(%rip), %xmm0
; RELAX-NEXT: retq
%c = fcmp ugt double %x, -0.000000e+00
@@ -859,7 +859,7 @@ define double @ugt_y(double %x) {
define double @ult_y(double %x) {
; STRICT-LABEL: ult_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: movapd %xmm1, %xmm2
; STRICT-NEXT: cmpnlesd %xmm0, %xmm2
@@ -869,7 +869,7 @@ define double @ult_y(double %x) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: ult_y:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minsd {{.*}}(%rip), %xmm0
; RELAX-NEXT: retq
%c = fcmp ult double %x, -0.000000e+00
@@ -879,7 +879,7 @@ define double @ult_y(double %x) {
define double @ugt_inverse_y(double %x) {
; STRICT-LABEL: ugt_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; STRICT-NEXT: movapd %xmm0, %xmm1
; STRICT-NEXT: cmpnlesd %xmm2, %xmm1
@@ -890,12 +890,12 @@ define double @ugt_inverse_y(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ugt_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ugt_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -907,7 +907,7 @@ define double @ugt_inverse_y(double %x) {
define double @ult_inverse_y(double %x) {
; STRICT-LABEL: ult_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; STRICT-NEXT: movapd %xmm2, %xmm1
; STRICT-NEXT: cmpnlesd %xmm0, %xmm1
@@ -918,12 +918,12 @@ define double @ult_inverse_y(double %x) {
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ult_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ult_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -935,14 +935,14 @@ define double @ult_inverse_y(double %x) {
define double @uge_y(double %x) {
; STRICT-LABEL: uge_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: uge_y:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxsd {{.*}}(%rip), %xmm0
; RELAX-NEXT: retq
%c = fcmp uge double %x, -0.000000e+00
@@ -952,14 +952,14 @@ define double @uge_y(double %x) {
define double @ule_y(double %x) {
; STRICT-LABEL: ule_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; RELAX-LABEL: ule_y:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minsd {{.*}}(%rip), %xmm0
; RELAX-NEXT: retq
%c = fcmp ule double %x, -0.000000e+00
@@ -969,17 +969,17 @@ define double @ule_y(double %x) {
define double @uge_inverse_y(double %x) {
; STRICT-LABEL: uge_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: minsd {{.*}}(%rip), %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: uge_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: uge_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -991,17 +991,17 @@ define double @uge_inverse_y(double %x) {
define double @ule_inverse_y(double %x) {
; STRICT-LABEL: ule_inverse_y:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: maxsd {{.*}}(%rip), %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: ule_inverse_y:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: ule_inverse_y:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1015,19 +1015,19 @@ define double @ule_inverse_y(double %x) {
define double @clampTo3k_a(double %x) {
; STRICT-LABEL: clampTo3k_a:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_a:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_a:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1039,17 +1039,17 @@ define double @clampTo3k_a(double %x) {
define double @clampTo3k_b(double %x) {
; STRICT-LABEL: clampTo3k_b:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: minsd {{.*}}(%rip), %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_b:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_b:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1061,19 +1061,19 @@ define double @clampTo3k_b(double %x) {
define double @clampTo3k_c(double %x) {
; STRICT-LABEL: clampTo3k_c:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_c:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_c:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1085,17 +1085,17 @@ define double @clampTo3k_c(double %x) {
define double @clampTo3k_d(double %x) {
; STRICT-LABEL: clampTo3k_d:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: maxsd {{.*}}(%rip), %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_d:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_d:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1107,19 +1107,19 @@ define double @clampTo3k_d(double %x) {
define double @clampTo3k_e(double %x) {
; STRICT-LABEL: clampTo3k_e:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: maxsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_e:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_e:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1131,17 +1131,17 @@ define double @clampTo3k_e(double %x) {
define double @clampTo3k_f(double %x) {
; STRICT-LABEL: clampTo3k_f:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: maxsd {{.*}}(%rip), %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_f:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: maxsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_f:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: maxsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1153,19 +1153,19 @@ define double @clampTo3k_f(double %x) {
define double @clampTo3k_g(double %x) {
; STRICT-LABEL: clampTo3k_g:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; STRICT-NEXT: minsd %xmm0, %xmm1
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_g:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_g:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1177,17 +1177,17 @@ define double @clampTo3k_g(double %x) {
define double @clampTo3k_h(double %x) {
; STRICT-LABEL: clampTo3k_h:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: minsd {{.*}}(%rip), %xmm0
; STRICT-NEXT: retq
;
; UNSAFE-LABEL: clampTo3k_h:
-; UNSAFE: # BB#0:
+; UNSAFE: # %bb.0:
; UNSAFE-NEXT: minsd {{.*}}(%rip), %xmm0
; UNSAFE-NEXT: retq
;
; FINITE-LABEL: clampTo3k_h:
-; FINITE: # BB#0:
+; FINITE: # %bb.0:
; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; FINITE-NEXT: minsd %xmm0, %xmm1
; FINITE-NEXT: movapd %xmm1, %xmm0
@@ -1199,7 +1199,7 @@ define double @clampTo3k_h(double %x) {
define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) {
; STRICT-LABEL: test_maxpd:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
; STRICT-NEXT: movapd %xmm1, %xmm0
; STRICT-NEXT: cmplepd %xmm2, %xmm0
@@ -1208,7 +1208,7 @@ define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_maxpd:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxpd %xmm1, %xmm0
; RELAX-NEXT: retq
%max_is_x = fcmp oge <2 x double> %x, %y
@@ -1218,7 +1218,7 @@ define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y) {
define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) {
; STRICT-LABEL: test_minpd:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movapd %xmm0, %xmm2
; STRICT-NEXT: cmplepd %xmm1, %xmm0
; STRICT-NEXT: blendvpd %xmm0, %xmm2, %xmm1
@@ -1226,7 +1226,7 @@ define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_minpd:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minpd %xmm1, %xmm0
; RELAX-NEXT: retq
%min_is_x = fcmp ole <2 x double> %x, %y
@@ -1236,7 +1236,7 @@ define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y) {
define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) {
; STRICT-LABEL: test_maxps:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movaps %xmm0, %xmm2
; STRICT-NEXT: movaps %xmm1, %xmm0
; STRICT-NEXT: cmpleps %xmm2, %xmm0
@@ -1245,7 +1245,7 @@ define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_maxps:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxps %xmm1, %xmm0
; RELAX-NEXT: retq
%max_is_x = fcmp oge <4 x float> %x, %y
@@ -1255,7 +1255,7 @@ define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y) {
define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) {
; STRICT-LABEL: test_minps:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movaps %xmm0, %xmm2
; STRICT-NEXT: cmpleps %xmm1, %xmm0
; STRICT-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -1263,7 +1263,7 @@ define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_minps:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minps %xmm1, %xmm0
; RELAX-NEXT: retq
%min_is_x = fcmp ole <4 x float> %x, %y
@@ -1273,7 +1273,7 @@ define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y) {
define <2 x float> @test_maxps_illegal_v2f32(<2 x float> %x, <2 x float> %y) {
; STRICT-LABEL: test_maxps_illegal_v2f32:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movaps %xmm0, %xmm2
; STRICT-NEXT: movaps %xmm1, %xmm0
; STRICT-NEXT: cmpleps %xmm2, %xmm0
@@ -1282,7 +1282,7 @@ define <2 x float> @test_maxps_illegal_v2f32(<2 x float> %x, <2 x float> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_maxps_illegal_v2f32:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxps %xmm1, %xmm0
; RELAX-NEXT: retq
%max_is_x = fcmp oge <2 x float> %x, %y
@@ -1292,7 +1292,7 @@ define <2 x float> @test_maxps_illegal_v2f32(<2 x float> %x, <2 x float> %y) {
define <2 x float> @test_minps_illegal_v2f32(<2 x float> %x, <2 x float> %y) {
; STRICT-LABEL: test_minps_illegal_v2f32:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movaps %xmm0, %xmm2
; STRICT-NEXT: cmpleps %xmm1, %xmm0
; STRICT-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -1300,7 +1300,7 @@ define <2 x float> @test_minps_illegal_v2f32(<2 x float> %x, <2 x float> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_minps_illegal_v2f32:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minps %xmm1, %xmm0
; RELAX-NEXT: retq
%min_is_x = fcmp ole <2 x float> %x, %y
@@ -1310,7 +1310,7 @@ define <2 x float> @test_minps_illegal_v2f32(<2 x float> %x, <2 x float> %y) {
define <3 x float> @test_maxps_illegal_v3f32(<3 x float> %x, <3 x float> %y) {
; STRICT-LABEL: test_maxps_illegal_v3f32:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movaps %xmm0, %xmm2
; STRICT-NEXT: movaps %xmm1, %xmm0
; STRICT-NEXT: cmpleps %xmm2, %xmm0
@@ -1319,7 +1319,7 @@ define <3 x float> @test_maxps_illegal_v3f32(<3 x float> %x, <3 x float> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_maxps_illegal_v3f32:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: maxps %xmm1, %xmm0
; RELAX-NEXT: retq
%max_is_x = fcmp oge <3 x float> %x, %y
@@ -1329,7 +1329,7 @@ define <3 x float> @test_maxps_illegal_v3f32(<3 x float> %x, <3 x float> %y) {
define <3 x float> @test_minps_illegal_v3f32(<3 x float> %x, <3 x float> %y) {
; STRICT-LABEL: test_minps_illegal_v3f32:
-; STRICT: # BB#0:
+; STRICT: # %bb.0:
; STRICT-NEXT: movaps %xmm0, %xmm2
; STRICT-NEXT: cmpleps %xmm1, %xmm0
; STRICT-NEXT: blendvps %xmm0, %xmm2, %xmm1
@@ -1337,7 +1337,7 @@ define <3 x float> @test_minps_illegal_v3f32(<3 x float> %x, <3 x float> %y) {
; STRICT-NEXT: retq
;
; RELAX-LABEL: test_minps_illegal_v3f32:
-; RELAX: # BB#0:
+; RELAX: # %bb.0:
; RELAX-NEXT: minps %xmm1, %xmm0
; RELAX-NEXT: retq
%min_is_x = fcmp ole <3 x float> %x, %y
diff --git a/test/CodeGen/X86/sse-only.ll b/test/CodeGen/X86/sse-only.ll
index 3fe9faaba850..5cc09c52004c 100644
--- a/test/CodeGen/X86/sse-only.ll
+++ b/test/CodeGen/X86/sse-only.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mattr=+sse2,-mmx | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2,-mmx | FileCheck %s
; Test that turning off mmx doesn't turn off sse
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movapd (%ecx), %xmm0
diff --git a/test/CodeGen/X86/sse-regcall.ll b/test/CodeGen/X86/sse-regcall.ll
index 862b9cc92f6c..e7a4c686f874 100644
--- a/test/CodeGen/X86/sse-regcall.ll
+++ b/test/CodeGen/X86/sse-regcall.ll
@@ -75,7 +75,7 @@ define x86_regcallcc i1 @test_CallargReti1(i1 %a) {
; LINUXOSX: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload
; LINUXOSX: retq
-;test calling conventions - input parameters, callee saved XMMs
+;test calling conventions - input parameters, callee saved xmms
define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
%x1 = fadd <16 x float> %a, %b
%x2 = fmul <16 x float> %a, %b
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll b/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
index 63751e1ab7e1..1ed4d3401ca1 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith-unary.ll
@@ -9,12 +9,12 @@
define <4 x float> @recip(<4 x float> %x) {
; SSE-LABEL: recip:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rcpss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: recip:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%y = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %x)
@@ -24,12 +24,12 @@ define <4 x float> @recip(<4 x float> %x) {
define <4 x float> @recip_square_root(<4 x float> %x) {
; SSE-LABEL: recip_square_root:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: recip_square_root:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%y = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %x)
@@ -39,12 +39,12 @@ define <4 x float> @recip_square_root(<4 x float> %x) {
define <4 x float> @square_root(<4 x float> %x) {
; SSE-LABEL: square_root:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: square_root:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%y = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x)
@@ -54,12 +54,12 @@ define <4 x float> @square_root(<4 x float> %x) {
define <2 x double> @square_root_double(<2 x double> %x) {
; SSE-LABEL: square_root_double:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: square_root_double:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%y = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %x)
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index ebc29b1393b0..8761920bb16f 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -10,12 +10,12 @@
define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
@@ -27,12 +27,12 @@ define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
@@ -44,12 +44,12 @@ define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
@@ -61,12 +61,12 @@ define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %b, i32 0
@@ -78,25 +78,25 @@ define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_sqrt_ss(<4 x float> %a) {
; SSE2-LABEL: test_sqrt_ss:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: sqrtss %xmm0, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_sqrt_ss:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: sqrtss %xmm0, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_sqrt_ss:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_sqrt_ss:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX512-NEXT: retq
@@ -109,12 +109,12 @@ declare float @llvm.sqrt.f32(float)
define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
@@ -126,12 +126,12 @@ define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
@@ -143,12 +143,12 @@ define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
@@ -160,12 +160,12 @@ define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %b, i32 0
@@ -177,25 +177,25 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test_sqrt_sd(<2 x double> %a) {
; SSE2-LABEL: test_sqrt_sd:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: sqrtsd %xmm0, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_sqrt_sd:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: sqrtsd %xmm0, %xmm1
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_sqrt_sd:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_sqrt_sd:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX512-NEXT: retq
@@ -208,13 +208,13 @@ declare double @llvm.sqrt.f64(double)
define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
@@ -226,13 +226,13 @@ define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
@@ -244,13 +244,13 @@ define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
@@ -262,13 +262,13 @@ define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %a, i32 0
@@ -280,13 +280,13 @@ define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
@@ -298,13 +298,13 @@ define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
@@ -316,13 +316,13 @@ define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
@@ -334,13 +334,13 @@ define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %a, i32 0
@@ -352,13 +352,13 @@ define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_multiple_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -372,14 +372,14 @@ define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm1, %xmm2
; SSE-NEXT: subss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_multiple_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -393,13 +393,13 @@ define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_multiple_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -413,14 +413,14 @@ define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: divss %xmm1, %xmm2
; SSE-NEXT: divss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_multiple_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm1
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -437,12 +437,12 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
; SSE-LABEL: blend_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -455,12 +455,12 @@ define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
; SSE-LABEL: blend_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -473,12 +473,12 @@ define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
; SSE-LABEL: blend_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -491,12 +491,12 @@ define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
; SSE-LABEL: blend_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -509,12 +509,12 @@ define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
; SSE-LABEL: blend_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -527,12 +527,12 @@ define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
; SSE-LABEL: blend_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -545,12 +545,12 @@ define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
; SSE-LABEL: blend_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -563,12 +563,12 @@ define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
; SSE-LABEL: blend_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -584,12 +584,12 @@ define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fadd <4 x float> %a, %b
@@ -599,12 +599,12 @@ define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fsub <4 x float> %a, %b
@@ -614,12 +614,12 @@ define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fmul <4 x float> %a, %b
@@ -629,12 +629,12 @@ define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fdiv <4 x float> %a, %b
@@ -644,12 +644,12 @@ define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fadd <2 x double> %a, %b
@@ -659,12 +659,12 @@ define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fsub <2 x double> %a, %b
@@ -674,12 +674,12 @@ define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fmul <2 x double> %a, %b
@@ -689,12 +689,12 @@ define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fdiv <2 x double> %a, %b
@@ -704,13 +704,13 @@ define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fadd <4 x float> %b, %a
@@ -720,13 +720,13 @@ define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fsub <4 x float> %b, %a
@@ -736,13 +736,13 @@ define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fmul <4 x float> %b, %a
@@ -752,13 +752,13 @@ define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fdiv <4 x float> %b, %a
@@ -768,13 +768,13 @@ define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fadd <2 x double> %b, %a
@@ -784,13 +784,13 @@ define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fsub <2 x double> %b, %a
@@ -800,13 +800,13 @@ define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fmul <2 x double> %b, %a
@@ -816,13 +816,13 @@ define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test2_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fdiv <2 x double> %b, %a
@@ -832,12 +832,12 @@ define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fadd <4 x float> %a, %b
@@ -847,12 +847,12 @@ define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fsub <4 x float> %a, %b
@@ -862,12 +862,12 @@ define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fmul <4 x float> %a, %b
@@ -877,12 +877,12 @@ define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fdiv <4 x float> %a, %b
@@ -892,12 +892,12 @@ define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fadd <2 x double> %a, %b
@@ -907,12 +907,12 @@ define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fsub <2 x double> %a, %b
@@ -922,12 +922,12 @@ define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fmul <2 x double> %a, %b
@@ -937,12 +937,12 @@ define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test3_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = fdiv <2 x double> %a, %b
@@ -952,13 +952,13 @@ define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_add_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_add_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fadd <4 x float> %b, %a
@@ -968,13 +968,13 @@ define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_sub_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_sub_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fsub <4 x float> %b, %a
@@ -984,13 +984,13 @@ define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_mul_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_mul_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fmul <4 x float> %b, %a
@@ -1000,13 +1000,13 @@ define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_div_ss:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_div_ss:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fdiv <4 x float> %b, %a
@@ -1016,13 +1016,13 @@ define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_add_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_add_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fadd <2 x double> %b, %a
@@ -1032,13 +1032,13 @@ define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_sub_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_sub_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fsub <2 x double> %b, %a
@@ -1048,13 +1048,13 @@ define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_mul_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: mulsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_mul_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fmul <2 x double> %b, %a
@@ -1064,13 +1064,13 @@ define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_div_sd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_test4_div_sd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fdiv <2 x double> %b, %a
@@ -1080,10 +1080,10 @@ define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; SSE2-LABEL: add_ss_mask:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: testb $1, %dil
; SSE2-NEXT: jne .LBB62_1
-; SSE2-NEXT: # BB#2:
+; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: movaps %xmm2, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
@@ -1093,10 +1093,10 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
; SSE2-NEXT: retq
;
; SSE41-LABEL: add_ss_mask:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: jne .LBB62_1
-; SSE41-NEXT: # BB#2:
+; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: movaps %xmm2, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
@@ -1106,17 +1106,17 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
; SSE41-NEXT: retq
;
; AVX1-LABEL: add_ss_mask:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: testb $1, %dil
; AVX1-NEXT: je .LBB62_2
-; AVX1-NEXT: # BB#1:
+; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: .LBB62_2:
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; AVX1-NEXT: retq
;
; AVX512-LABEL: add_ss_mask:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1}
@@ -1135,10 +1135,10 @@ define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c,
define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; SSE2-LABEL: add_sd_mask:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: testb $1, %dil
; SSE2-NEXT: jne .LBB63_1
-; SSE2-NEXT: # BB#2:
+; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: movapd %xmm2, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
@@ -1148,10 +1148,10 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; SSE2-NEXT: retq
;
; SSE41-LABEL: add_sd_mask:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: testb $1, %dil
; SSE41-NEXT: jne .LBB63_1
-; SSE41-NEXT: # BB#2:
+; SSE41-NEXT: # %bb.2:
; SSE41-NEXT: movapd %xmm2, %xmm1
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
@@ -1161,17 +1161,17 @@ define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double>
; SSE41-NEXT: retq
;
; AVX1-LABEL: add_sd_mask:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: testb $1, %dil
; AVX1-NEXT: je .LBB63_2
-; AVX1-NEXT: # BB#1:
+; AVX1-NEXT: # %bb.1:
; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: .LBB63_2:
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; AVX1-NEXT: retq
;
; AVX512-LABEL: add_sd_mask:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: kmovw %edi, %k1
; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1}
diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll
index f44cee9db22c..61f1d6b353ef 100644
--- a/test/CodeGen/X86/sse-schedule.ll
+++ b/test/CodeGen/X86/sse-schedule.ll
@@ -1,56 +1,76 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_addps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: addps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: addps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_addps:
-; ATOM: # BB#0:
-; ATOM-NEXT: addps %xmm1, %xmm0
-; ATOM-NEXT: addps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: addps (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_addps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_addps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addps:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fadd <4 x float> %1, %2
@@ -59,46 +79,64 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
define float @test_addss(float %a0, float %a1, float *%a2) {
; GENERIC-LABEL: test_addss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addss %xmm1, %xmm0
-; GENERIC-NEXT: addss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: addss (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_addss:
-; ATOM: # BB#0:
-; ATOM-NEXT: addss %xmm1, %xmm0
-; ATOM-NEXT: addss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: addss (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_addss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addss (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_addss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addss:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fadd float %1, %2
@@ -107,54 +145,68 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_andps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andps %xmm1, %xmm0
-; GENERIC-NEXT: andps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: andps (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_andps:
-; ATOM: # BB#0:
-; ATOM-NEXT: andps %xmm1, %xmm0
-; ATOM-NEXT: andps (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: andps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: andps (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_andps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: andps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: andps (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_andps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andps:
+; SKX: # %bb.0:
+; SKX-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = and <4 x i32> %1, %2
@@ -167,54 +219,68 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_andnotps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andnps %xmm1, %xmm0
-; GENERIC-NEXT: andnps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andnps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: andnps (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_andnotps:
-; ATOM: # BB#0:
-; ATOM-NEXT: andnps %xmm1, %xmm0
-; ATOM-NEXT: andnps (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: andnps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: andnps (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_andnotps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: andnps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: andnps (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_andnotps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andnotps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andnotps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andnotps:
+; SKX: # %bb.0:
+; SKX-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andnotps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -229,53 +295,75 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_cmpps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cmpeqps %xmm0, %xmm1
-; GENERIC-NEXT: cmpeqps (%rdi), %xmm0
-; GENERIC-NEXT: orps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpeqps %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: cmpeqps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: orps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cmpps:
-; ATOM: # BB#0:
-; ATOM-NEXT: cmpeqps %xmm0, %xmm1
-; ATOM-NEXT: cmpeqps (%rdi), %xmm0
-; ATOM-NEXT: orps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cmpeqps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: cmpeqps (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: orps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cmpps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cmpeqps %xmm0, %xmm1 # sched: [3:1.00]
; SLM-NEXT: cmpeqps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: orps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cmpps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; HASWELL-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpps:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqps %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vcmpeqps (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cmpps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmpps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fcmp oeq <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fcmp oeq <4 x float> %a0, %2
@@ -287,46 +375,64 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
define float @test_cmpss(float %a0, float %a1, float *%a2) {
; GENERIC-LABEL: test_cmpss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cmpeqss %xmm1, %xmm0
-; GENERIC-NEXT: cmpeqss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpeqss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: cmpeqss (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cmpss:
-; ATOM: # BB#0:
-; ATOM-NEXT: cmpeqss %xmm1, %xmm0
-; ATOM-NEXT: cmpeqss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cmpeqss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: cmpeqss (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cmpss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cmpeqss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: cmpeqss (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cmpss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpss:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cmpss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmpss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = insertelement <4 x float> undef, float %a1, i32 0
%3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0)
@@ -340,35 +446,35 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_comiss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: comiss %xmm1, %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %cl
-; GENERIC-NEXT: andb %al, %cl
-; GENERIC-NEXT: comiss (%rdi), %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %dl
-; GENERIC-NEXT: andb %al, %dl
-; GENERIC-NEXT: orb %cl, %dl
-; GENERIC-NEXT: movzbl %dl, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: comiss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %cl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %cl # sched: [1:0.33]
+; GENERIC-NEXT: comiss (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %dl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %dl # sched: [1:0.33]
+; GENERIC-NEXT: orb %cl, %dl # sched: [1:0.33]
+; GENERIC-NEXT: movzbl %dl, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_comiss:
-; ATOM: # BB#0:
-; ATOM-NEXT: comiss %xmm1, %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %cl
-; ATOM-NEXT: andb %al, %cl
-; ATOM-NEXT: comiss (%rdi), %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %dl
-; ATOM-NEXT: andb %al, %dl
-; ATOM-NEXT: orb %cl, %dl
-; ATOM-NEXT: movzbl %dl, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: comiss %xmm1, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %cl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %cl # sched: [1:0.50]
+; ATOM-NEXT: comiss (%rdi), %xmm0 # sched: [10:5.00]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %dl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %dl # sched: [1:0.50]
+; ATOM-NEXT: orb %cl, %dl # sched: [1:0.50]
+; ATOM-NEXT: movzbl %dl, %eax # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_comiss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: comiss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: setnp %al # sched: [1:0.50]
; SLM-NEXT: sete %cl # sched: [1:0.50]
@@ -382,35 +488,77 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_comiss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %cl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %dl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comiss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %cl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
-; HASWELL-NEXT: vcomiss (%rdi), %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcomiss (%rdi), %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %dl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_comiss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %cl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %cl # sched: [1:0.25]
+; BROADWELL-NEXT: vcomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %dl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_comiss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %cl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKYLAKE-NEXT: vcomiss (%rdi), %xmm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %dl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_comiss:
+; SKX: # %bb.0:
+; SKX-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %cl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKX-NEXT: vcomiss (%rdi), %xmm0 # sched: [8:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %dl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKX-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKX-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_comiss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setnp %al # sched: [1:0.50]
; BTVER2-NEXT: sete %cl # sched: [1:0.50]
@@ -424,7 +572,7 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_comiss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
@@ -435,7 +583,7 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 4
%3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2)
@@ -446,53 +594,74 @@ declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_cvtsi2ss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsi2ssl %edi, %xmm1
-; GENERIC-NEXT: cvtsi2ssl (%rsi), %xmm0
-; GENERIC-NEXT: addss %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsi2ssl %edi, %xmm1 # sched: [5:2.00]
+; GENERIC-NEXT: cvtsi2ssl (%rsi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsi2ss:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtsi2ssl (%rsi), %xmm0
-; ATOM-NEXT: cvtsi2ssl %edi, %xmm1
-; ATOM-NEXT: addss %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtsi2ssl (%rsi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: cvtsi2ssl %edi, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: addss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsi2ss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsi2ssl (%rsi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: cvtsi2ssl %edi, %xmm1 # sched: [4:0.50]
; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsi2ss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsi2ss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsi2ss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsi2ss:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsi2ss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsi2ss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp i32 %a0 to float
%2 = load i32, i32 *%a1, align 4
%3 = sitofp i32 %2 to float
@@ -502,53 +671,74 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_cvtsi2ssq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsi2ssq %rdi, %xmm1
-; GENERIC-NEXT: cvtsi2ssq (%rsi), %xmm0
-; GENERIC-NEXT: addss %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsi2ssq %rdi, %xmm1 # sched: [5:2.00]
+; GENERIC-NEXT: cvtsi2ssq (%rsi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsi2ssq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtsi2ssq (%rsi), %xmm0
-; ATOM-NEXT: cvtsi2ssq %rdi, %xmm1
-; ATOM-NEXT: addss %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtsi2ssq (%rsi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: cvtsi2ssq %rdi, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: addss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsi2ssq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsi2ssq (%rsi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: cvtsi2ssq %rdi, %xmm1 # sched: [4:0.50]
; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsi2ssq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2ssq:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; HASWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsi2ssq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsi2ssq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsi2ssq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [6:2.00]
+; SKX-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsi2ssq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsi2ssq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp i64 %a0 to float
%2 = load i64, i64 *%a1, align 8
%3 = sitofp i64 %2 to float
@@ -558,53 +748,74 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
define i32 @test_cvtss2si(float %a0, float *%a1) {
; GENERIC-LABEL: test_cvtss2si:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtss2si %xmm0, %ecx
-; GENERIC-NEXT: cvtss2si (%rdi), %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtss2si %xmm0, %ecx # sched: [5:1.00]
+; GENERIC-NEXT: cvtss2si (%rdi), %eax # sched: [9:1.00]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtss2si:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtss2si (%rdi), %eax
-; ATOM-NEXT: cvtss2si %xmm0, %ecx
-; ATOM-NEXT: addl %ecx, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtss2si (%rdi), %eax # sched: [9:4.50]
+; ATOM-NEXT: cvtss2si %xmm0, %ecx # sched: [8:4.00]
+; ATOM-NEXT: addl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtss2si:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtss2si (%rdi), %eax # sched: [7:1.00]
; SLM-NEXT: cvtss2si %xmm0, %ecx # sched: [4:0.50]
; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtss2si:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2si:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [9:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtss2si:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtss2si (%rdi), %eax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvtss2si %xmm0, %ecx # sched: [4:1.00]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtss2si:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtss2si %xmm0, %ecx # sched: [6:1.00]
+; SKYLAKE-NEXT: vcvtss2si (%rdi), %eax # sched: [11:1.00]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtss2si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtss2si %xmm0, %ecx # sched: [6:1.00]
+; SKX-NEXT: vcvtss2si (%rdi), %eax # sched: [11:1.00]
+; SKX-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtss2si:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtss2si (%rdi), %eax # sched: [8:1.00]
; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtss2si:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00]
; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1)
%3 = load float, float *%a1, align 4
@@ -617,53 +828,74 @@ declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
define i64 @test_cvtss2siq(float %a0, float *%a1) {
; GENERIC-LABEL: test_cvtss2siq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtss2si %xmm0, %rcx
-; GENERIC-NEXT: cvtss2si (%rdi), %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtss2si %xmm0, %rcx # sched: [5:1.00]
+; GENERIC-NEXT: cvtss2si (%rdi), %rax # sched: [9:1.00]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtss2siq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtss2si (%rdi), %rax
-; ATOM-NEXT: cvtss2si %xmm0, %rcx
-; ATOM-NEXT: addq %rcx, %rax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtss2si (%rdi), %rax # sched: [10:5.00]
+; ATOM-NEXT: cvtss2si %xmm0, %rcx # sched: [9:4.50]
+; ATOM-NEXT: addq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtss2siq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtss2si (%rdi), %rax # sched: [7:1.00]
; SLM-NEXT: cvtss2si %xmm0, %rcx # sched: [4:0.50]
; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtss2siq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2siq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [9:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtss2siq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtss2si (%rdi), %rax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvtss2si %xmm0, %rcx # sched: [4:1.00]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtss2siq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtss2si %xmm0, %rcx # sched: [6:1.00]
+; SKYLAKE-NEXT: vcvtss2si (%rdi), %rax # sched: [11:1.00]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtss2siq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtss2si %xmm0, %rcx # sched: [6:1.00]
+; SKX-NEXT: vcvtss2si (%rdi), %rax # sched: [11:1.00]
+; SKX-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtss2siq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtss2si (%rdi), %rax # sched: [8:1.00]
; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtss2siq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00]
; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1)
%3 = load float, float *%a1, align 4
@@ -676,53 +908,74 @@ declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
define i32 @test_cvttss2si(float %a0, float *%a1) {
; GENERIC-LABEL: test_cvttss2si:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvttss2si %xmm0, %ecx
-; GENERIC-NEXT: cvttss2si (%rdi), %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttss2si %xmm0, %ecx # sched: [5:1.00]
+; GENERIC-NEXT: cvttss2si (%rdi), %eax # sched: [9:1.00]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvttss2si:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvttss2si (%rdi), %eax
-; ATOM-NEXT: cvttss2si %xmm0, %ecx
-; ATOM-NEXT: addl %ecx, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttss2si (%rdi), %eax # sched: [9:4.50]
+; ATOM-NEXT: cvttss2si %xmm0, %ecx # sched: [8:4.00]
+; ATOM-NEXT: addl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvttss2si:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvttss2si (%rdi), %eax # sched: [7:1.00]
; SLM-NEXT: cvttss2si %xmm0, %ecx # sched: [4:0.50]
; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvttss2si:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2si:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [9:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttss2si:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttss2si (%rdi), %eax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvttss2si %xmm0, %ecx # sched: [4:1.00]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttss2si:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttss2si %xmm0, %ecx # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvttss2si (%rdi), %eax # sched: [11:1.00]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttss2si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttss2si %xmm0, %ecx # sched: [7:1.00]
+; SKX-NEXT: vcvttss2si (%rdi), %eax # sched: [11:1.00]
+; SKX-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvttss2si:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvttss2si (%rdi), %eax # sched: [8:1.00]
; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttss2si:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00]
; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi float %a0 to i32
%2 = load float, float *%a1, align 4
%3 = fptosi float %2 to i32
@@ -732,53 +985,74 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
define i64 @test_cvttss2siq(float %a0, float *%a1) {
; GENERIC-LABEL: test_cvttss2siq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvttss2si %xmm0, %rcx
-; GENERIC-NEXT: cvttss2si (%rdi), %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttss2si %xmm0, %rcx # sched: [5:1.00]
+; GENERIC-NEXT: cvttss2si (%rdi), %rax # sched: [9:1.00]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvttss2siq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvttss2si (%rdi), %rax
-; ATOM-NEXT: cvttss2si %xmm0, %rcx
-; ATOM-NEXT: addq %rcx, %rax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttss2si (%rdi), %rax # sched: [10:5.00]
+; ATOM-NEXT: cvttss2si %xmm0, %rcx # sched: [9:4.50]
+; ATOM-NEXT: addq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvttss2siq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvttss2si (%rdi), %rax # sched: [7:1.00]
; SLM-NEXT: cvttss2si %xmm0, %rcx # sched: [4:0.50]
; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvttss2siq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttss2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttss2siq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [9:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttss2siq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttss2si (%rdi), %rax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvttss2si %xmm0, %rcx # sched: [4:1.00]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttss2siq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttss2si %xmm0, %rcx # sched: [7:1.00]
+; SKYLAKE-NEXT: vcvttss2si (%rdi), %rax # sched: [11:1.00]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttss2siq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttss2si %xmm0, %rcx # sched: [7:1.00]
+; SKX-NEXT: vcvttss2si (%rdi), %rax # sched: [11:1.00]
+; SKX-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvttss2siq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvttss2si (%rdi), %rax # sched: [8:1.00]
; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttss2siq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00]
; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi float %a0 to i64
%2 = load float, float *%a1, align 4
%3 = fptosi float %2 to i64
@@ -788,46 +1062,64 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_divps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: divps %xmm1, %xmm0
-; GENERIC-NEXT: divps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: divps %xmm1, %xmm0 # sched: [14:1.00]
+; GENERIC-NEXT: divps (%rdi), %xmm0 # sched: [20:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_divps:
-; ATOM: # BB#0:
-; ATOM-NEXT: divps %xmm1, %xmm0
-; ATOM-NEXT: divps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: divps %xmm1, %xmm0 # sched: [70:35.00]
+; ATOM-NEXT: divps (%rdi), %xmm0 # sched: [125:62.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_divps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: divps %xmm1, %xmm0 # sched: [34:34.00]
; SLM-NEXT: divps (%rdi), %xmm0 # sched: [37:34.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_divps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [19:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_divps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BROADWELL-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_divps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; SKYLAKE-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [17:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_divps:
+; SKX: # %bb.0:
+; SKX-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; SKX-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [17:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_divps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fdiv <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fdiv <4 x float> %1, %2
@@ -836,46 +1128,64 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
define float @test_divss(float %a0, float %a1, float *%a2) {
; GENERIC-LABEL: test_divss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: divss %xmm1, %xmm0
-; GENERIC-NEXT: divss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: divss %xmm1, %xmm0 # sched: [14:1.00]
+; GENERIC-NEXT: divss (%rdi), %xmm0 # sched: [20:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_divss:
-; ATOM: # BB#0:
-; ATOM-NEXT: divss %xmm1, %xmm0
-; ATOM-NEXT: divss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: divss %xmm1, %xmm0 # sched: [34:17.00]
+; ATOM-NEXT: divss (%rdi), %xmm0 # sched: [62:31.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_divss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: divss %xmm1, %xmm0 # sched: [34:34.00]
; SLM-NEXT: divss (%rdi), %xmm0 # sched: [37:34.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_divss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divss:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [13:1.00]
+; HASWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [18:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_divss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; BROADWELL-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_divss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; SKYLAKE-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_divss:
+; SKX: # %bb.0:
+; SKX-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
+; SKX-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_divss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
; ZNVER1-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fdiv float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fdiv float %1, %2
@@ -884,46 +1194,64 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
define void @test_ldmxcsr(i32 %a0) {
; GENERIC-LABEL: test_ldmxcsr:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
-; GENERIC-NEXT: ldmxcsr -{{[0-9]+}}(%rsp)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_ldmxcsr:
-; ATOM: # BB#0:
-; ATOM-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
-; ATOM-NEXT: ldmxcsr -{{[0-9]+}}(%rsp)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; ATOM-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:2.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_ldmxcsr:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
; SLM-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_ldmxcsr:
-; SANDY: # BB#0:
-; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ldmxcsr:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [6:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ldmxcsr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; BROADWELL-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ldmxcsr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKYLAKE-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ldmxcsr:
+; SKX: # %bb.0:
+; SKX-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; SKX-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ldmxcsr:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ldmxcsr:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
-; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = alloca i32, align 4
%2 = bitcast i32* %1 to i8*
store i32 %a0, i32* %1
@@ -934,46 +1262,64 @@ declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_maxps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: maxps %xmm1, %xmm0
-; GENERIC-NEXT: maxps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: maxps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: maxps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_maxps:
-; ATOM: # BB#0:
-; ATOM-NEXT: maxps %xmm1, %xmm0
-; ATOM-NEXT: maxps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: maxps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: maxps (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_maxps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: maxps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: maxps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_maxps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maxps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maxps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maxps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maxps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2)
@@ -983,46 +1329,64 @@ declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_maxss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: maxss %xmm1, %xmm0
-; GENERIC-NEXT: maxss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: maxss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: maxss (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_maxss:
-; ATOM: # BB#0:
-; ATOM-NEXT: maxss %xmm1, %xmm0
-; ATOM-NEXT: maxss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: maxss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: maxss (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_maxss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: maxss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: maxss (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_maxss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maxss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maxss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maxss:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maxss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2)
@@ -1032,46 +1396,64 @@ declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_minps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: minps %xmm1, %xmm0
-; GENERIC-NEXT: minps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: minps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: minps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_minps:
-; ATOM: # BB#0:
-; ATOM-NEXT: minps %xmm1, %xmm0
-; ATOM-NEXT: minps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: minps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: minps (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_minps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: minps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: minps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_minps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_minps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_minps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_minps:
+; SKX: # %bb.0:
+; SKX-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_minps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2)
@@ -1081,46 +1463,64 @@ declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind read
define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_minss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: minss %xmm1, %xmm0
-; GENERIC-NEXT: minss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: minss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: minss (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_minss:
-; ATOM: # BB#0:
-; ATOM-NEXT: minss %xmm1, %xmm0
-; ATOM-NEXT: minss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: minss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: minss (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_minss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: minss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: minss (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_minss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_minss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_minss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_minss:
+; SKX: # %bb.0:
+; SKX-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_minss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2)
@@ -1130,53 +1530,74 @@ declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind read
define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_movaps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movaps (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm0, %xmm0
-; GENERIC-NEXT: movaps %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movaps (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: addps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movaps %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movaps:
-; ATOM: # BB#0:
-; ATOM-NEXT: movaps (%rdi), %xmm0
-; ATOM-NEXT: addps %xmm0, %xmm0
-; ATOM-NEXT: movaps %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movaps (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: addps %xmm0, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm0, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movaps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movaps (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: addps %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movaps %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movaps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovaps %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movaps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movaps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovaps (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movaps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movaps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovaps (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movaps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movaps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <4 x float>, <4 x float> *%a0, align 16
%2 = fadd <4 x float> %1, %1
store <4 x float> %2, <4 x float> *%a1, align 16
@@ -1187,45 +1608,60 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
; GENERIC-LABEL: test_movhlps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movhlps:
-; ATOM: # BB#0:
-; ATOM-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movhlps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movhlps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhlps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movhlps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movhlps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movhlps:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movhlps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movhlps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
ret <4 x float> %1
}
@@ -1234,55 +1670,76 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
; GENERIC-LABEL: test_movhps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; GENERIC-NEXT: addps %xmm0, %xmm1
-; GENERIC-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; GENERIC-NEXT: movlps %xmm1, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] sched: [1:1.00]
+; GENERIC-NEXT: movlps %xmm1, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movhps:
-; ATOM: # BB#0:
-; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; ATOM-NEXT: movlps %xmm1, (%rdi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] sched: [1:1.00]
+; ATOM-NEXT: movlps %xmm1, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movhps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
; SLM-NEXT: pextrq $1, %xmm1, (%rdi) # sched: [4:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movhps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movhps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movhps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movhps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movhps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movhps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast x86_mmx* %a2 to <2 x float>*
%2 = load <2 x float>, <2 x float> *%1, align 8
%3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1297,46 +1754,64 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
; GENERIC-LABEL: test_movlhps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movlhps:
-; ATOM: # BB#0:
-; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; ATOM-NEXT: addps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movlhps:
-; SLM: # BB#0:
-; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SLM: # %bb.0:
+; SLM-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movlhps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlhps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movlhps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BROADWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movlhps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movlhps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movlhps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movlhps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = fadd <4 x float> %a1, %1
ret <4 x float> %2
@@ -1344,53 +1819,74 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
; GENERIC-LABEL: test_movlps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
-; GENERIC-NEXT: addps %xmm0, %xmm1
-; GENERIC-NEXT: movlps %xmm1, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
+; GENERIC-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: movlps %xmm1, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movlps:
-; ATOM: # BB#0:
-; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movlps %xmm1, (%rdi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movlps %xmm1, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movlps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [4:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
; SLM-NEXT: movlps %xmm1, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movlps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movlps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movlps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movlps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movlps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movlps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast x86_mmx* %a2 to <2 x float>*
%2 = load <2 x float>, <2 x float> *%1, align 8
%3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1403,41 +1899,56 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
define i32 @test_movmskps(<4 x float> %a0) {
; GENERIC-LABEL: test_movmskps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movmskps %xmm0, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movmskps %xmm0, %eax # sched: [2:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movmskps:
-; ATOM: # BB#0:
-; ATOM-NEXT: movmskps %xmm0, %eax
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movmskps %xmm0, %eax # sched: [3:3.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movmskps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movmskps %xmm0, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movmskps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movmskps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movmskps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movmskps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovmskps %xmm0, %eax # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movmskps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movmskps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
ret i32 %1
}
@@ -1445,98 +1956,134 @@ declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_movntps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movntps %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movntps %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movntps:
-; ATOM: # BB#0:
-; ATOM-NEXT: movntps %xmm0, (%rdi)
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movntps %xmm0, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movntps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movntps %xmm0, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movntps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovntps %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movntps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
store <4 x float> %a0, <4 x float> *%a1, align 16, !nontemporal !0
ret void
}
define void @test_movss_mem(float* %a0, float* %a1) {
; GENERIC-LABEL: test_movss_mem:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; GENERIC-NEXT: addss %xmm0, %xmm0
-; GENERIC-NEXT: movss %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: addss %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movss %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movss_mem:
-; ATOM: # BB#0:
-; ATOM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ATOM-NEXT: addss %xmm0, %xmm0
-; ATOM-NEXT: movss %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:1.00]
+; ATOM-NEXT: addss %xmm0, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: movss %xmm0, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movss_mem:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [3:1.00]
; SLM-NEXT: addss %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movss %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movss_mem:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovss %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_mem:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movss_mem:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; BROADWELL-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movss_mem:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKYLAKE-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movss_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movss_mem:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movss_mem:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load float, float* %a0, align 1
%2 = fadd float %1, %1
store float %2, float *%a1, align 1
@@ -1545,98 +2092,134 @@ define void @test_movss_mem(float* %a0, float* %a1) {
define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
; GENERIC-LABEL: test_movss_reg:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movss_reg:
-; ATOM: # BB#0:
-; ATOM-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movss_reg:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movss_reg:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movss_reg:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movss_reg:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movss_reg:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movss_reg:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movss_reg:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movss_reg:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %1
}
define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_movups:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movups (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm0, %xmm0
-; GENERIC-NEXT: movups %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movups (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: addps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movups %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movups:
-; ATOM: # BB#0:
-; ATOM-NEXT: movups (%rdi), %xmm0
-; ATOM-NEXT: addps %xmm0, %xmm0
-; ATOM-NEXT: movups %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movups (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: addps %xmm0, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: movups %xmm0, (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movups:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movups (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: addps %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movups %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movups:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovups %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movups:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
; HASWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movups:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovups (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movups:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movups:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movups:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovups (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movups:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <4 x float>, <4 x float> *%a0, align 1
%2 = fadd <4 x float> %1, %1
store <4 x float> %2, <4 x float> *%a1, align 1
@@ -1645,46 +2228,64 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_mulps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: mulps %xmm1, %xmm0
-; GENERIC-NEXT: mulps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: mulps %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: mulps (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_mulps:
-; ATOM: # BB#0:
-; ATOM-NEXT: mulps %xmm1, %xmm0
-; ATOM-NEXT: mulps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: mulps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: mulps (%rdi), %xmm0 # sched: [10:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_mulps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: mulps %xmm1, %xmm0 # sched: [5:2.00]
; SLM-NEXT: mulps (%rdi), %xmm0 # sched: [8:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_mulps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BROADWELL-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mulps:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mulps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fmul <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fmul <4 x float> %1, %2
@@ -1693,46 +2294,64 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
define float @test_mulss(float %a0, float %a1, float *%a2) {
; GENERIC-LABEL: test_mulss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: mulss %xmm1, %xmm0
-; GENERIC-NEXT: mulss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: mulss %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: mulss (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_mulss:
-; ATOM: # BB#0:
-; ATOM-NEXT: mulss %xmm1, %xmm0
-; ATOM-NEXT: mulss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: mulss %xmm1, %xmm0 # sched: [4:4.00]
+; ATOM-NEXT: mulss (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_mulss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: mulss %xmm1, %xmm0 # sched: [5:2.00]
; SLM-NEXT: mulss (%rdi), %xmm0 # sched: [8:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_mulss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BROADWELL-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mulss:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mulss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulss:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fmul float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fmul float %1, %2
@@ -1741,54 +2360,68 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_orps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: orps %xmm1, %xmm0
-; GENERIC-NEXT: orps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: orps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: orps (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_orps:
-; ATOM: # BB#0:
-; ATOM-NEXT: orps %xmm1, %xmm0
-; ATOM-NEXT: orps (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: orps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: orps (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_orps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: orps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: orps (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_orps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_orps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_orps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_orps:
+; SKX: # %bb.0:
+; SKX-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_orps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = or <4 x i32> %1, %2
@@ -1799,70 +2432,128 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
ret <4 x float> %7
}
-define void @test_prefetchnta(i8* %a0) {
-; GENERIC-LABEL: test_prefetchnta:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: prefetchnta (%rdi)
-; GENERIC-NEXT: retq
-;
-; ATOM-LABEL: test_prefetchnta:
-; ATOM: # BB#0:
-; ATOM-NEXT: prefetchnta (%rdi)
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
-;
-; SLM-LABEL: test_prefetchnta:
-; SLM: # BB#0:
+define void @test_prefetch(i8* %a0) optsize {
+; GENERIC-LABEL: test_prefetch:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; GENERIC-NEXT: prefetcht0 (%rdi) # sched: [5:0.50]
+; GENERIC-NEXT: prefetcht1 (%rdi) # sched: [5:0.50]
+; GENERIC-NEXT: prefetcht2 (%rdi) # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_prefetch:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: prefetchnta (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: prefetcht0 (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: prefetcht1 (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: prefetcht2 (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_prefetch:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
; SLM-NEXT: prefetchnta (%rdi) # sched: [3:1.00]
+; SLM-NEXT: prefetcht0 (%rdi) # sched: [3:1.00]
+; SLM-NEXT: prefetcht1 (%rdi) # sched: [3:1.00]
+; SLM-NEXT: prefetcht2 (%rdi) # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
; SLM-NEXT: retq # sched: [4:1.00]
;
-; SANDY-LABEL: test_prefetchnta:
-; SANDY: # BB#0:
-; SANDY-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
-;
-; HASWELL-LABEL: test_prefetchnta:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: prefetchnta (%rdi) # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
-;
-; BTVER2-LABEL: test_prefetchnta:
-; BTVER2: # BB#0:
+; SANDY-LABEL: test_prefetch:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: prefetcht0 (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: prefetcht1 (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: prefetcht2 (%rdi) # sched: [5:0.50]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_prefetch:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; HASWELL-NEXT: prefetcht0 (%rdi) # sched: [5:0.50]
+; HASWELL-NEXT: prefetcht1 (%rdi) # sched: [5:0.50]
+; HASWELL-NEXT: prefetcht2 (%rdi) # sched: [5:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_prefetch:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; BROADWELL-NEXT: prefetcht0 (%rdi) # sched: [5:0.50]
+; BROADWELL-NEXT: prefetcht1 (%rdi) # sched: [5:0.50]
+; BROADWELL-NEXT: prefetcht2 (%rdi) # sched: [5:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_prefetch:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; SKYLAKE-NEXT: prefetcht0 (%rdi) # sched: [5:0.50]
+; SKYLAKE-NEXT: prefetcht1 (%rdi) # sched: [5:0.50]
+; SKYLAKE-NEXT: prefetcht2 (%rdi) # sched: [5:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_prefetch:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: prefetchnta (%rdi) # sched: [5:0.50]
+; SKX-NEXT: prefetcht0 (%rdi) # sched: [5:0.50]
+; SKX-NEXT: prefetcht1 (%rdi) # sched: [5:0.50]
+; SKX-NEXT: prefetcht2 (%rdi) # sched: [5:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_prefetch:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00]
+; BTVER2-NEXT: prefetcht0 (%rdi) # sched: [5:1.00]
+; BTVER2-NEXT: prefetcht1 (%rdi) # sched: [5:1.00]
+; BTVER2-NEXT: prefetcht2 (%rdi) # sched: [5:1.00]
+; BTVER2-NEXT: #NO_APP
; BTVER2-NEXT: retq # sched: [4:1.00]
;
-; ZNVER1-LABEL: test_prefetchnta:
-; ZNVER1: # BB#0:
+; ZNVER1-LABEL: test_prefetch:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
- call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
+; ZNVER1-NEXT: prefetcht0 (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: prefetcht1 (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: prefetcht2 (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ call void asm sideeffect "prefetchnta $0 \0A\09 prefetcht0 $0 \0A\09 prefetcht1 $0 \0A\09 prefetcht2 $0", "*m"(i8 *%a0)
ret void
}
-declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_rcpps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: rcpps %xmm0, %xmm1
-; GENERIC-NEXT: rcpps (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rcpps %xmm0, %xmm1 # sched: [5:1.00]
+; GENERIC-NEXT: rcpps (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_rcpps:
-; ATOM: # BB#0:
-; ATOM-NEXT: rcpps (%rdi), %xmm1
-; ATOM-NEXT: rcpps %xmm0, %xmm0
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: rcpps (%rdi), %xmm1 # sched: [10:5.00]
+; ATOM-NEXT: rcpps %xmm0, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_rcpps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: rcpps (%rdi), %xmm1 # sched: [8:1.00]
; SLM-NEXT: rcpps %xmm0, %xmm0 # sched: [5:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
@@ -1870,32 +2561,53 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_rcpps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrcpps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [11:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcpps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcpps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcpps:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrcpps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_rcpps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrcpps (%rdi), %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2)
@@ -1908,23 +2620,23 @@ declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
define <4 x float> @test_rcpss(float %a0, float *%a1) {
; GENERIC-LABEL: test_rcpss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: rcpss %xmm0, %xmm0
-; GENERIC-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; GENERIC-NEXT: rcpss %xmm1, %xmm1
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rcpss %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: rcpss %xmm1, %xmm1 # sched: [5:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_rcpss:
-; ATOM: # BB#0:
-; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ATOM: # %bb.0:
+; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:1.00]
; ATOM-NEXT: rcpss %xmm0, %xmm0
; ATOM-NEXT: rcpss %xmm1, %xmm1
-; ATOM-NEXT: addps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_rcpss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [3:1.00]
; SLM-NEXT: rcpss %xmm0, %xmm0 # sched: [8:1.00]
; SLM-NEXT: rcpss %xmm1, %xmm1 # sched: [8:1.00]
@@ -1932,23 +2644,47 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_rcpss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rcpss:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rcpss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BROADWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rcpss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKYLAKE-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rcpss:
+; SKX: # %bb.0:
+; SKX-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_rcpss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00]
@@ -1956,12 +2692,12 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50]
; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1)
%3 = load float, float *%a1, align 4
@@ -1974,22 +2710,22 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_rsqrtps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: rsqrtps %xmm0, %xmm1
-; GENERIC-NEXT: rsqrtps (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rsqrtps %xmm0, %xmm1 # sched: [5:1.00]
+; GENERIC-NEXT: rsqrtps (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_rsqrtps:
-; ATOM: # BB#0:
-; ATOM-NEXT: rsqrtps (%rdi), %xmm1
-; ATOM-NEXT: rsqrtps %xmm0, %xmm0
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: rsqrtps (%rdi), %xmm1 # sched: [10:5.00]
+; ATOM-NEXT: rsqrtps %xmm0, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_rsqrtps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: rsqrtps (%rdi), %xmm1 # sched: [8:1.00]
; SLM-NEXT: rsqrtps %xmm0, %xmm0 # sched: [5:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
@@ -1997,32 +2733,53 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_rsqrtps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; SANDY-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [9:1.00]
+; HASWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [11:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rsqrtps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rsqrtps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rsqrtps:
+; SKX: # %bb.0:
+; SKX-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_rsqrtps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2)
@@ -2035,23 +2792,23 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
; GENERIC-LABEL: test_rsqrtss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: rsqrtss %xmm0, %xmm0
-; GENERIC-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; GENERIC-NEXT: rsqrtss %xmm1, %xmm1
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: rsqrtss %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: rsqrtss %xmm1, %xmm1 # sched: [5:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_rsqrtss:
-; ATOM: # BB#0:
-; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ATOM: # %bb.0:
+; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:1.00]
; ATOM-NEXT: rsqrtss %xmm0, %xmm0
; ATOM-NEXT: rsqrtss %xmm1, %xmm1
-; ATOM-NEXT: addps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_rsqrtss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [3:1.00]
; SLM-NEXT: rsqrtss %xmm0, %xmm0 # sched: [8:1.00]
; SLM-NEXT: rsqrtss %xmm1, %xmm1 # sched: [8:1.00]
@@ -2059,23 +2816,47 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_rsqrtss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [9:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_rsqrtss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_rsqrtss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BROADWELL-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_rsqrtss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKYLAKE-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKYLAKE-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_rsqrtss:
+; SKX: # %bb.0:
+; SKX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_rsqrtss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00]
@@ -2083,12 +2864,12 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
-; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50]
-; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50]
+; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [5:0.50]
+; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1)
%3 = load float, float *%a1, align 4
@@ -2101,45 +2882,60 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define void @test_sfence() {
; GENERIC-LABEL: test_sfence:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: sfence
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sfence # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_sfence:
-; ATOM: # BB#0:
-; ATOM-NEXT: sfence
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: sfence # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_sfence:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: sfence # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sfence:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: sfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sfence:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: sfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: sfence # sched: [2:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sfence:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: sfence # sched: [2:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sfence:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: sfence # sched: [2:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sfence:
+; SKX: # %bb.0:
+; SKX-NEXT: sfence # sched: [2:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sfence:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: sfence # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sfence:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: sfence # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
call void @llvm.x86.sse.sfence()
ret void
}
@@ -2147,50 +2943,68 @@ declare void @llvm.x86.sse.sfence() nounwind readnone
define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) nounwind {
; GENERIC-LABEL: test_shufps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; GENERIC-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; GENERIC-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_shufps:
-; ATOM: # BB#0:
-; ATOM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; ATOM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; ATOM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_shufps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
; SLM-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_shufps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shufps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; BROADWELL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shufps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; SKYLAKE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shufps:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
+; SKX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_shufps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 3, i32 4, i32 4>
@@ -2199,21 +3013,21 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_sqrtps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: sqrtps %xmm0, %xmm1
-; GENERIC-NEXT: sqrtps (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sqrtps %xmm0, %xmm1 # sched: [14:1.00]
+; GENERIC-NEXT: sqrtps (%rdi), %xmm0 # sched: [20:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_sqrtps:
-; ATOM: # BB#0:
-; ATOM-NEXT: sqrtps %xmm0, %xmm1
-; ATOM-NEXT: sqrtps (%rdi), %xmm0
-; ATOM-NEXT: addps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: sqrtps %xmm0, %xmm1 # sched: [70:35.00]
+; ATOM-NEXT: sqrtps (%rdi), %xmm0 # sched: [70:35.00]
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_sqrtps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: sqrtps (%rdi), %xmm1 # sched: [18:1.00]
; SLM-NEXT: sqrtps %xmm0, %xmm0 # sched: [15:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
@@ -2221,32 +3035,53 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sqrtps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; SANDY-NEXT: vsqrtps (%rdi), %xmm1 # sched: [20:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [20:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sqrtps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsqrtps %xmm0, %xmm0 # sched: [14:1.00]
+; BROADWELL-NEXT: vsqrtps (%rdi), %xmm1 # sched: [19:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sqrtps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsqrtps %xmm0, %xmm0 # sched: [12:1.00]
+; SKYLAKE-NEXT: vsqrtps (%rdi), %xmm1 # sched: [18:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sqrtps:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtps %xmm0, %xmm0 # sched: [12:1.00]
+; SKX-NEXT: vsqrtps (%rdi), %xmm1 # sched: [18:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sqrtps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsqrtps (%rdi), %xmm1 # sched: [26:21.00]
; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00]
; ZNVER1-NEXT: vsqrtps %xmm0, %xmm0 # sched: [20:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2)
@@ -2259,23 +3094,23 @@ declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_sqrtss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: sqrtss %xmm0, %xmm0
-; GENERIC-NEXT: movaps (%rdi), %xmm1
-; GENERIC-NEXT: sqrtss %xmm1, %xmm1
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sqrtss %xmm0, %xmm0 # sched: [14:1.00]
+; GENERIC-NEXT: movaps (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT: sqrtss %xmm1, %xmm1 # sched: [14:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_sqrtss:
-; ATOM: # BB#0:
-; ATOM-NEXT: movaps (%rdi), %xmm1
+; ATOM: # %bb.0:
+; ATOM-NEXT: movaps (%rdi), %xmm1 # sched: [1:1.00]
; ATOM-NEXT: sqrtss %xmm0, %xmm0
; ATOM-NEXT: sqrtss %xmm1, %xmm1
-; ATOM-NEXT: addps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM-NEXT: addps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_sqrtss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movaps (%rdi), %xmm1 # sched: [3:1.00]
; SLM-NEXT: sqrtss %xmm0, %xmm0 # sched: [18:1.00]
; SLM-NEXT: sqrtss %xmm1, %xmm1 # sched: [18:1.00]
@@ -2283,23 +3118,47 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sqrtss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00]
+; SANDY-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [114:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtss:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00]
+; HASWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; HASWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sqrtss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [14:1.00]
+; BROADWELL-NEXT: vmovaps (%rdi), %xmm1 # sched: [5:0.50]
+; BROADWELL-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [14:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sqrtss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
+; SKYLAKE-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SKYLAKE-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sqrtss:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
+; SKX-NEXT: vmovaps (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sqrtss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps (%rdi), %xmm1 # sched: [5:1.00]
; BTVER2-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [26:21.00]
; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00]
@@ -2307,12 +3166,12 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00]
; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2)
@@ -2323,46 +3182,64 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define i32 @test_stmxcsr() {
; GENERIC-LABEL: test_stmxcsr:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: stmxcsr -{{[0-9]+}}(%rsp)
-; GENERIC-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: stmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; GENERIC-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_stmxcsr:
-; ATOM: # BB#0:
-; ATOM-NEXT: stmxcsr -{{[0-9]+}}(%rsp)
-; ATOM-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: stmxcsr -{{[0-9]+}}(%rsp) # sched: [15:7.50]
+; ATOM-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_stmxcsr:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: stmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
; SLM-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_stmxcsr:
-; SANDY: # BB#0:
-; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
+; SANDY-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_stmxcsr:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
-; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; HASWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_stmxcsr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; BROADWELL-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_stmxcsr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; SKYLAKE-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_stmxcsr:
+; SKX: # %bb.0:
+; SKX-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; SKX-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_stmxcsr:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_stmxcsr:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [100:?]
; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = alloca i32, align 4
%2 = bitcast i32* %1 to i8*
call void @llvm.x86.sse.stmxcsr(i8* %2)
@@ -2373,46 +3250,64 @@ declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_subps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: subps %xmm1, %xmm0
-; GENERIC-NEXT: subps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: subps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: subps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_subps:
-; ATOM: # BB#0:
-; ATOM-NEXT: subps %xmm1, %xmm0
-; ATOM-NEXT: subps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: subps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: subps (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_subps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: subps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: subps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_subps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_subps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_subps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_subps:
+; SKX: # %bb.0:
+; SKX-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_subps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fsub <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fsub <4 x float> %1, %2
@@ -2421,46 +3316,64 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
define float @test_subss(float %a0, float %a1, float *%a2) {
; GENERIC-LABEL: test_subss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: subss %xmm1, %xmm0
-; GENERIC-NEXT: subss (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: subss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: subss (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_subss:
-; ATOM: # BB#0:
-; ATOM-NEXT: subss %xmm1, %xmm0
-; ATOM-NEXT: subss (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: subss %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: subss (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_subss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: subss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: subss (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_subss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_subss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_subss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_subss:
+; SKX: # %bb.0:
+; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_subss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fsub float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fsub float %1, %2
@@ -2469,35 +3382,35 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_ucomiss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: ucomiss %xmm1, %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %cl
-; GENERIC-NEXT: andb %al, %cl
-; GENERIC-NEXT: ucomiss (%rdi), %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %dl
-; GENERIC-NEXT: andb %al, %dl
-; GENERIC-NEXT: orb %cl, %dl
-; GENERIC-NEXT: movzbl %dl, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: ucomiss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %cl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %cl # sched: [1:0.33]
+; GENERIC-NEXT: ucomiss (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %dl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %dl # sched: [1:0.33]
+; GENERIC-NEXT: orb %cl, %dl # sched: [1:0.33]
+; GENERIC-NEXT: movzbl %dl, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_ucomiss:
-; ATOM: # BB#0:
-; ATOM-NEXT: ucomiss %xmm1, %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %cl
-; ATOM-NEXT: andb %al, %cl
-; ATOM-NEXT: ucomiss (%rdi), %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %dl
-; ATOM-NEXT: andb %al, %dl
-; ATOM-NEXT: orb %cl, %dl
-; ATOM-NEXT: movzbl %dl, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: ucomiss %xmm1, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %cl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %cl # sched: [1:0.50]
+; ATOM-NEXT: ucomiss (%rdi), %xmm0 # sched: [10:5.00]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %dl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %dl # sched: [1:0.50]
+; ATOM-NEXT: orb %cl, %dl # sched: [1:0.50]
+; ATOM-NEXT: movzbl %dl, %eax # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_ucomiss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: ucomiss %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: setnp %al # sched: [1:0.50]
; SLM-NEXT: sete %cl # sched: [1:0.50]
@@ -2511,35 +3424,77 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_ucomiss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %cl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %dl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomiss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %cl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
-; HASWELL-NEXT: vucomiss (%rdi), %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vucomiss (%rdi), %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %dl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ucomiss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %cl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %cl # sched: [1:0.25]
+; BROADWELL-NEXT: vucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %dl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ucomiss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %cl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKYLAKE-NEXT: vucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %dl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ucomiss:
+; SKX: # %bb.0:
+; SKX-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %cl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKX-NEXT: vucomiss (%rdi), %xmm0 # sched: [8:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %dl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKX-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKX-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ucomiss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setnp %al # sched: [1:0.50]
; BTVER2-NEXT: sete %cl # sched: [1:0.50]
@@ -2553,7 +3508,7 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ucomiss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
@@ -2564,7 +3519,7 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 4
%3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2)
@@ -2575,50 +3530,68 @@ declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_unpckhps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; GENERIC-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; GENERIC-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_unpckhps:
-; ATOM: # BB#0:
-; ATOM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ATOM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; ATOM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_unpckhps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SLM-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_unpckhps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpckhps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpckhps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpckhps:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpckhps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -2627,50 +3600,68 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_unpcklps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; GENERIC-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_unpcklps:
-; ATOM: # BB#0:
-; ATOM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ATOM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; ATOM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_unpcklps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SLM-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_unpcklps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpcklps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; BROADWELL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpcklps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpcklps:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpcklps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -2679,54 +3670,68 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_xorps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: xorps %xmm1, %xmm0
-; GENERIC-NEXT: xorps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: xorps (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_xorps:
-; ATOM: # BB#0:
-; ATOM-NEXT: xorps %xmm1, %xmm0
-; ATOM-NEXT: xorps (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: xorps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: xorps (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_xorps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: xorps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: xorps (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_xorps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xorps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xorps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xorps:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_xorps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, %2
diff --git a/test/CodeGen/X86/sse-unaligned-mem-feature.ll b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
index 1c61a515f383..a5f62dde81fe 100644
--- a/test/CodeGen/X86/sse-unaligned-mem-feature.ll
+++ b/test/CodeGen/X86/sse-unaligned-mem-feature.ll
@@ -1,7 +1,7 @@
-; RUN: llc -mcpu=yonah -mattr=sse-unaligned-mem -march=x86 < %s | FileCheck %s
+; RUN: llc -mcpu=yonah -mattr=sse-unaligned-mem < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-unknown-linux-gnu"
+target triple = "i686-unknown-linux-gnu"
define <4 x float> @foo(<4 x float>* %P, <4 x float> %In) nounwind {
%A = load <4 x float>, <4 x float>* %P, align 4
diff --git a/test/CodeGen/X86/sse-varargs.ll b/test/CodeGen/X86/sse-varargs.ll
index 7c3c78113def..3d1bec88e396 100644
--- a/test/CodeGen/X86/sse-varargs.ll
+++ b/test/CodeGen/X86/sse-varargs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep xmm | grep esp
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | grep xmm | grep esp
define i32 @t() nounwind {
entry:
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index c74dec3e21b6..7222a27c826b 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -14,7 +14,7 @@
; rdar://8368414
define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
; X32-LABEL: test4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movaps %xmm0, %xmm2
; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; X32-NEXT: addss %xmm1, %xmm0
@@ -24,7 +24,7 @@ define <2 x float> @test4(<2 x float> %A, <2 x float> %B) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movaps %xmm0, %xmm2
; X64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; X64-NEXT: addss %xmm1, %xmm0
@@ -52,11 +52,11 @@ entry:
define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-LABEL: vselect:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: je .LBB1_1
-; X32-NEXT: # BB#2: # %entry
+; X32-NEXT: # %bb.2: # %entry
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: jne .LBB1_5
@@ -91,11 +91,11 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
; X32-NEXT: retl
;
; X64-LABEL: vselect:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: testl %edx, %edx
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: je .LBB1_1
-; X64-NEXT: # BB#2: # %entry
+; X64-NEXT: # %bb.2: # %entry
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: testl %ecx, %ecx
; X64-NEXT: jne .LBB1_5
@@ -138,12 +138,12 @@ entry:
define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: PR28044:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpeqps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: PR28044:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpeqps %xmm1, %xmm0
; X64-NEXT: retq
%cmp = fcmp oeq <4 x float> %a0, %a1
@@ -157,57 +157,82 @@ define <4 x float> @PR28044(<4 x float> %a0, <4 x float> %a1) nounwind {
define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
; X32-LABEL: PR30512:
-; X32: # BB#0:
-; X32-NEXT: pushl %ebp
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X32-NEXT: subl $16, %esp
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: xorl %ecx, %ecx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: sete %cl
-; X32-NEXT: xorl %edx, %edx
-; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
-; X32-NEXT: sete %dl
; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edi
; X32-NEXT: sete %bl
-; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: negl %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl %ebx, %ebx
; X32-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: sete %al
-; X32-NEXT: movl %eax, 12(%ebp)
-; X32-NEXT: movl %ebx, 8(%ebp)
-; X32-NEXT: movl %edx, 4(%ebp)
-; X32-NEXT: movl %ecx, (%ebp)
-; X32-NEXT: movl %ebp, %eax
+; X32-NEXT: sete %bl
+; X32-NEXT: negl %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl %ebx, %ebx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X32-NEXT: sete %bl
+; X32-NEXT: negl %ebx
+; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: sete %dl
+; X32-NEXT: negl %edx
+; X32-NEXT: movl %edx, (%esp)
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X32-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X32-NEXT: andps {{\.LCPI.*}}, %xmm2
+; X32-NEXT: movaps %xmm2, (%eax)
+; X32-NEXT: addl $16, %esp
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
-; X32-NEXT: popl %ebp
; X32-NEXT: retl $4
;
; X64-LABEL: PR30512:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: cmpl %r9d, %esi
+; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %r8d
; X64-NEXT: sete %al
-; X64-NEXT: xorl %esi, %esi
-; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %edx
-; X64-NEXT: sete %sil
-; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: negl %eax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %eax, %eax
; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %ecx
-; X64-NEXT: sete %dl
-; X64-NEXT: xorl %ecx, %ecx
-; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %r8d
-; X64-NEXT: sete %cl
-; X64-NEXT: movl %ecx, 12(%rdi)
-; X64-NEXT: movl %edx, 8(%rdi)
-; X64-NEXT: movl %esi, 4(%rdi)
-; X64-NEXT: movl %eax, (%rdi)
+; X64-NEXT: sete %al
+; X64-NEXT: negl %eax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT: sete %al
+; X64-NEXT: negl %eax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpl %r9d, %esi
+; X64-NEXT: sete %al
+; X64-NEXT: negl %eax
+; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; X64-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; X64-NEXT: andps {{.*}}(%rip), %xmm2
+; X64-NEXT: movaps %xmm2, (%rdi)
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
%cmp = icmp eq <4 x i32> %x, %y
@@ -219,126 +244,20 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
; post-legalization to cause the crash seen in:
; https://llvm.org/bugs/show_bug.cgi?id=31672
; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
-; Also, although the goal for adding this test is to prove that we
-; don't crash, I have no idea what this code is doing, so I'm keeping
-; the full codegen checks in case there's motivation to improve this.
+;
+; We no longer try to lower sqrt using rsqrt when only SSE1 is available,
+; since the v4i32 vselect mentioned above should never have been created.
+; We ended up scalarizing it anyway.
define <2 x float> @PR31672() #0 {
; X32-LABEL: PR31672:
-; X32: # BB#0:
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-16, %esp
-; X32-NEXT: subl $80, %esp
-; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: movaps {{.*#+}} xmm1 = <42,3,u,u>
-; X32-NEXT: movaps %xmm1, %xmm2
-; X32-NEXT: cmpeqps %xmm0, %xmm2
-; X32-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: rsqrtps %xmm1, %xmm0
-; X32-NEXT: mulps %xmm0, %xmm1
-; X32-NEXT: mulps %xmm0, %xmm1
-; X32-NEXT: addps {{\.LCPI.*}}, %xmm1
-; X32-NEXT: mulps {{\.LCPI.*}}, %xmm0
-; X32-NEXT: mulps %xmm1, %xmm0
-; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl %eax, %ecx
-; X32-NEXT: notl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, (%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl %eax, %ecx
-; X32-NEXT: notl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: andl %ecx, %edx
-; X32-NEXT: notl %ecx
-; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: orl %edx, %ecx
-; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: andl %eax, %ecx
-; X32-NEXT: notl %eax
-; X32-NEXT: andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: orl %ecx, %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
+; X32: # %bb.0:
+; X32-NEXT: sqrtps {{\.LCPI.*}}, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: PR31672:
-; X64: # BB#0:
-; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movaps {{.*#+}} xmm1 = <42,3,u,u>
-; X64-NEXT: cmpeqps %xmm1, %xmm0
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: rsqrtps %xmm1, %xmm0
-; X64-NEXT: mulps %xmm0, %xmm1
-; X64-NEXT: mulps %xmm0, %xmm1
-; X64-NEXT: addps {{.*}}(%rip), %xmm1
-; X64-NEXT: mulps {{.*}}(%rip), %xmm0
-; X64-NEXT: mulps %xmm1, %xmm0
-; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl %edi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: notl %ecx
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10
-; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
-; X64-NEXT: andl %edx, %ecx
-; X64-NEXT: orl %eax, %ecx
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %rsi
-; X64-NEXT: shrq $32, %rdi
-; X64-NEXT: andl %edi, %esi
-; X64-NEXT: notl %edi
-; X64-NEXT: shrq $32, %rdx
-; X64-NEXT: andl %edi, %edx
-; X64-NEXT: orl %esi, %edx
-; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movl %r8d, %eax
-; X64-NEXT: andl %r9d, %eax
-; X64-NEXT: movl %r9d, %ecx
-; X64-NEXT: notl %ecx
-; X64-NEXT: andl %r10d, %ecx
-; X64-NEXT: orl %eax, %ecx
-; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; X64-NEXT: shrq $32, %r8
-; X64-NEXT: shrq $32, %r9
-; X64-NEXT: andl %r9d, %r8d
-; X64-NEXT: notl %r9d
-; X64-NEXT: shrq $32, %r10
-; X64-NEXT: andl %r9d, %r10d
-; X64-NEXT: orl %r8d, %r10d
-; X64-NEXT: movl %r10d, -{{[0-9]+}}(%rsp)
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: sqrtps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
%t0 = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> <float 42.0, float 3.0>)
ret <2 x float> %t0
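
For context on the PR31672 checks above: the removed X32/X64 sequences were the old SSE1-only lowering of the fast sqrt call, i.e. an RSQRTPS estimate refined by one Newton-Raphson step and then masked so that a zero input yields zero, whereas the new output is a single sqrtps folded from the constant pool. A rough scalar C sketch of what the removed sequence computed (illustrative only; rsqrt_estimate is a hypothetical stand-in for the RSQRTPS instruction, not an LLVM or libm API):

```c
#include <math.h>

/* Hypothetical stand-in for the RSQRTPS hardware estimate. */
static float rsqrt_estimate(float x) { return 1.0f / sqrtf(x); }

/* Rough per-lane sketch of the removed rsqrtps-based expansion. */
float sqrt_via_rsqrt(float x) {
  float e = rsqrt_estimate(x);
  e = e * (1.5f - 0.5f * x * e * e);  /* one Newton-Raphson step for 1/sqrt(x) */
  return (x == 0.0f) ? 0.0f : x * e;  /* zero mask: avoid 0 * inf when x == 0 */
}
```
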
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
index 54de15c292f6..bfbcf250c7b8 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel-x86_64.ll
@@ -5,7 +5,7 @@
define i64 @test_mm_cvtsd_si64(<2 x double> %a0) nounwind {
; X64-LABEL: test_mm_cvtsd_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsd2si %xmm0, %rax
; X64-NEXT: retq
%res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0)
@@ -15,7 +15,7 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
; X64-LABEL: test_mm_cvtsi128_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
%res = extractelement <2 x i64> %a0, i32 0
@@ -24,7 +24,7 @@ define i64 @test_mm_cvtsi128_si64(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_cvtsi64_sd(<2 x double> %a0, i64 %a1) nounwind {
; X64-LABEL: test_mm_cvtsi64_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsi2sdq %rdi, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1)
@@ -34,7 +34,7 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn
define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
; X64-LABEL: test_mm_cvtsi64_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
%res0 = insertelement <2 x i64> undef, i64 %a0, i32 0
@@ -44,7 +44,7 @@ define <2 x i64> @test_mm_cvtsi64_si128(i64 %a0) nounwind {
define i64 @test_mm_cvttsd_si64(<2 x double> %a0) nounwind {
; X64-LABEL: test_mm_cvttsd_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttsd2si %xmm0, %rax
; X64-NEXT: retq
%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0)
@@ -54,7 +54,7 @@ declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
; X64-LABEL: test_mm_loadu_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
%ld = load i64, i64* %a0, align 1
@@ -65,7 +65,7 @@ define <2 x i64> @test_mm_loadu_si64(i64* %a0) nounwind {
define void @test_mm_stream_si64(i64 *%a0, i64 %a1) {
; X64-LABEL: test_mm_stream_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntiq %rsi, (%rdi)
; X64-NEXT: retq
store i64 %a1, i64* %a0, align 1, !nontemporal !0
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index ff5d624e6042..a75a0597325d 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -23,12 +23,12 @@ define <2 x i64> @test_mm_add_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -40,12 +40,12 @@ define <2 x i64> @test_mm_add_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -57,12 +57,12 @@ define <2 x i64> @test_mm_add_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_add_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddq %xmm1, %xmm0
; X64-NEXT: retq
%res = add <2 x i64> %a0, %a1
@@ -71,12 +71,12 @@ define <2 x i64> @test_mm_add_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_add_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addpd %xmm1, %xmm0
; X64-NEXT: retq
%res = fadd <2 x double> %a0, %a1
@@ -85,12 +85,12 @@ define <2 x double> @test_mm_add_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_add_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_add_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addsd %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <2 x double> %a0, i32 0
@@ -102,12 +102,12 @@ define <2 x double> @test_mm_add_sd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x i64> @test_mm_adds_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddsb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddsb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -120,12 +120,12 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -138,12 +138,12 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddusb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddusb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -156,12 +156,12 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <2 x i64> @test_mm_adds_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_adds_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: paddusw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_adds_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddusw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -174,12 +174,12 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_and_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: andps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_and_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x double> %a0 to <4 x i32>
@@ -191,12 +191,12 @@ define <2 x double> @test_mm_and_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_and_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: andps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_and_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: retq
%res = and <2 x i64> %a0, %a1
@@ -205,12 +205,12 @@ define <2 x i64> @test_mm_and_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_andnot_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: andnps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_andnot_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andnps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x double> %a0 to <4 x i32>
@@ -223,14 +223,14 @@ define <2 x double> @test_mm_andnot_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_andnot_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqd %xmm2, %xmm2
; X32-NEXT: pxor %xmm2, %xmm0
; X32-NEXT: pand %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_andnot_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqd %xmm2, %xmm2
; X64-NEXT: pxor %xmm2, %xmm0
; X64-NEXT: pand %xmm1, %xmm0
@@ -242,48 +242,56 @@ define <2 x i64> @test_mm_andnot_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_avg_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pavgb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_avg_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pavgb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%arg1 = bitcast <2 x i64> %a1 to <16 x i8>
- %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
+ %zext0 = zext <16 x i8> %arg0 to <16 x i16>
+ %zext1 = zext <16 x i8> %arg1 to <16 x i16>
+ %add = add <16 x i16> %zext0, %zext1
+ %add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %res = trunc <16 x i16> %lshr to <16 x i8>
%bc = bitcast <16 x i8> %res to <2 x i64>
ret <2 x i64> %bc
}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_avg_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pavgw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_avg_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pavgw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
- %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
+ %zext0 = zext <8 x i16> %arg0 to <8 x i32>
+ %zext1 = zext <8 x i16> %arg1 to <8 x i32>
+ %add = add <8 x i32> %zext0, %zext1
+ %add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %res = trunc <8 x i32> %lshr to <8 x i16>
%bc = bitcast <8 x i16> %res to <2 x i64>
ret <2 x i64> %bc
}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
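
The rewritten pavg tests above replace the @llvm.x86.sse2.pavg.* intrinsic calls with the generic rounding-average IR pattern (zero-extend, add, add 1, logical shift right by 1, truncate), which the backend is still expected to match back to pavgb/pavgw as the unchanged CHECK lines show. A minimal scalar sketch of that arithmetic for one byte lane (illustrative only, not part of the patch):

```c
#include <stdint.h>

/* Rounding unsigned average, computed in a wider type so a + b + 1 cannot
 * overflow -- the same arithmetic the zext/add/lshr/trunc IR expresses. */
static uint8_t avg_epu8_lane(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}
```
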
define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_bslli_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_bslli_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -294,12 +302,12 @@ define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_bsrli_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_bsrli_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -310,11 +318,11 @@ define <2 x i64> @test_mm_bsrli_si128(<2 x i64> %a0) nounwind {
define <4 x float> @test_mm_castpd_ps(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_castpd_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castpd_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <2 x double> %a0 to <4 x float>
ret <4 x float> %res
@@ -322,11 +330,11 @@ define <4 x float> @test_mm_castpd_ps(<2 x double> %a0) nounwind {
define <2 x i64> @test_mm_castpd_si128(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_castpd_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castpd_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <2 x double> %a0 to <2 x i64>
ret <2 x i64> %res
@@ -334,11 +342,11 @@ define <2 x i64> @test_mm_castpd_si128(<2 x double> %a0) nounwind {
define <2 x double> @test_mm_castps_pd(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_castps_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castps_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <4 x float> %a0 to <2 x double>
ret <2 x double> %res
@@ -346,11 +354,11 @@ define <2 x double> @test_mm_castps_pd(<4 x float> %a0) nounwind {
define <2 x i64> @test_mm_castps_si128(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_castps_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castps_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <4 x float> %a0 to <2 x i64>
ret <2 x i64> %res
@@ -358,11 +366,11 @@ define <2 x i64> @test_mm_castps_si128(<4 x float> %a0) nounwind {
define <2 x double> @test_mm_castsi128_pd(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_castsi128_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castsi128_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <2 x i64> %a0 to <2 x double>
ret <2 x double> %res
@@ -370,11 +378,11 @@ define <2 x double> @test_mm_castsi128_pd(<2 x i64> %a0) nounwind {
define <4 x float> @test_mm_castsi128_ps(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_castsi128_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_castsi128_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = bitcast <2 x i64> %a0 to <4 x float>
ret <4 x float> %res
@@ -382,13 +390,13 @@ define <4 x float> @test_mm_castsi128_ps(<2 x i64> %a0) nounwind {
define void @test_mm_clflush(i8* %a0) nounwind {
; X32-LABEL: test_mm_clflush:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: clflush (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_clflush:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: clflush (%rdi)
; X64-NEXT: retq
call void @llvm.x86.sse2.clflush(i8* %a0)
@@ -398,12 +406,12 @@ declare void @llvm.x86.sse2.clflush(i8*) nounwind readnone
define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -416,12 +424,12 @@ define <2 x i64> @test_mm_cmpeq_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -434,12 +442,12 @@ define <2 x i64> @test_mm_cmpeq_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -452,12 +460,12 @@ define <2 x i64> @test_mm_cmpeq_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpeqpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpeqpd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp oeq <2 x double> %a0, %a1
@@ -468,12 +476,12 @@ define <2 x double> @test_mm_cmpeq_pd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmpeq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpeqsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpeqsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 0)
@@ -483,13 +491,13 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmplepd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmplepd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
@@ -501,13 +509,13 @@ define <2 x double> @test_mm_cmpge_pd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmplesd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpge_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmplesd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
@@ -521,12 +529,12 @@ define <2 x double> @test_mm_cmpge_sd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -539,12 +547,12 @@ define <2 x i64> @test_mm_cmpgt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -557,12 +565,12 @@ define <2 x i64> @test_mm_cmpgt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -575,13 +583,13 @@ define <2 x i64> @test_mm_cmpgt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltpd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltpd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
@@ -593,13 +601,13 @@ define <2 x double> @test_mm_cmpgt_pd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltsd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltsd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
@@ -613,12 +621,12 @@ define <2 x double> @test_mm_cmpgt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmple_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmplepd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmplepd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp ole <2 x double> %a0, %a1
@@ -629,12 +637,12 @@ define <2 x double> @test_mm_cmple_pd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmple_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmplesd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmple_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmplesd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 2)
@@ -643,13 +651,13 @@ define <2 x double> @test_mm_cmple_sd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtb %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtb %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
@@ -663,13 +671,13 @@ define <2 x i64> @test_mm_cmplt_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtw %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtw %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
@@ -683,13 +691,13 @@ define <2 x i64> @test_mm_cmplt_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtd %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtd %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
@@ -703,12 +711,12 @@ define <2 x i64> @test_mm_cmplt_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltpd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp olt <2 x double> %a0, %a1
@@ -719,12 +727,12 @@ define <2 x double> @test_mm_cmplt_pd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpltsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmplt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpltsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 1)
@@ -733,12 +741,12 @@ define <2 x double> @test_mm_cmplt_sd(<2 x double> %a0, <2 x double> %a1) nounwi
define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpneqpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpneqpd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp une <2 x double> %a0, %a1
@@ -749,12 +757,12 @@ define <2 x double> @test_mm_cmpneq_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpneqsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpneq_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpneqsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 4)
@@ -763,13 +771,13 @@ define <2 x double> @test_mm_cmpneq_sd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnlepd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnlepd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
@@ -781,13 +789,13 @@ define <2 x double> @test_mm_cmpnge_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnlesd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnge_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnlesd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
@@ -801,13 +809,13 @@ define <2 x double> @test_mm_cmpnge_sd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltpd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltpd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
@@ -819,13 +827,13 @@ define <2 x double> @test_mm_cmpngt_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltsd %xmm0, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpngt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltsd %xmm0, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
@@ -839,12 +847,12 @@ define <2 x double> @test_mm_cmpngt_sd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnlepd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnlepd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp ugt <2 x double> %a0, %a1
@@ -855,12 +863,12 @@ define <2 x double> @test_mm_cmpnle_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnlesd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnle_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnlesd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 6)
@@ -869,12 +877,12 @@ define <2 x double> @test_mm_cmpnle_sd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltpd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp uge <2 x double> %a0, %a1
@@ -885,12 +893,12 @@ define <2 x double> @test_mm_cmpnlt_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpnltsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpnlt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpnltsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 5)
@@ -899,12 +907,12 @@ define <2 x double> @test_mm_cmpnlt_sd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpordpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpordpd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp ord <2 x double> %a0, %a1
@@ -915,12 +923,12 @@ define <2 x double> @test_mm_cmpord_pd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpordsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpord_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpordsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7)
@@ -929,12 +937,12 @@ define <2 x double> @test_mm_cmpord_sd(<2 x double> %a0, <2 x double> %a1) nounw
define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpunordpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpunordpd %xmm1, %xmm0
; X64-NEXT: retq
%fcmp = fcmp uno <2 x double> %a0, %a1
@@ -945,12 +953,12 @@ define <2 x double> @test_mm_cmpunord_pd(<2 x double> %a0, <2 x double> %a1) nou
define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cmpunordsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpunord_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cmpunordsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 3)
@@ -959,7 +967,7 @@ define <2 x double> @test_mm_cmpunord_sd(<2 x double> %a0, <2 x double> %a1) nou
define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comieq_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
@@ -968,7 +976,7 @@ define i32 @test_mm_comieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comieq_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
@@ -982,14 +990,14 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_mm_comige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comige_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comige_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: setae %al
@@ -1001,14 +1009,14 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_mm_comigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comigt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comigt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -1020,14 +1028,14 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_mm_comile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comile_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comile_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm0, %xmm1
; X64-NEXT: setae %al
@@ -1039,14 +1047,14 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_mm_comilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comilt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: comisd %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comilt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: comisd %xmm0, %xmm1
; X64-NEXT: seta %al
@@ -1058,7 +1066,7 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_comineq_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: comisd %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
@@ -1067,7 +1075,7 @@ define i32 @test_mm_comineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_comineq_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: comisd %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
@@ -1081,12 +1089,12 @@ declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readn
define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtepi32_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtdq2pd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi32_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -1097,12 +1105,12 @@ define <2 x double> @test_mm_cvtepi32_pd(<2 x i64> %a0) nounwind {
define <4 x float> @test_mm_cvtepi32_ps(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtepi32_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi32_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -1113,12 +1121,12 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_cvtpd_epi32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtpd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtpd2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtpd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtpd2dq %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
@@ -1129,12 +1137,12 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <4 x float> @test_mm_cvtpd_ps(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtpd_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtpd2ps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtpd_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtpd2ps %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
@@ -1144,12 +1152,12 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_cvtps_epi32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtps_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtps2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtps_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtps2dq %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
@@ -1160,12 +1168,12 @@ declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtps_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtps2pd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtps_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtps2pd %xmm0, %xmm0
; X64-NEXT: retq
%ext = shufflevector <4 x float> %a0, <4 x float> %a0, <2 x i32> <i32 0, i32 1>
@@ -1175,7 +1183,7 @@ define <2 x double> @test_mm_cvtps_pd(<4 x float> %a0) nounwind {
define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtsd_f64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-8, %esp
@@ -1187,7 +1195,7 @@ define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_f64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res = extractelement <2 x double> %a0, i32 0
ret double %res
@@ -1195,12 +1203,12 @@ define double @test_mm_cvtsd_f64(<2 x double> %a0) nounwind {
define i32 @test_mm_cvtsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvtsd_si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtsd2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsd2si %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0)
@@ -1210,12 +1218,12 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define <4 x float> @test_mm_cvtsd_ss(<4 x float> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_cvtsd_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtsd2ss %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsd2ss %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1)
@@ -1225,13 +1233,13 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind
define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
; X32-LABEL: test_mm_cvtsd_ss_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cvtsd2ss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsd_ss_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsd2ss (%rdi), %xmm0
; X64-NEXT: retq
%a1 = load <2 x double>, <2 x double>* %p1
@@ -1241,12 +1249,12 @@ define <4 x float> @test_mm_cvtsd_ss_load(<4 x float> %a0, <2 x double>* %p1) {
define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_cvtsi128_si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi128_si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -1256,12 +1264,12 @@ define i32 @test_mm_cvtsi128_si32(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtsi2sdl %edi, %xmm0
; X64-NEXT: retq
%cvt = sitofp i32 %a1 to double
@@ -1271,12 +1279,12 @@ define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind {
define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
; X32-LABEL: test_mm_cvtsi32_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtsi32_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: retq
%res0 = insertelement <4 x i32> undef, i32 %a0, i32 0
@@ -1289,12 +1297,12 @@ define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind {
define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cvtss_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvtss2sd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtss_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvtss2sd %xmm1, %xmm0
; X64-NEXT: retq
%ext = extractelement <4 x float> %a1, i32 0
@@ -1305,12 +1313,12 @@ define <2 x double> @test_mm_cvtss_sd(<2 x double> %a0, <4 x float> %a1) nounwin
define <2 x i64> @test_mm_cvttpd_epi32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttpd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvttpd2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttpd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttpd2dq %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
@@ -1321,12 +1329,12 @@ declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_cvttps_epi32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttps_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvttps2dq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttps_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0)
@@ -1337,12 +1345,12 @@ declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
define i32 @test_mm_cvttsd_si32(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_cvttsd_si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: cvttsd2si %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvttsd_si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: cvttsd2si %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0)
@@ -1352,12 +1360,12 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_div_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: divpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: divpd %xmm1, %xmm0
; X64-NEXT: retq
%res = fdiv <2 x double> %a0, %a1
@@ -1366,12 +1374,12 @@ define <2 x double> @test_mm_div_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_div_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: divsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_div_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: divsd %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <2 x double> %a0, i32 0
@@ -1383,13 +1391,13 @@ define <2 x double> @test_mm_div_sd(<2 x double> %a0, <2 x double> %a1) nounwind
define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_extract_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pextrw $1, %xmm0, %eax
; X32-NEXT: movzwl %ax, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extract_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pextrw $1, %xmm0, %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: retq
@@ -1401,13 +1409,13 @@ define i32 @test_mm_extract_epi16(<2 x i64> %a0) nounwind {
define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
; X32-LABEL: test_mm_insert_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pinsrw $1, %eax, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pinsrw $1, %edi, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1418,12 +1426,12 @@ define <2 x i64> @test_mm_insert_epi16(<2 x i64> %a0, i16 %a1) nounwind {
define void @test_mm_lfence() nounwind {
; X32-LABEL: test_mm_lfence:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: lfence
; X32-NEXT: retl
;
; X64-LABEL: test_mm_lfence:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lfence
; X64-NEXT: retq
call void @llvm.x86.sse2.lfence()
@@ -1433,13 +1441,13 @@ declare void @llvm.x86.sse2.lfence() nounwind readnone
define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_load_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <2 x double>*
@@ -1449,13 +1457,13 @@ define <2 x double> @test_mm_load_pd(double* %a0) nounwind {
define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
; X32-LABEL: test_mm_load_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
%ld = load double, double* %a0, align 1
@@ -1466,13 +1474,13 @@ define <2 x double> @test_mm_load_sd(double* %a0) nounwind {
define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
; X32-LABEL: test_mm_load_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: retq
%res = load <2 x i64>, <2 x i64>* %a0, align 16
@@ -1481,14 +1489,14 @@ define <2 x i64> @test_mm_load_si128(<2 x i64>* %a0) nounwind {
define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_load1_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_load1_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
@@ -1500,13 +1508,13 @@ define <2 x double> @test_mm_load1_pd(double* %a0) nounwind {
define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
; X32-LABEL: test_mm_loadh_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadh_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: retq
%ld = load double, double* %a1, align 8
@@ -1516,13 +1524,13 @@ define <2 x double> @test_mm_loadh_pd(<2 x double> %a0, double* %a1) nounwind {
define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
; X32-LABEL: test_mm_loadl_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadl_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
%bc = bitcast <2 x i64>* %a1 to i64*
@@ -1534,13 +1542,13 @@ define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, <2 x i64>* %a1) nounwind {
define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
; X32-LABEL: test_mm_loadl_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadl_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; X64-NEXT: retq
%ld = load double, double* %a1, align 8
@@ -1550,14 +1558,14 @@ define <2 x double> @test_mm_loadl_pd(<2 x double> %a0, double* %a1) nounwind {
define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_loadr_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movapd (%eax), %xmm0
; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadr_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movapd (%rdi), %xmm0
; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: retq
@@ -1569,13 +1577,13 @@ define <2 x double> @test_mm_loadr_pd(double* %a0) nounwind {
define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
; X32-LABEL: test_mm_loadu_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadu_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <2 x double>*
@@ -1585,13 +1593,13 @@ define <2 x double> @test_mm_loadu_pd(double* %a0) nounwind {
define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
; X32-LABEL: test_mm_loadu_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loadu_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups (%rdi), %xmm0
; X64-NEXT: retq
%res = load <2 x i64>, <2 x i64>* %a0, align 1
@@ -1600,12 +1608,12 @@ define <2 x i64> @test_mm_loadu_si128(<2 x i64>* %a0) nounwind {
define <2 x i64> @test_mm_madd_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_madd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaddwd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_madd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaddwd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1618,7 +1626,7 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) nounwind {
; X32-LABEL: test_mm_maskmoveu_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: maskmovdqu %xmm1, %xmm0
@@ -1626,7 +1634,7 @@ define void @test_mm_maskmoveu_si128(<2 x i64> %a0, <2 x i64> %a1, i8* %a2) noun
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maskmoveu_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: maskmovdqu %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -1638,12 +1646,12 @@ declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_max_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaxsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaxsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1656,12 +1664,12 @@ define <2 x i64> @test_mm_max_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_max_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaxub %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaxub %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -1674,12 +1682,12 @@ define <2 x i64> @test_mm_max_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_max_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_max_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: maxpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: maxpd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
@@ -1689,12 +1697,12 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_mm_max_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_max_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: maxsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: maxsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
@@ -1704,12 +1712,12 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define void @test_mm_mfence() nounwind {
; X32-LABEL: test_mm_mfence:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mfence
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mfence:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mfence
; X64-NEXT: retq
call void @llvm.x86.sse2.mfence()
@@ -1719,12 +1727,12 @@ declare void @llvm.x86.sse2.mfence() nounwind readnone
define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_min_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pminsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pminsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1737,12 +1745,12 @@ define <2 x i64> @test_mm_min_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_min_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pminub %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pminub %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -1755,12 +1763,12 @@ define <2 x i64> @test_mm_min_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_min_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_min_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: minpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: minpd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
@@ -1770,12 +1778,12 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_mm_min_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_min_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: minsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: minsd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
@@ -1785,12 +1793,12 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define <2 x i64> @test_mm_move_epi64(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_move_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_move_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
@@ -1799,12 +1807,12 @@ define <2 x i64> @test_mm_move_epi64(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_move_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_move_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: retq
%ext0 = extractelement <2 x double> %a1, i32 0
@@ -1816,12 +1824,12 @@ define <2 x double> @test_mm_move_sd(<2 x double> %a0, <2 x double> %a1) nounwin
define i32 @test_mm_movemask_epi8(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_movemask_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovmskb %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movemask_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovmskb %xmm0, %eax
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -1832,12 +1840,12 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define i32 @test_mm_movemask_pd(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_movemask_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movmskpd %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movemask_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movmskpd %xmm0, %eax
; X64-NEXT: retq
%res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
@@ -1847,12 +1855,12 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mul_epu32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmuludq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_epu32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmuludq %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -1864,12 +1872,12 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_mul_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mulpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mulpd %xmm1, %xmm0
; X64-NEXT: retq
%res = fmul <2 x double> %a0, %a1
@@ -1878,12 +1886,12 @@ define <2 x double> @test_mm_mul_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_mul_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mulsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mulsd %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <2 x double> %a0, i32 0
@@ -1895,12 +1903,12 @@ define <2 x double> @test_mm_mul_sd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x i64> @test_mm_mulhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mulhi_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmulhw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mulhi_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmulhw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1913,12 +1921,12 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_mulhi_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mulhi_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmulhuw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mulhi_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmulhuw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1931,12 +1939,12 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mullo_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmullw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mullo_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmullw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1948,12 +1956,12 @@ define <2 x i64> @test_mm_mullo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_or_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: orps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_or_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x double> %a0 to <4 x i32>
@@ -1965,12 +1973,12 @@ define <2 x double> @test_mm_or_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_or_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: orps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_or_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: retq
%res = or <2 x i64> %a0, %a1
@@ -1979,12 +1987,12 @@ define <2 x i64> @test_mm_or_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_packs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_packs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: packsswb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: packsswb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -1997,12 +2005,12 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <2 x i64> @test_mm_packs_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_packs_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: packssdw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packs_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: packssdw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -2015,12 +2023,12 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
define <2 x i64> @test_mm_packus_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_packus_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: packuswb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packus_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: packuswb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -2033,12 +2041,12 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define void @test_mm_pause() nounwind {
; X32-LABEL: test_mm_pause:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pause
; X32-NEXT: retl
;
; X64-LABEL: test_mm_pause:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pause
; X64-NEXT: retq
call void @llvm.x86.sse2.pause()
@@ -2048,12 +2056,12 @@ declare void @llvm.x86.sse2.pause() nounwind readnone
define <2 x i64> @test_mm_sad_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sad_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psadbw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sad_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psadbw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -2065,7 +2073,7 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
; X32-LABEL: test_mm_set_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2116,7 +2124,7 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movzbl %sil, %eax
@@ -2187,22 +2195,22 @@ define <2 x i64> @test_mm_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a
define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; X32-LABEL: test_mm_set_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm2
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm3
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm4
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm5
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm6
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm7
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -2214,9 +2222,9 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi16:
-; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
+; X64: # %bb.0:
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2247,18 +2255,18 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4,
define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_set_epi32:
-; X32: # BB#0:
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: # %bb.0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2279,18 +2287,18 @@ define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind
define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
; X32-LABEL: test_mm_set_epi64x:
-; X32: # BB#0:
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: # %bb.0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_epi64x:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %xmm1
; X64-NEXT: movq %rsi, %xmm0
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2302,16 +2310,16 @@ define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind {
define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
; X32-LABEL: test_mm_set_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_pd:
-; X64: # BB#0:
-; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: movapd %xmm1, %xmm0
+; X64: # %bb.0:
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
%res0 = insertelement <2 x double> undef, double %a1, i32 0
%res1 = insertelement <2 x double> %res0, double %a0, i32 1
@@ -2320,13 +2328,13 @@ define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind {
define <2 x double> @test_mm_set_pd1(double %a0) nounwind {
; X32-LABEL: test_mm_set_pd1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_pd1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%res0 = insertelement <2 x double> undef, double %a0, i32 0
@@ -2336,13 +2344,13 @@ define <2 x double> @test_mm_set_pd1(double %a0) nounwind {
define <2 x double> @test_mm_set_sd(double %a0) nounwind {
; X32-LABEL: test_mm_set_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
%res0 = insertelement <2 x double> undef, double %a0, i32 0
@@ -2352,7 +2360,7 @@ define <2 x double> @test_mm_set_sd(double %a0) nounwind {
define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X32-LABEL: test_mm_set1_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -2361,7 +2369,7 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -2390,15 +2398,15 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind {
define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
; X32-LABEL: test_mm_set1_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -2417,13 +2425,13 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind {
define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
; X32-LABEL: test_mm_set1_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
@@ -2439,7 +2447,7 @@ define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind {
define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
; X32-LABEL: test_mm_set1_epi64x:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -2447,7 +2455,7 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_epi64x:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: retq
@@ -2458,13 +2466,13 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
; X32-LABEL: test_mm_set1_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_set1_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%res0 = insertelement <2 x double> undef, double %a0, i32 0
@@ -2474,7 +2482,7 @@ define <2 x double> @test_mm_set1_pd(double %a0) nounwind {
define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) nounwind {
; X32-LABEL: test_mm_setr_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
@@ -2525,7 +2533,7 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
@@ -2596,22 +2604,22 @@ define <2 x i64> @test_mm_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %
define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; X32-LABEL: test_mm_setr_epi16:
-; X32: # BB#0:
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32: # %bb.0:
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm1
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm2
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm3
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm4
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm5
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm6
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm7
-; X32-NEXT: movw {{[0-9]+}}(%esp), %ax
+; X32-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd %eax, %xmm0
; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
@@ -2623,9 +2631,9 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi16:
-; X64: # BB#0:
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax
-; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w
+; X64: # %bb.0:
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: movd %r10d, %xmm1
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2656,18 +2664,18 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_setr_epi32:
-; X32: # BB#0:
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: # %bb.0:
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %ecx, %xmm0
; X64-NEXT: movd %edx, %xmm1
; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2688,18 +2696,18 @@ define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwin
define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
; X32-LABEL: test_mm_setr_epi64x:
-; X32: # BB#0:
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: # %bb.0:
+; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi64x:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rsi, %xmm1
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -2711,15 +2719,15 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
; X32-LABEL: test_mm_setr_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_pd:
-; X64: # BB#0:
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res0 = insertelement <2 x double> undef, double %a0, i32 0
%res1 = insertelement <2 x double> %res0, double %a1, i32 1
@@ -2728,12 +2736,12 @@ define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
define <2 x double> @test_mm_setzero_pd() {
; X32-LABEL: test_mm_setzero_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setzero_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
ret <2 x double> zeroinitializer
@@ -2741,12 +2749,12 @@ define <2 x double> @test_mm_setzero_pd() {
define <2 x i64> @test_mm_setzero_si128() {
; X32-LABEL: test_mm_setzero_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setzero_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
ret <2 x i64> zeroinitializer
@@ -2754,12 +2762,12 @@ define <2 x i64> @test_mm_setzero_si128() {
define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_shuffle_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -2770,12 +2778,12 @@ define <2 x i64> @test_mm_shuffle_epi32(<2 x i64> %a0) {
define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_shuffle_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
@@ -2784,12 +2792,12 @@ define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) {
define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_shufflehi_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shufflehi_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -2800,12 +2808,12 @@ define <2 x i64> @test_mm_shufflehi_epi16(<2 x i64> %a0) {
define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_shufflelo_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shufflelo_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -2816,12 +2824,12 @@ define <2 x i64> @test_mm_shufflelo_epi16(<2 x i64> %a0) {
define <2 x i64> @test_mm_sll_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sll_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psllw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sll_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psllw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -2834,12 +2842,12 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_sll_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sll_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pslld %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sll_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -2852,12 +2860,12 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_sll_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sll_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psllq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sll_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psllq %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
@@ -2867,12 +2875,12 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_slli_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psllw $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_slli_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psllw $1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -2884,12 +2892,12 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_slli_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pslld $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_slli_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pslld $1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -2901,12 +2909,12 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_slli_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psllq $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_slli_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psllq $1, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1)
@@ -2916,12 +2924,12 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_slli_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_slli_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -2932,12 +2940,12 @@ define <2 x i64> @test_mm_slli_si128(<2 x i64> %a0) nounwind {
define <2 x double> @test_mm_sqrt_pd(<2 x double> %a0) nounwind {
; X32-LABEL: test_mm_sqrt_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: sqrtpd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sqrt_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sqrtpd %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
@@ -2947,13 +2955,13 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_sqrt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: sqrtsd %xmm0, %xmm1
; X32-NEXT: movapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sqrt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: sqrtsd %xmm0, %xmm1
; X64-NEXT: movapd %xmm1, %xmm0
; X64-NEXT: retq
@@ -2968,12 +2976,12 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_sra_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sra_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psraw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sra_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psraw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -2986,12 +2994,12 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_sra_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sra_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrad %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sra_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrad %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -3004,12 +3012,12 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_srai_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_srai_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psraw $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srai_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psraw $1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3021,12 +3029,12 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <2 x i64> @test_mm_srai_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_srai_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrad $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srai_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrad $1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -3038,12 +3046,12 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_mm_srl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_srl_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srl_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3056,12 +3064,12 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_srl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_srl_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrld %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srl_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrld %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -3074,12 +3082,12 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_srl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_srl_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srl_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlq %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
@@ -3089,12 +3097,12 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_mm_srli_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_srli_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlw $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srli_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlw $1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3106,12 +3114,12 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <2 x i64> @test_mm_srli_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_srli_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrld $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srli_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrld $1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -3123,12 +3131,12 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_mm_srli_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_srli_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlq $1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srli_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlq $1, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 1)
@@ -3138,12 +3146,12 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_srli_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_srli_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -3154,13 +3162,13 @@ define <2 x i64> @test_mm_srli_si128(<2 x i64> %a0) nounwind {
define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_store_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <2 x double>*
@@ -3170,14 +3178,14 @@ define void @test_mm_store_pd(double *%a0, <2 x double> %a1) {
define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_store_pd1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_pd1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -3189,13 +3197,13 @@ define void @test_mm_store_pd1(double *%a0, <2 x double> %a1) {
define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_store_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
%ext = extractelement <2 x double> %a1, i32 0
@@ -3205,13 +3213,13 @@ define void @test_mm_store_sd(double *%a0, <2 x double> %a1) {
define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_store_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
store <2 x i64> %a1, <2 x i64>* %a0, align 16
@@ -3220,14 +3228,14 @@ define void @test_mm_store_si128(<2 x i64> *%a0, <2 x i64> %a1) {
define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_store1_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_store1_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -3239,14 +3247,14 @@ define void @test_mm_store1_pd(double *%a0, <2 x double> %a1) {
define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storeh_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeh_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
@@ -3257,13 +3265,13 @@ define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_storel_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storel_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
@@ -3275,13 +3283,13 @@ define void @test_mm_storel_epi64(<2 x i64> *%a0, <2 x i64> %a1) {
define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storel_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storel_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
%ext = extractelement <2 x double> %a1, i32 0
@@ -3291,14 +3299,14 @@ define void @test_mm_storel_sd(double *%a0, <2 x double> %a1) {
define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storer_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-NEXT: movapd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storer_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X64-NEXT: movapd %xmm0, (%rdi)
; X64-NEXT: retq
@@ -3310,13 +3318,13 @@ define void @test_mm_storer_pd(double *%a0, <2 x double> %a1) {
define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storeu_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeu_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <2 x double>*
@@ -3326,13 +3334,13 @@ define void @test_mm_storeu_pd(double *%a0, <2 x double> %a1) {
define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_storeu_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeu_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups %xmm0, (%rdi)
; X64-NEXT: retq
store <2 x i64> %a1, <2 x i64>* %a0, align 1
@@ -3341,13 +3349,13 @@ define void @test_mm_storeu_si128(<2 x i64> *%a0, <2 x i64> %a1) {
define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_stream_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntps %xmm0, (%rdi)
; X64-NEXT: retq
%arg0 = bitcast double* %a0 to <2 x double>*
@@ -3357,14 +3365,14 @@ define void @test_mm_stream_pd(double *%a0, <2 x double> %a1) {
define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
; X32-LABEL: test_mm_stream_si32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movntil %eax, (%ecx)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_si32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntil %esi, (%rdi)
; X64-NEXT: retq
store i32 %a1, i32* %a0, align 1, !nontemporal !0
@@ -3373,13 +3381,13 @@ define void @test_mm_stream_si32(i32 *%a0, i32 %a1) {
define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_stream_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntps %xmm0, (%rdi)
; X64-NEXT: retq
store <2 x i64> %a1, <2 x i64>* %a0, align 16, !nontemporal !0
@@ -3388,12 +3396,12 @@ define void @test_mm_stream_si128(<2 x i64> *%a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -3405,12 +3413,12 @@ define <2 x i64> @test_mm_sub_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3422,12 +3430,12 @@ define <2 x i64> @test_mm_sub_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -3439,12 +3447,12 @@ define <2 x i64> @test_mm_sub_epi32(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_sub_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubq %xmm1, %xmm0
; X64-NEXT: retq
%res = sub <2 x i64> %a0, %a1
@@ -3453,12 +3461,12 @@ define <2 x i64> @test_mm_sub_epi64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_sub_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: subpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subpd %xmm1, %xmm0
; X64-NEXT: retq
%res = fsub <2 x double> %a0, %a1
@@ -3467,12 +3475,12 @@ define <2 x double> @test_mm_sub_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_sub_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: subsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sub_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: subsd %xmm1, %xmm0
; X64-NEXT: retq
%ext0 = extractelement <2 x double> %a0, i32 0
@@ -3484,12 +3492,12 @@ define <2 x double> @test_mm_sub_sd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x i64> @test_mm_subs_epi8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubsb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubsb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -3502,12 +3510,12 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3520,12 +3528,12 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubusb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubusb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -3538,12 +3546,12 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <2 x i64> @test_mm_subs_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_subs_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psubusw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_subs_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psubusw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3556,7 +3564,7 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: setnp %al
; X32-NEXT: sete %cl
@@ -3565,7 +3573,7 @@ define i32 @test_mm_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomieq_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: setnp %al
; X64-NEXT: sete %cl
@@ -3579,14 +3587,14 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_mm_ucomige_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomige_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: setae %al
@@ -3598,14 +3606,14 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_mm_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomigt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -3617,14 +3625,14 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_mm_ucomile_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm0, %xmm1
; X32-NEXT: setae %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomile_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm0, %xmm1
; X64-NEXT: setae %al
@@ -3636,14 +3644,14 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_mm_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomilt_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ucomisd %xmm0, %xmm1
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomilt_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ucomisd %xmm0, %xmm1
; X64-NEXT: seta %al
@@ -3655,7 +3663,7 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: ucomisd %xmm1, %xmm0
; X32-NEXT: setp %al
; X32-NEXT: setne %cl
@@ -3664,7 +3672,7 @@ define i32 @test_mm_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ucomineq_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: ucomisd %xmm1, %xmm0
; X64-NEXT: setp %al
; X64-NEXT: setne %cl
@@ -3678,34 +3686,34 @@ declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind read
define <2 x double> @test_mm_undefined_pd() {
; X32-LABEL: test_mm_undefined_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <2 x double> undef
}
define <2 x i64> @test_mm_undefined_si128() {
; X32-LABEL: test_mm_undefined_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: test_mm_undefined_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
ret <2 x i64> undef
}
define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -3717,12 +3725,12 @@ define <2 x i64> @test_mm_unpackhi_epi8(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3734,13 +3742,13 @@ define <2 x i64> @test_mm_unpackhi_epi16(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi32:
-; X32: # BB#0:
-; X32-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X32: # %bb.0:
+; X32-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi32:
-; X64: # BB#0:
-; X64-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64: # %bb.0:
+; X64-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
@@ -3751,13 +3759,13 @@ define <2 x i64> @test_mm_unpackhi_epi32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpackhi_epi64:
-; X32: # BB#0:
-; X32-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X32: # %bb.0:
+; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_epi64:
-; X64: # BB#0:
-; X64-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; X64: # %bb.0:
+; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %res
@@ -3765,12 +3773,12 @@ define <2 x i64> @test_mm_unpackhi_epi64(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_unpackhi_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpackhi_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
@@ -3779,12 +3787,12 @@ define <2 x double> @test_mm_unpackhi_pd(<2 x double> %a0, <2 x double> %a1) {
define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -3796,12 +3804,12 @@ define <2 x i64> @test_mm_unpacklo_epi8(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -3813,13 +3821,13 @@ define <2 x i64> @test_mm_unpacklo_epi16(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi32:
-; X32: # BB#0:
-; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32: # %bb.0:
+; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi32:
-; X64: # BB#0:
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64: # %bb.0:
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%arg1 = bitcast <2 x i64> %a1 to <4 x i32>
@@ -3830,13 +3838,13 @@ define <2 x i64> @test_mm_unpacklo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_unpacklo_epi64:
-; X32: # BB#0:
-; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: # %bb.0:
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_epi64:
-; X64: # BB#0:
-; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %res
@@ -3844,13 +3852,13 @@ define <2 x i64> @test_mm_unpacklo_epi64(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_unpacklo_pd:
-; X32: # BB#0:
-; X32-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X32: # %bb.0:
+; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_unpacklo_pd:
-; X64: # BB#0:
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
ret <2 x double> %res
@@ -3858,12 +3866,12 @@ define <2 x double> @test_mm_unpacklo_pd(<2 x double> %a0, <2 x double> %a1) {
define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; X32-LABEL: test_mm_xor_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_xor_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x double> %a0 to <4 x i32>
@@ -3875,12 +3883,12 @@ define <2 x double> @test_mm_xor_pd(<2 x double> %a0, <2 x double> %a1) nounwind
define <2 x i64> @test_mm_xor_si128(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_xor_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_xor_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: retq
%res = xor <2 x i64> %a0, %a1
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 02c2420d9fae..3571e2968bf8 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -3,7 +3,7 @@
define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq_bs:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
@@ -14,7 +14,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq_bs:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
@@ -24,7 +24,7 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
@@ -35,7 +35,7 @@ declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
@@ -46,7 +46,7 @@ declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
@@ -57,7 +57,7 @@ declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtps2pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cvtps2pd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
@@ -68,7 +68,7 @@ declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_sse2_storel_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movlps %xmm0, (%eax)
; CHECK-NEXT: retl
@@ -81,7 +81,7 @@ declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
; add operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_dq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: psubb %xmm1, %xmm0
@@ -97,7 +97,7 @@ declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
; fadd operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_pd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: xorpd %xmm1, %xmm1
; CHECK-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
@@ -112,7 +112,7 @@ declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) {
; CHECK-LABEL: test_x86_sse2_pshuf_d:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retl
entry:
@@ -123,7 +123,7 @@ declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone
define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) {
; CHECK-LABEL: test_x86_sse2_pshufl_w:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NEXT: retl
entry:
@@ -134,7 +134,7 @@ declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone
define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) {
; CHECK-LABEL: test_x86_sse2_pshufh_w:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; CHECK-NEXT: retl
entry:
@@ -145,7 +145,7 @@ declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone
define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: max_epu8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmaxub %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
@@ -155,7 +155,7 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: min_epu8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pminub %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
@@ -165,7 +165,7 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: max_epi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmaxsw %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
@@ -175,7 +175,7 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: min_epi16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pminsw %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
@@ -185,21 +185,21 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_add_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: addsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x58,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_add_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x58,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_add_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x58,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_add_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: addsd %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -210,21 +210,21 @@ declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_sub_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: subsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5c,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_sub_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5c,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_sub_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5c,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_sub_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: subsd %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -235,21 +235,21 @@ declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_mul_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: mulsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x59,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_mul_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x59,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_mul_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x59,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_mul_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: mulsd %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -260,21 +260,21 @@ declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_div_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: divsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5e,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_div_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5e,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_div_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x5e,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
; CHECK-LABEL: test_x86_sse2_div_sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: divsd %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -282,5 +282,24 @@ define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
}
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: mm_avg_epu8:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pavgb %xmm1, %xmm0
+; CHECK-NEXT: retl
+ %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {
+; CHECK-LABEL: mm_avg_epu16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pavgw %xmm1, %xmm0
+; CHECK-NEXT: retl
+ %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index b0a8744f5d80..e3c02b625fb5 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -5,12 +5,12 @@
define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_cmp_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cmpordpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xc2,0xc1,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cmp_pd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcmpordpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc2,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
@@ -21,12 +21,12 @@ declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounw
define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_cmp_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cmpordsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0xc2,0xc1,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cmp_sd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcmpordsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xc2,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
@@ -37,7 +37,7 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_comieq_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: comisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2f,0xc1]
; SSE-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -46,7 +46,7 @@ define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_comieq_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -55,7 +55,7 @@ define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_comieq_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -70,21 +70,21 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_comige_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2f,0xc1]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_comige_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_comige_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -97,21 +97,21 @@ declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_comigt_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2f,0xc1]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_comigt_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_comigt_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -124,21 +124,21 @@ declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_comile_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comisd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x2f,0xc8]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_comile_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2f,0xc8]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_comile_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc8]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -151,21 +151,21 @@ declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_comilt_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: comisd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x2f,0xc8]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_comilt_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vcomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2f,0xc8]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_comilt_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vcomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc8]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -178,7 +178,7 @@ declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readno
define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_comineq_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: comisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2f,0xc1]
; SSE-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -187,7 +187,7 @@ define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_comineq_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -196,7 +196,7 @@ define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_comineq_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -211,17 +211,17 @@ declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readn
define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
; SSE-LABEL: test_x86_sse2_cvtdq2ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ## encoding: [0x0f,0x5b,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtdq2ps:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5b,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtdq2ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
@@ -232,17 +232,17 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvtpd2dq:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtpd2dq %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0xe6,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtpd2dq:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xe6,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtpd2dq:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
@@ -253,17 +253,17 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_cvtpd_epi32_zext(<2 x double> %a0) nounwind {
; SSE-LABEL: test_mm_cvtpd_epi32_zext:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtpd2dq %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0xe6,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_mm_cvtpd_epi32_zext:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xe6,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_mm_cvtpd_epi32_zext:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%cvt = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
@@ -273,19 +273,45 @@ define <2 x i64> @test_mm_cvtpd_epi32_zext(<2 x double> %a0) nounwind {
}
+define <2 x i64> @test_mm_cvtpd_epi32_zext_load(<2 x double>* %p0) nounwind {
+; SSE-LABEL: test_mm_cvtpd_epi32_zext_load:
+; SSE: ## %bb.0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SSE-NEXT: cvtpd2dq (%eax), %xmm0 ## encoding: [0xf2,0x0f,0xe6,0x00]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_mm_cvtpd_epi32_zext_load:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; AVX2-NEXT: vcvtpd2dqx (%eax), %xmm0 ## encoding: [0xc5,0xfb,0xe6,0x00]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_mm_cvtpd_epi32_zext_load:
+; SKX: ## %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SKX-NEXT: vcvtpd2dqx (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0x00]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %a0 = load <2 x double>, <2 x double>* %p0
+ %cvt = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
+ %res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+
define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvtpd2ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtpd2ps %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x5a,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtpd2ps:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5a,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtpd2ps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
@@ -295,17 +321,17 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind {
; SSE-LABEL: test_x86_sse2_cvtpd2ps_zext:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtpd2ps %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x5a,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtpd2ps_zext:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5a,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
@@ -313,14 +339,38 @@ define <4 x float> @test_x86_sse2_cvtpd2ps_zext(<2 x double> %a0) nounwind {
ret <4 x float> %res
}
+define <4 x float> @test_x86_sse2_cvtpd2ps_zext_load(<2 x double>* %p0) nounwind {
+; SSE-LABEL: test_x86_sse2_cvtpd2ps_zext_load:
+; SSE: ## %bb.0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SSE-NEXT: cvtpd2ps (%eax), %xmm0 ## encoding: [0x66,0x0f,0x5a,0x00]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_cvtpd2ps_zext_load:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; AVX2-NEXT: vcvtpd2psx (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x5a,0x00]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_cvtpd2ps_zext_load:
+; SKX: ## %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SKX-NEXT: vcvtpd2psx (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0x00]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %a0 = load <2 x double>, <2 x double>* %p0
+ %cvt = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
+ %res = shufflevector <4 x float> %cvt, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x float> %res
+}
+
define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse2_cvtps2dq:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtps2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x5b,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtps2dq:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcvtps2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5b,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
@@ -331,17 +381,17 @@ declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvtsd2si:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtsd2si %xmm0, %eax ## encoding: [0xf2,0x0f,0x2d,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtsd2si:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtsd2si %xmm0, %eax ## encoding: [0xc5,0xfb,0x2d,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtsd2si:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtsd2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2d,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
@@ -352,12 +402,12 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_cvtsd2ss:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtsd2ss %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5a,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtsd2ss:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcvtsd2ss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5a,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
@@ -368,13 +418,13 @@ declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind
define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %p1) {
; SSE-LABEL: test_x86_sse2_cvtsd2ss_load:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: cvtsd2ss (%eax), %xmm0 ## encoding: [0xf2,0x0f,0x5a,0x00]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtsd2ss_load:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vcvtsd2ss (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5a,0x00]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -386,13 +436,13 @@ define <4 x float> @test_x86_sse2_cvtsd2ss_load(<4 x float> %a0, <2 x double>* %
define <4 x float> @test_x86_sse2_cvtsd2ss_load_optsize(<4 x float> %a0, <2 x double>* %p1) optsize {
; SSE-LABEL: test_x86_sse2_cvtsd2ss_load_optsize:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: cvtsd2ss (%eax), %xmm0 ## encoding: [0xf2,0x0f,0x5a,0x00]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtsd2ss_load_optsize:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vcvtsd2ss (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5a,0x00]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -404,17 +454,17 @@ define <4 x float> @test_x86_sse2_cvtsd2ss_load_optsize(<4 x float> %a0, <2 x do
define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
; SSE-LABEL: test_x86_sse2_cvtsi2sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtsi2sdl {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf2,0x0f,0x2a,0x44,0x24,0x04]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtsi2sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x2a,0x44,0x24,0x04]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtsi2sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2a,0x44,0x24,0x04]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 %a1) ; <<2 x double>> [#uses=1]
@@ -425,12 +475,12 @@ declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnon
define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse2_cvtss2sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtss2sd %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5a,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtss2sd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5a,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
@@ -441,13 +491,13 @@ declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind
define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>* %p1) {
; SSE-LABEL: test_x86_sse2_cvtss2sd_load:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: cvtss2sd (%eax), %xmm0 ## encoding: [0xf3,0x0f,0x5a,0x00]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtss2sd_load:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vcvtss2sd (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5a,0x00]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -459,13 +509,13 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, <4 x float>*
define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, <4 x float>* %p1) optsize {
; SSE-LABEL: test_x86_sse2_cvtss2sd_load_optsize:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: cvtss2sd (%eax), %xmm0 ## encoding: [0xf3,0x0f,0x5a,0x00]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_cvtss2sd_load_optsize:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vcvtss2sd (%eax), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5a,0x00]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -477,17 +527,17 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, <4 x
define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttpd2dq:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvttpd2dq:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvttpd2dq:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
@@ -498,17 +548,17 @@ declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind {
; SSE-LABEL: test_mm_cvttpd_epi32_zext:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0xe6,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_mm_cvttpd_epi32_zext:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_mm_cvttpd_epi32_zext:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
@@ -518,19 +568,45 @@ define <2 x i64> @test_mm_cvttpd_epi32_zext(<2 x double> %a0) nounwind {
}
+define <2 x i64> @test_mm_cvttpd_epi32_zext_load(<2 x double>* %p0) nounwind {
+; SSE-LABEL: test_mm_cvttpd_epi32_zext_load:
+; SSE: ## %bb.0:
+; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SSE-NEXT: cvttpd2dq (%eax), %xmm0 ## encoding: [0x66,0x0f,0xe6,0x00]
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_mm_cvttpd_epi32_zext_load:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; AVX2-NEXT: vcvttpd2dqx (%eax), %xmm0 ## encoding: [0xc5,0xf9,0xe6,0x00]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_mm_cvttpd_epi32_zext_load:
+; SKX: ## %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SKX-NEXT: vcvttpd2dqx (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0x00]
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %a0 = load <2 x double>, <2 x double>* %p0
+ %cvt = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)
+ %res = shufflevector <4 x i32> %cvt, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %bc = bitcast <4 x i32> %res to <2 x i64>
+ ret <2 x i64> %bc
+}
+
+
define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
; SSE-LABEL: test_x86_sse2_cvttps2dq:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0 ## encoding: [0xf3,0x0f,0x5b,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvttps2dq:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5b,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvttps2dq:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
@@ -541,17 +617,17 @@ declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_cvttsd2si:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %eax ## encoding: [0xf2,0x0f,0x2c,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvttsd2si:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttsd2si %xmm0, %eax ## encoding: [0xc5,0xfb,0x2c,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvttsd2si:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttsd2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2c,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
@@ -562,17 +638,17 @@ declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_max_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: maxpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x5f,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_max_pd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5f,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_max_pd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5f,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -583,17 +659,17 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_max_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: maxsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5f,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_max_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5f,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_max_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x5f,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -604,17 +680,17 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_min_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: minpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x5d,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_min_pd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vminpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5d,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_min_pd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vminpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5d,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -625,17 +701,17 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_min_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: minsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5d,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_min_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5d,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_min_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vminsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x5d,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -646,12 +722,12 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_movmsk_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movmskpd %xmm0, %eax ## encoding: [0x66,0x0f,0x50,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_movmsk_pd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vmovmskpd %xmm0, %eax ## encoding: [0xc5,0xf9,0x50,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
@@ -660,21 +736,19 @@ define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
-
-
define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_sse2_packssdw_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: packssdw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6b,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_packssdw_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_packssdw_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
@@ -683,19 +757,45 @@ define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+define <8 x i16> @test_x86_sse2_packssdw_128_fold() {
+; SSE-LABEL: test_x86_sse2_packssdw_128_fold:
+; SSE: ## %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE-NEXT: ## fixup A - offset: 3, value: LCPI35_0, kind: FK_Data_4
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_packssdw_128_fold:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; AVX2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI35_0, kind: FK_Data_4
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_packssdw_128_fold:
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovaps LCPI35_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,32767,32767,65535,32768]
+; SKX-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 4, value: LCPI35_0, kind: FK_Data_4
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
+ ret <8 x i16> %res
+}
+
+
define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_packsswb_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: packsswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x63,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_packsswb_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_packsswb_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
@@ -704,19 +804,45 @@ define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
+; SSE-LABEL: test_x86_sse2_packsswb_128_fold:
+; SSE: ## %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE-NEXT: ## fixup A - offset: 3, value: LCPI37_0, kind: FK_Data_4
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_packsswb_128_fold:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI37_0, kind: FK_Data_4
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_packsswb_128_fold:
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovaps LCPI37_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; SKX-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 4, value: LCPI37_0, kind: FK_Data_4
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
+ ret <16 x i8> %res
+}
+
+
define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_packuswb_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: packuswb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x67,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_packuswb_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x67,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_packuswb_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
@@ -725,19 +851,45 @@ define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+define <16 x i8> @test_x86_sse2_packuswb_128_fold() {
+; SSE-LABEL: test_x86_sse2_packuswb_128_fold:
+; SSE: ## %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE-NEXT: ## fixup A - offset: 3, value: LCPI39_0, kind: FK_Data_4
+; SSE-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_packuswb_128_fold:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI39_0, kind: FK_Data_4
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_packuswb_128_fold:
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovaps LCPI39_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; SKX-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 4, value: LCPI39_0, kind: FK_Data_4
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer)
+ ret <16 x i8> %res
+}
+
+
define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_padds_b:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_padds_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_padds_b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -748,17 +900,17 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_padds_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_padds_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_padds_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -769,17 +921,17 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_paddus_b:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: paddusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdc,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_paddus_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_paddus_b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -790,17 +942,17 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_paddus_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: paddusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xdd,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_paddus_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_paddus_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -809,61 +961,19 @@ define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
-define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_x86_sse2_pavg_b:
-; SSE: ## BB#0:
-; SSE-NEXT: pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_pavg_b:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_pavg_b:
-; SKX: ## BB#0:
-; SKX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
- ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_x86_sse2_pavg_w:
-; SSE: ## BB#0:
-; SSE-NEXT: pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_pavg_w:
-; AVX2: ## BB#0:
-; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_pavg_w:
-; SKX: ## BB#0:
-; SKX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
-; SKX-NEXT: retl ## encoding: [0xc3]
- %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
- ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pmadd_wd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmaddwd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf5,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmadd_wd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf5,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmadd_wd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
@@ -874,17 +984,17 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pmaxs_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmaxsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xee,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmaxs_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmaxs_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -895,17 +1005,17 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_pmaxu_b:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmaxub %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xde,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmaxu_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xde,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmaxu_b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xde,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -916,17 +1026,17 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pmins_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pminsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xea,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmins_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmins_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -937,17 +1047,17 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_pminu_b:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pminub %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xda,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pminu_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xda,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pminu_b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminub %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xda,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -958,12 +1068,12 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
; SSE-LABEL: test_x86_sse2_pmovmskb_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmovmskb %xmm0, %eax ## encoding: [0x66,0x0f,0xd7,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse2_pmovmskb_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpmovmskb %xmm0, %eax ## encoding: [0xc5,0xf9,0xd7,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
@@ -974,17 +1084,17 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pmulh_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmulhw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe5,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmulh_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe5,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmulh_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -995,17 +1105,17 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pmulhu_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmulhuw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe4,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmulhu_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe4,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmulhu_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -1016,17 +1126,17 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_sse2_pmulu_dq:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmuludq %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf4,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pmulu_dq:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pmulu_dq:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
@@ -1037,17 +1147,17 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_psad_bw:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psadbw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf6,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psad_bw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf6,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psad_bw:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf6,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
@@ -1058,17 +1168,17 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_sse2_psll_d:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pslld %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf2,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psll_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpslld %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf2,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psll_d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpslld %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf2,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -1079,17 +1189,17 @@ declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_x86_sse2_psll_q:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psllq %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf3,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psll_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf3,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psll_q:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf3,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
@@ -1100,17 +1210,17 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psll_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psllw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xf1,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psll_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf1,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psll_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf1,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -1121,17 +1231,17 @@ declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
; SSE-LABEL: test_x86_sse2_pslli_d:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pslld $7, %xmm0 ## encoding: [0x66,0x0f,0x72,0xf0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pslli_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpslld $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pslli_d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpslld $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
@@ -1142,17 +1252,17 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
; SSE-LABEL: test_x86_sse2_pslli_q:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psllq $7, %xmm0 ## encoding: [0x66,0x0f,0x73,0xf0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pslli_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllq $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x73,0xf0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pslli_q:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllq $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
@@ -1163,17 +1273,17 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
; SSE-LABEL: test_x86_sse2_pslli_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psllw $7, %xmm0 ## encoding: [0x66,0x0f,0x71,0xf0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pslli_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pslli_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
@@ -1184,17 +1294,17 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_sse2_psra_d:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrad %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe2,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psra_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe2,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psra_d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe2,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -1205,17 +1315,17 @@ declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psra_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psraw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe1,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psra_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe1,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psra_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe1,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -1226,17 +1336,17 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
; SSE-LABEL: test_x86_sse2_psrai_d:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrad $7, %xmm0 ## encoding: [0x66,0x0f,0x72,0xe0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrai_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrad $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrai_d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrad $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
@@ -1247,17 +1357,17 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
; SSE-LABEL: test_x86_sse2_psrai_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psraw $7, %xmm0 ## encoding: [0x66,0x0f,0x71,0xe0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrai_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsraw $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrai_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsraw $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
@@ -1268,17 +1378,17 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_sse2_psrl_d:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrld %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd2,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrl_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd2,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrl_d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -1289,17 +1399,17 @@ declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_x86_sse2_psrl_q:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrlq %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd3,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrl_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd3,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrl_q:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
@@ -1310,17 +1420,17 @@ declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psrl_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrlw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd1,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrl_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrl_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -1331,17 +1441,17 @@ declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
; SSE-LABEL: test_x86_sse2_psrli_d:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrld $7, %xmm0 ## encoding: [0x66,0x0f,0x72,0xd0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrli_d:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrld $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xd0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrli_d:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrld $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xd0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
@@ -1352,17 +1462,17 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
; SSE-LABEL: test_x86_sse2_psrli_q:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrlq $7, %xmm0 ## encoding: [0x66,0x0f,0x73,0xd0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrli_q:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x73,0xd0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrli_q:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrlq $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xd0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
@@ -1373,17 +1483,17 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
; SSE-LABEL: test_x86_sse2_psrli_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psrlw $7, %xmm0 ## encoding: [0x66,0x0f,0x71,0xd0,0x07]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psrli_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xd0,0x07]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psrli_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrlw $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xd0,0x07]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
@@ -1394,17 +1504,17 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_psubs_b:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubs_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubs_b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -1415,17 +1525,17 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psubs_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubs_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubs_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -1436,17 +1546,17 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_psubus_b:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psubusb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd8,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubus_b:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubus_b:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -1457,17 +1567,17 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_psubus_w:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psubusw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd9,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_psubus_w:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_psubus_w:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -1478,14 +1588,19 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_sqrt_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_pd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_pd:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_pd:
+; SKX: ## %bb.0:
+; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1494,14 +1609,19 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
; SSE-LABEL: test_x86_sse2_sqrt_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse2_sqrt_sd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_sqrt_sd:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_sqrt_sd:
+; SKX: ## %bb.0:
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -1510,24 +1630,24 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
; SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: movapd (%eax), %xmm0 ## encoding: [0x66,0x0f,0x28,0x00]
; SSE-NEXT: sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX2-NEXT: vmovapd (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x00]
; AVX2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_sqrt_sd_vec_load:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SKX-NEXT: vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
-; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%a1 = load <2 x double>, <2 x double>* %a0, align 16
%res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -1537,7 +1657,7 @@ define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_ucomieq_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: ucomisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2e,0xc1]
; SSE-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SSE-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -1546,7 +1666,7 @@ define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_ucomieq_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
; AVX2-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; AVX2-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -1555,7 +1675,7 @@ define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_ucomieq_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
; SKX-NEXT: setnp %al ## encoding: [0x0f,0x9b,0xc0]
; SKX-NEXT: sete %cl ## encoding: [0x0f,0x94,0xc1]
@@ -1570,21 +1690,21 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_ucomige_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2e,0xc1]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_ucomige_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_ucomige_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -1597,21 +1717,21 @@ declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_ucomigt_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2e,0xc1]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_ucomigt_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_ucomigt_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -1624,21 +1744,21 @@ declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_ucomile_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomisd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x2e,0xc8]
; SSE-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_ucomile_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2e,0xc8]
; AVX2-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_ucomile_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc8]
; SKX-NEXT: setae %al ## encoding: [0x0f,0x93,0xc0]
@@ -1651,21 +1771,21 @@ declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_ucomilt_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE-NEXT: ucomisd %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x2e,0xc8]
; SSE-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_ucomilt_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; AVX2-NEXT: vucomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2e,0xc8]
; AVX2-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_ucomilt_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SKX-NEXT: vucomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc8]
; SKX-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -1678,7 +1798,7 @@ declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readn
define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse2_ucomineq_sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: ucomisd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x2e,0xc1]
; SSE-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SSE-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -1687,7 +1807,7 @@ define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_ucomineq_sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
; AVX2-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; AVX2-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -1696,7 +1816,7 @@ define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_ucomineq_sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
; SKX-NEXT: setp %al ## encoding: [0x0f,0x9a,0xc0]
; SKX-NEXT: setne %cl ## encoding: [0x0f,0x95,0xc1]
@@ -1710,7 +1830,7 @@ declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind read
define void @test_x86_sse2_pause() {
; CHECK-LABEL: test_x86_sse2_pause:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pause ## encoding: [0xf3,0x90]
; CHECK-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.sse2.pause()
@@ -1720,7 +1840,7 @@ declare void @llvm.x86.sse2.pause() nounwind
define void @lfence() nounwind {
; CHECK-LABEL: lfence:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: lfence ## encoding: [0x0f,0xae,0xe8]
; CHECK-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.sse2.lfence()
@@ -1730,7 +1850,7 @@ declare void @llvm.x86.sse2.lfence() nounwind
define void @mfence() nounwind {
; CHECK-LABEL: mfence:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: mfence ## encoding: [0x0f,0xae,0xf0]
; CHECK-NEXT: retl ## encoding: [0xc3]
tail call void @llvm.x86.sse2.mfence()
@@ -1740,7 +1860,7 @@ declare void @llvm.x86.sse2.mfence() nounwind
define void @clflush(i8* %p) nounwind {
; CHECK-LABEL: clflush:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; CHECK-NEXT: clflush (%eax) ## encoding: [0x0f,0xae,0x38]
; CHECK-NEXT: retl ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-x86_64.ll
index cd5e11e12795..41b4b2905dc5 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86_64.ll
@@ -5,21 +5,21 @@
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsd2si %xmm0, %rax
; CHECK-NEXT: retq
; SSE-LABEL: test_x86_sse2_cvtsd2si64:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtsd2si %xmm0, %rax ## encoding: [0xf2,0x48,0x0f,0x2d,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtsd2si64:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtsd2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfb,0x2d,0xc0]
; AVX2-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtsd2si64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtsd2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfb,0x2d,0xc0]
; SKX-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
@@ -30,21 +30,21 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
; CHECK-LABEL: test_x86_sse2_cvtsi642sd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
; CHECK-NEXT: retq
; SSE-LABEL: test_x86_sse2_cvtsi642sd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvtsi2sdq %rdi, %xmm0 ## encoding: [0xf2,0x48,0x0f,0x2a,0xc7]
; SSE-NEXT: retq ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvtsi642sd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfb,0x2a,0xc7]
; AVX2-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvtsi642sd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfb,0x2a,0xc7]
; SKX-NEXT: retq ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
@@ -55,21 +55,21 @@ declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readn
define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
; CHECK-LABEL: test_x86_sse2_cvttsd2si64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttsd2si %xmm0, %rax
; CHECK-NEXT: retq
; SSE-LABEL: test_x86_sse2_cvttsd2si64:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %rax ## encoding: [0xf2,0x48,0x0f,0x2c,0xc0]
; SSE-NEXT: retq ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_cvttsd2si64:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vcvttsd2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfb,0x2c,0xc0]
; AVX2-NEXT: retq ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_cvttsd2si64:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vcvttsd2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfb,0x2c,0xc0]
; SKX-NEXT: retq ## encoding: [0xc3]
%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll
index 62c194f2fc4b..ad2edfe0959e 100644
--- a/test/CodeGen/X86/sse2-schedule.ll
+++ b/test/CodeGen/X86/sse2-schedule.ll
@@ -1,56 +1,76 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_addpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: addpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: addpd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_addpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: addpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: addpd (%rdi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_addpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_addpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fadd <2 x double> %1, %2
@@ -59,46 +79,64 @@ define <2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define double @test_addsd(double %a0, double %a1, double *%a2) {
; GENERIC-LABEL: test_addsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addsd %xmm1, %xmm0
-; GENERIC-NEXT: addsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: addsd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_addsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: addsd %xmm1, %xmm0
-; ATOM-NEXT: addsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: addsd (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_addsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addsd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_addsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fadd double %1, %2
@@ -107,53 +145,74 @@ define double @test_addsd(double %a0, double %a1, double *%a2) {
define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_andpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andpd %xmm1, %xmm0
-; GENERIC-NEXT: andpd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andpd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: andpd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_andpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: andpd %xmm1, %xmm0
-; ATOM-NEXT: andpd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: andpd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: andpd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_andpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: andpd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: andpd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_andpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = and <4 x i32> %1, %2
@@ -167,53 +226,74 @@ define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_andnotpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: andnpd %xmm1, %xmm0
-; GENERIC-NEXT: andnpd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: andnpd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: andnpd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_andnotpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: andnpd %xmm1, %xmm0
-; ATOM-NEXT: andnpd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: andnpd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: andnpd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_andnotpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: andnpd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: andnpd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_andnotpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_andnotpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_andnotpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_andnotpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_andnotpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_andnotpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -227,55 +307,138 @@ define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
ret <2 x double> %10
}
+define void @test_clflush(i8* %p){
+; GENERIC-LABEL: test_clflush:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: clflush (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_clflush:
+; ATOM: # %bb.0:
+; ATOM-NEXT: clflush (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_clflush:
+; SLM: # %bb.0:
+; SLM-NEXT: clflush (%rdi) # sched: [3:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_clflush:
+; SANDY: # %bb.0:
+; SANDY-NEXT: clflush (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_clflush:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: clflush (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_clflush:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: clflush (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_clflush:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: clflush (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_clflush:
+; SKX: # %bb.0:
+; SKX-NEXT: clflush (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_clflush:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: clflush (%rdi) # sched: [5:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_clflush:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: clflush (%rdi) # sched: [8:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.sse2.clflush(i8* %p)
+ ret void
+}
+declare void @llvm.x86.sse2.clflush(i8*) nounwind
+
define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_cmppd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cmpeqpd %xmm0, %xmm1
-; GENERIC-NEXT: cmpeqpd (%rdi), %xmm0
-; GENERIC-NEXT: orpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpeqpd %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: cmpeqpd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: orpd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cmppd:
-; ATOM: # BB#0:
-; ATOM-NEXT: cmpeqpd %xmm0, %xmm1
-; ATOM-NEXT: cmpeqpd (%rdi), %xmm0
-; ATOM-NEXT: orpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cmpeqpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: cmpeqpd (%rdi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cmppd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cmpeqpd %xmm0, %xmm1 # sched: [3:1.00]
; SLM-NEXT: cmpeqpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cmppd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmppd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; HASWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmppd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
+; BROADWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmppd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmppd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqpd %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vcmpeqpd (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cmppd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmppd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fcmp oeq <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fcmp oeq <2 x double> %a0, %2
@@ -287,46 +450,64 @@ define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define double @test_cmpsd(double %a0, double %a1, double *%a2) {
; GENERIC-LABEL: test_cmpsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cmpeqsd %xmm1, %xmm0
-; GENERIC-NEXT: cmpeqsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cmpeqsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: cmpeqsd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cmpsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: cmpeqsd %xmm1, %xmm0
-; ATOM-NEXT: cmpeqsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cmpeqsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: cmpeqsd (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cmpsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cmpeqsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: cmpeqsd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cmpsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cmpsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cmpsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cmpsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cmpsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cmpsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmpsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <2 x double> undef, double %a0, i32 0
%2 = insertelement <2 x double> undef, double %a1, i32 0
%3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0)
@@ -340,35 +521,35 @@ declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounw
define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_comisd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: comisd %xmm1, %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %cl
-; GENERIC-NEXT: andb %al, %cl
-; GENERIC-NEXT: comisd (%rdi), %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %dl
-; GENERIC-NEXT: andb %al, %dl
-; GENERIC-NEXT: orb %cl, %dl
-; GENERIC-NEXT: movzbl %dl, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: comisd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %cl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %cl # sched: [1:0.33]
+; GENERIC-NEXT: comisd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %dl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %dl # sched: [1:0.33]
+; GENERIC-NEXT: orb %cl, %dl # sched: [1:0.33]
+; GENERIC-NEXT: movzbl %dl, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_comisd:
-; ATOM: # BB#0:
-; ATOM-NEXT: comisd %xmm1, %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %cl
-; ATOM-NEXT: andb %al, %cl
-; ATOM-NEXT: comisd (%rdi), %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %dl
-; ATOM-NEXT: andb %al, %dl
-; ATOM-NEXT: orb %cl, %dl
-; ATOM-NEXT: movzbl %dl, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: comisd %xmm1, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %cl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %cl # sched: [1:0.50]
+; ATOM-NEXT: comisd (%rdi), %xmm0 # sched: [10:5.00]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %dl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %dl # sched: [1:0.50]
+; ATOM-NEXT: orb %cl, %dl # sched: [1:0.50]
+; ATOM-NEXT: movzbl %dl, %eax # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_comisd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: comisd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: setnp %al # sched: [1:0.50]
; SLM-NEXT: sete %cl # sched: [1:0.50]
@@ -382,35 +563,77 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_comisd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %cl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %dl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_comisd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %cl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
-; HASWELL-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vcomisd (%rdi), %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %dl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_comisd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %cl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %cl # sched: [1:0.25]
+; BROADWELL-NEXT: vcomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %dl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_comisd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %cl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKYLAKE-NEXT: vcomisd (%rdi), %xmm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %dl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_comisd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %cl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKX-NEXT: vcomisd (%rdi), %xmm0 # sched: [8:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %dl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKX-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKX-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_comisd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setnp %al # sched: [1:0.50]
; BTVER2-NEXT: sete %cl # sched: [1:0.50]
@@ -424,7 +647,7 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_comisd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
@@ -435,7 +658,7 @@ define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 8
%3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2)
@@ -446,53 +669,74 @@ declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readno
define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
; GENERIC-LABEL: test_cvtdq2pd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtdq2pd %xmm0, %xmm1
-; GENERIC-NEXT: cvtdq2pd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtdq2pd %xmm0, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvtdq2pd (%rdi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtdq2pd:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtdq2pd %xmm0, %xmm1
-; ATOM-NEXT: cvtdq2pd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtdq2pd %xmm0, %xmm1 # sched: [8:4.00]
+; ATOM-NEXT: cvtdq2pd (%rdi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtdq2pd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtdq2pd %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvtdq2pd (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtdq2pd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2pd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [10:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtdq2pd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtdq2pd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [11:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtdq2pd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [11:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtdq2pd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2pd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%2 = sitofp <2 x i32> %1 to <2 x double>
%3 = load <4 x i32>, <4 x i32>*%a1, align 16
@@ -504,54 +748,75 @@ define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) {
define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
; GENERIC-LABEL: test_cvtdq2ps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtdq2ps %xmm0, %xmm1
-; GENERIC-NEXT: cvtdq2ps (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtdq2ps %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: cvtdq2ps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtdq2ps:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtdq2ps (%rdi), %xmm1
-; ATOM-NEXT: cvtdq2ps %xmm0, %xmm0
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtdq2ps (%rdi), %xmm1 # sched: [7:3.50]
+; ATOM-NEXT: cvtdq2ps %xmm0, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtdq2ps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtdq2ps %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvtdq2ps (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtdq2ps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtdq2ps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtdq2ps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtdq2ps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [10:0.50]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtdq2ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [10:0.50]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtdq2ps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtdq2ps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp <4 x i32> %a0 to <4 x float>
%2 = load <4 x i32>, <4 x i32>*%a1, align 16
%3 = sitofp <4 x i32> %2 to <4 x float>
@@ -561,54 +826,75 @@ define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) {
define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_cvtpd2dq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtpd2dq %xmm0, %xmm1
-; GENERIC-NEXT: cvtpd2dq (%rdi), %xmm0
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtpd2dq %xmm0, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvtpd2dq (%rdi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtpd2dq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtpd2dq (%rdi), %xmm1
-; ATOM-NEXT: cvtpd2dq %xmm0, %xmm0
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtpd2dq (%rdi), %xmm1 # sched: [8:4.00]
+; ATOM-NEXT: cvtpd2dq %xmm0, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtpd2dq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtpd2dq %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvtpd2dq (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtpd2dq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2dq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpd2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpd2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpd2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtpd2dq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtpd2dq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2)
@@ -619,54 +905,75 @@ declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_cvtpd2ps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtpd2ps %xmm0, %xmm1
-; GENERIC-NEXT: cvtpd2ps (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtpd2ps %xmm0, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvtpd2ps (%rdi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtpd2ps:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtpd2ps (%rdi), %xmm1
-; ATOM-NEXT: cvtpd2ps %xmm0, %xmm0
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtpd2ps (%rdi), %xmm1 # sched: [8:4.00]
+; ATOM-NEXT: cvtpd2ps %xmm0, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtpd2ps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtpd2ps %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvtpd2ps (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtpd2ps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtpd2ps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtpd2ps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtpd2ps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtpd2ps:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtpd2ps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtpd2ps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [12:1.00]
-; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [11:1.00]
+; ZNVER1-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2)
@@ -677,54 +984,75 @@ declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_cvtps2dq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtps2dq %xmm0, %xmm1
-; GENERIC-NEXT: cvtps2dq (%rdi), %xmm0
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtps2dq %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: cvtps2dq (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtps2dq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtps2dq (%rdi), %xmm1
-; ATOM-NEXT: cvtps2dq %xmm0, %xmm0
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtps2dq (%rdi), %xmm1 # sched: [7:3.50]
+; ATOM-NEXT: cvtps2dq %xmm0, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtps2dq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtps2dq %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvtps2dq (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtps2dq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2dq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtps2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtps2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [10:0.50]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtps2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [10:0.50]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtps2dq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtps2dq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2)
@@ -735,54 +1063,75 @@ declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_cvtps2pd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtps2pd %xmm0, %xmm1
-; GENERIC-NEXT: cvtps2pd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtps2pd %xmm0, %xmm1 # sched: [2:1.00]
+; GENERIC-NEXT: cvtps2pd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtps2pd:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtps2pd (%rdi), %xmm1
-; ATOM-NEXT: cvtps2pd %xmm0, %xmm0
-; ATOM-NEXT: addpd %xmm0, %xmm1
-; ATOM-NEXT: movapd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtps2pd (%rdi), %xmm1 # sched: [8:4.00]
+; ATOM-NEXT: cvtps2pd %xmm0, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: addpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtps2pd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtps2pd %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvtps2pd (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtps2pd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtps2pd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtps2pd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtps2pd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [9:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtps2pd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [9:0.50]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtps2pd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtps2pd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [12:1.00]
-; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [10:1.00]
+; ZNVER1-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> <i32 0, i32 1>
%2 = fpext <2 x float> %1 to <2 x double>
%3 = load <4 x float>, <4 x float> *%a1, align 16
@@ -794,53 +1143,74 @@ define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) {
define i32 @test_cvtsd2si(double %a0, double *%a1) {
; GENERIC-LABEL: test_cvtsd2si:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsd2si %xmm0, %ecx
-; GENERIC-NEXT: cvtsd2si (%rdi), %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsd2si %xmm0, %ecx # sched: [5:1.00]
+; GENERIC-NEXT: cvtsd2si (%rdi), %eax # sched: [9:1.00]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsd2si:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtsd2si (%rdi), %eax
-; ATOM-NEXT: cvtsd2si %xmm0, %ecx
-; ATOM-NEXT: addl %ecx, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtsd2si (%rdi), %eax # sched: [9:4.50]
+; ATOM-NEXT: cvtsd2si %xmm0, %ecx # sched: [8:4.00]
+; ATOM-NEXT: addl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsd2si:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsd2si (%rdi), %eax # sched: [7:1.00]
; SLM-NEXT: cvtsd2si %xmm0, %ecx # sched: [4:0.50]
; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsd2si:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2si:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [9:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsd2si:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsd2si:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsd2si %xmm0, %ecx # sched: [6:1.00]
+; SKYLAKE-NEXT: vcvtsd2si (%rdi), %eax # sched: [11:1.00]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsd2si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsd2si %xmm0, %ecx # sched: [6:1.00]
+; SKX-NEXT: vcvtsd2si (%rdi), %eax # sched: [11:1.00]
+; SKX-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsd2si:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00]
; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsd2si:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtsd2si (%rdi), %eax # sched: [12:1.00]
; ZNVER1-NEXT: vcvtsd2si %xmm0, %ecx # sched: [5:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <2 x double> undef, double %a0, i32 0
%2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1)
%3 = load double, double *%a1, align 8
@@ -853,53 +1223,74 @@ declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
define i64 @test_cvtsd2siq(double %a0, double *%a1) {
; GENERIC-LABEL: test_cvtsd2siq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsd2si %xmm0, %rcx
-; GENERIC-NEXT: cvtsd2si (%rdi), %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; GENERIC-NEXT: cvtsd2si (%rdi), %rax # sched: [9:1.00]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsd2siq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtsd2si (%rdi), %rax
-; ATOM-NEXT: cvtsd2si %xmm0, %rcx
-; ATOM-NEXT: addq %rcx, %rax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtsd2si (%rdi), %rax # sched: [9:4.50]
+; ATOM-NEXT: cvtsd2si %xmm0, %rcx # sched: [8:4.00]
+; ATOM-NEXT: addq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsd2siq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsd2si (%rdi), %rax # sched: [7:1.00]
; SLM-NEXT: cvtsd2si %xmm0, %rcx # sched: [4:0.50]
; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsd2siq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2siq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [9:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsd2siq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsd2siq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsd2si %xmm0, %rcx # sched: [6:1.00]
+; SKYLAKE-NEXT: vcvtsd2si (%rdi), %rax # sched: [11:1.00]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsd2siq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsd2si %xmm0, %rcx # sched: [6:1.00]
+; SKX-NEXT: vcvtsd2si (%rdi), %rax # sched: [11:1.00]
+; SKX-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsd2siq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00]
; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsd2siq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtsd2si (%rdi), %rax # sched: [12:1.00]
; ZNVER1-NEXT: vcvtsd2si %xmm0, %rcx # sched: [5:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <2 x double> undef, double %a0, i32 0
%2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1)
%3 = load double, double *%a1, align 8
@@ -912,24 +1303,24 @@ declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define float @test_cvtsd2ss(double %a0, double *%a1) {
; GENERIC-LABEL: test_cvtsd2ss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm1
-; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm0
-; GENERIC-NEXT: addss %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm0 # sched: [4:1.00]
+; GENERIC-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsd2ss:
-; ATOM: # BB#0:
-; ATOM-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; ATOM-NEXT: cvtsd2ss %xmm0, %xmm2
-; ATOM-NEXT: xorps %xmm0, %xmm0
-; ATOM-NEXT: cvtsd2ss %xmm1, %xmm0
-; ATOM-NEXT: addss %xmm2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero sched: [1:1.00]
+; ATOM-NEXT: cvtsd2ss %xmm0, %xmm2 # sched: [6:3.00]
+; ATOM-NEXT: xorps %xmm0, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: cvtsd2ss %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: addss %xmm2, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsd2ss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsd2ss %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [3:1.00]
; SLM-NEXT: cvtsd2ss %xmm0, %xmm0 # sched: [4:0.50]
@@ -937,23 +1328,47 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsd2ss:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
-; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
+; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsd2ss:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsd2ss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BROADWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
+; BROADWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsd2ss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKYLAKE-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; SKYLAKE-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsd2ss:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsd2ss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00]
; BTVER2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
@@ -961,12 +1376,12 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsd2ss:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50]
-; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptrunc double %a0 to float
%2 = load double, double *%a1, align 8
%3 = fptrunc double %2 to float
@@ -976,53 +1391,74 @@ define float @test_cvtsd2ss(double %a0, double *%a1) {
define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
; GENERIC-LABEL: test_cvtsi2sd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsi2sdl %edi, %xmm1
-; GENERIC-NEXT: cvtsi2sdl (%rsi), %xmm0
-; GENERIC-NEXT: addsd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsi2sdl %edi, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvtsi2sdl (%rsi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsi2sd:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtsi2sdl (%rsi), %xmm0
-; ATOM-NEXT: cvtsi2sdl %edi, %xmm1
-; ATOM-NEXT: addsd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtsi2sdl (%rsi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: cvtsi2sdl %edi, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: addsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsi2sd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsi2sdl (%rsi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: cvtsi2sdl %edi, %xmm1 # sched: [4:0.50]
; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsi2sd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsi2sd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsi2sd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsi2sd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsi2sd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsi2sd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp i32 %a0 to double
%2 = load i32, i32 *%a1, align 8
%3 = sitofp i32 %2 to double
@@ -1032,53 +1468,74 @@ define double @test_cvtsi2sd(i32 %a0, i32 *%a1) {
define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
; GENERIC-LABEL: test_cvtsi2sdq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtsi2sdq %rdi, %xmm1
-; GENERIC-NEXT: cvtsi2sdq (%rsi), %xmm0
-; GENERIC-NEXT: addsd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtsi2sdq %rdi, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvtsi2sdq (%rsi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtsi2sdq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvtsi2sdq (%rsi), %xmm0
-; ATOM-NEXT: cvtsi2sdq %rdi, %xmm1
-; ATOM-NEXT: addsd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvtsi2sdq (%rsi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: cvtsi2sdq %rdi, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: addsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtsi2sdq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtsi2sdq (%rsi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: cvtsi2sdq %rdi, %xmm1 # sched: [4:0.50]
; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtsi2sdq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtsi2sdq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
-; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
+; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtsi2sdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; BROADWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtsi2sdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtsi2sdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
+; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtsi2sdq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsi2sdq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sitofp i64 %a0 to double
%2 = load i64, i64 *%a1, align 8
%3 = sitofp i64 %2 to double
@@ -1090,24 +1547,24 @@ define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) {
define double @test_cvtss2sd(float %a0, float *%a1) {
; GENERIC-LABEL: test_cvtss2sd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvtss2sd %xmm0, %xmm1
-; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; GENERIC-NEXT: cvtss2sd %xmm0, %xmm0
-; GENERIC-NEXT: addsd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvtss2sd %xmm0, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: cvtss2sd %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvtss2sd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ATOM-NEXT: cvtss2sd %xmm0, %xmm2
-; ATOM-NEXT: xorps %xmm0, %xmm0
-; ATOM-NEXT: cvtss2sd %xmm1, %xmm0
-; ATOM-NEXT: addsd %xmm2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:1.00]
+; ATOM-NEXT: cvtss2sd %xmm0, %xmm2 # sched: [6:3.00]
+; ATOM-NEXT: xorps %xmm0, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: cvtss2sd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: addsd %xmm2, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvtss2sd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvtss2sd %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [3:1.00]
; SLM-NEXT: cvtss2sd %xmm0, %xmm0 # sched: [4:0.50]
@@ -1115,23 +1572,47 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvtss2sd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
-; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [6:0.50]
+; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [1:1.00]
; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvtss2sd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00]
; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvtss2sd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; BROADWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00]
+; BROADWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvtss2sd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKYLAKE-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; SKYLAKE-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvtss2sd:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvtss2sd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00]
; BTVER2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00]
@@ -1139,12 +1620,12 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtss2sd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
-; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
+; ZNVER1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
; ZNVER1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fpext float %a0 to double
%2 = load float, float *%a1, align 4
%3 = fpext float %2 to double
@@ -1154,54 +1635,75 @@ define double @test_cvtss2sd(float %a0, float *%a1) {
define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_cvttpd2dq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvttpd2dq %xmm0, %xmm1
-; GENERIC-NEXT: cvttpd2dq (%rdi), %xmm0
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttpd2dq %xmm0, %xmm1 # sched: [4:1.00]
+; GENERIC-NEXT: cvttpd2dq (%rdi), %xmm0 # sched: [10:1.00]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvttpd2dq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvttpd2dq (%rdi), %xmm1
-; ATOM-NEXT: cvttpd2dq %xmm0, %xmm0
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttpd2dq (%rdi), %xmm1 # sched: [8:4.00]
+; ATOM-NEXT: cvttpd2dq %xmm0, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvttpd2dq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvttpd2dq %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvttpd2dq (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvttpd2dq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [10:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttpd2dq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttpd2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00]
+; BROADWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttpd2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; SKYLAKE-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttpd2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00]
+; SKX-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvttpd2dq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttpd2dq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi <2 x double> %a0 to <2 x i32>
%2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = load <2 x double>, <2 x double> *%a1, align 16
@@ -1213,54 +1715,75 @@ define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) {
define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_cvttps2dq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvttps2dq %xmm0, %xmm1
-; GENERIC-NEXT: cvttps2dq (%rdi), %xmm0
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttps2dq %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: cvttps2dq (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvttps2dq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvttps2dq (%rdi), %xmm1
-; ATOM-NEXT: cvttps2dq %xmm0, %xmm0
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttps2dq (%rdi), %xmm1 # sched: [7:3.50]
+; ATOM-NEXT: cvttps2dq %xmm0, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvttps2dq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvttps2dq %xmm0, %xmm1 # sched: [4:0.50]
; SLM-NEXT: cvttps2dq (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvttps2dq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttps2dq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00]
+; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [9:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttps2dq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [8:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttps2dq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [10:0.50]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttps2dq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [10:0.50]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvttps2dq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttps2dq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi <4 x float> %a0 to <4 x i32>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = fptosi <4 x float> %2 to <4 x i32>
@@ -1270,53 +1793,74 @@ define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) {
define i32 @test_cvttsd2si(double %a0, double *%a1) {
; GENERIC-LABEL: test_cvttsd2si:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvttsd2si %xmm0, %ecx
-; GENERIC-NEXT: cvttsd2si (%rdi), %eax
-; GENERIC-NEXT: addl %ecx, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttsd2si %xmm0, %ecx # sched: [5:1.00]
+; GENERIC-NEXT: cvttsd2si (%rdi), %eax # sched: [9:1.00]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvttsd2si:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvttsd2si (%rdi), %eax
-; ATOM-NEXT: cvttsd2si %xmm0, %ecx
-; ATOM-NEXT: addl %ecx, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttsd2si (%rdi), %eax # sched: [9:4.50]
+; ATOM-NEXT: cvttsd2si %xmm0, %ecx # sched: [8:4.00]
+; ATOM-NEXT: addl %ecx, %eax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvttsd2si:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvttsd2si (%rdi), %eax # sched: [7:1.00]
; SLM-NEXT: cvttsd2si %xmm0, %ecx # sched: [4:0.50]
; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvttsd2si:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
-; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
+; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [10:1.00]
; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2si:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [9:1.00]
; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttsd2si:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00]
+; BROADWELL-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttsd2si:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttsd2si %xmm0, %ecx # sched: [6:1.00]
+; SKYLAKE-NEXT: vcvttsd2si (%rdi), %eax # sched: [11:1.00]
+; SKYLAKE-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttsd2si:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttsd2si %xmm0, %ecx # sched: [6:1.00]
+; SKX-NEXT: vcvttsd2si (%rdi), %eax # sched: [11:1.00]
+; SKX-NEXT: addl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvttsd2si:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00]
; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttsd2si:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttsd2si (%rdi), %eax # sched: [12:1.00]
; ZNVER1-NEXT: vcvttsd2si %xmm0, %ecx # sched: [5:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi double %a0 to i32
%2 = load double, double *%a1, align 8
%3 = fptosi double %2 to i32
@@ -1326,53 +1870,74 @@ define i32 @test_cvttsd2si(double %a0, double *%a1) {
define i64 @test_cvttsd2siq(double %a0, double *%a1) {
; GENERIC-LABEL: test_cvttsd2siq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: cvttsd2si %xmm0, %rcx
-; GENERIC-NEXT: cvttsd2si (%rdi), %rax
-; GENERIC-NEXT: addq %rcx, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: cvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; GENERIC-NEXT: cvttsd2si (%rdi), %rax # sched: [9:1.00]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_cvttsd2siq:
-; ATOM: # BB#0:
-; ATOM-NEXT: cvttsd2si (%rdi), %rax
-; ATOM-NEXT: cvttsd2si %xmm0, %rcx
-; ATOM-NEXT: addq %rcx, %rax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: cvttsd2si (%rdi), %rax # sched: [9:4.50]
+; ATOM-NEXT: cvttsd2si %xmm0, %rcx # sched: [8:4.00]
+; ATOM-NEXT: addq %rcx, %rax # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_cvttsd2siq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: cvttsd2si (%rdi), %rax # sched: [7:1.00]
; SLM-NEXT: cvttsd2si %xmm0, %rcx # sched: [4:0.50]
; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_cvttsd2siq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
-; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
+; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [10:1.00]
; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_cvttsd2siq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00]
-; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00]
+; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [9:1.00]
; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_cvttsd2siq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [9:1.00]
+; BROADWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00]
+; BROADWELL-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_cvttsd2siq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vcvttsd2si %xmm0, %rcx # sched: [6:1.00]
+; SKYLAKE-NEXT: vcvttsd2si (%rdi), %rax # sched: [11:1.00]
+; SKYLAKE-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_cvttsd2siq:
+; SKX: # %bb.0:
+; SKX-NEXT: vcvttsd2si %xmm0, %rcx # sched: [6:1.00]
+; SKX-NEXT: vcvttsd2si (%rdi), %rax # sched: [11:1.00]
+; SKX-NEXT: addq %rcx, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_cvttsd2siq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00]
; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttsd2siq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vcvttsd2si (%rdi), %rax # sched: [12:1.00]
; ZNVER1-NEXT: vcvttsd2si %xmm0, %rcx # sched: [5:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fptosi double %a0 to i64
%2 = load double, double *%a1, align 8
%3 = fptosi double %2 to i64
@@ -1382,46 +1947,64 @@ define i64 @test_cvttsd2siq(double %a0, double *%a1) {
define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_divpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: divpd %xmm1, %xmm0
-; GENERIC-NEXT: divpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: divpd %xmm1, %xmm0 # sched: [22:1.00]
+; GENERIC-NEXT: divpd (%rdi), %xmm0 # sched: [28:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_divpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: divpd %xmm1, %xmm0
-; ATOM-NEXT: divpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: divpd %xmm1, %xmm0 # sched: [125:62.50]
+; ATOM-NEXT: divpd (%rdi), %xmm0 # sched: [125:62.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_divpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: divpd %xmm1, %xmm0 # sched: [34:34.00]
; SLM-NEXT: divpd (%rdi), %xmm0 # sched: [37:34.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_divpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [26:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_divpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; BROADWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [19:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_divpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SKYLAKE-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_divpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SKX-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_divpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
; ZNVER1-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fdiv <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fdiv <2 x double> %1, %2
@@ -1430,46 +2013,64 @@ define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define double @test_divsd(double %a0, double %a1, double *%a2) {
; GENERIC-LABEL: test_divsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: divsd %xmm1, %xmm0
-; GENERIC-NEXT: divsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: divsd %xmm1, %xmm0 # sched: [22:1.00]
+; GENERIC-NEXT: divsd (%rdi), %xmm0 # sched: [28:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_divsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: divsd %xmm1, %xmm0
-; ATOM-NEXT: divsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: divsd %xmm1, %xmm0 # sched: [62:31.00]
+; ATOM-NEXT: divsd (%rdi), %xmm0 # sched: [62:31.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_divsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: divsd %xmm1, %xmm0 # sched: [34:34.00]
; SLM-NEXT: divsd (%rdi), %xmm0 # sched: [37:34.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_divsd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [28:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_divsd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00]
-; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [20:1.00]
+; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [25:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_divsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; BROADWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_divsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SKYLAKE-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_divsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
+; SKX-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_divsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
; ZNVER1-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fdiv double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fdiv double %1, %2
@@ -1478,45 +2079,60 @@ define double @test_divsd(double %a0, double %a1, double *%a2) {
define void @test_lfence() {
; GENERIC-LABEL: test_lfence:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: lfence
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lfence # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_lfence:
-; ATOM: # BB#0:
-; ATOM-NEXT: lfence
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: lfence # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_lfence:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: lfence # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_lfence:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: lfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lfence:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: lfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: lfence # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lfence:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: lfence # sched: [2:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lfence:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: lfence # sched: [2:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_lfence:
+; SKX: # %bb.0:
+; SKX-NEXT: lfence # sched: [2:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_lfence:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: lfence # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lfence:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: lfence # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
call void @llvm.x86.sse2.lfence()
ret void
}
@@ -1524,45 +2140,60 @@ declare void @llvm.x86.sse2.lfence() nounwind readnone
define void @test_mfence() {
; GENERIC-LABEL: test_mfence:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: mfence
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: mfence # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_mfence:
-; ATOM: # BB#0:
-; ATOM-NEXT: mfence
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: mfence # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_mfence:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: mfence # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_mfence:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: mfence # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mfence:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: mfence # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: mfence # sched: [2:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mfence:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: mfence # sched: [2:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mfence:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: mfence # sched: [3:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mfence:
+; SKX: # %bb.0:
+; SKX-NEXT: mfence # sched: [3:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mfence:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: mfence # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mfence:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: mfence # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
call void @llvm.x86.sse2.mfence()
ret void
}
@@ -1570,43 +2201,58 @@ declare void @llvm.x86.sse2.mfence() nounwind readnone
define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) {
; GENERIC-LABEL: test_maskmovdqu:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: maskmovdqu %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_maskmovdqu:
-; ATOM: # BB#0:
-; ATOM-NEXT: maskmovdqu %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: maskmovdqu %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_maskmovdqu:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: maskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_maskmovdqu:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maskmovdqu:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maskmovdqu:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maskmovdqu:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maskmovdqu:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maskmovdqu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maskmovdqu:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2)
ret void
}
@@ -1614,46 +2260,64 @@ declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind
define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_maxpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: maxpd %xmm1, %xmm0
-; GENERIC-NEXT: maxpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: maxpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: maxpd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_maxpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: maxpd %xmm1, %xmm0
-; ATOM-NEXT: maxpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: maxpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: maxpd (%rdi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_maxpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: maxpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: maxpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_maxpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maxpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maxpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maxpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maxpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2)
@@ -1663,46 +2327,64 @@ declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_maxsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: maxsd %xmm1, %xmm0
-; GENERIC-NEXT: maxsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: maxsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: maxsd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_maxsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: maxsd %xmm1, %xmm0
-; ATOM-NEXT: maxsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: maxsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: maxsd (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_maxsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: maxsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: maxsd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_maxsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_maxsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_maxsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_maxsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_maxsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_maxsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2)
@@ -1712,46 +2394,64 @@ declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_minpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: minpd %xmm1, %xmm0
-; GENERIC-NEXT: minpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: minpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: minpd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_minpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: minpd %xmm1, %xmm0
-; ATOM-NEXT: minpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: minpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: minpd (%rdi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_minpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: minpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: minpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_minpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_minpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_minpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_minpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_minpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2)
@@ -1761,46 +2461,64 @@ declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind
define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_minsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: minsd %xmm1, %xmm0
-; GENERIC-NEXT: minsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: minsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: minsd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_minsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: minsd %xmm1, %xmm0
-; ATOM-NEXT: minsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: minsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: minsd (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_minsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: minsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: minsd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_minsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_minsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_minsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_minsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_minsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_minsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2)
@@ -1810,53 +2528,74 @@ declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind
define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_movapd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movapd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm0, %xmm0
-; GENERIC-NEXT: movapd %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movapd (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movapd %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movapd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movapd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm0, %xmm0
-; ATOM-NEXT: movapd %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movapd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm0, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: movapd %xmm0, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movapd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movapd (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movapd %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movapd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movapd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movapd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movapd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movapd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovapd (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movapd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovapd (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movapd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovapd (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <2 x double>, <2 x double> *%a0, align 16
%2 = fadd <2 x double> %1, %1
store <2 x double> %2, <2 x double> *%a1, align 16
@@ -1865,53 +2604,74 @@ define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) {
define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
; GENERIC-LABEL: test_movdqa:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movdqa (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm0, %xmm0
-; GENERIC-NEXT: movdqa %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdqa (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: movdqa %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movdqa:
-; ATOM: # BB#0:
-; ATOM-NEXT: movdqa (%rdi), %xmm0
-; ATOM-NEXT: paddq %xmm0, %xmm0
-; ATOM-NEXT: movdqa %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movdqa (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm0, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: movdqa %xmm0, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movdqa:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movdqa (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50]
; SLM-NEXT: movdqa %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movdqa:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqa:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movdqa:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movdqa:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movdqa:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movdqa:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovdqa (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movdqa:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovdqa (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <2 x i64>, <2 x i64> *%a0, align 16
%2 = add <2 x i64> %1, %1
store <2 x i64> %2, <2 x i64> *%a1, align 16
@@ -1920,53 +2680,74 @@ define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) {
define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
; GENERIC-LABEL: test_movdqu:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movdqu (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm0, %xmm0
-; GENERIC-NEXT: movdqu %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdqu (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: movdqu %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movdqu:
-; ATOM: # BB#0:
-; ATOM-NEXT: movdqu (%rdi), %xmm0
-; ATOM-NEXT: paddq %xmm0, %xmm0
-; ATOM-NEXT: movdqu %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movdqu (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: paddq %xmm0, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: movdqu %xmm0, (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movdqu:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movdqu (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50]
; SLM-NEXT: movdqu %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movdqu:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movdqu:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movdqu:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movdqu:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movdqu:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movdqu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovdqu (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movdqu:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovdqu (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <2 x i64>, <2 x i64> *%a0, align 1
%2 = add <2 x i64> %1, %1
store <2 x i64> %2, <2 x i64> *%a1, align 1
@@ -1975,27 +2756,27 @@ define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) {
define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_movd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movd %edi, %xmm1
-; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; GENERIC-NEXT: paddd %xmm0, %xmm1
-; GENERIC-NEXT: paddd %xmm0, %xmm2
-; GENERIC-NEXT: movd %xmm2, %eax
-; GENERIC-NEXT: movd %xmm1, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movd %edi, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
+; GENERIC-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: paddd %xmm0, %xmm2 # sched: [1:0.50]
+; GENERIC-NEXT: movd %xmm2, %eax # sched: [2:1.00]
+; GENERIC-NEXT: movd %xmm1, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movd %xmm1, %eax
-; ATOM-NEXT: movd %edi, %xmm1
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movd %xmm1, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:1.00]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movd %xmm1, %eax # sched: [3:3.00]
+; ATOM-NEXT: movd %edi, %xmm1 # sched: [1:1.00]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movd %xmm1, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [3:1.00]
; SLM-NEXT: movd %edi, %xmm1 # sched: [1:0.50]
; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
@@ -2005,44 +2786,74 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00]
+; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [6:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50]
+; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00]
; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00]
+; BROADWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00]
+; BROADWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00]
+; SKYLAKE-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT: vmovd %edi, %xmm2 # sched: [1:1.00]
+; SKX-NEXT: vpaddd %xmm2, %xmm0, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; SKX-NEXT: vmovd %xmm2, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00]
-; BTVER2-NEXT: vmovd %edi, %xmm1 # sched: [1:0.17]
+; BTVER2-NEXT: vmovd %edi, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17]
+; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [8:0.50]
-; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovd %edi, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vmovd %xmm1, (%rsi) # sched: [1:0.50]
; ZNVER1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vmovd %xmm0, %eax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x i32> undef, i32 %a1, i32 0
%2 = load i32, i32 *%a2
%3 = insertelement <4 x i32> undef, i32 %2, i32 0
@@ -2056,27 +2867,27 @@ define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: test_movd_64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movq %rdi, %xmm1
-; GENERIC-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; GENERIC-NEXT: paddq %xmm0, %xmm1
-; GENERIC-NEXT: paddq %xmm0, %xmm2
-; GENERIC-NEXT: movq %xmm2, %rax
-; GENERIC-NEXT: movq %xmm1, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq %rdi, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: movq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
+; GENERIC-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: paddq %xmm0, %xmm2 # sched: [1:0.50]
+; GENERIC-NEXT: movq %xmm2, %rax # sched: [2:1.00]
+; GENERIC-NEXT: movq %xmm1, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movd_64:
-; ATOM: # BB#0:
-; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; ATOM-NEXT: movq %rdi, %xmm2
-; ATOM-NEXT: paddq %xmm0, %xmm2
-; ATOM-NEXT: paddq %xmm0, %xmm1
-; ATOM-NEXT: movq %xmm2, (%rsi)
-; ATOM-NEXT: movq %xmm1, %rax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero sched: [1:1.00]
+; ATOM-NEXT: movq %rdi, %xmm2 # sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm0, %xmm2 # sched: [2:1.00]
+; ATOM-NEXT: paddq %xmm0, %xmm1 # sched: [2:1.00]
+; ATOM-NEXT: movq %xmm2, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: movq %xmm1, %rax # sched: [3:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movd_64:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero sched: [3:1.00]
; SLM-NEXT: movq %rdi, %xmm1 # sched: [1:0.50]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -2086,44 +2897,74 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movd_64:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33]
-; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
+; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33]
-; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movd_64:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
-; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50]
+; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; HASWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00]
; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movd_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
+; BROADWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00]
+; BROADWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movd_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00]
+; SKYLAKE-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SKYLAKE-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movd_64:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT: vmovq %rdi, %xmm2 # sched: [1:1.00]
+; SKX-NEXT: vpaddq %xmm2, %xmm0, %xmm2 # sched: [1:0.33]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; SKX-NEXT: vmovq %xmm2, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movd_64:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:1.00]
-; BTVER2-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.17]
+; BTVER2-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17]
+; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movd_64:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [8:0.50]
-; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovq %rdi, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vmovq %xmm1, (%rsi) # sched: [1:0.50]
; ZNVER1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vmovq %xmm0, %rax # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <2 x i64> undef, i64 %a1, i64 0
%2 = load i64, i64 *%a2
%3 = insertelement <2 x i64> undef, i64 %2, i64 0
@@ -2137,53 +2978,74 @@ define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) {
define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
; GENERIC-LABEL: test_movhpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; GENERIC-NEXT: addpd %xmm0, %xmm1
-; GENERIC-NEXT: movhpd %xmm1, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: movhpd %xmm1, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movhpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ATOM-NEXT: addpd %xmm0, %xmm1
-; ATOM-NEXT: movhpd %xmm1, (%rdi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: movhpd %xmm1, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movhpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
; SLM-NEXT: movhpd %xmm1, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movhpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movhpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movhpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movhpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movhpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movhpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movhpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast x86_mmx* %a2 to double*
%2 = load double, double *%1, align 8
%3 = insertelement <2 x double> %a1, double %2, i32 1
@@ -2195,53 +3057,74 @@ define void @test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
; GENERIC-LABEL: test_movlpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
-; GENERIC-NEXT: addpd %xmm0, %xmm1
-; GENERIC-NEXT: movlpd %xmm1, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: movlpd %xmm1, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movlpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
-; ATOM-NEXT: addpd %xmm0, %xmm1
-; ATOM-NEXT: movlpd %xmm1, (%rdi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: movlpd %xmm1, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movlpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [4:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
; SLM-NEXT: movlpd %xmm1, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movlpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movlpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movlpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movlpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movlpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movlpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movlpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast x86_mmx* %a2 to double*
%2 = load double, double *%1, align 8
%3 = insertelement <2 x double> %a1, double %2, i32 0
@@ -2253,41 +3136,56 @@ define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) {
define i32 @test_movmskpd(<2 x double> %a0) {
; GENERIC-LABEL: test_movmskpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movmskpd %xmm0, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movmskpd %xmm0, %eax # sched: [2:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movmskpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movmskpd %xmm0, %eax
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movmskpd %xmm0, %eax # sched: [3:3.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movmskpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movmskpd %xmm0, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movmskpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movmskpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movmskpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movmskpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movmskpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovmskpd %xmm0, %eax # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movmskpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movmskpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovmskpd %xmm0, %eax # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0)
ret i32 %1
}
@@ -2295,48 +3193,66 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
; GENERIC-LABEL: test_movntdqa:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddq %xmm0, %xmm0
-; GENERIC-NEXT: movntdq %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: movntdq %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movntdqa:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddq %xmm0, %xmm0
-; ATOM-NEXT: movntdq %xmm0, (%rdi)
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddq %xmm0, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: movntdq %xmm0, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movntdqa:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50]
; SLM-NEXT: movntdq %xmm0, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movntdqa:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntdqa:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntdqa:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntdqa:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movntdqa:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntdqa:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <2 x i64> %a0, %a0
store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0
ret void
@@ -2344,46 +3260,64 @@ define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) {
define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_movntpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addpd %xmm0, %xmm0
-; GENERIC-NEXT: movntpd %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movntpd %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movntpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: addpd %xmm0, %xmm0
-; ATOM-NEXT: movntpd %xmm0, (%rdi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addpd %xmm0, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: movntpd %xmm0, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movntpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movntpd %xmm0, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movntpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movntpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fadd <2 x double> %a0, %a0
store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0
ret void
@@ -2391,53 +3325,74 @@ define void @test_movntpd(<2 x double> %a0, <2 x double> *%a1) {
define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
; GENERIC-LABEL: test_movq_mem:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: movq %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: movq %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movq_mem:
-; ATOM: # BB#0:
-; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: movq %xmm0, (%rdi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: movq %xmm0, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movq_mem:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero sched: [3:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: movq %xmm0, (%rdi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movq_mem:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_mem:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movq_mem:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movq_mem:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movq_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movq_mem:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movq_mem:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vmovq %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load i64, i64* %a1, align 1
%2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0
%3 = add <2 x i64> %a0, %2
@@ -2448,48 +3403,66 @@ define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) {
define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
; GENERIC-LABEL: test_movq_reg:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:1.00]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movq_reg:
-; ATOM: # BB#0:
-; ATOM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movq_reg:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movq_reg:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movq_reg:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
; HASWELL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movq_reg:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
+; BROADWELL-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movq_reg:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movq_reg:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
+; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movq_reg:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movq_reg:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
%2 = add <2 x i64> %a1, %1
ret <2 x i64> %2
@@ -2497,53 +3470,74 @@ define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) {
define void @test_movsd_mem(double* %a0, double* %a1) {
; GENERIC-LABEL: test_movsd_mem:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; GENERIC-NEXT: addsd %xmm0, %xmm0
-; GENERIC-NEXT: movsd %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
+; GENERIC-NEXT: addsd %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movsd %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movsd_mem:
-; ATOM: # BB#0:
-; ATOM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; ATOM-NEXT: addsd %xmm0, %xmm0
-; ATOM-NEXT: movsd %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [1:1.00]
+; ATOM-NEXT: addsd %xmm0, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: movsd %xmm0, (%rsi) # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movsd_mem:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [3:1.00]
; SLM-NEXT: addsd %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movsd %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movsd_mem:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50]
; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_mem:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
; HASWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movsd_mem:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; BROADWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movsd_mem:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; SKYLAKE-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movsd_mem:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movsd_mem:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:1.00]
; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsd_mem:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [8:0.50]
; ZNVER1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load double, double* %a0, align 1
%2 = fadd double %1, %1
store double %2, double *%a1, align 1
@@ -2552,99 +3546,135 @@ define void @test_movsd_mem(double* %a0, double* %a1) {
define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
; GENERIC-LABEL: test_movsd_reg:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; GENERIC-NEXT: movapd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
+; GENERIC-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movsd_reg:
-; ATOM: # BB#0:
-; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; ATOM-NEXT: movapd %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movsd_reg:
-; SLM: # BB#0:
-; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM: # %bb.0:
+; SLM-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movsd_reg:
-; SANDY: # BB#0:
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsd_reg:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movsd_reg:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movsd_reg:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movsd_reg:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movsd_reg:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsd_reg:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 0>
ret <2 x double> %1
}
define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_movupd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movupd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm0, %xmm0
-; GENERIC-NEXT: movupd %xmm0, (%rsi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movupd (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: movupd %xmm0, (%rsi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movupd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movupd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm0, %xmm0
-; ATOM-NEXT: movupd %xmm0, (%rsi)
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movupd (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: addpd %xmm0, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: movupd %xmm0, (%rsi) # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movupd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movupd (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: movupd %xmm0, (%rsi) # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movupd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movupd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movupd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movupd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movupd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovupd (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movupd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovupd (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movupd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovupd (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <2 x double>, <2 x double> *%a0, align 1
%2 = fadd <2 x double> %1, %1
store <2 x double> %2, <2 x double> *%a1, align 1
@@ -2653,46 +3683,64 @@ define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) {
define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_mulpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: mulpd %xmm1, %xmm0
-; GENERIC-NEXT: mulpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: mulpd %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: mulpd (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_mulpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: mulpd %xmm1, %xmm0
-; ATOM-NEXT: mulpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: mulpd %xmm1, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: mulpd (%rdi), %xmm0 # sched: [10:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_mulpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: mulpd %xmm1, %xmm0 # sched: [5:2.00]
; SLM-NEXT: mulpd (%rdi), %xmm0 # sched: [8:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_mulpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [11:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BROADWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mulpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mulpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; ZNVER1-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fmul <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fmul <2 x double> %1, %2
@@ -2701,46 +3749,64 @@ define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define double @test_mulsd(double %a0, double %a1, double *%a2) {
; GENERIC-LABEL: test_mulsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: mulsd %xmm1, %xmm0
-; GENERIC-NEXT: mulsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: mulsd %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: mulsd (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_mulsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: mulsd %xmm1, %xmm0
-; ATOM-NEXT: mulsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: mulsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: mulsd (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_mulsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: mulsd %xmm1, %xmm0 # sched: [5:2.00]
; SLM-NEXT: mulsd (%rdi), %xmm0 # sched: [8:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_mulsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mulsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mulsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; BROADWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mulsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mulsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mulsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulsd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [3:0.50]
+; ZNVER1-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fmul double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fmul double %1, %2
@@ -2749,53 +3815,74 @@ define double @test_mulsd(double %a0, double %a1, double *%a2) {
define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_orpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: orpd %xmm1, %xmm0
-; GENERIC-NEXT: orpd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: orpd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: orpd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_orpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: orpd %xmm1, %xmm0
-; ATOM-NEXT: orpd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: orpd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_orpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: orpd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_orpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_orpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_orpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_orpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_orpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_orpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = or <4 x i32> %1, %2
@@ -2809,54 +3896,68 @@ define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_packssdw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: packssdw %xmm1, %xmm0
-; GENERIC-NEXT: packssdw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packssdw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: packssdw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_packssdw:
-; ATOM: # BB#0:
-; ATOM-NEXT: packssdw %xmm1, %xmm0
-; ATOM-NEXT: packssdw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: packssdw %xmm1, %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: packssdw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_packssdw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: packssdw %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: packssdw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_packssdw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packssdw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packssdw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packssdw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packssdw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_packssdw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_packssdw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <8 x i16> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -2867,54 +3968,68 @@ declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind rea
define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_packsswb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: packsswb %xmm1, %xmm0
-; GENERIC-NEXT: packsswb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packsswb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: packsswb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_packsswb:
-; ATOM: # BB#0:
-; ATOM-NEXT: packsswb %xmm1, %xmm0
-; ATOM-NEXT: packsswb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: packsswb %xmm1, %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: packsswb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_packsswb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: packsswb %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: packsswb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_packsswb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packsswb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packsswb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packsswb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packsswb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_packsswb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_packsswb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = bitcast <16 x i8> %1 to <8 x i16>
%3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -2925,54 +4040,68 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_packuswb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: packuswb %xmm1, %xmm0
-; GENERIC-NEXT: packuswb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packuswb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: packuswb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_packuswb:
-; ATOM: # BB#0:
-; ATOM-NEXT: packuswb %xmm1, %xmm0
-; ATOM-NEXT: packuswb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: packuswb %xmm1, %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: packuswb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_packuswb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: packuswb %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: packuswb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_packuswb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packuswb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packuswb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packuswb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packuswb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_packuswb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_packuswb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = bitcast <16 x i8> %1 to <8 x i16>
%3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -2983,50 +4112,68 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_paddb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddb %xmm1, %xmm0
-; GENERIC-NEXT: paddb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddb:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddb %xmm1, %xmm0
-; ATOM-NEXT: paddb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = add <16 x i8> %1, %2
@@ -3035,50 +4182,68 @@ define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_paddd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: paddd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddd:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddd %xmm1, %xmm0
-; ATOM-NEXT: paddd (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = add <4 x i32> %1, %2
@@ -3087,46 +4252,64 @@ define <4 x i32> @test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_paddq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: paddq (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddq:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: paddq (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: paddq (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddq (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = add <2 x i64> %1, %2
@@ -3135,50 +4318,68 @@ define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_paddsb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddsb %xmm1, %xmm0
-; GENERIC-NEXT: paddsb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddsb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddsb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddsb:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddsb %xmm1, %xmm0
-; ATOM-NEXT: paddsb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddsb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddsb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddsb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddsb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddsb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddsb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddsb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddsb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2)
@@ -3188,50 +4389,68 @@ declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_paddsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddsw %xmm1, %xmm0
-; GENERIC-NEXT: paddsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddsw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddsw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddsw %xmm1, %xmm0
-; ATOM-NEXT: paddsw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddsw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddsw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddsw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddsw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddsw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2)
@@ -3241,50 +4460,68 @@ declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_paddusb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddusb %xmm1, %xmm0
-; GENERIC-NEXT: paddusb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddusb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddusb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddusb:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddusb %xmm1, %xmm0
-; ATOM-NEXT: paddusb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddusb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddusb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddusb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddusb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddusb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddusb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddusb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddusb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddusb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddusb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddusb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2)
@@ -3294,50 +4531,68 @@ declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_paddusw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddusw %xmm1, %xmm0
-; GENERIC-NEXT: paddusw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddusw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddusw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddusw:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddusw %xmm1, %xmm0
-; ATOM-NEXT: paddusw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddusw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddusw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddusw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddusw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddusw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddusw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddusw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddusw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddusw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddusw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddusw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddusw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2)
@@ -3347,50 +4602,68 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_paddw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: paddw %xmm1, %xmm0
-; GENERIC-NEXT: paddw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: paddw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_paddw:
-; ATOM: # BB#0:
-; ATOM-NEXT: paddw %xmm1, %xmm0
-; ATOM-NEXT: paddw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: paddw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_paddw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: paddw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_paddw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_paddw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_paddw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_paddw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_paddw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_paddw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_paddw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = add <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = add <8 x i16> %1, %2
@@ -3399,53 +4672,74 @@ define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_pand:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pand %xmm1, %xmm0
-; GENERIC-NEXT: pand (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pand %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: pand (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pand:
-; ATOM: # BB#0:
-; ATOM-NEXT: pand %xmm1, %xmm0
-; ATOM-NEXT: pand (%rdi), %xmm0
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pand %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pand (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pand:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pand %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pand (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pand:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pand:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pand:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pand:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pand:
+; SKX: # %bb.0:
+; SKX-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pand:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pand:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = and <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = and <2 x i64> %1, %2
@@ -3455,25 +4749,25 @@ define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_pandn:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pandn %xmm1, %xmm0
-; GENERIC-NEXT: movdqa %xmm0, %xmm1
-; GENERIC-NEXT: pandn (%rdi), %xmm1
-; GENERIC-NEXT: paddq %xmm0, %xmm1
-; GENERIC-NEXT: movdqa %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT: pandn (%rdi), %xmm1 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pandn:
-; ATOM: # BB#0:
-; ATOM-NEXT: pandn %xmm1, %xmm0
-; ATOM-NEXT: movdqa %xmm0, %xmm1
-; ATOM-NEXT: pandn (%rdi), %xmm1
-; ATOM-NEXT: paddq %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: pandn (%rdi), %xmm1 # sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm0, %xmm1 # sched: [2:1.00]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pandn:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: pandn (%rdi), %xmm1 # sched: [4:1.00]
@@ -3482,32 +4776,53 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pandn:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pandn:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pandn:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pandn:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pandn:
+; SKX: # %bb.0:
+; SKX-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pandn:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pandn:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = xor <2 x i64> %a0, <i64 -1, i64 -1>
%2 = and <2 x i64> %a1, %1
%3 = load <2 x i64>, <2 x i64> *%a2, align 16
@@ -3519,161 +4834,237 @@ define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pavgb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pavgb %xmm1, %xmm0
-; GENERIC-NEXT: pavgb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pavgb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pavgb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pavgb:
-; ATOM: # BB#0:
-; ATOM-NEXT: pavgb %xmm1, %xmm0
-; ATOM-NEXT: pavgb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pavgb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pavgb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pavgb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pavgb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pavgb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pavgb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pavgb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pavgb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pavgb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pavgb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pavgb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
- %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
- %2 = load <16 x i8>, <16 x i8> *%a2, align 16
- %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2)
- ret <16 x i8> %3
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <16 x i8> %a0 to <16 x i16>
+ %2 = zext <16 x i8> %a1 to <16 x i16>
+ %3 = add <16 x i16> %1, %2
+ %4 = add <16 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = trunc <16 x i16> %5 to <16 x i8>
+ %7 = load <16 x i8>, <16 x i8> *%a2, align 16
+ %8 = zext <16 x i8> %6 to <16 x i16>
+ %9 = zext <16 x i8> %7 to <16 x i16>
+ %10 = add <16 x i16> %8, %9
+ %11 = add <16 x i16> %10, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %12 = lshr <16 x i16> %11, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %13 = trunc <16 x i16> %12 to <16 x i8>
+ ret <16 x i8> %13
}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pavgw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pavgw %xmm1, %xmm0
-; GENERIC-NEXT: pavgw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pavgw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pavgw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pavgw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pavgw %xmm1, %xmm0
-; ATOM-NEXT: pavgw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pavgw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pavgw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pavgw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pavgw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pavgw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pavgw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pavgw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pavgw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pavgw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pavgw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pavgw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pavgw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
- %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
- %2 = load <8 x i16>, <8 x i16> *%a2, align 16
- %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2)
- ret <8 x i16> %3
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = zext <8 x i16> %a0 to <8 x i32>
+ %2 = zext <8 x i16> %a1 to <8 x i32>
+ %3 = add <8 x i32> %1, %2
+ %4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = trunc <8 x i32> %5 to <8 x i16>
+ %7 = load <8 x i16>, <8 x i16> *%a2, align 16
+ %8 = zext <8 x i16> %6 to <8 x i32>
+ %9 = zext <8 x i16> %7 to <8 x i32>
+ %10 = add <8 x i32> %8, %9
+ %11 = add <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %12 = lshr <8 x i32> %11, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %13 = trunc <8 x i32> %12 to <8 x i16>
+ ret <8 x i16> %13
}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpeqb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpeqb %xmm0, %xmm1
-; GENERIC-NEXT: pcmpeqb (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpeqb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pcmpeqb:
-; ATOM: # BB#0:
-; ATOM-NEXT: pcmpeqb %xmm0, %xmm1
-; ATOM-NEXT: pcmpeqb (%rdi), %xmm0
-; ATOM-NEXT: por %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pcmpeqb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpeqb %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: pcmpeqb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpeqb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqb %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpeqb (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpeqb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpeqb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp eq <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = icmp eq <16 x i8> %a0, %2
@@ -3684,55 +5075,77 @@ define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pcmpeqd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpeqd %xmm0, %xmm1
-; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pcmpeqd:
-; ATOM: # BB#0:
-; ATOM-NEXT: pcmpeqd %xmm0, %xmm1
-; ATOM-NEXT: pcmpeqd (%rdi), %xmm0
-; ATOM-NEXT: por %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pcmpeqd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpeqd %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpeqd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpeqd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpeqd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp eq <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = icmp eq <4 x i32> %a0, %2
@@ -3743,55 +5156,77 @@ define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pcmpeqw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpeqw %xmm0, %xmm1
-; GENERIC-NEXT: pcmpeqw (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpeqw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pcmpeqw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pcmpeqw %xmm0, %xmm1
-; ATOM-NEXT: pcmpeqw (%rdi), %xmm0
-; ATOM-NEXT: por %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pcmpeqw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpeqw %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: pcmpeqw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpeqw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpeqw (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpeqw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpeqw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp eq <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = icmp eq <8 x i16> %a0, %2
@@ -3802,23 +5237,23 @@ define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpgtb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movdqa %xmm0, %xmm2
-; GENERIC-NEXT: pcmpgtb %xmm1, %xmm2
-; GENERIC-NEXT: pcmpgtb (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: pcmpgtb %xmm1, %xmm2 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpgtb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm2, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pcmpgtb:
-; ATOM: # BB#0:
-; ATOM-NEXT: movdqa %xmm0, %xmm2
-; ATOM-NEXT: pcmpgtb (%rdi), %xmm0
-; ATOM-NEXT: pcmpgtb %xmm1, %xmm2
-; ATOM-NEXT: por %xmm2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: pcmpgtb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: pcmpgtb %xmm1, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pcmpgtb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50]
; SLM-NEXT: pcmpgtb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: pcmpgtb %xmm1, %xmm2 # sched: [1:0.50]
@@ -3826,32 +5261,54 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpgtb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpgtb (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2b %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpgtb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpgtb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp sgt <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = icmp sgt <16 x i8> %a0, %2
@@ -3862,23 +5319,23 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pcmpgtd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movdqa %xmm0, %xmm2
-; GENERIC-NEXT: pcmpgtd %xmm1, %xmm2
-; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: pcmpgtd %xmm1, %xmm2 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm2, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pcmpgtd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movdqa %xmm0, %xmm2
-; ATOM-NEXT: pcmpeqd (%rdi), %xmm0
-; ATOM-NEXT: pcmpgtd %xmm1, %xmm2
-; ATOM-NEXT: por %xmm2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: pcmpgtd %xmm1, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pcmpgtd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50]
; SLM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: pcmpgtd %xmm1, %xmm2 # sched: [1:0.50]
@@ -3886,32 +5343,54 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpgtd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtd %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpeqd (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2d %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpgtd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpgtd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp sgt <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = icmp eq <4 x i32> %a0, %2
@@ -3922,23 +5401,23 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pcmpgtw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movdqa %xmm0, %xmm2
-; GENERIC-NEXT: pcmpgtw %xmm1, %xmm2
-; GENERIC-NEXT: pcmpgtw (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT: pcmpgtw %xmm1, %xmm2 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpgtw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm2, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pcmpgtw:
-; ATOM: # BB#0:
-; ATOM-NEXT: movdqa %xmm0, %xmm2
-; ATOM-NEXT: pcmpgtw (%rdi), %xmm0
-; ATOM-NEXT: pcmpgtw %xmm1, %xmm2
-; ATOM-NEXT: por %xmm2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: pcmpgtw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: pcmpgtw %xmm1, %xmm2 # sched: [1:0.50]
+; ATOM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pcmpgtw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50]
; SLM-NEXT: pcmpgtw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: pcmpgtw %xmm1, %xmm2 # sched: [1:0.50]
@@ -3946,32 +5425,54 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpgtw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtw %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpcmpgtw (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00]
+; SKX-NEXT: vpmovm2w %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpgtw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpgtw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp sgt <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = icmp sgt <8 x i16> %a0, %2
@@ -3982,96 +5483,132 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define i16 @test_pextrw(<8 x i16> %a0) {
; GENERIC-LABEL: test_pextrw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextrw $6, %xmm0, %eax
-; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextrw $6, %xmm0, %eax # sched: [3:1.00]
+; GENERIC-NEXT: # kill: def %ax killed %ax killed %eax
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pextrw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pextrw $6, %xmm0, %eax
-; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pextrw $6, %xmm0, %eax # sched: [4:2.00]
+; ATOM-NEXT: # kill: def %ax killed %ax killed %eax
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pextrw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pextrw $6, %xmm0, %eax # sched: [4:1.00]
-; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SLM-NEXT: # kill: def %ax killed %ax killed %eax
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pextrw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
-; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00]
+; SANDY-NEXT: # kill: def %ax killed %ax killed %eax
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pextrw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [2:1.00]
+; BROADWELL-NEXT: # kill: def %ax killed %ax killed %eax
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pextrw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: # kill: def %ax killed %ax killed %eax
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pextrw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpextrw $6, %xmm0, %eax # sched: [3:1.00]
+; SKX-NEXT: # kill: def %ax killed %ax killed %eax
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pextrw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50]
-; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; BTVER2-NEXT: # kill: def %ax killed %ax killed %eax
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: # kill: def %ax killed %ax killed %eax
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = extractelement <8 x i16> %a0, i32 6
ret i16 %1
}
define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
; GENERIC-LABEL: test_pinsrw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pinsrw $1, %edi, %xmm0
-; GENERIC-NEXT: pinsrw $3, (%rsi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pinsrw $1, %edi, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: pinsrw $3, (%rsi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pinsrw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pinsrw $1, %edi, %xmm0
-; ATOM-NEXT: pinsrw $3, (%rsi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pinsrw $1, %edi, %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: pinsrw $3, (%rsi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pinsrw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pinsrw $1, %edi, %xmm0 # sched: [1:1.00]
; SLM-NEXT: pinsrw $3, (%rsi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pinsrw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrw:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pinsrw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pinsrw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pinsrw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKX-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pinsrw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <8 x i16> %a0, i16 %a1, i32 1
%2 = load i16, i16 *%a2
%3 = insertelement <8 x i16> %1, i16 %2, i32 3
@@ -4080,54 +5617,72 @@ define <8 x i16> @test_pinsrw(<8 x i16> %a0, i16 %a1, i16 *%a2) {
define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmaddwd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaddwd %xmm1, %xmm0
-; GENERIC-NEXT: pmaddwd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaddwd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmaddwd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmaddwd:
-; ATOM: # BB#0:
+; ATOM: # %bb.0:
; ATOM-NEXT: pmaddwd %xmm1, %xmm0
; ATOM-NEXT: pmaddwd (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmaddwd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaddwd %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmaddwd (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaddwd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddwd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaddwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaddwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaddwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaddwd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaddwd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
%2 = bitcast <4 x i32> %1 to <8 x i16>
%3 = load <8 x i16>, <8 x i16> *%a2, align 16
@@ -4138,50 +5693,68 @@ declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmaxsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaxsw %xmm1, %xmm0
-; GENERIC-NEXT: pmaxsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pmaxsw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmaxsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmaxsw %xmm1, %xmm0
-; ATOM-NEXT: pmaxsw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pmaxsw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmaxsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaxsw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pmaxsw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaxsw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaxsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2)
@@ -4191,50 +5764,68 @@ declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pmaxub:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaxub %xmm1, %xmm0
-; GENERIC-NEXT: pmaxub (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxub %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pmaxub (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmaxub:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmaxub %xmm1, %xmm0
-; ATOM-NEXT: pmaxub (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaxub %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pmaxub (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmaxub:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaxub %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pmaxub (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaxub:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxub:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxub:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaxub:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxub:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2)
@@ -4244,50 +5835,68 @@ declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pminsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pminsw %xmm1, %xmm0
-; GENERIC-NEXT: pminsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminsw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pminsw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pminsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pminsw %xmm1, %xmm0
-; ATOM-NEXT: pminsw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pminsw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pminsw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pminsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pminsw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pminsw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pminsw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pminsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2)
@@ -4297,50 +5906,68 @@ declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pminub:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pminub %xmm1, %xmm0
-; GENERIC-NEXT: pminub (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminub %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pminub (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pminub:
-; ATOM: # BB#0:
-; ATOM-NEXT: pminub %xmm1, %xmm0
-; ATOM-NEXT: pminub (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pminub %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pminub (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pminub:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pminub %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pminub (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pminub:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminub:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminub:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pminub:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminub:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2)
@@ -4350,41 +5977,56 @@ declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
define i32 @test_pmovmskb(<16 x i8> %a0) {
; GENERIC-LABEL: test_pmovmskb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovmskb %xmm0, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovmskb %xmm0, %eax # sched: [2:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmovmskb:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmovmskb %xmm0, %eax
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmovmskb %xmm0, %eax # sched: [3:3.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmovmskb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovmskb %xmm0, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovmskb:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovmskb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovmskb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovmskb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovmskb %xmm0, %eax # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovmskb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovmskb %xmm0, %eax # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovmskb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovmskb:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpmovmskb %xmm0, %eax # sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0)
ret i32 %1
}
@@ -4392,46 +6034,64 @@ declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmulhuw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmulhuw %xmm1, %xmm0
-; GENERIC-NEXT: pmulhuw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulhuw %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmulhuw (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmulhuw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmulhuw %xmm1, %xmm0
-; ATOM-NEXT: pmulhuw (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmulhuw %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: pmulhuw (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmulhuw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmulhuw %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmulhuw (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmulhuw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhuw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmulhuw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmulhuw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2)
@@ -4441,46 +6101,64 @@ declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmulhw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmulhw %xmm1, %xmm0
-; GENERIC-NEXT: pmulhw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulhw %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmulhw (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmulhw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmulhw %xmm1, %xmm0
-; ATOM-NEXT: pmulhw (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmulhw %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: pmulhw (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmulhw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmulhw %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmulhw (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmulhw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmulhw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmulhw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2)
@@ -4490,46 +6168,64 @@ declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmullw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmullw %xmm1, %xmm0
-; GENERIC-NEXT: pmullw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmullw %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmullw (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmullw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmullw %xmm1, %xmm0
-; ATOM-NEXT: pmullw (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmullw %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: pmullw (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmullw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmullw %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmullw (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmullw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmullw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmullw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmullw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmullw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmullw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmullw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = mul <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = mul <8 x i16> %1, %2
@@ -4538,54 +6234,72 @@ define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pmuludq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmuludq %xmm1, %xmm0
-; GENERIC-NEXT: pmuludq (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmuludq %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmuludq (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmuludq:
-; ATOM: # BB#0:
+; ATOM: # %bb.0:
; ATOM-NEXT: pmuludq %xmm1, %xmm0
; ATOM-NEXT: pmuludq (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmuludq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmuludq %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmuludq (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmuludq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuludq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmuludq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmuludq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmuludq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmuludq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmuludq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <2 x i64> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -4596,53 +6310,74 @@ declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_por:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: por %xmm1, %xmm0
-; GENERIC-NEXT: por (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: por (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_por:
-; ATOM: # BB#0:
-; ATOM-NEXT: por %xmm1, %xmm0
-; ATOM-NEXT: por (%rdi), %xmm0
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: por (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_por:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: por (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_por:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_por:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_por:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_por:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_por:
+; SKX: # %bb.0:
+; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_por:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_por:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = or <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = or <2 x i64> %1, %2
@@ -4652,54 +6387,72 @@ define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_psadbw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psadbw %xmm1, %xmm0
-; GENERIC-NEXT: psadbw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psadbw %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: psadbw (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psadbw:
-; ATOM: # BB#0:
+; ATOM: # %bb.0:
; ATOM-NEXT: psadbw %xmm1, %xmm0
; ATOM-NEXT: psadbw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psadbw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psadbw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psadbw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psadbw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psadbw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psadbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psadbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psadbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psadbw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psadbw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1)
%2 = bitcast <2 x i64> %1 to <16 x i8>
%3 = load <16 x i8>, <16 x i8> *%a2, align 16
@@ -4710,22 +6463,22 @@ declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; GENERIC-LABEL: test_pshufd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
-; GENERIC-NEXT: pshufd {{.*#+}} xmm0 = mem[3,2,1,0]
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] sched: [1:0.50]
+; GENERIC-NEXT: pshufd {{.*#+}} xmm0 = mem[3,2,1,0] sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pshufd:
-; ATOM: # BB#0:
-; ATOM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0]
-; ATOM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; ATOM-NEXT: paddd %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
+; ATOM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
+; ATOM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pshufd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [4:1.00]
; SLM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
@@ -4733,32 +6486,53 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pshufd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
-; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50]
+; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
-; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00]
+; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
+; BROADWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
+; SKYLAKE-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
+; SKX-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pshufd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00]
; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pshufd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [8:0.50]
; ZNVER1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
%3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -4768,22 +6542,22 @@ define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) {
define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
; GENERIC-LABEL: test_pshufhw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6]
-; GENERIC-NEXT: pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4]
-; GENERIC-NEXT: paddw %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
+; GENERIC-NEXT: pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
+; GENERIC-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pshufhw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4]
-; ATOM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; ATOM-NEXT: paddw %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00]
+; ATOM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
+; ATOM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pshufhw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [4:1.00]
; SLM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
@@ -4791,32 +6565,53 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pshufhw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
-; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50]
+; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufhw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
-; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00]
+; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufhw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
+; BROADWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufhw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
+; SKYLAKE-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufhw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
+; SKX-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:1.00]
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pshufhw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [6:1.00]
; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pshufhw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [8:0.50]
; ZNVER1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.25]
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6>
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4>
@@ -4826,22 +6621,22 @@ define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) {
define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; GENERIC-LABEL: test_pshuflw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7]
-; GENERIC-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7]
-; GENERIC-NEXT: paddw %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
+; GENERIC-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pshuflw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7]
-; ATOM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; ATOM-NEXT: paddw %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00]
+; ATOM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
+; ATOM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pshuflw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [4:1.00]
; SLM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
@@ -4849,32 +6644,53 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pshuflw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
-; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50]
+; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshuflw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
-; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00]
+; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshuflw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
+; BROADWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshuflw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
+; SKYLAKE-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshuflw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pshuflw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [6:1.00]
; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pshuflw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [8:0.50]
; ZNVER1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.25]
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -4884,53 +6700,74 @@ define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) {
define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pslld:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pslld %xmm1, %xmm0
-; GENERIC-NEXT: pslld (%rdi), %xmm0
-; GENERIC-NEXT: pslld $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pslld %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: pslld (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: pslld $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pslld:
-; ATOM: # BB#0:
-; ATOM-NEXT: pslld %xmm1, %xmm0
-; ATOM-NEXT: pslld (%rdi), %xmm0
-; ATOM-NEXT: pslld $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pslld %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: pslld (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: pslld $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pslld:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pslld %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: pslld (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: pslld $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pslld:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslld:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pslld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pslld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pslld:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pslld:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pslld:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2)
@@ -4942,98 +6779,134 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
define <4 x i32> @test_pslldq(<4 x i32> %a0) {
; GENERIC-LABEL: test_pslldq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pslldq:
-; ATOM: # BB#0:
-; ATOM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pslldq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pslldq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pslldq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pslldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pslldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pslldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pslldq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pslldq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %1
}
define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_psllq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psllq %xmm1, %xmm0
-; GENERIC-NEXT: psllq (%rdi), %xmm0
-; GENERIC-NEXT: psllq $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psllq %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psllq (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psllq $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psllq:
-; ATOM: # BB#0:
-; ATOM-NEXT: psllq %xmm1, %xmm0
-; ATOM-NEXT: psllq (%rdi), %xmm0
-; ATOM-NEXT: psllq $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psllq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psllq (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psllq $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psllq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psllq %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psllq (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psllq $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psllq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psllq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psllq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2)
@@ -5045,53 +6918,74 @@ declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psllw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psllw %xmm1, %xmm0
-; GENERIC-NEXT: psllw (%rdi), %xmm0
-; GENERIC-NEXT: psllw $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psllw %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psllw (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psllw $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psllw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psllw %xmm1, %xmm0
-; ATOM-NEXT: psllw (%rdi), %xmm0
-; ATOM-NEXT: psllw $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psllw %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psllw (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psllw $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psllw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psllw %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psllw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psllw $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psllw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psllw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psllw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psllw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psllw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psllw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psllw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2)
@@ -5103,53 +6997,74 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_psrad:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psrad %xmm1, %xmm0
-; GENERIC-NEXT: psrad (%rdi), %xmm0
-; GENERIC-NEXT: psrad $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrad %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psrad (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psrad $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psrad:
-; ATOM: # BB#0:
-; ATOM-NEXT: psrad %xmm1, %xmm0
-; ATOM-NEXT: psrad (%rdi), %xmm0
-; ATOM-NEXT: psrad $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrad %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psrad (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psrad $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psrad:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psrad %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psrad (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psrad $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psrad:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrad:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrad:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrad:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrad:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psrad:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psrad:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2)
@@ -5161,53 +7076,74 @@ declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psraw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psraw %xmm1, %xmm0
-; GENERIC-NEXT: psraw (%rdi), %xmm0
-; GENERIC-NEXT: psraw $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psraw %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psraw (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psraw $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psraw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psraw %xmm1, %xmm0
-; ATOM-NEXT: psraw (%rdi), %xmm0
-; ATOM-NEXT: psraw $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psraw %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psraw (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psraw $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psraw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psraw %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psraw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psraw $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psraw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psraw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psraw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psraw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psraw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psraw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psraw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2)
@@ -5219,53 +7155,74 @@ declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_psrld:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psrld %xmm1, %xmm0
-; GENERIC-NEXT: psrld (%rdi), %xmm0
-; GENERIC-NEXT: psrld $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrld %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psrld (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psrld $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psrld:
-; ATOM: # BB#0:
-; ATOM-NEXT: psrld %xmm1, %xmm0
-; ATOM-NEXT: psrld (%rdi), %xmm0
-; ATOM-NEXT: psrld $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrld %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psrld (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psrld $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psrld:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psrld %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psrld (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psrld $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psrld:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrld:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrld:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psrld:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psrld:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2)
@@ -5277,98 +7234,134 @@ declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
define <4 x i32> @test_psrldq(<4 x i32> %a0) {
; GENERIC-LABEL: test_psrldq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psrldq:
-; ATOM: # BB#0:
-; ATOM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psrldq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psrldq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrldq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psrldq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psrldq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %1
}
define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_psrlq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psrlq %xmm1, %xmm0
-; GENERIC-NEXT: psrlq (%rdi), %xmm0
-; GENERIC-NEXT: psrlq $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrlq %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psrlq (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psrlq $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psrlq:
-; ATOM: # BB#0:
-; ATOM-NEXT: psrlq %xmm1, %xmm0
-; ATOM-NEXT: psrlq (%rdi), %xmm0
-; ATOM-NEXT: psrlq $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrlq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psrlq (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psrlq $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psrlq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psrlq %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psrlq (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psrlq $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psrlq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psrlq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psrlq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2)
@@ -5380,53 +7373,74 @@ declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psrlw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psrlw %xmm1, %xmm0
-; GENERIC-NEXT: psrlw (%rdi), %xmm0
-; GENERIC-NEXT: psrlw $2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psrlw %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: psrlw (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: psrlw $2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psrlw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psrlw %xmm1, %xmm0
-; ATOM-NEXT: psrlw (%rdi), %xmm0
-; ATOM-NEXT: psrlw $2, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psrlw %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psrlw (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: psrlw $2, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psrlw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psrlw %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: psrlw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: psrlw $2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psrlw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psrlw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psrlw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psrlw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKYLAKE-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psrlw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; SKX-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psrlw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psrlw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; ZNVER1-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2)
@@ -5438,50 +7452,68 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_psubb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubb %xmm1, %xmm0
-; GENERIC-NEXT: psubb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubb:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubb %xmm1, %xmm0
-; ATOM-NEXT: psubb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <16 x i8> %a0, %a1
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = sub <16 x i8> %1, %2
@@ -5490,50 +7522,68 @@ define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_psubd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubd %xmm1, %xmm0
-; GENERIC-NEXT: psubd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubd:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubd %xmm1, %xmm0
-; ATOM-NEXT: psubd (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = sub <4 x i32> %1, %2
@@ -5542,46 +7592,64 @@ define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_psubq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubq %xmm1, %xmm0
-; GENERIC-NEXT: psubq (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubq:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubq %xmm1, %xmm0
-; ATOM-NEXT: psubq (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: psubq (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubq (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = sub <2 x i64> %1, %2
@@ -5590,50 +7658,68 @@ define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_psubsb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubsb %xmm1, %xmm0
-; GENERIC-NEXT: psubsb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubsb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubsb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubsb:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubsb %xmm1, %xmm0
-; ATOM-NEXT: psubsb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubsb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubsb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubsb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubsb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubsb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubsb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubsb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubsb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2)
@@ -5643,50 +7729,68 @@ declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psubsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubsw %xmm1, %xmm0
-; GENERIC-NEXT: psubsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubsw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubsw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubsw %xmm1, %xmm0
-; ATOM-NEXT: psubsw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubsw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubsw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubsw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubsw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubsw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2)
@@ -5696,50 +7800,68 @@ declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_psubusb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubusb %xmm1, %xmm0
-; GENERIC-NEXT: psubusb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubusb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubusb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubusb:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubusb %xmm1, %xmm0
-; ATOM-NEXT: psubusb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubusb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubusb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubusb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubusb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubusb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubusb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubusb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubusb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubusb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubusb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubusb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2)
@@ -5749,50 +7871,68 @@ declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnon
define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psubusw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubusw %xmm1, %xmm0
-; GENERIC-NEXT: psubusw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubusw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubusw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubusw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubusw %xmm1, %xmm0
-; ATOM-NEXT: psubusw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubusw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubusw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubusw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubusw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubusw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubusw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubusw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubusw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubusw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubusw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubusw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubusw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2)
@@ -5802,50 +7942,68 @@ declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psubw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psubw %xmm1, %xmm0
-; GENERIC-NEXT: psubw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psubw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psubw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psubw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psubw %xmm1, %xmm0
-; ATOM-NEXT: psubw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psubw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psubw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psubw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psubw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psubw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psubw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psubw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psubw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psubw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psubw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psubw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psubw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = sub <8 x i16> %a0, %a1
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = sub <8 x i16> %1, %2
@@ -5854,50 +8012,68 @@ define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_punpckhbw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
+; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpckhbw:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
+; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpckhbw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
; SLM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpckhbw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhbw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
+; SKX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpckhbw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50]
; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpckhbw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.25]
; ZNVER1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -5906,55 +8082,76 @@ define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_punpckhdq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; GENERIC-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; GENERIC-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpckhdq:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ATOM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
-; ATOM-NEXT: paddd %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; ATOM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00]
+; ATOM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpckhdq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SLM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [4:1.00]
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpckhdq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhdq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpckhdq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpckhdq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25]
; ZNVER1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [8:0.50]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -5964,53 +8161,74 @@ define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_punpckhqdq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1]
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
+; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpckhqdq:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; ATOM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1]
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; ATOM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpckhqdq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
; SLM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [4:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpckhqdq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhqdq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhqdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhqdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhqdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpckhqdq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpckhqdq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.25]
; ZNVER1-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3>
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
  %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
@@ -6020,50 +8238,68 @@ define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_punpckhwd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpckhwd:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpckhwd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpckhwd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckhwd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckhwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckhwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckhwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; SKX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpckhwd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpckhwd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25]
; ZNVER1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -6072,50 +8308,68 @@ define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_punpcklbw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
+; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpcklbw:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpcklbw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
; SLM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpcklbw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklbw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
+; SKX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpcklbw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50]
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpcklbw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.25]
; ZNVER1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -6124,55 +8378,76 @@ define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_punpckldq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
+; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpckldq:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; ATOM-NEXT: paddd %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00]
+; ATOM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpckldq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
; SLM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [4:1.00]
; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpckldq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
-; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50]
+; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpckldq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpckldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpckldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpckldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpckldq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpckldq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.25]
; ZNVER1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [8:0.50]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -6182,53 +8457,74 @@ define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_punpcklqdq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
+; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpcklqdq:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpcklqdq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SLM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpcklqdq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50]
+; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklqdq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklqdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklqdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklqdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpcklqdq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpcklqdq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.25]
; ZNVER1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2>
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
 %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
@@ -6238,50 +8534,68 @@ define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_punpcklwd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
+; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_punpcklwd:
-; ATOM: # BB#0:
-; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_punpcklwd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_punpcklwd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
-; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_punpcklwd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_punpcklwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; BROADWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_punpcklwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKYLAKE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_punpcklwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; SKX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_punpcklwd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_punpcklwd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.25]
; ZNVER1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -6290,53 +8604,74 @@ define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_pxor:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pxor %xmm1, %xmm0
-; GENERIC-NEXT: pxor (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pxor %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: pxor (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pxor:
-; ATOM: # BB#0:
-; ATOM-NEXT: pxor %xmm1, %xmm0
-; ATOM-NEXT: pxor (%rdi), %xmm0
-; ATOM-NEXT: paddq %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pxor %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: pxor (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: paddq %xmm1, %xmm0 # sched: [2:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pxor:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pxor %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pxor (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pxor:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pxor:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pxor:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pxor:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pxor:
+; SKX: # %bb.0:
+; SKX-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pxor:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pxor:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = xor <2 x i64> %a0, %a1
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = xor <2 x i64> %1, %2
@@ -6346,53 +8681,74 @@ define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_shufpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
-; GENERIC-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0]
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; GENERIC-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_shufpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
-; ATOM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0]
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; ATOM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_shufpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
; SLM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [4:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_shufpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_shufpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_shufpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; BROADWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_shufpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; SKYLAKE-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_shufpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_shufpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50]
; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50]
; ZNVER1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 2>
@@ -6402,21 +8758,21 @@ define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double
define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_sqrtpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: sqrtpd %xmm0, %xmm1
-; GENERIC-NEXT: sqrtpd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sqrtpd %xmm0, %xmm1 # sched: [22:1.00]
+; GENERIC-NEXT: sqrtpd (%rdi), %xmm0 # sched: [28:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_sqrtpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: sqrtpd %xmm0, %xmm1
-; ATOM-NEXT: sqrtpd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: sqrtpd %xmm0, %xmm1 # sched: [125:62.50]
+; ATOM-NEXT: sqrtpd (%rdi), %xmm0 # sched: [125:62.50]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_sqrtpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: sqrtpd (%rdi), %xmm1 # sched: [18:1.00]
; SLM-NEXT: sqrtpd %xmm0, %xmm0 # sched: [15:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
@@ -6424,32 +8780,53 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sqrtpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [22:1.00]
+; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [28:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00]
-; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sqrtpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:1.00]
+; BROADWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [26:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sqrtpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [18:1.00]
+; SKYLAKE-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [24:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sqrtpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [18:1.00]
+; SKX-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [24:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sqrtpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [26:21.00]
; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [27:1.00]
; ZNVER1-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [20:1.00]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2)
@@ -6462,23 +8839,23 @@ declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_sqrtsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: sqrtsd %xmm0, %xmm0
-; GENERIC-NEXT: movapd (%rdi), %xmm1
-; GENERIC-NEXT: sqrtsd %xmm1, %xmm1
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: sqrtsd %xmm0, %xmm0 # sched: [22:1.00]
+; GENERIC-NEXT: movapd (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT: sqrtsd %xmm1, %xmm1 # sched: [22:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_sqrtsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: movapd (%rdi), %xmm1
+; ATOM: # %bb.0:
+; ATOM-NEXT: movapd (%rdi), %xmm1 # sched: [1:1.00]
; ATOM-NEXT: sqrtsd %xmm0, %xmm0
; ATOM-NEXT: sqrtsd %xmm1, %xmm1
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_sqrtsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movapd (%rdi), %xmm1 # sched: [3:1.00]
; SLM-NEXT: sqrtsd %xmm0, %xmm0 # sched: [18:1.00]
; SLM-NEXT: sqrtsd %xmm1, %xmm1 # sched: [18:1.00]
@@ -6486,23 +8863,47 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sqrtsd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_sqrtsd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00]
-; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50]
-; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_sqrtsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00]
+; BROADWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [5:0.50]
+; BROADWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [21:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_sqrtsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00]
+; SKYLAKE-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SKYLAKE-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [18:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_sqrtsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00]
+; SKX-NEXT: vmovapd (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [18:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_sqrtsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovapd (%rdi), %xmm1 # sched: [5:1.00]
; BTVER2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [26:21.00]
; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00]
@@ -6510,12 +8911,12 @@ define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovapd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [27:1.00]
; ZNVER1-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [27:1.00]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2)
@@ -6526,46 +8927,64 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_subpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: subpd %xmm1, %xmm0
-; GENERIC-NEXT: subpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: subpd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_subpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: subpd %xmm1, %xmm0
-; ATOM-NEXT: subpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: subpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: subpd (%rdi), %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_subpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: subpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_subpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_subpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_subpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_subpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_subpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fsub <2 x double> %a0, %a1
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fsub <2 x double> %1, %2
@@ -6574,46 +8993,64 @@ define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double>
define double @test_subsd(double %a0, double %a1, double *%a2) {
; GENERIC-LABEL: test_subsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: subsd %xmm1, %xmm0
-; GENERIC-NEXT: subsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: subsd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: subsd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_subsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: subsd %xmm1, %xmm0
-; ATOM-NEXT: subsd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: subsd %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: subsd (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_subsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: subsd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: subsd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_subsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_subsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_subsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_subsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_subsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_subsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = fsub double %a0, %a1
%2 = load double, double *%a2, align 8
%3 = fsub double %1, %2
@@ -6622,35 +9059,35 @@ define double @test_subsd(double %a0, double %a1, double *%a2) {
define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_ucomisd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: ucomisd %xmm1, %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %cl
-; GENERIC-NEXT: andb %al, %cl
-; GENERIC-NEXT: ucomisd (%rdi), %xmm0
-; GENERIC-NEXT: setnp %al
-; GENERIC-NEXT: sete %dl
-; GENERIC-NEXT: andb %al, %dl
-; GENERIC-NEXT: orb %cl, %dl
-; GENERIC-NEXT: movzbl %dl, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: ucomisd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %cl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %cl # sched: [1:0.33]
+; GENERIC-NEXT: ucomisd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: setnp %al # sched: [1:0.50]
+; GENERIC-NEXT: sete %dl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %dl # sched: [1:0.33]
+; GENERIC-NEXT: orb %cl, %dl # sched: [1:0.33]
+; GENERIC-NEXT: movzbl %dl, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_ucomisd:
-; ATOM: # BB#0:
-; ATOM-NEXT: ucomisd %xmm1, %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %cl
-; ATOM-NEXT: andb %al, %cl
-; ATOM-NEXT: ucomisd (%rdi), %xmm0
-; ATOM-NEXT: setnp %al
-; ATOM-NEXT: sete %dl
-; ATOM-NEXT: andb %al, %dl
-; ATOM-NEXT: orb %cl, %dl
-; ATOM-NEXT: movzbl %dl, %eax
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: ucomisd %xmm1, %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %cl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %cl # sched: [1:0.50]
+; ATOM-NEXT: ucomisd (%rdi), %xmm0 # sched: [10:5.00]
+; ATOM-NEXT: setnp %al # sched: [1:0.50]
+; ATOM-NEXT: sete %dl # sched: [1:0.50]
+; ATOM-NEXT: andb %al, %dl # sched: [1:0.50]
+; ATOM-NEXT: orb %cl, %dl # sched: [1:0.50]
+; ATOM-NEXT: movzbl %dl, %eax # sched: [1:1.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_ucomisd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: ucomisd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: setnp %al # sched: [1:0.50]
; SLM-NEXT: sete %cl # sched: [1:0.50]
@@ -6664,35 +9101,77 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_ucomisd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %cl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %cl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: setnp %al # sched: [1:0.33]
-; SANDY-NEXT: sete %dl # sched: [1:0.33]
+; SANDY-NEXT: setnp %al # sched: [1:0.50]
+; SANDY-NEXT: sete %dl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %dl # sched: [1:0.33]
; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33]
; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ucomisd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %cl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
-; HASWELL-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: vucomisd (%rdi), %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: setnp %al # sched: [1:0.50]
; HASWELL-NEXT: sete %dl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25]
; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ucomisd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %cl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %cl # sched: [1:0.25]
+; BROADWELL-NEXT: vucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: setnp %al # sched: [1:0.50]
+; BROADWELL-NEXT: sete %dl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: orb %cl, %dl # sched: [1:0.25]
+; BROADWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ucomisd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %cl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKYLAKE-NEXT: vucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; SKYLAKE-NEXT: setnp %al # sched: [1:0.50]
+; SKYLAKE-NEXT: sete %dl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKYLAKE-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ucomisd:
+; SKX: # %bb.0:
+; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %cl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKX-NEXT: vucomisd (%rdi), %xmm0 # sched: [8:1.00]
+; SKX-NEXT: setnp %al # sched: [1:0.50]
+; SKX-NEXT: sete %dl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %dl # sched: [1:0.25]
+; SKX-NEXT: orb %cl, %dl # sched: [1:0.25]
+; SKX-NEXT: movzbl %dl, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ucomisd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setnp %al # sched: [1:0.50]
; BTVER2-NEXT: sete %cl # sched: [1:0.50]
@@ -6706,7 +9185,7 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ucomisd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
@@ -6717,7 +9196,7 @@ define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2)
; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 8
%3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2)
@@ -6728,53 +9207,74 @@ declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readn
define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_unpckhpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; GENERIC-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_unpckhpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; ATOM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; ATOM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_unpckhpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
; SLM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [4:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_unpckhpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpckhpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00]
+; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpckhpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; BROADWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpckhpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpckhpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpckhpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 3>
@@ -6784,25 +9284,25 @@ define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_unpcklpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; GENERIC-NEXT: movapd %xmm0, %xmm1
-; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; GENERIC-NEXT: addpd %xmm0, %xmm1
-; GENERIC-NEXT: movapd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; GENERIC-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
+; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_unpcklpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; ATOM-NEXT: movapd %xmm0, %xmm1
-; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ATOM-NEXT: addpd %xmm0, %xmm1
-; ATOM-NEXT: movapd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; ATOM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_unpcklpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
@@ -6811,32 +9311,53 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_unpcklpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_unpcklpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00]
+; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_unpcklpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_unpcklpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_unpcklpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_unpcklpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2>
@@ -6846,53 +9367,74 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_xorpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: xorpd %xmm1, %xmm0
-; GENERIC-NEXT: xorpd (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: xorpd %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: xorpd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_xorpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: xorpd %xmm1, %xmm0
-; ATOM-NEXT: xorpd (%rdi), %xmm0
-; ATOM-NEXT: addpd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: xorpd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: xorpd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: addpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_xorpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: xorpd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: xorpd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_xorpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_xorpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_xorpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_xorpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_xorpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_xorpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = bitcast <2 x double> %a0 to <4 x i32>
%2 = bitcast <2 x double> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, %2
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index d1c7adb6263b..82d4b7721d91 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -5,7 +5,7 @@
define <8 x i16> @test_sllw_1(<8 x i16> %InVec) {
; CHECK-LABEL: test_sllw_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = shl <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -14,7 +14,7 @@ entry:
define <8 x i16> @test_sllw_2(<8 x i16> %InVec) {
; CHECK-LABEL: test_sllw_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: paddw %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -24,7 +24,7 @@ entry:
define <8 x i16> @test_sllw_3(<8 x i16> %InVec) {
; CHECK-LABEL: test_sllw_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psllw $15, %xmm0
; CHECK-NEXT: retq
entry:
@@ -34,7 +34,7 @@ entry:
define <4 x i32> @test_slld_1(<4 x i32> %InVec) {
; CHECK-LABEL: test_slld_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = shl <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
@@ -43,7 +43,7 @@ entry:
define <4 x i32> @test_slld_2(<4 x i32> %InVec) {
; CHECK-LABEL: test_slld_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: paddd %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -53,7 +53,7 @@ entry:
define <4 x i32> @test_slld_3(<4 x i32> %InVec) {
; CHECK-LABEL: test_slld_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pslld $31, %xmm0
; CHECK-NEXT: retq
entry:
@@ -63,7 +63,7 @@ entry:
define <2 x i64> @test_sllq_1(<2 x i64> %InVec) {
; CHECK-LABEL: test_sllq_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = shl <2 x i64> %InVec, <i64 0, i64 0>
@@ -72,7 +72,7 @@ entry:
define <2 x i64> @test_sllq_2(<2 x i64> %InVec) {
; CHECK-LABEL: test_sllq_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: paddq %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
@@ -82,7 +82,7 @@ entry:
define <2 x i64> @test_sllq_3(<2 x i64> %InVec) {
; CHECK-LABEL: test_sllq_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psllq $63, %xmm0
; CHECK-NEXT: retq
entry:
@@ -94,7 +94,7 @@ entry:
define <8 x i16> @test_sraw_1(<8 x i16> %InVec) {
; CHECK-LABEL: test_sraw_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = ashr <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -103,7 +103,7 @@ entry:
define <8 x i16> @test_sraw_2(<8 x i16> %InVec) {
; CHECK-LABEL: test_sraw_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psraw $1, %xmm0
; CHECK-NEXT: retq
entry:
@@ -113,7 +113,7 @@ entry:
define <8 x i16> @test_sraw_3(<8 x i16> %InVec) {
; CHECK-LABEL: test_sraw_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psraw $15, %xmm0
; CHECK-NEXT: retq
entry:
@@ -123,7 +123,7 @@ entry:
define <4 x i32> @test_srad_1(<4 x i32> %InVec) {
; CHECK-LABEL: test_srad_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = ashr <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
@@ -132,7 +132,7 @@ entry:
define <4 x i32> @test_srad_2(<4 x i32> %InVec) {
; CHECK-LABEL: test_srad_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrad $1, %xmm0
; CHECK-NEXT: retq
entry:
@@ -142,7 +142,7 @@ entry:
define <4 x i32> @test_srad_3(<4 x i32> %InVec) {
; CHECK-LABEL: test_srad_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrad $31, %xmm0
; CHECK-NEXT: retq
entry:
@@ -154,7 +154,7 @@ entry:
define <8 x i16> @test_srlw_1(<8 x i16> %InVec) {
; CHECK-LABEL: test_srlw_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = lshr <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -163,7 +163,7 @@ entry:
define <8 x i16> @test_srlw_2(<8 x i16> %InVec) {
; CHECK-LABEL: test_srlw_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrlw $1, %xmm0
; CHECK-NEXT: retq
entry:
@@ -173,7 +173,7 @@ entry:
define <8 x i16> @test_srlw_3(<8 x i16> %InVec) {
; CHECK-LABEL: test_srlw_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrlw $15, %xmm0
; CHECK-NEXT: retq
entry:
@@ -183,7 +183,7 @@ entry:
define <4 x i32> @test_srld_1(<4 x i32> %InVec) {
; CHECK-LABEL: test_srld_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = lshr <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
@@ -192,7 +192,7 @@ entry:
define <4 x i32> @test_srld_2(<4 x i32> %InVec) {
; CHECK-LABEL: test_srld_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrld $1, %xmm0
; CHECK-NEXT: retq
entry:
@@ -202,7 +202,7 @@ entry:
define <4 x i32> @test_srld_3(<4 x i32> %InVec) {
; CHECK-LABEL: test_srld_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrld $31, %xmm0
; CHECK-NEXT: retq
entry:
@@ -212,7 +212,7 @@ entry:
define <2 x i64> @test_srlq_1(<2 x i64> %InVec) {
; CHECK-LABEL: test_srlq_1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: retq
entry:
%shl = lshr <2 x i64> %InVec, <i64 0, i64 0>
@@ -221,7 +221,7 @@ entry:
define <2 x i64> @test_srlq_2(<2 x i64> %InVec) {
; CHECK-LABEL: test_srlq_2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrlq $1, %xmm0
; CHECK-NEXT: retq
entry:
@@ -231,7 +231,7 @@ entry:
define <2 x i64> @test_srlq_3(<2 x i64> %InVec) {
; CHECK-LABEL: test_srlq_3:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: psrlq $63, %xmm0
; CHECK-NEXT: retq
entry:
@@ -241,7 +241,7 @@ entry:
define <4 x i32> @sra_sra_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: sra_sra_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrad $6, %xmm0
; CHECK-NEXT: retq
%sra0 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -251,7 +251,7 @@ define <4 x i32> @sra_sra_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @srl_srl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: srl_srl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrld $6, %xmm0
; CHECK-NEXT: retq
%srl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -261,7 +261,7 @@ define <4 x i32> @srl_srl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @srl_shl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: srl_shl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%srl0 = shl <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
@@ -271,7 +271,7 @@ define <4 x i32> @srl_shl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @srl_sra_31_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-LABEL: srl_sra_31_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrld $31, %xmm0
; CHECK-NEXT: retq
%sra = ashr <4 x i32> %x, %y
@@ -281,7 +281,7 @@ define <4 x i32> @srl_sra_31_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
define <4 x i32> @shl_shl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: shl_shl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pslld $6, %xmm0
; CHECK-NEXT: retq
%shl0 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
@@ -291,7 +291,7 @@ define <4 x i32> @shl_shl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_sra_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: shl_sra_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%shl0 = ashr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
@@ -301,7 +301,7 @@ define <4 x i32> @shl_sra_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: shl_srl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pslld $3, %xmm0
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
@@ -312,7 +312,7 @@ define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_srl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
@@ -324,7 +324,7 @@ define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
define <4 x i16> @sra_trunc_srl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: sra_trunc_srl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrad $19, %xmm0
; CHECK-NEXT: retq
%srl = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
@@ -335,8 +335,7 @@ define <4 x i16> @sra_trunc_srl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
; CHECK-LABEL: shl_zext_shl_v4i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
+; CHECK: # %bb.0:
; CHECK-NEXT: pslld $19, %xmm0
; CHECK-NEXT: retq
%shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
@@ -347,7 +346,7 @@ define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
define <4 x i32> @sra_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: sra_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrad $3, %xmm0
; CHECK-NEXT: retq
%sra = ashr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -356,7 +355,7 @@ define <4 x i32> @sra_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @srl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: srl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: psrld $3, %xmm0
; CHECK-NEXT: retq
%sra = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
@@ -365,7 +364,7 @@ define <4 x i32> @srl_v4i32(<4 x i32> %x) nounwind {
define <4 x i32> @shl_v4i32(<4 x i32> %x) nounwind {
; CHECK-LABEL: shl_v4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pslld $3, %xmm0
; CHECK-NEXT: retq
%sra = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 5e7def9150e9..285fdb6e76d8 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -6,7 +6,7 @@
define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-LABEL: test1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movapd (%ecx), %xmm0
@@ -15,7 +15,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movapd (%rsi), %xmm1
; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; X64-NEXT: movapd %xmm1, (%rdi)
@@ -29,7 +29,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-LABEL: test2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movapd (%ecx), %xmm0
@@ -38,10 +38,10 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0:
-; X64-NEXT: movapd (%rsi), %xmm1
-; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: movapd %xmm1, (%rdi)
+; X64: # %bb.0:
+; X64-NEXT: movaps (%rsi), %xmm1
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movaps %xmm1, (%rdi)
; X64-NEXT: retq
%tmp3 = load <2 x double>, <2 x double>* %A, align 16
%tmp7 = insertelement <2 x double> undef, double %B, i32 0
@@ -53,7 +53,7 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind {
; X86-LABEL: test3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -63,7 +63,7 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
; X86-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rsi), %xmm0
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; X64-NEXT: movaps %xmm0, (%rdi)
@@ -84,14 +84,14 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
; X86-LABEL: test4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -102,7 +102,7 @@ define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
define <4 x i32> @test5(i8** %ptr) nounwind {
; X86-LABEL: test5:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -112,7 +112,7 @@ define <4 x i32> @test5(i8** %ptr) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: pxor %xmm0, %xmm0
@@ -136,7 +136,7 @@ define <4 x i32> @test5(i8** %ptr) nounwind {
define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; X86-LABEL: test6:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm0
@@ -144,7 +144,7 @@ define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test6:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rsi), %xmm0
; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -156,13 +156,13 @@ define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
define void @test7() nounwind {
; X86-LABEL: test7:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, 0
; X86-NEXT: retl
;
; X64-LABEL: test7:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, 0
; X64-NEXT: retq
@@ -176,12 +176,12 @@ define void @test7() nounwind {
define <2 x i64> @test8() nounwind {
; X86-LABEL: test8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movups x, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movups {{.*}}(%rip), %xmm0
; X64-NEXT: retq
%tmp = load i32, i32* getelementptr ([4 x i32], [4 x i32]* @x, i32 0, i32 0) ; <i32> [#uses=1]
@@ -198,15 +198,15 @@ define <2 x i64> @test8() nounwind {
define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) nounwind {
; X86-LABEL: test9:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
%tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
@@ -217,15 +217,15 @@ define <4 x float> @test9(i32 %dummy, float %a, float %b, float %c, float %d) no
define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
; X86-LABEL: test10:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test10:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
%tmp = insertelement <4 x float> undef, float %a, i32 0 ; <<4 x float>> [#uses=1]
%tmp11 = insertelement <4 x float> %tmp, float %b, i32 1 ; <<4 x float>> [#uses=1]
@@ -236,13 +236,13 @@ define <4 x float> @test10(float %a, float %b, float %c, float %d) nounwind {
define <2 x double> @test11(double %a, double %b) nounwind {
; X86-LABEL: test11:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test11:
-; X64: # BB#0:
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64: # %bb.0:
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
%tmp = insertelement <2 x double> undef, double %a, i32 0 ; <<2 x double>> [#uses=1]
%tmp7 = insertelement <2 x double> %tmp, double %b, i32 1 ; <<2 x double>> [#uses=1]
@@ -251,7 +251,7 @@ define <2 x double> @test11(double %a, double %b) nounwind {
define void @test12() nounwind {
; X86-LABEL: test12:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movapd 0, %xmm0
; X86-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; X86-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -262,7 +262,7 @@ define void @test12() nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test12:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movapd 0, %xmm0
; X64-NEXT: movapd {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
; X64-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -281,7 +281,7 @@ define void @test12() nounwind {
define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x float>* %C) nounwind {
; X86-LABEL: test13:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -292,7 +292,7 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl
; X86-NEXT: retl
;
; X64-LABEL: test13:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdx), %xmm0
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
@@ -307,7 +307,7 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl
define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-LABEL: test14:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps (%ecx), %xmm1
@@ -315,17 +315,17 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-NEXT: movaps %xmm2, %xmm0
; X86-NEXT: addps %xmm1, %xmm0
; X86-NEXT: subps %xmm1, %xmm2
-; X86-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X86-NEXT: retl
;
; X64-LABEL: test14:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rsi), %xmm1
; X64-NEXT: movaps (%rdi), %xmm2
; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: subps %xmm1, %xmm2
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; X64-NEXT: retq
%tmp = load <4 x float>, <4 x float>* %y ; <<4 x float>> [#uses=2]
%tmp5 = load <4 x float>, <4 x float>* %x ; <<4 x float>> [#uses=2]
@@ -337,16 +337,16 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
define <4 x float> @test15(<4 x float>* %x, <4 x float>* %y) nounwind {
; X86-LABEL: test15:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movapd (%ecx), %xmm0
+; X86-NEXT: movaps (%ecx), %xmm0
; X86-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X86-NEXT: retl
;
; X64-LABEL: test15:
-; X64: # BB#0: # %entry
-; X64-NEXT: movapd (%rdi), %xmm0
+; X64: # %bb.0: # %entry
+; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1]
; X64-NEXT: retq
entry:
@@ -360,15 +360,15 @@ entry:
define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocapture %dst) {
; X86-LABEL: test16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movapd 96(%eax), %xmm0
+; X86-NEXT: movaps 96(%eax), %xmm0
; X86-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X86-NEXT: retl
;
; X64-LABEL: test16:
-; X64: # BB#0:
-; X64-NEXT: movapd 96(%rdi), %xmm0
+; X64: # %bb.0:
+; X64-NEXT: movaps 96(%rdi), %xmm0
; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: retq
%i5 = getelementptr inbounds <4 x double>, <4 x double>* %srcA, i32 3
@@ -380,13 +380,13 @@ define <2 x double> @test16(<4 x double> * nocapture %srcA, <2 x double>* nocap
; PR9009
define fastcc void @test17() nounwind {
; X86-LABEL: test17:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X86-NEXT: movaps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: test17:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movaps {{.*#+}} xmm0 = <u,u,32768,32768>
; X64-NEXT: movaps %xmm0, (%rax)
; X64-NEXT: retq
@@ -401,14 +401,14 @@ entry:
; PR9210
define <4 x float> @f(<4 x double>) nounwind {
; X86-LABEL: f:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: cvtpd2ps %xmm1, %xmm1
; X86-NEXT: cvtpd2ps %xmm0, %xmm0
; X86-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: retl
;
; X64-LABEL: f:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: cvtpd2ps %xmm1, %xmm1
; X64-NEXT: cvtpd2ps %xmm0, %xmm0
; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -420,12 +420,12 @@ entry:
define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
; X86-LABEL: test_insert_64_zext:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X86-NEXT: retl
;
; X64-LABEL: test_insert_64_zext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
%1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
@@ -434,12 +434,12 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
define <4 x i32> @PR19721(<4 x i32> %i) {
; X86-LABEL: PR19721:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: PR19721:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000
; X64-NEXT: andq %rax, %rcx
@@ -454,7 +454,7 @@ define <4 x i32> @PR19721(<4 x i32> %i) {
define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; X86-LABEL: test_mul:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-NEXT: pmuludq %xmm1, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -465,7 +465,7 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) {
; X86-NEXT: retl
;
; X64-LABEL: test_mul:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: pmuludq %xmm1, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
index b5aa26f532ef..aba916241f3a 100644
--- a/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -7,12 +7,12 @@
define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
@@ -36,12 +36,12 @@ define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 2
@@ -57,12 +57,12 @@ define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
@@ -78,12 +78,12 @@ define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 2
@@ -99,12 +99,12 @@ define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
@@ -120,12 +120,12 @@ define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
@@ -149,13 +149,13 @@ define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd %xmm2, %xmm0
; SSE-NEXT: addsubpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%1 = extractelement <4 x double> %A, i32 0
@@ -179,12 +179,12 @@ define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: test8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <2 x double> %A, i32 0
@@ -200,13 +200,13 @@ define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test9:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm2, %xmm0
; SSE-NEXT: addsubps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%1 = extractelement <8 x float> %A, i32 0
@@ -249,12 +249,12 @@ define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test10:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test10:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = extractelement <4 x float> %A, i32 0
@@ -266,7 +266,7 @@ define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: subss %xmm1, %xmm0
@@ -274,7 +274,7 @@ define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: retq
;
; AVX-LABEL: test11:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
@@ -289,7 +289,7 @@ define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test12:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT: addss %xmm0, %xmm1
@@ -297,7 +297,7 @@ define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: retq
;
; AVX-LABEL: test12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -312,7 +312,7 @@ define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test13:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
@@ -321,7 +321,7 @@ define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: retq
;
; AVX-LABEL: test13:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
@@ -336,18 +336,18 @@ define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm1, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: subss %xmm1, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -367,7 +367,7 @@ define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test15:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT: addss %xmm3, %xmm2
@@ -379,7 +379,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: retq
;
; AVX-LABEL: test15:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2
@@ -402,7 +402,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
@@ -417,12 +417,12 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vsubss %xmm0, %xmm0, %xmm2
; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
@@ -457,12 +457,12 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
; SSE-LABEL: test_v2f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%v2 = extractelement <2 x float> %v0, i32 0
diff --git a/test/CodeGen/X86/sse3-avx-addsub.ll b/test/CodeGen/X86/sse3-avx-addsub.ll
index 0e0cf4852568..7c87532ffea6 100644
--- a/test/CodeGen/X86/sse3-avx-addsub.ll
+++ b/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -38,12 +38,12 @@
define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%sub = fsub <4 x float> %A, %B
@@ -54,13 +54,13 @@ define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
define <8 x float> @test2(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm2, %xmm0
; SSE-NEXT: addsubps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%sub = fsub <8 x float> %A, %B
@@ -71,13 +71,13 @@ define <8 x float> @test2(<8 x float> %A, <8 x float> %B) {
define <4 x double> @test3(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd %xmm2, %xmm0
; SSE-NEXT: addsubpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%sub = fsub <4 x double> %A, %B
@@ -88,12 +88,12 @@ define <4 x double> @test3(<4 x double> %A, <4 x double> %B) {
define <2 x double> @test4(<2 x double> %A, <2 x double> %B) #0 {
; SSE-LABEL: test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%add = fadd <2 x double> %A, %B
@@ -104,7 +104,7 @@ define <2 x double> @test4(<2 x double> %A, <2 x double> %B) #0 {
define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
; SSE-LABEL: test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps %xmm4, %xmm0
; SSE-NEXT: addsubps %xmm5, %xmm1
; SSE-NEXT: addsubps %xmm6, %xmm2
@@ -112,13 +112,13 @@ define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test5:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vaddsubps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddsubps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: test5:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm2
; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512-NEXT: kmovw %eax, %k1
@@ -133,7 +133,7 @@ define <16 x float> @test5(<16 x float> %A, <16 x float> %B) {
define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
; SSE-LABEL: test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd %xmm4, %xmm0
; SSE-NEXT: addsubpd %xmm5, %xmm1
; SSE-NEXT: addsubpd %xmm6, %xmm2
@@ -141,13 +141,13 @@ define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test6:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vaddsubpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vaddsubpd %ymm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: test6:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vsubpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm2[1],zmm0[2],zmm2[3],zmm0[4],zmm2[5],zmm0[6],zmm2[7]
@@ -160,12 +160,12 @@ define <8 x double> @test6(<8 x double> %A, <8 x double> %B) {
define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {
; SSE-LABEL: test1b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <4 x float>, <4 x float>* %B
@@ -177,13 +177,13 @@ define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {
define <8 x float> @test2b(<8 x float> %A, <8 x float>* %B) {
; SSE-LABEL: test2b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps (%rdi), %xmm0
; SSE-NEXT: addsubps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test2b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = load <8 x float>, <8 x float>* %B
@@ -195,13 +195,13 @@ define <8 x float> @test2b(<8 x float> %A, <8 x float>* %B) {
define <4 x double> @test3b(<4 x double> %A, <4 x double>* %B) {
; SSE-LABEL: test3b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd (%rdi), %xmm0
; SSE-NEXT: addsubpd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test3b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = load <4 x double>, <4 x double>* %B
@@ -213,12 +213,12 @@ define <4 x double> @test3b(<4 x double> %A, <4 x double>* %B) {
define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {
; SSE-LABEL: test4b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <2 x double>, <2 x double>* %B
@@ -230,12 +230,12 @@ define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {
define <4 x float> @test1c(<4 x float> %A, <4 x float>* %B) {
; SSE-LABEL: test1c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test1c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <4 x float>, <4 x float>* %B
@@ -247,13 +247,13 @@ define <4 x float> @test1c(<4 x float> %A, <4 x float>* %B) {
define <8 x float> @test2c(<8 x float> %A, <8 x float>* %B) {
; SSE-LABEL: test2c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubps (%rdi), %xmm0
; SSE-NEXT: addsubps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test2c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubps (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = load <8 x float>, <8 x float>* %B
@@ -265,13 +265,13 @@ define <8 x float> @test2c(<8 x float> %A, <8 x float>* %B) {
define <4 x double> @test3c(<4 x double> %A, <4 x double>* %B) {
; SSE-LABEL: test3c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd (%rdi), %xmm0
; SSE-NEXT: addsubpd 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: test3c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd (%rdi), %ymm0, %ymm0
; AVX-NEXT: retq
%1 = load <4 x double>, <4 x double>* %B
@@ -283,12 +283,12 @@ define <4 x double> @test3c(<4 x double> %A, <4 x double>* %B) {
define <2 x double> @test4c(<2 x double> %A, <2 x double>* %B) {
; SSE-LABEL: test4c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: addsubpd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%1 = load <2 x double>, <2 x double>* %B
diff --git a/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
index 0111de2f5211..5bf36a51c764 100644
--- a/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define <2 x double> @test_mm_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_addsub_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addsubpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_addsub_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addsubpd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
@@ -21,12 +21,12 @@ declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwi
define <4 x float> @test_mm_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_addsub_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addsubps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_addsub_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addsubps %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
@@ -36,12 +36,12 @@ declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind
define <2 x double> @test_mm_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_hadd_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: haddpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hadd_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: haddpd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
@@ -51,12 +51,12 @@ declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_mm_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_hadd_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: haddps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hadd_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: haddps %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
@@ -66,12 +66,12 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind re
define <2 x double> @test_mm_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_hsub_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: hsubpd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsub_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: hsubpd %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
@@ -81,12 +81,12 @@ declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_mm_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_hsub_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: hsubps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsub_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: hsubps %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
@@ -96,13 +96,13 @@ declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind re
define <2 x i64> @test_mm_lddqu_si128(<2 x i64>* %a0) {
; X32-LABEL: test_mm_lddqu_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: lddqu (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_lddqu_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: lddqu (%rdi), %xmm0
; X64-NEXT: retq
%bc = bitcast <2 x i64>* %a0 to i8*
@@ -114,13 +114,13 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
define <2 x double> @test_mm_loaddup_pd(double* %a0) {
; X32-LABEL: test_mm_loaddup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_loaddup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
%ld = load double, double* %a0
@@ -131,12 +131,12 @@ define <2 x double> @test_mm_loaddup_pd(double* %a0) {
define <2 x double> @test_mm_movedup_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_movedup_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movedup_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> zeroinitializer
@@ -145,12 +145,12 @@ define <2 x double> @test_mm_movedup_pd(<2 x double> %a0) {
define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_movehdup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_movehdup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -159,12 +159,12 @@ define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_moveldup_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_moveldup_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
diff --git a/test/CodeGen/X86/sse3-intrinsics-x86.ll b/test/CodeGen/X86/sse3-intrinsics-x86.ll
index fd7f59a01579..18bd2195cb93 100644
--- a/test/CodeGen/X86/sse3-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse3-intrinsics-x86.ll
@@ -5,12 +5,12 @@
define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse3_addsub_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: addsubpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xd0,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_addsub_pd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd0,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -21,12 +21,12 @@ declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwi
define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse3_addsub_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: addsubps %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0xd0,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_addsub_ps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xd0,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -37,12 +37,12 @@ declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind
define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse3_hadd_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x7c,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_hadd_pd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7c,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -53,12 +53,12 @@ declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse3_hadd_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x7c,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_hadd_ps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vhaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7c,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -69,12 +69,12 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind re
define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_x86_sse3_hsub_pd:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x7d,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_hsub_pd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7d,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
@@ -85,12 +85,12 @@ declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_x86_sse3_hsub_ps:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x7d,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_hsub_ps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vhsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7d,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
@@ -101,13 +101,13 @@ declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind re
define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
; SSE-LABEL: test_x86_sse3_ldu_dq:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: lddqu (%eax), %xmm0 ## encoding: [0xf2,0x0f,0xf0,0x00]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse3_ldu_dq:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vlddqu (%eax), %xmm0 ## encoding: [0xc5,0xfb,0xf0,0x00]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -120,7 +120,7 @@ declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
define void @monitor(i8* %P, i32 %E, i32 %H) nounwind {
; CHECK-LABEL: monitor:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x0c]
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
@@ -134,7 +134,7 @@ declare void @llvm.x86.sse3.monitor(i8*, i32, i32) nounwind
define void @mwait(i32 %E, i32 %H) nounwind {
; CHECK-LABEL: mwait:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; CHECK-NEXT: mwait ## encoding: [0x0f,0x01,0xc9]
diff --git a/test/CodeGen/X86/sse3-schedule.ll b/test/CodeGen/X86/sse3-schedule.ll
index 5f41ccda0fde..5de26ab19d21 100644
--- a/test/CodeGen/X86/sse3-schedule.ll
+++ b/test/CodeGen/X86/sse3-schedule.ll
@@ -1,56 +1,76 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_addsubpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addsubpd %xmm1, %xmm0
-; GENERIC-NEXT: addsubpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addsubpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: addsubpd (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_addsubpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: addsubpd %xmm1, %xmm0
-; ATOM-NEXT: addsubpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addsubpd %xmm1, %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: addsubpd (%rdi), %xmm0 # sched: [6:3.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_addsubpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addsubpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addsubpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_addsubpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addsubpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addsubpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addsubpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addsubpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2)
@@ -60,46 +80,64 @@ declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwi
define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_addsubps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: addsubps %xmm1, %xmm0
-; GENERIC-NEXT: addsubps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: addsubps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: addsubps (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_addsubps:
-; ATOM: # BB#0:
-; ATOM-NEXT: addsubps %xmm1, %xmm0
-; ATOM-NEXT: addsubps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: addsubps %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: addsubps (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_addsubps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: addsubps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addsubps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_addsubps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_addsubps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_addsubps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_addsubps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_addsubps:
+; SKX: # %bb.0:
+; SKX-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_addsubps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2)
@@ -109,46 +147,64 @@ declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind
define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_haddpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: haddpd %xmm1, %xmm0
-; GENERIC-NEXT: haddpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: haddpd %xmm1, %xmm0 # sched: [5:2.00]
+; GENERIC-NEXT: haddpd (%rdi), %xmm0 # sched: [11:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_haddpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: haddpd %xmm1, %xmm0
-; ATOM-NEXT: haddpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: haddpd %xmm1, %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: haddpd (%rdi), %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_haddpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: haddpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: haddpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_haddpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_haddpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_haddpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_haddpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKX-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_haddpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2)
@@ -158,46 +214,64 @@ declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_haddps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: haddps %xmm1, %xmm0
-; GENERIC-NEXT: haddps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: haddps %xmm1, %xmm0 # sched: [5:2.00]
+; GENERIC-NEXT: haddps (%rdi), %xmm0 # sched: [11:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_haddps:
-; ATOM: # BB#0:
-; ATOM-NEXT: haddps %xmm1, %xmm0
-; ATOM-NEXT: haddps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: haddps %xmm1, %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: haddps (%rdi), %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_haddps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: haddps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: haddps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_haddps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_haddps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_haddps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_haddps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_haddps:
+; SKX: # %bb.0:
+; SKX-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKX-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_haddps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2)
@@ -207,46 +281,64 @@ declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind re
define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_hsubpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: hsubpd %xmm1, %xmm0
-; GENERIC-NEXT: hsubpd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: hsubpd %xmm1, %xmm0 # sched: [5:2.00]
+; GENERIC-NEXT: hsubpd (%rdi), %xmm0 # sched: [11:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_hsubpd:
-; ATOM: # BB#0:
-; ATOM-NEXT: hsubpd %xmm1, %xmm0
-; ATOM-NEXT: hsubpd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: hsubpd %xmm1, %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: hsubpd (%rdi), %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_hsubpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: hsubpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: hsubpd (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_hsubpd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_hsubpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_hsubpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_hsubpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKX-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_hsubpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2)
@@ -256,46 +348,64 @@ declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind
define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_hsubps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: hsubps %xmm1, %xmm0
-; GENERIC-NEXT: hsubps (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: hsubps %xmm1, %xmm0 # sched: [5:2.00]
+; GENERIC-NEXT: hsubps (%rdi), %xmm0 # sched: [11:2.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_hsubps:
-; ATOM: # BB#0:
-; ATOM-NEXT: hsubps %xmm1, %xmm0
-; ATOM-NEXT: hsubps (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: hsubps %xmm1, %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: hsubps (%rdi), %xmm0 # sched: [9:4.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_hsubps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: hsubps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: hsubps (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_hsubps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; SANDY-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_hsubps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
-; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_hsubps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
+; BROADWELL-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_hsubps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKYLAKE-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_hsubps:
+; SKX: # %bb.0:
+; SKX-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
+; SKX-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_hsubps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2)
@@ -305,153 +415,285 @@ declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind re
define <16 x i8> @test_lddqu(i8* %a0) {
; GENERIC-LABEL: test_lddqu:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: lddqu (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: lddqu (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_lddqu:
-; ATOM: # BB#0:
-; ATOM-NEXT: lddqu (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: lddqu (%rdi), %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_lddqu:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: lddqu (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_lddqu:
-; SANDY: # BB#0:
-; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_lddqu:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_lddqu:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_lddqu:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_lddqu:
+; SKX: # %bb.0:
+; SKX-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_lddqu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lddqu:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vlddqu (%rdi), %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0)
ret <16 x i8> %1
}
declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
+define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
+; GENERIC-LABEL: test_monitor:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; GENERIC-NEXT: movl %esi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: monitor # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_monitor:
+; ATOM: # %bb.0:
+; ATOM-NEXT: leaq (%rdi), %rax # sched: [1:1.00]
+; ATOM-NEXT: movl %esi, %ecx # sched: [1:0.50]
+; ATOM-NEXT: monitor # sched: [45:22.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_monitor:
+; SLM: # %bb.0:
+; SLM-NEXT: leaq (%rdi), %rax # sched: [1:1.00]
+; SLM-NEXT: movl %esi, %ecx # sched: [1:0.50]
+; SLM-NEXT: monitor # sched: [100:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_monitor:
+; SANDY: # %bb.0:
+; SANDY-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; SANDY-NEXT: movl %esi, %ecx # sched: [1:0.33]
+; SANDY-NEXT: monitor # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_monitor:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; HASWELL-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; HASWELL-NEXT: monitor # sched: [100:0.25]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_monitor:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; BROADWELL-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT: monitor # sched: [100:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_monitor:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; SKYLAKE-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT: monitor # sched: [100:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_monitor:
+; SKX: # %bb.0:
+; SKX-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; SKX-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; SKX-NEXT: monitor # sched: [100:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_monitor:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
+; BTVER2-NEXT: movl %esi, %ecx # sched: [1:0.50]
+; BTVER2-NEXT: monitor # sched: [100:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_monitor:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: leaq (%rdi), %rax # sched: [1:0.25]
+; ZNVER1-NEXT: movl %esi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: monitor # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.sse3.monitor(i8* %a0, i32 %a1, i32 %a2)
+ ret void
+}
+declare void @llvm.x86.sse3.monitor(i8*, i32, i32)
+
define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_movddup:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
-; GENERIC-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
+; GENERIC-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] sched: [6:0.50]
+; GENERIC-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movddup:
-; ATOM: # BB#0:
-; ATOM-NEXT: movddup {{.*#+}} xmm1 = mem[0,0]
-; ATOM-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
-; ATOM-NEXT: addpd %xmm0, %xmm1
-; ATOM-NEXT: movapd %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movddup {{.*#+}} xmm1 = mem[0,0] sched: [1:1.00]
+; ATOM-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; ATOM-NEXT: subpd %xmm0, %xmm1 # sched: [6:3.00]
+; ATOM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movddup:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
; SLM-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] sched: [3:1.00]
-; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; SLM-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movddup:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
-; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
+; SANDY-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movddup:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [4:0.50]
-; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:0.50]
+; HASWELL-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movddup:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; BROADWELL-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:0.50]
+; BROADWELL-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movddup:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; SKYLAKE-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:0.50]
+; SKYLAKE-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movddup:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
+; SKX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:0.50]
+; SKX-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movddup:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:1.00]
; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
-; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BTVER2-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movddup:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50]
; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
-; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
- %4 = fadd <2 x double> %1, %3
+ %4 = fsub <2 x double> %3, %1 ; Use fsub to stop the movddup from being folded as a broadcast load in avx512vl.
ret <2 x double> %4
}
define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_movshdup:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; GENERIC-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3]
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:1.00]
+; GENERIC-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [6:0.50]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movshdup:
-; ATOM: # BB#0:
-; ATOM-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
-; ATOM-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [1:1.00]
+; ATOM-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movshdup:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:1.00]
; SLM-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [3:1.00]
; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movshdup:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; SANDY-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movshdup:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:0.50]
+; HASWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movshdup:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; BROADWELL-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [5:0.50]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movshdup:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; SKYLAKE-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movshdup:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
+; SKX-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movshdup:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [5:1.00]
; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movshdup:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50]
; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -461,57 +703,153 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_movsldup:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; GENERIC-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2]
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:1.00]
+; GENERIC-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [6:0.50]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_movsldup:
-; ATOM: # BB#0:
-; ATOM-NEXT: movsldup {{.*#+}} xmm1 = mem[0,0,2,2]
-; ATOM-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; ATOM-NEXT: addps %xmm0, %xmm1
-; ATOM-NEXT: movaps %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: movsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [1:1.00]
+; ATOM-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; ATOM-NEXT: addps %xmm0, %xmm1 # sched: [5:5.00]
+; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_movsldup:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:1.00]
; SLM-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [3:1.00]
; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movsldup:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; SANDY-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movsldup:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:0.50]
+; HASWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movsldup:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; BROADWELL-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [5:0.50]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movsldup:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; SKYLAKE-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movsldup:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
+; SKX-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movsldup:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [5:1.00]
; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsldup:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [8:0.50]
; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%4 = fadd <4 x float> %1, %3
ret <4 x float> %4
}
+
+define void @test_mwait(i32 %a0, i32 %a1) {
+; GENERIC-LABEL: test_mwait:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: movl %esi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: mwait # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; ATOM-LABEL: test_mwait:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl %edi, %ecx # sched: [1:0.50]
+; ATOM-NEXT: movl %esi, %eax # sched: [1:0.50]
+; ATOM-NEXT: mwait # sched: [46:23.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
+;
+; SLM-LABEL: test_mwait:
+; SLM: # %bb.0:
+; SLM-NEXT: movl %edi, %ecx # sched: [1:0.50]
+; SLM-NEXT: movl %esi, %eax # sched: [1:0.50]
+; SLM-NEXT: mwait # sched: [100:1.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_mwait:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl %edi, %ecx # sched: [1:0.33]
+; SANDY-NEXT: movl %esi, %eax # sched: [1:0.33]
+; SANDY-NEXT: mwait # sched: [100:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_mwait:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl %edi, %ecx # sched: [1:0.25]
+; HASWELL-NEXT: movl %esi, %eax # sched: [1:0.25]
+; HASWELL-NEXT: mwait # sched: [20:2.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mwait:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl %edi, %ecx # sched: [1:0.25]
+; BROADWELL-NEXT: movl %esi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: mwait # sched: [100:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mwait:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl %edi, %ecx # sched: [1:0.25]
+; SKYLAKE-NEXT: movl %esi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: mwait # sched: [20:2.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mwait:
+; SKX: # %bb.0:
+; SKX-NEXT: movl %edi, %ecx # sched: [1:0.25]
+; SKX-NEXT: movl %esi, %eax # sched: [1:0.25]
+; SKX-NEXT: mwait # sched: [20:2.50]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_mwait:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl %edi, %ecx # sched: [1:0.50]
+; BTVER2-NEXT: movl %esi, %eax # sched: [1:0.50]
+; BTVER2-NEXT: mwait # sched: [100:0.17]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_mwait:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl %edi, %ecx # sched: [1:0.25]
+; ZNVER1-NEXT: movl %esi, %eax # sched: [1:0.25]
+; ZNVER1-NEXT: mwait # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ tail call void @llvm.x86.sse3.mwait(i32 %a0, i32 %a1)
+ ret void
+}
+declare void @llvm.x86.sse3.mwait(i32, i32)
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 1e7b9da6a321..09914e09faa8 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -9,7 +9,7 @@
define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
; X86-LABEL: t0:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $1, %edx
@@ -19,7 +19,7 @@ define void @t0(<8 x i16>* %dest, <8 x i16>* %old) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t0:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl $1, %eax
; X64-NEXT: movd %eax, %xmm0
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
@@ -36,7 +36,7 @@ entry:
define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; X86-LABEL: t1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -47,7 +47,7 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,65535]
; X64-NEXT: movaps %xmm0, %xmm1
; X64-NEXT: andnps (%rsi), %xmm1
@@ -63,7 +63,7 @@ define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) nounwind {
define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-LABEL: t2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; X86-NEXT: pand %xmm2, %xmm0
; X86-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7]
@@ -72,7 +72,7 @@ define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; X64-NEXT: pand %xmm2, %xmm0
; X64-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,2,1,4,5,6,7]
@@ -85,7 +85,7 @@ define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) nounwind {
define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-LABEL: t3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
@@ -94,7 +94,7 @@ define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
@@ -107,7 +107,7 @@ define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) nounwind {
define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-LABEL: t4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
@@ -115,7 +115,7 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
@@ -127,15 +127,15 @@ define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) nounwind {
define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-LABEL: t5:
-; X86: # BB#0:
-; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-NEXT: movdqa %xmm1, %xmm0
+; X86: # %bb.0:
+; X86-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X86-NEXT: movaps %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t5:
-; X64: # BB#0:
-; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-NEXT: movdqa %xmm1, %xmm0
+; X64: # %bb.0:
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
ret <8 x i16> %tmp
@@ -143,12 +143,12 @@ define <8 x i16> @t5(<8 x i16> %A, <8 x i16> %B) nounwind {
define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-LABEL: t6:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT: retl
;
; X64-LABEL: t6:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
@@ -157,13 +157,13 @@ define <8 x i16> @t6(<8 x i16> %A, <8 x i16> %B) nounwind {
define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
; X86-LABEL: t7:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; X86-NEXT: retl
;
; X64-LABEL: t7:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; X64-NEXT: retq
@@ -173,7 +173,7 @@ define <8 x i16> @t7(<8 x i16> %A, <8 x i16> %B) nounwind {
define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
; X86-LABEL: t8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
@@ -182,7 +182,7 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[2,1,0,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -212,7 +212,7 @@ define void @t8(<2 x i64>* %res, <2 x i64>* %A) nounwind {
define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X86-LABEL: t9:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movapd (%ecx), %xmm0
@@ -221,7 +221,7 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movapd (%rdi), %xmm0
; X64-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X64-NEXT: movapd %xmm0, (%rdi)
@@ -254,7 +254,7 @@ define void @t9(<4 x float>* %r, <2 x i32>* %A) nounwind {
define void @t10() nounwind {
; X86-LABEL: t10:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -262,7 +262,7 @@ define void @t10() nounwind {
; X86-NEXT: retl
;
; X64-LABEL: t10:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -281,13 +281,13 @@ define void @t10() nounwind {
; Pack various elements via shuffles.
define <8 x i16> @t11(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X86-LABEL: t11:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: psrld $16, %xmm0
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: retl
;
; X64-LABEL: t11:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrld $16, %xmm0
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: retq
@@ -299,14 +299,14 @@ entry:
define <8 x i16> @t12(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X86-LABEL: t12:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: t12:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
@@ -319,14 +319,14 @@ entry:
define <8 x i16> @t13(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X86-LABEL: t13:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: t13:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
@@ -338,14 +338,14 @@ entry:
define <8 x i16> @t14(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X86-LABEL: t14:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: psrlq $16, %xmm0
; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-NEXT: movdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t14:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrlq $16, %xmm0
; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-NEXT: movdqa %xmm1, %xmm0
@@ -358,14 +358,14 @@ entry:
; FIXME: t15 is worse off from disabling of scheduler 2-address hack.
define <8 x i16> @t15(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; X86-LABEL: t15:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: retl
;
; X64-LABEL: t15:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -378,14 +378,14 @@ entry:
; Test yonah where we convert a shuffle to pextrw and pinrsw
define <16 x i8> @t16(<16 x i8> %T0) nounwind readnone {
; X86-LABEL: t16:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT: movdqa %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t16:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X64-NEXT: movdqa %xmm1, %xmm0
@@ -399,19 +399,19 @@ entry:
; rdar://8520311
define <4 x i32> @t17() nounwind {
; X86-LABEL: t17:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movaps (%eax), %xmm0
; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-NEXT: pxor %xmm1, %xmm1
-; X86-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT: xorps %xmm1, %xmm1
+; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-NEXT: retl
;
; X64-LABEL: t17:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movaps (%rax), %xmm0
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-NEXT: pxor %xmm1, %xmm1
-; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: xorps %xmm1, %xmm1
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT: retq
entry:
%tmp1 = load <4 x float>, <4 x float>* undef, align 16
diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index f106f7ec5cc1..fcb6bbbdd115 100644
--- a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_blend_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_blend_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -23,12 +23,12 @@ define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_blend_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_blend_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; X64-NEXT: retq
%res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
@@ -37,12 +37,12 @@ define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_blend_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_blend_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; X64-NEXT: retq
%res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
@@ -51,7 +51,7 @@ define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_blendv_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movdqa %xmm0, %xmm3
; X32-NEXT: movaps %xmm2, %xmm0
; X32-NEXT: pblendvb %xmm0, %xmm1, %xmm3
@@ -59,7 +59,7 @@ define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
; X32-NEXT: retl
;
; X64-LABEL: test_mm_blendv_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa %xmm0, %xmm3
; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: pblendvb %xmm0, %xmm1, %xmm3
@@ -76,7 +76,7 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; X32-LABEL: test_mm_blendv_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movapd %xmm0, %xmm3
; X32-NEXT: movaps %xmm2, %xmm0
; X32-NEXT: blendvpd %xmm0, %xmm1, %xmm3
@@ -84,7 +84,7 @@ define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x
; X32-NEXT: retl
;
; X64-LABEL: test_mm_blendv_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movapd %xmm0, %xmm3
; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: blendvpd %xmm0, %xmm1, %xmm3
@@ -97,7 +97,7 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; X32-LABEL: test_mm_blendv_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps %xmm0, %xmm3
; X32-NEXT: movaps %xmm2, %xmm0
; X32-NEXT: blendvps %xmm0, %xmm1, %xmm3
@@ -105,7 +105,7 @@ define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
; X32-NEXT: retl
;
; X64-LABEL: test_mm_blendv_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps %xmm0, %xmm3
; X64-NEXT: movaps %xmm2, %xmm0
; X64-NEXT: blendvps %xmm0, %xmm1, %xmm3
@@ -118,12 +118,12 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_ceil_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundpd $2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ceil_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundpd $2, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
@@ -133,12 +133,12 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_ceil_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundps $2, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ceil_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundps $2, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
@@ -148,12 +148,12 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_ceil_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundsd $2, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ceil_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundsd $2, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
@@ -163,12 +163,12 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_ceil_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundss $2, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_ceil_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundss $2, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
@@ -178,12 +178,12 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpeq_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpeq_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqq %xmm1, %xmm0
; X64-NEXT: retq
%cmp = icmp eq <2 x i64> %a0, %a1
@@ -193,12 +193,12 @@ define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepi8_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovsxbw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi8_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovsxbw %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -210,12 +210,12 @@ define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepi8_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovsxbd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi8_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovsxbd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -227,12 +227,12 @@ define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepi8_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovsxbq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi8_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovsxbq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -243,12 +243,12 @@ define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepi16_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovsxwd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi16_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovsxwd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -260,12 +260,12 @@ define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepi16_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovsxwq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi16_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovsxwq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -276,12 +276,12 @@ define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepi32_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovsxdq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepi32_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovsxdq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -292,12 +292,12 @@ define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepu8_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepu8_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -309,12 +309,12 @@ define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepu8_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepu8_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -326,12 +326,12 @@ define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepu8_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepu8_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -342,12 +342,12 @@ define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepu16_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepu16_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -359,12 +359,12 @@ define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepu16_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepu16_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -375,12 +375,12 @@ define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_cvtepu32_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cvtepu32_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -391,12 +391,12 @@ define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_dp_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: dppd $7, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_dp_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: dppd $7, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
@@ -406,12 +406,12 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_dp_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: dpps $7, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_dp_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: dpps $7, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
@@ -421,13 +421,13 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_extract_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pextrb $1, %xmm0, %eax
; X32-NEXT: movzbl %al, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extract_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pextrb $1, %xmm0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
@@ -439,13 +439,13 @@ define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_extract_epi32:
-; X32: # BB#0:
-; X32-NEXT: pextrd $1, %xmm0, %eax
+; X32: # %bb.0:
+; X32-NEXT: extractps $1, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extract_epi32:
-; X64: # BB#0:
-; X64-NEXT: pextrd $1, %xmm0, %eax
+; X64: # %bb.0:
+; X64-NEXT: extractps $1, %xmm0, %eax
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
%ext = extractelement <4 x i32> %arg0, i32 1
@@ -454,13 +454,13 @@ define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_extract_epi64:
-; X32: # BB#0:
-; X32-NEXT: pextrd $2, %xmm0, %eax
-; X32-NEXT: pextrd $3, %xmm0, %edx
+; X32: # %bb.0:
+; X32-NEXT: extractps $2, %xmm0, %eax
+; X32-NEXT: extractps $3, %xmm0, %edx
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extract_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pextrq $1, %xmm0, %rax
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -468,16 +468,31 @@ define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
ret i64 %ext
}
-; TODO test_mm_extract_ps
+define i32 @test_mm_extract_ps(<4 x float> %a0) {
+; X32-LABEL: test_mm_extract_ps:
+; X32: # %bb.0:
+; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X32-NEXT: movd %xmm0, %eax
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_extract_ps:
+; X64: # %bb.0:
+; X64-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT: movd %xmm0, %eax
+; X64-NEXT: retq
+ %ext = extractelement <4 x float> %a0, i32 1
+ %bc = bitcast float %ext to i32
+ ret i32 %bc
+}
define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_floor_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundpd $1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_floor_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundpd $1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
@@ -486,12 +501,12 @@ define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_floor_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundps $1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_floor_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundps $1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
@@ -500,12 +515,12 @@ define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_floor_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundsd $1, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_floor_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundsd $1, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
@@ -514,12 +529,12 @@ define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_floor_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundss $1, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_floor_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundss $1, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
@@ -528,13 +543,13 @@ define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
; X32-LABEL: test_mm_insert_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pinsrb $1, %eax, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: pinsrb $1, %eax, %xmm0
; X64-NEXT: retq
@@ -546,12 +561,12 @@ define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
; X32-LABEL: test_mm_insert_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pinsrd $1, %edi, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -562,13 +577,13 @@ define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
; X32-LABEL: test_mm_insert_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pinsrd $2, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pinsrq $1, %rdi, %xmm0
; X64-NEXT: retq
%res = insertelement <2 x i64> %a0, i64 %a1,i32 1
@@ -577,12 +592,12 @@ define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_insert_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
@@ -592,12 +607,12 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_max_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaxsb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaxsb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -610,12 +625,12 @@ define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_max_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaxsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaxsd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -628,12 +643,12 @@ define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_max_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaxuw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaxuw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -646,12 +661,12 @@ define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_max_epu32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaxud %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_max_epu32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaxud %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -664,12 +679,12 @@ define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_min_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pminsb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pminsb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -682,12 +697,12 @@ define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_min_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pminsd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pminsd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -700,12 +715,12 @@ define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_min_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pminuw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pminuw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -718,12 +733,12 @@ define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_min_epu32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pminud %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_min_epu32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pminud %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -736,12 +751,12 @@ define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
; X32-LABEL: test_mm_minpos_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phminposuw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_minpos_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phminposuw %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -753,12 +768,12 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mpsadbw_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: mpsadbw $1, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mpsadbw_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: mpsadbw $1, %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -771,12 +786,12 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mul_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmuldq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mul_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmuldq %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -788,12 +803,12 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mullo_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmulld %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mullo_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -805,12 +820,12 @@ define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_packus_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: packusdw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_packus_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: packusdw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -823,12 +838,12 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_round_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundpd $4, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_round_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundpd $4, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
@@ -837,12 +852,12 @@ define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_round_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundps $4, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_round_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundps $4, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
@@ -851,12 +866,12 @@ define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: test_mm_round_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundsd $4, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_round_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundsd $4, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
@@ -865,12 +880,12 @@ define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_round_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: roundss $4, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_round_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: roundss $4, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
@@ -879,13 +894,13 @@ define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
; X32-LABEL: test_mm_stream_load_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntdqa (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_load_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntdqa (%rdi), %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64>* %a0 to i8*
@@ -896,7 +911,7 @@ declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
; X32-LABEL: test_mm_test_all_ones:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqd %xmm1, %xmm1
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
@@ -904,7 +919,7 @@ define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_test_all_ones:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
@@ -917,14 +932,14 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_test_all_zeros:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_test_all_zeros:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
@@ -936,14 +951,14 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_test_mix_ones_zeros:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_test_mix_ones_zeros:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -955,14 +970,14 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_testc_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: setb %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testc_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: setb %al
@@ -973,14 +988,14 @@ define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_testnzc_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testnzc_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -991,14 +1006,14 @@ define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_testz_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_testz_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
index 9bda90a23023..d942d4776c15 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll
@@ -6,7 +6,7 @@
define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 6) ; <<2 x double>> [#uses=1]
@@ -17,7 +17,7 @@ declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) no
define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blendps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
@@ -28,7 +28,7 @@ declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounw
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_dppd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: dppd $7, %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
@@ -39,7 +39,7 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounw
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_dpps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: dpps $7, %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
@@ -50,7 +50,7 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_insertps:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
; CHECK-NEXT: retl
%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 17) ; <<4 x float>> [#uses=1]
@@ -61,7 +61,7 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) noun
define <2 x i64> @test_x86_sse41_movntdqa(<2 x i64>* %a0) {
; CHECK-LABEL: test_x86_sse41_movntdqa:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movntdqa (%eax), %xmm0
; CHECK-NEXT: retl
@@ -74,7 +74,7 @@ declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_x86_sse41_mpsadbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: mpsadbw $7, %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
@@ -85,7 +85,7 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind re
define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblendw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
@@ -96,7 +96,7 @@ declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind re
define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovsxbd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
@@ -107,7 +107,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovsxbq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
@@ -118,7 +118,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovsxbw %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
@@ -129,7 +129,7 @@ declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovsxdq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
@@ -140,7 +140,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovsxwd %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
@@ -151,7 +151,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovsxwq %xmm0, %xmm0
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
@@ -162,7 +162,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
@@ -173,7 +173,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
@@ -184,7 +184,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbw:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
@@ -195,7 +195,7 @@ declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxdq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
@@ -206,7 +206,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwd:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
@@ -217,7 +217,7 @@ declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwq:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NEXT: retl
%res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
@@ -227,7 +227,7 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
define <16 x i8> @max_epi8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: max_epi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmaxsb %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
@@ -237,7 +237,7 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <16 x i8> @min_epi8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: min_epi8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pminsb %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
@@ -247,7 +247,7 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @max_epu16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: max_epu16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmaxuw %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
@@ -257,7 +257,7 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @min_epu16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: min_epu16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pminuw %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
@@ -267,7 +267,7 @@ declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @max_epi32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: max_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmaxsd %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
@@ -277,7 +277,7 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @min_epi32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: min_epi32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pminsd %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
@@ -287,7 +287,7 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @max_epu32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: max_epu32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pmaxud %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
@@ -297,7 +297,7 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @min_epu32(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: min_epu32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pminud %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 3abfcf4d542e..2c38904e4c7e 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -5,7 +5,7 @@
define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE41-LABEL: test_x86_sse41_blendvpd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movapd %xmm0, %xmm3 ## encoding: [0x66,0x0f,0x28,0xd8]
; SSE41-NEXT: movaps %xmm2, %xmm0 ## encoding: [0x0f,0x28,0xc2]
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x15,0xd9]
@@ -13,7 +13,7 @@ define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1,
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_blendvpd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4b,0xc1,0x20]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
@@ -24,7 +24,7 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE41-LABEL: test_x86_sse41_blendvps:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps %xmm0, %xmm3 ## encoding: [0x0f,0x28,0xd8]
; SSE41-NEXT: movaps %xmm2, %xmm0 ## encoding: [0x0f,0x28,0xc2]
; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x14,0xd9]
@@ -32,7 +32,7 @@ define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_blendvps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc1,0x20]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
@@ -43,12 +43,12 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
; SSE41-LABEL: test_x86_sse41_dppd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: dppd $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x41,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_dppd:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x41,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
@@ -59,12 +59,12 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
; SSE41-LABEL: test_x86_sse41_dpps:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: dpps $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x40,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_dpps:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x40,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
@@ -75,19 +75,19 @@ declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
; SSE41-LABEL: test_x86_sse41_insertps:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: insertps $17, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0x11]
; SSE41-NEXT: ## xmm0 = zero,xmm1[0],xmm0[2,3]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_insertps:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vinsertps $17, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11]
; AVX2-NEXT: ## xmm0 = zero,xmm1[0],xmm0[2,3]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_insertps:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vinsertps $17, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11]
; SKX-NEXT: ## xmm0 = zero,xmm1[0],xmm0[2,3]
; SKX-NEXT: retl ## encoding: [0xc3]
@@ -100,12 +100,12 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
; SSE41-LABEL: test_x86_sse41_mpsadbw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: mpsadbw $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x42,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_mpsadbw:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
@@ -116,17 +116,17 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_packusdw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: packusdw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x2b,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_packusdw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_packusdw:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
@@ -135,9 +135,35 @@ define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+define <8 x i16> @test_x86_sse41_packusdw_fold() {
+; SSE41-LABEL: test_x86_sse41_packusdw_fold:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
+; SSE41-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
+; SSE41-NEXT: ## fixup A - offset: 3, value: LCPI7_0, kind: FK_Data_4
+; SSE41-NEXT: retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse41_packusdw_fold:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,0,0,65535,65535,0,0]
+; AVX2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse41_packusdw_fold:
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovaps LCPI7_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [0,0,0,0,65535,65535,0,0]
+; SKX-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; SKX-NEXT: ## fixup A - offset: 4, value: LCPI7_0, kind: FK_Data_4
+; SKX-NEXT: retl ## encoding: [0xc3]
+ %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> zeroinitializer, <4 x i32> <i32 65535, i32 65536, i32 -1, i32 -131072>)
+ ret <8 x i16> %res
+}
+
+
define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE41-LABEL: test_x86_sse41_pblendvb:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd8]
; SSE41-NEXT: movaps %xmm2, %xmm0 ## encoding: [0x0f,0x28,0xc2]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x10,0xd9]
@@ -145,7 +171,7 @@ define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_pblendvb:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc1,0x20]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
@@ -156,12 +182,12 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
; SSE41-LABEL: test_x86_sse41_phminposuw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: phminposuw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x41,0xc0]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_phminposuw:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphminposuw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x41,0xc0]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
@@ -172,17 +198,17 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxsb:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmaxsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x3c,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pmaxsb:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3c,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pmaxsb:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3c,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -193,17 +219,17 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxsd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmaxsd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x3d,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pmaxsd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pmaxsd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3d,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -214,17 +240,17 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxud:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmaxud %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x3f,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pmaxud:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pmaxud:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -235,17 +261,17 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
; SSE41-LABEL: test_x86_sse41_pmaxuw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmaxuw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x3e,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pmaxuw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pmaxuw:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -256,17 +282,17 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE41-LABEL: test_x86_sse41_pminsb:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pminsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x38,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pminsb:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x38,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pminsb:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x38,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -277,17 +303,17 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pminsd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pminsd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x39,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pminsd:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pminsd:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x39,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -298,17 +324,17 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pminud:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pminud %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x3b,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pminud:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3b,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pminud:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3b,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -319,17 +345,17 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
; SSE41-LABEL: test_x86_sse41_pminuw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pminuw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x3a,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pminuw:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3a,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pminuw:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -340,17 +366,17 @@ declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
; SSE41-LABEL: test_x86_sse41_pmuldq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmuldq %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x28,0xc1]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse41_pmuldq:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x28,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse41_pmuldq:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
@@ -361,14 +387,14 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
; SSE41-LABEL: test_x86_sse41_ptestc:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE41-NEXT: ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE41-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_ptestc:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; VCHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
@@ -381,14 +407,14 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
; SSE41-LABEL: test_x86_sse41_ptestnzc:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE41-NEXT: ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE41-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_ptestnzc:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; VCHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -401,14 +427,14 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
; SSE41-LABEL: test_x86_sse41_ptestz:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE41-NEXT: ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
; SSE41-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse41_ptestz:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
; VCHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
@@ -421,14 +447,19 @@ declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
; SSE41-LABEL: test_x86_sse41_round_pd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x09,0xc0,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse41_round_pd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse41_round_pd:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse41_round_pd:
+; SKX: ## %bb.0:
+; SKX-NEXT: vrndscalepd $7, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x09,0xc0,0x07]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -437,14 +468,19 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
; SSE41-LABEL: test_x86_sse41_round_ps:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x08,0xc0,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse41_round_ps:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse41_round_ps:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse41_round_ps:
+; SKX: ## %bb.0:
+; SKX-NEXT: vrndscaleps $7, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x08,0xc0,0x07]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -453,14 +489,19 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE41-LABEL: test_x86_sse41_round_sd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundsd $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0b,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse41_round_sd:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse41_round_sd:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse41_round_sd:
+; SKX: ## %bb.0:
+; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x0b,0xc1,0x07]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
@@ -469,16 +510,22 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, <2 x double>* %a1) {
; SSE41-LABEL: test_x86_sse41_round_sd_load:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE41-NEXT: roundsd $7, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0b,0x00,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse41_round_sd_load:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; VCHECK-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse41_round_sd_load:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; AVX2-NEXT: vroundsd $7, (%eax), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0x00,0x07]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse41_round_sd_load:
+; SKX: ## %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SKX-NEXT: vrndscalesd $7, (%eax), %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x0b,0x00,0x07]
+; SKX-NEXT: retl ## encoding: [0xc3]
%a1b = load <2 x double>, <2 x double>* %a1
%res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1b, i32 7) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
@@ -487,14 +534,19 @@ define <2 x double> @test_x86_sse41_round_sd_load(<2 x double> %a0, <2 x double>
define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE41-LABEL: test_x86_sse41_round_ss:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundss $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0a,0xc1,0x07]
; SSE41-NEXT: retl ## encoding: [0xc3]
;
-; VCHECK-LABEL: test_x86_sse41_round_ss:
-; VCHECK: ## BB#0:
-; VCHECK-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07]
-; VCHECK-NEXT: retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse41_round_ss:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07]
+; AVX2-NEXT: retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse41_round_ss:
+; SKX: ## %bb.0:
+; SKX-NEXT: vrndscaless $7, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x0a,0xc1,0x07]
+; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/sse41-pmovxrm.ll b/test/CodeGen/X86/sse41-pmovxrm.ll
index d62053c96b74..2e65a470435c 100644
--- a/test/CodeGen/X86/sse41-pmovxrm.ll
+++ b/test/CodeGen/X86/sse41-pmovxrm.ll
@@ -5,12 +5,12 @@
define <8 x i16> @test_llvm_x86_sse41_pmovsxbw(<16 x i8>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovsxbw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovsxbw:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -21,12 +21,12 @@ define <8 x i16> @test_llvm_x86_sse41_pmovsxbw(<16 x i8>* %a) {
define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovsxbd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovsxbd:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -37,12 +37,12 @@ define <4 x i32> @test_llvm_x86_sse41_pmovsxbd(<16 x i8>* %a) {
define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovsxbq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovsxbq:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -53,12 +53,12 @@ define <2 x i64> @test_llvm_x86_sse41_pmovsxbq(<16 x i8>* %a) {
define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovsxwd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovsxwd:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -69,12 +69,12 @@ define <4 x i32> @test_llvm_x86_sse41_pmovsxwd(<8 x i16>* %a) {
define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovsxwq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovsxwq:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -85,12 +85,12 @@ define <2 x i64> @test_llvm_x86_sse41_pmovsxwq(<8 x i16>* %a) {
define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovsxdq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovsxdq:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a, align 1
@@ -101,12 +101,12 @@ define <2 x i64> @test_llvm_x86_sse41_pmovsxdq(<4 x i32>* %a) {
define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovzxbw:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovzxbw:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -117,12 +117,12 @@ define <8 x i16> @test_llvm_x86_sse41_pmovzxbw(<16 x i8>* %a) {
define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovzxbd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovzxbd:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -133,12 +133,12 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxbd(<16 x i8>* %a) {
define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovzxbq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovzxbq:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%1 = load <16 x i8>, <16 x i8>* %a, align 1
@@ -149,12 +149,12 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxbq(<16 x i8>* %a) {
define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovzxwd:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovzxwd:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -165,12 +165,12 @@ define <4 x i32> @test_llvm_x86_sse41_pmovzxwd(<8 x i16>* %a) {
define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovzxwq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovzxwq:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; AVX-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a, align 1
@@ -181,12 +181,12 @@ define <2 x i64> @test_llvm_x86_sse41_pmovzxwq(<8 x i16>* %a) {
define <2 x i64> @test_llvm_x86_sse41_pmovzxdq(<4 x i32>* %a) {
; SSE41-LABEL: test_llvm_x86_sse41_pmovzxdq:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: test_llvm_x86_sse41_pmovzxdq:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %a, align 1
diff --git a/test/CodeGen/X86/sse41-schedule.ll b/test/CodeGen/X86/sse41-schedule.ll
index ac600fed0ea0..a2073f7ffb02 100644
--- a/test/CodeGen/X86/sse41-schedule.ll
+++ b/test/CodeGen/X86/sse41-schedule.ll
@@ -1,55 +1,79 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_blendpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_blendpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_blendpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
+; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.33]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
+; SKX-NEXT: vmovapd (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1] sched: [1:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fadd <2 x double> %a1, %1
@@ -59,40 +83,58 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_blendps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
-; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
+; GENERIC-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_blendps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:1.00]
; SLM-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_blendps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
-; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
-; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
+; BROADWELL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
+; SKYLAKE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendps:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
+; SKX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
@@ -101,16 +143,16 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
; GENERIC-LABEL: test_blendvpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movapd %xmm0, %xmm3
-; GENERIC-NEXT: movaps %xmm2, %xmm0
-; GENERIC-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; GENERIC-NEXT: blendvpd %xmm0, (%rdi), %xmm3
-; GENERIC-NEXT: movapd %xmm3, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00]
+; GENERIC-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
+; GENERIC-NEXT: blendvpd %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
+; GENERIC-NEXT: movapd %xmm3, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_blendvpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00]
; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
@@ -119,28 +161,46 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_blendvpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvpd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendvpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendvpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
+; SKYLAKE-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendvpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
+; SKX-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendvpd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendvpd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
%2 = load <2 x double>, <2 x double> *%a3, align 16
%3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2)
@@ -150,16 +210,16 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
; GENERIC-LABEL: test_blendvps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movaps %xmm0, %xmm3
-; GENERIC-NEXT: movaps %xmm2, %xmm0
-; GENERIC-NEXT: blendvps %xmm0, %xmm1, %xmm3
-; GENERIC-NEXT: blendvps %xmm0, (%rdi), %xmm3
-; GENERIC-NEXT: movaps %xmm3, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
+; GENERIC-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [2:1.00]
+; GENERIC-NEXT: blendvps %xmm0, (%rdi), %xmm3 # sched: [8:1.00]
+; GENERIC-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_blendvps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
@@ -168,28 +228,46 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_blendvps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_blendvps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_blendvps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_blendvps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
+; SKYLAKE-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_blendvps:
+; SKX: # %bb.0:
+; SKX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
+; SKX-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_blendvps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendvps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
%2 = load <4 x float>, <4 x float> *%a3
%3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2)
@@ -199,40 +277,58 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x floa
define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_dppd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: dppd $7, %xmm1, %xmm0
-; GENERIC-NEXT: dppd $7, (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: dppd $7, %xmm1, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: dppd $7, (%rdi), %xmm0 # sched: [15:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_dppd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: dppd $7, %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: dppd $7, (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_dppd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dppd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dppd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; BROADWELL-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dppd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dppd:
+; SKX: # %bb.0:
+; SKX-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_dppd:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:3.00]
+; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [14:3.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_dppd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7)
@@ -242,40 +338,58 @@ declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwi
define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_dpps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: dpps $7, %xmm1, %xmm0
-; GENERIC-NEXT: dpps $7, (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: dpps $7, %xmm1, %xmm0 # sched: [12:2.00]
+; GENERIC-NEXT: dpps $7, (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_dpps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: dpps $7, %xmm1, %xmm0 # sched: [3:1.00]
; SLM-NEXT: dpps $7, (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_dpps:
-; SANDY: # BB#0:
-; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [12:2.00]
; SANDY-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_dpps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
-; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [18:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [20:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_dpps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [14:2.00]
+; BROADWELL-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [19:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_dpps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [13:1.33]
+; SKYLAKE-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [19:1.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_dpps:
+; SKX: # %bb.0:
+; SKX-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [13:1.33]
+; SKX-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [19:1.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_dpps:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [11:3.00]
+; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [16:3.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_dpps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7)
@@ -283,42 +397,122 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
+define i32 @test_extractps(<4 x float> %a0, i32 *%a1) {
+; GENERIC-LABEL: test_extractps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: extractps $3, %xmm0, %eax # sched: [3:1.00]
+; GENERIC-NEXT: extractps $1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_extractps:
+; SLM: # %bb.0:
+; SLM-NEXT: extractps $3, %xmm0, %eax # sched: [1:1.00]
+; SLM-NEXT: extractps $1, %xmm0, (%rdi) # sched: [4:2.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_extractps:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vextractps $3, %xmm0, %eax # sched: [3:1.00]
+; SANDY-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_extractps:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vextractps $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_extractps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vextractps $3, %xmm0, %eax # sched: [2:1.00]
+; BROADWELL-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_extractps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vextractps $3, %xmm0, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_extractps:
+; SKX: # %bb.0:
+; SKX-NEXT: vextractps $3, %xmm0, %eax # sched: [3:1.00]
+; SKX-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_extractps:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vextractps $3, %xmm0, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [6:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_extractps:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vextractps $3, %xmm0, %eax # sched: [2:2.00]
+; ZNVER1-NEXT: vextractps $1, %xmm0, (%rdi) # sched: [5:2.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = extractelement <4 x float> %a0, i32 3
+ %2 = extractelement <4 x float> %a0, i32 1
+ %3 = bitcast float %1 to i32
+ %4 = bitcast float %2 to i32
+ store i32 %4, i32 *%a1
+ ret i32 %3
+}
+
define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2) {
; GENERIC-LABEL: test_insertps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
-; GENERIC-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; GENERIC-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_insertps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
; SLM-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_insertps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_insertps:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_insertps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; BROADWELL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_insertps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; SKYLAKE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_insertps:
+; SKX: # %bb.0:
+; SKX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
+; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_insertps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertps:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17)
%2 = load float, float *%a2
%3 = insertelement <4 x float> %1, float %2, i32 3
@@ -328,34 +522,49 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounw
define <2 x i64> @test_movntdqa(i8* %a0) {
; GENERIC-LABEL: test_movntdqa:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movntdqa (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_movntdqa:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movntdqa (%rdi), %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movntdqa:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [4:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_movntdqa:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_movntdqa:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_movntdqa:
+; SKX: # %bb.0:
+; SKX-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_movntdqa:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntdqa:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0)
ret <2 x i64> %1
}
@@ -363,40 +572,58 @@ declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_mpsadbw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: mpsadbw $7, %xmm1, %xmm0
-; GENERIC-NEXT: mpsadbw $7, (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: mpsadbw $7, %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: mpsadbw $7, (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_mpsadbw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: mpsadbw $7, %xmm1, %xmm0 # sched: [7:1.00]
; SLM-NEXT: mpsadbw $7, (%rdi), %xmm0 # sched: [10:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_mpsadbw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mpsadbw:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
+; HASWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [13:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_mpsadbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [7:2.00]
+; BROADWELL-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_mpsadbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; SKYLAKE-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_mpsadbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
+; SKX-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_mpsadbw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mpsadbw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = bitcast <8 x i16> %1 to <16 x i8>
%3 = load <16 x i8>, <16 x i8> *%a2, align 16
@@ -407,40 +634,58 @@ declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind rea
define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_packusdw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: packusdw %xmm1, %xmm0
-; GENERIC-NEXT: packusdw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: packusdw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: packusdw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_packusdw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: packusdw %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: packusdw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_packusdw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_packusdw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_packusdw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_packusdw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_packusdw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_packusdw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_packusdw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <8 x i16> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -451,16 +696,16 @@ declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readno
define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> *%a3) {
; GENERIC-LABEL: test_pblendvb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movdqa %xmm0, %xmm3
-; GENERIC-NEXT: movaps %xmm2, %xmm0
-; GENERIC-NEXT: pblendvb %xmm0, %xmm1, %xmm3
-; GENERIC-NEXT: pblendvb %xmm0, (%rdi), %xmm3
-; GENERIC-NEXT: movdqa %xmm3, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [8:1.00]
+; GENERIC-NEXT: pblendvb %xmm0, (%rdi), %xmm3 # sched: [6:1.00]
+; GENERIC-NEXT: movdqa %xmm3, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pblendvb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.50]
; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
; SLM-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
@@ -469,28 +714,46 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pblendvb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendvb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
-; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pblendvb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pblendvb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
+; SKYLAKE-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pblendvb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
+; SKX-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pblendvb:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:2.00]
+; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pblendvb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
%2 = load <16 x i8>, <16 x i8> *%a3, align 16
%3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2)
@@ -500,40 +763,58 @@ declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) noun
define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pblendw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7]
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; GENERIC-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pblendw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
; SLM-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pblendw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
-; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pblendw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
-; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [4:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pblendw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
+; BROADWELL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pblendw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
+; SKYLAKE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pblendw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
+; SKX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pblendw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pblendw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.33]
; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
@@ -542,40 +823,60 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_pcmpeqq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpeqq %xmm1, %xmm0
-; GENERIC-NEXT: pcmpeqq (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pcmpeqq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pcmpeqq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpeqq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pcmpeqq (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpeqq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpeqq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpeqq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpeqq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpeqq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpeqq (%rdi), %xmm0, %k0 # sched: [9:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpeqq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpeqq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp eq <2 x i64> %a0, %a1
%2 = sext <2 x i1> %1 to <2 x i64>
%3 = load <2 x i64>, <2 x i64>*%a2, align 16
@@ -586,40 +887,58 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
; GENERIC-LABEL: test_pextrb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextrb $3, %xmm0, %eax
-; GENERIC-NEXT: pextrb $1, %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextrb $3, %xmm0, %eax # sched: [3:1.00]
+; GENERIC-NEXT: pextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pextrb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pextrb $3, %xmm0, %eax # sched: [1:1.00]
; SLM-NEXT: pextrb $1, %xmm0, (%rdi) # sched: [4:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pextrb:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrb:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pextrb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpextrb $3, %xmm0, %eax # sched: [2:1.00]
+; BROADWELL-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pextrb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pextrb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpextrb $3, %xmm0, %eax # sched: [3:1.00]
+; SKX-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pextrb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = extractelement <16 x i8> %a0, i32 3
%2 = extractelement <16 x i8> %a0, i32 1
store i8 %2, i8 *%a1
@@ -629,82 +948,128 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
; GENERIC-LABEL: test_pextrd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextrd $3, %xmm0, %eax
-; GENERIC-NEXT: pextrd $1, %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: paddd %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pextrd $3, %xmm0, %eax # sched: [3:1.00]
+; GENERIC-NEXT: pextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pextrd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
+; SLM-NEXT: paddd %xmm0, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pextrd $3, %xmm0, %eax # sched: [1:1.00]
; SLM-NEXT: pextrd $1, %xmm0, (%rdi) # sched: [4:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pextrd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; SANDY-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; HASWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pextrd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpextrd $3, %xmm0, %eax # sched: [2:1.00]
+; BROADWELL-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pextrd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pextrd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: vpextrd $3, %xmm0, %eax # sched: [3:1.00]
+; SKX-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pextrd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
- %1 = extractelement <4 x i32> %a0, i32 3
- %2 = extractelement <4 x i32> %a0, i32 1
- store i32 %2, i32 *%a1
- ret i32 %1
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = add <4 x i32> %a0, %a0
+ %2 = extractelement <4 x i32> %1, i32 3
+ %3 = extractelement <4 x i32> %1, i32 1
+ store i32 %3, i32 *%a1
+ ret i32 %2
}
define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
; GENERIC-LABEL: test_pextrq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextrq $1, %xmm0, %rax
-; GENERIC-NEXT: pextrq $1, %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextrq $1, %xmm0, %rax # sched: [3:1.00]
+; GENERIC-NEXT: pextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pextrq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pextrq $1, %xmm0, %rax # sched: [1:1.00]
; SLM-NEXT: pextrq $1, %xmm0, (%rdi) # sched: [4:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pextrq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrq:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pextrq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpextrq $1, %xmm0, %rax # sched: [2:1.00]
+; BROADWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pextrq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
+; SKYLAKE-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pextrq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpextrq $1, %xmm0, %rax # sched: [3:1.00]
+; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pextrq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = extractelement <2 x i64> %a0, i32 1
%2 = extractelement <2 x i64> %a0, i32 1
store i64 %2, i64 *%a2
@@ -713,40 +1078,58 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
; GENERIC-LABEL: test_pextrw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pextrw $3, %xmm0, %eax
-; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pextrw $3, %xmm0, %eax # sched: [3:1.00]
+; GENERIC-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pextrw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pextrw $3, %xmm0, %eax # sched: [4:1.00]
; SLM-NEXT: pextrw $1, %xmm0, (%rdi) # sched: [4:2.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pextrw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
; SANDY-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pextrw:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:1.00]
-; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [2:1.00]
+; HASWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pextrw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpextrw $3, %xmm0, %eax # sched: [2:1.00]
+; BROADWELL-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pextrw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
+; SKYLAKE-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pextrw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpextrw $3, %xmm0, %eax # sched: [3:1.00]
+; SKX-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pextrw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = extractelement <8 x i16> %a0, i32 3
%2 = extractelement <8 x i16> %a0, i32 1
store i16 %2, i16 *%a1
@@ -756,40 +1139,58 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
; GENERIC-LABEL: test_phminposuw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phminposuw (%rdi), %xmm0
-; GENERIC-NEXT: phminposuw %xmm0, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phminposuw (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: phminposuw %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_phminposuw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phminposuw (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: phminposuw %xmm0, %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phminposuw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; SANDY-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phminposuw:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [9:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; HASWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phminposuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphminposuw (%rdi), %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: vphminposuw %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phminposuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphminposuw (%rdi), %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phminposuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphminposuw (%rdi), %xmm0 # sched: [10:0.50]
+; SKX-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phminposuw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phminposuw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <8 x i16>, <8 x i16> *%a0, align 16
%2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1)
%3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2)
@@ -799,40 +1200,58 @@ declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
; GENERIC-LABEL: test_pinsrb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pinsrb $1, %edi, %xmm0
-; GENERIC-NEXT: pinsrb $3, (%rsi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pinsrb $1, %edi, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: pinsrb $3, (%rsi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pinsrb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pinsrb $1, %edi, %xmm0 # sched: [1:1.00]
; SLM-NEXT: pinsrb $3, (%rsi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pinsrb:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrb:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pinsrb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pinsrb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pinsrb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKX-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pinsrb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <16 x i8> %a0, i8 %a1, i32 1
%2 = load i8, i8 *%a2
%3 = insertelement <16 x i8> %1, i8 %2, i32 3
@@ -841,40 +1260,58 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: test_pinsrd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pinsrd $1, %edi, %xmm0
-; GENERIC-NEXT: pinsrd $3, (%rsi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pinsrd $1, %edi, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: pinsrd $3, (%rsi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pinsrd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pinsrd $1, %edi, %xmm0 # sched: [1:1.00]
; SLM-NEXT: pinsrd $3, (%rsi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pinsrd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pinsrd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pinsrd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pinsrd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKX-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pinsrd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <4 x i32> %a0, i32 %a1, i32 1
%2 = load i32, i32 *%a2
%3 = insertelement <4 x i32> %1, i32 %2, i32 3
@@ -883,46 +1320,67 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
; GENERIC-LABEL: test_pinsrq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pinsrq $1, %rdi, %xmm0
-; GENERIC-NEXT: pinsrq $1, (%rsi), %xmm1
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pinsrq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pinsrq $1, (%rsi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pinsrq $1, %rdi, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pinsrq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:0.50]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pinsrq:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [5:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; HASWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pinsrq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; BROADWELL-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pinsrq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKYLAKE-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pinsrq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
+; SKX-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pinsrq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = insertelement <2 x i64> %a0, i64 %a2, i32 1
%2 = load i64, i64 *%a3
%3 = insertelement <2 x i64> %a1, i64 %2, i32 1
@@ -932,40 +1390,58 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pmaxsb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaxsb %xmm1, %xmm0
-; GENERIC-NEXT: pmaxsb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pmaxsb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmaxsb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaxsb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pmaxsb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaxsb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaxsb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxsb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2)
@@ -975,40 +1451,58 @@ declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pmaxsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaxsd %xmm1, %xmm0
-; GENERIC-NEXT: pmaxsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pmaxsd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmaxsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaxsd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pmaxsd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaxsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaxsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2)
@@ -1018,40 +1512,58 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pmaxud:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaxud %xmm1, %xmm0
-; GENERIC-NEXT: pmaxud (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxud %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pmaxud (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmaxud:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaxud %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pmaxud (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaxud:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxud:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxud:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxud:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxud:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaxud:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxud:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2)
@@ -1061,40 +1573,58 @@ declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmaxuw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaxuw %xmm1, %xmm0
-; GENERIC-NEXT: pmaxuw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pmaxuw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmaxuw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaxuw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pmaxuw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaxuw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaxuw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaxuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaxuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaxuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaxuw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxuw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2)
@@ -1104,40 +1634,58 @@ declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pminsb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pminsb %xmm1, %xmm0
-; GENERIC-NEXT: pminsb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminsb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pminsb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pminsb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pminsb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pminsb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pminsb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pminsb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminsb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2)
@@ -1147,40 +1695,58 @@ declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pminsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pminsd %xmm1, %xmm0
-; GENERIC-NEXT: pminsd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminsd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pminsd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pminsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pminsd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pminsd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pminsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pminsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2)
@@ -1190,40 +1756,58 @@ declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pminud:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pminud %xmm1, %xmm0
-; GENERIC-NEXT: pminud (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminud %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pminud (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pminud:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pminud %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pminud (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pminud:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminud:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminud:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminud:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminud:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pminud:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminud:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2)
@@ -1233,40 +1817,58 @@ declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pminuw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pminuw %xmm1, %xmm0
-; GENERIC-NEXT: pminuw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pminuw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pminuw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pminuw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pminuw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pminuw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pminuw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pminuw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pminuw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pminuw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pminuw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pminuw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminuw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2)
@@ -1276,14 +1878,14 @@ declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; GENERIC-LABEL: test_pmovsxbw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovsxbw %xmm0, %xmm1
-; GENERIC-NEXT: pmovsxbw (%rdi), %xmm0
-; GENERIC-NEXT: paddw %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovsxbw %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pmovsxbw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovsxbw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovsxbw (%rdi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
@@ -1291,32 +1893,53 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovsxbw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovsxbw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxbw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = sext <8 x i8> %1 to <8 x i16>
%3 = load <8 x i8>, <8 x i8>* %a1, align 1
@@ -1327,14 +1950,14 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; GENERIC-LABEL: test_pmovsxbd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovsxbd %xmm0, %xmm1
-; GENERIC-NEXT: pmovsxbd (%rdi), %xmm0
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovsxbd %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pmovsxbd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovsxbd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovsxbd (%rdi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
@@ -1342,32 +1965,53 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovsxbd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxbd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxbd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxbd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovsxbd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxbd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = sext <4 x i8> %1 to <4 x i32>
%3 = load <4 x i8>, <4 x i8>* %a1, align 1
@@ -1378,14 +2022,14 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; GENERIC-LABEL: test_pmovsxbq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovsxbq %xmm0, %xmm1
-; GENERIC-NEXT: pmovsxbq (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovsxbq %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pmovsxbq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovsxbq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovsxbq (%rdi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -1393,32 +2037,53 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovsxbq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxbq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxbq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxbq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxbq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovsxbq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxbq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%2 = sext <2 x i8> %1 to <2 x i64>
%3 = load <2 x i8>, <2 x i8>* %a1, align 1
@@ -1429,14 +2094,14 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; GENERIC-LABEL: test_pmovsxdq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovsxdq %xmm0, %xmm1
-; GENERIC-NEXT: pmovsxdq (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovsxdq %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pmovsxdq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovsxdq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovsxdq (%rdi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -1444,32 +2109,53 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovsxdq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxdq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovsxdq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxdq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%2 = sext <2 x i32> %1 to <2 x i64>
%3 = load <2 x i32>, <2 x i32>* %a1, align 1
@@ -1480,14 +2166,14 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; GENERIC-LABEL: test_pmovsxwd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovsxwd %xmm0, %xmm1
-; GENERIC-NEXT: pmovsxwd (%rdi), %xmm0
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovsxwd %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pmovsxwd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovsxwd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovsxwd (%rdi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
@@ -1495,32 +2181,53 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovsxwd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovsxwd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxwd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = sext <4 x i16> %1 to <4 x i32>
%3 = load <4 x i16>, <4 x i16>* %a1, align 1
@@ -1531,14 +2238,14 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; GENERIC-LABEL: test_pmovsxwq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovsxwq %xmm0, %xmm1
-; GENERIC-NEXT: pmovsxwq (%rdi), %xmm0
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovsxwq %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pmovsxwq (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovsxwq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovsxwq (%rdi), %xmm1 # sched: [4:1.00]
; SLM-NEXT: pmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -1546,32 +2253,53 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovsxwq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovsxwq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [5:1.00]
+; HASWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovsxwq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovsxwq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovsxwq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovsxwq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxwq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = sext <2 x i16> %1 to <2 x i64>
%3 = load <2 x i16>, <2 x i16>* %a1, align 1
@@ -1582,14 +2310,14 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; GENERIC-LABEL: test_pmovzxbw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; GENERIC-NEXT: paddw %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
+; GENERIC-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
+; GENERIC-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovzxbw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [4:1.00]
; SLM-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50]
@@ -1597,32 +2325,53 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovzxbw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [7:0.50]
; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxbw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; BROADWELL-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; BROADWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxbw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxbw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
+; SKX-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; SKX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovzxbw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxbw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = zext <8 x i8> %1 to <8 x i16>
%3 = load <8 x i8>, <8 x i8>* %a1, align 1
@@ -1633,14 +2382,14 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; GENERIC-LABEL: test_pmovzxbd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
+; GENERIC-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovzxbd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [4:1.00]
; SLM-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
@@ -1648,32 +2397,53 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovzxbd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxbd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; BROADWELL-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxbd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxbd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
+; SKX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovzxbd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxbd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = zext <4 x i8> %1 to <4 x i32>
%3 = load <4 x i8>, <4 x i8>* %a1, align 1
@@ -1684,14 +2454,14 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; GENERIC-LABEL: test_pmovzxbq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
+; GENERIC-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovzxbq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [4:1.00]
; SLM-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -1699,32 +2469,53 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovzxbq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxbq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxbq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; BROADWELL-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxbq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxbq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
+; SKX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovzxbq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxbq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%2 = zext <2 x i8> %1 to <2 x i64>
%3 = load <2 x i8>, <2 x i8>* %a1, align 1
@@ -1735,14 +2526,14 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; GENERIC-LABEL: test_pmovzxdq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
+; GENERIC-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovzxdq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [4:1.00]
; SLM-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -1750,32 +2541,53 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovzxdq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxdq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
+; BROADWELL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
+; SKX-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovzxdq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxdq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%2 = zext <2 x i32> %1 to <2 x i64>
%3 = load <2 x i32>, <2 x i32>* %a1, align 1
@@ -1786,14 +2598,14 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; GENERIC-LABEL: test_pmovzxwd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; GENERIC-NEXT: paddd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
+; GENERIC-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
+; GENERIC-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovzxwd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [4:1.00]
; SLM-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50]
@@ -1801,32 +2613,53 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovzxwd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [7:0.50]
; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxwd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; BROADWELL-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; BROADWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxwd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxwd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
+; SKX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; SKX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovzxwd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxwd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = zext <4 x i16> %1 to <4 x i32>
%3 = load <4 x i16>, <4 x i16>* %a1, align 1
@@ -1837,14 +2670,14 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; GENERIC-LABEL: test_pmovzxwq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; GENERIC-NEXT: paddq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
+; GENERIC-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
+; GENERIC-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmovzxwq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [4:1.00]
; SLM-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50]
@@ -1852,32 +2685,53 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmovzxwq:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
-; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:0.50]
+; SANDY-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [7:0.50]
; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmovzxwq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
-; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [5:1.00]
+; HASWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmovzxwq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
+; BROADWELL-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
+; BROADWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmovzxwq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
+; SKYLAKE-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
+; SKYLAKE-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmovzxwq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
+; SKX-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
+; SKX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmovzxwq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxwq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = zext <2 x i16> %1 to <2 x i64>
%3 = load <2 x i16>, <2 x i16>* %a1, align 1
@@ -1888,40 +2742,58 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pmuldq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmuldq %xmm1, %xmm0
-; GENERIC-NEXT: pmuldq (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmuldq %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmuldq (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmuldq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmuldq %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmuldq (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmuldq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmuldq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmuldq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmuldq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmuldq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmuldq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmuldq:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <2 x i64> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@@ -1932,40 +2804,58 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_pmulld:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmulld %xmm1, %xmm0
-; GENERIC-NEXT: pmulld (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulld %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmulld (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pmulld:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmulld %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmulld (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmulld:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulld:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
-; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [16:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [10:2.00]
+; BROADWELL-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [15:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulld:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmulld:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmulld:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = mul <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = mul <4 x i32> %1, %2
@@ -1974,17 +2864,17 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_ptest:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: ptest %xmm1, %xmm0
-; GENERIC-NEXT: setb %al
-; GENERIC-NEXT: ptest (%rdi), %xmm0
-; GENERIC-NEXT: setb %cl
-; GENERIC-NEXT: andb %al, %cl
-; GENERIC-NEXT: movzbl %cl, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: ptest %xmm1, %xmm0 # sched: [2:1.00]
+; GENERIC-NEXT: setb %al # sched: [1:0.50]
+; GENERIC-NEXT: ptest (%rdi), %xmm0 # sched: [8:1.00]
+; GENERIC-NEXT: setb %cl # sched: [1:0.50]
+; GENERIC-NEXT: andb %al, %cl # sched: [1:0.33]
+; GENERIC-NEXT: movzbl %cl, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_ptest:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: ptest %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: setb %al # sched: [1:0.50]
; SLM-NEXT: ptest (%rdi), %xmm0 # sched: [4:1.00]
@@ -1994,44 +2884,74 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_ptest:
-; SANDY: # BB#0:
-; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: setb %al # sched: [1:0.33]
-; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: setb %cl # sched: [1:0.33]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; SANDY-NEXT: setb %al # sched: [1:0.50]
+; SANDY-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
+; SANDY-NEXT: setb %cl # sched: [1:0.50]
; SANDY-NEXT: andb %al, %cl # sched: [1:0.33]
; SANDY-NEXT: movzbl %cl, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_ptest:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
; HASWELL-NEXT: setb %al # sched: [1:0.50]
-; HASWELL-NEXT: vptest (%rdi), %xmm0 # sched: [2:1.00]
+; HASWELL-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
; HASWELL-NEXT: setb %cl # sched: [1:0.50]
; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25]
; HASWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ptest:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vptest %xmm1, %xmm0 # sched: [2:1.00]
+; BROADWELL-NEXT: setb %al # sched: [1:0.50]
+; BROADWELL-NEXT: vptest (%rdi), %xmm0 # sched: [7:1.00]
+; BROADWELL-NEXT: setb %cl # sched: [1:0.50]
+; BROADWELL-NEXT: andb %al, %cl # sched: [1:0.25]
+; BROADWELL-NEXT: movzbl %cl, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_ptest:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vptest %xmm1, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: setb %al # sched: [1:0.50]
+; SKYLAKE-NEXT: vptest (%rdi), %xmm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: setb %cl # sched: [1:0.50]
+; SKYLAKE-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKYLAKE-NEXT: movzbl %cl, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_ptest:
+; SKX: # %bb.0:
+; SKX-NEXT: vptest %xmm1, %xmm0 # sched: [3:1.00]
+; SKX-NEXT: setb %al # sched: [1:0.50]
+; SKX-NEXT: vptest (%rdi), %xmm0 # sched: [9:1.00]
+; SKX-NEXT: setb %cl # sched: [1:0.50]
+; SKX-NEXT: andb %al, %cl # sched: [1:0.25]
+; SKX-NEXT: movzbl %cl, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_ptest:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vptest %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: setb %al # sched: [1:0.50]
-; BTVER2-NEXT: vptest (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: setb %cl # sched: [1:0.50]
; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50]
; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ptest:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:1.00]
; ZNVER1-NEXT: setb %al # sched: [1:0.25]
-; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50]
+; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: setb %cl # sched: [1:0.25]
; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2)
@@ -2042,14 +2962,14 @@ declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
; GENERIC-LABEL: test_roundpd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: roundpd $7, %xmm0, %xmm1
-; GENERIC-NEXT: roundpd $7, (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: roundpd $7, %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: roundpd $7, (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_roundpd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: roundpd $7, (%rdi), %xmm1 # sched: [6:1.00]
; SLM-NEXT: roundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
@@ -2057,32 +2977,53 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_roundpd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundpd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:0.50]
+; HASWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [12:2.00]
; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_roundpd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [11:2.00]
+; BROADWELL-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_roundpd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [14:0.67]
+; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_roundpd:
+; SKX: # %bb.0:
+; SKX-NEXT: vrndscalepd $7, %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT: vrndscalepd $7, (%rdi), %xmm1 # sched: [14:0.67]
+; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_roundpd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundpd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00]
-; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [11:1.00]
+; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7)
@@ -2093,14 +3034,14 @@ declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readno
define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
; GENERIC-LABEL: test_roundps:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: roundps $7, %xmm0, %xmm1
-; GENERIC-NEXT: roundps $7, (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: roundps $7, %xmm0, %xmm1 # sched: [3:1.00]
+; GENERIC-NEXT: roundps $7, (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_roundps:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: roundps $7, (%rdi), %xmm1 # sched: [6:1.00]
; SLM-NEXT: roundps $7, %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
@@ -2108,32 +3049,53 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_roundps:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [7:1.00]
+; SANDY-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundps:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:0.50]
+; HASWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [12:2.00]
; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_roundps:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [11:2.00]
+; BROADWELL-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_roundps:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [8:0.67]
+; SKYLAKE-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [14:0.67]
+; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_roundps:
+; SKX: # %bb.0:
+; SKX-NEXT: vrndscaleps $7, %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT: vrndscaleps $7, (%rdi), %xmm1 # sched: [14:0.67]
+; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_roundps:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundps:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00]
-; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [11:1.00]
+; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7)
@@ -2144,48 +3106,69 @@ declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_roundsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movaps %xmm0, %xmm2
-; GENERIC-NEXT: roundsd $7, %xmm1, %xmm2
-; GENERIC-NEXT: roundsd $7, (%rdi), %xmm0
-; GENERIC-NEXT: addpd %xmm2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
+; GENERIC-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_roundsd:
-; SLM: # BB#0:
-; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
+; SLM: # %bb.0:
+; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
; SLM-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_roundsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundsd:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [6:0.50]
+; HASWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_roundsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm2 # sched: [11:2.00]
+; BROADWELL-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_roundsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
+; SKYLAKE-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_roundsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vrndscalesd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
+; SKX-NEXT: vrndscalesd $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_roundsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundsd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [4:1.00]
+; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
%2 = load <2 x double>, <2 x double>* %a2, align 16
%3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7)
@@ -2196,15 +3179,15 @@ declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) n
define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_roundss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movaps %xmm0, %xmm2
-; GENERIC-NEXT: roundss $7, %xmm1, %xmm2
-; GENERIC-NEXT: roundss $7, (%rdi), %xmm0
-; GENERIC-NEXT: addps %xmm2, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
+; GENERIC-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
+; GENERIC-NEXT: roundss $7, (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: addps %xmm2, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_roundss:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
; SLM-NEXT: roundss $7, (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
@@ -2212,32 +3195,53 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_roundss:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SANDY-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_roundss:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:2.00]
-; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [6:0.50]
+; HASWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
; HASWELL-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_roundss:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vroundss $7, (%rdi), %xmm0, %xmm2 # sched: [11:2.00]
+; BROADWELL-NEXT: vroundss $7, %xmm1, %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: vaddps %xmm2, %xmm0, %xmm0 # sched: [3:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_roundss:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
+; SKYLAKE-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKYLAKE-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_roundss:
+; SKX: # %bb.0:
+; SKX-NEXT: vrndscaless $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
+; SKX-NEXT: vrndscaless $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKX-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_roundss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundss:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
-; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [4:1.00]
+; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7)
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 4a0dc9c1eb17..0100c9142ad6 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -6,12 +6,12 @@
define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: pinsrd_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pinsrd_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pinsrd $1, %edi, %xmm0
; X64-NEXT: retq
%tmp1 = insertelement <4 x i32> %tmp, i32 %s, i32 1
@@ -20,12 +20,12 @@ define <4 x i32> @pinsrd_1(i32 %s, <4 x i32> %tmp) nounwind {
define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
; X32-LABEL: pinsrb_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pinsrb $1, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: pinsrb_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pinsrb $1, %edi, %xmm0
; X64-NEXT: retq
%tmp1 = insertelement <16 x i8> %tmp, i8 %s, i32 1
@@ -34,13 +34,13 @@ define <16 x i8> @pinsrb_1(i8 %s, <16 x i8> %tmp) nounwind {
define <2 x i64> @pmovzxbq_1() nounwind {
; X32-LABEL: pmovzxbq_1:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl L_g16$non_lazy_ptr, %eax
; X32-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: pmovzxbq_1:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movq _g16@{{.*}}(%rip), %rax
; X64-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
@@ -56,12 +56,12 @@ declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
define i32 @extractps_1(<4 x float> %v) nounwind {
; X32-LABEL: extractps_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractps_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
%s = extractelement <4 x float> %v, i32 3
@@ -70,12 +70,12 @@ define i32 @extractps_1(<4 x float> %v) nounwind {
}
define i32 @extractps_2(<4 x float> %v) nounwind {
; X32-LABEL: extractps_2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: extractps_2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
%t = bitcast <4 x float> %v to <4 x i32>
@@ -90,7 +90,7 @@ define i32 @extractps_2(<4 x float> %v) nounwind {
define float @ext_1(<4 x float> %v) nounwind {
; X32-LABEL: ext_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: addss LCPI5_0, %xmm0
@@ -100,7 +100,7 @@ define float @ext_1(<4 x float> %v) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ext_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -108,9 +108,10 @@ define float @ext_1(<4 x float> %v) nounwind {
%t = fadd float %s, 1.0
ret float %t
}
+
define float @ext_2(<4 x float> %v) nounwind {
; X32-LABEL: ext_2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-NEXT: movss %xmm0, (%esp)
@@ -119,21 +120,22 @@ define float @ext_2(<4 x float> %v) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: ext_2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X64-NEXT: retq
%s = extractelement <4 x float> %v, i32 3
ret float %s
}
+
define i32 @ext_3(<4 x i32> %v) nounwind {
; X32-LABEL: ext_3:
-; X32: ## BB#0:
-; X32-NEXT: pextrd $3, %xmm0, %eax
+; X32: ## %bb.0:
+; X32-NEXT: extractps $3, %xmm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: ext_3:
-; X64: ## BB#0:
-; X64-NEXT: pextrd $3, %xmm0, %eax
+; X64: ## %bb.0:
+; X64-NEXT: extractps $3, %xmm0, %eax
; X64-NEXT: retq
%i = extractelement <4 x i32> %v, i32 3
ret i32 %i
@@ -141,12 +143,12 @@ define i32 @ext_3(<4 x i32> %v) nounwind {
define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],zero,xmm0[3]
; X64-NEXT: retq
%tmp1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %t1, <4 x float> %t2, i32 21) nounwind readnone
@@ -159,13 +161,13 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) noun
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
@@ -177,13 +179,13 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
; generate an insertps for X32 but not for X64!
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_or_blendps:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
@@ -194,12 +196,12 @@ define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize noun
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp2 = extractelement <4 x float> %t2, i32 0
@@ -209,14 +211,14 @@ define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nou
define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: ptestz_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: sete %al
@@ -227,14 +229,14 @@ define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: setb %al
; X32-NEXT: retl
;
; X64-LABEL: ptestz_2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: setb %al
@@ -245,14 +247,14 @@ define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
; X32-LABEL: ptestz_3:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: ptest %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: ptestz_3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: ptest %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -261,7 +263,6 @@ define i32 @ptestz_3(<2 x i64> %t1, <2 x i64> %t2) nounwind {
ret i32 %tmp1
}
-
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
@@ -270,7 +271,7 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
; pointless.
define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32-LABEL: buildvector:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X32-NEXT: addss %xmm2, %xmm3
@@ -279,7 +280,7 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: buildvector:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; X64-NEXT: addss %xmm2, %xmm3
@@ -300,13 +301,13 @@ entry:
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_shufflevector_1:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_1:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
entry:
@@ -317,12 +318,12 @@ entry:
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
; X32-LABEL: insertps_from_shufflevector_2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1],xmm0[3]
; X64-NEXT: retq
entry:
@@ -334,14 +335,14 @@ entry:
; instead of insertps
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
; X32-LABEL: pinsrd_from_shufflevector_i32:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: pinsrd_from_shufflevector_i32:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = mem[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
@@ -353,13 +354,13 @@ entry:
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
; X32-LABEL: insertps_from_shufflevector_i32_2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_shufflevector_i32_2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT: retq
@@ -370,13 +371,13 @@ entry:
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X64-NEXT: retq
%1 = load float, float* %b, align 4
@@ -388,18 +389,14 @@ define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b)
; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
; X32-LABEL: insertps_from_load_ins_elt_undef_i32:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; X32-NEXT: pinsrd $2, (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_load_ins_elt_undef_i32:
-; X64: ## BB#0:
-; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
+; X64: ## %bb.0:
+; X64-NEXT: pinsrd $2, (%rdi), %xmm0
; X64-NEXT: retq
%1 = load i32, i32* %b, align 4
%2 = insertelement <4 x i32> undef, i32 %1, i32 0
@@ -410,13 +407,13 @@ define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
;;;;;; Shuffles optimizable with a single insertps or blend instruction
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYZ0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYZ0:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: retq
@@ -432,12 +429,12 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XY00:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XY00:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
@@ -451,12 +448,12 @@ define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYY0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYY0:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,1],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
@@ -470,12 +467,12 @@ define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_XYW0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X32-NEXT: retl
;
; X64-LABEL: shuf_XYW0:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,3],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
@@ -490,12 +487,12 @@ define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_W00W:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: shuf_W00W:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[3],zero,zero,xmm0[3]
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 3
@@ -508,12 +505,12 @@ define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00A:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00A:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
@@ -526,12 +523,12 @@ define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X00X:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X00X:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm0[0]
; X64-NEXT: retq
%vecext = extractelement <4 x float> %x, i32 0
@@ -544,14 +541,14 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
; X32-LABEL: shuf_X0YC:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorps %xmm2, %xmm2
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
; X32-NEXT: retl
;
; X64-LABEL: shuf_X0YC:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
@@ -566,13 +563,13 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYZ0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pxor %xmm1, %xmm1
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYZ0:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; X64-NEXT: retq
@@ -588,12 +585,12 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XY00:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XY00:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; X64-NEXT: retq
%vecext = extractelement <4 x i32> %x, i32 0
@@ -607,14 +604,14 @@ define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYY0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYY0:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
@@ -630,14 +627,14 @@ define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_XYW0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_XYW0:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,3,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
@@ -654,14 +651,14 @@ define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_W00W:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X32-NEXT: pxor %xmm0, %xmm0
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_W00W:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
@@ -676,7 +673,7 @@ define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00A:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pxor %xmm2, %xmm2
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
@@ -684,7 +681,7 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00A:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pxor %xmm2, %xmm2
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
@@ -700,14 +697,14 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X00X:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pxor %xmm1, %xmm1
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X00X:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
@@ -722,14 +719,14 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
; X32-LABEL: i32_shuf_X0YC:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; X32-NEXT: retl
;
; X64-LABEL: i32_shuf_X0YC:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,2,2]
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
@@ -745,14 +742,14 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
;; Test for a bug in the first implementation of LowerBuildVectorv4x32
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
; X32-LABEL: test_insertps_no_undef:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X32-NEXT: maxps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_insertps_no_undef:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; X64-NEXT: maxps %xmm1, %xmm0
@@ -771,7 +768,7 @@ define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-LABEL: blendvb_fallback:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: psllw $15, %xmm0
; X32-NEXT: psraw $15, %xmm0
; X32-NEXT: pblendvb %xmm0, %xmm1, %xmm2
@@ -779,7 +776,7 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; X32-NEXT: retl
;
; X64-LABEL: blendvb_fallback:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: psllw $15, %xmm0
; X64-NEXT: psraw $15, %xmm0
; X64-NEXT: pblendvb %xmm0, %xmm1, %xmm2
@@ -792,13 +789,13 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
; On X32, account for the argument's move to registers
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %pb, align 16
@@ -810,13 +807,13 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; X32-LABEL: insertps_from_vector_load_offset:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %pb, align 16
@@ -827,7 +824,7 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
;; Try to match a bit more of the instr, since we need the load's offset.
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; X32-LABEL: insertps_from_vector_load_offset_2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shll $4, %ecx
@@ -835,7 +832,7 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: shlq $4, %rsi
; X64-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT: retq
@@ -847,14 +844,14 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_loadf32:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = getelementptr inbounds float, float* %fb, i64 %index
@@ -869,13 +866,13 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; X32-LABEL: insertps_from_broadcast_loadv4f32:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b, align 4
@@ -890,7 +887,7 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; X32-LABEL: insertps_from_broadcast_multiple_use:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
@@ -904,7 +901,7 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
; X32-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
@@ -932,18 +929,18 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; X32-LABEL: insertps_with_undefs:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X32-NEXT: movapd %xmm1, %xmm0
+; X32-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_with_undefs:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: movapd %xmm1, %xmm0
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
%1 = load float, float* %b, align 4
%2 = insertelement <4 x float> undef, float %1, i32 0
@@ -955,13 +952,13 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, float* %b) {
; the destination index to change the load, instead of the source index.
define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; X32-LABEL: pr20087:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: pr20087:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],mem[0]
; X64-NEXT: retq
%load = load <4 x float> , <4 x float> *%ptr
@@ -972,7 +969,7 @@ define <4 x float> @pr20087(<4 x float> %a, <4 x float> *%ptr) {
; Edge case for insertps where we end up with a shuffle with mask=<0, 7, -1, -1>
define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32* noalias nocapture %RET) #1 {
; X32-LABEL: insertps_pr20411:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
@@ -980,7 +977,7 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
; X32-NEXT: retl
;
; X64-LABEL: insertps_pr20411:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; X64-NEXT: movdqu %xmm1, (%rdi)
@@ -993,12 +990,12 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_4:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_4:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -1012,12 +1009,12 @@ define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_5:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_5:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -1031,12 +1028,12 @@ define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_6:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_6:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 1
@@ -1049,12 +1046,12 @@ define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_7:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_7:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -1068,12 +1065,12 @@ define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_8:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_8:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -1087,13 +1084,13 @@ define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
; X32-LABEL: insertps_9:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X32-NEXT: movaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: insertps_9:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
@@ -1107,12 +1104,12 @@ define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
define <4 x float> @insertps_10(<4 x float> %A) {
; X32-LABEL: insertps_10:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X32-NEXT: retl
;
; X64-LABEL: insertps_10:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
; X64-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
@@ -1123,13 +1120,13 @@ define <4 x float> @insertps_10(<4 x float> %A) {
define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X32-NEXT: retl
;
; X64-LABEL: build_vector_to_shuffle_1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; X64-NEXT: retq
@@ -1142,13 +1139,13 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
; X32-LABEL: build_vector_to_shuffle_2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X32-NEXT: retl
;
; X64-LABEL: build_vector_to_shuffle_2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
index 0a69d2632123..cac396f8b774 100644
--- a/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel-x86_64.ll
@@ -5,7 +5,7 @@
define i64 @test_mm_crc64_u8(i64 %a0, i8 %a1) nounwind{
; X64-LABEL: test_mm_crc64_u8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: crc32b %sil, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -16,7 +16,7 @@ declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind readnone
define i64 @test_mm_crc64_u64(i64 %a0, i64 %a1) nounwind{
; X64-LABEL: test_mm_crc64_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: crc32q %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
index 383ab21bd404..f8d7f61d2069 100644
--- a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
@@ -6,7 +6,7 @@
define i32 @test_mm_cmpestra(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_cmpestra:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -18,7 +18,7 @@ define i32 @test_mm_cmpestra(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nou
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestra:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
@@ -35,7 +35,7 @@ declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_mm_cmpestrc(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_cmpestrc:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -47,7 +47,7 @@ define i32 @test_mm_cmpestrc(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nou
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestrc:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
@@ -64,7 +64,7 @@ declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_mm_cmpestri(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
; X32-LABEL: test_mm_cmpestri:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pcmpestri $7, %xmm1, %xmm0
@@ -72,7 +72,7 @@ define i32 @test_mm_cmpestri(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestri:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
; X64-NEXT: pcmpestri $7, %xmm1, %xmm0
@@ -87,14 +87,14 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define <2 x i64> @test_mm_cmpestrm(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
; X32-LABEL: test_mm_cmpestrm:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pcmpestrm $7, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestrm:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
; X64-NEXT: pcmpestrm $7, %xmm1, %xmm0
@@ -109,7 +109,7 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define i32 @test_mm_cmpestro(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_cmpestro:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -121,7 +121,7 @@ define i32 @test_mm_cmpestro(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nou
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestro:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
@@ -138,7 +138,7 @@ declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_mm_cmpestrs(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_cmpestrs:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -150,7 +150,7 @@ define i32 @test_mm_cmpestrs(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nou
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestrs:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
@@ -167,7 +167,7 @@ declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_mm_cmpestrz(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
; X32-LABEL: test_mm_cmpestrz:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -179,7 +179,7 @@ define i32 @test_mm_cmpestrz(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nou
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpestrz:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %r8d, %r8d
; X64-NEXT: movl %edi, %eax
; X64-NEXT: movl %esi, %edx
@@ -196,12 +196,12 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define <2 x i64> @test_mm_cmpgt_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpgt_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpgtq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpgt_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpgtq %xmm1, %xmm0
; X64-NEXT: retq
%cmp = icmp sgt <2 x i64> %a0, %a1
@@ -211,14 +211,14 @@ define <2 x i64> @test_mm_cmpgt_epi64(<2 x i64> %a0, <2 x i64> %a1) {
define i32 @test_mm_cmpistra(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistra:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
; X32-NEXT: seta %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistra:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
; X64-NEXT: seta %al
@@ -232,14 +232,14 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_mm_cmpistrc(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistrc:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
; X32-NEXT: setb %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistrc:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
; X64-NEXT: setb %al
@@ -253,13 +253,13 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_mm_cmpistri(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistri:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistri:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
; X64-NEXT: movl %ecx, %eax
; X64-NEXT: retq
@@ -272,12 +272,12 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define <2 x i64> @test_mm_cmpistrm(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistrm:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpistrm $7, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistrm:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpistrm $7, %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -290,14 +290,14 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define i32 @test_mm_cmpistro(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistro:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
; X32-NEXT: seto %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistro:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
; X64-NEXT: seto %al
@@ -311,14 +311,14 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_mm_cmpistrs(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistrs:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
; X32-NEXT: sets %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistrs:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
; X64-NEXT: sets %al
@@ -332,14 +332,14 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_mm_cmpistrz(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_cmpistrz:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: pcmpistri $7, %xmm1, %xmm0
; X32-NEXT: sete %al
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmpistrz:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: pcmpistri $7, %xmm1, %xmm0
; X64-NEXT: sete %al
@@ -353,14 +353,14 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) {
; X32-LABEL: test_mm_crc32_u8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: crc32b %cl, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_crc32_u8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: crc32b %sil, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -371,14 +371,14 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone
define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) {
; X32-LABEL: test_mm_crc32_u16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: crc32w %cx, %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_crc32_u16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: crc32w %si, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
@@ -389,13 +389,13 @@ declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind readnone
define i32 @test_mm_crc32_u32(i32 %a0, i32 %a1) {
; X32-LABEL: test_mm_crc32_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: crc32l {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test_mm_crc32_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: crc32l %esi, %edi
; X64-NEXT: movl %edi, %eax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
index d9e103c48111..400a78f85bc7 100644
--- a/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42
; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
; SSE42-LABEL: test_x86_sse42_pcmpestri128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
; SSE42-NEXT: pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
@@ -13,7 +13,7 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestri128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
@@ -27,7 +27,7 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
; SSE42-LABEL: test_x86_sse42_pcmpestri128_load:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE42-NEXT: movdqa (%eax), %xmm0 ## encoding: [0x66,0x0f,0x6f,0x00]
@@ -38,7 +38,7 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse42_pcmpestri128_load:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX2-NEXT: vmovdqa (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x00]
@@ -49,10 +49,10 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse42_pcmpestri128_load:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; SKX-NEXT: vmovdqu (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
+; SKX-NEXT: vmovdqa (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
; SKX-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SKX-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
; SKX-NEXT: vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07]
@@ -67,7 +67,7 @@ define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
; SSE42-LABEL: test_x86_sse42_pcmpestria128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pushl %ebx ## encoding: [0x53]
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -79,7 +79,7 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestria128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: pushl %ebx ## encoding: [0x53]
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -97,7 +97,7 @@ declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
; SSE42-LABEL: test_x86_sse42_pcmpestric128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pushl %ebx ## encoding: [0x53]
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -109,7 +109,7 @@ define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestric128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: pushl %ebx ## encoding: [0x53]
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -127,7 +127,7 @@ declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
; SSE42-LABEL: test_x86_sse42_pcmpestrio128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pushl %ebx ## encoding: [0x53]
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -139,7 +139,7 @@ define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestrio128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: pushl %ebx ## encoding: [0x53]
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -157,7 +157,7 @@ declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
; SSE42-LABEL: test_x86_sse42_pcmpestris128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pushl %ebx ## encoding: [0x53]
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -169,7 +169,7 @@ define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestris128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: pushl %ebx ## encoding: [0x53]
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -187,7 +187,7 @@ declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
; SSE42-LABEL: test_x86_sse42_pcmpestriz128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pushl %ebx ## encoding: [0x53]
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -199,7 +199,7 @@ define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestriz128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: pushl %ebx ## encoding: [0x53]
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -217,14 +217,14 @@ declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) no
define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
; SSE42-LABEL: test_x86_sse42_pcmpestrm128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
; SSE42-NEXT: pcmpestrm $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x60,0xc1,0x07]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestrm128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: vpcmpestrm $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x60,0xc1,0x07]
@@ -237,7 +237,7 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
; SSE42-LABEL: test_x86_sse42_pcmpestrm128_load:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; SSE42-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; SSE42-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -245,7 +245,7 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpestrm128_load:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; VCHECK-NEXT: movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
; VCHECK-NEXT: movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
@@ -259,13 +259,13 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistri128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
; SSE42-NEXT: movl %ecx, %eax ## encoding: [0x89,0xc8]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistri128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
; VCHECK-NEXT: movl %ecx, %eax ## encoding: [0x89,0xc8]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -277,7 +277,7 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistri128_load:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; SSE42-NEXT: movdqa (%ecx), %xmm0 ## encoding: [0x66,0x0f,0x6f,0x01]
@@ -286,7 +286,7 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse42_pcmpistri128_load:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; AVX2-NEXT: vmovdqa (%ecx), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x01]
@@ -295,10 +295,10 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse42_pcmpistri128_load:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; SKX-NEXT: vmovdqu (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
+; SKX-NEXT: vmovdqa (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x01]
; SKX-NEXT: vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07]
; SKX-NEXT: movl %ecx, %eax ## encoding: [0x89,0xc8]
; SKX-NEXT: retl ## encoding: [0xc3]
@@ -311,14 +311,14 @@ define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistria128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE42-NEXT: pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
; SSE42-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistria128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
; VCHECK-NEXT: seta %al ## encoding: [0x0f,0x97,0xc0]
@@ -331,14 +331,14 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistric128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE42-NEXT: pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
; SSE42-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistric128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
; VCHECK-NEXT: setb %al ## encoding: [0x0f,0x92,0xc0]
@@ -351,14 +351,14 @@ declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistrio128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE42-NEXT: pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
; SSE42-NEXT: seto %al ## encoding: [0x0f,0x90,0xc0]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistrio128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
; VCHECK-NEXT: seto %al ## encoding: [0x0f,0x90,0xc0]
@@ -371,14 +371,14 @@ declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistris128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE42-NEXT: pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
; SSE42-NEXT: sets %al ## encoding: [0x0f,0x98,0xc0]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistris128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
; VCHECK-NEXT: sets %al ## encoding: [0x0f,0x98,0xc0]
@@ -391,14 +391,14 @@ declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind rea
define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistriz128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; SSE42-NEXT: pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
; SSE42-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistriz128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: xorl %eax, %eax ## encoding: [0x31,0xc0]
; VCHECK-NEXT: vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
; VCHECK-NEXT: sete %al ## encoding: [0x0f,0x94,0xc0]
@@ -411,12 +411,12 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistrm128:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pcmpistrm $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x62,0xc1,0x07]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistrm128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpcmpistrm $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x62,0xc1,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
@@ -427,13 +427,13 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
; SSE42-LABEL: test_x86_sse42_pcmpistrm128_load:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE42-NEXT: pcmpistrm $7, (%eax), %xmm0 ## encoding: [0x66,0x0f,0x3a,0x62,0x00,0x07]
; SSE42-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_sse42_pcmpistrm128_load:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; VCHECK-NEXT: vpcmpistrm $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x62,0x00,0x07]
; VCHECK-NEXT: retl ## encoding: [0xc3]
@@ -444,7 +444,7 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1
define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
; CHECK-LABEL: crc32_32_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; CHECK-NEXT: crc32b {{[0-9]+}}(%esp), %eax ## encoding: [0xf2,0x0f,0x38,0xf0,0x44,0x24,0x08]
; CHECK-NEXT: retl ## encoding: [0xc3]
@@ -455,7 +455,7 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
; CHECK-LABEL: crc32_32_16:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; CHECK-NEXT: crc32w {{[0-9]+}}(%esp), %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0x44,0x24,0x08]
; CHECK-NEXT: retl ## encoding: [0xc3]
@@ -466,7 +466,7 @@ declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
; CHECK-LABEL: crc32_32_32:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; CHECK-NEXT: crc32l {{[0-9]+}}(%esp), %eax ## encoding: [0xf2,0x0f,0x38,0xf1,0x44,0x24,0x08]
; CHECK-NEXT: retl ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86_64.ll b/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
index e90aa455cfd8..bde37879fe17 100644
--- a/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
@@ -8,7 +8,7 @@ declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
; CHECK-LABEL: crc32_64_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: crc32b %sil, %edi ## encoding: [0xf2,0x40,0x0f,0x38,0xf0,0xfe]
; CHECK-NEXT: movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
; CHECK-NEXT: retq ## encoding: [0xc3]
@@ -18,7 +18,7 @@ define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: crc32_64_64:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: crc32q %rsi, %rdi ## encoding: [0xf2,0x48,0x0f,0x38,0xf1,0xfe]
; CHECK-NEXT: movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
; CHECK-NEXT: retq ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/sse42-schedule.ll b/test/CodeGen/X86/sse42-schedule.ll
index 2a502e809bca..1eaedeabed47 100644
--- a/test/CodeGen/X86/sse42-schedule.ll
+++ b/test/CodeGen/X86/sse42-schedule.ll
@@ -1,55 +1,78 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4.2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.2,+pclmul | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; GENERIC-LABEL: crc32_32_8:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: crc32b %sil, %edi
-; GENERIC-NEXT: crc32b (%rdx), %edi
-; GENERIC-NEXT: movl %edi, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; GENERIC-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: crc32_32_8:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; SLM-NEXT: crc32b (%rdx), %edi # sched: [6:1.00]
; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: crc32_32_8:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_8:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: crc32_32_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: crc32_32_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: crc32_32_8:
+; SKX: # %bb.0:
+; SKX-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SKX-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: crc32_32_8:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
-; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_32_8:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
%2 = load i8, i8 *%a2
%3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2)
@@ -59,46 +82,67 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; GENERIC-LABEL: crc32_32_16:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: crc32w %si, %edi
-; GENERIC-NEXT: crc32w (%rdx), %edi
-; GENERIC-NEXT: movl %edi, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; GENERIC-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: crc32_32_16:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: crc32w %si, %edi # sched: [3:1.00]
; SLM-NEXT: crc32w (%rdx), %edi # sched: [6:1.00]
; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: crc32_32_16:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: crc32w %si, %edi # sched: [3:1.00]
; SANDY-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_16:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: crc32w %si, %edi # sched: [3:1.00]
-; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: crc32_32_16:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: crc32_32_16:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: crc32_32_16:
+; SKX: # %bb.0:
+; SKX-NEXT: crc32w %si, %edi # sched: [3:1.00]
+; SKX-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: crc32_32_16:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: crc32w %si, %edi # sched: [3:1.00]
; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
-; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_32_16:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
%2 = load i16, i16 *%a2
%3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2)
@@ -108,46 +152,67 @@ declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; GENERIC-LABEL: crc32_32_32:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: crc32l %esi, %edi
-; GENERIC-NEXT: crc32l (%rdx), %edi
-; GENERIC-NEXT: movl %edi, %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; GENERIC-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
+; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: crc32_32_32:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; SLM-NEXT: crc32l (%rdx), %edi # sched: [6:1.00]
; SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: crc32_32_32:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; SANDY-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
; SANDY-NEXT: movl %edi, %eax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_32_32:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00]
-; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
; HASWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: crc32_32_32:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
+; BROADWELL-NEXT: movl %edi, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: crc32_32_32:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
+; SKYLAKE-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: crc32_32_32:
+; SKX: # %bb.0:
+; SKX-NEXT: crc32l %esi, %edi # sched: [3:1.00]
+; SKX-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
+; SKX-NEXT: movl %edi, %eax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: crc32_32_32:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
-; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
+; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_32_32:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
%2 = load i32, i32 *%a2
%3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2)
@@ -157,46 +222,67 @@ declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; GENERIC-LABEL: crc32_64_8:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: crc32b %sil, %edi
-; GENERIC-NEXT: crc32b (%rdx), %edi
-; GENERIC-NEXT: movq %rdi, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; GENERIC-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: crc32_64_8:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; SLM-NEXT: crc32b (%rdx), %edi # sched: [6:1.00]
; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: crc32_64_8:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; SANDY-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; SANDY-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_8:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
-; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [7:1.00]
+; HASWELL-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: crc32_64_8:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; BROADWELL-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: crc32_64_8:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SKYLAKE-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: crc32_64_8:
+; SKX: # %bb.0:
+; SKX-NEXT: crc32b %sil, %edi # sched: [3:1.00]
+; SKX-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: crc32_64_8:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
-; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_64_8:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1)
%2 = load i8, i8 *%a2
%3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2)
@@ -206,46 +292,67 @@ declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; GENERIC-LABEL: crc32_64_64:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: crc32q %rsi, %rdi
-; GENERIC-NEXT: crc32q (%rdx), %rdi
-; GENERIC-NEXT: movq %rdi, %rax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; GENERIC-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
+; GENERIC-NEXT: movq %rdi, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: crc32_64_64:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; SLM-NEXT: crc32q (%rdx), %rdi # sched: [6:1.00]
; SLM-NEXT: movq %rdi, %rax # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: crc32_64_64:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
-; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
+; SANDY-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
; SANDY-NEXT: movq %rdi, %rax # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: crc32_64_64:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
-; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [7:1.00]
+; HASWELL-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
; HASWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: crc32_64_64:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; BROADWELL-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
+; BROADWELL-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: crc32_64_64:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; SKYLAKE-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
+; SKYLAKE-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: crc32_64_64:
+; SKX: # %bb.0:
+; SKX-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
+; SKX-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
+; SKX-NEXT: movq %rdi, %rax # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: crc32_64_64:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
-; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
+; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_64_64:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00]
; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
%2 = load i64, i64 *%a2
%3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2)
@@ -255,20 +362,20 @@ declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpestri:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movl $7, %eax
-; GENERIC-NEXT: movl $7, %edx
-; GENERIC-NEXT: pcmpestri $7, %xmm1, %xmm0
-; GENERIC-NEXT: movl %ecx, %esi
-; GENERIC-NEXT: movl $7, %eax
-; GENERIC-NEXT: movl $7, %edx
-; GENERIC-NEXT: pcmpestri $7, (%rdi), %xmm0
-; GENERIC-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; GENERIC-NEXT: leal (%rcx,%rsi), %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movl $7, %edx # sched: [1:0.33]
+; GENERIC-NEXT: pcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
+; GENERIC-NEXT: movl %ecx, %esi # sched: [1:0.33]
+; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movl $7, %edx # sched: [1:0.33]
+; GENERIC-NEXT: pcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
+; GENERIC-NEXT: # kill: def %ecx killed %ecx def %rcx
+; GENERIC-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pcmpestri:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movl $7, %eax # sched: [1:0.50]
; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
; SLM-NEXT: pcmpestri $7, %xmm1, %xmm0 # sched: [21:21.00]
@@ -276,12 +383,12 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
; SLM-NEXT: movl %ecx, %esi # sched: [1:0.50]
; SLM-NEXT: pcmpestri $7, (%rdi), %xmm0 # sched: [21:21.00]
-; SLM-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SLM-NEXT: # kill: def %ecx killed %ecx def %rcx
; SLM-NEXT: leal (%rcx,%rsi), %eax # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpestri:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
@@ -289,48 +396,87 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
-; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SANDY-NEXT: # kill: def %ecx killed %ecx def %rcx
; SANDY-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestri:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
; HASWELL-NEXT: movl %ecx, %esi # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [11:3.00]
-; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; HASWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [24:4.00]
+; HASWELL-NEXT: # kill: def %ecx killed %ecx def %rcx
; HASWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpestri:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; BROADWELL-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
+; BROADWELL-NEXT: movl %ecx, %esi # sched: [1:0.25]
+; BROADWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; BROADWELL-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [23:4.00]
+; BROADWELL-NEXT: # kill: def %ecx killed %ecx def %rcx
+; BROADWELL-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpestri:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKYLAKE-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
+; SKYLAKE-NEXT: movl %ecx, %esi # sched: [1:0.25]
+; SKYLAKE-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKYLAKE-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [24:4.00]
+; SKYLAKE-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SKYLAKE-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpestri:
+; SKX: # %bb.0:
+; SKX-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKX-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKX-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [18:4.00]
+; SKX-NEXT: movl %ecx, %esi # sched: [1:0.25]
+; SKX-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKX-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKX-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [24:4.00]
+; SKX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SKX-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpestri:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
-; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
-; BTVER2-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [13:2.50]
-; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
-; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
-; BTVER2-NEXT: movl %ecx, %esi # sched: [1:0.17]
-; BTVER2-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [18:2.50]
-; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.50]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [14:10.00]
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.50]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.50]
+; BTVER2-NEXT: movl %ecx, %esi # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [19:10.00]
+; BTVER2-NEXT: # kill: def %ecx killed %ecx def %rcx
; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpestri:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
-; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:?]
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25]
-; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: # kill: def %ecx killed %ecx def %rcx
; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7)
@@ -341,17 +487,17 @@ declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nou
define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpestrm:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movl $7, %eax
-; GENERIC-NEXT: movl $7, %edx
-; GENERIC-NEXT: pcmpestrm $7, %xmm1, %xmm0
-; GENERIC-NEXT: movl $7, %eax
-; GENERIC-NEXT: movl $7, %edx
-; GENERIC-NEXT: pcmpestrm $7, (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movl $7, %edx # sched: [1:0.33]
+; GENERIC-NEXT: pcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
+; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33]
+; GENERIC-NEXT: movl $7, %edx # sched: [1:0.33]
+; GENERIC-NEXT: pcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pcmpestrm:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: movl $7, %eax # sched: [1:0.50]
; SLM-NEXT: movl $7, %edx # sched: [1:0.50]
; SLM-NEXT: pcmpestrm $7, %xmm1, %xmm0 # sched: [17:17.00]
@@ -361,44 +507,74 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpestrm:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
; SANDY-NEXT: movl $7, %eax # sched: [1:0.33]
; SANDY-NEXT: movl $7, %edx # sched: [1:0.33]
; SANDY-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpestrm:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [10:4.00]
+; HASWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
; HASWELL-NEXT: movl $7, %eax # sched: [1:0.25]
; HASWELL-NEXT: movl $7, %edx # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpestrm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; BROADWELL-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
+; BROADWELL-NEXT: movl $7, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: movl $7, %edx # sched: [1:0.25]
+; BROADWELL-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [24:4.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpestrm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKYLAKE-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
+; SKYLAKE-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKYLAKE-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpestrm:
+; SKX: # %bb.0:
+; SKX-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKX-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKX-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
+; SKX-NEXT: movl $7, %eax # sched: [1:0.25]
+; SKX-NEXT: movl $7, %edx # sched: [1:0.25]
+; SKX-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpestrm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
-; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
-; BTVER2-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [13:2.50]
-; BTVER2-NEXT: movl $7, %eax # sched: [1:0.17]
-; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
-; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.50]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [14:10.00]
+; BTVER2-NEXT: movl $7, %eax # sched: [1:0.50]
+; BTVER2-NEXT: movl $7, %edx # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [19:10.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpestrm:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
-; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:?]
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
-; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7)
@@ -408,58 +584,85 @@ declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i
define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpistri:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpistri $7, %xmm1, %xmm0
-; GENERIC-NEXT: movl %ecx, %eax
-; GENERIC-NEXT: pcmpistri $7, (%rdi), %xmm0
-; GENERIC-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; GENERIC-NEXT: leal (%rcx,%rax), %eax
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; GENERIC-NEXT: movl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: pcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; GENERIC-NEXT: # kill: def %ecx killed %ecx def %rcx
+; GENERIC-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pcmpistri:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpistri $7, %xmm1, %xmm0 # sched: [17:17.00]
; SLM-NEXT: movl %ecx, %eax # sched: [1:0.50]
; SLM-NEXT: pcmpistri $7, (%rdi), %xmm0 # sched: [17:17.00]
-; SLM-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SLM-NEXT: # kill: def %ecx killed %ecx def %rcx
; SLM-NEXT: leal (%rcx,%rax), %eax # sched: [1:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpistri:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [3:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; SANDY-NEXT: movl %ecx, %eax # sched: [1:0.33]
-; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [3:1.00]
-; SANDY-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; SANDY-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; SANDY-NEXT: # kill: def %ecx killed %ecx def %rcx
; SANDY-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistri:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; HASWELL-NEXT: movl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:3.00]
-; HASWELL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; HASWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
+; HASWELL-NEXT: # kill: def %ecx killed %ecx def %rcx
; HASWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpistri:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
+; BROADWELL-NEXT: movl %ecx, %eax # sched: [1:0.25]
+; BROADWELL-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [16:3.00]
+; BROADWELL-NEXT: # kill: def %ecx killed %ecx def %rcx
+; BROADWELL-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpistri:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [10:3.00]
+; SKYLAKE-NEXT: movl %ecx, %eax # sched: [1:0.25]
+; SKYLAKE-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [16:3.00]
+; SKYLAKE-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SKYLAKE-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpistri:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [10:3.00]
+; SKX-NEXT: movl %ecx, %eax # sched: [1:0.25]
+; SKX-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [16:3.00]
+; SKX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SKX-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpistri:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [6:1.00]
-; BTVER2-NEXT: movl %ecx, %eax # sched: [1:0.17]
-; BTVER2-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [11:1.00]
-; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [7:2.00]
+; BTVER2-NEXT: movl %ecx, %eax # sched: [1:0.50]
+; BTVER2-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [12:2.00]
+; BTVER2-NEXT: # kill: def %ecx killed %ecx def %rcx
; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpistri:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:?]
; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: # kill: def %ecx killed %ecx def %rcx
; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7)
@@ -470,40 +673,58 @@ declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind read
define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpistrm:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpistrm $7, %xmm1, %xmm0
-; GENERIC-NEXT: pcmpistrm $7, (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; GENERIC-NEXT: pcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pcmpistrm:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpistrm $7, %xmm1, %xmm0 # sched: [13:13.00]
; SLM-NEXT: pcmpistrm $7, (%rdi), %xmm0 # sched: [13:13.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpistrm:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [11:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; SANDY-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpistrm:
-; HASWELL: # BB#0:
-; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; HASWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpistrm:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
+; BROADWELL-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpistrm:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
+; SKYLAKE-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpistrm:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
+; SKX-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpistrm:
-; BTVER2: # BB#0:
-; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00]
-; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00]
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [8:2.00]
+; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [13:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpistrm:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7)
@@ -513,40 +734,60 @@ declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwin
define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; GENERIC-LABEL: test_pcmpgtq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pcmpgtq %xmm1, %xmm0
-; GENERIC-NEXT: pcmpgtq (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pcmpgtq %xmm1, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: pcmpgtq (%rdi), %xmm0 # sched: [11:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; SLM-LABEL: test_pcmpgtq:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pcmpgtq %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pcmpgtq (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pcmpgtq:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pcmpgtq:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pcmpgtq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pcmpgtq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SKYLAKE-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pcmpgtq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpcmpgtq %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: vpcmpgtq (%rdi), %xmm0, %k0 # sched: [9:1.00]
+; SKX-NEXT: vpmovm2q %k0, %xmm0 # sched: [1:0.25]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pcmpgtq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpgtq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = icmp sgt <2 x i64> %a0, %a1
%2 = sext <2 x i1> %1 to <2 x i64>
%3 = load <2 x i64>, <2 x i64>*%a2, align 16
@@ -554,3 +795,64 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
%5 = sext <2 x i1> %4 to <2 x i64>
ret <2 x i64> %5
}
+
+define <2 x i64> @test_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_pclmulqdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pclmulqdq $0, %xmm1, %xmm0 # sched: [14:6.00]
+; GENERIC-NEXT: pclmulqdq $0, (%rdi), %xmm0 # sched: [14:5.67]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; SLM-LABEL: test_pclmulqdq:
+; SLM: # %bb.0:
+; SLM-NEXT: pclmulqdq $0, %xmm1, %xmm0 # sched: [10:10.00]
+; SLM-NEXT: pclmulqdq $0, (%rdi), %xmm0 # sched: [10:10.00]
+; SLM-NEXT: retq # sched: [4:1.00]
+;
+; SANDY-LABEL: test_pclmulqdq:
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [14:6.00]
+; SANDY-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [14:5.67]
+; SANDY-NEXT: retq # sched: [1:1.00]
+;
+; HASWELL-LABEL: test_pclmulqdq:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [11:2.00]
+; HASWELL-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [17:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pclmulqdq:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pclmulqdq:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
+; SKYLAKE-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pclmulqdq:
+; SKX: # %bb.0:
+; SKX-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
+;
+; BTVER2-LABEL: test_pclmulqdq:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; BTVER2-NEXT: retq # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_pclmulqdq:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
+ %1 = load <2 x i64>, <2 x i64> *%a2, align 16
+ %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
+ %3 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %2, i8 0)
+ ret <2 x i64> %3
+}
+declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8)
diff --git a/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
index f45abf1d85df..51d056f2049d 100644
--- a/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
@@ -8,12 +8,12 @@
define <2 x i64> @test_mm_extracti_si64(<2 x i64> %x) {
; X32-LABEL: test_mm_extracti_si64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: extrq $2, $3, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extracti_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: extrq $2, $3, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
@@ -23,12 +23,12 @@ declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind readnone
define <2 x i64> @test_mm_extract_si64(<2 x i64> %x, <2 x i64> %y) {
; X32-LABEL: test_mm_extract_si64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: extrq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_extract_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: extrq %xmm1, %xmm0
; X64-NEXT: retq
%bc = bitcast <2 x i64> %y to <16 x i8>
@@ -39,12 +39,12 @@ declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_inserti_si64(<2 x i64> %x, <2 x i64> %y) {
; X32-LABEL: test_mm_inserti_si64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: insertq $6, $5, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_inserti_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertq $6, $5, %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
@@ -54,12 +54,12 @@ declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwin
define <2 x i64> @test_mm_insert_si64(<2 x i64> %x, <2 x i64> %y) {
; X32-LABEL: test_mm_insert_si64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: insertq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_insert_si64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertq %xmm1, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y)
@@ -69,13 +69,13 @@ declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>) nounwind readnon
define void @test_stream_sd(double* %p, <2 x double> %a) {
; X32-LABEL: test_stream_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_stream_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntsd %xmm0, (%rdi)
; X64-NEXT: retq
%1 = extractelement <2 x double> %a, i64 0
@@ -85,13 +85,13 @@ define void @test_stream_sd(double* %p, <2 x double> %a) {
define void @test_mm_stream_ss(float* %p, <4 x float> %a) {
; X32-LABEL: test_mm_stream_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_stream_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntss %xmm0, (%rdi)
; X64-NEXT: retq
%1 = extractelement <4 x float> %a, i64 0
diff --git a/test/CodeGen/X86/sse4a-schedule.ll b/test/CodeGen/X86/sse4a-schedule.ll
index 9ad6b0dfd4d6..d2b90f7f8a7d 100644
--- a/test/CodeGen/X86/sse4a-schedule.ll
+++ b/test/CodeGen/X86/sse4a-schedule.ll
@@ -1,23 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
; GENERIC-LABEL: test_extrq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: extrq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: extrq %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_extrq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_extrq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
ret <2 x i64> %1
}
@@ -25,19 +25,19 @@ declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>)
define <2 x i64> @test_extrqi(<2 x i64> %a0) {
; GENERIC-LABEL: test_extrqi:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: extrq $2, $3, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: extrq $2, $3, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_extrqi:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_extrqi:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [2:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
ret <2 x i64> %1
}
@@ -45,19 +45,19 @@ declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8)
define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
; GENERIC-LABEL: test_insertq:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: insertq %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: insertq %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_insertq:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertq:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %1
}
@@ -65,19 +65,19 @@ declare <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64>, <2 x i64>)
define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
; GENERIC-LABEL: test_insertqi:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_insertqi:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertqi:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [4:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
ret <2 x i64> %1
}
@@ -85,19 +85,19 @@ declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8)
define void @test_movntsd(i8* %p, <2 x double> %a) {
; GENERIC-LABEL: test_movntsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movntsd %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_movntsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntsd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
ret void
}
@@ -105,19 +105,19 @@ declare void @llvm.x86.sse4a.movnt.sd(i8*, <2 x double>)
define void @test_movntss(i8* %p, <4 x float> %a) {
; GENERIC-LABEL: test_movntss:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: movntss %xmm0, (%rdi)
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_movntss:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntss:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [8:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
ret void
}
diff --git a/test/CodeGen/X86/sse4a-upgrade.ll b/test/CodeGen/X86/sse4a-upgrade.ll
index a129c658f4b9..04cb11758cae 100644
--- a/test/CodeGen/X86/sse4a-upgrade.ll
+++ b/test/CodeGen/X86/sse4a-upgrade.ll
@@ -6,13 +6,13 @@
define void @test_movntss(i8* %p, <4 x float> %a) nounwind optsize ssp {
; X32-LABEL: test_movntss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntss %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_movntss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntss %xmm0, (%rdi)
; X64-NEXT: retq
tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind
@@ -23,13 +23,13 @@ declare void @llvm.x86.sse4a.movnt.ss(i8*, <4 x float>)
define void @test_movntsd(i8* %p, <2 x double> %a) nounwind optsize ssp {
; X32-LABEL: test_movntsd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movntsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_movntsd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movntsd %xmm0, (%rdi)
; X64-NEXT: retq
tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a) nounwind
diff --git a/test/CodeGen/X86/sse4a.ll b/test/CodeGen/X86/sse4a.ll
index ad04e257dc95..612e3b7de9c3 100644
--- a/test/CodeGen/X86/sse4a.ll
+++ b/test/CodeGen/X86/sse4a.ll
@@ -6,12 +6,12 @@
define <2 x i64> @test_extrqi(<2 x i64> %x) nounwind uwtable ssp {
; X32-LABEL: test_extrqi:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: extrq $2, $3, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_extrqi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: extrq $2, $3, %xmm0
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %x, i8 3, i8 2)
@@ -20,27 +20,27 @@ define <2 x i64> @test_extrqi(<2 x i64> %x) nounwind uwtable ssp {
define <2 x i64> @test_extrqi_domain(<2 x i64> *%p) nounwind uwtable ssp {
; X32-SSE-LABEL: test_extrqi_domain:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movdqa (%eax), %xmm0
; X32-SSE-NEXT: extrq $2, $3, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: test_extrqi_domain:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovdqa (%eax), %xmm0
; X32-AVX-NEXT: extrq $2, $3, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_extrqi_domain:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa (%rdi), %xmm0
; X64-SSE-NEXT: extrq $2, $3, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_extrqi_domain:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX-NEXT: extrq $2, $3, %xmm0
; X64-AVX-NEXT: retq
@@ -53,12 +53,12 @@ declare <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64>, i8, i8) nounwind
define <2 x i64> @test_extrq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; X32-LABEL: test_extrq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: extrq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_extrq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: extrq %xmm1, %xmm0
; X64-NEXT: retq
%1 = bitcast <2 x i64> %y to <16 x i8>
@@ -68,7 +68,7 @@ define <2 x i64> @test_extrq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
define <2 x i64> @test_extrq_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwtable ssp {
; X32-SSE-LABEL: test_extrq_domain:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movdqa (%eax), %xmm1
; X32-SSE-NEXT: extrq %xmm0, %xmm1
@@ -76,7 +76,7 @@ define <2 x i64> @test_extrq_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwtabl
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: test_extrq_domain:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovdqa (%eax), %xmm1
; X32-AVX-NEXT: extrq %xmm0, %xmm1
@@ -84,14 +84,14 @@ define <2 x i64> @test_extrq_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwtabl
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_extrq_domain:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa (%rdi), %xmm1
; X64-SSE-NEXT: extrq %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_extrq_domain:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqa (%rdi), %xmm1
; X64-AVX-NEXT: extrq %xmm0, %xmm1
; X64-AVX-NEXT: vmovdqa %xmm1, %xmm0
@@ -106,12 +106,12 @@ declare <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64>, <16 x i8>) nounwind
define <2 x i64> @test_insertqi(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; X32-LABEL: test_insertqi:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: insertq $6, $5, %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_insertqi:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertq $6, $5, %xmm1, %xmm0
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %x, <2 x i64> %y, i8 5, i8 6)
@@ -120,7 +120,7 @@ define <2 x i64> @test_insertqi(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp
define <2 x i64> @test_insertqi_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwtable ssp {
; X32-SSE-LABEL: test_insertqi_domain:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movdqa (%eax), %xmm1
; X32-SSE-NEXT: insertq $6, $5, %xmm0, %xmm1
@@ -128,7 +128,7 @@ define <2 x i64> @test_insertqi_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwt
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: test_insertqi_domain:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovdqa (%eax), %xmm1
; X32-AVX-NEXT: insertq $6, $5, %xmm0, %xmm1
@@ -136,14 +136,14 @@ define <2 x i64> @test_insertqi_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwt
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_insertqi_domain:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa (%rdi), %xmm1
; X64-SSE-NEXT: insertq $6, $5, %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_insertqi_domain:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqa (%rdi), %xmm1
; X64-AVX-NEXT: insertq $6, $5, %xmm0, %xmm1
; X64-AVX-NEXT: vmovdqa %xmm1, %xmm0
@@ -157,12 +157,12 @@ declare <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64>, <2 x i64>, i8, i8) nounwin
define <2 x i64> @test_insertq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp {
; X32-LABEL: test_insertq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: insertq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_insertq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertq %xmm1, %xmm0
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %x, <2 x i64> %y) nounwind
@@ -171,7 +171,7 @@ define <2 x i64> @test_insertq(<2 x i64> %x, <2 x i64> %y) nounwind uwtable ssp
define <2 x i64> @test_insertq_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwtable ssp {
; X32-SSE-LABEL: test_insertq_domain:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movdqa (%eax), %xmm1
; X32-SSE-NEXT: insertq %xmm0, %xmm1
@@ -179,7 +179,7 @@ define <2 x i64> @test_insertq_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwta
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: test_insertq_domain:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovdqa (%eax), %xmm1
; X32-AVX-NEXT: insertq %xmm0, %xmm1
@@ -187,14 +187,14 @@ define <2 x i64> @test_insertq_domain(<2 x i64> *%p, <2 x i64> %y) nounwind uwta
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: test_insertq_domain:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movdqa (%rdi), %xmm1
; X64-SSE-NEXT: insertq %xmm0, %xmm1
; X64-SSE-NEXT: movdqa %xmm1, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: test_insertq_domain:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovdqa (%rdi), %xmm1
; X64-AVX-NEXT: insertq %xmm0, %xmm1
; X64-AVX-NEXT: vmovdqa %xmm1, %xmm0
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
index 8dfb8ee70076..f1007cc9951b 100644
--- a/test/CodeGen/X86/sse_partial_update.ll
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -10,7 +10,7 @@
define void @rsqrtss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: rsqrtss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: rsqrtss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -32,7 +32,7 @@ declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
define void @rcpss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: rcpss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: rcpss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -53,7 +53,7 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define void @sqrtss(<4 x float> %a) nounwind uwtable ssp {
; CHECK-LABEL: sqrtss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: sqrtss %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd %xmm0, %xmm2
; CHECK-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -74,7 +74,7 @@ declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
; CHECK-LABEL: sqrtsd:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: sqrtsd %xmm0, %xmm0
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -97,7 +97,7 @@ declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
; CHECK-LABEL: load_fold_cvtss2sd_int:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: retq
@@ -108,7 +108,7 @@ define <2 x double> @load_fold_cvtss2sd_int(<4 x float> *%a) {
define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_optsize:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: retq
@@ -119,7 +119,7 @@ define <2 x double> @load_fold_cvtss2sd_int_optsize(<4 x float> *%a) optsize {
define <2 x double> @load_fold_cvtss2sd_int_minsize(<4 x float> *%a) minsize {
; CHECK-LABEL: load_fold_cvtss2sd_int_minsize:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtss2sd (%rdi), %xmm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll b/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
index 163dc0bc9a0c..74c5924b6005 100644
--- a/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
@@ -6,64 +6,70 @@
define <2 x i64> @test_mm_abs_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_abs_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pabsb %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_abs_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pabsb %xmm0, %xmm0
; X64-NEXT: retq
%arg = bitcast <2 x i64> %a0 to <16 x i8>
- %call = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %arg)
- %res = bitcast <16 x i8> %call to <2 x i64>
+ %sub = sub <16 x i8> zeroinitializer, %arg
+ %cmp = icmp sgt <16 x i8> %arg, zeroinitializer
+ %sel = select <16 x i1> %cmp, <16 x i8> %arg, <16 x i8> %sub
+ %res = bitcast <16 x i8> %sel to <2 x i64>
ret <2 x i64> %res
}
declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_abs_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_abs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pabsw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_abs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pabsw %xmm0, %xmm0
; X64-NEXT: retq
%arg = bitcast <2 x i64> %a0 to <8 x i16>
- %call = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %arg)
- %res = bitcast <8 x i16> %call to <2 x i64>
+ %sub = sub <8 x i16> zeroinitializer, %arg
+ %cmp = icmp sgt <8 x i16> %arg, zeroinitializer
+ %sel = select <8 x i1> %cmp, <8 x i16> %arg, <8 x i16> %sub
+ %res = bitcast <8 x i16> %sel to <2 x i64>
ret <2 x i64> %res
}
declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_abs_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_abs_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pabsd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_abs_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pabsd %xmm0, %xmm0
; X64-NEXT: retq
%arg = bitcast <2 x i64> %a0 to <4 x i32>
- %call = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %arg)
- %res = bitcast <4 x i32> %call to <2 x i64>
+ %sub = sub <4 x i32> zeroinitializer, %arg
+ %cmp = icmp sgt <4 x i32> %arg, zeroinitializer
+ %sel = select <4 x i1> %cmp, <4 x i32> %arg, <4 x i32> %sub
+ %res = bitcast <4 x i32> %sel to <2 x i64>
ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_alignr_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_alignr_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
@@ -76,13 +82,13 @@ define <2 x i64> @test_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test2_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test2_mm_alignr_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test2_mm_alignr_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; X64-NEXT: movdqa %xmm1, %xmm0
; X64-NEXT: retq
@@ -95,12 +101,12 @@ define <2 x i64> @test2_mm_alignr_epi8(<2 x i64> %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_hadd_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hadd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phaddw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hadd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phaddw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -113,12 +119,12 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <2 x i64> @test_mm_hadd_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hadd_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phaddd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hadd_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phaddd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -131,12 +137,12 @@ declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <2 x i64> @test_mm_hadds_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hadds_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phaddsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hadds_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phaddsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -149,12 +155,12 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <2 x i64> @test_mm_hsub_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hsub_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phsubw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsub_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phsubw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -167,12 +173,12 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <2 x i64> @test_mm_hsub_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hsub_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phsubd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsub_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phsubd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -185,12 +191,12 @@ declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <2 x i64> @test_mm_hsubs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_hsubs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: phsubsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsubs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: phsubsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -203,12 +209,12 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <2 x i64> @test_mm_maddubs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_maddubs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmaddubsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maddubs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmaddubsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -221,12 +227,12 @@ declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind
define <2 x i64> @test_mm_mulhrs_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_mulhrs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pmulhrsw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_mulhrs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmulhrsw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -239,12 +245,12 @@ declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind
define <2 x i64> @test_mm_shuffle_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_shuffle_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pshufb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shuffle_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pshufb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -257,12 +263,12 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <2 x i64> @test_mm_sign_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sign_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psignb %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sign_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psignb %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -275,12 +281,12 @@ declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <2 x i64> @test_mm_sign_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sign_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psignw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sign_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psignw %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -293,12 +299,12 @@ declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <2 x i64> @test_mm_sign_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sign_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psignd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sign_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psignd %xmm1, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
diff --git a/test/CodeGen/X86/ssse3-intrinsics-x86.ll b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
index 4f49385fec7f..66265d63a975 100644
--- a/test/CodeGen/X86/ssse3-intrinsics-x86.ll
+++ b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
@@ -5,17 +5,17 @@
define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) {
; SSE-LABEL: test_x86_ssse3_pabs_b_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pabsb %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x1c,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pabs_b_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpabsb %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pabs_b_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
@@ -26,17 +26,17 @@ declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) {
; SSE-LABEL: test_x86_ssse3_pabs_d_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pabsd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x1e,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pabs_d_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpabsd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pabs_d_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
@@ -47,17 +47,17 @@ declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) {
; SSE-LABEL: test_x86_ssse3_pabs_w_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pabsw %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x1d,0xc0]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pabs_w_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpabsw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pabs_w_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
@@ -68,12 +68,12 @@ declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_ssse3_phadd_d_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: phaddd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x02,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_phadd_d_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x02,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -84,12 +84,12 @@ declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_ssse3_phadd_sw_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: phaddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x03,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_phadd_sw_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x03,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -100,12 +100,12 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_ssse3_phadd_w_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: phaddw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x01,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_phadd_w_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x01,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -116,12 +116,12 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_ssse3_phsub_d_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: phsubd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x06,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_phsub_d_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x06,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -132,12 +132,12 @@ declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_ssse3_phsub_sw_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: phsubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x07,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_phsub_sw_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x07,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -148,12 +148,12 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_ssse3_phsub_w_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: phsubw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x05,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_phsub_w_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vphsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x05,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -164,17 +164,17 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmaddubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x04,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x04,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
@@ -186,7 +186,7 @@ declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind
; Make sure we don't commute this operation.
define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128_load_op0(<16 x i8>* %ptr, <16 x i8> %a1) {
; SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; SSE-NEXT: movdqa (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x08]
; SSE-NEXT: pmaddubsw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x38,0x04,0xc8]
@@ -194,16 +194,16 @@ define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128_load_op0(<16 x i8>* %ptr, <16 x
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX2-NEXT: vmovdqa (%eax), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x08]
; AVX2-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0x04,0xc0]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; SKX-NEXT: vmovdqu (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x08]
+; SKX-NEXT: vmovdqa (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x08]
; SKX-NEXT: vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0]
; SKX-NEXT: retl ## encoding: [0xc3]
%a0 = load <16 x i8>, <16 x i8>* %ptr
@@ -214,17 +214,17 @@ define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128_load_op0(<16 x i8>* %ptr, <16 x
define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pmulhrsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x0b,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
@@ -235,17 +235,17 @@ declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind
define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_ssse3_pshuf_b_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: pshufb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x00,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_ssse3_pshuf_b_128:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_ssse3_pshuf_b_128:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -256,12 +256,12 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_ssse3_psign_b_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psignb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x08,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_psign_b_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpsignb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x08,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
@@ -272,12 +272,12 @@ declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_x86_ssse3_psign_d_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psignd %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x0a,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_psign_d_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpsignd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0a,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
@@ -288,12 +288,12 @@ declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_ssse3_psign_w_128:
-; SSE: ## BB#0:
+; SSE: ## %bb.0:
; SSE-NEXT: psignw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x09,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; VCHECK-LABEL: test_x86_ssse3_psign_w_128:
-; VCHECK: ## BB#0:
+; VCHECK: ## %bb.0:
; VCHECK-NEXT: vpsignw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x09,0xc1]
; VCHECK-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
diff --git a/test/CodeGen/X86/ssse3-schedule.ll b/test/CodeGen/X86/ssse3-schedule.ll
index fb3530667ce7..1ea703face2c 100644
--- a/test/CodeGen/X86/ssse3-schedule.ll
+++ b/test/CodeGen/X86/ssse3-schedule.ll
@@ -1,64 +1,87 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; GENERIC-LABEL: test_pabsb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pabsb %xmm0, %xmm1
-; GENERIC-NEXT: pabsb (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pabsb %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pabsb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pabsb:
-; ATOM: # BB#0:
-; ATOM-NEXT: pabsb (%rdi), %xmm1
-; ATOM-NEXT: pabsb %xmm0, %xmm0
-; ATOM-NEXT: por %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pabsb (%rdi), %xmm1 # sched: [1:1.00]
+; ATOM-NEXT: pabsb %xmm0, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: por %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pabsb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pabsb %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: pabsb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pabsb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpabsb (%rdi), %xmm1 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
+; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pabsb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpabsb (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pabsb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
%2 = load <16 x i8>, <16 x i8> *%a1, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2)
@@ -69,54 +92,75 @@ declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; GENERIC-LABEL: test_pabsd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pabsd %xmm0, %xmm1
-; GENERIC-NEXT: pabsd (%rdi), %xmm0
-; GENERIC-NEXT: por %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pabsd %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pabsd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pabsd:
-; ATOM: # BB#0:
-; ATOM-NEXT: pabsd (%rdi), %xmm1
-; ATOM-NEXT: pabsd %xmm0, %xmm0
-; ATOM-NEXT: por %xmm0, %xmm1
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pabsd (%rdi), %xmm1 # sched: [1:1.00]
+; ATOM-NEXT: pabsd %xmm0, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: por %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pabsd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pabsd %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: pabsd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pabsd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; SANDY-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [5:0.50]
+; HASWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpabsd (%rdi), %xmm1 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
+; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pabsd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpabsd (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pabsd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2)
@@ -127,100 +171,148 @@ declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; GENERIC-LABEL: test_pabsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pabsw %xmm0, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pabsw %xmm0, %xmm1 # sched: [1:0.50]
+; GENERIC-NEXT: pabsw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pabsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pabsw %xmm0, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pabsw (%rdi), %xmm1 # sched: [1:1.00]
+; ATOM-NEXT: pabsw %xmm0, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: por %xmm0, %xmm1 # sched: [1:0.50]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pabsw:
-; SLM: # BB#0:
-; SLM-NEXT: pabsw %xmm0, %xmm0 # sched: [1:0.50]
+; SLM: # %bb.0:
+; SLM-NEXT: pabsw %xmm0, %xmm1 # sched: [1:0.50]
+; SLM-NEXT: pabsw (%rdi), %xmm0 # sched: [4:1.00]
+; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pabsw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pabsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pabsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpabsw (%rdi), %xmm1 # sched: [6:0.50]
+; BROADWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pabsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; SKYLAKE-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pabsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pabsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: vpabsw (%rdi), %xmm1 # sched: [6:1.00]
; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pabsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vpabsw (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2)
%4 = or <8 x i16> %1, %3
- ret <8 x i16> %1
+ ret <8 x i16> %4
}
declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_palignr:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
-; GENERIC-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; GENERIC-NEXT: movdqa %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
+; GENERIC-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; GENERIC-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_palignr:
-; ATOM: # BB#0:
-; ATOM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
-; ATOM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; ATOM-NEXT: movdqa %xmm1, %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
+; ATOM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [1:1.00]
+; ATOM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_palignr:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
; SLM-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [4:1.00]
; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_palignr:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
-; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_palignr:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
-; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_palignr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
+; BROADWELL-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_palignr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
+; SKYLAKE-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_palignr:
+; SKX: # %bb.0:
+; SKX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
+; SKX-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_palignr:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_palignr:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25]
; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -229,46 +321,64 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_phaddd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phaddd %xmm1, %xmm0
-; GENERIC-NEXT: phaddd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phaddd %xmm1, %xmm0 # sched: [3:1.50]
+; GENERIC-NEXT: phaddd (%rdi), %xmm0 # sched: [9:1.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_phaddd:
-; ATOM: # BB#0:
-; ATOM-NEXT: phaddd %xmm1, %xmm0
-; ATOM-NEXT: phaddd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: phaddd %xmm1, %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: phaddd (%rdi), %xmm0 # sched: [4:2.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_phaddd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phaddd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: phaddd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phaddd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddd:
+; SKX: # %bb.0:
+; SKX-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKX-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phaddd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phaddd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -278,54 +388,64 @@ declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_phaddsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phaddsw %xmm1, %xmm0
-; GENERIC-NEXT: phaddsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phaddsw %xmm1, %xmm0 # sched: [3:1.50]
+; GENERIC-NEXT: phaddsw (%rdi), %xmm0 # sched: [9:1.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_phaddsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: phaddsw %xmm1, %xmm0
-; ATOM-NEXT: phaddsw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: phaddsw %xmm1, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: phaddsw (%rdi), %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_phaddsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phaddsw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: phaddsw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phaddsw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKX-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phaddsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phaddsw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -335,46 +455,64 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_phaddw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phaddw %xmm1, %xmm0
-; GENERIC-NEXT: phaddw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phaddw %xmm1, %xmm0 # sched: [3:1.50]
+; GENERIC-NEXT: phaddw (%rdi), %xmm0 # sched: [9:1.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_phaddw:
-; ATOM: # BB#0:
-; ATOM-NEXT: phaddw %xmm1, %xmm0
-; ATOM-NEXT: phaddw (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: phaddw %xmm1, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: phaddw (%rdi), %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_phaddw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phaddw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: phaddw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phaddw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phaddw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phaddw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phaddw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phaddw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKX-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phaddw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phaddw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -384,46 +522,64 @@ declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_phsubd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phsubd %xmm1, %xmm0
-; GENERIC-NEXT: phsubd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phsubd %xmm1, %xmm0 # sched: [3:1.50]
+; GENERIC-NEXT: phsubd (%rdi), %xmm0 # sched: [9:1.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_phsubd:
-; ATOM: # BB#0:
-; ATOM-NEXT: phsubd %xmm1, %xmm0
-; ATOM-NEXT: phsubd (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: phsubd %xmm1, %xmm0 # sched: [3:1.50]
+; ATOM-NEXT: phsubd (%rdi), %xmm0 # sched: [4:2.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_phsubd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phsubd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: phsubd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phsubd:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubd:
+; SKX: # %bb.0:
+; SKX-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKX-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phsubd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phsubd:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -433,54 +589,64 @@ declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_phsubsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phsubsw %xmm1, %xmm0
-; GENERIC-NEXT: phsubsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phsubsw %xmm1, %xmm0 # sched: [3:1.50]
+; GENERIC-NEXT: phsubsw (%rdi), %xmm0 # sched: [9:1.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_phsubsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: phsubsw %xmm1, %xmm0
-; ATOM-NEXT: phsubsw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: phsubsw %xmm1, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: phsubsw (%rdi), %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_phsubsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phsubsw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: phsubsw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phsubsw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKX-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phsubsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phsubsw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2)
@@ -490,46 +656,64 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind re
define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_phsubw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: phsubw %xmm1, %xmm0
-; GENERIC-NEXT: phsubw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: phsubw %xmm1, %xmm0 # sched: [3:1.50]
+; GENERIC-NEXT: phsubw (%rdi), %xmm0 # sched: [9:1.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_phsubw:
-; ATOM: # BB#0:
-; ATOM-NEXT: phsubw %xmm1, %xmm0
-; ATOM-NEXT: phsubw (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: phsubw %xmm1, %xmm0 # sched: [7:3.50]
+; ATOM-NEXT: phsubw (%rdi), %xmm0 # sched: [8:4.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_phsubw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: phsubw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: phsubw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_phsubw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
+; SANDY-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_phsubw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_phsubw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; BROADWELL-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_phsubw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKYLAKE-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_phsubw:
+; SKX: # %bb.0:
+; SKX-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
+; SKX-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_phsubw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phsubw:
-; ZNVER1: # BB#0:
-; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
-; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [100:?]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2)
@@ -539,46 +723,64 @@ declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind rea
define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pmaddubsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmaddubsw %xmm1, %xmm0
-; GENERIC-NEXT: pmaddubsw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmaddubsw (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmaddubsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmaddubsw %xmm1, %xmm0
-; ATOM-NEXT: pmaddubsw (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: pmaddubsw (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmaddubsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [4:1.00]
; SLM-NEXT: pmaddubsw (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmaddubsw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; SANDY-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmaddubsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmaddubsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmaddubsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmaddubsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmaddubsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaddubsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = bitcast <8 x i16> %1 to <16 x i8>
@@ -589,88 +791,131 @@ declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind
define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pmulhrsw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pmulhrsw %xmm1, %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: pmulhrsw (%rdi), %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pmulhrsw:
-; ATOM: # BB#0:
-; ATOM-NEXT: pmulhrsw %xmm1, %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: pmulhrsw (%rdi), %xmm0 # sched: [5:5.00]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pmulhrsw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [4:1.00]
+; SLM-NEXT: pmulhrsw (%rdi), %xmm0 # sched: [7:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pmulhrsw:
-; SANDY: # BB#0:
-; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY: # %bb.0:
+; SANDY-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; SANDY-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pmulhrsw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pmulhrsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; BROADWELL-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pmulhrsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKYLAKE-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pmulhrsw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pmulhrsw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
+; BTVER2-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmulhrsw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2)
- ret <8 x i16> %1
+ ret <8 x i16> %3
}
declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pshufb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: pshufb %xmm1, %xmm0
-; GENERIC-NEXT: pshufb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: pshufb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: pshufb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_pshufb:
-; ATOM: # BB#0:
-; ATOM-NEXT: pshufb %xmm1, %xmm0
-; ATOM-NEXT: pshufb (%rdi), %xmm0
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: pshufb %xmm1, %xmm0 # sched: [4:2.00]
+; ATOM-NEXT: pshufb (%rdi), %xmm0 # sched: [5:2.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_pshufb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: pshufb %xmm1, %xmm0 # sched: [1:1.00]
; SLM-NEXT: pshufb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_pshufb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_pshufb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_pshufb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; BROADWELL-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_pshufb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKYLAKE-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_pshufb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; SKX-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_pshufb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pshufb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2)
@@ -680,54 +925,68 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_psignb:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psignb %xmm1, %xmm0
-; GENERIC-NEXT: psignb (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psignb (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psignb:
-; ATOM: # BB#0:
-; ATOM-NEXT: psignb %xmm1, %xmm0
-; ATOM-NEXT: psignb (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psignb (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psignb:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psignb (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psignb:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignb:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignb:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignb:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignb:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psignb:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psignb:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2)
@@ -737,54 +996,68 @@ declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind rea
define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; GENERIC-LABEL: test_psignd:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psignd %xmm1, %xmm0
-; GENERIC-NEXT: psignd (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psignd (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psignd:
-; ATOM: # BB#0:
-; ATOM-NEXT: psignd %xmm1, %xmm0
-; ATOM-NEXT: psignd (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psignd (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psignd:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psignd (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psignd:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignd:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignd:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psignd:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psignd:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2)
@@ -794,54 +1067,68 @@ declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind rea
define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_psignw:
-; GENERIC: # BB#0:
-; GENERIC-NEXT: psignw %xmm1, %xmm0
-; GENERIC-NEXT: psignw (%rdi), %xmm0
-; GENERIC-NEXT: retq
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: psignw (%rdi), %xmm0 # sched: [7:0.50]
+; GENERIC-NEXT: retq # sched: [1:1.00]
;
; ATOM-LABEL: test_psignw:
-; ATOM: # BB#0:
-; ATOM-NEXT: psignw %xmm1, %xmm0
-; ATOM-NEXT: psignw (%rdi), %xmm0
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: nop
-; ATOM-NEXT: retq
+; ATOM: # %bb.0:
+; ATOM-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50]
+; ATOM-NEXT: psignw (%rdi), %xmm0 # sched: [1:1.00]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: nop # sched: [1:0.50]
+; ATOM-NEXT: retq # sched: [79:39.50]
;
; SLM-LABEL: test_psignw:
-; SLM: # BB#0:
+; SLM: # %bb.0:
; SLM-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: psignw (%rdi), %xmm0 # sched: [4:1.00]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_psignw:
-; SANDY: # BB#0:
+; SANDY: # %bb.0:
; SANDY-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; SANDY-NEXT: retq # sched: [5:1.00]
+; SANDY-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SANDY-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_psignw:
-; HASWELL: # BB#0:
+; HASWELL: # %bb.0:
; HASWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [5:0.50]
-; HASWELL-NEXT: retq # sched: [1:1.00]
+; HASWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; HASWELL-NEXT: retq # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_psignw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; BROADWELL-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:0.50]
+; BROADWELL-NEXT: retq # sched: [7:1.00]
+;
+; SKYLAKE-LABEL: test_psignw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKYLAKE-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKYLAKE-NEXT: retq # sched: [7:1.00]
+;
+; SKX-LABEL: test_psignw:
+; SKX: # %bb.0:
+; SKX-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT: retq # sched: [7:1.00]
;
; BTVER2-LABEL: test_psignw:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psignw:
-; ZNVER1: # BB#0:
+; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
-; ZNVER1-NEXT: retq # sched: [5:0.50]
+; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2)
diff --git a/test/CodeGen/X86/stack-folding-bmi.ll b/test/CodeGen/X86/stack-folding-bmi.ll
index cabc88432be4..0bc6ef8f9bab 100644
--- a/test/CodeGen/X86/stack-folding-bmi.ll
+++ b/test/CodeGen/X86/stack-folding-bmi.ll
@@ -28,7 +28,7 @@ define i64 @stack_fold_andn_u64(i64 %a0, i64 %a1) {
define i32 @stack_fold_bextr_u32(i32 %a0, i32 %a1) {
;CHECK-LABEL: stack_fold_bextr_u32
- ;CHECK: # BB#0:
+ ;CHECK: # %bb.0:
;CHECK: bextrl %eax, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1)
@@ -38,7 +38,7 @@ declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
define i64 @stack_fold_bextr_u64(i64 %a0, i64 %a1) {
;CHECK-LABEL: stack_fold_bextr_u64
- ;CHECK: # BB#0:
+ ;CHECK: # %bb.0:
;CHECK: bextrq %rax, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1)
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512.ll b/test/CodeGen/X86/stack-folding-fp-avx512.ll
index 4d5e8c99f464..7bd46029f0eb 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512.ll
@@ -184,6 +184,30 @@ define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {
ret <4 x float> %5
}
+define <8 x double> @stack_fold_cvtdq2pd(<8 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = sitofp <8 x i32> %a0 to <8 x double>
+ ret <8 x double> %2
+}
+
+define <8 x double> @stack_fold_cvtudq2pd(<8 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtudq2pd
+ ;CHECK: vcvtudq2pd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = uitofp <8 x i32> %a0 to <8 x double>
+ ret <8 x double> %2
+}
+
+define <8 x float> @stack_fold_cvtpd2ps(<8 x double> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtpd2ps
+ ;CHECK: vcvtpd2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fptrunc <8 x double> %a0 to <8 x float>
+ ret <8 x float> %2
+}
+
define <4 x float> @stack_fold_insertps(<4 x float> %a0, <4 x float> %a1) {
;CHECK-LABEL: stack_fold_insertps
;CHECK: vinsertps $17, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -458,7 +482,7 @@ define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0, <16 x float> %a1)
define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0, <8 x double> %a1) {
;CHECK-LABEL: stack_fold_extractf64x2
- ;CHECK: vextractf64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+ ;CHECK: vextractf32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
%1 = shufflevector <8 x double> %a0, <8 x double> %a1, <2 x i32> <i32 6, i32 7>
%2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <2 x double> %1
@@ -466,7 +490,7 @@ define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0, <8 x double> %a1)
define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_extracti32x8
- ;CHECK: vextractf32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+ ;CHECK: vextractf64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
%1 = shufflevector <16 x float> %a0, <16 x float> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x float> %1
@@ -482,7 +506,7 @@ define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0, <8 x double> %a1)
define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_insertf32x8
- ;CHECK: vinsertf32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vinsertf64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %2
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index 292829a01cb3..717e942fff17 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -216,6 +216,56 @@ define <8 x float> @stack_fold_divps_ymm(<8 x float> %a0, <8 x float> %a1) {
ret <8 x float> %2
}
+define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = sitofp <2 x i32> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = sitofp <4 x i32> %a0 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <2 x double> @stack_fold_cvtudq2pd(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtudq2pd
+ ;CHECK: vcvtudq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ %3 = uitofp <2 x i32> %2 to <2 x double>
+ ret <2 x double> %3
+}
+
+define <4 x double> @stack_fold_cvtudq2pd_ymm(<4 x i32> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtudq2pd_ymm
+ ;CHECK: vcvtudq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = uitofp <4 x i32> %a0 to <4 x double>
+ ret <4 x double> %2
+}
+
+define <2 x float> @stack_fold_cvtpd2ps(<2 x double> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtpd2ps
+ ;CHECK: vcvtpd2psx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fptrunc <2 x double> %a0 to <2 x float>
+ ret <2 x float> %2
+}
+
+define <4 x float> @stack_fold_cvtpd2ps_ymm(<4 x double> %a0) {
+ ;CHECK-LABEL: stack_fold_cvtpd2ps_ymm
+ ;CHECK: vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = fptrunc <4 x double> %a0 to <4 x float>
+ ret <4 x float> %2
+}
+
define <2 x double> @stack_fold_maxpd(<2 x double> %a0, <2 x double> %a1) #0 {
;CHECK-LABEL: stack_fold_maxpd
;CHECK: vmaxpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
diff --git a/test/CodeGen/X86/stack-folding-int-avx1.ll b/test/CodeGen/X86/stack-folding-int-avx1.ll
index c2f1053c6142..5e98cb2b8594 100644
--- a/test/CodeGen/X86/stack-folding-int-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx1.ll
@@ -275,19 +275,27 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
- ret <16 x i8> %2
+ %2 = zext <16 x i8> %a0 to <16 x i16>
+ %3 = zext <16 x i8> %a1 to <16 x i16>
+ %4 = add <16 x i16> %2, %3
+ %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <16 x i16> %6 to <16 x i8>
+ ret <16 x i8> %7
}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
- ret <8 x i16> %2
+ %2 = zext <8 x i16> %a0 to <8 x i32>
+ %3 = zext <8 x i16> %a1 to <8 x i32>
+ %4 = add <8 x i32> %2, %3
+ %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <8 x i32> %6 to <8 x i16>
+ ret <8 x i16> %7
}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
;CHECK-LABEL: stack_fold_pblendvb
@@ -763,7 +771,9 @@ define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x i32> %2
+ ; add forces execution domain
+ %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %3
}
define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
diff --git a/test/CodeGen/X86/stack-folding-int-avx2.ll b/test/CodeGen/X86/stack-folding-int-avx2.ll
index ef7fa2217145..19ad3bf5a25f 100644
--- a/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -17,7 +17,6 @@ define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
%3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0>
ret <4 x double> %3
}
-declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_broadcastss
@@ -28,7 +27,6 @@ define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
%3 = fadd <4 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0>
ret <4 x float> %3
}
-declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_broadcastss_ymm
@@ -39,7 +37,6 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
%3 = fadd <8 x float> %2, <float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0, float 0x0>
ret <8 x float> %3
}
-declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
define <4 x i32> @stack_fold_extracti128(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_extracti128
@@ -234,19 +231,27 @@ define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1)
- ret <32 x i8> %2
+ %2 = zext <32 x i8> %a0 to <32 x i16>
+ %3 = zext <32 x i8> %a1 to <32 x i16>
+ %4 = add <32 x i16> %2, %3
+ %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <32 x i16> %6 to <32 x i8>
+ ret <32 x i8> %7
}
-declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1)
- ret <16 x i16> %2
+ %2 = zext <16 x i16> %a0 to <16 x i32>
+ %3 = zext <16 x i16> %a1 to <16 x i32>
+ %4 = add <16 x i32> %2, %3
+ %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <16 x i32> %6 to <16 x i16>
+ ret <16 x i16> %7
}
-declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_pblendd
@@ -445,7 +450,9 @@ define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
- ret <8 x i32> %2
+ ; add forces execution domain
+ %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %3
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
@@ -854,7 +861,9 @@ define <8 x i32> @stack_fold_pshufd(<8 x i32> %a0) {
;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x i32> %2
+ ; add forces execution domain
+ %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %3
}
define <16 x i16> @stack_fold_vpshufhw(<16 x i16> %a0) {
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 362e656b4f22..6bde51286dc9 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -70,52 +70,88 @@ define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
- ret <64 x i8> %2
+ %2 = zext <64 x i8> %a0 to <64 x i16>
+ %3 = zext <64 x i8> %a1 to <64 x i16>
+ %4 = add <64 x i16> %2, %3
+ %5 = add <64 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <64 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <64 x i16> %6 to <64 x i8>
+ ret <64 x i8> %7
}
-declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
define <64 x i8> @stack_fold_pavgb_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
;CHECK-LABEL: stack_fold_pavgb_mask
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <64 x i8>, <64 x i8>* %passthru
- %3 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %2, i64 %mask)
- ret <64 x i8> %3
+ %3 = zext <64 x i8> %a0 to <64 x i16>
+ %4 = zext <64 x i8> %a1 to <64 x i16>
+ %5 = add <64 x i16> %3, %4
+ %6 = add <64 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = lshr <64 x i16> %6, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %8 = trunc <64 x i16> %7 to <64 x i8>
+ %9 = bitcast i64 %mask to <64 x i1>
+ %10 = select <64 x i1> %9, <64 x i8> %8, <64 x i8> %2
+ ret <64 x i8> %10
}
define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
;CHECK-LABEL: stack_fold_pavgb_maskz
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> zeroinitializer, i64 %mask)
- ret <64 x i8> %2
+ %2 = zext <64 x i8> %a0 to <64 x i16>
+ %3 = zext <64 x i8> %a1 to <64 x i16>
+ %4 = add <64 x i16> %2, %3
+ %5 = add <64 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <64 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <64 x i16> %6 to <64 x i8>
+ %8 = bitcast i64 %mask to <64 x i1>
+ %9 = select <64 x i1> %8, <64 x i8> %7, <64 x i8> zeroinitializer
+ ret <64 x i8> %9
}
define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
- ret <32 x i16> %2
+ %2 = zext <32 x i16> %a0 to <32 x i32>
+ %3 = zext <32 x i16> %a1 to <32 x i32>
+ %4 = add <32 x i32> %2, %3
+ %5 = add <32 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <32 x i32> %6 to <32 x i16>
+ ret <32 x i16> %7
}
-declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readnone
define <32 x i16> @stack_fold_pavgw_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
;CHECK-LABEL: stack_fold_pavgw_mask
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x i16>, <32 x i16>* %passthru
- %3 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %2, i32 %mask)
- ret <32 x i16> %3
+ %3 = zext <32 x i16> %a0 to <32 x i32>
+ %4 = zext <32 x i16> %a1 to <32 x i32>
+ %5 = add <32 x i32> %3, %4
+ %6 = add <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %8 = trunc <32 x i32> %7 to <32 x i16>
+ %9 = bitcast i32 %mask to <32 x i1>
+ %10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %2
+ ret <32 x i16> %10
}
define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
;CHECK-LABEL: stack_fold_pavgw_maskz
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> zeroinitializer, i32 %mask)
- ret <32 x i16> %2
+ %2 = zext <32 x i16> %a0 to <32 x i32>
+ %3 = zext <32 x i16> %a1 to <32 x i32>
+ %4 = add <32 x i32> %2, %3
+ %5 = add <32 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <32 x i32> %6 to <32 x i16>
+ %8 = bitcast i32 %mask to <32 x i1>
+ %9 = select <32 x i1> %8, <32 x i16> %7, <32 x i16> zeroinitializer
+ ret <32 x i16> %9
}
define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
@@ -130,7 +166,7 @@ define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
;CHECK-LABEL: stack_fold_extracti64x2
- ;CHECK: vextracti64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+ ;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
; add forces execution domain
%1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
@@ -140,7 +176,7 @@ define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
;CHECK-LABEL: stack_fold_extracti32x8
- ;CHECK: vextracti32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+ ;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
; add forces execution domain
%1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -160,7 +196,7 @@ define <4 x i64> @stack_fold_extracti64x4(<8 x i64> %a0, <8 x i64> %a1) {
define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_inserti32x8
- ;CHECK: vinserti32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; add forces execution domain
@@ -595,7 +631,9 @@ define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0, <16 x i32> undef, i16 -1)
- ret <16 x i32> %2
+ ; add forces execution domain
+ %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i32> %3
}
declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly
@@ -1216,7 +1254,8 @@ define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
- ret <16 x i32> %2
+ %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i32> %3
}
define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 26e97ea4e599..a55288fee513 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -49,37 +49,53 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
- ret <16 x i8> %2
+ %2 = zext <16 x i8> %a0 to <16 x i16>
+ %3 = zext <16 x i8> %a1 to <16 x i16>
+ %4 = add <16 x i16> %2, %3
+ %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <16 x i16> %6 to <16 x i8>
+ ret <16 x i8> %7
}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb_ymm
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1)
- ret <32 x i8> %2
+ %2 = zext <32 x i8> %a0 to <32 x i16>
+ %3 = zext <32 x i8> %a1 to <32 x i16>
+ %4 = add <32 x i16> %2, %3
+ %5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <32 x i16> %6 to <32 x i8>
+ ret <32 x i8> %7
}
-declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
- ret <8 x i16> %2
+ %2 = zext <8 x i16> %a0 to <8 x i32>
+ %3 = zext <8 x i16> %a1 to <8 x i32>
+ %4 = add <8 x i32> %2, %3
+ %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <8 x i32> %6 to <8 x i16>
+ ret <8 x i16> %7
}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw_ymm
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1)
- ret <16 x i16> %2
+ %2 = zext <16 x i16> %a0 to <16 x i32>
+ %3 = zext <16 x i16> %a1 to <16 x i32>
+ %4 = add <16 x i32> %2, %3
+ %5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <16 x i32> %6 to <16 x i16>
+ ret <16 x i16> %7
}
-declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_vpconflictd
@@ -564,7 +580,9 @@ define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK: vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
- ret <8 x i32> %2
+ ; add forces execution domain
+ %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %3
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
@@ -1677,7 +1695,8 @@ define <4 x i32> @stack_fold_pshufd(<4 x i32> %a0) {
;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
- ret <4 x i32> %2
+ %3 = add <4 x i32> %2, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %3
}
define <4 x i32> @stack_fold_pshufd_mask(<4 x i32> %passthru, <4 x i32> %a0, i8 %mask) {
@@ -1707,7 +1726,8 @@ define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) {
;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x i32> %2
+ %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %3
}
define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) {
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index 3ca94b7b9467..136077e2917f 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -302,19 +302,27 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: pavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
- ret <16 x i8> %2
+ %2 = zext <16 x i8> %a0 to <16 x i16>
+ %3 = zext <16 x i8> %a1 to <16 x i16>
+ %4 = add <16 x i16> %2, %3
+ %5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %7 = trunc <16 x i16> %6 to <16 x i8>
+ ret <16 x i8> %7
}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
- ret <8 x i16> %2
+ %2 = zext <8 x i16> %a0 to <8 x i32>
+ %3 = zext <8 x i16> %a1 to <8 x i32>
+ %4 = add <8 x i32> %2, %3
+ %5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %7 = trunc <8 x i32> %6 to <8 x i16>
+ ret <8 x i16> %7
}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
;CHECK-LABEL: stack_fold_pblendvb
diff --git a/test/CodeGen/X86/stack-folding-lwp.ll b/test/CodeGen/X86/stack-folding-lwp.ll
index edf2798ff846..30b933238832 100644
--- a/test/CodeGen/X86/stack-folding-lwp.ll
+++ b/test/CodeGen/X86/stack-folding-lwp.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-unknown"
define i8 @stack_fold_lwpins_u32(i32 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_lwpins_u32
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK: lwpins $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i8 @llvm.x86.lwpins32(i32 %a0, i32 %a1, i32 2814)
@@ -20,7 +20,7 @@ declare i8 @llvm.x86.lwpins32(i32, i32, i32)
define i8 @stack_fold_lwpins_u64(i64 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_lwpins_u64
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK: lwpins $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i8 @llvm.x86.lwpins64(i64 %a0, i32 %a1, i32 2814)
@@ -30,7 +30,7 @@ declare i8 @llvm.x86.lwpins64(i64, i32, i32)
define void @stack_fold_lwpval_u32(i32 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_lwpval_u32
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK: lwpval $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
tail call void @llvm.x86.lwpval32(i32 %a0, i32 %a1, i32 2814)
@@ -40,7 +40,7 @@ declare void @llvm.x86.lwpval32(i32, i32, i32)
define void @stack_fold_lwpval_u64(i64 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_lwpval_u64
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK: lwpval $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 4-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
tail call void @llvm.x86.lwpval64(i64 %a0, i32 %a1, i32 2814)
diff --git a/test/CodeGen/X86/stack-folding-tbm.ll b/test/CodeGen/X86/stack-folding-tbm.ll
index fe3c828a69b0..ac7d97c826e2 100644
--- a/test/CodeGen/X86/stack-folding-tbm.ll
+++ b/test/CodeGen/X86/stack-folding-tbm.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-unknown-unknown"
define i32 @stack_fold_bextri_u32(i32 %a0) {
;CHECK-LABEL: stack_fold_bextri_u32
- ;CHECK: # BB#0:
+ ;CHECK: # %bb.0:
;CHECK: bextr $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 2814)
@@ -20,7 +20,7 @@ declare i32 @llvm.x86.tbm.bextri.u32(i32, i32)
define i64 @stack_fold_bextri_u64(i64 %a0) {
;CHECK-LABEL: stack_fold_bextri_u64
- ;CHECK: # BB#0:
+ ;CHECK: # %bb.0:
;CHECK: bextr $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
%1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
%2 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 2814)
diff --git a/test/CodeGen/X86/stack-protector-msvc.ll b/test/CodeGen/X86/stack-protector-msvc.ll
index 5eccc65f2dec..c1f79f9db2f6 100644
--- a/test/CodeGen/X86/stack-protector-msvc.ll
+++ b/test/CodeGen/X86/stack-protector-msvc.ll
@@ -1,19 +1,9 @@
+; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X86 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-X64 %s
-; RUN: llc -mtriple=i386-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-I386 %s
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s -o - | FileCheck -check-prefix=MSVC-64 %s
-
-; MSVC-I386: movl ___security_cookie, %[[REG1:[a-z]*]]
-; MSVC-I386: movl %[[REG1]], [[SLOT:[0-9]*]](%esp)
-; MSVC-I386: calll _strcpy
-; MSVC-I386: movl [[SLOT]](%esp), %ecx
-; MSVC-I386: calll @__security_check_cookie@4
-; MSVC-I386: retl
-
-; MSVC-64: movq __security_cookie(%rip), %[[REG1:[a-z]*]]
-; MSVC-64: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp)
-; MSVC-64: callq strcpy
-; MSVC-64: movq [[SLOT]](%rsp), %rcx
-; MSVC-64: callq __security_check_cookie
+; Make sure fastisel falls back and does something secure.
+; RUN: llc -mtriple=i686-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X86-O0 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -O0 < %s -o - | FileCheck -check-prefix=MSVC-X64-O0 %s
@"\01LC" = internal constant [11 x i8] c"buf == %s\0A\00" ; <[11 x i8]*> [#uses=1]
@@ -21,7 +11,6 @@ define void @test(i8* %a) nounwind ssp {
entry:
%a_addr = alloca i8* ; <i8**> [#uses=2]
%buf = alloca [8 x i8] ; <[8 x i8]*> [#uses=2]
- %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
store i8* %a, i8** %a_addr
%buf1 = bitcast [8 x i8]* %buf to i8* ; <i8*> [#uses=1]
%0 = load i8*, i8** %a_addr, align 4 ; <i8*> [#uses=1]
@@ -34,6 +23,139 @@ return: ; preds = %entry
ret void
}
+; MSVC-X86-LABEL: _test:
+; MSVC-X86: movl ___security_cookie, %[[REG1:[^ ]*]]
+; MSVC-X86: xorl %esp, %[[REG1]]
+; MSVC-X86: movl %[[REG1]], [[SLOT:[0-9]*]](%esp)
+; MSVC-X86: calll _strcpy
+; MSVC-X86: movl [[SLOT]](%esp), %ecx
+; MSVC-X86: xorl %esp, %ecx
+; MSVC-X86: calll @__security_check_cookie@4
+; MSVC-X86: retl
+
+; MSVC-X64-LABEL: test:
+; MSVC-X64: movq __security_cookie(%rip), %[[REG1:[^ ]*]]
+; MSVC-X64: xorq %rsp, %[[REG1]]
+; MSVC-X64: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp)
+; MSVC-X64: callq strcpy
+; MSVC-X64: movq [[SLOT]](%rsp), %rcx
+; MSVC-X64: xorq %rsp, %rcx
+; MSVC-X64: callq __security_check_cookie
+; MSVC-X64: retq
+
+; MSVC-X86-O0-LABEL: _test:
+; MSVC-X86-O0: movl ___security_cookie, %[[REG1:[^ ]*]]
+; MSVC-X86-O0: xorl %esp, %[[REG1]]
+; MSVC-X86-O0: movl %[[REG1]], [[SLOT:[0-9]*]](%esp)
+; MSVC-X86-O0: calll _strcpy
+; MSVC-X86-O0: movl [[SLOT]](%esp), %[[REG1:[^ ]*]]
+; MSVC-X86-O0: xorl %esp, %[[REG1]]
+; MSVC-X86-O0: movl %[[REG1]], %ecx
+; MSVC-X86-O0: calll @__security_check_cookie@4
+; MSVC-X86-O0: retl
+
+; MSVC-X64-O0-LABEL: test:
+; MSVC-X64-O0: movq __security_cookie(%rip), %[[REG1:[^ ]*]]
+; MSVC-X64-O0: xorq %rsp, %[[REG1]]
+; MSVC-X64-O0: movq %[[REG1]], [[SLOT:[0-9]*]](%rsp)
+; MSVC-X64-O0: callq strcpy
+; MSVC-X64-O0: movq [[SLOT]](%rsp), %[[REG1:[^ ]*]]
+; MSVC-X64-O0: xorq %rsp, %[[REG1]]
+; MSVC-X64-O0: movq %[[REG1]], %rcx
+; MSVC-X64-O0: callq __security_check_cookie
+; MSVC-X64-O0: retq
+
+
+declare void @escape(i32*)
+
+define void @test_vla(i32 %n) nounwind ssp {
+ %vla = alloca i32, i32 %n
+ call void @escape(i32* %vla)
+ ret void
+}
+
+; MSVC-X86-LABEL: _test_vla:
+; MSVC-X86: pushl %ebp
+; MSVC-X86: movl %esp, %ebp
+; MSVC-X86: movl ___security_cookie, %[[REG1:[^ ]*]]
+; MSVC-X86: xorl %ebp, %[[REG1]]
+; MSVC-X86: movl %[[REG1]], [[SLOT:-[0-9]*]](%ebp)
+; MSVC-X86: calll __chkstk
+; MSVC-X86: pushl
+; MSVC-X86: calll _escape
+; MSVC-X86: movl [[SLOT]](%ebp), %ecx
+; MSVC-X86: xorl %ebp, %ecx
+; MSVC-X86: calll @__security_check_cookie@4
+; MSVC-X86: movl %ebp, %esp
+; MSVC-X86: popl %ebp
+; MSVC-X86: retl
+
+; MSVC-X64-LABEL: test_vla:
+; MSVC-X64: pushq %rbp
+; MSVC-X64: subq $16, %rsp
+; MSVC-X64: leaq 16(%rsp), %rbp
+; MSVC-X64: movq __security_cookie(%rip), %[[REG1:[^ ]*]]
+; MSVC-X64: xorq %rbp, %[[REG1]]
+; MSVC-X64: movq %[[REG1]], [[SLOT:-[0-9]*]](%rbp)
+; MSVC-X64: callq __chkstk
+; MSVC-X64: callq escape
+; MSVC-X64: movq [[SLOT]](%rbp), %rcx
+; MSVC-X64: xorq %rbp, %rcx
+; MSVC-X64: callq __security_check_cookie
+; MSVC-X64: retq
+
+
+; This case is interesting because we address local variables with RBX but XOR
+; the guard value with RBP. That's fine, either value will do, as long as they
+; are the same across the life of the frame.
+
+define void @test_vla_realign(i32 %n) nounwind ssp {
+ %realign = alloca i32, align 32
+ %vla = alloca i32, i32 %n
+ call void @escape(i32* %realign)
+ call void @escape(i32* %vla)
+ ret void
+}
+
+; MSVC-X86-LABEL: _test_vla_realign:
+; MSVC-X86: pushl %ebp
+; MSVC-X86: movl %esp, %ebp
+; MSVC-X86: pushl %esi
+; MSVC-X86: andl $-32, %esp
+; MSVC-X86: subl $32, %esp
+; MSVC-X86: movl %esp, %esi
+; MSVC-X86: movl ___security_cookie, %[[REG1:[^ ]*]]
+; MSVC-X86: xorl %ebp, %[[REG1]]
+; MSVC-X86: movl %[[REG1]], [[SLOT:[0-9]*]](%esi)
+; MSVC-X86: calll __chkstk
+; MSVC-X86: pushl
+; MSVC-X86: calll _escape
+; MSVC-X86: movl [[SLOT]](%esi), %ecx
+; MSVC-X86: xorl %ebp, %ecx
+; MSVC-X86: calll @__security_check_cookie@4
+; MSVC-X86: leal -8(%ebp), %esp
+; MSVC-X86: popl %esi
+; MSVC-X86: popl %ebp
+; MSVC-X86: retl
+
+; MSVC-X64-LABEL: test_vla_realign:
+; MSVC-X64: pushq %rbp
+; MSVC-X64: pushq %rbx
+; MSVC-X64: subq $32, %rsp
+; MSVC-X64: leaq 32(%rsp), %rbp
+; MSVC-X64: andq $-32, %rsp
+; MSVC-X64: movq %rsp, %rbx
+; MSVC-X64: movq __security_cookie(%rip), %[[REG1:[^ ]*]]
+; MSVC-X64: xorq %rbp, %[[REG1]]
+; MSVC-X64: movq %[[REG1]], [[SLOT:[0-9]*]](%rbx)
+; MSVC-X64: callq __chkstk
+; MSVC-X64: callq escape
+; MSVC-X64: movq [[SLOT]](%rbx), %rcx
+; MSVC-X64: xorq %rbp, %rcx
+; MSVC-X64: callq __security_check_cookie
+; MSVC-X64: retq
+
+
declare i8* @strcpy(i8*, i8*) nounwind
declare i32 @printf(i8*, ...) nounwind
diff --git a/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll b/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
index f3f9eebb26c4..7578e22225b8 100644
--- a/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
+++ b/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple i386-unknown-freebsd10.0 -march=x86 --relocation-model=pic %s -o -
+; RUN: llc -mtriple i386-unknown-freebsd10.0 --relocation-model=pic %s -o -
; PR16979
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index d5a65ffb890b..e8a4746bb648 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -4,28 +4,29 @@
; RUN: llc -mtriple=i386-pc-windows-msvc -print-machineinstrs=expand-isel-pseudos -enable-selectiondag-sp=false %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=MSVC-IR
; DARWIN-SELDAG: # Machine code for function test_branch_weights:
-; DARWIN-SELDAG: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
-; DARWIN-SELDAG: BB#[[FAILURE]]:
-; DARWIN-SELDAG: CALL64pcrel32 <es:__stack_chk_fail>
-; DARWIN-SELDAG: BB#[[SUCCESS]]:
+; DARWIN-SELDAG: Successors according to CFG: %bb.[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) %bb.[[FAILURE:[0-9]+]]
+; DARWIN-SELDAG: %bb.[[FAILURE]]:
+; DARWIN-SELDAG: CALL64pcrel32 $__stack_chk_fail
+; DARWIN-SELDAG: %bb.[[SUCCESS]]:
; DARWIN-IR: # Machine code for function test_branch_weights:
-; DARWIN-IR: Successors according to CFG: BB#[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) BB#[[FAILURE:[0-9]+]]
-; DARWIN-IR: BB#[[SUCCESS]]:
-; DARWIN-IR: BB#[[FAILURE]]:
-; DARWIN-IR: CALL64pcrel32 <ga:@__stack_chk_fail>
+; DARWIN-IR: Successors according to CFG: %bb.[[SUCCESS:[0-9]+]]({{[0-9a-fx/= ]+}}100.00%) %bb.[[FAILURE:[0-9]+]]
+; DARWIN-IR: %bb.[[SUCCESS]]:
+; DARWIN-IR: %bb.[[FAILURE]]:
+; DARWIN-IR: CALL64pcrel32 @__stack_chk_fail
; MSVC-SELDAG: # Machine code for function test_branch_weights:
; MSVC-SELDAG: mem:Volatile LD4[@__security_cookie]
; MSVC-SELDAG: ST4[FixedStack0]
; MSVC-SELDAG: LD4[FixedStack0]
-; MSVC-SELDAG: CALLpcrel32 <ga:@__security_check_cookie>
+; MSVC-SELDAG: CALLpcrel32 @__security_check_cookie
+; MSVC always uses selection DAG now.
; MSVC-IR: # Machine code for function test_branch_weights:
; MSVC-IR: mem:Volatile LD4[@__security_cookie]
; MSVC-IR: ST4[FixedStack0]
-; MSVC-IR: LD4[%StackGuardSlot]
-; MSVC-IR: CALLpcrel32 <ga:@__security_check_cookie>
+; MSVC-IR: LD4[FixedStack0]
+; MSVC-IR: CALLpcrel32 @__security_check_cookie
define i32 @test_branch_weights(i32 %n) #0 {
entry:
diff --git a/test/CodeGen/X86/stack-size-section.ll b/test/CodeGen/X86/stack-size-section.ll
new file mode 100644
index 000000000000..28b26ae572ea
--- /dev/null
+++ b/test/CodeGen/X86/stack-size-section.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-linux -stack-size-section | FileCheck %s
+
+; CHECK-LABEL: func1:
+; CHECK: .section .stack_sizes,"",@progbits
+; CHECK-NEXT: .quad func1
+; CHECK-NEXT: .byte 8
+define void @func1(i32, i32) #0 {
+ alloca i32, align 4
+ alloca i32, align 4
+ ret void
+}
+
+; CHECK-LABEL: func2:
+; CHECK: .section .stack_sizes,"",@progbits
+; CHECK-NEXT: .quad func2
+; CHECK-NEXT: .byte 24
+define void @func2() #0 {
+ alloca i32, align 4
+ call void @func1(i32 1, i32 2)
+ ret void
+}
+
+; CHECK-LABEL: dynalloc:
+; CHECK-NOT: .section .stack_sizes
+define void @dynalloc(i32 %N) #0 {
+ alloca i32, i32 %N
+ ret void
+}
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/X86/stackmap-fast-isel.ll b/test/CodeGen/X86/stackmap-fast-isel.ll
index ae10a37756bc..dd25065f3063 100644
--- a/test/CodeGen/X86/stackmap-fast-isel.ll
+++ b/test/CodeGen/X86/stackmap-fast-isel.ll
@@ -157,7 +157,7 @@ define void @liveConstant() {
; CHECK-NEXT: .short 0
; 1 location
; CHECK-NEXT: .short 1
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct rbp - ofs
; CHECK-NEXT: .byte 2
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
index eb95b9c8df4e..4cbfe234ff41 100644
--- a/test/CodeGen/X86/stackmap-liveness.ll
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -48,7 +48,7 @@ entry:
; PATCH-NEXT: .short 0
; Num LiveOut Entries: 1
; PATCH-NEXT: .short 1
-; LiveOut Entry 1: %YMM2 (16 bytes) --> %XMM2
+; LiveOut Entry 1: %ymm2 (16 bytes) --> %xmm2
; PATCH-NEXT: .short 19
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
@@ -81,23 +81,23 @@ entry:
; PATCH-NEXT: .short 0
; Num LiveOut Entries: 5
; PATCH-NEXT: .short 5
-; LiveOut Entry 1: %RAX (1 bytes) --> %AL or %AH
+; LiveOut Entry 1: %rax (1 bytes) --> %al or %ah
; PATCH-NEXT: .short 0
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 1
-; LiveOut Entry 2: %R8 (8 bytes)
+; LiveOut Entry 2: %r8 (8 bytes)
; PATCH-NEXT: .short 8
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 8
-; LiveOut Entry 3: %YMM0 (32 bytes)
+; LiveOut Entry 3: %ymm0 (32 bytes)
; PATCH-NEXT: .short 17
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 32
-; LiveOut Entry 4: %YMM1 (32 bytes)
+; LiveOut Entry 4: %ymm1 (32 bytes)
; PATCH-NEXT: .short 18
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 32
-; LiveOut Entry 5: %YMM2 (16 bytes) --> %XMM2
+; LiveOut Entry 5: %ymm2 (16 bytes) --> %xmm2
; PATCH-NEXT: .short 19
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
@@ -127,11 +127,11 @@ entry:
; PATCH-NEXT: .short 0
; Num LiveOut Entries: 2
; PATCH-NEXT: .short 2
-; LiveOut Entry 1: %RSP (8 bytes)
+; LiveOut Entry 1: %rsp (8 bytes)
; PATCH-NEXT: .short 7
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 8
-; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; LiveOut Entry 2: %ymm2 (16 bytes) --> %xmm2
; PATCH-NEXT: .short 19
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
@@ -166,11 +166,11 @@ entry:
; PATCH-NEXT: .short 0
; Num LiveOut Entries: 2
; PATCH-NEXT: .short 2
-; LiveOut Entry 1: %RSP (8 bytes)
+; LiveOut Entry 1: %rsp (8 bytes)
; PATCH-NEXT: .short 7
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 8
-; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; LiveOut Entry 2: %ymm2 (16 bytes) --> %xmm2
; PATCH-NEXT: .short 19
; PATCH-NEXT: .byte 0
; PATCH-NEXT: .byte 16
diff --git a/test/CodeGen/X86/statepoint-allocas.ll b/test/CodeGen/X86/statepoint-allocas.ll
index b8e5c82913a5..bd820e0b83d3 100644
--- a/test/CodeGen/X86/statepoint-allocas.ll
+++ b/test/CodeGen/X86/statepoint-allocas.ll
@@ -96,7 +96,7 @@ declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i3
; CHECK: .short 0
; CHECK: .short 0
; CHECK: .long 0
-; Direct Spill Slot [RSP+0]
+; Direct Spill Slot [rsp+0]
; CHECK: .byte 2
; CHECK: .byte 0
; CHECK: .short 8
@@ -133,7 +133,7 @@ declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i3
; CHECK: .short 0
; CHECK: .short 0
; CHECK: .long 1
-; Direct Spill Slot [RSP+0]
+; Direct Spill Slot [rsp+0]
; CHECK: .byte 2
; CHECK: .byte 0
; CHECK: .short 8
diff --git a/test/CodeGen/X86/statepoint-live-in.ll b/test/CodeGen/X86/statepoint-live-in.ll
index 0179d37ad4e1..2c9b95916d8a 100644
--- a/test/CodeGen/X86/statepoint-live-in.ll
+++ b/test/CodeGen/X86/statepoint-live-in.ll
@@ -8,9 +8,8 @@ declare void @baz()
define void @test1(i32 %a) gc "statepoint-example" {
; CHECK-LABEL: test1:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp0:
@@ -24,19 +23,14 @@ entry:
define void @test2(i32 %a, i32 %b) gc "statepoint-example" {
; CHECK-LABEL: test2:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_offset 24
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi3:
; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: Lcfi4:
; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: Lcfi5:
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %ebp
@@ -58,9 +52,8 @@ entry:
define void @test3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i) gc "statepoint-example" {
; CHECK-LABEL: test3:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi6:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp3:
@@ -78,9 +71,8 @@ entry:
; stack slots into the statepoint.
define void @test4(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p, i32 %q, i32 %r, i32 %s, i32 %t, i32 %u, i32 %v, i32 %w, i32 %x, i32 %y, i32 %z) gc "statepoint-example" {
; CHECK-LABEL: test4:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi7:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: callq _bar
; CHECK-NEXT: Ltmp4:
@@ -97,9 +89,8 @@ entry:
; as to put less stress on the register allocator for no benefit.
define i32 addrspace(1)* @test5(i32 %a, i32 addrspace(1)* %p) gc "statepoint-example" {
; CHECK-LABEL: test5:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: Lcfi8:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movq %rsi, (%rsp)
; CHECK-NEXT: callq _bar
@@ -116,14 +107,11 @@ entry:
; Show the interaction of live-through spilling followed by live-in.
define void @test6(i32 %a) gc "statepoint-example" {
; CHECK-LABEL: test6:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: Lcfi9:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: subq $16, %rsp
-; CHECK-NEXT: Lcfi10:
; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: Lcfi11:
; CHECK-NEXT: .cfi_offset %rbx, -16
; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/stdarg.ll b/test/CodeGen/X86/stdarg.ll
index 7b4f4e845fce..96d2f49cbbb0 100644
--- a/test/CodeGen/X86/stdarg.ll
+++ b/test/CodeGen/X86/stdarg.ll
@@ -14,8 +14,7 @@ entry:
;
; CHECK-DAG: movq {{.*}}, 192(%rsp)
; CHECK-DAG: movq {{.*}}, 184(%rsp)
-; CHECK-DAG: movl {{.*}}, 180(%rsp)
-; CHECK-DAG: movl {{.*}}, 176(%rsp)
+; CHECK-DAG: movq {{.*}}, 176(%rsp)
%ap3 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1]
call void @bar(%struct.__va_list_tag* %ap3) nounwind
call void @llvm.va_end(i8* %ap12)
diff --git a/test/CodeGen/X86/store-empty-member.ll b/test/CodeGen/X86/store-empty-member.ll
index aea85b94d414..95d3bee98a58 100644
--- a/test/CodeGen/X86/store-empty-member.ll
+++ b/test/CodeGen/X86/store-empty-member.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; Don't crash on an empty struct member.
diff --git a/test/CodeGen/X86/store-fp-constant.ll b/test/CodeGen/X86/store-fp-constant.ll
index 71df8d3109e6..caf3c28c3275 100644
--- a/test/CodeGen/X86/store-fp-constant.ll
+++ b/test/CodeGen/X86/store-fp-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; CHECK-NOT: rodata
; CHECK-NOT: literal
diff --git a/test/CodeGen/X86/store-global-address.ll b/test/CodeGen/X86/store-global-address.ll
index c8d4cbceea3d..31bb0d59c930 100644
--- a/test/CodeGen/X86/store-global-address.ll
+++ b/test/CodeGen/X86/store-global-address.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep movl | count 1
+; RUN: llc < %s -mtriple=i686-- | grep movl | count 1
@dst = global i32 0 ; <i32*> [#uses=1]
@ptr = global i32* null ; <i32**> [#uses=1]
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index 5e9e1e364fef..9fc166a533e4 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -1,9 +1,8 @@
; rdar://7860110
-; RUN: llc -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
-; RUN: llc -march=x86 -asm-verbose=false -fixup-byte-word-insts=1 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWON
-; RUN: llc -march=x86 -asm-verbose=false -fixup-byte-word-insts=0 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWOFF
+; RUN: llc -mtriple=x86_64-apple-darwin10.2 -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
+; RUN: llc -mtriple=i686-apple-darwin10.2 -asm-verbose=false -fixup-byte-word-insts=1 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWON
+; RUN: llc -mtriple=i686-apple-darwin10.2 -asm-verbose=false -fixup-byte-word-insts=0 < %s | FileCheck %s -check-prefix=X32 -check-prefix=X32-BWOFF
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.2"
define void @test1(i32* nocapture %a0, i8 zeroext %a1) nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/stores-merging.ll b/test/CodeGen/X86/stores-merging.ll
index dbfb06881d82..5ccb5825934b 100644
--- a/test/CodeGen/X86/stores-merging.ll
+++ b/test/CodeGen/X86/stores-merging.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
%structTy = type { i8, i32, i32 }
@@ -12,12 +10,13 @@ target triple = "x86_64-unknown-linux-gnu"
;; order, the second in decreasing -- but in both cases should have
;; the same result in memory in the end.
-; CHECK-LABEL: redundant_stores_merging:
-; CHECK: movabsq $528280977409, %rax
-; CHECK: movq %rax, e+4(%rip)
-; CHECK: movl $456, e+8(%rip)
define void @redundant_stores_merging() {
-entry:
+; CHECK-LABEL: redundant_stores_merging:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001
+; CHECK-NEXT: movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT: movl $456, e+{{.*}}(%rip) # imm = 0x1C8
+; CHECK-NEXT: retq
store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
@@ -25,12 +24,13 @@ entry:
}
;; This variant tests PR25154.
-; CHECK-LABEL: redundant_stores_merging_reverse:
-; CHECK: movabsq $528280977409, %rax
-; CHECK: movq %rax, e+4(%rip)
-; CHECK: movl $456, e+8(%rip)
define void @redundant_stores_merging_reverse() {
-entry:
+; CHECK-LABEL: redundant_stores_merging_reverse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movabsq $528280977409, %rax # imm = 0x7B00000001
+; CHECK-NEXT: movq %rax, e+{{.*}}(%rip)
+; CHECK-NEXT: movl $456, e+{{.*}}(%rip) # imm = 0x1C8
+; CHECK-NEXT: retq
store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
store i32 456, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
@@ -44,13 +44,180 @@ entry:
;; store to 3 comes first (e.g. by merging the stores to 0 and 2 into
;; a movl, after the store to 3).
-;; CHECK-LABEL: overlapping_stores_merging:
-;; CHECK: movl $1, b(%rip)
-;; CHECK: movw $2, b+3(%rip)
define void @overlapping_stores_merging() {
-entry:
+; CHECK-LABEL: overlapping_stores_merging:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl $1, {{.*}}(%rip)
+; CHECK-NEXT: movw $2, b+{{.*}}(%rip)
+; CHECK-NEXT: retq
store i16 0, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 2) to i16*), align 2
store i16 2, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 3) to i16*), align 1
store i16 1, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 0) to i16*), align 2
ret void
}
+
+define void @extract_vector_store_16_consecutive_bytes(<2 x i64> %v, i8* %ptr) #0 {
+; CHECK-LABEL: extract_vector_store_16_consecutive_bytes:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %xmm0, (%rdi)
+; CHECK-NEXT: retq
+ %bc = bitcast <2 x i64> %v to <16 x i8>
+ %ext00 = extractelement <16 x i8> %bc, i32 0
+ %ext01 = extractelement <16 x i8> %bc, i32 1
+ %ext02 = extractelement <16 x i8> %bc, i32 2
+ %ext03 = extractelement <16 x i8> %bc, i32 3
+ %ext04 = extractelement <16 x i8> %bc, i32 4
+ %ext05 = extractelement <16 x i8> %bc, i32 5
+ %ext06 = extractelement <16 x i8> %bc, i32 6
+ %ext07 = extractelement <16 x i8> %bc, i32 7
+ %ext08 = extractelement <16 x i8> %bc, i32 8
+ %ext09 = extractelement <16 x i8> %bc, i32 9
+ %ext10 = extractelement <16 x i8> %bc, i32 10
+ %ext11 = extractelement <16 x i8> %bc, i32 11
+ %ext12 = extractelement <16 x i8> %bc, i32 12
+ %ext13 = extractelement <16 x i8> %bc, i32 13
+ %ext14 = extractelement <16 x i8> %bc, i32 14
+ %ext15 = extractelement <16 x i8> %bc, i32 15
+ %gep00 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %gep01 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %gep02 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %gep03 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %gep04 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %gep05 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %gep06 = getelementptr inbounds i8, i8* %ptr, i64 6
+ %gep07 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %gep08 = getelementptr inbounds i8, i8* %ptr, i64 8
+ %gep09 = getelementptr inbounds i8, i8* %ptr, i64 9
+ %gep10 = getelementptr inbounds i8, i8* %ptr, i64 10
+ %gep11 = getelementptr inbounds i8, i8* %ptr, i64 11
+ %gep12 = getelementptr inbounds i8, i8* %ptr, i64 12
+ %gep13 = getelementptr inbounds i8, i8* %ptr, i64 13
+ %gep14 = getelementptr inbounds i8, i8* %ptr, i64 14
+ %gep15 = getelementptr inbounds i8, i8* %ptr, i64 15
+ store i8 %ext00, i8* %gep00, align 1
+ store i8 %ext01, i8* %gep01, align 1
+ store i8 %ext02, i8* %gep02, align 1
+ store i8 %ext03, i8* %gep03, align 1
+ store i8 %ext04, i8* %gep04, align 1
+ store i8 %ext05, i8* %gep05, align 1
+ store i8 %ext06, i8* %gep06, align 1
+ store i8 %ext07, i8* %gep07, align 1
+ store i8 %ext08, i8* %gep08, align 1
+ store i8 %ext09, i8* %gep09, align 1
+ store i8 %ext10, i8* %gep10, align 1
+ store i8 %ext11, i8* %gep11, align 1
+ store i8 %ext12, i8* %gep12, align 1
+ store i8 %ext13, i8* %gep13, align 1
+ store i8 %ext14, i8* %gep14, align 1
+ store i8 %ext15, i8* %gep15, align 1
+ ret void
+}
+
+; PR34217 - https://bugs.llvm.org/show_bug.cgi?id=34217
+
+define void @extract_vector_store_32_consecutive_bytes(<4 x i64> %v, i8* %ptr) #0 {
+; CHECK-LABEL: extract_vector_store_32_consecutive_bytes:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovups %ymm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %bc = bitcast <4 x i64> %v to <32 x i8>
+ %ext00 = extractelement <32 x i8> %bc, i32 0
+ %ext01 = extractelement <32 x i8> %bc, i32 1
+ %ext02 = extractelement <32 x i8> %bc, i32 2
+ %ext03 = extractelement <32 x i8> %bc, i32 3
+ %ext04 = extractelement <32 x i8> %bc, i32 4
+ %ext05 = extractelement <32 x i8> %bc, i32 5
+ %ext06 = extractelement <32 x i8> %bc, i32 6
+ %ext07 = extractelement <32 x i8> %bc, i32 7
+ %ext08 = extractelement <32 x i8> %bc, i32 8
+ %ext09 = extractelement <32 x i8> %bc, i32 9
+ %ext10 = extractelement <32 x i8> %bc, i32 10
+ %ext11 = extractelement <32 x i8> %bc, i32 11
+ %ext12 = extractelement <32 x i8> %bc, i32 12
+ %ext13 = extractelement <32 x i8> %bc, i32 13
+ %ext14 = extractelement <32 x i8> %bc, i32 14
+ %ext15 = extractelement <32 x i8> %bc, i32 15
+ %ext16 = extractelement <32 x i8> %bc, i32 16
+ %ext17 = extractelement <32 x i8> %bc, i32 17
+ %ext18 = extractelement <32 x i8> %bc, i32 18
+ %ext19 = extractelement <32 x i8> %bc, i32 19
+ %ext20 = extractelement <32 x i8> %bc, i32 20
+ %ext21 = extractelement <32 x i8> %bc, i32 21
+ %ext22 = extractelement <32 x i8> %bc, i32 22
+ %ext23 = extractelement <32 x i8> %bc, i32 23
+ %ext24 = extractelement <32 x i8> %bc, i32 24
+ %ext25 = extractelement <32 x i8> %bc, i32 25
+ %ext26 = extractelement <32 x i8> %bc, i32 26
+ %ext27 = extractelement <32 x i8> %bc, i32 27
+ %ext28 = extractelement <32 x i8> %bc, i32 28
+ %ext29 = extractelement <32 x i8> %bc, i32 29
+ %ext30 = extractelement <32 x i8> %bc, i32 30
+ %ext31 = extractelement <32 x i8> %bc, i32 31
+ %gep00 = getelementptr inbounds i8, i8* %ptr, i64 0
+ %gep01 = getelementptr inbounds i8, i8* %ptr, i64 1
+ %gep02 = getelementptr inbounds i8, i8* %ptr, i64 2
+ %gep03 = getelementptr inbounds i8, i8* %ptr, i64 3
+ %gep04 = getelementptr inbounds i8, i8* %ptr, i64 4
+ %gep05 = getelementptr inbounds i8, i8* %ptr, i64 5
+ %gep06 = getelementptr inbounds i8, i8* %ptr, i64 6
+ %gep07 = getelementptr inbounds i8, i8* %ptr, i64 7
+ %gep08 = getelementptr inbounds i8, i8* %ptr, i64 8
+ %gep09 = getelementptr inbounds i8, i8* %ptr, i64 9
+ %gep10 = getelementptr inbounds i8, i8* %ptr, i64 10
+ %gep11 = getelementptr inbounds i8, i8* %ptr, i64 11
+ %gep12 = getelementptr inbounds i8, i8* %ptr, i64 12
+ %gep13 = getelementptr inbounds i8, i8* %ptr, i64 13
+ %gep14 = getelementptr inbounds i8, i8* %ptr, i64 14
+ %gep15 = getelementptr inbounds i8, i8* %ptr, i64 15
+ %gep16 = getelementptr inbounds i8, i8* %ptr, i64 16
+ %gep17 = getelementptr inbounds i8, i8* %ptr, i64 17
+ %gep18 = getelementptr inbounds i8, i8* %ptr, i64 18
+ %gep19 = getelementptr inbounds i8, i8* %ptr, i64 19
+ %gep20 = getelementptr inbounds i8, i8* %ptr, i64 20
+ %gep21 = getelementptr inbounds i8, i8* %ptr, i64 21
+ %gep22 = getelementptr inbounds i8, i8* %ptr, i64 22
+ %gep23 = getelementptr inbounds i8, i8* %ptr, i64 23
+ %gep24 = getelementptr inbounds i8, i8* %ptr, i64 24
+ %gep25 = getelementptr inbounds i8, i8* %ptr, i64 25
+ %gep26 = getelementptr inbounds i8, i8* %ptr, i64 26
+ %gep27 = getelementptr inbounds i8, i8* %ptr, i64 27
+ %gep28 = getelementptr inbounds i8, i8* %ptr, i64 28
+ %gep29 = getelementptr inbounds i8, i8* %ptr, i64 29
+ %gep30 = getelementptr inbounds i8, i8* %ptr, i64 30
+ %gep31 = getelementptr inbounds i8, i8* %ptr, i64 31
+ store i8 %ext00, i8* %gep00, align 1
+ store i8 %ext01, i8* %gep01, align 1
+ store i8 %ext02, i8* %gep02, align 1
+ store i8 %ext03, i8* %gep03, align 1
+ store i8 %ext04, i8* %gep04, align 1
+ store i8 %ext05, i8* %gep05, align 1
+ store i8 %ext06, i8* %gep06, align 1
+ store i8 %ext07, i8* %gep07, align 1
+ store i8 %ext08, i8* %gep08, align 1
+ store i8 %ext09, i8* %gep09, align 1
+ store i8 %ext10, i8* %gep10, align 1
+ store i8 %ext11, i8* %gep11, align 1
+ store i8 %ext12, i8* %gep12, align 1
+ store i8 %ext13, i8* %gep13, align 1
+ store i8 %ext14, i8* %gep14, align 1
+ store i8 %ext15, i8* %gep15, align 1
+ store i8 %ext16, i8* %gep16, align 1
+ store i8 %ext17, i8* %gep17, align 1
+ store i8 %ext18, i8* %gep18, align 1
+ store i8 %ext19, i8* %gep19, align 1
+ store i8 %ext20, i8* %gep20, align 1
+ store i8 %ext21, i8* %gep21, align 1
+ store i8 %ext22, i8* %gep22, align 1
+ store i8 %ext23, i8* %gep23, align 1
+ store i8 %ext24, i8* %gep24, align 1
+ store i8 %ext25, i8* %gep25, align 1
+ store i8 %ext26, i8* %gep26, align 1
+ store i8 %ext27, i8* %gep27, align 1
+ store i8 %ext28, i8* %gep28, align 1
+ store i8 %ext29, i8* %gep29, align 1
+ store i8 %ext30, i8* %gep30, align 1
+ store i8 %ext31, i8* %gep31, align 1
+ ret void
+}
+
diff --git a/test/CodeGen/X86/storetrunc-fp.ll b/test/CodeGen/X86/storetrunc-fp.ll
index 03ad093ba860..0ead0bab410d 100644
--- a/test/CodeGen/X86/storetrunc-fp.ll
+++ b/test/CodeGen/X86/storetrunc-fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep flds
+; RUN: llc < %s -mtriple=i686-- | not grep flds
define void @foo(x86_fp80 %a, x86_fp80 %b, float* %fp) {
%c = fadd x86_fp80 %a, %b
diff --git a/test/CodeGen/X86/stride-nine-with-base-reg.ll b/test/CodeGen/X86/stride-nine-with-base-reg.ll
index 551bd7c2541e..7c8f362a2cfc 100644
--- a/test/CodeGen/X86/stride-nine-with-base-reg.ll
+++ b/test/CodeGen/X86/stride-nine-with-base-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -relocation-model=static | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
; CHECK-NOT: lea
diff --git a/test/CodeGen/X86/stride-reuse.ll b/test/CodeGen/X86/stride-reuse.ll
index af036f3a8f14..986207428cc7 100644
--- a/test/CodeGen/X86/stride-reuse.ll
+++ b/test/CodeGen/X86/stride-reuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i686-- | FileCheck %s
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
; CHECK-NOT: lea
diff --git a/test/CodeGen/X86/sub.ll b/test/CodeGen/X86/sub.ll
index 3cf79a3deca2..3251cb606802 100644
--- a/test/CodeGen/X86/sub.ll
+++ b/test/CodeGen/X86/sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- < %s | FileCheck %s
define i32 @test1(i32 %x) {
%xor = xor i32 %x, 31
diff --git a/test/CodeGen/X86/subcarry.ll b/test/CodeGen/X86/subcarry.ll
index df676328f682..862d489e138d 100644
--- a/test/CodeGen/X86/subcarry.ll
+++ b/test/CodeGen/X86/subcarry.ll
@@ -5,7 +5,7 @@
define %S @negate(%S* nocapture readonly %this) {
; CHECK-LABEL: negate:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq (%rsi), %rax
; CHECK-NEXT: movq 8(%rsi), %rcx
; CHECK-NEXT: notq %rax
@@ -62,7 +62,7 @@ entry:
define %S @sub(%S* nocapture readonly %this, %S %arg.b) local_unnamed_addr {
; CHECK-LABEL: sub:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: notq %rdx
; CHECK-NEXT: xorl %r10d, %r10d
; CHECK-NEXT: addq (%rsi), %rdx
diff --git a/test/CodeGen/X86/subreg-to-reg-0.ll b/test/CodeGen/X86/subreg-to-reg-0.ll
index 251a754f4383..bd6007629648 100644
--- a/test/CodeGen/X86/subreg-to-reg-0.ll
+++ b/test/CodeGen/X86/subreg-to-reg-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mov | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep mov | count 1
; Do eliminate the zero-extension instruction and rely on
; x86-64's implicit zero-extension!
diff --git a/test/CodeGen/X86/subreg-to-reg-1.ll b/test/CodeGen/X86/subreg-to-reg-1.ll
index 2931bab0cdd1..8acdb6176f57 100644
--- a/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK: {{leal .*[)], %e.*}}
; CHECK-NOT: {{leal .*[)], %e.*}}
diff --git a/test/CodeGen/X86/subreg-to-reg-3.ll b/test/CodeGen/X86/subreg-to-reg-3.ll
index 80ab1a2e2494..db9d0d12c3d7 100644
--- a/test/CodeGen/X86/subreg-to-reg-3.ll
+++ b/test/CodeGen/X86/subreg-to-reg-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK: imull
diff --git a/test/CodeGen/X86/subreg-to-reg-4.ll b/test/CodeGen/X86/subreg-to-reg-4.ll
index 8340fc536140..2a9a490c570d 100644
--- a/test/CodeGen/X86/subreg-to-reg-4.ll
+++ b/test/CodeGen/X86/subreg-to-reg-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 > %t
+; RUN: llc < %s -mtriple=x86_64-- > %t
; RUN: not grep leaq %t
; RUN: not grep incq %t
; RUN: not grep decq %t
diff --git a/test/CodeGen/X86/subreg-to-reg-6.ll b/test/CodeGen/X86/subreg-to-reg-6.ll
index bef09fa944ef..6e4e24c08b6b 100644
--- a/test/CodeGen/X86/subreg-to-reg-6.ll
+++ b/test/CodeGen/X86/subreg-to-reg-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
define i64 @foo() nounwind {
entry:
diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll
index 94d3b22a4c80..33cf2f453ba9 100644
--- a/test/CodeGen/X86/subvector-broadcast.ll
+++ b/test/CodeGen/X86/subvector-broadcast.ll
@@ -1,63 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ
;
; Subvector Load + Broadcast
;
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
-; X32-AVX-LABEL: test_broadcast_2f64_4f64:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512F-LABEL: test_broadcast_2f64_4f64:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512DQ-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512F-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512BW-NEXT: retq
+; X32-LABEL: test_broadcast_2f64_4f64:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-NEXT: retl
;
-; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512DQ-NEXT: retq
+; X64-LABEL: test_broadcast_2f64_4f64:
+; X64: # %bb.0:
+; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x double> %2
@@ -65,50 +32,28 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
%2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <8 x double> %2
@@ -116,26 +61,26 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f64_8f64:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
%1 = load <4 x double>, <4 x double> *%p
@@ -145,48 +90,26 @@ define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_4i64:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_2i64_4i64:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_2i64_4i64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x i64> %2
@@ -194,63 +117,41 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX1: ## BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX2: ## BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX1: ## BB#0:
+; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX2: ## BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <8 x i64> %2
@@ -258,26 +159,26 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i64_8i64:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
%1 = load <4 x i64>, <4 x i64> *%p
@@ -287,13 +188,13 @@ define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-LABEL: test_broadcast_4f32_8f32:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float> *%p
@@ -303,26 +204,26 @@ define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_16f32:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
%1 = load <4 x float>, <4 x float> *%p
@@ -332,50 +233,28 @@ define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <8 x float>, <8 x float> *%p
%2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <16 x float> %2
@@ -383,24 +262,24 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_8i32:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
%1 = load <4 x i32>, <4 x i32> *%p
@@ -410,39 +289,39 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_4i32_16i32:
-; X32-AVX1: ## BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_4i32_16i32:
-; X32-AVX2: ## BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_4i32_16i32:
-; X64-AVX1: ## BB#0:
+; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_4i32_16i32:
-; X64-AVX2: ## BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
%1 = load <4 x i32>, <4 x i32> *%p
@@ -452,50 +331,28 @@ define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
-; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X32-AVX512DQ-NEXT: retl
+; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
-; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT: retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQ-NEXT: retq
+; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT: retq
%1 = load <8 x i32>, <8 x i32> *%p
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <16 x i32> %2
@@ -503,24 +360,24 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p
@@ -530,64 +387,64 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX1: ## BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX2: ## BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX512F: ## BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX512BW: ## BB#0:
+; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
-; X32-AVX512DQ: ## BB#0:
+; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX1: ## BB#0:
+; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX2: ## BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512F: ## BB#0:
+; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512BW: ## BB#0:
+; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
-; X64-AVX512DQ: ## BB#0:
+; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
@@ -598,51 +455,51 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX512F: ## BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX512BW: ## BB#0:
+; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
-; X32-AVX512DQ: ## BB#0:
+; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512F: ## BB#0:
+; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512BW: ## BB#0:
+; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
-; X64-AVX512DQ: ## BB#0:
+; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
@@ -653,24 +510,24 @@ define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
-; X32-AVX512: ## BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
-; X64-AVX512: ## BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p
@@ -680,64 +537,64 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX1: ## BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX2: ## BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX512F: ## BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX512BW: ## BB#0:
+; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
-; X32-AVX512DQ: ## BB#0:
+; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX1: ## BB#0:
+; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX2: ## BB#0:
+; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512F: ## BB#0:
+; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512BW: ## BB#0:
+; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
-; X64-AVX512DQ: ## BB#0:
+; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
@@ -748,51 +605,51 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX512F: ## BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX512BW: ## BB#0:
+; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
-; X32-AVX512DQ: ## BB#0:
+; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512F: ## BB#0:
+; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512BW: ## BB#0:
+; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
-; X64-AVX512DQ: ## BB#0:
+; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
@@ -806,69 +663,21 @@ define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
;
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
-; X32-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512F-LABEL: test_broadcast_2f64_4f64_reuse:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0
-; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512F-LABEL: test_broadcast_2f64_4f64_reuse:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: retq
+; X32-LABEL: test_broadcast_2f64_4f64_reuse:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
;
-; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0
-; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: retq
+; X64-LABEL: test_broadcast_2f64_4f64_reuse:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
%1 = load <2 x double>, <2 x double>* %p0
store <2 x double> %1, <2 x double>* %p1
%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -876,37 +685,21 @@ define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x doub
}
define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
-; X32-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
-; X32-AVX512: ## BB#0:
-; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
+; X32-LABEL: test_broadcast_2i64_4i64_reuse:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
;
-; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
-; X64-AVX512: ## BB#0:
-; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT: retq
+; X64-LABEL: test_broadcast_2i64_4i64_reuse:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
%1 = load <2 x i64>, <2 x i64>* %p0
store <2 x i64> %1, <2 x i64>* %p1
%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -915,7 +708,7 @@ define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1)
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
-; X32: ## BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
@@ -924,7 +717,7 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
-; X64: ## BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -936,37 +729,21 @@ define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>
}
define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
-; X32-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
-; X32-AVX512: ## BB#0:
-; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
+; X32-LABEL: test_broadcast_4i32_8i32_reuse:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
;
-; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
-; X64-AVX512: ## BB#0:
-; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512-NEXT: retq
+; X64-LABEL: test_broadcast_4i32_8i32_reuse:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x i32> %1, <4 x i32>* %p1
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -974,69 +751,21 @@ define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1)
}
define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
-; X32-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
-; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: retq
+; X32-LABEL: test_broadcast_8i16_16i16_reuse:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
;
-; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: retq
+; X64-LABEL: test_broadcast_8i16_16i16_reuse:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
%1 = load <8 x i16>, <8 x i16> *%p0
store <8 x i16> %1, <8 x i16>* %p1
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1044,69 +773,21 @@ define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p
}
define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
-; X32-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX: ## BB#0:
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
-; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
-; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX-NEXT: retl
-;
-; X32-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX512F: ## BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT: retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX512BW: ## BB#0:
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
-; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
-; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT: retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
-; X32-AVX512DQ: ## BB#0:
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
-; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
-; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT: retl
-;
-; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX: ## BB#0:
-; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
-; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
-; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX-NEXT: retq
-;
-; X64-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT: retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT: retq
+; X32-LABEL: test_broadcast_16i8_32i8_reuse:
+; X32: # %bb.0:
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovaps (%ecx), %xmm0
+; X32-NEXT: vmovaps %xmm0, (%eax)
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
;
-; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT: retq
+; X64-LABEL: test_broadcast_16i8_32i8_reuse:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps (%rdi), %xmm0
+; X64-NEXT: vmovaps %xmm0, (%rsi)
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
%1 = load <16 x i8>, <16 x i8> *%p0
store <16 x i8> %1, <16 x i8>* %p1
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -1119,7 +800,7 @@ define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1)
define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
@@ -1129,37 +810,37 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
-; X32-AVX512F: ## BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
+; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
-; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
-; X32-AVX512BW: ## BB#0:
+; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
+; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
-; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
-; X32-AVX512DQ: ## BB#0:
+; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
+; X32-AVX512DQ-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
-; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
@@ -1167,27 +848,27 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
-; X64-AVX512F: ## BB#0:
-; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
-; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
-; X64-AVX512BW: ## BB#0:
-; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
-; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
-; X64-AVX512DQ: ## BB#0:
-; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; X64-AVX512DQ: # %bb.0:
+; X64-AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x float> zeroinitializer, <4 x float>* %p1
@@ -1197,7 +878,7 @@ define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p
define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
-; X32-AVX: ## BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
@@ -1208,7 +889,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
-; X32-AVX512F: ## BB#0:
+; X32-AVX512F: # %bb.0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
@@ -1218,7 +899,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
-; X32-AVX512BW: ## BB#0:
+; X32-AVX512BW: # %bb.0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
@@ -1228,7 +909,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
-; X32-AVX512DQ: ## BB#0:
+; X32-AVX512DQ: # %bb.0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
@@ -1238,7 +919,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
-; X64-AVX: ## BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
@@ -1247,7 +928,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
-; X64-AVX512F: ## BB#0:
+; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
@@ -1255,7 +936,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
-; X64-AVX512BW: ## BB#0:
+; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
@@ -1263,7 +944,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
-; X64-AVX512DQ: ## BB#0:
+; X64-AVX512DQ: # %bb.0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
@@ -1285,7 +966,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
-; X32-AVX1: ## BB#0: ## %entry
+; X32-AVX1: # %bb.0: # %entry
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0]
; X32-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
@@ -1302,39 +983,39 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; X32-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; X32-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; X32-AVX1-NEXT: vmovups %ymm0, _ga4
-; X32-AVX1-NEXT: vmovups %ymm2, _gb4+32
-; X32-AVX1-NEXT: vmovups %ymm1, _gb4
+; X32-AVX1-NEXT: vmovups %ymm0, ga4
+; X32-AVX1-NEXT: vmovups %ymm2, gb4+32
+; X32-AVX1-NEXT: vmovups %ymm1, gb4
; X32-AVX1-NEXT: vzeroupper
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
-; X32-AVX2: ## BB#0: ## %entry
+; X32-AVX2: # %bb.0: # %entry
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vmovdqu %ymm0, _ga4
-; X32-AVX2-NEXT: vmovdqu %ymm2, _gb4+32
-; X32-AVX2-NEXT: vmovdqu %ymm1, _gb4
+; X32-AVX2-NEXT: vmovdqu %ymm0, ga4
+; X32-AVX2-NEXT: vmovdqu %ymm2, gb4+32
+; X32-AVX2-NEXT: vmovdqu %ymm1, gb4
; X32-AVX2-NEXT: vzeroupper
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
-; X32-AVX512: ## BB#0: ## %entry
-; X32-AVX512-NEXT: vpaddq LCPI26_0, %ymm0, %ymm0
+; X32-AVX512: # %bb.0: # %entry
+; X32-AVX512-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
-; X32-AVX512-NEXT: vmovdqu %ymm0, _ga4
-; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4
+; X32-AVX512-NEXT: vmovdqu %ymm0, ga4
+; X32-AVX512-NEXT: vmovdqu64 %zmm1, gb4
; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
-; X64-AVX1: ## BB#0: ## %entry
+; X64-AVX1: # %bb.0: # %entry
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4]
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
@@ -1353,13 +1034,13 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip)
-; X64-AVX1-NEXT: vmovups %ymm2, _gb4+{{.*}}(%rip)
+; X64-AVX1-NEXT: vmovups %ymm2, gb4+{{.*}}(%rip)
; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
-; X64-AVX2: ## BB#0: ## %entry
+; X64-AVX2: # %bb.0: # %entry
; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
@@ -1367,13 +1048,13 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
-; X64-AVX2-NEXT: vmovdqu %ymm2, _gb4+{{.*}}(%rip)
+; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+{{.*}}(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
-; X64-AVX512: ## BB#0: ## %entry
+; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
@@ -1398,33 +1079,33 @@ entry:
define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
-; X32-AVX: ## BB#0: ## %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
-; X32-AVX-NEXT: vmovupd %ymm0, _ga2
-; X32-AVX-NEXT: vmovupd %ymm2, _gb2+32
-; X32-AVX-NEXT: vmovupd %ymm1, _gb2
+; X32-AVX-NEXT: vmovupd %ymm0, ga2
+; X32-AVX-NEXT: vmovupd %ymm2, gb2+32
+; X32-AVX-NEXT: vmovupd %ymm1, gb2
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
-; X32-AVX512: ## BB#0: ## %entry
+; X32-AVX512: # %bb.0: # %entry
; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X32-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
-; X32-AVX512-NEXT: vmovupd %ymm0, _ga2
-; X32-AVX512-NEXT: vmovupd %zmm1, _gb2
+; X32-AVX512-NEXT: vmovupd %ymm0, ga2
+; X32-AVX512-NEXT: vmovupd %zmm1, gb2
; X32-AVX512-NEXT: vzeroupper
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
-; X64-AVX: ## BB#0: ## %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
@@ -1432,13 +1113,13 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip)
-; X64-AVX-NEXT: vmovupd %ymm2, _gb2+{{.*}}(%rip)
+; X64-AVX-NEXT: vmovupd %ymm2, gb2+{{.*}}(%rip)
; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
-; X64-AVX512: ## BB#0: ## %entry
+; X64-AVX512: # %bb.0: # %entry
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
@@ -1456,3 +1137,547 @@ entry:
store <8 x double> %2, <8 x double>* @gb2, align 8
ret void
}
+
+;
+; Subvector Broadcast from register
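+; (each test splats a register-resident subvector into a wider vector via
+; shufflevector; the splat is built with register inserts such as vinsertf128 /
+; vinsertf64x4 or plain register copies, with no loads)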
+;
+
+define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
+; X32-LABEL: reg_broadcast_2f64_4f64:
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: reg_broadcast_2f64_4f64:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %1
+}
+
+define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_2f64_8f64:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_2f64_8f64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x double> %1
+}
+
+define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_4f64_8f64:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_4f64_8f64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x double> %1
+}
+
+define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
+; X32-LABEL: reg_broadcast_2i64_4i64:
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: reg_broadcast_2i64_4i64:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x i64> %1
+}
+
+define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_2i64_8i64:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_2i64_8i64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i64> %1
+}
+
+define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_4i64_8i64:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_4i64_8i64:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i64> %1
+}
+
+define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
+; X32-LABEL: reg_broadcast_4f32_8f32:
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: reg_broadcast_4f32_8f32:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %1
+}
+
+define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_4f32_16f32:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x float> %1
+}
+
+define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_8f32_16f32:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x float> %1
+}
+
+define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
+; X32-LABEL: reg_broadcast_4i32_8i32:
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: reg_broadcast_4i32_8i32:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i32> %1
+}
+
+define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_4i32_16i32:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <16 x i32> %1
+}
+
+define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_8i32_16i32:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT: retq
+ %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i32> %1
+}
+
+define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
+; X32-LABEL: reg_broadcast_8i16_16i16:
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: reg_broadcast_8i16_16i16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <16 x i16> %1
+}
+
+define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_8i16_32i16:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512F-LABEL: reg_broadcast_8i16_32i16:
+; X32-AVX512F: # %bb.0:
+; X32-AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512F-NEXT: retl
+;
+; X32-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
+; X32-AVX512BW: # %bb.0:
+; X32-AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT: retl
+;
+; X32-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
+; X32-AVX512DQ: # %bb.0:
+; X32-AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512DQ-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512F-LABEL: reg_broadcast_8i16_32i16:
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512F-NEXT: retq
+;
+; X64-AVX512BW-LABEL: reg_broadcast_8i16_32i16:
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT: retq
+;
+; X64-AVX512DQ-LABEL: reg_broadcast_8i16_32i16:
+; X64-AVX512DQ: # %bb.0:
+; X64-AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: retq
+ %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <32 x i16> %1
+}
+
+define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_16i16_32i16:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512F-LABEL: reg_broadcast_16i16_32i16:
+; X32-AVX512F: # %bb.0:
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512F-NEXT: retl
+;
+; X32-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
+; X32-AVX512BW: # %bb.0:
+; X32-AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT: retl
+;
+; X32-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
+; X32-AVX512DQ: # %bb.0:
+; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512DQ-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512F-LABEL: reg_broadcast_16i16_32i16:
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512F-NEXT: retq
+;
+; X64-AVX512BW-LABEL: reg_broadcast_16i16_32i16:
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT: retq
+;
+; X64-AVX512DQ-LABEL: reg_broadcast_16i16_32i16:
+; X64-AVX512DQ: # %bb.0:
+; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: retq
+ %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <32 x i16> %1
+}
+
+define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
+; X32-LABEL: reg_broadcast_16i8_32i8:
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: reg_broadcast_16i8_32i8:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <32 x i8> %1
+}
+
+define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_16i8_64i8:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512F-LABEL: reg_broadcast_16i8_64i8:
+; X32-AVX512F: # %bb.0:
+; X32-AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512F-NEXT: retl
+;
+; X32-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
+; X32-AVX512BW: # %bb.0:
+; X32-AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT: retl
+;
+; X32-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
+; X32-AVX512DQ: # %bb.0:
+; X32-AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512DQ-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512F-LABEL: reg_broadcast_16i8_64i8:
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512F-NEXT: retq
+;
+; X64-AVX512BW-LABEL: reg_broadcast_16i8_64i8:
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT: retq
+;
+; X64-AVX512DQ-LABEL: reg_broadcast_16i8_64i8:
+; X64-AVX512DQ: # %bb.0:
+; X64-AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: retq
+ %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <64 x i8> %1
+}
+
+define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
+; X32-AVX-LABEL: reg_broadcast_32i8_64i8:
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX-NEXT: retl
+;
+; X32-AVX512F-LABEL: reg_broadcast_32i8_64i8:
+; X32-AVX512F: # %bb.0:
+; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512F-NEXT: retl
+;
+; X32-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
+; X32-AVX512BW: # %bb.0:
+; X32-AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT: retl
+;
+; X32-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
+; X32-AVX512DQ: # %bb.0:
+; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X32-AVX512DQ-NEXT: retl
+;
+; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX-NEXT: retq
+;
+; X64-AVX512F-LABEL: reg_broadcast_32i8_64i8:
+; X64-AVX512F: # %bb.0:
+; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512F-NEXT: retq
+;
+; X64-AVX512BW-LABEL: reg_broadcast_32i8_64i8:
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT: retq
+;
+; X64-AVX512DQ-LABEL: reg_broadcast_32i8_64i8:
+; X64-AVX512DQ: # %bb.0:
+; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
+; X64-AVX512DQ-NEXT: retq
+ %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <64 x i8> %1
+}
diff --git a/test/CodeGen/X86/swift-error.ll b/test/CodeGen/X86/swift-error.ll
new file mode 100644
index 000000000000..896166a369e3
--- /dev/null
+++ b/test/CodeGen/X86/swift-error.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple x86_64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s
+
+%swift.error = type opaque
+
+declare swiftcc void @f(%swift.error** swifterror)
+
+define swiftcc void @g(i8*, i8*, i8*, i8*, %swift.error** swifterror %error) {
+entry:
+ call swiftcc void @f(%swift.error** nonnull nocapture swifterror %error)
+ ret void
+}
+
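+; The swifterror argument lives in %r12 and is forwarded to f unchanged, so g
+; should not need to save and restore %r12 around the call.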
+; CHECK-LABEL: g
+; CHECK-NOT: pushq %r12
+; CHECK: callq f
+; CHECK-NOT: popq %r12
+; CHECK: retq
+
diff --git a/test/CodeGen/X86/swiftcc.ll b/test/CodeGen/X86/swiftcc.ll
new file mode 100644
index 000000000000..dc36ee247f11
--- /dev/null
+++ b/test/CodeGen/X86/swiftcc.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple x86_64-unknown-windows-msvc -filetype asm -o - %s | FileCheck %s
+
+define swiftcc void @f() {
+ %1 = alloca i8
+ ret void
+}
+
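+; Even under swiftcc, a Windows function with a local alloca should still emit
+; the usual SEH prologue directives.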
+; CHECK-LABEL: f
+; CHECK: .seh_stackalloc 8
+; CHECK: .seh_endprologue
+
diff --git a/test/CodeGen/X86/switch-bt.ll b/test/CodeGen/X86/switch-bt.ll
index e4fbbeb26c3a..958d053c22f0 100644
--- a/test/CodeGen/X86/switch-bt.ll
+++ b/test/CodeGen/X86/switch-bt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -asm-verbose=false < %s -jump-table-density=40 | FileCheck %s
+; RUN: llc -mtriple=x86_64-- -asm-verbose=false < %s -jump-table-density=40 -switch-peel-threshold=101 | FileCheck %s
; This switch should use bit tests, and the third bit test case is just
; testing for one possible value, so it doesn't need a bt.
diff --git a/test/CodeGen/X86/switch-crit-edge-constant.ll b/test/CodeGen/X86/switch-crit-edge-constant.ll
index e9a208d709ef..888edbcaaec6 100644
--- a/test/CodeGen/X86/switch-crit-edge-constant.ll
+++ b/test/CodeGen/X86/switch-crit-edge-constant.ll
@@ -1,5 +1,5 @@
; PR925
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; CHECK: {{mov.*str1}}
; CHECK-NOT: {{mov.*str1}}
diff --git a/test/CodeGen/X86/switch-default-only.ll b/test/CodeGen/X86/switch-default-only.ll
index 360ace5b787f..4310e40b57a6 100644
--- a/test/CodeGen/X86/switch-default-only.ll
+++ b/test/CodeGen/X86/switch-default-only.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -fast-isel=false -march=x86 < %s | FileCheck %s
+; RUN: llc -O0 -fast-isel=false -mtriple=i686-- < %s | FileCheck %s
; No need for branching when the default and only destination follows
; immediately after the switch.
diff --git a/test/CodeGen/X86/switch-edge-weight.ll b/test/CodeGen/X86/switch-edge-weight.ll
index 3679433c372f..516c254223f1 100644
--- a/test/CodeGen/X86/switch-edge-weight.ll
+++ b/test/CodeGen/X86/switch-edge-weight.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: llc -mtriple=x86_64-- -print-machineinstrs=expand-isel-pseudos %s -o /dev/null 2>&1 | FileCheck %s
declare void @foo(i32)
@@ -31,25 +31,25 @@ sw.epilog:
; Check if weights are correctly assigned to edges generated from switch
; statement.
;
-; CHECK: BB#0:
-; BB#0 to BB#4: [0, 1133] (65 = 60 + 5)
-; BB#0 to BB#5: [1134, UINT32_MAX] (25 = 20 + 5)
-; CHECK: Successors according to CFG: BB#4({{[0-9a-fx/= ]+}}72.22%) BB#5({{[0-9a-fx/= ]+}}27.78%)
+; CHECK: %bb.0:
+; %bb.0 to %bb.4: [0, 1133] (65 = 60 + 5)
+; %bb.0 to %bb.5: [1134, UINT32_MAX] (25 = 20 + 5)
+; CHECK: Successors according to CFG: %bb.4({{[0-9a-fx/= ]+}}72.22%) %bb.5({{[0-9a-fx/= ]+}}27.78%)
;
-; CHECK: BB#4:
-; BB#4 to BB#1: [155, 159] (50)
-; BB#4 to BB#5: [0, 1133] - [155, 159] (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}76.92%) BB#7({{[0-9a-fx/= ]+}}23.08%)
+; CHECK: %bb.4:
+; %bb.4 to %bb.1: [155, 159] (50)
+; %bb.4 to %bb.5: [0, 1133] - [155, 159] (15 = 10 + 5)
+; CHECK: Successors according to CFG: %bb.1({{[0-9a-fx/= ]+}}76.92%) %bb.7({{[0-9a-fx/= ]+}}23.08%)
;
-; CHECK: BB#5:
-; BB#5 to BB#1: {1140} (10)
-; BB#5 to BB#6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}40.00%) BB#6({{[0-9a-fx/= ]+}}60.00%)
+; CHECK: %bb.5:
+; %bb.5 to %bb.1: {1140} (10)
+; %bb.5 to %bb.6: [1134, UINT32_MAX] - {1140} (15 = 10 + 5)
+; CHECK: Successors according to CFG: %bb.1({{[0-9a-fx/= ]+}}40.00%) %bb.6({{[0-9a-fx/= ]+}}60.00%)
;
-; CHECK: BB#6:
-; BB#6 to BB#1: {1134} (10)
-; BB#6 to BB#2: [1134, UINT32_MAX] - {1134, 1140} (5)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}66.67%) BB#2({{[0-9a-fx/= ]+}}33.33%)
+; CHECK: %bb.6:
+; %bb.6 to %bb.1: {1134} (10)
+; %bb.6 to %bb.2: [1134, UINT32_MAX] - {1134, 1140} (5)
+; CHECK: Successors according to CFG: %bb.1({{[0-9a-fx/= ]+}}66.67%) %bb.2({{[0-9a-fx/= ]+}}33.33%)
}
; CHECK-LABEL: test2
@@ -99,19 +99,19 @@ sw.epilog:
; Check if weights are correctly assigned to edges generated from switch
; statement.
;
-; CHECK: BB#0:
-; BB#0 to BB#6: {0} + [15, UINT32_MAX] (5)
-; BB#0 to BB#8: [1, 14] (jump table) (65 = 60 + 5)
-; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}7.14%) BB#8({{[0-9a-fx/= ]+}}92.86%
+; CHECK: %bb.0:
+; %bb.0 to %bb.6: {0} + [15, UINT32_MAX] (5)
+; %bb.0 to %bb.8: [1, 14] (jump table) (65 = 60 + 5)
+; CHECK: Successors according to CFG: %bb.6({{[0-9a-fx/= ]+}}7.14%) %bb.8({{[0-9a-fx/= ]+}}92.86%
;
-; CHECK: BB#8:
-; BB#8 to BB#1: {1} (10)
-; BB#8 to BB#6: [2, 9] (5)
-; BB#8 to BB#2: {10} (10)
-; BB#8 to BB#3: {11} (10)
-; BB#8 to BB#4: {12} (10)
-; BB#8 to BB#5: {13, 14} (20)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}15.38%) BB#6({{[0-9a-fx/= ]+}}7.69%) BB#2({{[0-9a-fx/= ]+}}15.38%) BB#3({{[0-9a-fx/= ]+}}15.38%) BB#4({{[0-9a-fx/= ]+}}15.38%) BB#5({{[0-9a-fx/= ]+}}30.77%)
+; CHECK: %bb.8:
+; %bb.8 to %bb.1: {1} (10)
+; %bb.8 to %bb.6: [2, 9] (5)
+; %bb.8 to %bb.2: {10} (10)
+; %bb.8 to %bb.3: {11} (10)
+; %bb.8 to %bb.4: {12} (10)
+; %bb.8 to %bb.5: {13, 14} (20)
+; CHECK: Successors according to CFG: %bb.1({{[0-9a-fx/= ]+}}15.38%) %bb.6({{[0-9a-fx/= ]+}}7.69%) %bb.2({{[0-9a-fx/= ]+}}15.38%) %bb.3({{[0-9a-fx/= ]+}}15.38%) %bb.4({{[0-9a-fx/= ]+}}15.38%) %bb.5({{[0-9a-fx/= ]+}}30.77%)
}
; CHECK-LABEL: test3
@@ -160,18 +160,18 @@ sw.epilog:
; Check if weights are correctly assigned to edges generated from switch
; statement.
;
-; CHECK: BB#0:
-; BB#0 to BB#6: [0, 9] + [15, UINT32_MAX] {10}
-; BB#0 to BB#8: [10, 14] (jump table) (50)
-; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}16.67%) BB#8({{[0-9a-fx/= ]+}}83.33%)
+; CHECK: %bb.0:
+; %bb.0 to %bb.6: [0, 9] + [15, UINT32_MAX] {10}
+; %bb.0 to %bb.8: [10, 14] (jump table) (50)
+; CHECK: Successors according to CFG: %bb.6({{[0-9a-fx/= ]+}}16.67%) %bb.8({{[0-9a-fx/= ]+}}83.33%)
;
-; CHECK: BB#8:
-; BB#8 to BB#1: {10} (10)
-; BB#8 to BB#2: {11} (10)
-; BB#8 to BB#3: {12} (10)
-; BB#8 to BB#4: {13} (10)
-; BB#8 to BB#5: {14} (10)
-; CHECK: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}20.00%) BB#2({{[0-9a-fx/= ]+}}20.00%) BB#3({{[0-9a-fx/= ]+}}20.00%) BB#4({{[0-9a-fx/= ]+}}20.00%) BB#5({{[0-9a-fx/= ]+}}20.00%)
+; CHECK: %bb.8:
+; %bb.8 to %bb.1: {10} (10)
+; %bb.8 to %bb.2: {11} (10)
+; %bb.8 to %bb.3: {12} (10)
+; %bb.8 to %bb.4: {13} (10)
+; %bb.8 to %bb.5: {14} (10)
+; CHECK: Successors according to CFG: %bb.1({{[0-9a-fx/= ]+}}20.00%) %bb.2({{[0-9a-fx/= ]+}}20.00%) %bb.3({{[0-9a-fx/= ]+}}20.00%) %bb.4({{[0-9a-fx/= ]+}}20.00%) %bb.5({{[0-9a-fx/= ]+}}20.00%)
}
; CHECK-LABEL: test4
@@ -213,15 +213,15 @@ sw.epilog:
; Check if weights are correctly assigned to edges generated from switch
; statement.
;
-; CHECK: BB#0:
-; BB#0 to BB#6: [0, 110] + [116, UINT32_MAX] (20)
-; BB#0 to BB#7: [111, 115] (bit test) (50)
-; CHECK: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}28.57%) BB#7({{[0-9a-fx/= ]+}}71.43%)
+; CHECK: %bb.0:
+; %bb.0 to %bb.6: [0, 110] + [116, UINT32_MAX] (20)
+; %bb.0 to %bb.7: [111, 115] (bit test) (50)
+; CHECK: Successors according to CFG: %bb.6({{[0-9a-fx/= ]+}}28.57%) %bb.7({{[0-9a-fx/= ]+}}71.43%)
;
-; CHECK: BB#7:
-; BB#7 to BB#2: {111, 114, 115} (30)
-; BB#7 to BB#3: {112, 113} (20)
-; CHECK: Successors according to CFG: BB#2({{[0-9a-fx/= ]+}}60.00%) BB#3({{[0-9a-fx/= ]+}}40.00%)
+; CHECK: %bb.7:
+; %bb.7 to %bb.2: {111, 114, 115} (30)
+; %bb.7 to %bb.3: {112, 113} (20)
+; CHECK: Successors according to CFG: %bb.2({{[0-9a-fx/= ]+}}60.00%) %bb.3({{[0-9a-fx/= ]+}}40.00%)
}
; CHECK-LABEL: test5
@@ -270,10 +270,10 @@ sw.epilog:
; Check if weights are correctly assigned to edges generated from switch
; statement.
;
-; CHECK: BB#0:
-; BB#0 to BB#6: [10, UINT32_MAX] (15)
-; BB#0 to BB#8: [4, 20, 28, 36] (jump table) (45)
-; CHECK: Successors according to CFG: BB#8({{[0-9a-fx/= ]+}}25.00%) BB#9({{[0-9a-fx/= ]+}}75.00%)
+; CHECK: %bb.0:
+; %bb.0 to %bb.6: [10, UINT32_MAX] (15)
+; %bb.0 to %bb.8: [4, 20, 28, 36] (jump table) (45)
+; CHECK: Successors according to CFG: %bb.8({{[0-9a-fx/= ]+}}25.00%) %bb.9({{[0-9a-fx/= ]+}}75.00%)
}
!1 = !{!"branch_weights", i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10}
diff --git a/test/CodeGen/X86/switch-jump-table.ll b/test/CodeGen/X86/switch-jump-table.ll
index 6393c688e282..a4564dc2ac70 100644
--- a/test/CodeGen/X86/switch-jump-table.ll
+++ b/test/CodeGen/X86/switch-jump-table.ll
@@ -9,7 +9,7 @@ define void @foo(i32 %x, i32* %to) {
; CHECK: movl 4(%esp), [[REG:%e[a-z]{2}]]
; CHECK: cmpl $3, [[REG]]
; CHECK: ja .LBB0_6
-; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: jmpl *.LJTI0_0(,[[REG]],4)
; CHECK: movl $4
; CHECK: retl
@@ -55,8 +55,8 @@ default:
define void @bar(i32 %x, i32* %to) {
; CHECK-JT-PROB-LABEL: bar:
-; CHECK-JT-PROB: Successors according to CFG: BB#6({{[0-9a-fx/= ]+}}14.29%) BB#8({{[0-9a-fx/= ]+}}85.71%)
-; CHECK-JT-PROB: Successors according to CFG: BB#1({{[0-9a-fx/= ]+}}16.67%) BB#2({{[0-9a-fx/= ]+}}16.67%) BB#3({{[0-9a-fx/= ]+}}16.67%) BB#4({{[0-9a-fx/= ]+}}16.67%) BB#5({{[0-9a-fx/= ]+}}33.33%)
+; CHECK-JT-PROB: Successors according to CFG: %bb.6({{[0-9a-fx/= ]+}}14.29%) %bb.8({{[0-9a-fx/= ]+}}85.71%)
+; CHECK-JT-PROB: Successors according to CFG: %bb.1({{[0-9a-fx/= ]+}}16.67%) %bb.2({{[0-9a-fx/= ]+}}16.67%) %bb.3({{[0-9a-fx/= ]+}}16.67%) %bb.4({{[0-9a-fx/= ]+}}16.67%) %bb.5({{[0-9a-fx/= ]+}}33.33%)
entry:
switch i32 %x, label %default [
diff --git a/test/CodeGen/X86/switch-lower-peel-top-case.ll b/test/CodeGen/X86/switch-lower-peel-top-case.ll
new file mode 100644
index 000000000000..8a169c418367
--- /dev/null
+++ b/test/CodeGen/X86/switch-lower-peel-top-case.ll
@@ -0,0 +1,135 @@
+; RUN: llc -mtriple=x86_64-linux-gnu -stop-after=expand-isel-pseudos < %s | FileCheck %s
+
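+; Test that the case with the dominant profile weight (18568, weight 70000 in
+; !2) is peeled off and compared before the rest of the switch is lowered.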
+define i32 @foo(i32 %n) !prof !1 {
+entry:
+ switch i32 %n, label %bb_default [
+ i32 8, label %bb1
+ i32 -8826, label %bb2
+ i32 18312, label %bb3
+ i32 18568, label %bb4
+ i32 129, label %bb5
+ ], !prof !2
+
+; CHECK: successors: %[[PEELED_CASE_LABEL:.*]](0x5999999a), %[[PEELED_SWITCH_LABEL:.*]](0x26666666)
+; CHECK: %[[VAL:[0-9]+]]:gr32 = COPY %edi
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri %[[VAL]], 18568, implicit-def %eflags
+; CHECK: JE_1 %[[PEELED_CASE_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[PEELED_SWITCH_LABEL]]
+; CHECK: [[PEELED_SWITCH_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[BB1_LABEL:.*]](0x0206d3a0), %[[BB2_LABEL:.*]](0x7df92c60)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri %[[VAL]], 18311, implicit-def %eflags
+; CHECK: JG_1 %[[BB2_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB1_LABEL]]
+; CHECK: [[BB1_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE2_LABEL:.*]](0x35e50d5b), %[[BB3_LABEL:.*]](0x4a1af2a5)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri %[[VAL]], -8826, implicit-def %eflags
+; CHECK: JE_1 %[[CASE2_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB3_LABEL]]
+; CHECK: [[BB3_LABEL]]
+; CHECK: successors: %[[CASE5_LABEL:.*]](0x45d173c8), %[[BB4_LABEL:.*]](0x3a2e8c38)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri %[[VAL]], 129, implicit-def %eflags
+; CHECK: JE_1 %[[CASE5_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB4_LABEL]]
+; CHECK: [[BB4_LABEL:.*]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE1_LABEL:.*]](0x66666666), %[[DEFAULT_BB_LABEL:.*]](0x1999999a)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], 8, implicit-def %eflags
+; CHECK: JE_1 %[[CASE1_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[DEFAULT_BB_LABEL]]
+; CHECK: [[BB2_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE3_LABEL:.*]](0x7fe44107), %[[DEFAULT_BB_LABEL]](0x001bbef9)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri %[[VAL]], 18312, implicit-def %eflags
+; CHECK: JE_1 %[[CASE3_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[DEFAULT_BB_LABEL]]
+
+bb1:
+ br label %return
+bb2:
+ br label %return
+bb3:
+ br label %return
+bb4:
+ br label %return
+bb5:
+ br label %return
+bb_default:
+ br label %return
+
+return:
+ %retval = phi i32 [ 0, %bb_default ], [ 5, %bb5 ], [ 4, %bb4 ], [ 3, %bb3 ], [ 2, %bb2 ], [ 1, %bb1 ]
+ ret i32 %retval
+}
+
+; Test the peeling of the merged cases value 85 and 86.
+define i32 @foo1(i32 %n) !prof !1 {
+entry:
+ switch i32 %n, label %bb_default [
+ i32 -40, label %bb1
+ i32 86, label %bb2
+ i32 85, label %bb2
+ i32 1, label %bb3
+ i32 5, label %bb4
+ i32 7, label %bb5
+ i32 49, label %bb6
+ ], !prof !3
+
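+; The peeled cases 85 and 86 are handled as a single range check: the value is
+; biased by -85 and one unsigned compare against 2 selects the peeled block.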
+; CHECK: successors: %[[PEELED_CASE_LABEL:.*]](0x59999999), %[[PEELED_SWITCH_LABEL:.*]](0x26666667)
+; CHECK: %[[VAL:[0-9]+]]:gr32 = COPY %edi
+; CHECK: %{{[0-9]+}}:gr32 = ADD32ri8 %{{[0-9]+}}, -85, implicit-def dead %eflags
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %{{[0-9]+}}, 2, implicit-def %eflags
+; CHECK: JB_1 %[[PEELED_CASE_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[PEELED_SWITCH_LABEL]]
+; CHECK: [[PEELED_SWITCH_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[BB1_LABEL:.*]](0x0088888a), %[[BB2_LABEL:.*]](0x7f777776)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], 4, implicit-def %eflags
+; CHECK: JG_1 %[[BB2_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB1_LABEL]]
+; CHECK: [[BB1_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE4_LABEL:.*]](0x7f775a4f), %[[BB3_LABEL:.*]](0x0088a5b1)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], 1, implicit-def %eflags
+; CHECK: JE_1 %[[CASE4_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB3_LABEL]]
+; CHECK: [[BB3_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE1_LABEL:.*]](0x66666666), %[[DEFAULT_BB_LABEL:.*]](0x1999999a)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], -40, implicit-def %eflags
+; CHECK: JE_1 %[[CASE1_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[DEFAULT_BB_LABEL]]
+; CHECK: [[BB2_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE5_LABEL:.*]](0x00000000), %[[BB4_LABEL:.*]](0x80000000)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], 5, implicit-def %eflags
+; CHECK: JE_1 %[[CASE5_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB4_LABEL]]
+; CHECK: [[BB4_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE6_LABEL:.*]](0x00000000), %[[BB5_LABEL:.*]](0x80000000)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], 7, implicit-def %eflags
+; CHECK: JE_1 %[[CASE6_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[BB5_LABEL]]
+; CHECK: [[BB5_LABEL]].{{[a-zA-Z0-9.]+}}:
+; CHECK: successors: %[[CASE7_LABEL:.*]](0x00000000), %[[DEFAULT_BB_LABEL]](0x80000000)
+; CHECK: %{{[0-9]+}}:gr32 = SUB32ri8 %[[VAL]], 49, implicit-def %eflags
+; CHECK: JE_1 %[[CASE7_LABEL]], implicit %eflags
+; CHECK: JMP_1 %[[DEFAULT_BB_LABEL]]
+
+
+bb1:
+ br label %return
+bb2:
+ br label %return
+bb3:
+ br label %return
+bb4:
+ br label %return
+bb5:
+ br label %return
+bb6:
+ br label %return
+bb_default:
+ br label %return
+
+return:
+ %retval = phi i32 [ 0, %bb_default ], [ 6, %bb6 ], [ 5, %bb5 ], [ 4, %bb4 ], [ 3, %bb3 ], [ 2, %bb2 ], [ 1, %bb1 ]
+ ret i32 %retval
+}
+!1 = !{!"function_entry_count", i64 100000}
+!2 = !{!"branch_weights", i32 50, i32 100, i32 200, i32 29500, i32 70000, i32 150}
+!3 = !{!"branch_weights", i32 50, i32 100, i32 500, i32 69500, i32 29850, i32 0, i32 0, i32 0}
+
diff --git a/test/CodeGen/X86/switch-or.ll b/test/CodeGen/X86/switch-or.ll
index 4642accfff8d..c0501c315bcd 100644
--- a/test/CodeGen/X86/switch-or.ll
+++ b/test/CodeGen/X86/switch-or.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -asm-verbose=false < %s | FileCheck %s
; Check that merging switch cases that differ in one bit works.
; CHECK-LABEL: test1
diff --git a/test/CodeGen/X86/switch-zextload.ll b/test/CodeGen/X86/switch-zextload.ll
index 2dd3f0e3ae72..68cfade0484e 100644
--- a/test/CodeGen/X86/switch-zextload.ll
+++ b/test/CodeGen/X86/switch-zextload.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 1
+; RUN: llc < %s | grep mov | count 1
; Do zextload, instead of a load and a separate zext.
diff --git a/test/CodeGen/X86/switch.ll b/test/CodeGen/X86/switch.ll
index 5d52f95e71cc..95b2ed0e618f 100644
--- a/test/CodeGen/X86/switch.ll
+++ b/test/CodeGen/X86/switch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=40 -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -jump-table-density=40 -switch-peel-threshold=101 -verify-machineinstrs | FileCheck %s
; RUN: llc -mtriple=x86_64-linux-gnu %s -o - -O0 -jump-table-density=40 -verify-machineinstrs | FileCheck --check-prefix=NOOPT %s
declare void @g(i32)
@@ -432,9 +432,9 @@ sw:
; Branch directly to the default.
; (In optimized builds the switch is removed earlier.)
; NOOPT-LABEL: default_only
-; NOOPT: .[[L:[A-Z0-9_]+]]:
+; NOOPT: .LBB[[L:[A-Z0-9_]+]]:
; NOOPT-NEXT: retq
-; NOOPT: jmp .[[L]]
+; NOOPT: jmp .LBB[[L]]
}
diff --git a/test/CodeGen/X86/swizzle-2.ll b/test/CodeGen/X86/swizzle-2.ll
index fd81573edec9..dad6a4d7d4fc 100644
--- a/test/CodeGen/X86/swizzle-2.ll
+++ b/test/CodeGen/X86/swizzle-2.ll
@@ -11,7 +11,7 @@
define <4 x i32> @swizzle_1(<4 x i32> %v) {
; CHECK-LABEL: swizzle_1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
@@ -21,7 +21,7 @@ define <4 x i32> @swizzle_1(<4 x i32> %v) {
define <4 x i32> @swizzle_2(<4 x i32> %v) {
; CHECK-LABEL: swizzle_2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
@@ -31,7 +31,7 @@ define <4 x i32> @swizzle_2(<4 x i32> %v) {
define <4 x i32> @swizzle_3(<4 x i32> %v) {
; CHECK-LABEL: swizzle_3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
@@ -41,7 +41,7 @@ define <4 x i32> @swizzle_3(<4 x i32> %v) {
define <4 x i32> @swizzle_4(<4 x i32> %v) {
; CHECK-LABEL: swizzle_4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
@@ -51,7 +51,7 @@ define <4 x i32> @swizzle_4(<4 x i32> %v) {
define <4 x i32> @swizzle_5(<4 x i32> %v) {
; CHECK-LABEL: swizzle_5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
@@ -61,7 +61,7 @@ define <4 x i32> @swizzle_5(<4 x i32> %v) {
define <4 x i32> @swizzle_6(<4 x i32> %v) {
; CHECK-LABEL: swizzle_6:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,1,3]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -71,7 +71,7 @@ define <4 x i32> @swizzle_6(<4 x i32> %v) {
define <4 x i32> @swizzle_7(<4 x i32> %v) {
; CHECK-LABEL: swizzle_7:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,1]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -81,7 +81,7 @@ define <4 x i32> @swizzle_7(<4 x i32> %v) {
define <4 x i32> @swizzle_8(<4 x i32> %v) {
; CHECK-LABEL: swizzle_8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
@@ -91,7 +91,7 @@ define <4 x i32> @swizzle_8(<4 x i32> %v) {
define <4 x i32> @swizzle_9(<4 x i32> %v) {
; CHECK-LABEL: swizzle_9:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
@@ -101,7 +101,7 @@ define <4 x i32> @swizzle_9(<4 x i32> %v) {
define <4 x i32> @swizzle_10(<4 x i32> %v) {
; CHECK-LABEL: swizzle_10:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,2,0,3]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
@@ -111,7 +111,7 @@ define <4 x i32> @swizzle_10(<4 x i32> %v) {
define <4 x i32> @swizzle_11(<4 x i32> %v) {
; CHECK-LABEL: swizzle_11:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -121,7 +121,7 @@ define <4 x i32> @swizzle_11(<4 x i32> %v) {
define <4 x i32> @swizzle_12(<4 x i32> %v) {
; CHECK-LABEL: swizzle_12:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
@@ -131,7 +131,7 @@ define <4 x i32> @swizzle_12(<4 x i32> %v) {
define <4 x i32> @swizzle_13(<4 x i32> %v) {
; CHECK-LABEL: swizzle_13:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
@@ -141,7 +141,7 @@ define <4 x i32> @swizzle_13(<4 x i32> %v) {
define <4 x i32> @swizzle_14(<4 x i32> %v) {
; CHECK-LABEL: swizzle_14:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,2,1]
; CHECK-NEXT: retq
%1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -151,7 +151,7 @@ define <4 x i32> @swizzle_14(<4 x i32> %v) {
define <4 x float> @swizzle_15(<4 x float> %v) {
; CHECK-LABEL: swizzle_15:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
@@ -161,7 +161,7 @@ define <4 x float> @swizzle_15(<4 x float> %v) {
define <4 x float> @swizzle_16(<4 x float> %v) {
; CHECK-LABEL: swizzle_16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,1,3,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 0, i32 2>
@@ -171,7 +171,7 @@ define <4 x float> @swizzle_16(<4 x float> %v) {
define <4 x float> @swizzle_17(<4 x float> %v) {
; CHECK-LABEL: swizzle_17:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 1, i32 0>
@@ -181,7 +181,7 @@ define <4 x float> @swizzle_17(<4 x float> %v) {
define <4 x float> @swizzle_18(<4 x float> %v) {
; CHECK-LABEL: swizzle_18:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,0,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 0>
@@ -191,7 +191,7 @@ define <4 x float> @swizzle_18(<4 x float> %v) {
define <4 x float> @swizzle_19(<4 x float> %v) {
; CHECK-LABEL: swizzle_19:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
@@ -201,7 +201,7 @@ define <4 x float> @swizzle_19(<4 x float> %v) {
define <4 x float> @swizzle_20(<4 x float> %v) {
; CHECK-LABEL: swizzle_20:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
@@ -211,7 +211,7 @@ define <4 x float> @swizzle_20(<4 x float> %v) {
define <4 x float> @swizzle_21(<4 x float> %v) {
; CHECK-LABEL: swizzle_21:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
@@ -221,7 +221,7 @@ define <4 x float> @swizzle_21(<4 x float> %v) {
define <4 x float> @swizzle_22(<4 x float> %v) {
; CHECK-LABEL: swizzle_22:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 1>
@@ -231,7 +231,7 @@ define <4 x float> @swizzle_22(<4 x float> %v) {
define <4 x float> @swizzle_23(<4 x float> %v) {
; CHECK-LABEL: swizzle_23:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
@@ -241,7 +241,7 @@ define <4 x float> @swizzle_23(<4 x float> %v) {
define <4 x float> @swizzle_24(<4 x float> %v) {
; CHECK-LABEL: swizzle_24:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2,0,3]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
@@ -251,7 +251,7 @@ define <4 x float> @swizzle_24(<4 x float> %v) {
define <4 x float> @swizzle_25(<4 x float> %v) {
; CHECK-LABEL: swizzle_25:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
@@ -261,7 +261,7 @@ define <4 x float> @swizzle_25(<4 x float> %v) {
define <4 x float> @swizzle_26(<4 x float> %v) {
; CHECK-LABEL: swizzle_26:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,1,2]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
@@ -271,7 +271,7 @@ define <4 x float> @swizzle_26(<4 x float> %v) {
define <4 x float> @swizzle_27(<4 x float> %v) {
; CHECK-LABEL: swizzle_27:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
@@ -281,7 +281,7 @@ define <4 x float> @swizzle_27(<4 x float> %v) {
define <4 x float> @swizzle_28(<4 x float> %v) {
; CHECK-LABEL: swizzle_28:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,2,1]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
@@ -291,7 +291,7 @@ define <4 x float> @swizzle_28(<4 x float> %v) {
define <4 x float> @swizzle_29(<4 x float> %v) {
; CHECK-LABEL: swizzle_29:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,2,0]
; CHECK-NEXT: retq
%1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
@@ -304,7 +304,7 @@ define <4 x float> @swizzle_29(<4 x float> %v) {
define <8 x i16> @swizzle_30(<8 x i16> %v) {
; CHECK-LABEL: swizzle_30:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
@@ -314,7 +314,7 @@ define <8 x i16> @swizzle_30(<8 x i16> %v) {
define <8 x i16> @swizzle_31(<8 x i16> %v) {
; CHECK-LABEL: swizzle_31:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,0,4,5,6,7]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 5, i32 6, i32 4>
@@ -324,7 +324,7 @@ define <8 x i16> @swizzle_31(<8 x i16> %v) {
define <8 x i16> @swizzle_32(<8 x i16> %v) {
; CHECK-LABEL: swizzle_32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 7, i32 5, i32 6, i32 4>
@@ -334,7 +334,7 @@ define <8 x i16> @swizzle_32(<8 x i16> %v) {
define <8 x i16> @swizzle_33(<8 x i16> %v) {
; CHECK-LABEL: swizzle_33:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
; CHECK-NEXT: retq
@@ -345,7 +345,7 @@ define <8 x i16> @swizzle_33(<8 x i16> %v) {
define <8 x i16> @swizzle_34(<8 x i16> %v) {
; CHECK-LABEL: swizzle_34:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,0,2,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
; CHECK-NEXT: retq
@@ -356,7 +356,7 @@ define <8 x i16> @swizzle_34(<8 x i16> %v) {
define <8 x i16> @swizzle_35(<8 x i16> %v) {
; CHECK-LABEL: swizzle_35:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,6]
; CHECK-NEXT: retq
@@ -367,7 +367,7 @@ define <8 x i16> @swizzle_35(<8 x i16> %v) {
define <8 x i16> @swizzle_36(<8 x i16> %v) {
; CHECK-LABEL: swizzle_36:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; CHECK-NEXT: retq
@@ -378,7 +378,7 @@ define <8 x i16> @swizzle_36(<8 x i16> %v) {
define <8 x i16> @swizzle_37(<8 x i16> %v) {
; CHECK-LABEL: swizzle_37:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 7, i32 5, i32 6, i32 4>
@@ -388,7 +388,7 @@ define <8 x i16> @swizzle_37(<8 x i16> %v) {
define <8 x i16> @swizzle_38(<8 x i16> %v) {
; CHECK-LABEL: swizzle_38:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
; CHECK-NEXT: retq
@@ -399,7 +399,7 @@ define <8 x i16> @swizzle_38(<8 x i16> %v) {
define <8 x i16> @swizzle_39(<8 x i16> %v) {
; CHECK-LABEL: swizzle_39:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,1,0,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
; CHECK-NEXT: retq
@@ -410,7 +410,7 @@ define <8 x i16> @swizzle_39(<8 x i16> %v) {
define <8 x i16> @swizzle_40(<8 x i16> %v) {
; CHECK-LABEL: swizzle_40:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; CHECK-NEXT: retq
@@ -421,7 +421,7 @@ define <8 x i16> @swizzle_40(<8 x i16> %v) {
define <8 x i16> @swizzle_41(<8 x i16> %v) {
; CHECK-LABEL: swizzle_41:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; CHECK-NEXT: retq
@@ -432,7 +432,7 @@ define <8 x i16> @swizzle_41(<8 x i16> %v) {
define <8 x i16> @swizzle_42(<8 x i16> %v) {
; CHECK-LABEL: swizzle_42:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 2, i32 7, i32 6, i32 4, i32 5>
diff --git a/test/CodeGen/X86/swizzle-avx2.ll b/test/CodeGen/X86/swizzle-avx2.ll
index 6ca9126eb09d..14244c3f8c74 100644
--- a/test/CodeGen/X86/swizzle-avx2.ll
+++ b/test/CodeGen/X86/swizzle-avx2.ll
@@ -13,9 +13,9 @@
define <8 x i32> @swizzle_1(<8 x i32> %v) {
; CHECK-LABEL: swizzle_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7]
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,2,0,4,5,6,7]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 7, i32 5, i32 6, i32 4>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 7, i32 5, i32 6, i32 4>
@@ -24,8 +24,8 @@ define <8 x i32> @swizzle_1(<8 x i32> %v) {
define <8 x i32> @swizzle_2(<8 x i32> %v) {
; CHECK-LABEL: swizzle_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>
@@ -34,8 +34,8 @@ define <8 x i32> @swizzle_2(<8 x i32> %v) {
define <8 x i32> @swizzle_3(<8 x i32> %v) {
; CHECK-LABEL: swizzle_3:
-; CHECK: # BB#0:
-; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 2, i32 3, i32 0, i32 1>
@@ -44,9 +44,9 @@ define <8 x i32> @swizzle_3(<8 x i32> %v) {
define <8 x i32> @swizzle_4(<8 x i32> %v) {
; CHECK-LABEL: swizzle_4:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7]
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,1,2,0,6,5,4,7]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 4, i32 7, i32 5, i32 6, i32 3, i32 2, i32 0, i32 1>
@@ -55,9 +55,9 @@ define <8 x i32> @swizzle_4(<8 x i32> %v) {
define <8 x i32> @swizzle_5(<8 x i32> %v) {
; CHECK-LABEL: swizzle_5:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5]
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,1,2,7,6,4,5]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 7, i32 4, i32 6, i32 5, i32 0, i32 2, i32 1, i32 3>
@@ -66,9 +66,9 @@ define <8 x i32> @swizzle_5(<8 x i32> %v) {
define <8 x i32> @swizzle_6(<8 x i32> %v) {
; CHECK-LABEL: swizzle_6:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7]
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,1,0,2,4,5,6,7]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 0, i32 4, i32 7, i32 6, i32 5>
@@ -77,9 +77,9 @@ define <8 x i32> @swizzle_6(<8 x i32> %v) {
define <8 x i32> @swizzle_7(<8 x i32> %v) {
; CHECK-LABEL: swizzle_7:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7]
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,3,1,4,5,6,7]
+; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
%1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
%2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i32 6, i32 7>
diff --git a/test/CodeGen/X86/system-intrinsics-xgetbv.ll b/test/CodeGen/X86/system-intrinsics-xgetbv.ll
index a5ba026c8e63..ffabd3a23090 100644
--- a/test/CodeGen/X86/system-intrinsics-xgetbv.ll
+++ b/test/CodeGen/X86/system-intrinsics-xgetbv.ll
@@ -18,4 +18,4 @@ define i64 @test_xgetbv(i32 %in) {
ret i64 %1;
}
-declare i64 @llvm.x86.xgetbv(i32) \ No newline at end of file
+declare i64 @llvm.x86.xgetbv(i32)
diff --git a/test/CodeGen/X86/tail-call-conditional.mir b/test/CodeGen/X86/tail-call-conditional.mir
index e006138ba848..300b2734f52f 100644
--- a/test/CodeGen/X86/tail-call-conditional.mir
+++ b/test/CodeGen/X86/tail-call-conditional.mir
@@ -48,7 +48,7 @@ body: |
; CHECK-NEXT: %rdi = COPY %rsi
; CHECK-NEXT: %rsi = COPY %rax
; CHECK-NEXT: CMP64ri8 %rax, 9, implicit-def %eflags
- ; CHECK-NEXT: TCRETURNdi64cc @f1, 0, 3, csr_64, implicit %rsp, implicit %eflags, implicit %rsp, implicit %rdi, implicit %rsi, implicit %rax, implicit-def %rax, implicit %sil, implicit-def %sil, implicit %si, implicit-def %si, implicit %esi, implicit-def %esi, implicit %rsi, implicit-def %rsi, implicit %dil, implicit-def %dil, implicit %di, implicit-def %di, implicit %edi, implicit-def %edi, implicit %rdi, implicit-def %rdi, implicit %ah, implicit-def %ah, implicit %al, implicit-def %al, implicit %ax, implicit-def %ax, implicit %eax, implicit-def %eax
+ ; CHECK-NEXT: TCRETURNdi64cc @f1, 0, 3, csr_64, implicit %rsp, implicit %eflags, implicit %ssp, implicit %rsp, implicit %rdi, implicit %rsi, implicit %rax, implicit-def %rax, implicit %sil, implicit-def %sil, implicit %si, implicit-def %si, implicit %esi, implicit-def %esi, implicit %rsi, implicit-def %rsi, implicit %dil, implicit-def %dil, implicit %di, implicit-def %di, implicit %edi, implicit-def %edi, implicit %rdi, implicit-def %rdi, implicit %ah, implicit-def %ah, implicit %al, implicit-def %al, implicit %ax, implicit-def %ax, implicit %eax, implicit-def %eax
bb.1:
successors: %bb.2, %bb.3
diff --git a/test/CodeGen/X86/tail-call-legality.ll b/test/CodeGen/X86/tail-call-legality.ll
index 119610430b18..a5ae82db11fa 100644
--- a/test/CodeGen/X86/tail-call-legality.ll
+++ b/test/CodeGen/X86/tail-call-legality.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -o - < %s | FileCheck %s
+; RUN: llc -mtriple=i686-- -o - < %s | FileCheck %s
; This used to be classified as a tail call because of a mismatch in the
; arguments seen by Analysis.cpp and ISelLowering. As seen by ISelLowering, they
diff --git a/test/CodeGen/X86/tail-dup-debugloc.ll b/test/CodeGen/X86/tail-dup-debugloc.ll
index c5ca6fc5750c..4907e5244b60 100644
--- a/test/CodeGen/X86/tail-dup-debugloc.ll
+++ b/test/CodeGen/X86/tail-dup-debugloc.ll
@@ -1,10 +1,10 @@
-; RUN: llc -stop-after=tailduplication -march=x86-64 < %s | FileCheck %s
+; RUN: llc -stop-after=tailduplication < %s | FileCheck %s
;
-; Check that DebugLoc attached to the branch instruction of
+; Check that DebugLoc attached to the branch instruction of
; 'while.cond1.preheader.lr.ph' survives after tailduplication pass.
;
; CHECK: [[DLOC:![0-9]+]] = !DILocation(line: 9, column: 5, scope: !{{[0-9]+}})
-; CHECK: [[VREG:%[^ ]+]] = COPY %rdi
+; CHECK: [[VREG:%[^ ]+]]:gr64 = COPY %rdi
; CHECK: TEST64rr [[VREG]], [[VREG]]
; CHECK-NEXT: JE_1 {{.+}}, debug-location [[DLOC]]
; CHECK-NEXT: JMP_1 {{.+}}, debug-location [[DLOC]]
diff --git a/test/CodeGen/X86/tail-dup-repeat.ll b/test/CodeGen/X86/tail-dup-repeat.ll
index 7d9c0908e571..d635565c88c4 100644
--- a/test/CodeGen/X86/tail-dup-repeat.ll
+++ b/test/CodeGen/X86/tail-dup-repeat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+; RUN: llc -O3 -o - %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/tail-merge-after-mbp.mir b/test/CodeGen/X86/tail-merge-after-mbp.mir
index d1dc65336948..042ac72eead4 100644
--- a/test/CodeGen/X86/tail-merge-after-mbp.mir
+++ b/test/CodeGen/X86/tail-merge-after-mbp.mir
@@ -4,28 +4,28 @@
# check loop bb.7 is not merged with bb.10, bb.13
# check loop bb.9 is not merged with bb.12
# CHECK: bb.2:
-# CHECK-NEXT: successors: %bb.9(0x30000000), %bb.3(0x50000000)
-# CHECK: %rax = MOV64rm %r14, 1, _, 0, _
+# CHECK-NEXT: successors: %bb.3(0x30000000), %bb.4(0x50000000)
+# CHECK: %rax = MOV64rm %r14, 1, %noreg, 0, %noreg
# CHECK-NEXT: TEST64rr %rax, %rax
-# CHECK-NEXT: JE_1 %bb.9
-# CHECK: bb.3:
-# CHECK-NEXT: successors: %bb.4(0x30000000), %bb.8(0x50000000)
-# CHECK: CMP64mi8 killed %rax, 1, _, 8, _, 0
-# CHECK-NEXT: JNE_1 %bb.8
+# CHECK-NEXT: JE_1 %bb.3
# CHECK: bb.4:
-# CHECK-NEXT: successors: %bb.9(0x30000000), %bb.5(0x50000000)
-# CHECK: %rax = MOV64rm %r14, 1, _, 0, _
+# CHECK-NEXT: successors: %bb.5(0x30000000), %bb.10(0x50000000)
+# CHECK: CMP64mi8 killed %rax, 1, %noreg, 8, %noreg, 0
+# CHECK-NEXT: JNE_1 %bb.10
+# CHECK: bb.5:
+# CHECK-NEXT: successors: %bb.6(0x30000000), %bb.7(0x50000000)
+# CHECK: %rax = MOV64rm %r14, 1, %noreg, 0, %noreg
# CHECK-NEXT: TEST64rr %rax, %rax
-# CHECK-NEXT: JE_1 %bb.9
-# CHECK: bb.5
-# CHECK-NEXT: successors: %bb.6(0x71555555), %bb.8(0x0eaaaaab)
-# CHECK: CMP64mi8 killed %rax, 1, _, 8, _, 0
-# CHECK-NEXT: JNE_1 %bb.8
-# CHECK: bb.6:
-# CHECK-NEXT: successors: %bb.9(0x04000000), %bb.5(0x7c000000)
-# CHECK: %rax = MOV64rm %r14, 1, _, 0, _
+# CHECK-NEXT: JE_1 %bb.6
+# CHECK: bb.7
+# CHECK-NEXT: successors: %bb.8(0x71555555), %bb.10(0x0eaaaaab)
+# CHECK: CMP64mi8 killed %rax, 1, %noreg, 8, %noreg, 0
+# CHECK-NEXT: JNE_1 %bb.10
+# CHECK: bb.8:
+# CHECK-NEXT: successors: %bb.9(0x04000000), %bb.7(0x7c000000)
+# CHECK: %rax = MOV64rm %r14, 1, %noreg, 0, %noreg
# CHECK-NEXT: TEST64rr %rax, %rax
-# CHECK-NEXT: JNE_1 %bb.5
+# CHECK-NEXT: JNE_1 %bb.7
name: foo
body: |
@@ -44,7 +44,7 @@ body: |
bb.7:
successors: %bb.8(0x30000000), %bb.9(0x50000000)
- %rax = MOV64rm %r14, 1, _, 0, _ :: (load 8)
+ %rax = MOV64rm %r14, 1, %noreg, 0, %noreg :: (load 8)
TEST64rr %rax, %rax, implicit-def %eflags
JNE_1 %bb.9, implicit killed %eflags
@@ -57,13 +57,13 @@ body: |
bb.9:
successors: %bb.10(0x30000000), %bb.15(0x50000000)
- CMP64mi8 killed %rax, 1, _, 8, _, 0, implicit-def %eflags :: (load 8)
+ CMP64mi8 killed %rax, 1, %noreg, 8, %noreg, 0, implicit-def %eflags :: (load 8)
JNE_1 %bb.15, implicit %eflags
bb.10:
successors: %bb.11(0x30000000), %bb.12(0x50000000)
- %rax = MOV64rm %r14, 1, _, 0, _ :: (load 8)
+ %rax = MOV64rm %r14, 1, %noreg, 0, %noreg :: (load 8)
TEST64rr %rax, %rax, implicit-def %eflags
JNE_1 %bb.12, implicit %eflags
@@ -76,13 +76,13 @@ body: |
bb.12:
successors: %bb.13(0x71555555), %bb.15(0x0eaaaaab)
- CMP64mi8 killed %rax, 1, _, 8, _, 0, implicit-def %eflags :: (load 8), (load 8)
+ CMP64mi8 killed %rax, 1, %noreg, 8, %noreg, 0, implicit-def %eflags :: (load 8), (load 8)
JNE_1 %bb.15, implicit %eflags
bb.13:
successors: %bb.14(0x04000000), %bb.12(0x7c000000)
- %rax = MOV64rm %r14, 1, _, 0, _ :: (load 8)
+ %rax = MOV64rm %r14, 1, %noreg, 0, %noreg :: (load 8)
TEST64rr %rax, %rax, implicit-def %eflags
JNE_1 %bb.12, implicit %eflags
diff --git a/test/CodeGen/X86/tail-merge-debugloc.ll b/test/CodeGen/X86/tail-merge-debugloc.ll
index 197b0b803257..85ba0ab62617 100644
--- a/test/CodeGen/X86/tail-merge-debugloc.ll
+++ b/test/CodeGen/X86/tail-merge-debugloc.ll
@@ -6,7 +6,7 @@
; location info.
;
; CHECK: [[DLOC:![0-9]+]] = !DILocation(line: 2, column: 2, scope: !{{[0-9]+}})
-; CHECK: TEST64rr{{.*}}%rsi, %rsi, implicit-def %eflags
+; CHECK: TEST64rr{{.*}}%rsi, renamable %rsi, implicit-def %eflags
; CHECK-NEXT: JNE_1{{.*}}, debug-location [[DLOC]]
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index 96ff33ff5f7d..28749b33cfa0 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -asm-verbose=false -post-RA-scheduler=true | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -asm-verbose=false -post-RA-scheduler=true | FileCheck %s
declare void @bar(i32)
declare void @car(i32)
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index 9e054fea5b35..65395a0947ae 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -181,8 +181,8 @@ define { i64, i64 } @crash(i8* %this) {
; Check that we can fold an indexed load into a tail call instruction.
; CHECK: fold_indexed_load
-; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]]
-; CHECK: jmpq *16(%{{r..}},%[[RAX]],8) ## TAILCALL
+; CHECK: leaq (%rsi,%rsi,4), %[[rax:r..]]
+; CHECK: jmpq *16(%{{r..}},%[[rax]],8) ## TAILCALL
%struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 }
@func_table = external global [0 x %struct.funcs]
define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp {
diff --git a/test/CodeGen/X86/tailcall-calleesave.ll b/test/CodeGen/X86/tailcall-calleesave.ll
index c748bcef36dc..1bb330bbe6a0 100644
--- a/test/CodeGen/X86/tailcall-calleesave.ll
+++ b/test/CodeGen/X86/tailcall-calleesave.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86 -tailcallopt -mcpu=core < %s | FileCheck %s
+; RUN: llc -tailcallopt -mcpu=core < %s | FileCheck %s
target triple = "i686-apple-darwin"
diff --git a/test/CodeGen/X86/tailcall-mem-intrinsics.ll b/test/CodeGen/X86/tailcall-mem-intrinsics.ll
index 8e1e4f464baa..7491ea659ba2 100644
--- a/test/CodeGen/X86/tailcall-mem-intrinsics.ll
+++ b/test/CodeGen/X86/tailcall-mem-intrinsics.ll
@@ -24,6 +24,30 @@ entry:
ret void
}
+; CHECK-LABEL: tail_memcpy_ret
+; CHECK: jmp memcpy
+define i8* @tail_memcpy_ret(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret i8* %p
+}
+
+; CHECK-LABEL: tail_memmove_ret
+; CHECK: jmp memmove
+define i8* @tail_memmove_ret(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret i8* %p
+}
+
+; CHECK-LABEL: tail_memset_ret
+; CHECK: jmp memset
+define i8* @tail_memset_ret(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+ tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+ ret i8* %p
+}
+
declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
diff --git a/test/CodeGen/X86/tailcall-returndup-void.ll b/test/CodeGen/X86/tailcall-returndup-void.ll
index 62c40164d798..62dd053c2a81 100644
--- a/test/CodeGen/X86/tailcall-returndup-void.ll
+++ b/test/CodeGen/X86/tailcall-returndup-void.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK: rBM_info
; CHECK-NOT: ret
diff --git a/test/CodeGen/X86/tailcall.ll b/test/CodeGen/X86/tailcall.ll
index 36a38e0b69d0..1f49aee900aa 100644
--- a/test/CodeGen/X86/tailcall.ll
+++ b/test/CodeGen/X86/tailcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | grep TAILCALL | count 7
+; RUN: llc < %s -mtriple=i686-- -tailcallopt | grep TAILCALL | count 7
; With -tailcallopt, CodeGen guarantees a tail call optimization
; for all of these.
diff --git a/test/CodeGen/X86/tailcallfp.ll b/test/CodeGen/X86/tailcallfp.ll
index 22a7930ba877..e42d5d5a9be9 100644
--- a/test/CodeGen/X86/tailcallfp.ll
+++ b/test/CodeGen/X86/tailcallfp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -tailcallopt | FileCheck %s
define fastcc i32 @bar(i32 %X, i32(double, i32) *%FP) {
%Y = tail call fastcc i32 %FP(double 0.0, i32 %X)
ret i32 %Y
diff --git a/test/CodeGen/X86/tailcallfp2.ll b/test/CodeGen/X86/tailcallfp2.ll
index 9ef0d27f7de6..d2ed0e62f58b 100644
--- a/test/CodeGen/X86/tailcallfp2.ll
+++ b/test/CodeGen/X86/tailcallfp2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -tailcallopt | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -tailcallopt | FileCheck %s
declare i32 @putchar(i32)
diff --git a/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
index f6c49cab71b2..72ff630b9677 100644
--- a/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
+++ b/test/CodeGen/X86/tbm-intrinsics-fast-isel-x86_64.ll
@@ -5,7 +5,7 @@
define i64 @test__bextri_u64(i64 %a0) {
; X64-LABEL: test__bextri_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bextr $1, %rdi, %rax
; X64-NEXT: retq
%1 = call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 1)
@@ -14,7 +14,7 @@ define i64 @test__bextri_u64(i64 %a0) {
define i64 @test__blcfill_u64(i64 %a0) {
; X64-LABEL: test__blcfill_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 1(%rdi), %rax
; X64-NEXT: andq %rdi, %rax
; X64-NEXT: retq
@@ -25,7 +25,7 @@ define i64 @test__blcfill_u64(i64 %a0) {
define i64 @test__blci_u64(i64 %a0) {
; X64-LABEL: test__blci_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 1(%rdi), %rax
; X64-NEXT: xorq $-1, %rax
; X64-NEXT: orq %rdi, %rax
@@ -38,7 +38,7 @@ define i64 @test__blci_u64(i64 %a0) {
define i64 @test__blcic_u64(i64 %a0) {
; X64-LABEL: test__blcic_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorq $-1, %rax
; X64-NEXT: addq $1, %rdi
@@ -53,7 +53,7 @@ define i64 @test__blcic_u64(i64 %a0) {
define i64 @test__blcmsk_u64(i64 %a0) {
; X64-LABEL: test__blcmsk_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 1(%rdi), %rax
; X64-NEXT: xorq %rdi, %rax
; X64-NEXT: retq
@@ -64,7 +64,7 @@ define i64 @test__blcmsk_u64(i64 %a0) {
define i64 @test__blcs_u64(i64 %a0) {
; X64-LABEL: test__blcs_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: leaq 1(%rdi), %rax
; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
@@ -75,7 +75,7 @@ define i64 @test__blcs_u64(i64 %a0) {
define i64 @test__blsfill_u64(i64 %a0) {
; X64-LABEL: test__blsfill_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: subq $1, %rax
; X64-NEXT: orq %rdi, %rax
@@ -87,7 +87,7 @@ define i64 @test__blsfill_u64(i64 %a0) {
define i64 @test__blsic_u64(i64 %a0) {
; X64-LABEL: test__blsic_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorq $-1, %rax
; X64-NEXT: subq $1, %rdi
@@ -102,7 +102,7 @@ define i64 @test__blsic_u64(i64 %a0) {
define i64 @test__t1mskc_u64(i64 %a0) {
; X64-LABEL: test__t1mskc_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorq $-1, %rax
; X64-NEXT: addq $1, %rdi
@@ -117,7 +117,7 @@ define i64 @test__t1mskc_u64(i64 %a0) {
define i64 @test__tzmsk_u64(i64 %a0) {
; X64-LABEL: test__tzmsk_u64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorq $-1, %rax
; X64-NEXT: subq $1, %rdi
diff --git a/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll b/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
index a264adffe790..74084df8bb72 100644
--- a/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define i32 @test__bextri_u32(i32 %a0) {
; X32-LABEL: test__bextri_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: bextr $1, {{[0-9]+}}(%esp), %eax
; X32-NEXT: retl
;
; X64-LABEL: test__bextri_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: bextr $1, %edi, %eax
; X64-NEXT: retq
%1 = call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 1)
@@ -20,15 +20,15 @@ define i32 @test__bextri_u32(i32 %a0) {
define i32 @test__blcfill_u32(i32 %a0) {
; X32-LABEL: test__blcfill_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: leal 1(%ecx), %eax
; X32-NEXT: andl %ecx, %eax
; X32-NEXT: retl
;
; X64-LABEL: test__blcfill_u32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal 1(%rdi), %eax
; X64-NEXT: andl %edi, %eax
; X64-NEXT: retq
@@ -39,7 +39,7 @@ define i32 @test__blcfill_u32(i32 %a0) {
define i32 @test__blci_u32(i32 %a0) {
; X32-LABEL: test__blci_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: leal 1(%ecx), %eax
; X32-NEXT: xorl $-1, %eax
@@ -47,8 +47,8 @@ define i32 @test__blci_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blci_u32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal 1(%rdi), %eax
; X64-NEXT: xorl $-1, %eax
; X64-NEXT: orl %edi, %eax
@@ -61,7 +61,7 @@ define i32 @test__blci_u32(i32 %a0) {
define i32 @test__blcic_u32(i32 %a0) {
; X32-LABEL: test__blcic_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: xorl $-1, %ecx
@@ -70,7 +70,7 @@ define i32 @test__blcic_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blcic_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl $-1, %eax
; X64-NEXT: addl $1, %edi
@@ -85,15 +85,15 @@ define i32 @test__blcic_u32(i32 %a0) {
define i32 @test__blcmsk_u32(i32 %a0) {
; X32-LABEL: test__blcmsk_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: leal 1(%ecx), %eax
; X32-NEXT: xorl %ecx, %eax
; X32-NEXT: retl
;
; X64-LABEL: test__blcmsk_u32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal 1(%rdi), %eax
; X64-NEXT: xorl %edi, %eax
; X64-NEXT: retq
@@ -104,15 +104,15 @@ define i32 @test__blcmsk_u32(i32 %a0) {
define i32 @test__blcs_u32(i32 %a0) {
; X32-LABEL: test__blcs_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: leal 1(%ecx), %eax
; X32-NEXT: orl %ecx, %eax
; X32-NEXT: retl
;
; X64-LABEL: test__blcs_u32:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: leal 1(%rdi), %eax
; X64-NEXT: orl %edi, %eax
; X64-NEXT: retq
@@ -123,7 +123,7 @@ define i32 @test__blcs_u32(i32 %a0) {
define i32 @test__blsfill_u32(i32 %a0) {
; X32-LABEL: test__blsfill_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl %ecx, %eax
; X32-NEXT: subl $1, %eax
@@ -131,7 +131,7 @@ define i32 @test__blsfill_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blsfill_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: subl $1, %eax
; X64-NEXT: orl %edi, %eax
@@ -143,7 +143,7 @@ define i32 @test__blsfill_u32(i32 %a0) {
define i32 @test__blsic_u32(i32 %a0) {
; X32-LABEL: test__blsic_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: xorl $-1, %ecx
@@ -152,7 +152,7 @@ define i32 @test__blsic_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__blsic_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl $-1, %eax
; X64-NEXT: subl $1, %edi
@@ -167,7 +167,7 @@ define i32 @test__blsic_u32(i32 %a0) {
define i32 @test__t1mskc_u32(i32 %a0) {
; X32-LABEL: test__t1mskc_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: xorl $-1, %ecx
@@ -176,7 +176,7 @@ define i32 @test__t1mskc_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__t1mskc_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl $-1, %eax
; X64-NEXT: addl $1, %edi
@@ -191,7 +191,7 @@ define i32 @test__t1mskc_u32(i32 %a0) {
define i32 @test__tzmsk_u32(i32 %a0) {
; X32-LABEL: test__tzmsk_u32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl %eax, %ecx
; X32-NEXT: xorl $-1, %ecx
@@ -200,7 +200,7 @@ define i32 @test__tzmsk_u32(i32 %a0) {
; X32-NEXT: retl
;
; X64-LABEL: test__tzmsk_u32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
; X64-NEXT: xorl $-1, %eax
; X64-NEXT: subl $1, %edi
diff --git a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
index 12218cc8ec4a..3c2e62276e59 100644
--- a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
@@ -1,10 +1,12 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+tbm < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
+; CHECK-LABEL: test_x86_tbm_bextri_u32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $2814, %edi, %eax # imm = 0xAFE
+; CHECK-NEXT: retq
entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
%0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 2814)
ret i32 %0
}
@@ -12,20 +14,35 @@ entry:
declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $2814, (%rdi), %eax # imm = 0xAFE
+; CHECK-NEXT: retq
entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
%tmp1 = load i32, i32* %a, align 4
%0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %tmp1, i32 2814)
ret i32 %0
}
+define i32 @test_x86_tbm_bextri_u32_z(i32 %a, i32 %b) nounwind readonly {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_z:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $2814, %edi, %eax # imm = 0xAFE
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 2814)
+ %1 = icmp eq i32 %0, 0
+ %2 = select i1 %1, i32 %b, i32 %0
+ ret i32 %2
+}
+
define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
+; CHECK-LABEL: test_x86_tbm_bextri_u64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $2814, %rdi, %rax # imm = 0xAFE
+; CHECK-NEXT: retq
entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
%0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 2814)
ret i64 %0
}
@@ -33,11 +50,25 @@ entry:
declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $2814, (%rdi), %rax # imm = 0xAFE
+; CHECK-NEXT: retq
entry:
- ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
- ; CHECK-NOT: mov
- ; CHECK: bextr $
%tmp1 = load i64, i64* %a, align 8
%0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %tmp1, i64 2814)
ret i64 %0
}
+
+define i64 @test_x86_tbm_bextri_u64_z(i64 %a, i64 %b) nounwind readnone {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_z:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $2814, %rdi, %rax # imm = 0xAFE
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 2814)
+ %1 = icmp eq i64 %0, 0
+ %2 = select i1 %1, i64 %b, i64 %0
+ ret i64 %2
+}
diff --git a/test/CodeGen/X86/tbm-schedule.ll b/test/CodeGen/X86/tbm-schedule.ll
new file mode 100644
index 000000000000..9bb7870506fe
--- /dev/null
+++ b/test/CodeGen/X86/tbm-schedule.ll
@@ -0,0 +1,489 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+tbm | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
+
+define i32 @test_x86_tbm_bextri_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_bextri_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bextr $3076, %edi, %ecx # imm = 0xC04
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: bextr $3076, (%rsi), %eax # imm = 0xC04
+; GENERIC-NEXT: # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_bextri_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: bextr $3076, %edi, %ecx # imm = 0xC04
+; BDVER-NEXT: bextr $3076, (%rsi), %eax # imm = 0xC04
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = lshr i32 %a0, 4
+ %m0 = lshr i32 %a1, 4
+ %r1 = and i32 %r0, 4095
+ %m1 = and i32 %m0, 4095
+ %res = add i32 %r1, %m1
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_bextri_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: bextr $3076, %edi, %ecx # imm = 0xC04
+; GENERIC-NEXT: # sched: [1:0.33]
+; GENERIC-NEXT: bextr $3076, (%rsi), %eax # imm = 0xC04
+; GENERIC-NEXT: # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_bextri_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: bextr $3076, %edi, %ecx # imm = 0xC04
+; BDVER-NEXT: bextr $3076, (%rsi), %eax # imm = 0xC04
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = lshr i64 %a0, 4
+ %m0 = lshr i64 %a1, 4
+ %r1 = and i64 %r0, 4095
+ %m1 = and i64 %m0, 4095
+ %res = add i64 %r1, %m1
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blcfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcfill_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcfill %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blcfill (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcfill_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcfill %edi, %ecx
+; BDVER-NEXT: blcfill (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = add i32 %a0, 1
+ %m0 = add i32 %a1, 1
+ %r1 = and i32 %r0, %a0
+ %m1 = and i32 %m0, %a1
+ %res = add i32 %r1, %m1
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blcfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcfill_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcfill %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blcfill (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcfill_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcfill %rdi, %rcx
+; BDVER-NEXT: blcfill (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = add i64 %a0, 1
+ %m0 = add i64 %a1, 1
+ %r1 = and i64 %r0, %a0
+ %m1 = and i64 %m0, %a1
+ %res = add i64 %r1, %m1
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blci_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blci_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blci %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blci (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blci_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blci %edi, %ecx
+; BDVER-NEXT: blci (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = add i32 1, %a0
+ %m0 = add i32 1, %a1
+ %r1 = xor i32 %r0, -1
+ %m1 = xor i32 %m0, -1
+ %r2 = or i32 %r1, %a0
+ %m2 = or i32 %m1, %a1
+ %res = add i32 %r2, %m2
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blci_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blci_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blci %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blci (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blci_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blci %rdi, %rcx
+; BDVER-NEXT: blci (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = add i64 1, %a0
+ %m0 = add i64 1, %a1
+ %r1 = xor i64 %r0, -1
+ %m1 = xor i64 %m0, -1
+ %r2 = or i64 %r1, %a0
+ %m2 = or i64 %m1, %a1
+ %res = add i64 %r2, %m2
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blcic_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcic_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcic %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blcic (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcic_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcic %edi, %ecx
+; BDVER-NEXT: blcic (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = xor i32 %a0, -1
+ %m0 = xor i32 %a1, -1
+ %r1 = add i32 %a0, 1
+ %m1 = add i32 %a1, 1
+ %r2 = and i32 %r1, %r0
+ %m2 = and i32 %m1, %m0
+ %res = add i32 %r2, %m2
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blcic_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcic_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcic %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blcic (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcic_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcic %rdi, %rcx
+; BDVER-NEXT: blcic (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = xor i64 %a0, -1
+ %m0 = xor i64 %a1, -1
+ %r1 = add i64 %a0, 1
+ %m1 = add i64 %a1, 1
+ %r2 = and i64 %r1, %r0
+ %m2 = and i64 %m1, %m0
+ %res = add i64 %r2, %m2
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blcmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcmsk_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcmsk %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blcmsk (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcmsk_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcmsk %edi, %ecx
+; BDVER-NEXT: blcmsk (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = add i32 %a0, 1
+ %m0 = add i32 %a1, 1
+ %r1 = xor i32 %r0, %a0
+ %m1 = xor i32 %m0, %a1
+ %res = add i32 %r1, %m1
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blcmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcmsk_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcmsk %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blcmsk (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcmsk_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcmsk %rdi, %rcx
+; BDVER-NEXT: blcmsk (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = add i64 %a0, 1
+ %m0 = add i64 %a1, 1
+ %r1 = xor i64 %r0, %a0
+ %m1 = xor i64 %m0, %a1
+ %res = add i64 %r1, %m1
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blcs_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcs_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcs %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blcs (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcs_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcs %edi, %ecx
+; BDVER-NEXT: blcs (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = add i32 %a0, 1
+ %m0 = add i32 %a1, 1
+ %r1 = or i32 %r0, %a0
+ %m1 = or i32 %m0, %a1
+ %res = add i32 %r1, %m1
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blcs_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blcs_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blcs %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blcs (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blcs_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blcs %rdi, %rcx
+; BDVER-NEXT: blcs (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = add i64 %a0, 1
+ %m0 = add i64 %a1, 1
+ %r1 = or i64 %r0, %a0
+ %m1 = or i64 %m0, %a1
+ %res = add i64 %r1, %m1
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blsfill_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blsfill_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsfill %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blsfill (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blsfill_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blsfill %edi, %ecx
+; BDVER-NEXT: blsfill (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = add i32 %a0, -1
+ %m0 = add i32 %a1, -1
+ %r1 = or i32 %r0, %a0
+ %m1 = or i32 %m0, %a1
+ %res = add i32 %r1, %m1
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blsfill_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blsfill_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsfill %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blsfill (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blsfill_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blsfill %rdi, %rcx
+; BDVER-NEXT: blsfill (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = add i64 %a0, -1
+ %m0 = add i64 %a1, -1
+ %r1 = or i64 %r0, %a0
+ %m1 = or i64 %m0, %a1
+ %res = add i64 %r1, %m1
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_blsic_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blsic_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsic %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: blsic (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blsic_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blsic %edi, %ecx
+; BDVER-NEXT: blsic (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = xor i32 %a0, -1
+ %m0 = xor i32 %a1, -1
+ %r1 = add i32 %a0, -1
+ %m1 = add i32 %a1, -1
+ %r2 = or i32 %r0, %r1
+ %m2 = or i32 %m0, %m1
+ %res = add i32 %r2, %m2
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_blsic_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_blsic_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: blsic %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: blsic (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_blsic_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: blsic %rdi, %rcx
+; BDVER-NEXT: blsic (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = xor i64 %a0, -1
+ %m0 = xor i64 %a1, -1
+ %r1 = add i64 %a0, -1
+ %m1 = add i64 %a1, -1
+ %r2 = or i64 %r0, %r1
+ %m2 = or i64 %m0, %m1
+ %res = add i64 %r2, %m2
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_t1mskc_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_t1mskc_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: t1mskc %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: t1mskc (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_t1mskc_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: t1mskc %edi, %ecx
+; BDVER-NEXT: t1mskc (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = xor i32 %a0, -1
+ %m0 = xor i32 %a1, -1
+ %r1 = add i32 %a0, 1
+ %m1 = add i32 %a1, 1
+ %r2 = or i32 %r0, %r1
+ %m2 = or i32 %m0, %m1
+ %res = add i32 %r2, %m2
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_t1mskc_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_t1mskc_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: t1mskc %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: t1mskc (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_t1mskc_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: t1mskc %rdi, %rcx
+; BDVER-NEXT: t1mskc (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = xor i64 %a0, -1
+ %m0 = xor i64 %a1, -1
+ %r1 = add i64 %a0, 1
+ %m1 = add i64 %a1, 1
+ %r2 = or i64 %r0, %r1
+ %m2 = or i64 %m0, %m1
+ %res = add i64 %r2, %m2
+ ret i64 %res
+}
+
+define i32 @test_x86_tbm_tzmsk_u32(i32 %a0, i32* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_tzmsk_u32:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: tzmsk %edi, %ecx # sched: [1:0.33]
+; GENERIC-NEXT: tzmsk (%rsi), %eax # sched: [5:0.50]
+; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_tzmsk_u32:
+; BDVER: # %bb.0:
+; BDVER-NEXT: tzmsk %edi, %ecx
+; BDVER-NEXT: tzmsk (%rsi), %eax
+; BDVER-NEXT: addl %ecx, %eax
+; BDVER-NEXT: retq
+ %a1 = load i32, i32* %p1
+ %r0 = xor i32 %a0, -1
+ %m0 = xor i32 %a1, -1
+ %r1 = add i32 %a0, -1
+ %m1 = add i32 %a1, -1
+ %r2 = and i32 %r0, %r1
+ %m2 = and i32 %m0, %m1
+ %res = add i32 %r2, %m2
+ ret i32 %res
+}
+
+define i64 @test_x86_tbm_tzmsk_u64(i64 %a0, i64* nocapture %p1) nounwind {
+; GENERIC-LABEL: test_x86_tbm_tzmsk_u64:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: tzmsk %rdi, %rcx # sched: [1:0.33]
+; GENERIC-NEXT: tzmsk (%rsi), %rax # sched: [5:0.50]
+; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_x86_tbm_tzmsk_u64:
+; BDVER: # %bb.0:
+; BDVER-NEXT: tzmsk %rdi, %rcx
+; BDVER-NEXT: tzmsk (%rsi), %rax
+; BDVER-NEXT: addq %rcx, %rax
+; BDVER-NEXT: retq
+ %a1 = load i64, i64* %p1
+ %r0 = xor i64 %a0, -1
+ %m0 = xor i64 %a1, -1
+ %r1 = add i64 %a0, -1
+ %m1 = add i64 %a1, -1
+ %r2 = and i64 %r0, %r1
+ %m2 = and i64 %m0, %m1
+ %res = add i64 %r2, %m2
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
index 5ce6bbd4b49e..5cf98b9b73a7 100644
--- a/test/CodeGen/X86/tbm_patterns.ll
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -1,9 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
+; TODO - Patterns fail to fold with ZF flags and prevent TBM instruction selection.
+
define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
; CHECK-NEXT: retq
%t0 = lshr i32 %a, 4
@@ -11,9 +13,21 @@ define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind {
ret i32 %t1
}
+; Make sure we still use AH subreg trick for extracting bits 15:8
+define i32 @test_x86_tbm_bextri_u32_subreg(i32 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_subreg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: retq
+ %t0 = lshr i32 %a, 8
+ %t1 = and i32 %t0, 255
+ ret i32 %t1
+}
+
define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
; CHECK-NEXT: retq
%t0 = load i32, i32* %a
@@ -22,9 +36,37 @@ define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind {
ret i32 %t2
}
+define i32 @test_x86_tbm_bextri_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = lshr i32 %a, 4
+ %t1 = and i32 %t0, 4095
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %t1
+ ret i32 %t3
+}
+
+define i32 @test_x86_tbm_bextri_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: shrl $4, %edi
+; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = lshr i32 %a, 4
+ %t1 = and i32 %t0, 4095
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %c
+ ret i32 %t3
+}
+
define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
; CHECK-NEXT: retq
%t0 = lshr i64 %a, 4
@@ -32,9 +74,21 @@ define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind {
ret i64 %t1
}
+; Make sure we still use AH subreg trick for extracting bits 15:8
+define i64 @test_x86_tbm_bextri_u64_subreg(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_subreg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movzbl %ah, %eax # NOREX
+; CHECK-NEXT: retq
+ %t0 = lshr i64 %a, 8
+ %t1 = and i64 %t0, 255
+ ret i64 %t1
+}
+
define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: bextr $3076, (%rdi), %eax # imm = 0xC04
; CHECK-NEXT: retq
%t0 = load i64, i64* %a
@@ -43,9 +97,37 @@ define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind {
ret i64 %t2
}
+define i64 @test_x86_tbm_bextri_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: bextr $3076, %edi, %eax # imm = 0xC04
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = lshr i64 %a, 4
+ %t1 = and i64 %t0, 4095
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %t1
+ ret i64 %t3
+}
+
+define i64 @test_x86_tbm_bextri_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_bextri_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: shrl $4, %edi
+; CHECK-NEXT: testl $4095, %edi # imm = 0xFFF
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = lshr i64 %a, 4
+ %t1 = and i64 %t0, 4095
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %c
+ ret i64 %t3
+}
+
define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcfill_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcfill %edi, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
@@ -53,9 +135,38 @@ define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind {
ret i32 %t1
}
+define i32 @test_x86_tbm_blcfill_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcfill %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = and i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %t1
+ ret i32 %t3
+}
+
+define i32 @test_x86_tbm_blcfill_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal 1(%rdi), %eax
+; CHECK-NEXT: testl %edi, %eax
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = and i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %c
+ ret i32 %t3
+}
+
define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcfill_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcfill %rdi, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
@@ -63,9 +174,37 @@ define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind {
ret i64 %t1
}
+define i64 @test_x86_tbm_blcfill_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcfill %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = and i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %t1
+ ret i64 %t3
+}
+
+define i64 @test_x86_tbm_blcfill_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcfill_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq 1(%rdi), %rax
+; CHECK-NEXT: testq %rdi, %rax
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = and i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %c
+ ret i64 %t3
+}
+
define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blci_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blci %edi, %eax
; CHECK-NEXT: retq
%t0 = add i32 1, %a
@@ -74,9 +213,41 @@ define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind {
ret i32 %t2
}
+define i32 @test_x86_tbm_blci_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blci %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 1, %a
+ %t1 = xor i32 %t0, -1
+ %t2 = or i32 %t1, %a
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %t2
+ ret i32 %t4
+}
+
+define i32 @test_x86_tbm_blci_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal 1(%rdi), %eax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: orl %edi, %eax
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 1, %a
+ %t1 = xor i32 %t0, -1
+ %t2 = or i32 %t1, %a
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %c
+ ret i32 %t4
+}
+
define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blci_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blci %rdi, %rax
; CHECK-NEXT: retq
%t0 = add i64 1, %a
@@ -85,9 +256,40 @@ define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind {
ret i64 %t2
}
+define i64 @test_x86_tbm_blci_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blci %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 1, %a
+ %t1 = xor i64 %t0, -1
+ %t2 = or i64 %t1, %a
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %t2
+ ret i64 %t4
+}
+
+define i64 @test_x86_tbm_blci_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blci_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq 1(%rdi), %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: orq %rdi, %rax
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 1, %a
+ %t1 = xor i64 %t0, -1
+ %t2 = or i64 %t1, %a
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %c
+ ret i64 %t4
+}
+
define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blci_u32_b:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blci %edi, %eax
; CHECK-NEXT: retq
%t0 = sub i32 -2, %a
@@ -97,7 +299,7 @@ define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind {
define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blci_u64_b:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blci %rdi, %rax
; CHECK-NEXT: retq
%t0 = sub i64 -2, %a
@@ -107,7 +309,7 @@ define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind {
define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcic_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcic %edi, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
@@ -116,9 +318,41 @@ define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind {
ret i32 %t2
}
+define i32 @test_x86_tbm_blcic_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcic %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = and i32 %t1, %t0
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %t2
+ ret i32 %t4
+}
+
+define i32 @test_x86_tbm_blcic_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: incl %edi
+; CHECK-NEXT: testl %eax, %edi
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = and i32 %t1, %t0
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %c
+ ret i32 %t4
+}
+
define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcic_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcic %rdi, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1
@@ -127,9 +361,41 @@ define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind {
ret i64 %t2
}
+define i64 @test_x86_tbm_blcic_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcic %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = and i64 %t1, %t0
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %t2
+ ret i64 %t4
+}
+
+define i64 @test_x86_tbm_blcic_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcic_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: testq %rax, %rdi
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = and i64 %t1, %t0
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %c
+ ret i64 %t4
+}
+
define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcmsk %edi, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
@@ -137,9 +403,38 @@ define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind {
ret i32 %t1
}
+define i32 @test_x86_tbm_blcmsk_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcmsk %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = xor i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %t1
+ ret i32 %t3
+}
+
+define i32 @test_x86_tbm_blcmsk_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal 1(%rdi), %eax
+; CHECK-NEXT: xorl %edi, %eax
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = xor i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %c
+ ret i32 %t3
+}
+
define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcmsk %rdi, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
@@ -147,9 +442,37 @@ define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind {
ret i64 %t1
}
+define i64 @test_x86_tbm_blcmsk_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcmsk %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = xor i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %t1
+ ret i64 %t3
+}
+
+define i64 @test_x86_tbm_blcmsk_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcmsk_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq 1(%rdi), %rax
+; CHECK-NEXT: xorq %rdi, %rax
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = xor i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %c
+ ret i64 %t3
+}
+
define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcs_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcs %edi, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, 1
@@ -157,9 +480,38 @@ define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind {
ret i32 %t1
}
+define i32 @test_x86_tbm_blcs_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcs %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = or i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %t1
+ ret i32 %t3
+}
+
+define i32 @test_x86_tbm_blcs_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal 1(%rdi), %eax
+; CHECK-NEXT: orl %edi, %eax
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, 1
+ %t1 = or i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %c
+ ret i32 %t3
+}
+
define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blcs_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blcs %rdi, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, 1
@@ -167,9 +519,37 @@ define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind {
ret i64 %t1
}
+define i64 @test_x86_tbm_blcs_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blcs %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = or i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %t1
+ ret i64 %t3
+}
+
+define i64 @test_x86_tbm_blcs_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blcs_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq 1(%rdi), %rax
+; CHECK-NEXT: orq %rdi, %rax
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, 1
+ %t1 = or i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %c
+ ret i64 %t3
+}
+
define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blsfill_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsfill %edi, %eax
; CHECK-NEXT: retq
%t0 = add i32 %a, -1
@@ -177,9 +557,38 @@ define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind {
ret i32 %t1
}
+define i32 @test_x86_tbm_blsfill_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blsfill %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, -1
+ %t1 = or i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %t1
+ ret i32 %t3
+}
+
+define i32 @test_x86_tbm_blsfill_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def %edi killed %edi def %rdi
+; CHECK-NEXT: leal -1(%rdi), %eax
+; CHECK-NEXT: orl %edi, %eax
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = add i32 %a, -1
+ %t1 = or i32 %t0, %a
+ %t2 = icmp eq i32 %t1, 0
+ %t3 = select i1 %t2, i32 %b, i32 %c
+ ret i32 %t3
+}
+
define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blsfill_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsfill %rdi, %rax
; CHECK-NEXT: retq
%t0 = add i64 %a, -1
@@ -187,9 +596,37 @@ define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind {
ret i64 %t1
}
+define i64 @test_x86_tbm_blsfill_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blsfill %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, -1
+ %t1 = or i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %t1
+ ret i64 %t3
+}
+
+define i64 @test_x86_tbm_blsfill_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsfill_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq -1(%rdi), %rax
+; CHECK-NEXT: orq %rdi, %rax
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = add i64 %a, -1
+ %t1 = or i64 %t0, %a
+ %t2 = icmp eq i64 %t1, 0
+ %t3 = select i1 %t2, i64 %b, i64 %c
+ ret i64 %t3
+}
+
define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blsic_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsic %edi, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
@@ -198,9 +635,41 @@ define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind {
ret i32 %t2
}
+define i32 @test_x86_tbm_blsic_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blsic %edi, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = or i32 %t0, %t1
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %t2
+ ret i32 %t4
+}
+
+define i32 @test_x86_tbm_blsic_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = or i32 %t0, %t1
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %c
+ ret i32 %t4
+}
+
define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_blsic_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: blsic %rdi, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1
@@ -209,9 +678,41 @@ define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind {
ret i64 %t2
}
+define i64 @test_x86_tbm_blsic_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: blsic %rdi, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = or i64 %t0, %t1
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %t2
+ ret i64 %t4
+}
+
+define i64 @test_x86_tbm_blsic_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_blsic_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: decq %rdi
+; CHECK-NEXT: orq %rax, %rdi
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = or i64 %t0, %t1
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %c
+ ret i64 %t4
+}
+
define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: t1mskc %edi, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
@@ -220,9 +721,42 @@ define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind {
ret i32 %t2
}
-define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind {
-; CHECK-LABEL: Ttest_x86_tbm_t1mskc_u64:
-; CHECK: # BB#0:
+define i32 @test_x86_tbm_t1mskc_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: t1mskc %edi, %eax
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = or i32 %t0, %t1
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %t2
+ ret i32 %t4
+}
+
+define i32 @test_x86_tbm_t1mskc_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: incl %edi
+; CHECK-NEXT: orl %eax, %edi
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, 1
+ %t2 = or i32 %t0, %t1
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %c
+ ret i32 %t4
+}
+
+define i64 @test_x86_tbm_t1mskc_u64(i64 %a) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u64:
+; CHECK: # %bb.0:
; CHECK-NEXT: t1mskc %rdi, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1
@@ -231,9 +765,42 @@ define i64 @Ttest_x86_tbm_t1mskc_u64(i64 %a) nounwind {
ret i64 %t2
}
+define i64 @test_x86_tbm_t1mskc_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: t1mskc %rdi, %rax
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = or i64 %t0, %t1
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %t2
+ ret i64 %t4
+}
+
+define i64 @test_x86_tbm_t1mskc_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_t1mskc_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: incq %rdi
+; CHECK-NEXT: orq %rax, %rdi
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, 1
+ %t2 = or i64 %t0, %t1
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %c
+ ret i64 %t4
+}
+
define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzmsk %edi, %eax
; CHECK-NEXT: retq
%t0 = xor i32 %a, -1
@@ -242,9 +809,42 @@ define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind {
ret i32 %t2
}
+define i32 @test_x86_tbm_tzmsk_u32_z(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: tzmsk %edi, %eax
+; CHECK-NEXT: testl %eax, %eax
+; CHECK-NEXT: cmovel %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = and i32 %t0, %t1
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %t2
+ ret i32 %t4
+}
+
+define i32 @test_x86_tbm_tzmsk_u32_z2(i32 %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u32_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: notl %eax
+; CHECK-NEXT: decl %edi
+; CHECK-NEXT: testl %edi, %eax
+; CHECK-NEXT: cmovnel %edx, %esi
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: retq
+ %t0 = xor i32 %a, -1
+ %t1 = add i32 %a, -1
+ %t2 = and i32 %t0, %t1
+ %t3 = icmp eq i32 %t2, 0
+ %t4 = select i1 %t3, i32 %b, i32 %c
+ ret i32 %t4
+}
+
define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind {
; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: tzmsk %rdi, %rax
; CHECK-NEXT: retq
%t0 = xor i64 %a, -1
@@ -253,3 +853,56 @@ define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind {
ret i64 %t2
}
+define i64 @test_x86_tbm_tzmsk_u64_z(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u64_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: tzmsk %rdi, %rax
+; CHECK-NEXT: testq %rax, %rax
+; CHECK-NEXT: cmoveq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = and i64 %t0, %t1
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %t2
+ ret i64 %t4
+}
+
+define i64 @test_x86_tbm_tzmsk_u64_z2(i64 %a, i64 %b, i64 %c) nounwind {
+; CHECK-LABEL: test_x86_tbm_tzmsk_u64_z2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: notq %rax
+; CHECK-NEXT: decq %rdi
+; CHECK-NEXT: testq %rdi, %rax
+; CHECK-NEXT: cmovneq %rdx, %rsi
+; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: retq
+ %t0 = xor i64 %a, -1
+ %t1 = add i64 %a, -1
+ %t2 = and i64 %t0, %t1
+ %t3 = icmp eq i64 %t2, 0
+ %t4 = select i1 %t3, i64 %b, i64 %c
+ ret i64 %t4
+}
+
+define i64 @test_and_large_constant_mask(i64 %x) {
+; CHECK-LABEL: test_and_large_constant_mask:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $15872, %rdi, %rax # imm = 0x3E00
+; CHECK-NEXT: retq
+entry:
+ %and = and i64 %x, 4611686018427387903
+ ret i64 %and
+}
+
+define i64 @test_and_large_constant_mask_load(i64* %x) {
+; CHECK-LABEL: test_and_large_constant_mask_load:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: bextr $15872, (%rdi), %rax # imm = 0x3E00
+; CHECK-NEXT: retq
+entry:
+ %x1 = load i64, i64* %x
+ %and = and i64 %x1, 4611686018427387903
+ ret i64 %and
+}
diff --git a/test/CodeGen/X86/test-nofold.ll b/test/CodeGen/X86/test-nofold.ll
index 19fbaafc194f..8a49b9c9b25a 100644
--- a/test/CodeGen/X86/test-nofold.ll
+++ b/test/CodeGen/X86/test-nofold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah | FileCheck %s
; rdar://5752025
; We want:
diff --git a/test/CodeGen/X86/test-shrink-bug.ll b/test/CodeGen/X86/test-shrink-bug.ll
index 1bb1e6384832..814e07f718b0 100644
--- a/test/CodeGen/X86/test-shrink-bug.ll
+++ b/test/CodeGen/X86/test-shrink-bug.ll
@@ -3,7 +3,7 @@
; Codegen shouldn't reduce the comparison down to testb $-1, %al
; because that changes the result of the signed test.
; PR5132
-; CHECK: testw $255, %ax
+; CHECK: testl $255, %eax
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin10.0"
diff --git a/test/CodeGen/X86/test-shrink.ll b/test/CodeGen/X86/test-shrink.ll
index c9b76c88c1a2..9e59f9a2faa4 100644
--- a/test/CodeGen/X86/test-shrink.ll
+++ b/test/CodeGen/X86/test-shrink.ll
@@ -1,14 +1,39 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK-64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=CHECK-64
-; RUN: llc < %s -march=x86 | FileCheck %s --check-prefix=CHECK-32
-
-; CHECK-64-LABEL: g64xh:
-; CHECK-64: testb $8, {{%ah|%ch}}
-; CHECK-64: ret
-; CHECK-32-LABEL: g64xh:
-; CHECK-32: testb $8, %ah
-; CHECK-32: ret
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK-LINUX64
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=CHECK-WIN32-64
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=CHECK-X86
+
define void @g64xh(i64 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g64xh:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: btl $11, %edi
+; CHECK-LINUX64-NEXT: jb .LBB0_2
+; CHECK-LINUX64-NEXT: # %bb.1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: .LBB0_2: # %no
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g64xh:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: btl $11, %ecx
+; CHECK-WIN32-64-NEXT: jb .LBB0_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB0_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g64xh:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: btl $11, %eax
+; CHECK-X86-NEXT: jb .LBB0_2
+; CHECK-X86-NEXT: # %bb.1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: .LBB0_2: # %no
+; CHECK-X86-NEXT: retl
%t = and i64 %x, 2048
%s = icmp eq i64 %t, 0
br i1 %s, label %yes, label %no
@@ -19,13 +44,38 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g64xl:
-; CHECK-64: testb $8, [[A0L:%dil|%cl]]
-; CHECK-64: ret
-; CHECK-32-LABEL: g64xl:
-; CHECK-32: testb $8, %al
-; CHECK-32: ret
+
define void @g64xl(i64 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g64xl:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testb $8, %dil
+; CHECK-LINUX64-NEXT: jne .LBB1_2
+; CHECK-LINUX64-NEXT: # %bb.1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: .LBB1_2: # %no
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g64xl:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testb $8, %cl
+; CHECK-WIN32-64-NEXT: jne .LBB1_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB1_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g64xl:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testb $8, %al
+; CHECK-X86-NEXT: jne .LBB1_2
+; CHECK-X86-NEXT: # %bb.1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: .LBB1_2: # %no
+; CHECK-X86-NEXT: retl
%t = and i64 %x, 8
%s = icmp eq i64 %t, 0
br i1 %s, label %yes, label %no
@@ -36,13 +86,38 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g32xh:
-; CHECK-64: testb $8, {{%ah|%ch}}
-; CHECK-64: ret
-; CHECK-32-LABEL: g32xh:
-; CHECK-32: testb $8, %ah
-; CHECK-32: ret
+
define void @g32xh(i32 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g32xh:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: btl $11, %edi
+; CHECK-LINUX64-NEXT: jb .LBB2_2
+; CHECK-LINUX64-NEXT: # %bb.1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: .LBB2_2: # %no
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g32xh:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: btl $11, %ecx
+; CHECK-WIN32-64-NEXT: jb .LBB2_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB2_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g32xh:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: btl $11, %eax
+; CHECK-X86-NEXT: jb .LBB2_2
+; CHECK-X86-NEXT: # %bb.1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: .LBB2_2: # %no
+; CHECK-X86-NEXT: retl
%t = and i32 %x, 2048
%s = icmp eq i32 %t, 0
br i1 %s, label %yes, label %no
@@ -53,13 +128,38 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g32xl:
-; CHECK-64: testb $8, [[A0L]]
-; CHECK-64: ret
-; CHECK-32-LABEL: g32xl:
-; CHECK-32: testb $8, %al
-; CHECK-32: ret
+
define void @g32xl(i32 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g32xl:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testb $8, %dil
+; CHECK-LINUX64-NEXT: jne .LBB3_2
+; CHECK-LINUX64-NEXT: # %bb.1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: .LBB3_2: # %no
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g32xl:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testb $8, %cl
+; CHECK-WIN32-64-NEXT: jne .LBB3_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB3_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g32xl:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testb $8, %al
+; CHECK-X86-NEXT: jne .LBB3_2
+; CHECK-X86-NEXT: # %bb.1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: .LBB3_2: # %no
+; CHECK-X86-NEXT: retl
%t = and i32 %x, 8
%s = icmp eq i32 %t, 0
br i1 %s, label %yes, label %no
@@ -70,13 +170,38 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g16xh:
-; CHECK-64: testb $8, {{%ah|%ch}}
-; CHECK-64: ret
-; CHECK-32-LABEL: g16xh:
-; CHECK-32: testb $8, %ah
-; CHECK-32: ret
+
define void @g16xh(i16 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g16xh:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: btl $11, %edi
+; CHECK-LINUX64-NEXT: jb .LBB4_2
+; CHECK-LINUX64-NEXT: # %bb.1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: .LBB4_2: # %no
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g16xh:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: btl $11, %ecx
+; CHECK-WIN32-64-NEXT: jb .LBB4_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB4_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g16xh:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: btl $11, %eax
+; CHECK-X86-NEXT: jb .LBB4_2
+; CHECK-X86-NEXT: # %bb.1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: .LBB4_2: # %no
+; CHECK-X86-NEXT: retl
%t = and i16 %x, 2048
%s = icmp eq i16 %t, 0
br i1 %s, label %yes, label %no
@@ -87,13 +212,38 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g16xl:
-; CHECK-64: testb $8, [[A0L]]
-; CHECK-64: ret
-; CHECK-32-LABEL: g16xl:
-; CHECK-32: testb $8, %al
-; CHECK-32: ret
+
define void @g16xl(i16 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g16xl:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testb $8, %dil
+; CHECK-LINUX64-NEXT: jne .LBB5_2
+; CHECK-LINUX64-NEXT: # %bb.1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: .LBB5_2: # %no
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g16xl:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testb $8, %cl
+; CHECK-WIN32-64-NEXT: jne .LBB5_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB5_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g16xl:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testb $8, %al
+; CHECK-X86-NEXT: jne .LBB5_2
+; CHECK-X86-NEXT: # %bb.1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: .LBB5_2: # %no
+; CHECK-X86-NEXT: retl
%t = and i16 %x, 8
%s = icmp eq i16 %t, 0
br i1 %s, label %yes, label %no
@@ -104,13 +254,42 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g64x16:
-; CHECK-64: testw $-32640, %[[A0W:di|cx]]
-; CHECK-64: ret
-; CHECK-32-LABEL: g64x16:
-; CHECK-32: testw $-32640, %ax
-; CHECK-32: ret
+
define void @g64x16(i64 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g64x16:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testl $32896, %edi # imm = 0x8080
+; CHECK-LINUX64-NEXT: je .LBB6_1
+; CHECK-LINUX64-NEXT: # %bb.2: # %no
+; CHECK-LINUX64-NEXT: retq
+; CHECK-LINUX64-NEXT: .LBB6_1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g64x16:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testl $32896, %ecx # imm = 0x8080
+; CHECK-WIN32-64-NEXT: je .LBB6_1
+; CHECK-WIN32-64-NEXT: # %bb.2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+; CHECK-WIN32-64-NEXT: .LBB6_1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g64x16:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testl $32896, %eax # imm = 0x8080
+; CHECK-X86-NEXT: je .LBB6_1
+; CHECK-X86-NEXT: # %bb.2: # %no
+; CHECK-X86-NEXT: retl
+; CHECK-X86-NEXT: .LBB6_1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: retl
%t = and i64 %x, 32896
%s = icmp eq i64 %t, 0
br i1 %s, label %yes, label %no
@@ -121,13 +300,86 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g32x16:
-; CHECK-64: testw $-32640, %[[A0W]]
-; CHECK-64: ret
-; CHECK-32-LABEL: g32x16:
-; CHECK-32: testw $-32640, %ax
-; CHECK-32: ret
+
+define void @g64x16minsize(i64 inreg %x) nounwind minsize {
+; CHECK-LINUX64-LABEL: g64x16minsize:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testw $-32640, %di # imm = 0x8080
+; CHECK-LINUX64-NEXT: je .LBB7_1
+; CHECK-LINUX64-NEXT: # %bb.2: # %no
+; CHECK-LINUX64-NEXT: retq
+; CHECK-LINUX64-NEXT: .LBB7_1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g64x16minsize:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testw $-32640, %cx # imm = 0x8080
+; CHECK-WIN32-64-NEXT: jne .LBB7_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB7_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g64x16minsize:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testw $-32640, %ax # imm = 0x8080
+; CHECK-X86-NEXT: je .LBB7_1
+; CHECK-X86-NEXT: # %bb.2: # %no
+; CHECK-X86-NEXT: retl
+; CHECK-X86-NEXT: .LBB7_1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: retl
+ %t = and i64 %x, 32896
+ %s = icmp eq i64 %t, 0
+ br i1 %s, label %yes, label %no
+
+yes:
+ call void @bar()
+ ret void
+no:
+ ret void
+}
+
define void @g32x16(i32 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g32x16:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testl $32896, %edi # imm = 0x8080
+; CHECK-LINUX64-NEXT: je .LBB8_1
+; CHECK-LINUX64-NEXT: # %bb.2: # %no
+; CHECK-LINUX64-NEXT: retq
+; CHECK-LINUX64-NEXT: .LBB8_1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g32x16:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testl $32896, %ecx # imm = 0x8080
+; CHECK-WIN32-64-NEXT: je .LBB8_1
+; CHECK-WIN32-64-NEXT: # %bb.2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+; CHECK-WIN32-64-NEXT: .LBB8_1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g32x16:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testl $32896, %eax # imm = 0x8080
+; CHECK-X86-NEXT: je .LBB8_1
+; CHECK-X86-NEXT: # %bb.2: # %no
+; CHECK-X86-NEXT: retl
+; CHECK-X86-NEXT: .LBB8_1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: retl
%t = and i32 %x, 32896
%s = icmp eq i32 %t, 0
br i1 %s, label %yes, label %no
@@ -138,13 +390,86 @@ yes:
no:
ret void
}
-; CHECK-64-LABEL: g64x32:
-; CHECK-64: testl $268468352, %e[[A0W]]
-; CHECK-64: ret
-; CHECK-32-LABEL: g64x32:
-; CHECK-32: testl $268468352, %eax
-; CHECK-32: ret
+
+define void @g32x16minsize(i32 inreg %x) nounwind minsize {
+; CHECK-LINUX64-LABEL: g32x16minsize:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testw $-32640, %di # imm = 0x8080
+; CHECK-LINUX64-NEXT: je .LBB9_1
+; CHECK-LINUX64-NEXT: # %bb.2: # %no
+; CHECK-LINUX64-NEXT: retq
+; CHECK-LINUX64-NEXT: .LBB9_1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g32x16minsize:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testw $-32640, %cx # imm = 0x8080
+; CHECK-WIN32-64-NEXT: jne .LBB9_2
+; CHECK-WIN32-64-NEXT: # %bb.1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: .LBB9_2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g32x16minsize:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testw $-32640, %ax # imm = 0x8080
+; CHECK-X86-NEXT: je .LBB9_1
+; CHECK-X86-NEXT: # %bb.2: # %no
+; CHECK-X86-NEXT: retl
+; CHECK-X86-NEXT: .LBB9_1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: retl
+ %t = and i32 %x, 32896
+ %s = icmp eq i32 %t, 0
+ br i1 %s, label %yes, label %no
+
+yes:
+ call void @bar()
+ ret void
+no:
+ ret void
+}
+
define void @g64x32(i64 inreg %x) nounwind {
+; CHECK-LINUX64-LABEL: g64x32:
+; CHECK-LINUX64: # %bb.0:
+; CHECK-LINUX64-NEXT: testl $268468352, %edi # imm = 0x10008080
+; CHECK-LINUX64-NEXT: je .LBB10_1
+; CHECK-LINUX64-NEXT: # %bb.2: # %no
+; CHECK-LINUX64-NEXT: retq
+; CHECK-LINUX64-NEXT: .LBB10_1: # %yes
+; CHECK-LINUX64-NEXT: pushq %rax
+; CHECK-LINUX64-NEXT: callq bar
+; CHECK-LINUX64-NEXT: popq %rax
+; CHECK-LINUX64-NEXT: retq
+;
+; CHECK-WIN32-64-LABEL: g64x32:
+; CHECK-WIN32-64: # %bb.0:
+; CHECK-WIN32-64-NEXT: subq $40, %rsp
+; CHECK-WIN32-64-NEXT: testl $268468352, %ecx # imm = 0x10008080
+; CHECK-WIN32-64-NEXT: je .LBB10_1
+; CHECK-WIN32-64-NEXT: # %bb.2: # %no
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+; CHECK-WIN32-64-NEXT: .LBB10_1: # %yes
+; CHECK-WIN32-64-NEXT: callq bar
+; CHECK-WIN32-64-NEXT: addq $40, %rsp
+; CHECK-WIN32-64-NEXT: retq
+;
+; CHECK-X86-LABEL: g64x32:
+; CHECK-X86: # %bb.0:
+; CHECK-X86-NEXT: testl $268468352, %eax # imm = 0x10008080
+; CHECK-X86-NEXT: je .LBB10_1
+; CHECK-X86-NEXT: # %bb.2: # %no
+; CHECK-X86-NEXT: retl
+; CHECK-X86-NEXT: .LBB10_1: # %yes
+; CHECK-X86-NEXT: calll bar
+; CHECK-X86-NEXT: retl
%t = and i64 %x, 268468352
%s = icmp eq i64 %t, 0
br i1 %s, label %yes, label %no
diff --git a/test/CodeGen/X86/testb-je-fusion.ll b/test/CodeGen/X86/testb-je-fusion.ll
index 9e946ae4ca33..c085a422295d 100644
--- a/test/CodeGen/X86/testb-je-fusion.ll
+++ b/test/CodeGen/X86/testb-je-fusion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=corei7-avx | FileCheck %s
; testb should be scheduled right before je to enable macro-fusion.
diff --git a/test/CodeGen/X86/testl-commute.ll b/test/CodeGen/X86/testl-commute.ll
index a9a9e581d995..43e095aecd06 100644
--- a/test/CodeGen/X86/testl-commute.ll
+++ b/test/CodeGen/X86/testl-commute.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin7"
define i32 @test(i32* %P, i32* %G) nounwind {
; CHECK-LABEL: test:
; CHECK-NOT: ret
-; CHECK: testl (%{{.*}}), %{{.*}}
+; CHECK: testl %{{.*}}, (%{{.*}})
; CHECK: ret
entry:
@@ -30,7 +30,7 @@ bb1: ; preds = %entry
define i32 @test2(i32* %P, i32* %G) nounwind {
; CHECK-LABEL: test2:
; CHECK-NOT: ret
-; CHECK: testl (%{{.*}}), %{{.*}}
+; CHECK: testl %{{.*}}, (%{{.*}})
; CHECK: ret
entry:
@@ -51,7 +51,7 @@ bb1: ; preds = %entry
define i32 @test3(i32* %P, i32* %G) nounwind {
; CHECK-LABEL: test3:
; CHECK-NOT: ret
-; CHECK: testl (%{{.*}}), %{{.*}}
+; CHECK: testl %{{.*}}, (%{{.*}})
; CHECK: ret
entry:
diff --git a/test/CodeGen/X86/tls-android-negative.ll b/test/CodeGen/X86/tls-android-negative.ll
index e90b8914ab28..7a767cabb058 100644
--- a/test/CodeGen/X86/tls-android-negative.ll
+++ b/test/CodeGen/X86/tls-android-negative.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -emulated-tls -mtriple=i686-linux-android -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck %s
; Make sure that some symbols are not emitted in the emulated TLS model.
diff --git a/test/CodeGen/X86/tls-android.ll b/test/CodeGen/X86/tls-android.ll
index 53717f564fac..ecb9b430a7c0 100644
--- a/test/CodeGen/X86/tls-android.ll
+++ b/test/CodeGen/X86/tls-android.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -emulated-tls -march=x86 -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck %s
-; RUN: llc < %s -emulated-tls -march=x86-64 -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -emulated-tls -mtriple=i686-linux-android -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -emulated-tls -mtriple=x86_64-linux-android -relocation-model=pic | FileCheck -check-prefix=X64 %s
; Make sure that TLS symbols are emitted in the expected order.
diff --git a/test/CodeGen/X86/tls-local-dynamic.ll b/test/CodeGen/X86/tls-local-dynamic.ll
index 1f1b41a8a6d4..711376303a8c 100644
--- a/test/CodeGen/X86/tls-local-dynamic.ll
+++ b/test/CodeGen/X86/tls-local-dynamic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s
@x = internal thread_local global i32 0, align 4
@y = internal thread_local global i32 0, align 4
diff --git a/test/CodeGen/X86/tls-models.ll b/test/CodeGen/X86/tls-models.ll
index 2377da4f025a..e0c3f05ad056 100644
--- a/test/CodeGen/X86/tls-models.ll
+++ b/test/CodeGen/X86/tls-models.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64_PIC %s
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32_PIC %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X64_PIC %s
+; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=X32 %s
+; RUN: llc < %s -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck -check-prefix=X32_PIC %s
; Darwin always uses the same model.
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck -check-prefix=DARWIN %s
@external_gd = external thread_local global i32
@internal_gd = internal thread_local global i32 42
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index ac0b43b2402f..15533c94c4dc 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64
@i = thread_local global i32 15
@j = internal thread_local global i32 42
diff --git a/test/CodeGen/X86/tls-pie.ll b/test/CodeGen/X86/tls-pie.ll
index 7a7e40362bcf..4f5c4f8fed58 100644
--- a/test/CodeGen/X86/tls-pie.ll
+++ b/test/CodeGen/X86/tls-pie.ll
@@ -1,24 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86 -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnux32 -relocation-model=pic | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -march=x86-64 -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=i386-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnux32 -relocation-model=pic | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux-gnu -relocation-model=pic | FileCheck %s --check-prefix=X64
@i = thread_local global i32 15
@i2 = external thread_local global i32
define i32 @f1() {
; X86-LABEL: f1:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl %gs:i@NTPOFF, %eax
; X86-NEXT: retl
;
; X32-LABEL: f1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl %fs:i@TPOFF, %eax
; X32-NEXT: retq
;
; X64-LABEL: f1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl %fs:i@TPOFF, %eax
; X64-NEXT: retq
entry:
@@ -28,19 +28,19 @@ entry:
define i32* @f2() {
; X86-LABEL: f2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl %gs:0, %eax
; X86-NEXT: leal i@NTPOFF(%eax), %eax
; X86-NEXT: retl
;
; X32-LABEL: f2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl %fs:0, %eax
; X32-NEXT: leal i@TPOFF(%rax), %eax
; X32-NEXT: retq
;
; X64-LABEL: f2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %fs:0, %rax
; X64-NEXT: leaq i@TPOFF(%rax), %rax
; X64-NEXT: retq
@@ -50,13 +50,11 @@ entry:
define i32 @f3() {
; X86-LABEL: f3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: calll .L2$pb
-; X86-NEXT: .Lcfi0:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: .L2$pb:
; X86-NEXT: popl %eax
-; X86-NEXT: .Lcfi1:
; X86-NEXT: .cfi_adjust_cfa_offset -4
; X86-NEXT: .Ltmp0:
; X86-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L2$pb), %eax
@@ -65,13 +63,13 @@ define i32 @f3() {
; X86-NEXT: retl
;
; X32-LABEL: f3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl i2@{{.*}}(%rip), %eax
; X32-NEXT: movl %fs:(%eax), %eax
; X32-NEXT: retq
;
; X64-LABEL: f3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq i2@{{.*}}(%rip), %rax
; X64-NEXT: movl %fs:(%rax), %eax
; X64-NEXT: retq
@@ -82,13 +80,11 @@ entry:
define i32* @f4() {
; X86-LABEL: f4:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: calll .L3$pb
-; X86-NEXT: .Lcfi2:
; X86-NEXT: .cfi_adjust_cfa_offset 4
; X86-NEXT: .L3$pb:
; X86-NEXT: popl %ecx
-; X86-NEXT: .Lcfi3:
; X86-NEXT: .cfi_adjust_cfa_offset -4
; X86-NEXT: .Ltmp1:
; X86-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L3$pb), %ecx
@@ -97,13 +93,13 @@ define i32* @f4() {
; X86-NEXT: retl
;
; X32-LABEL: f4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl %fs:0, %eax
; X32-NEXT: addl i2@{{.*}}(%rip), %eax
; X32-NEXT: retq
;
; X64-LABEL: f4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %fs:0, %rax
; X64-NEXT: addq i2@{{.*}}(%rip), %rax
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/tls-shrink-wrapping.ll b/test/CodeGen/X86/tls-shrink-wrapping.ll
index 806fae934f9a..216bb95f7218 100644
--- a/test/CodeGen/X86/tls-shrink-wrapping.ll
+++ b/test/CodeGen/X86/tls-shrink-wrapping.ll
@@ -37,18 +37,14 @@ if.end: ; preds = %if.then, %entry
; CHECK: g: # @g
; CHECK-NEXT: .cfi_startproc
-; CHECK-NEXT: # BB#0: # %entry
+; CHECK-NEXT: # %bb.0: # %entry
; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: .Lcfi3:
; CHECK-NEXT: .cfi_offset %rbx, -24
; CHECK-NEXT: data16
; CHECK-NEXT: leaq i@TLSGD(%rip), %rdi
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index d39716aab764..ddfebcd0b66a 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu | FileCheck -check-prefix=X86_LINUX %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
-; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X86_WIN %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
-; RUN: llc < %s -march=x86 -mtriple=x86-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-windows-gnu | FileCheck -check-prefix=X64_WIN %s
+; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck -check-prefix=X86_LINUX %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck -check-prefix=X86_WIN %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
+; RUN: llc < %s -mtriple=i686-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu | FileCheck -check-prefix=X64_WIN %s
@i1 = thread_local global i32 15
@i2 = external thread_local global i32
diff --git a/test/CodeGen/X86/token_landingpad.ll b/test/CodeGen/X86/token_landingpad.ll
index 087b68bfce8a..b72bfbbff646 100644
--- a/test/CodeGen/X86/token_landingpad.ll
+++ b/test/CodeGen/X86/token_landingpad.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s
+; RUN: llc < %s -mtriple=x86_64--
+; RUN: llc < %s -mtriple=i686--
; This test verifies that SelectionDAG can handle a landingpad of token type without crashing LLVM.
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 8624c7210f73..f926cfa91119 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -5,7 +5,7 @@
; A single 16-bit load + a single 16-bit store
define void @load_2_i8(<2 x i8>* %A) {
; SSE2-LABEL: load_2_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -21,7 +21,7 @@ define void @load_2_i8(<2 x i8>* %A) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_2_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -36,7 +36,7 @@ define void @load_2_i8(<2 x i8>* %A) {
; Read 32-bits
define void @load_2_i16(<2 x i16>* %A) {
; SSE2-LABEL: load_2_i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
@@ -47,7 +47,7 @@ define void @load_2_i16(<2 x i16>* %A) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_2_i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SSE41-NEXT: paddq {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -62,7 +62,7 @@ define void @load_2_i16(<2 x i16>* %A) {
define void @load_2_i32(<2 x i32>* %A) {
; SSE2-LABEL: load_2_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
@@ -71,7 +71,7 @@ define void @load_2_i32(<2 x i32>* %A) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_2_i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -85,7 +85,7 @@ define void @load_2_i32(<2 x i32>* %A) {
define void @load_4_i8(<4 x i8>* %A) {
; SSE2-LABEL: load_4_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -97,7 +97,7 @@ define void @load_4_i8(<4 x i8>* %A) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_4_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -111,7 +111,7 @@ define void @load_4_i8(<4 x i8>* %A) {
define void @load_4_i16(<4 x i16>* %A) {
; SSE2-LABEL: load_4_i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0
@@ -122,7 +122,7 @@ define void @load_4_i16(<4 x i16>* %A) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_4_i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -136,7 +136,7 @@ define void @load_4_i16(<4 x i16>* %A) {
define void @load_8_i8(<8 x i8>* %A) {
; SSE2-LABEL: load_8_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: paddb %xmm0, %xmm0
@@ -146,10 +146,10 @@ define void @load_8_i8(<8 x i8>* %A) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: load_8_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: paddb %xmm0, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
%T = load <8 x i8>, <8 x i8>* %A
diff --git a/test/CodeGen/X86/trunc-store.ll b/test/CodeGen/X86/trunc-store.ll
index a241876ff419..3da9240fa632 100644
--- a/test/CodeGen/X86/trunc-store.ll
+++ b/test/CodeGen/X86/trunc-store.ll
@@ -28,14 +28,14 @@
define void @fn1() {
; CHECK-LABEL: fn1:
-; CHECK: # BB#0: # %for.cond
+; CHECK: # %bb.0: # %for.cond
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: cmpq $8, %rax
; CHECK-NEXT: jne .LBB0_1
-; CHECK-NEXT: # BB#2: # %middle.block
+; CHECK-NEXT: # %bb.2: # %middle.block
; CHECK-NEXT: retq
for.cond:
br label %vector.body
diff --git a/test/CodeGen/X86/trunc-to-bool.ll b/test/CodeGen/X86/trunc-to-bool.ll
index 8e253f11e93e..d4f2e5852835 100644
--- a/test/CodeGen/X86/trunc-to-bool.ll
+++ b/test/CodeGen/X86/trunc-to-bool.ll
@@ -6,7 +6,7 @@
define zeroext i1 @test1(i32 %X) nounwind {
; CHECK-LABEL: test1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al
; CHECK-NEXT: andb $1, %al
; CHECK-NEXT: retl
@@ -16,12 +16,12 @@ define zeroext i1 @test1(i32 %X) nounwind {
define i1 @test2(i32 %val, i32 %mask) nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: btl %ecx, %eax
; CHECK-NEXT: jae .LBB1_2
-; CHECK-NEXT: # BB#1: # %ret_true
+; CHECK-NEXT: # %bb.1: # %ret_true
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB1_2: # %ret_false
@@ -40,11 +40,11 @@ ret_false:
define i32 @test3(i8* %ptr) nounwind {
; CHECK-LABEL: test3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: testb $1, (%eax)
; CHECK-NEXT: je .LBB2_2
-; CHECK-NEXT: # BB#1: # %cond_true
+; CHECK-NEXT: # %bb.1: # %cond_true
; CHECK-NEXT: movl $21, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB2_2: # %cond_false
@@ -61,10 +61,10 @@ cond_false:
define i32 @test4(i8* %ptr) nounwind {
; CHECK-LABEL: test4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB3_2
-; CHECK-NEXT: # BB#1: # %cond_true
+; CHECK-NEXT: # %bb.1: # %cond_true
; CHECK-NEXT: movl $21, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB3_2: # %cond_false
@@ -80,7 +80,7 @@ cond_false:
define i32 @test5(double %d) nounwind {
; CHECK-LABEL: test5:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: fldl {{[0-9]+}}(%esp)
; CHECK-NEXT: fnstcw (%esp)
@@ -92,7 +92,7 @@ define i32 @test5(double %d) nounwind {
; CHECK-NEXT: fldcw (%esp)
; CHECK-NEXT: testb $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: je .LBB4_2
-; CHECK-NEXT: # BB#1: # %cond_true
+; CHECK-NEXT: # %bb.1: # %cond_true
; CHECK-NEXT: movl $21, %eax
; CHECK-NEXT: popl %ecx
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll b/test/CodeGen/X86/twoaddr-coalesce-2.ll
index 9da071f7ede6..a5667be299bb 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-2.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \
+; RUN: llc < %s -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \
; RUN: grep "twoaddressinstruction" | grep "Number of instructions aggressively commuted"
; rdar://6480363
diff --git a/test/CodeGen/X86/twoaddr-coalesce-3.ll b/test/CodeGen/X86/twoaddr-coalesce-3.ll
index f5a7326c970c..b53ff00dfab5 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -relocation-model=pic | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -relocation-model=pic | FileCheck %s
; This test is to ensure the TwoAddrInstruction pass chooses the proper operands to
; merge and generates fewer mov insns.
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
index c727f34cc9a5..81af9181126c 100644
--- a/test/CodeGen/X86/twoaddr-coalesce.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep mov | count 2
+; RUN: llc < %s -mtriple=i686-- | grep mov | count 2
; rdar://6523745
@"\01LC" = internal constant [4 x i8] c"%d\0A\00" ; <[4 x i8]*> [#uses=1]
diff --git a/test/CodeGen/X86/twoaddr-pass-sink.ll b/test/CodeGen/X86/twoaddr-pass-sink.ll
index 9a98e4794f9e..a06eaec894ca 100644
--- a/test/CodeGen/X86/twoaddr-pass-sink.ll
+++ b/test/CodeGen/X86/twoaddr-pass-sink.ll
@@ -1,5 +1,5 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -stats 2>&1 | grep "Number of 3-address instructions sunk"
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 -stats 2>&1 | grep "Number of 3-address instructions sunk"
define void @t2(<2 x i64>* %vDct, <2 x i64>* %vYp, i8* %skiplist, <2 x i64> %a1) nounwind {
entry:
diff --git a/test/CodeGen/X86/uint64-to-float.ll b/test/CodeGen/X86/uint64-to-float.ll
index 60f9487b4662..ac7371fdf1be 100644
--- a/test/CodeGen/X86/uint64-to-float.ll
+++ b/test/CodeGen/X86/uint64-to-float.ll
@@ -8,7 +8,7 @@
define float @test(i64 %a) nounwind {
; X86-LABEL: test:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -29,10 +29,10 @@ define float @test(i64 %a) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: testq %rdi, %rdi
; X64-NEXT: js .LBB0_1
-; X64-NEXT: # BB#2: # %entry
+; X64-NEXT: # %bb.2: # %entry
; X64-NEXT: cvtsi2ssq %rdi, %xmm0
; X64-NEXT: retq
; X64-NEXT: .LBB0_1:
diff --git a/test/CodeGen/X86/uint_to_fp-2.ll b/test/CodeGen/X86/uint_to_fp-2.ll
index c006c3115b21..f925488632f8 100644
--- a/test/CodeGen/X86/uint_to_fp-2.ll
+++ b/test/CodeGen/X86/uint_to_fp-2.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown-unknown -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s
; rdar://6504833
define float @test1(i32 %x) nounwind readnone {
; CHECK-LABEL: test1:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -24,7 +24,7 @@ entry:
; PR10802
define float @test2(<4 x i32> %x) nounwind readnone ssp {
; CHECK-LABEL: test2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
diff --git a/test/CodeGen/X86/uint_to_fp-3.ll b/test/CodeGen/X86/uint_to_fp-3.ll
index 47f8abfe0414..9efd9a5bef5f 100644
--- a/test/CodeGen/X86/uint_to_fp-3.ll
+++ b/test/CodeGen/X86/uint_to_fp-3.ll
@@ -8,25 +8,25 @@
define <4 x float> @mask_ucvt_4i32_4f32(<4 x i32> %a) {
; X32-SSE-LABEL: mask_ucvt_4i32_4f32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_ucvt_4i32_4f32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_ucvt_4i32_4f32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_ucvt_4i32_4f32:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX-NEXT: retq
@@ -37,7 +37,7 @@ define <4 x float> @mask_ucvt_4i32_4f32(<4 x i32> %a) {
define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X32-SSE-LABEL: mask_ucvt_4i32_4f64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -46,13 +46,13 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_ucvt_4i32_4f64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_ucvt_4i32_4f64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pand {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -61,7 +61,7 @@ define <4 x double> @mask_ucvt_4i32_4f64(<4 x i32> %a) {
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_ucvt_4i32_4f64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; X64-AVX-NEXT: retq
diff --git a/test/CodeGen/X86/uint_to_fp.ll b/test/CodeGen/X86/uint_to_fp.ll
index a2784fdcbbdd..afc5464fb70f 100644
--- a/test/CodeGen/X86/uint_to_fp.ll
+++ b/test/CodeGen/X86/uint_to_fp.ll
@@ -5,7 +5,7 @@
define void @test(i32 %x, float* %y) nounwind {
; X32-LABEL: test:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shrl $23, %ecx
@@ -14,7 +14,7 @@ define void @test(i32 %x, float* %y) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: shrl $23, %edi
; X64-NEXT: cvtsi2ssl %edi, %xmm0
; X64-NEXT: movss %xmm0, (%rsi)
diff --git a/test/CodeGen/X86/umul-with-carry.ll b/test/CodeGen/X86/umul-with-carry.ll
index 6435760e88a4..5f76dd390586 100644
--- a/test/CodeGen/X86/umul-with-carry.ll
+++ b/test/CodeGen/X86/umul-with-carry.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "jc" | count 1
+; RUN: llc < %s -mtriple=i386-- | grep "jc" | count 1
; XFAIL: *
; FIXME: umul-with-overflow not supported yet.
diff --git a/test/CodeGen/X86/umul-with-overflow.ll b/test/CodeGen/X86/umul-with-overflow.ll
index 29cecbe5a0f6..5a57f9f12970 100644
--- a/test/CodeGen/X86/umul-with-overflow.ll
+++ b/test/CodeGen/X86/umul-with-overflow.ll
@@ -1,37 +1,71 @@
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+
define zeroext i1 @a(i32 %x) nounwind {
+; X86-LABEL: a:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $3, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: seto %al
+; X86-NEXT: retl
+;
+; X64-LABEL: a:
+; X64: # %bb.0:
+; X64-NEXT: movl $3, %ecx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: mull %ecx
+; X64-NEXT: seto %al
+; X64-NEXT: retq
%res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3)
%obil = extractvalue {i32, i1} %res, 1
ret i1 %obil
-
-; CHECK-LABEL: a:
-; CHECK: mull
-; CHECK: seto %al
-; CHECK: ret
}
define i32 @test2(i32 %a, i32 %b) nounwind readnone {
+; X86-LABEL: test2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: test2:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: addl %esi, %edi
+; X64-NEXT: leal (%rdi,%rdi), %eax
+; X64-NEXT: retq
entry:
%tmp0 = add i32 %b, %a
%tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 2)
%tmp2 = extractvalue { i32, i1 } %tmp1, 0
ret i32 %tmp2
-; CHECK-LABEL: test2:
-; CHECK: addl
-; CHECK-NEXT: addl
-; CHECK-NEXT: ret
}
define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+; X86-LABEL: test3:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $4, %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: retl
+;
+; X64-LABEL: test3:
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-NEXT: leal (%rdi,%rsi), %eax
+; X64-NEXT: movl $4, %ecx
+; X64-NEXT: mull %ecx
+; X64-NEXT: retq
entry:
%tmp0 = add i32 %b, %a
%tmp1 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %tmp0, i32 4)
%tmp2 = extractvalue { i32, i1 } %tmp1, 0
ret i32 %tmp2
-; CHECK-LABEL: test3:
-; CHECK: addl
-; CHECK: mull
-; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index 391f7a38a379..c78254009104 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -7,18 +7,18 @@
define <8 x float> @load32bytes(<8 x float>* %Ap) {
; AVXSLOW-LABEL: load32bytes:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vmovaps (%rdi), %xmm0
; AVXSLOW-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: load32bytes:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vmovups (%rdi), %ymm0
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: load32bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: retq
%A = load <8 x float>, <8 x float>* %Ap, align 16
@@ -29,20 +29,20 @@ define <8 x float> @load32bytes(<8 x float>* %Ap) {
define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
; AVXSLOW-LABEL: store32bytes:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
; AVXSLOW-NEXT: vmovaps %xmm0, (%rdi)
; AVXSLOW-NEXT: vzeroupper
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: store32bytes:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vmovups %ymm0, (%rdi)
; AVXFAST-NEXT: vzeroupper
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: store32bytes:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovups %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -54,18 +54,18 @@ define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vmovups 48(%rdi), %xmm0
; AVXSLOW-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vmovups 48(%rdi), %ymm0
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_no_intrinsic:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovups 48(%rdi), %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
@@ -80,17 +80,17 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic(<4 x float>* %ptr) {
define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_aligned:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vmovaps 48(%rdi), %ymm0
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_aligned:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vmovaps 48(%rdi), %ymm0
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_aligned:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps 48(%rdi), %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
@@ -105,18 +105,18 @@ define <8 x float> @combine_16_byte_loads_aligned(<4 x float>* %ptr) {
define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
; AVXSLOW-LABEL: combine_16_byte_loads_no_intrinsic_swap:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vmovups 64(%rdi), %xmm0
; AVXSLOW-NEXT: vinsertf128 $1, 80(%rdi), %ymm0, %ymm0
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_no_intrinsic_swap:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vmovups 64(%rdi), %ymm0
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_no_intrinsic_swap:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovups 64(%rdi), %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
@@ -133,7 +133,7 @@ define <8 x float> @combine_16_byte_loads_no_intrinsic_swap(<4 x float>* %ptr) {
define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i64:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT: vpaddq 96(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT: vpaddq 80(%rdi), %xmm0, %xmm0
@@ -141,7 +141,7 @@ define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i64:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT: vpaddq 96(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT: vpaddq 80(%rdi), %xmm0, %xmm0
@@ -149,7 +149,7 @@ define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq 80(%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <2 x i64>, <2 x i64>* %ptr, i64 5
@@ -163,7 +163,7 @@ define <4 x i64> @combine_16_byte_loads_i64(<2 x i64>* %ptr, <4 x i64> %x) {
define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i32:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT: vpaddd 112(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT: vpaddd 96(%rdi), %xmm0, %xmm0
@@ -171,7 +171,7 @@ define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i32:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT: vpaddd 112(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT: vpaddd 96(%rdi), %xmm0, %xmm0
@@ -179,7 +179,7 @@ define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd 96(%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <4 x i32>, <4 x i32>* %ptr, i64 6
@@ -193,7 +193,7 @@ define <8 x i32> @combine_16_byte_loads_i32(<4 x i32>* %ptr, <8 x i32> %x) {
define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i16:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT: vpaddw 128(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT: vpaddw 112(%rdi), %xmm0, %xmm0
@@ -201,7 +201,7 @@ define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i16:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT: vpaddw 128(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT: vpaddw 112(%rdi), %xmm0, %xmm0
@@ -209,7 +209,7 @@ define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw 112(%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <8 x i16>, <8 x i16>* %ptr, i64 7
@@ -223,7 +223,7 @@ define <16 x i16> @combine_16_byte_loads_i16(<8 x i16>* %ptr, <16 x i16> %x) {
define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_i8:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXSLOW-NEXT: vpaddb 144(%rdi), %xmm1, %xmm1
; AVXSLOW-NEXT: vpaddb 128(%rdi), %xmm0, %xmm0
@@ -231,7 +231,7 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_i8:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVXFAST-NEXT: vpaddb 144(%rdi), %xmm1, %xmm1
; AVXFAST-NEXT: vpaddb 128(%rdi), %xmm0, %xmm0
@@ -239,7 +239,7 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddb 128(%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <16 x i8>, <16 x i8>* %ptr, i64 8
@@ -253,19 +253,19 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
; AVXSLOW-LABEL: combine_16_byte_loads_double:
-; AVXSLOW: # BB#0:
+; AVXSLOW: # %bb.0:
; AVXSLOW-NEXT: vmovups 144(%rdi), %xmm1
; AVXSLOW-NEXT: vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
; AVXSLOW-NEXT: vaddpd %ymm0, %ymm1, %ymm0
; AVXSLOW-NEXT: retq
;
; AVXFAST-LABEL: combine_16_byte_loads_double:
-; AVXFAST: # BB#0:
+; AVXFAST: # %bb.0:
; AVXFAST-NEXT: vaddpd 144(%rdi), %ymm0, %ymm0
; AVXFAST-NEXT: retq
;
; AVX2-LABEL: combine_16_byte_loads_double:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vaddpd 144(%rdi), %ymm0, %ymm0
; AVX2-NEXT: retq
%ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 9
diff --git a/test/CodeGen/X86/update-terminator-debugloc.ll b/test/CodeGen/X86/update-terminator-debugloc.ll
index 359c348b42cb..17b98c3ee62c 100644
--- a/test/CodeGen/X86/update-terminator-debugloc.ll
+++ b/test/CodeGen/X86/update-terminator-debugloc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -stop-after=machine-sink -march=x86-64 < %s | FileCheck %s
+; RUN: llc -stop-after=machine-sink < %s | FileCheck %s
;
; test code:
; 1 extern int bar(int x);
@@ -15,18 +15,18 @@
; 12 }
; 13 return ret;
; 14 }
-;
-; With the test code, LLVM-IR below shows that loop-control branches have a
+;
+; With the test code, LLVM-IR below shows that loop-control branches have a
; debug location of line 6 (branches in entry and for.body block). Make sure that
; these debug locations are propagated correctly to lowered instructions.
;
; CHECK: [[DLOC:![0-9]+]] = !DILocation(line: 6
-; CHECK-DAG: [[VREG1:%[^ ]+]] = COPY %rsi
-; CHECK-DAG: [[VREG2:%[^ ]+]] = COPY %rdi
+; CHECK-DAG: [[VREG1:%[^ ]+]]:gr64 = COPY %rsi
+; CHECK-DAG: [[VREG2:%[^ ]+]]:gr64 = COPY %rdi
; CHECK: SUB64rr [[VREG2]], [[VREG1]]
; CHECK-NEXT: JNE_1 {{.*}}, debug-location [[DLOC]]{{$}}
-; CHECK: [[VREG3:%[^ ]+]] = PHI [[VREG2]]
-; CHECK: [[VREG4:%[^ ]+]] = ADD64ri8 [[VREG3]], 4
+; CHECK: [[VREG3:%[^ ]+]]:gr64 = PHI [[VREG2]]
+; CHECK: [[VREG4:%[^ ]+]]:gr64 = ADD64ri8 [[VREG3]], 4
; CHECK: SUB64rr [[VREG1]], [[VREG4]]
; CHECK-NEXT: JNE_1 {{.*}}, debug-location [[DLOC]]{{$}}
; CHECK-NEXT: JMP_1 {{.*}}, debug-location [[DLOC]]{{$}}
diff --git a/test/CodeGen/X86/update-terminator.mir b/test/CodeGen/X86/update-terminator.mir
index 2e8e85b4ef66..4515521faf73 100644
--- a/test/CodeGen/X86/update-terminator.mir
+++ b/test/CodeGen/X86/update-terminator.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -verify-machineinstrs -run-pass block-placement -o - %s | FileCheck %s
+# RUN: llc -mtriple=x86_64-- -verify-machineinstrs -run-pass block-placement -o - %s | FileCheck %s
# Check the conditional jump in bb.1 is changed to unconditional after block placement swaps bb.2 and bb.3.
--- |
diff --git a/test/CodeGen/X86/urem-i8-constant.ll b/test/CodeGen/X86/urem-i8-constant.ll
index 2a659b20de8f..3e0ed75fc49e 100644
--- a/test/CodeGen/X86/urem-i8-constant.ll
+++ b/test/CodeGen/X86/urem-i8-constant.ll
@@ -1,15 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
+; computeKnownBits determines that we don't need a mask op that is required in the general case.
+
define i8 @foo(i8 %tmp325) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: imull $111, %ecx, %eax
-; CHECK-NEXT: andl $28672, %eax # imm = 0x7000
; CHECK-NEXT: shrl $12, %eax
+; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: movb $37, %dl
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: # kill: def %al killed %al killed %eax
; CHECK-NEXT: mulb %dl
; CHECK-NEXT: subb %al, %cl
; CHECK-NEXT: movl %ecx, %eax
diff --git a/test/CodeGen/X86/urem-power-of-two.ll b/test/CodeGen/X86/urem-power-of-two.ll
index 1b56c87aad5f..2610beda415f 100644
--- a/test/CodeGen/X86/urem-power-of-two.ll
+++ b/test/CodeGen/X86/urem-power-of-two.ll
@@ -1,14 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
; The easy case: a constant power-of-2 divisor.
define i64 @const_pow_2(i64 %x) {
-; CHECK-LABEL: const_pow_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: andl $31, %edi
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: retq
+; X86-LABEL: const_pow_2:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $31, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: const_pow_2:
+; X64: # %bb.0:
+; X64-NEXT: andl $31, %edi
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: retq
%urem = urem i64 %x, 32
ret i64 %urem
}
@@ -16,14 +24,23 @@ define i64 @const_pow_2(i64 %x) {
; A left-shifted power-of-2 divisor. Use a weird type for wider coverage.
define i25 @shift_left_pow_2(i25 %x, i25 %y) {
-; CHECK-LABEL: shift_left_pow_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: shll %cl, %eax
-; CHECK-NEXT: addl $33554431, %eax # imm = 0x1FFFFFF
-; CHECK-NEXT: andl %edi, %eax
-; CHECK-NEXT: retq
+; X86-LABEL: shift_left_pow_2:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: addl $33554431, %eax # imm = 0x1FFFFFF
+; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: shift_left_pow_2:
+; X64: # %bb.0:
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shll %cl, %eax
+; X64-NEXT: addl $33554431, %eax # imm = 0x1FFFFFF
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: retq
%shl = shl i25 1, %y
%urem = urem i25 %x, %shl
ret i25 %urem
@@ -32,15 +49,25 @@ define i25 @shift_left_pow_2(i25 %x, i25 %y) {
; A logically right-shifted sign bit is a power-of-2 or UB.
define i16 @shift_right_pow_2(i16 %x, i16 %y) {
-; CHECK-LABEL: shift_right_pow_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: movl $32768, %eax # imm = 0x8000
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: shrl %cl, %eax
-; CHECK-NEXT: decl %eax
-; CHECK-NEXT: andl %edi, %eax
-; CHECK-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; X86-LABEL: shift_right_pow_2:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: movl $32768, %eax # imm = 0x8000
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: decl %eax
+; X86-NEXT: andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT: # kill: def %ax killed %ax killed %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: shift_right_pow_2:
+; X64: # %bb.0:
+; X64-NEXT: movl $32768, %eax # imm = 0x8000
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl %cl, %eax
+; X64-NEXT: decl %eax
+; X64-NEXT: andl %edi, %eax
+; X64-NEXT: # kill: def %ax killed %ax killed %eax
+; X64-NEXT: retq
%shr = lshr i16 -32768, %y
%urem = urem i16 %x, %shr
ret i16 %urem
@@ -49,28 +76,57 @@ define i16 @shift_right_pow_2(i16 %x, i16 %y) {
; FIXME: A zero divisor would be UB, so this could be reduced to an 'and' with 3.
define i8 @and_pow_2(i8 %x, i8 %y) {
-; CHECK-LABEL: and_pow_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: andb $4, %sil
-; CHECK-NEXT: movzbl %dil, %eax
-; CHECK-NEXT: # kill: %EAX<def> %EAX<kill> %AX<def>
-; CHECK-NEXT: divb %sil
-; CHECK-NEXT: movzbl %ah, %eax # NOREX
-; CHECK-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT: retq
+; X86-LABEL: and_pow_2:
+; X86: # %bb.0:
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: andb $4, %cl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: # kill: def %eax killed %eax def %ax
+; X86-NEXT: divb %cl
+; X86-NEXT: movzbl %ah, %eax # NOREX
+; X86-NEXT: # kill: def %al killed %al killed %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_pow_2:
+; X64: # %bb.0:
+; X64-NEXT: andb $4, %sil
+; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: # kill: def %eax killed %eax def %ax
+; X64-NEXT: divb %sil
+; X64-NEXT: movzbl %ah, %eax # NOREX
+; X64-NEXT: # kill: def %al killed %al killed %eax
+; X64-NEXT: retq
%and = and i8 %y, 4
%urem = urem i8 %x, %and
ret i8 %urem
}
-; A vector splat constant divisor should get the same treatment as a scalar.
+; A vector constant divisor should get the same treatment as a scalar.
-define <4 x i32> @vec_const_pow_2(<4 x i32> %x) {
-; CHECK-LABEL: vec_const_pow_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT: retq
+define <4 x i32> @vec_const_uniform_pow_2(<4 x i32> %x) {
+; X86-LABEL: vec_const_uniform_pow_2:
+; X86: # %bb.0:
+; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: vec_const_uniform_pow_2:
+; X64: # %bb.0:
+; X64-NEXT: andps {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
%urem = urem <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
ret <4 x i32> %urem
}
+define <4 x i32> @vec_const_nonuniform_pow_2(<4 x i32> %x) {
+; X86-LABEL: vec_const_nonuniform_pow_2:
+; X86: # %bb.0:
+; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: vec_const_nonuniform_pow_2:
+; X64: # %bb.0:
+; X64-NEXT: andps {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+ %urem = urem <4 x i32> %x, <i32 2, i32 4, i32 8, i32 16>
+ ret <4 x i32> %urem
+}
diff --git a/test/CodeGen/X86/use-add-flags.ll b/test/CodeGen/X86/use-add-flags.ll
index da0002cc2520..37baef9fb2c1 100644
--- a/test/CodeGen/X86/use-add-flags.ll
+++ b/test/CodeGen/X86/use-add-flags.ll
@@ -1,18 +1,26 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=LNX
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s --check-prefix=WIN
; Reuse the flags value from the add instructions instead of emitting separate
; testl instructions.
; Use the flags on the add.
-; CHECK-LABEL: test1:
-; CHECK: addl
-; CHECK-NOT: test
-; CHECK: cmovnsl
-; CHECK: ret
-
define i32 @test1(i32* %x, i32 %y, i32 %a, i32 %b) nounwind {
+; LNX-LABEL: test1:
+; LNX: # %bb.0:
+; LNX-NEXT: addl (%rdi), %esi
+; LNX-NEXT: cmovnsl %ecx, %edx
+; LNX-NEXT: movl %edx, %eax
+; LNX-NEXT: retq
+;
+; WIN-LABEL: test1:
+; WIN: # %bb.0:
+; WIN-NEXT: addl (%rcx), %edx
+; WIN-NEXT: cmovnsl %r9d, %r8d
+; WIN-NEXT: movl %r8d, %eax
+; WIN-NEXT: retq
%tmp2 = load i32, i32* %x, align 4 ; <i32> [#uses=1]
%tmp4 = add i32 %tmp2, %y ; <i32> [#uses=1]
%tmp5 = icmp slt i32 %tmp4, 0 ; <i1> [#uses=1]
@@ -25,10 +33,28 @@ declare void @foo(i32)
; Don't use the flags result of the and here, since the and has no
; other use. A simple test is better.
-; CHECK-LABEL: test2:
-; CHECK: testb $16, {{%dil|%cl}}
-
define void @test2(i32 %x) nounwind {
+; LNX-LABEL: test2:
+; LNX: # %bb.0:
+; LNX-NEXT: testb $16, %dil
+; LNX-NEXT: jne .LBB1_2
+; LNX-NEXT: # %bb.1: # %true
+; LNX-NEXT: pushq %rax
+; LNX-NEXT: callq foo
+; LNX-NEXT: popq %rax
+; LNX-NEXT: .LBB1_2: # %false
+; LNX-NEXT: retq
+;
+; WIN-LABEL: test2:
+; WIN: # %bb.0:
+; WIN-NEXT: subq $40, %rsp
+; WIN-NEXT: testb $16, %cl
+; WIN-NEXT: jne .LBB1_2
+; WIN-NEXT: # %bb.1: # %true
+; WIN-NEXT: callq foo
+; WIN-NEXT: .LBB1_2: # %false
+; WIN-NEXT: addq $40, %rsp
+; WIN-NEXT: retq
%y = and i32 %x, 16
%t = icmp eq i32 %y, 0
br i1 %t, label %true, label %false
@@ -41,11 +67,28 @@ false:
; Do use the flags result of the and here, since the and has another use.
-; CHECK-LABEL: test3:
-; CHECK: andl $16, %e
-; CHECK-NEXT: jne
-
define void @test3(i32 %x) nounwind {
+; LNX-LABEL: test3:
+; LNX: # %bb.0:
+; LNX-NEXT: andl $16, %edi
+; LNX-NEXT: jne .LBB2_2
+; LNX-NEXT: # %bb.1: # %true
+; LNX-NEXT: pushq %rax
+; LNX-NEXT: callq foo
+; LNX-NEXT: popq %rax
+; LNX-NEXT: .LBB2_2: # %false
+; LNX-NEXT: retq
+;
+; WIN-LABEL: test3:
+; WIN: # %bb.0:
+; WIN-NEXT: subq $40, %rsp
+; WIN-NEXT: andl $16, %ecx
+; WIN-NEXT: jne .LBB2_2
+; WIN-NEXT: # %bb.1: # %true
+; WIN-NEXT: callq foo
+; WIN-NEXT: .LBB2_2: # %false
+; WIN-NEXT: addq $40, %rsp
+; WIN-NEXT: retq
%y = and i32 %x, 16
%t = icmp eq i32 %y, 0
br i1 %t, label %true, label %false
@@ -55,3 +98,4 @@ true:
false:
ret void
}
+
diff --git a/test/CodeGen/X86/utf8.ll b/test/CodeGen/X86/utf8.ll
index 67bc5ae2fd41..baf01a2cb764 100644
--- a/test/CodeGen/X86/utf8.ll
+++ b/test/CodeGen/X86/utf8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s
; CHECK: iΔ
@"i\CE\94" = common global i32 0, align 4
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index 3b7160c71869..cabefa46c50b 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -1,18 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -o - | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mcpu=yonah -march=x86 -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mcpu=yonah -mtriple=i386-linux-gnu -o - | FileCheck %s --check-prefix=X32
; PR7518
define void @test1(<2 x float> %Q, float *%P2) nounwind {
; X64-LABEL: test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X64-NEXT: addss %xmm0, %xmm1
; X64-NEXT: movss %xmm1, (%rdi)
; X64-NEXT: retq
;
; X32-LABEL: test1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X32-NEXT: addss %xmm0, %xmm1
@@ -27,12 +27,12 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind {
define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounwind {
; X64-LABEL: test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addps %xmm1, %xmm0
; X32-NEXT: retl
%Z = fadd <2 x float> %Q, %R
@@ -41,12 +41,12 @@ define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounw
define <2 x float> @test3(<4 x float> %A) nounwind {
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addps %xmm0, %xmm0
; X32-NEXT: retl
%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
@@ -56,12 +56,12 @@ define <2 x float> @test3(<4 x float> %A) nounwind {
define <2 x float> @test4(<2 x float> %A) nounwind {
; X64-LABEL: test4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addps %xmm0, %xmm0
; X32-NEXT: retl
%C = fadd <2 x float> %A, %A
@@ -70,13 +70,13 @@ define <2 x float> @test4(<2 x float> %A) nounwind {
define <4 x float> @test5(<4 x float> %A) nounwind {
; X64-LABEL: test5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: retq
;
; X32-LABEL: test5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: addps %xmm0, %xmm0
; X32-NEXT: addps %xmm0, %xmm0
; X32-NEXT: retl
diff --git a/test/CodeGen/X86/v4f32-immediate.ll b/test/CodeGen/X86/v4f32-immediate.ll
index 7945b1093f8e..cc73cd5a63a6 100644
--- a/test/CodeGen/X86/v4f32-immediate.ll
+++ b/test/CodeGen/X86/v4f32-immediate.ll
@@ -4,12 +4,12 @@
define <4 x float> @foo() {
; X32-LABEL: foo:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [3.223542e+00,2.300000e+00,1.200000e+00,1.000000e-01]
; X32-NEXT: retl
;
; X64-LABEL: foo:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [3.223542e+00,2.300000e+00,1.200000e+00,1.000000e-01]
; X64-NEXT: retq
ret <4 x float> <float 0x4009C9D0A0000000, float 0x4002666660000000, float 0x3FF3333340000000, float 0x3FB99999A0000000>
diff --git a/test/CodeGen/X86/v4i32load-crash.ll b/test/CodeGen/X86/v4i32load-crash.ll
index 8d019bc43d6b..359e89e11e1f 100644
--- a/test/CodeGen/X86/v4i32load-crash.ll
+++ b/test/CodeGen/X86/v4i32load-crash.ll
@@ -1,5 +1,5 @@
-; RUN: llc --march=x86 --mcpu=x86-64 --mattr=ssse3 < %s
-; RUN: llc --march=x86-64 --mcpu=x86-64 --mattr=ssse3 < %s
+; RUN: llc --mtriple=i686-- --mcpu=x86-64 --mattr=ssse3 < %s
+; RUN: llc --mtriple=x86_64-- --mcpu=x86-64 --mattr=ssse3 < %s
;PR18045:
;Issue of selection for 'v4i32 load'.
diff --git a/test/CodeGen/X86/v8i1-masks.ll b/test/CodeGen/X86/v8i1-masks.ll
index e378cf33dea4..5175850c734f 100644
--- a/test/CodeGen/X86/v8i1-masks.ll
+++ b/test/CodeGen/X86/v8i1-masks.ll
@@ -4,7 +4,7 @@
define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
; X32-LABEL: and_masks:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -20,7 +20,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
; X32-NEXT: retl
;
; X64-LABEL: and_masks:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovups (%rdi), %ymm0
; X64-NEXT: vmovups (%rsi), %ymm1
; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
@@ -44,7 +44,7 @@ define void @and_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp {
; X32-LABEL: neg_masks:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovups (%ecx), %ymm0
@@ -55,7 +55,7 @@ define void @neg_masks(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi
; X32-NEXT: retl
;
; X64-LABEL: neg_masks:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: vmovups (%rsi), %ymm0
; X64-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0
; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vaargs.ll b/test/CodeGen/X86/vaargs.ll
index 3767f41c2aae..7d27684c51c9 100644
--- a/test/CodeGen/X86/vaargs.ll
+++ b/test/CodeGen/X86/vaargs.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.9.0"
define i32 @sum(i32 %count, ...) nounwind optsize ssp uwtable {
; CHECK: testb %al, %al
; CHECK-NEXT: je
-; CHECK-NEXT: ## BB#{{[0-9]+}}:
+; CHECK-NEXT: ## %bb.{{[0-9]+}}:
; CHECK-NEXT: vmovaps %xmm0, 48(%rsp)
; CHECK-NEXT: vmovaps %xmm1, 64(%rsp)
; CHECK-NEXT: vmovaps %xmm2, 80(%rsp)
diff --git a/test/CodeGen/X86/vaes-intrinsics-avx-x86.ll b/test/CodeGen/X86/vaes-intrinsics-avx-x86.ll
new file mode 100644
index 000000000000..06acb27218e8
--- /dev/null
+++ b/test/CodeGen/X86/vaes-intrinsics-avx-x86.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+vaes -show-mc-encoding | FileCheck %s --check-prefix=VAES_AVX
+
+; {vaes, avx}
+define <4 x i64> @test_x86_aesni_aesenc_256(<4 x i64> %a0, <4 x i64> %a1) {
+; VAES_AVX-LABEL: test_x86_aesni_aesenc_256:
+; VAES_AVX: # %bb.0:
+; VAES_AVX-NEXT: vaesenc %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0xdc,0xc1]
+; VAES_AVX-NEXT: retl # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.aesni.aesenc.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.aesni.aesenc.256(<4 x i64>, <4 x i64>) nounwind readnone
diff --git a/test/CodeGen/X86/vaes-intrinsics-avx512-x86.ll b/test/CodeGen/X86/vaes-intrinsics-avx512-x86.ll
new file mode 100644
index 000000000000..b36400df16d5
--- /dev/null
+++ b/test/CodeGen/X86/vaes-intrinsics-avx512-x86.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+vaes,+avx512f -show-mc-encoding | FileCheck %s --check-prefix=VAES_AVX512
+
+define <8 x i64> @test_x86_aesni_aesenc_512(<8 x i64> %a0, <8 x i64> %a1) {
+; VAES_AVX512-LABEL: test_x86_aesni_aesenc_512:
+; VAES_AVX512: # %bb.0:
+; VAES_AVX512-NEXT: vaesenc %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xdc,0xc1]
+; VAES_AVX512-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x i64> @llvm.x86.aesni.aesenc.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.aesni.aesenc.512(<8 x i64>, <8 x i64>) nounwind readnone
+
+define <8 x i64> @test_x86_aesni_aesenclast_512(<8 x i64> %a0, <8 x i64> %a1) {
+; VAES_AVX512-LABEL: test_x86_aesni_aesenclast_512:
+; VAES_AVX512: # %bb.0:
+; VAES_AVX512-NEXT: vaesenclast %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xdd,0xc1]
+; VAES_AVX512-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x i64> @llvm.x86.aesni.aesenclast.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.aesni.aesenclast.512(<8 x i64>, <8 x i64>) nounwind readnone
+
+define <8 x i64> @test_x86_aesni_aesdec_512(<8 x i64> %a0, <8 x i64> %a1) {
+; VAES_AVX512-LABEL: test_x86_aesni_aesdec_512:
+; VAES_AVX512: # %bb.0:
+; VAES_AVX512-NEXT: vaesdec %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xde,0xc1]
+; VAES_AVX512-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x i64> @llvm.x86.aesni.aesdec.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.aesni.aesdec.512(<8 x i64>, <8 x i64>) nounwind readnone
+
+define <8 x i64> @test_x86_aesni_aesdeclast_512(<8 x i64> %a0, <8 x i64> %a1) {
+; VAES_AVX512-LABEL: test_x86_aesni_aesdeclast_512:
+; VAES_AVX512: # %bb.0:
+; VAES_AVX512-NEXT: vaesdeclast %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0xdf,0xc1]
+; VAES_AVX512-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x i64> @llvm.x86.aesni.aesdeclast.512(<8 x i64> %a0, <8 x i64> %a1)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.aesni.aesdeclast.512(<8 x i64>, <8 x i64>) nounwind readnone
+
diff --git a/test/CodeGen/X86/vaes-intrinsics-avx512vl-x86.ll b/test/CodeGen/X86/vaes-intrinsics-avx512vl-x86.ll
new file mode 100644
index 000000000000..79b3b7bfba56
--- /dev/null
+++ b/test/CodeGen/X86/vaes-intrinsics-avx512vl-x86.ll
@@ -0,0 +1,82 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+vaes,+avx512f,+avx512vl -show-mc-encoding | FileCheck %s --check-prefix=VAES_AVX512VL
+
+define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesenc:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesenc %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xdc,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_x86_aesni_aesenc_256(<4 x i64> %a0, <4 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesenc_256:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesenc %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xdc,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.aesni.aesenc.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.aesni.aesenc.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesenclast:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesenclast %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xdd,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_x86_aesni_aesenclast_256(<4 x i64> %a0, <4 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesenclast_256:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesenclast %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xdd,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.aesni.aesenclast.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.aesni.aesenclast.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesdec:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesdec %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xde,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_x86_aesni_aesdec_256(<4 x i64> %a0, <4 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesdec_256:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesdec %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xde,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.aesni.aesdec.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.aesni.aesdec.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesdeclast:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesdeclast %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xdf,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1)
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_x86_aesni_aesdeclast_256(<4 x i64> %a0, <4 x i64> %a1) {
+; VAES_AVX512VL-LABEL: test_x86_aesni_aesdeclast_256:
+; VAES_AVX512VL: # %bb.0:
+; VAES_AVX512VL-NEXT: vaesdeclast %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xdf,0xc1]
+; VAES_AVX512VL-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x i64> @llvm.x86.aesni.aesdeclast.256(<4 x i64> %a0, <4 x i64> %a1)
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.aesni.aesdeclast.256(<4 x i64>, <4 x i64>) nounwind readnone
+
diff --git a/test/CodeGen/X86/var-permute-128.ll b/test/CodeGen/X86/var-permute-128.ll
new file mode 100644
index 000000000000..fb5f02e8d5d2
--- /dev/null
+++ b/test/CodeGen/X86/var-permute-128.ll
@@ -0,0 +1,356 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVXNOVLBW,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVXNOVLBW,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW,VBMI
+
+define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
+; SSSE3-LABEL: var_shuffle_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %xmm1, %rax
+; SSSE3-NEXT: andl $1, %eax
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movq %xmm1, %rcx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: andl $1, %ecx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: retq
+ %index0 = extractelement <2 x i64> %indices, i32 0
+ %index1 = extractelement <2 x i64> %indices, i32 1
+ %v0 = extractelement <2 x i64> %v, i64 %index0
+ %v1 = extractelement <2 x i64> %v, i64 %index1
+ %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
+ %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
+ ret <2 x i64> %ret1
+}
+
+define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
+; SSSE3-LABEL: var_shuffle_v4i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movq %xmm2, %rax
+; SSSE3-NEXT: movq %rax, %rcx
+; SSSE3-NEXT: sarq $32, %rcx
+; SSSE3-NEXT: movq %xmm1, %rdx
+; SSSE3-NEXT: movq %rdx, %rsi
+; SSSE3-NEXT: sarq $32, %rsi
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: andl $3, %esi
+; SSSE3-NEXT: andl $3, %eax
+; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrq $1, %xmm1, %rax
+; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: sarq $32, %rcx
+; AVX-NEXT: vmovq %xmm1, %rdx
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: sarq $32, %rsi
+; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: andl $3, %esi
+; AVX-NEXT: andl $3, %eax
+; AVX-NEXT: andl $3, %ecx
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %index0 = extractelement <4 x i32> %indices, i32 0
+ %index1 = extractelement <4 x i32> %indices, i32 1
+ %index2 = extractelement <4 x i32> %indices, i32 2
+ %index3 = extractelement <4 x i32> %indices, i32 3
+ %v0 = extractelement <4 x i32> %v, i32 %index0
+ %v1 = extractelement <4 x i32> %v, i32 %index1
+ %v2 = extractelement <4 x i32> %v, i32 %index2
+ %v3 = extractelement <4 x i32> %v, i32 %index3
+ %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
+ %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
+ %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
+ %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
+ ret <4 x i32> %ret3
+}
+
+define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
+; SSSE3-LABEL: var_shuffle_v8i16:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd %xmm1, %r8d
+; SSSE3-NEXT: pextrw $1, %xmm1, %r9d
+; SSSE3-NEXT: pextrw $2, %xmm1, %r10d
+; SSSE3-NEXT: pextrw $3, %xmm1, %esi
+; SSSE3-NEXT: pextrw $4, %xmm1, %edi
+; SSSE3-NEXT: pextrw $5, %xmm1, %eax
+; SSSE3-NEXT: pextrw $6, %xmm1, %ecx
+; SSSE3-NEXT: pextrw $7, %xmm1, %edx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: andl $7, %r8d
+; SSSE3-NEXT: andl $7, %r9d
+; SSSE3-NEXT: andl $7, %r10d
+; SSSE3-NEXT: andl $7, %esi
+; SSSE3-NEXT: andl $7, %edi
+; SSSE3-NEXT: andl $7, %eax
+; SSSE3-NEXT: andl $7, %ecx
+; SSSE3-NEXT: andl $7, %edx
+; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %edx
+; SSSE3-NEXT: movd %edx, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm3
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-NEXT: retq
+;
+; AVXNOVLBW-LABEL: var_shuffle_v8i16:
+; AVXNOVLBW: # %bb.0:
+; AVXNOVLBW-NEXT: vmovd %xmm1, %eax
+; AVXNOVLBW-NEXT: vpextrw $1, %xmm1, %r10d
+; AVXNOVLBW-NEXT: vpextrw $2, %xmm1, %ecx
+; AVXNOVLBW-NEXT: vpextrw $3, %xmm1, %edx
+; AVXNOVLBW-NEXT: vpextrw $4, %xmm1, %esi
+; AVXNOVLBW-NEXT: vpextrw $5, %xmm1, %edi
+; AVXNOVLBW-NEXT: vpextrw $6, %xmm1, %r8d
+; AVXNOVLBW-NEXT: vpextrw $7, %xmm1, %r9d
+; AVXNOVLBW-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVXNOVLBW-NEXT: andl $7, %eax
+; AVXNOVLBW-NEXT: andl $7, %r10d
+; AVXNOVLBW-NEXT: andl $7, %ecx
+; AVXNOVLBW-NEXT: andl $7, %edx
+; AVXNOVLBW-NEXT: andl $7, %esi
+; AVXNOVLBW-NEXT: andl $7, %edi
+; AVXNOVLBW-NEXT: andl $7, %r8d
+; AVXNOVLBW-NEXT: andl $7, %r9d
+; AVXNOVLBW-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVXNOVLBW-NEXT: vmovd %eax, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $1, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $2, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $3, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $4, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $5, -24(%rsp,%rdi,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $6, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: vpinsrw $7, -24(%rsp,%r9,2), %xmm0, %xmm0
+; AVXNOVLBW-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v8i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermw %xmm0, %xmm1, %xmm0
+; AVX512VLBW-NEXT: retq
+ %index0 = extractelement <8 x i16> %indices, i32 0
+ %index1 = extractelement <8 x i16> %indices, i32 1
+ %index2 = extractelement <8 x i16> %indices, i32 2
+ %index3 = extractelement <8 x i16> %indices, i32 3
+ %index4 = extractelement <8 x i16> %indices, i32 4
+ %index5 = extractelement <8 x i16> %indices, i32 5
+ %index6 = extractelement <8 x i16> %indices, i32 6
+ %index7 = extractelement <8 x i16> %indices, i32 7
+ %v0 = extractelement <8 x i16> %v, i16 %index0
+ %v1 = extractelement <8 x i16> %v, i16 %index1
+ %v2 = extractelement <8 x i16> %v, i16 %index2
+ %v3 = extractelement <8 x i16> %v, i16 %index3
+ %v4 = extractelement <8 x i16> %v, i16 %index4
+ %v5 = extractelement <8 x i16> %v, i16 %index5
+ %v6 = extractelement <8 x i16> %v, i16 %index6
+ %v7 = extractelement <8 x i16> %v, i16 %index7
+ %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
+ %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
+ %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
+ %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
+ %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
+ %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
+ %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
+ %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
+ ret <8 x i16> %ret7
+}
+
+define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
+; SSSE3-LABEL: var_shuffle_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb %xmm0, %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %index0 = extractelement <16 x i8> %indices, i32 0
+ %index1 = extractelement <16 x i8> %indices, i32 1
+ %index2 = extractelement <16 x i8> %indices, i32 2
+ %index3 = extractelement <16 x i8> %indices, i32 3
+ %index4 = extractelement <16 x i8> %indices, i32 4
+ %index5 = extractelement <16 x i8> %indices, i32 5
+ %index6 = extractelement <16 x i8> %indices, i32 6
+ %index7 = extractelement <16 x i8> %indices, i32 7
+ %index8 = extractelement <16 x i8> %indices, i32 8
+ %index9 = extractelement <16 x i8> %indices, i32 9
+ %index10 = extractelement <16 x i8> %indices, i32 10
+ %index11 = extractelement <16 x i8> %indices, i32 11
+ %index12 = extractelement <16 x i8> %indices, i32 12
+ %index13 = extractelement <16 x i8> %indices, i32 13
+ %index14 = extractelement <16 x i8> %indices, i32 14
+ %index15 = extractelement <16 x i8> %indices, i32 15
+ %v0 = extractelement <16 x i8> %v, i8 %index0
+ %v1 = extractelement <16 x i8> %v, i8 %index1
+ %v2 = extractelement <16 x i8> %v, i8 %index2
+ %v3 = extractelement <16 x i8> %v, i8 %index3
+ %v4 = extractelement <16 x i8> %v, i8 %index4
+ %v5 = extractelement <16 x i8> %v, i8 %index5
+ %v6 = extractelement <16 x i8> %v, i8 %index6
+ %v7 = extractelement <16 x i8> %v, i8 %index7
+ %v8 = extractelement <16 x i8> %v, i8 %index8
+ %v9 = extractelement <16 x i8> %v, i8 %index9
+ %v10 = extractelement <16 x i8> %v, i8 %index10
+ %v11 = extractelement <16 x i8> %v, i8 %index11
+ %v12 = extractelement <16 x i8> %v, i8 %index12
+ %v13 = extractelement <16 x i8> %v, i8 %index13
+ %v14 = extractelement <16 x i8> %v, i8 %index14
+ %v15 = extractelement <16 x i8> %v, i8 %index15
+ %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
+ %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
+ %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
+ %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
+ %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
+ %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
+ %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
+ %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
+ %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
+ %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
+ %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
+ %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
+ %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
+ %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
+ %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
+ %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
+ ret <16 x i8> %ret15
+}
+
+define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
+; SSSE3-LABEL: var_shuffle_v2f64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %xmm1, %rax
+; SSSE3-NEXT: andl $1, %eax
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movq %xmm1, %rcx
+; SSSE3-NEXT: andl $1, %ecx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovq %xmm1, %rax
+; AVX-NEXT: andl $1, %eax
+; AVX-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX-NEXT: andl $1, %ecx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: retq
+ %index0 = extractelement <2 x i64> %indices, i32 0
+ %index1 = extractelement <2 x i64> %indices, i32 1
+ %v0 = extractelement <2 x double> %v, i64 %index0
+ %v1 = extractelement <2 x double> %v, i64 %index1
+ %ret0 = insertelement <2 x double> undef, double %v0, i32 0
+ %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
+ ret <2 x double> %ret1
+}
+
+define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
+; SSSE3-LABEL: var_shuffle_v4f32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSSE3-NEXT: movq %xmm2, %rax
+; SSSE3-NEXT: movq %rax, %rcx
+; SSSE3-NEXT: sarq $32, %rcx
+; SSSE3-NEXT: movq %xmm1, %rdx
+; SSSE3-NEXT: movq %rdx, %rsi
+; SSSE3-NEXT: sarq $32, %rsi
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: andl $3, %esi
+; SSSE3-NEXT: andl $3, %eax
+; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; AVX-LABEL: var_shuffle_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpextrq $1, %xmm1, %rax
+; AVX-NEXT: movq %rax, %rcx
+; AVX-NEXT: sarq $32, %rcx
+; AVX-NEXT: vmovq %xmm1, %rdx
+; AVX-NEXT: movq %rdx, %rsi
+; AVX-NEXT: sarq $32, %rsi
+; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: andl $3, %esi
+; AVX-NEXT: andl $3, %eax
+; AVX-NEXT: andl $3, %ecx
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT: retq
+ %index0 = extractelement <4 x i32> %indices, i32 0
+ %index1 = extractelement <4 x i32> %indices, i32 1
+ %index2 = extractelement <4 x i32> %indices, i32 2
+ %index3 = extractelement <4 x i32> %indices, i32 3
+ %v0 = extractelement <4 x float> %v, i32 %index0
+ %v1 = extractelement <4 x float> %v, i32 %index1
+ %v2 = extractelement <4 x float> %v, i32 %index2
+ %v3 = extractelement <4 x float> %v, i32 %index3
+ %ret0 = insertelement <4 x float> undef, float %v0, i32 0
+ %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
+ %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
+ %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
+ ret <4 x float> %ret3
+}
diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll
new file mode 100644
index 000000000000..82a790298f23
--- /dev/null
+++ b/test/CodeGen/X86/var-permute-256.ll
@@ -0,0 +1,1285 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,INT256,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,INT256,AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,INT256,AVX512,AVX512VLBW,VBMI
+
+define <4 x i64> @var_shuffle_v4i64(<4 x i64> %v, <4 x i64> %indices) nounwind {
+; AVX1-LABEL: var_shuffle_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: andl $3, %eax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rdx
+; AVX1-NEXT: andl $3, %edx
+; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT: andl $3, %esi
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4i64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4i64:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+ %index0 = extractelement <4 x i64> %indices, i32 0
+ %index1 = extractelement <4 x i64> %indices, i32 1
+ %index2 = extractelement <4 x i64> %indices, i32 2
+ %index3 = extractelement <4 x i64> %indices, i32 3
+ %v0 = extractelement <4 x i64> %v, i64 %index0
+ %v1 = extractelement <4 x i64> %v, i64 %index1
+ %v2 = extractelement <4 x i64> %v, i64 %index2
+ %v3 = extractelement <4 x i64> %v, i64 %index3
+ %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
+ %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
+ %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
+ %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
+ ret <4 x i64> %ret3
+}
+
+define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
+; AVX1-LABEL: var_shuffle_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vpextrq $1, %xmm1, %r8
+; AVX1-NEXT: movq %r8, %rcx
+; AVX1-NEXT: shrq $30, %rcx
+; AVX1-NEXT: vmovq %xmm1, %r9
+; AVX1-NEXT: movq %r9, %rsi
+; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %r10
+; AVX1-NEXT: movq %r10, %rdi
+; AVX1-NEXT: shrq $30, %rdi
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: andl $28, %esi
+; AVX1-NEXT: andl $7, %r8d
+; AVX1-NEXT: andl $28, %ecx
+; AVX1-NEXT: andl $7, %eax
+; AVX1-NEXT: andl $28, %edx
+; AVX1-NEXT: andl $7, %r10d
+; AVX1-NEXT: andl $28, %edi
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: movq %rsp, %rax
+; AVX1-NEXT: vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; INT256-LABEL: var_shuffle_v8i32:
+; INT256: # %bb.0:
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; INT256-NEXT: retq
+ %index0 = extractelement <8 x i32> %indices, i32 0
+ %index1 = extractelement <8 x i32> %indices, i32 1
+ %index2 = extractelement <8 x i32> %indices, i32 2
+ %index3 = extractelement <8 x i32> %indices, i32 3
+ %index4 = extractelement <8 x i32> %indices, i32 4
+ %index5 = extractelement <8 x i32> %indices, i32 5
+ %index6 = extractelement <8 x i32> %indices, i32 6
+ %index7 = extractelement <8 x i32> %indices, i32 7
+ %v0 = extractelement <8 x i32> %v, i32 %index0
+ %v1 = extractelement <8 x i32> %v, i32 %index1
+ %v2 = extractelement <8 x i32> %v, i32 %index2
+ %v3 = extractelement <8 x i32> %v, i32 %index3
+ %v4 = extractelement <8 x i32> %v, i32 %index4
+ %v5 = extractelement <8 x i32> %v, i32 %index5
+ %v6 = extractelement <8 x i32> %v, i32 %index6
+ %v7 = extractelement <8 x i32> %v, i32 %index7
+ %ret0 = insertelement <8 x i32> undef, i32 %v0, i32 0
+ %ret1 = insertelement <8 x i32> %ret0, i32 %v1, i32 1
+ %ret2 = insertelement <8 x i32> %ret1, i32 %v2, i32 2
+ %ret3 = insertelement <8 x i32> %ret2, i32 %v3, i32 3
+ %ret4 = insertelement <8 x i32> %ret3, i32 %v4, i32 4
+ %ret5 = insertelement <8 x i32> %ret4, i32 %v5, i32 5
+ %ret6 = insertelement <8 x i32> %ret5, i32 %v6, i32 6
+ %ret7 = insertelement <8 x i32> %ret6, i32 %v7, i32 7
+ ret <8 x i32> %ret7
+}
+
+define <16 x i16> @var_shuffle_v16i16(<16 x i16> %v, <16 x i16> %indices) nounwind {
+; AVX1-LABEL: var_shuffle_v16i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovd %xmm2, %eax
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpextrw $1, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $2, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $3, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $4, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $5, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $6, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpextrw $7, %xmm2, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vmovd %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vpextrw $1, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX1-NEXT: vpextrw $2, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX1-NEXT: vpextrw $3, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX1-NEXT: vpextrw $4, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX1-NEXT: vpextrw $5, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX1-NEXT: vpextrw $6, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX1-NEXT: vpextrw $7, %xmm1, %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v16i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vmovd %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrw $1, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $2, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $3, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $4, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $5, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $6, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpextrw $7, %xmm2, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vmovd %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrw $1, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $2, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $3, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $4, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $5, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $6, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX2-NEXT: vpextrw $7, %xmm1, %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vmovd %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512F-NEXT: andl $15, %eax
+; AVX512F-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v16i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vmovd %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: movzwl (%rsp,%rax,2), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $15, %eax
+; AVX512VL-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v16i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+ %index0 = extractelement <16 x i16> %indices, i32 0
+ %index1 = extractelement <16 x i16> %indices, i32 1
+ %index2 = extractelement <16 x i16> %indices, i32 2
+ %index3 = extractelement <16 x i16> %indices, i32 3
+ %index4 = extractelement <16 x i16> %indices, i32 4
+ %index5 = extractelement <16 x i16> %indices, i32 5
+ %index6 = extractelement <16 x i16> %indices, i32 6
+ %index7 = extractelement <16 x i16> %indices, i32 7
+ %index8 = extractelement <16 x i16> %indices, i32 8
+ %index9 = extractelement <16 x i16> %indices, i32 9
+ %index10 = extractelement <16 x i16> %indices, i32 10
+ %index11 = extractelement <16 x i16> %indices, i32 11
+ %index12 = extractelement <16 x i16> %indices, i32 12
+ %index13 = extractelement <16 x i16> %indices, i32 13
+ %index14 = extractelement <16 x i16> %indices, i32 14
+ %index15 = extractelement <16 x i16> %indices, i32 15
+ %v0 = extractelement <16 x i16> %v, i16 %index0
+ %v1 = extractelement <16 x i16> %v, i16 %index1
+ %v2 = extractelement <16 x i16> %v, i16 %index2
+ %v3 = extractelement <16 x i16> %v, i16 %index3
+ %v4 = extractelement <16 x i16> %v, i16 %index4
+ %v5 = extractelement <16 x i16> %v, i16 %index5
+ %v6 = extractelement <16 x i16> %v, i16 %index6
+ %v7 = extractelement <16 x i16> %v, i16 %index7
+ %v8 = extractelement <16 x i16> %v, i16 %index8
+ %v9 = extractelement <16 x i16> %v, i16 %index9
+ %v10 = extractelement <16 x i16> %v, i16 %index10
+ %v11 = extractelement <16 x i16> %v, i16 %index11
+ %v12 = extractelement <16 x i16> %v, i16 %index12
+ %v13 = extractelement <16 x i16> %v, i16 %index13
+ %v14 = extractelement <16 x i16> %v, i16 %index14
+ %v15 = extractelement <16 x i16> %v, i16 %index15
+ %ret0 = insertelement <16 x i16> undef, i16 %v0, i32 0
+ %ret1 = insertelement <16 x i16> %ret0, i16 %v1, i32 1
+ %ret2 = insertelement <16 x i16> %ret1, i16 %v2, i32 2
+ %ret3 = insertelement <16 x i16> %ret2, i16 %v3, i32 3
+ %ret4 = insertelement <16 x i16> %ret3, i16 %v4, i32 4
+ %ret5 = insertelement <16 x i16> %ret4, i16 %v5, i32 5
+ %ret6 = insertelement <16 x i16> %ret5, i16 %v6, i32 6
+ %ret7 = insertelement <16 x i16> %ret6, i16 %v7, i32 7
+ %ret8 = insertelement <16 x i16> %ret7, i16 %v8, i32 8
+ %ret9 = insertelement <16 x i16> %ret8, i16 %v9, i32 9
+ %ret10 = insertelement <16 x i16> %ret9, i16 %v10, i32 10
+ %ret11 = insertelement <16 x i16> %ret10, i16 %v11, i32 11
+ %ret12 = insertelement <16 x i16> %ret11, i16 %v12, i32 12
+ %ret13 = insertelement <16 x i16> %ret12, i16 %v13, i32 13
+ %ret14 = insertelement <16 x i16> %ret13, i16 %v14, i32 14
+ %ret15 = insertelement <16 x i16> %ret14, i16 %v15, i32 15
+ ret <16 x i16> %ret15
+}
+
+define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {
+; AVX1-LABEL: var_shuffle_v32i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpextrb $0, %xmm2, %eax
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movq %rsp, %rcx
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpextrb $1, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $2, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $3, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $4, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $5, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $6, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $7, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $8, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $9, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $10, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $11, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $12, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $13, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $14, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $15, %xmm2, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpextrb $0, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vmovd %eax, %xmm2
+; AVX1-NEXT: vpextrb $1, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $2, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $3, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $4, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $5, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $6, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $7, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $8, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $9, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $10, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $11, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $12, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $13, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $14, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX1-NEXT: vpextrb $15, %xmm1, %eax
+; AVX1-NEXT: andl $31, %eax
+; AVX1-NEXT: movzbl (%rax,%rcx), %eax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v32i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrb $0, %xmm2, %eax
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movq %rsp, %rcx
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpextrb $1, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $2, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $3, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $4, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $5, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $6, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $7, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $8, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $9, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $10, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $11, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $12, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $13, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $14, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $15, %xmm2, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpextrb $0, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpextrb $1, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $2, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $3, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $4, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $5, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $6, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $7, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $8, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $9, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $10, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $11, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $12, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $13, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $14, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX2-NEXT: vpextrb $15, %xmm1, %eax
+; AVX2-NEXT: andl $31, %eax
+; AVX2-NEXT: movzbl (%rax,%rcx), %eax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v32i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movq %rsp, %rcx
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512F-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512F-NEXT: andl $31, %eax
+; AVX512F-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v32i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $64, %rsp
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512VL-NEXT: vpextrb $0, %xmm2, %eax
+; AVX512VL-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movq %rsp, %rcx
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm0
+; AVX512VL-NEXT: vpextrb $1, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $2, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $3, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $4, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $5, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $6, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $7, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $8, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $9, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $10, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $11, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $12, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $13, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $14, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $15, %xmm2, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrb $0, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vmovd %eax, %xmm2
+; AVX512VL-NEXT: vpextrb $1, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $2, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $3, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $4, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $5, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $6, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $7, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $8, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $9, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $10, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $11, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $12, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $13, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $14, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2
+; AVX512VL-NEXT: vpextrb $15, %xmm1, %eax
+; AVX512VL-NEXT: andl $31, %eax
+; AVX512VL-NEXT: movzbl (%rax,%rcx), %eax
+; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: retq
+;
+; VBMI-LABEL: var_shuffle_v32i8:
+; VBMI: # %bb.0:
+; VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; VBMI-NEXT: retq
+ %index0 = extractelement <32 x i8> %indices, i32 0
+ %index1 = extractelement <32 x i8> %indices, i32 1
+ %index2 = extractelement <32 x i8> %indices, i32 2
+ %index3 = extractelement <32 x i8> %indices, i32 3
+ %index4 = extractelement <32 x i8> %indices, i32 4
+ %index5 = extractelement <32 x i8> %indices, i32 5
+ %index6 = extractelement <32 x i8> %indices, i32 6
+ %index7 = extractelement <32 x i8> %indices, i32 7
+ %index8 = extractelement <32 x i8> %indices, i32 8
+ %index9 = extractelement <32 x i8> %indices, i32 9
+ %index10 = extractelement <32 x i8> %indices, i32 10
+ %index11 = extractelement <32 x i8> %indices, i32 11
+ %index12 = extractelement <32 x i8> %indices, i32 12
+ %index13 = extractelement <32 x i8> %indices, i32 13
+ %index14 = extractelement <32 x i8> %indices, i32 14
+ %index15 = extractelement <32 x i8> %indices, i32 15
+ %index16 = extractelement <32 x i8> %indices, i32 16
+ %index17 = extractelement <32 x i8> %indices, i32 17
+ %index18 = extractelement <32 x i8> %indices, i32 18
+ %index19 = extractelement <32 x i8> %indices, i32 19
+ %index20 = extractelement <32 x i8> %indices, i32 20
+ %index21 = extractelement <32 x i8> %indices, i32 21
+ %index22 = extractelement <32 x i8> %indices, i32 22
+ %index23 = extractelement <32 x i8> %indices, i32 23
+ %index24 = extractelement <32 x i8> %indices, i32 24
+ %index25 = extractelement <32 x i8> %indices, i32 25
+ %index26 = extractelement <32 x i8> %indices, i32 26
+ %index27 = extractelement <32 x i8> %indices, i32 27
+ %index28 = extractelement <32 x i8> %indices, i32 28
+ %index29 = extractelement <32 x i8> %indices, i32 29
+ %index30 = extractelement <32 x i8> %indices, i32 30
+ %index31 = extractelement <32 x i8> %indices, i32 31
+ %v0 = extractelement <32 x i8> %v, i8 %index0
+ %v1 = extractelement <32 x i8> %v, i8 %index1
+ %v2 = extractelement <32 x i8> %v, i8 %index2
+ %v3 = extractelement <32 x i8> %v, i8 %index3
+ %v4 = extractelement <32 x i8> %v, i8 %index4
+ %v5 = extractelement <32 x i8> %v, i8 %index5
+ %v6 = extractelement <32 x i8> %v, i8 %index6
+ %v7 = extractelement <32 x i8> %v, i8 %index7
+ %v8 = extractelement <32 x i8> %v, i8 %index8
+ %v9 = extractelement <32 x i8> %v, i8 %index9
+ %v10 = extractelement <32 x i8> %v, i8 %index10
+ %v11 = extractelement <32 x i8> %v, i8 %index11
+ %v12 = extractelement <32 x i8> %v, i8 %index12
+ %v13 = extractelement <32 x i8> %v, i8 %index13
+ %v14 = extractelement <32 x i8> %v, i8 %index14
+ %v15 = extractelement <32 x i8> %v, i8 %index15
+ %v16 = extractelement <32 x i8> %v, i8 %index16
+ %v17 = extractelement <32 x i8> %v, i8 %index17
+ %v18 = extractelement <32 x i8> %v, i8 %index18
+ %v19 = extractelement <32 x i8> %v, i8 %index19
+ %v20 = extractelement <32 x i8> %v, i8 %index20
+ %v21 = extractelement <32 x i8> %v, i8 %index21
+ %v22 = extractelement <32 x i8> %v, i8 %index22
+ %v23 = extractelement <32 x i8> %v, i8 %index23
+ %v24 = extractelement <32 x i8> %v, i8 %index24
+ %v25 = extractelement <32 x i8> %v, i8 %index25
+ %v26 = extractelement <32 x i8> %v, i8 %index26
+ %v27 = extractelement <32 x i8> %v, i8 %index27
+ %v28 = extractelement <32 x i8> %v, i8 %index28
+ %v29 = extractelement <32 x i8> %v, i8 %index29
+ %v30 = extractelement <32 x i8> %v, i8 %index30
+ %v31 = extractelement <32 x i8> %v, i8 %index31
+ %ret0 = insertelement <32 x i8> undef, i8 %v0, i32 0
+ %ret1 = insertelement <32 x i8> %ret0, i8 %v1, i32 1
+ %ret2 = insertelement <32 x i8> %ret1, i8 %v2, i32 2
+ %ret3 = insertelement <32 x i8> %ret2, i8 %v3, i32 3
+ %ret4 = insertelement <32 x i8> %ret3, i8 %v4, i32 4
+ %ret5 = insertelement <32 x i8> %ret4, i8 %v5, i32 5
+ %ret6 = insertelement <32 x i8> %ret5, i8 %v6, i32 6
+ %ret7 = insertelement <32 x i8> %ret6, i8 %v7, i32 7
+ %ret8 = insertelement <32 x i8> %ret7, i8 %v8, i32 8
+ %ret9 = insertelement <32 x i8> %ret8, i8 %v9, i32 9
+ %ret10 = insertelement <32 x i8> %ret9, i8 %v10, i32 10
+ %ret11 = insertelement <32 x i8> %ret10, i8 %v11, i32 11
+ %ret12 = insertelement <32 x i8> %ret11, i8 %v12, i32 12
+ %ret13 = insertelement <32 x i8> %ret12, i8 %v13, i32 13
+ %ret14 = insertelement <32 x i8> %ret13, i8 %v14, i32 14
+ %ret15 = insertelement <32 x i8> %ret14, i8 %v15, i32 15
+ %ret16 = insertelement <32 x i8> %ret15, i8 %v16, i32 16
+ %ret17 = insertelement <32 x i8> %ret16, i8 %v17, i32 17
+ %ret18 = insertelement <32 x i8> %ret17, i8 %v18, i32 18
+ %ret19 = insertelement <32 x i8> %ret18, i8 %v19, i32 19
+ %ret20 = insertelement <32 x i8> %ret19, i8 %v20, i32 20
+ %ret21 = insertelement <32 x i8> %ret20, i8 %v21, i32 21
+ %ret22 = insertelement <32 x i8> %ret21, i8 %v22, i32 22
+ %ret23 = insertelement <32 x i8> %ret22, i8 %v23, i32 23
+ %ret24 = insertelement <32 x i8> %ret23, i8 %v24, i32 24
+ %ret25 = insertelement <32 x i8> %ret24, i8 %v25, i32 25
+ %ret26 = insertelement <32 x i8> %ret25, i8 %v26, i32 26
+ %ret27 = insertelement <32 x i8> %ret26, i8 %v27, i32 27
+ %ret28 = insertelement <32 x i8> %ret27, i8 %v28, i32 28
+ %ret29 = insertelement <32 x i8> %ret28, i8 %v29, i32 29
+ %ret30 = insertelement <32 x i8> %ret29, i8 %v30, i32 30
+ %ret31 = insertelement <32 x i8> %ret30, i8 %v31, i32 31
+ ret <32 x i8> %ret31
+}
+
+define <4 x double> @var_shuffle_v4f64(<4 x double> %v, <4 x i64> %indices) nounwind {
+; AVX1-LABEL: var_shuffle_v4f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: andl $3, %eax
+; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rdx
+; AVX1-NEXT: andl $3, %edx
+; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT: andl $3, %esi
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: movq %rsp, %rbp
+; AVX2-NEXT: andq $-32, %rsp
+; AVX2-NEXT: subq $64, %rsp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: andl $3, %eax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: andl $3, %esi
+; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: movq %rbp, %rsp
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: var_shuffle_v4f64:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %rbp
+; AVX512F-NEXT: movq %rsp, %rbp
+; AVX512F-NEXT: andq $-32, %rsp
+; AVX512F-NEXT: subq $64, %rsp
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: andl $3, %eax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: andl $3, %ecx
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rdx
+; AVX512F-NEXT: andl $3, %edx
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512F-NEXT: andl $3, %esi
+; AVX512F-NEXT: vmovaps %ymm0, (%rsp)
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: movq %rbp, %rsp
+; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: var_shuffle_v4f64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLBW-LABEL: var_shuffle_v4f64:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpermpd %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
+ %index0 = extractelement <4 x i64> %indices, i32 0
+ %index1 = extractelement <4 x i64> %indices, i32 1
+ %index2 = extractelement <4 x i64> %indices, i32 2
+ %index3 = extractelement <4 x i64> %indices, i32 3
+ %v0 = extractelement <4 x double> %v, i64 %index0
+ %v1 = extractelement <4 x double> %v, i64 %index1
+ %v2 = extractelement <4 x double> %v, i64 %index2
+ %v3 = extractelement <4 x double> %v, i64 %index3
+ %ret0 = insertelement <4 x double> undef, double %v0, i32 0
+ %ret1 = insertelement <4 x double> %ret0, double %v1, i32 1
+ %ret2 = insertelement <4 x double> %ret1, double %v2, i32 2
+ %ret3 = insertelement <4 x double> %ret2, double %v3, i32 3
+ ret <4 x double> %ret3
+}
+
+define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwind {
+; AVX1-LABEL: var_shuffle_v8f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbp
+; AVX1-NEXT: movq %rsp, %rbp
+; AVX1-NEXT: andq $-32, %rsp
+; AVX1-NEXT: subq $64, %rsp
+; AVX1-NEXT: vpextrq $1, %xmm1, %r8
+; AVX1-NEXT: movq %r8, %rcx
+; AVX1-NEXT: shrq $30, %rcx
+; AVX1-NEXT: vmovq %xmm1, %r9
+; AVX1-NEXT: movq %r9, %rdx
+; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm1, %r10
+; AVX1-NEXT: movq %r10, %rdi
+; AVX1-NEXT: shrq $30, %rdi
+; AVX1-NEXT: vmovq %xmm1, %rax
+; AVX1-NEXT: movq %rax, %rsi
+; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: andl $28, %edx
+; AVX1-NEXT: andl $7, %r8d
+; AVX1-NEXT: andl $28, %ecx
+; AVX1-NEXT: andl $7, %eax
+; AVX1-NEXT: andl $28, %esi
+; AVX1-NEXT: andl $7, %r10d
+; AVX1-NEXT: andl $28, %edi
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: movq %rsp, %rax
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: movq %rbp, %rsp
+; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: retq
+;
+; INT256-LABEL: var_shuffle_v8f32:
+; INT256: # %bb.0:
+; INT256-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; INT256-NEXT: retq
+ %index0 = extractelement <8 x i32> %indices, i32 0
+ %index1 = extractelement <8 x i32> %indices, i32 1
+ %index2 = extractelement <8 x i32> %indices, i32 2
+ %index3 = extractelement <8 x i32> %indices, i32 3
+ %index4 = extractelement <8 x i32> %indices, i32 4
+ %index5 = extractelement <8 x i32> %indices, i32 5
+ %index6 = extractelement <8 x i32> %indices, i32 6
+ %index7 = extractelement <8 x i32> %indices, i32 7
+ %v0 = extractelement <8 x float> %v, i32 %index0
+ %v1 = extractelement <8 x float> %v, i32 %index1
+ %v2 = extractelement <8 x float> %v, i32 %index2
+ %v3 = extractelement <8 x float> %v, i32 %index3
+ %v4 = extractelement <8 x float> %v, i32 %index4
+ %v5 = extractelement <8 x float> %v, i32 %index5
+ %v6 = extractelement <8 x float> %v, i32 %index6
+ %v7 = extractelement <8 x float> %v, i32 %index7
+ %ret0 = insertelement <8 x float> undef, float %v0, i32 0
+ %ret1 = insertelement <8 x float> %ret0, float %v1, i32 1
+ %ret2 = insertelement <8 x float> %ret1, float %v2, i32 2
+ %ret3 = insertelement <8 x float> %ret2, float %v3, i32 3
+ %ret4 = insertelement <8 x float> %ret3, float %v4, i32 4
+ %ret5 = insertelement <8 x float> %ret4, float %v5, i32 5
+ %ret6 = insertelement <8 x float> %ret5, float %v6, i32 6
+ %ret7 = insertelement <8 x float> %ret6, float %v7, i32 7
+ ret <8 x float> %ret7
+}
diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll
new file mode 100644
index 000000000000..a5aa73cdf1a2
--- /dev/null
+++ b/test/CodeGen/X86/var-permute-512.ll
@@ -0,0 +1,1064 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,NOBW,NOVBMI,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,NOVBMI,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=AVX512,AVX512BW,VBMI
+
+define <8 x i64> @var_shuffle_v8i64(<8 x i64> %v, <8 x i64> %indices) nounwind {
+; AVX512-LABEL: var_shuffle_v8i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %index0 = extractelement <8 x i64> %indices, i32 0
+ %index1 = extractelement <8 x i64> %indices, i32 1
+ %index2 = extractelement <8 x i64> %indices, i32 2
+ %index3 = extractelement <8 x i64> %indices, i32 3
+ %index4 = extractelement <8 x i64> %indices, i32 4
+ %index5 = extractelement <8 x i64> %indices, i32 5
+ %index6 = extractelement <8 x i64> %indices, i32 6
+ %index7 = extractelement <8 x i64> %indices, i32 7
+ %v0 = extractelement <8 x i64> %v, i64 %index0
+ %v1 = extractelement <8 x i64> %v, i64 %index1
+ %v2 = extractelement <8 x i64> %v, i64 %index2
+ %v3 = extractelement <8 x i64> %v, i64 %index3
+ %v4 = extractelement <8 x i64> %v, i64 %index4
+ %v5 = extractelement <8 x i64> %v, i64 %index5
+ %v6 = extractelement <8 x i64> %v, i64 %index6
+ %v7 = extractelement <8 x i64> %v, i64 %index7
+ %ret0 = insertelement <8 x i64> undef, i64 %v0, i32 0
+ %ret1 = insertelement <8 x i64> %ret0, i64 %v1, i32 1
+ %ret2 = insertelement <8 x i64> %ret1, i64 %v2, i32 2
+ %ret3 = insertelement <8 x i64> %ret2, i64 %v3, i32 3
+ %ret4 = insertelement <8 x i64> %ret3, i64 %v4, i32 4
+ %ret5 = insertelement <8 x i64> %ret4, i64 %v5, i32 5
+ %ret6 = insertelement <8 x i64> %ret5, i64 %v6, i32 6
+ %ret7 = insertelement <8 x i64> %ret6, i64 %v7, i32 7
+ ret <8 x i64> %ret7
+}
+
+define <16 x i32> @var_shuffle_v16i32(<16 x i32> %v, <16 x i32> %indices) nounwind {
+; AVX512-LABEL: var_shuffle_v16i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %index0 = extractelement <16 x i32> %indices, i32 0
+ %index1 = extractelement <16 x i32> %indices, i32 1
+ %index2 = extractelement <16 x i32> %indices, i32 2
+ %index3 = extractelement <16 x i32> %indices, i32 3
+ %index4 = extractelement <16 x i32> %indices, i32 4
+ %index5 = extractelement <16 x i32> %indices, i32 5
+ %index6 = extractelement <16 x i32> %indices, i32 6
+ %index7 = extractelement <16 x i32> %indices, i32 7
+ %index8 = extractelement <16 x i32> %indices, i32 8
+ %index9 = extractelement <16 x i32> %indices, i32 9
+ %index10 = extractelement <16 x i32> %indices, i32 10
+ %index11 = extractelement <16 x i32> %indices, i32 11
+ %index12 = extractelement <16 x i32> %indices, i32 12
+ %index13 = extractelement <16 x i32> %indices, i32 13
+ %index14 = extractelement <16 x i32> %indices, i32 14
+ %index15 = extractelement <16 x i32> %indices, i32 15
+ %v0 = extractelement <16 x i32> %v, i32 %index0
+ %v1 = extractelement <16 x i32> %v, i32 %index1
+ %v2 = extractelement <16 x i32> %v, i32 %index2
+ %v3 = extractelement <16 x i32> %v, i32 %index3
+ %v4 = extractelement <16 x i32> %v, i32 %index4
+ %v5 = extractelement <16 x i32> %v, i32 %index5
+ %v6 = extractelement <16 x i32> %v, i32 %index6
+ %v7 = extractelement <16 x i32> %v, i32 %index7
+ %v8 = extractelement <16 x i32> %v, i32 %index8
+ %v9 = extractelement <16 x i32> %v, i32 %index9
+ %v10 = extractelement <16 x i32> %v, i32 %index10
+ %v11 = extractelement <16 x i32> %v, i32 %index11
+ %v12 = extractelement <16 x i32> %v, i32 %index12
+ %v13 = extractelement <16 x i32> %v, i32 %index13
+ %v14 = extractelement <16 x i32> %v, i32 %index14
+ %v15 = extractelement <16 x i32> %v, i32 %index15
+ %ret0 = insertelement <16 x i32> undef, i32 %v0, i32 0
+ %ret1 = insertelement <16 x i32> %ret0, i32 %v1, i32 1
+ %ret2 = insertelement <16 x i32> %ret1, i32 %v2, i32 2
+ %ret3 = insertelement <16 x i32> %ret2, i32 %v3, i32 3
+ %ret4 = insertelement <16 x i32> %ret3, i32 %v4, i32 4
+ %ret5 = insertelement <16 x i32> %ret4, i32 %v5, i32 5
+ %ret6 = insertelement <16 x i32> %ret5, i32 %v6, i32 6
+ %ret7 = insertelement <16 x i32> %ret6, i32 %v7, i32 7
+ %ret8 = insertelement <16 x i32> %ret7, i32 %v8, i32 8
+ %ret9 = insertelement <16 x i32> %ret8, i32 %v9, i32 9
+ %ret10 = insertelement <16 x i32> %ret9, i32 %v10, i32 10
+ %ret11 = insertelement <16 x i32> %ret10, i32 %v11, i32 11
+ %ret12 = insertelement <16 x i32> %ret11, i32 %v12, i32 12
+ %ret13 = insertelement <16 x i32> %ret12, i32 %v13, i32 13
+ %ret14 = insertelement <16 x i32> %ret13, i32 %v14, i32 14
+ %ret15 = insertelement <16 x i32> %ret14, i32 %v15, i32 15
+ ret <16 x i32> %ret15
+}
+
+define <32 x i16> @var_shuffle_v32i16(<32 x i16> %v, <32 x i16> %indices) nounwind {
+; NOBW-LABEL: var_shuffle_v32i16:
+; NOBW: # %bb.0:
+; NOBW-NEXT: pushq %rbp
+; NOBW-NEXT: movq %rsp, %rbp
+; NOBW-NEXT: andq $-64, %rsp
+; NOBW-NEXT: subq $2112, %rsp # imm = 0x840
+; NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; NOBW-NEXT: vmovd %xmm4, %eax
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, (%rsp)
+; NOBW-NEXT: movzwl 1472(%rsp,%rax,2), %eax
+; NOBW-NEXT: vmovd %eax, %xmm0
+; NOBW-NEXT: vpextrw $1, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $1, 1408(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpextrw $2, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $2, 1344(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpextrw $3, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $3, 1280(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpextrw $4, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $4, 1216(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpextrw $5, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $5, 1152(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpextrw $6, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $6, 1088(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vpextrw $7, %xmm4, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $7, 1024(%rsp,%rax,2), %xmm0, %xmm0
+; NOBW-NEXT: vmovd %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: movzwl 1984(%rsp,%rax,2), %eax
+; NOBW-NEXT: vmovd %eax, %xmm1
+; NOBW-NEXT: vpextrw $1, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $1, 1920(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vpextrw $2, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $2, 1856(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vpextrw $3, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $3, 1792(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vpextrw $4, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $4, 1728(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vpextrw $5, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $5, 1664(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vpextrw $6, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $6, 1600(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vpextrw $7, %xmm2, %eax
+; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $7, 1536(%rsp,%rax,2), %xmm1, %xmm1
+; NOBW-NEXT: vmovd %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: movzwl 448(%rsp,%rax,2), %eax
+; NOBW-NEXT: vmovd %eax, %xmm4
+; NOBW-NEXT: vpextrw $1, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $1, 384(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $2, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $2, 320(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $3, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $3, 256(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $4, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $4, 192(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $5, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $5, 128(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $6, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $6, 64(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $7, %xmm2, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm4, %xmm2
+; NOBW-NEXT: vmovd %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: movzwl 960(%rsp,%rax,2), %eax
+; NOBW-NEXT: vmovd %eax, %xmm4
+; NOBW-NEXT: vpextrw $1, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $1, 896(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $2, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $2, 832(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $3, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $3, 768(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $4, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $4, 704(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $5, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $5, 640(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $6, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $6, 576(%rsp,%rax,2), %xmm4, %xmm4
+; NOBW-NEXT: vpextrw $7, %xmm3, %eax
+; NOBW-NEXT: andl $31, %eax
+; NOBW-NEXT: vpinsrw $7, 512(%rsp,%rax,2), %xmm4, %xmm3
+; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
+; NOBW-NEXT: movq %rbp, %rsp
+; NOBW-NEXT: popq %rbp
+; NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: var_shuffle_v32i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+ %index0 = extractelement <32 x i16> %indices, i32 0
+ %index1 = extractelement <32 x i16> %indices, i32 1
+ %index2 = extractelement <32 x i16> %indices, i32 2
+ %index3 = extractelement <32 x i16> %indices, i32 3
+ %index4 = extractelement <32 x i16> %indices, i32 4
+ %index5 = extractelement <32 x i16> %indices, i32 5
+ %index6 = extractelement <32 x i16> %indices, i32 6
+ %index7 = extractelement <32 x i16> %indices, i32 7
+ %index8 = extractelement <32 x i16> %indices, i32 8
+ %index9 = extractelement <32 x i16> %indices, i32 9
+ %index10 = extractelement <32 x i16> %indices, i32 10
+ %index11 = extractelement <32 x i16> %indices, i32 11
+ %index12 = extractelement <32 x i16> %indices, i32 12
+ %index13 = extractelement <32 x i16> %indices, i32 13
+ %index14 = extractelement <32 x i16> %indices, i32 14
+ %index15 = extractelement <32 x i16> %indices, i32 15
+ %index16 = extractelement <32 x i16> %indices, i32 16
+ %index17 = extractelement <32 x i16> %indices, i32 17
+ %index18 = extractelement <32 x i16> %indices, i32 18
+ %index19 = extractelement <32 x i16> %indices, i32 19
+ %index20 = extractelement <32 x i16> %indices, i32 20
+ %index21 = extractelement <32 x i16> %indices, i32 21
+ %index22 = extractelement <32 x i16> %indices, i32 22
+ %index23 = extractelement <32 x i16> %indices, i32 23
+ %index24 = extractelement <32 x i16> %indices, i32 24
+ %index25 = extractelement <32 x i16> %indices, i32 25
+ %index26 = extractelement <32 x i16> %indices, i32 26
+ %index27 = extractelement <32 x i16> %indices, i32 27
+ %index28 = extractelement <32 x i16> %indices, i32 28
+ %index29 = extractelement <32 x i16> %indices, i32 29
+ %index30 = extractelement <32 x i16> %indices, i32 30
+ %index31 = extractelement <32 x i16> %indices, i32 31
+ %v0 = extractelement <32 x i16> %v, i16 %index0
+ %v1 = extractelement <32 x i16> %v, i16 %index1
+ %v2 = extractelement <32 x i16> %v, i16 %index2
+ %v3 = extractelement <32 x i16> %v, i16 %index3
+ %v4 = extractelement <32 x i16> %v, i16 %index4
+ %v5 = extractelement <32 x i16> %v, i16 %index5
+ %v6 = extractelement <32 x i16> %v, i16 %index6
+ %v7 = extractelement <32 x i16> %v, i16 %index7
+ %v8 = extractelement <32 x i16> %v, i16 %index8
+ %v9 = extractelement <32 x i16> %v, i16 %index9
+ %v10 = extractelement <32 x i16> %v, i16 %index10
+ %v11 = extractelement <32 x i16> %v, i16 %index11
+ %v12 = extractelement <32 x i16> %v, i16 %index12
+ %v13 = extractelement <32 x i16> %v, i16 %index13
+ %v14 = extractelement <32 x i16> %v, i16 %index14
+ %v15 = extractelement <32 x i16> %v, i16 %index15
+ %v16 = extractelement <32 x i16> %v, i16 %index16
+ %v17 = extractelement <32 x i16> %v, i16 %index17
+ %v18 = extractelement <32 x i16> %v, i16 %index18
+ %v19 = extractelement <32 x i16> %v, i16 %index19
+ %v20 = extractelement <32 x i16> %v, i16 %index20
+ %v21 = extractelement <32 x i16> %v, i16 %index21
+ %v22 = extractelement <32 x i16> %v, i16 %index22
+ %v23 = extractelement <32 x i16> %v, i16 %index23
+ %v24 = extractelement <32 x i16> %v, i16 %index24
+ %v25 = extractelement <32 x i16> %v, i16 %index25
+ %v26 = extractelement <32 x i16> %v, i16 %index26
+ %v27 = extractelement <32 x i16> %v, i16 %index27
+ %v28 = extractelement <32 x i16> %v, i16 %index28
+ %v29 = extractelement <32 x i16> %v, i16 %index29
+ %v30 = extractelement <32 x i16> %v, i16 %index30
+ %v31 = extractelement <32 x i16> %v, i16 %index31
+ %ret0 = insertelement <32 x i16> undef, i16 %v0, i32 0
+ %ret1 = insertelement <32 x i16> %ret0, i16 %v1, i32 1
+ %ret2 = insertelement <32 x i16> %ret1, i16 %v2, i32 2
+ %ret3 = insertelement <32 x i16> %ret2, i16 %v3, i32 3
+ %ret4 = insertelement <32 x i16> %ret3, i16 %v4, i32 4
+ %ret5 = insertelement <32 x i16> %ret4, i16 %v5, i32 5
+ %ret6 = insertelement <32 x i16> %ret5, i16 %v6, i32 6
+ %ret7 = insertelement <32 x i16> %ret6, i16 %v7, i32 7
+ %ret8 = insertelement <32 x i16> %ret7, i16 %v8, i32 8
+ %ret9 = insertelement <32 x i16> %ret8, i16 %v9, i32 9
+ %ret10 = insertelement <32 x i16> %ret9, i16 %v10, i32 10
+ %ret11 = insertelement <32 x i16> %ret10, i16 %v11, i32 11
+ %ret12 = insertelement <32 x i16> %ret11, i16 %v12, i32 12
+ %ret13 = insertelement <32 x i16> %ret12, i16 %v13, i32 13
+ %ret14 = insertelement <32 x i16> %ret13, i16 %v14, i32 14
+ %ret15 = insertelement <32 x i16> %ret14, i16 %v15, i32 15
+ %ret16 = insertelement <32 x i16> %ret15, i16 %v16, i32 16
+ %ret17 = insertelement <32 x i16> %ret16, i16 %v17, i32 17
+ %ret18 = insertelement <32 x i16> %ret17, i16 %v18, i32 18
+ %ret19 = insertelement <32 x i16> %ret18, i16 %v19, i32 19
+ %ret20 = insertelement <32 x i16> %ret19, i16 %v20, i32 20
+ %ret21 = insertelement <32 x i16> %ret20, i16 %v21, i32 21
+ %ret22 = insertelement <32 x i16> %ret21, i16 %v22, i32 22
+ %ret23 = insertelement <32 x i16> %ret22, i16 %v23, i32 23
+ %ret24 = insertelement <32 x i16> %ret23, i16 %v24, i32 24
+ %ret25 = insertelement <32 x i16> %ret24, i16 %v25, i32 25
+ %ret26 = insertelement <32 x i16> %ret25, i16 %v26, i32 26
+ %ret27 = insertelement <32 x i16> %ret26, i16 %v27, i32 27
+ %ret28 = insertelement <32 x i16> %ret27, i16 %v28, i32 28
+ %ret29 = insertelement <32 x i16> %ret28, i16 %v29, i32 29
+ %ret30 = insertelement <32 x i16> %ret29, i16 %v30, i32 30
+ %ret31 = insertelement <32 x i16> %ret30, i16 %v31, i32 31
+ ret <32 x i16> %ret31
+}
+
+define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {
+; NOBW-LABEL: var_shuffle_v64i8:
+; NOBW: # %bb.0:
+; NOBW-NEXT: pushq %rbp
+; NOBW-NEXT: movq %rsp, %rbp
+; NOBW-NEXT: andq $-64, %rsp
+; NOBW-NEXT: subq $4160, %rsp # imm = 0x1040
+; NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; NOBW-NEXT: vpextrb $0, %xmm4, %eax
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; NOBW-NEXT: vmovaps %ymm0, (%rsp)
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: movzbl (%rax,%rcx), %eax
+; NOBW-NEXT: vpextrb $1, %xmm4, %ecx
+; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: vmovd %eax, %xmm0
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $2, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $3, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $4, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $5, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $6, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $7, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $8, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $9, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $10, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $11, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $12, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $13, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $14, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $15, %xmm4, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; NOBW-NEXT: vpextrb $0, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: movzbl (%rax,%rcx), %eax
+; NOBW-NEXT: vpextrb $1, %xmm2, %ecx
+; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: vmovd %eax, %xmm1
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $2, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $3, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $4, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $5, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $6, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $7, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $8, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $9, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $10, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $11, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $12, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $13, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $14, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $15, %xmm2, %eax
+; NOBW-NEXT: vextracti128 $1, %ymm3, %xmm2
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm1, %xmm1
+; NOBW-NEXT: vpextrb $0, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: movzbl (%rax,%rcx), %eax
+; NOBW-NEXT: vpextrb $1, %xmm2, %ecx
+; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: vmovd %eax, %xmm4
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $2, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $3, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $4, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $5, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $6, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $7, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $8, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $9, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $10, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $11, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $12, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $13, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $14, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $15, %xmm2, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: movq %rsp, %rcx
+; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm2
+; NOBW-NEXT: vpextrb $0, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: movzbl (%rax,%rcx), %eax
+; NOBW-NEXT: vpextrb $1, %xmm3, %ecx
+; NOBW-NEXT: andl $63, %ecx
+; NOBW-NEXT: vmovd %eax, %xmm4
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rax
+; NOBW-NEXT: vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $2, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $3, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $4, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $5, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $6, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $7, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $8, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $9, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $10, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $11, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $12, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $13, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $14, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4
+; NOBW-NEXT: vpextrb $15, %xmm3, %eax
+; NOBW-NEXT: andl $63, %eax
+; NOBW-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
+; NOBW-NEXT: vpinsrb $15, (%rax,%rcx), %xmm4, %xmm3
+; NOBW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; NOBW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1
+; NOBW-NEXT: movq %rbp, %rsp
+; NOBW-NEXT: popq %rbp
+; NOBW-NEXT: retq
+;
+; VBMI-LABEL: var_shuffle_v64i8:
+; VBMI: # %bb.0:
+; VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; VBMI-NEXT: retq
+ %index0 = extractelement <64 x i8> %indices, i32 0
+ %index1 = extractelement <64 x i8> %indices, i32 1
+ %index2 = extractelement <64 x i8> %indices, i32 2
+ %index3 = extractelement <64 x i8> %indices, i32 3
+ %index4 = extractelement <64 x i8> %indices, i32 4
+ %index5 = extractelement <64 x i8> %indices, i32 5
+ %index6 = extractelement <64 x i8> %indices, i32 6
+ %index7 = extractelement <64 x i8> %indices, i32 7
+ %index8 = extractelement <64 x i8> %indices, i32 8
+ %index9 = extractelement <64 x i8> %indices, i32 9
+ %index10 = extractelement <64 x i8> %indices, i32 10
+ %index11 = extractelement <64 x i8> %indices, i32 11
+ %index12 = extractelement <64 x i8> %indices, i32 12
+ %index13 = extractelement <64 x i8> %indices, i32 13
+ %index14 = extractelement <64 x i8> %indices, i32 14
+ %index15 = extractelement <64 x i8> %indices, i32 15
+ %index16 = extractelement <64 x i8> %indices, i32 16
+ %index17 = extractelement <64 x i8> %indices, i32 17
+ %index18 = extractelement <64 x i8> %indices, i32 18
+ %index19 = extractelement <64 x i8> %indices, i32 19
+ %index20 = extractelement <64 x i8> %indices, i32 20
+ %index21 = extractelement <64 x i8> %indices, i32 21
+ %index22 = extractelement <64 x i8> %indices, i32 22
+ %index23 = extractelement <64 x i8> %indices, i32 23
+ %index24 = extractelement <64 x i8> %indices, i32 24
+ %index25 = extractelement <64 x i8> %indices, i32 25
+ %index26 = extractelement <64 x i8> %indices, i32 26
+ %index27 = extractelement <64 x i8> %indices, i32 27
+ %index28 = extractelement <64 x i8> %indices, i32 28
+ %index29 = extractelement <64 x i8> %indices, i32 29
+ %index30 = extractelement <64 x i8> %indices, i32 30
+ %index31 = extractelement <64 x i8> %indices, i32 31
+ %index32 = extractelement <64 x i8> %indices, i32 32
+ %index33 = extractelement <64 x i8> %indices, i32 33
+ %index34 = extractelement <64 x i8> %indices, i32 34
+ %index35 = extractelement <64 x i8> %indices, i32 35
+ %index36 = extractelement <64 x i8> %indices, i32 36
+ %index37 = extractelement <64 x i8> %indices, i32 37
+ %index38 = extractelement <64 x i8> %indices, i32 38
+ %index39 = extractelement <64 x i8> %indices, i32 39
+ %index40 = extractelement <64 x i8> %indices, i32 40
+ %index41 = extractelement <64 x i8> %indices, i32 41
+ %index42 = extractelement <64 x i8> %indices, i32 42
+ %index43 = extractelement <64 x i8> %indices, i32 43
+ %index44 = extractelement <64 x i8> %indices, i32 44
+ %index45 = extractelement <64 x i8> %indices, i32 45
+ %index46 = extractelement <64 x i8> %indices, i32 46
+ %index47 = extractelement <64 x i8> %indices, i32 47
+ %index48 = extractelement <64 x i8> %indices, i32 48
+ %index49 = extractelement <64 x i8> %indices, i32 49
+ %index50 = extractelement <64 x i8> %indices, i32 50
+ %index51 = extractelement <64 x i8> %indices, i32 51
+ %index52 = extractelement <64 x i8> %indices, i32 52
+ %index53 = extractelement <64 x i8> %indices, i32 53
+ %index54 = extractelement <64 x i8> %indices, i32 54
+ %index55 = extractelement <64 x i8> %indices, i32 55
+ %index56 = extractelement <64 x i8> %indices, i32 56
+ %index57 = extractelement <64 x i8> %indices, i32 57
+ %index58 = extractelement <64 x i8> %indices, i32 58
+ %index59 = extractelement <64 x i8> %indices, i32 59
+ %index60 = extractelement <64 x i8> %indices, i32 60
+ %index61 = extractelement <64 x i8> %indices, i32 61
+ %index62 = extractelement <64 x i8> %indices, i32 62
+ %index63 = extractelement <64 x i8> %indices, i32 63
+ %v0 = extractelement <64 x i8> %v, i8 %index0
+ %v1 = extractelement <64 x i8> %v, i8 %index1
+ %v2 = extractelement <64 x i8> %v, i8 %index2
+ %v3 = extractelement <64 x i8> %v, i8 %index3
+ %v4 = extractelement <64 x i8> %v, i8 %index4
+ %v5 = extractelement <64 x i8> %v, i8 %index5
+ %v6 = extractelement <64 x i8> %v, i8 %index6
+ %v7 = extractelement <64 x i8> %v, i8 %index7
+ %v8 = extractelement <64 x i8> %v, i8 %index8
+ %v9 = extractelement <64 x i8> %v, i8 %index9
+ %v10 = extractelement <64 x i8> %v, i8 %index10
+ %v11 = extractelement <64 x i8> %v, i8 %index11
+ %v12 = extractelement <64 x i8> %v, i8 %index12
+ %v13 = extractelement <64 x i8> %v, i8 %index13
+ %v14 = extractelement <64 x i8> %v, i8 %index14
+ %v15 = extractelement <64 x i8> %v, i8 %index15
+ %v16 = extractelement <64 x i8> %v, i8 %index16
+ %v17 = extractelement <64 x i8> %v, i8 %index17
+ %v18 = extractelement <64 x i8> %v, i8 %index18
+ %v19 = extractelement <64 x i8> %v, i8 %index19
+ %v20 = extractelement <64 x i8> %v, i8 %index20
+ %v21 = extractelement <64 x i8> %v, i8 %index21
+ %v22 = extractelement <64 x i8> %v, i8 %index22
+ %v23 = extractelement <64 x i8> %v, i8 %index23
+ %v24 = extractelement <64 x i8> %v, i8 %index24
+ %v25 = extractelement <64 x i8> %v, i8 %index25
+ %v26 = extractelement <64 x i8> %v, i8 %index26
+ %v27 = extractelement <64 x i8> %v, i8 %index27
+ %v28 = extractelement <64 x i8> %v, i8 %index28
+ %v29 = extractelement <64 x i8> %v, i8 %index29
+ %v30 = extractelement <64 x i8> %v, i8 %index30
+ %v31 = extractelement <64 x i8> %v, i8 %index31
+ %v32 = extractelement <64 x i8> %v, i8 %index32
+ %v33 = extractelement <64 x i8> %v, i8 %index33
+ %v34 = extractelement <64 x i8> %v, i8 %index34
+ %v35 = extractelement <64 x i8> %v, i8 %index35
+ %v36 = extractelement <64 x i8> %v, i8 %index36
+ %v37 = extractelement <64 x i8> %v, i8 %index37
+ %v38 = extractelement <64 x i8> %v, i8 %index38
+ %v39 = extractelement <64 x i8> %v, i8 %index39
+ %v40 = extractelement <64 x i8> %v, i8 %index40
+ %v41 = extractelement <64 x i8> %v, i8 %index41
+ %v42 = extractelement <64 x i8> %v, i8 %index42
+ %v43 = extractelement <64 x i8> %v, i8 %index43
+ %v44 = extractelement <64 x i8> %v, i8 %index44
+ %v45 = extractelement <64 x i8> %v, i8 %index45
+ %v46 = extractelement <64 x i8> %v, i8 %index46
+ %v47 = extractelement <64 x i8> %v, i8 %index47
+ %v48 = extractelement <64 x i8> %v, i8 %index48
+ %v49 = extractelement <64 x i8> %v, i8 %index49
+ %v50 = extractelement <64 x i8> %v, i8 %index50
+ %v51 = extractelement <64 x i8> %v, i8 %index51
+ %v52 = extractelement <64 x i8> %v, i8 %index52
+ %v53 = extractelement <64 x i8> %v, i8 %index53
+ %v54 = extractelement <64 x i8> %v, i8 %index54
+ %v55 = extractelement <64 x i8> %v, i8 %index55
+ %v56 = extractelement <64 x i8> %v, i8 %index56
+ %v57 = extractelement <64 x i8> %v, i8 %index57
+ %v58 = extractelement <64 x i8> %v, i8 %index58
+ %v59 = extractelement <64 x i8> %v, i8 %index59
+ %v60 = extractelement <64 x i8> %v, i8 %index60
+ %v61 = extractelement <64 x i8> %v, i8 %index61
+ %v62 = extractelement <64 x i8> %v, i8 %index62
+ %v63 = extractelement <64 x i8> %v, i8 %index63
+ %ret0 = insertelement <64 x i8> undef, i8 %v0, i32 0
+ %ret1 = insertelement <64 x i8> %ret0, i8 %v1, i32 1
+ %ret2 = insertelement <64 x i8> %ret1, i8 %v2, i32 2
+ %ret3 = insertelement <64 x i8> %ret2, i8 %v3, i32 3
+ %ret4 = insertelement <64 x i8> %ret3, i8 %v4, i32 4
+ %ret5 = insertelement <64 x i8> %ret4, i8 %v5, i32 5
+ %ret6 = insertelement <64 x i8> %ret5, i8 %v6, i32 6
+ %ret7 = insertelement <64 x i8> %ret6, i8 %v7, i32 7
+ %ret8 = insertelement <64 x i8> %ret7, i8 %v8, i32 8
+ %ret9 = insertelement <64 x i8> %ret8, i8 %v9, i32 9
+ %ret10 = insertelement <64 x i8> %ret9, i8 %v10, i32 10
+ %ret11 = insertelement <64 x i8> %ret10, i8 %v11, i32 11
+ %ret12 = insertelement <64 x i8> %ret11, i8 %v12, i32 12
+ %ret13 = insertelement <64 x i8> %ret12, i8 %v13, i32 13
+ %ret14 = insertelement <64 x i8> %ret13, i8 %v14, i32 14
+ %ret15 = insertelement <64 x i8> %ret14, i8 %v15, i32 15
+ %ret16 = insertelement <64 x i8> %ret15, i8 %v16, i32 16
+ %ret17 = insertelement <64 x i8> %ret16, i8 %v17, i32 17
+ %ret18 = insertelement <64 x i8> %ret17, i8 %v18, i32 18
+ %ret19 = insertelement <64 x i8> %ret18, i8 %v19, i32 19
+ %ret20 = insertelement <64 x i8> %ret19, i8 %v20, i32 20
+ %ret21 = insertelement <64 x i8> %ret20, i8 %v21, i32 21
+ %ret22 = insertelement <64 x i8> %ret21, i8 %v22, i32 22
+ %ret23 = insertelement <64 x i8> %ret22, i8 %v23, i32 23
+ %ret24 = insertelement <64 x i8> %ret23, i8 %v24, i32 24
+ %ret25 = insertelement <64 x i8> %ret24, i8 %v25, i32 25
+ %ret26 = insertelement <64 x i8> %ret25, i8 %v26, i32 26
+ %ret27 = insertelement <64 x i8> %ret26, i8 %v27, i32 27
+ %ret28 = insertelement <64 x i8> %ret27, i8 %v28, i32 28
+ %ret29 = insertelement <64 x i8> %ret28, i8 %v29, i32 29
+ %ret30 = insertelement <64 x i8> %ret29, i8 %v30, i32 30
+ %ret31 = insertelement <64 x i8> %ret30, i8 %v31, i32 31
+ %ret32 = insertelement <64 x i8> %ret31, i8 %v32, i32 32
+ %ret33 = insertelement <64 x i8> %ret32, i8 %v33, i32 33
+ %ret34 = insertelement <64 x i8> %ret33, i8 %v34, i32 34
+ %ret35 = insertelement <64 x i8> %ret34, i8 %v35, i32 35
+ %ret36 = insertelement <64 x i8> %ret35, i8 %v36, i32 36
+ %ret37 = insertelement <64 x i8> %ret36, i8 %v37, i32 37
+ %ret38 = insertelement <64 x i8> %ret37, i8 %v38, i32 38
+ %ret39 = insertelement <64 x i8> %ret38, i8 %v39, i32 39
+ %ret40 = insertelement <64 x i8> %ret39, i8 %v40, i32 40
+ %ret41 = insertelement <64 x i8> %ret40, i8 %v41, i32 41
+ %ret42 = insertelement <64 x i8> %ret41, i8 %v42, i32 42
+ %ret43 = insertelement <64 x i8> %ret42, i8 %v43, i32 43
+ %ret44 = insertelement <64 x i8> %ret43, i8 %v44, i32 44
+ %ret45 = insertelement <64 x i8> %ret44, i8 %v45, i32 45
+ %ret46 = insertelement <64 x i8> %ret45, i8 %v46, i32 46
+ %ret47 = insertelement <64 x i8> %ret46, i8 %v47, i32 47
+ %ret48 = insertelement <64 x i8> %ret47, i8 %v48, i32 48
+ %ret49 = insertelement <64 x i8> %ret48, i8 %v49, i32 49
+ %ret50 = insertelement <64 x i8> %ret49, i8 %v50, i32 50
+ %ret51 = insertelement <64 x i8> %ret50, i8 %v51, i32 51
+ %ret52 = insertelement <64 x i8> %ret51, i8 %v52, i32 52
+ %ret53 = insertelement <64 x i8> %ret52, i8 %v53, i32 53
+ %ret54 = insertelement <64 x i8> %ret53, i8 %v54, i32 54
+ %ret55 = insertelement <64 x i8> %ret54, i8 %v55, i32 55
+ %ret56 = insertelement <64 x i8> %ret55, i8 %v56, i32 56
+ %ret57 = insertelement <64 x i8> %ret56, i8 %v57, i32 57
+ %ret58 = insertelement <64 x i8> %ret57, i8 %v58, i32 58
+ %ret59 = insertelement <64 x i8> %ret58, i8 %v59, i32 59
+ %ret60 = insertelement <64 x i8> %ret59, i8 %v60, i32 60
+ %ret61 = insertelement <64 x i8> %ret60, i8 %v61, i32 61
+ %ret62 = insertelement <64 x i8> %ret61, i8 %v62, i32 62
+ %ret63 = insertelement <64 x i8> %ret62, i8 %v63, i32 63
+ ret <64 x i8> %ret63
+}
+
+define <8 x double> @var_shuffle_v8f64(<8 x double> %v, <8 x i64> %indices) nounwind {
+; AVX512-LABEL: var_shuffle_v8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %index0 = extractelement <8 x i64> %indices, i32 0
+ %index1 = extractelement <8 x i64> %indices, i32 1
+ %index2 = extractelement <8 x i64> %indices, i32 2
+ %index3 = extractelement <8 x i64> %indices, i32 3
+ %index4 = extractelement <8 x i64> %indices, i32 4
+ %index5 = extractelement <8 x i64> %indices, i32 5
+ %index6 = extractelement <8 x i64> %indices, i32 6
+ %index7 = extractelement <8 x i64> %indices, i32 7
+ %v0 = extractelement <8 x double> %v, i64 %index0
+ %v1 = extractelement <8 x double> %v, i64 %index1
+ %v2 = extractelement <8 x double> %v, i64 %index2
+ %v3 = extractelement <8 x double> %v, i64 %index3
+ %v4 = extractelement <8 x double> %v, i64 %index4
+ %v5 = extractelement <8 x double> %v, i64 %index5
+ %v6 = extractelement <8 x double> %v, i64 %index6
+ %v7 = extractelement <8 x double> %v, i64 %index7
+ %ret0 = insertelement <8 x double> undef, double %v0, i32 0
+ %ret1 = insertelement <8 x double> %ret0, double %v1, i32 1
+ %ret2 = insertelement <8 x double> %ret1, double %v2, i32 2
+ %ret3 = insertelement <8 x double> %ret2, double %v3, i32 3
+ %ret4 = insertelement <8 x double> %ret3, double %v4, i32 4
+ %ret5 = insertelement <8 x double> %ret4, double %v5, i32 5
+ %ret6 = insertelement <8 x double> %ret5, double %v6, i32 6
+ %ret7 = insertelement <8 x double> %ret6, double %v7, i32 7
+ ret <8 x double> %ret7
+}
+
+define <16 x float> @var_shuffle_v16f32(<16 x float> %v, <16 x i32> %indices) nounwind {
+; AVX512-LABEL: var_shuffle_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %index0 = extractelement <16 x i32> %indices, i32 0
+ %index1 = extractelement <16 x i32> %indices, i32 1
+ %index2 = extractelement <16 x i32> %indices, i32 2
+ %index3 = extractelement <16 x i32> %indices, i32 3
+ %index4 = extractelement <16 x i32> %indices, i32 4
+ %index5 = extractelement <16 x i32> %indices, i32 5
+ %index6 = extractelement <16 x i32> %indices, i32 6
+ %index7 = extractelement <16 x i32> %indices, i32 7
+ %index8 = extractelement <16 x i32> %indices, i32 8
+ %index9 = extractelement <16 x i32> %indices, i32 9
+ %index10 = extractelement <16 x i32> %indices, i32 10
+ %index11 = extractelement <16 x i32> %indices, i32 11
+ %index12 = extractelement <16 x i32> %indices, i32 12
+ %index13 = extractelement <16 x i32> %indices, i32 13
+ %index14 = extractelement <16 x i32> %indices, i32 14
+ %index15 = extractelement <16 x i32> %indices, i32 15
+ %v0 = extractelement <16 x float> %v, i32 %index0
+ %v1 = extractelement <16 x float> %v, i32 %index1
+ %v2 = extractelement <16 x float> %v, i32 %index2
+ %v3 = extractelement <16 x float> %v, i32 %index3
+ %v4 = extractelement <16 x float> %v, i32 %index4
+ %v5 = extractelement <16 x float> %v, i32 %index5
+ %v6 = extractelement <16 x float> %v, i32 %index6
+ %v7 = extractelement <16 x float> %v, i32 %index7
+ %v8 = extractelement <16 x float> %v, i32 %index8
+ %v9 = extractelement <16 x float> %v, i32 %index9
+ %v10 = extractelement <16 x float> %v, i32 %index10
+ %v11 = extractelement <16 x float> %v, i32 %index11
+ %v12 = extractelement <16 x float> %v, i32 %index12
+ %v13 = extractelement <16 x float> %v, i32 %index13
+ %v14 = extractelement <16 x float> %v, i32 %index14
+ %v15 = extractelement <16 x float> %v, i32 %index15
+ %ret0 = insertelement <16 x float> undef, float %v0, i32 0
+ %ret1 = insertelement <16 x float> %ret0, float %v1, i32 1
+ %ret2 = insertelement <16 x float> %ret1, float %v2, i32 2
+ %ret3 = insertelement <16 x float> %ret2, float %v3, i32 3
+ %ret4 = insertelement <16 x float> %ret3, float %v4, i32 4
+ %ret5 = insertelement <16 x float> %ret4, float %v5, i32 5
+ %ret6 = insertelement <16 x float> %ret5, float %v6, i32 6
+ %ret7 = insertelement <16 x float> %ret6, float %v7, i32 7
+ %ret8 = insertelement <16 x float> %ret7, float %v8, i32 8
+ %ret9 = insertelement <16 x float> %ret8, float %v9, i32 9
+ %ret10 = insertelement <16 x float> %ret9, float %v10, i32 10
+ %ret11 = insertelement <16 x float> %ret10, float %v11, i32 11
+ %ret12 = insertelement <16 x float> %ret11, float %v12, i32 12
+ %ret13 = insertelement <16 x float> %ret12, float %v13, i32 13
+ %ret14 = insertelement <16 x float> %ret13, float %v14, i32 14
+ %ret15 = insertelement <16 x float> %ret14, float %v15, i32 15
+ ret <16 x float> %ret15
+}
diff --git a/test/CodeGen/X86/variable-sized-darwin-bzero.ll b/test/CodeGen/X86/variable-sized-darwin-bzero.ll
index 1e86d75bf09c..3971190f02cc 100644
--- a/test/CodeGen/X86/variable-sized-darwin-bzero.ll
+++ b/test/CodeGen/X86/variable-sized-darwin-bzero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin10 | grep __bzero
+; RUN: llc < %s -mtriple=i686-apple-darwin10 | grep __bzero
define void @foo(i8* %p, i64 %n) {
call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 %n, i32 4, i1 false)
diff --git a/test/CodeGen/X86/vec-copysign-avx512.ll b/test/CodeGen/X86/vec-copysign-avx512.ll
index 535065d39aa5..9aa9b529290e 100644
--- a/test/CodeGen/X86/vec-copysign-avx512.ll
+++ b/test/CodeGen/X86/vec-copysign-avx512.ll
@@ -4,14 +4,14 @@
define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
; AVX512VL-LABEL: v4f32:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512VLDQ-LABEL: v4f32:
-; AVX512VLDQ: ## BB#0:
+; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to4}, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -22,14 +22,14 @@ define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
; AVX512VL-LABEL: v8f32:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512VLDQ-LABEL: v8f32:
-; AVX512VLDQ: ## BB#0:
+; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -40,14 +40,14 @@ define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind {
; AVX512VL-LABEL: v16f32:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512VLDQ-LABEL: v16f32:
-; AVX512VLDQ: ## BB#0:
+; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to16}, %zmm1, %zmm1
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VLDQ-NEXT: vorps %zmm1, %zmm0, %zmm0
@@ -58,14 +58,14 @@ define <16 x float> @v16f32(<16 x float> %a, <16 x float> %b) nounwind {
define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
; AVX512VL-LABEL: v2f64:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512VLDQ-LABEL: v2f64:
-; AVX512VLDQ: ## BB#0:
+; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip), %xmm1, %xmm1
; AVX512VLDQ-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX512VLDQ-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -76,14 +76,14 @@ define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind {
; AVX512VL-LABEL: v4f64:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm1, %ymm1
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512VLDQ-LABEL: v4f64:
-; AVX512VLDQ: ## BB#0:
+; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to4}, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vorpd %ymm1, %ymm0, %ymm0
@@ -94,14 +94,14 @@ define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind {
define <8 x double> @v8f64(<8 x double> %a, <8 x double> %b) nounwind {
; AVX512VL-LABEL: v8f64:
-; AVX512VL: ## BB#0:
+; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512VLDQ-LABEL: v8f64:
-; AVX512VLDQ: ## BB#0:
+; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm1, %zmm1
; AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512VLDQ-NEXT: vorpd %zmm1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/vec-copysign.ll b/test/CodeGen/X86/vec-copysign.ll
index 1ebd7ceafced..852ebcd3e452 100644
--- a/test/CodeGen/X86/vec-copysign.ll
+++ b/test/CodeGen/X86/vec-copysign.ll
@@ -18,14 +18,14 @@
define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
; SSE2-LABEL: v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps [[SIGNMASK1]](%rip), %xmm1
; SSE2-NEXT: andps [[MAGMASK1]](%rip), %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps [[SIGNMASK1]](%rip), %xmm1, %xmm1
; AVX-NEXT: vandps [[MAGMASK1]](%rip), %xmm0, %xmm0
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -69,7 +69,7 @@ define <4 x float> @v4f32(<4 x float> %a, <4 x float> %b) nounwind {
define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
; SSE2-LABEL: v8f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps [[SIGNMASK2]](%rip), %xmm4
; SSE2-NEXT: andps %xmm4, %xmm2
; SSE2-NEXT: movaps [[MAGMASK2]](%rip), %xmm5
@@ -81,7 +81,7 @@ define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
; SSE2-NEXT: retq
;
; AVX-LABEL: v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps [[SIGNMASK2]](%rip), %ymm1, %ymm1
; AVX-NEXT: vandps [[MAGMASK2]](%rip), %ymm0, %ymm0
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -101,14 +101,14 @@ define <8 x float> @v8f32(<8 x float> %a, <8 x float> %b) nounwind {
define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
; SSE2-LABEL: v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps [[SIGNMASK3]](%rip), %xmm1
; SSE2-NEXT: andps [[MAGMASK3]](%rip), %xmm0
; SSE2-NEXT: orps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps [[SIGNMASK3]](%rip), %xmm1, %xmm1
; AVX-NEXT: vandps [[MAGMASK3]](%rip), %xmm0, %xmm0
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
@@ -140,7 +140,7 @@ define <2 x double> @v2f64(<2 x double> %a, <2 x double> %b) nounwind {
define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind {
; SSE2-LABEL: v4f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps [[SIGNMASK4]](%rip), %xmm4
; SSE2-NEXT: andps %xmm4, %xmm2
; SSE2-NEXT: movaps [[MAGMASK4]](%rip), %xmm5
@@ -152,7 +152,7 @@ define <4 x double> @v4f64(<4 x double> %a, <4 x double> %b) nounwind {
; SSE2-NEXT: retq
;
; AVX-LABEL: v4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps [[SIGNMASK4]](%rip), %ymm1, %ymm1
; AVX-NEXT: vandps [[MAGMASK4]](%rip), %ymm0, %ymm0
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/vec-trunc-store.ll b/test/CodeGen/X86/vec-trunc-store.ll
index e2d23242d5ea..23af5f4d48ae 100644
--- a/test/CodeGen/X86/vec-trunc-store.ll
+++ b/test/CodeGen/X86/vec-trunc-store.ll
@@ -3,7 +3,7 @@
define void @foo(<8 x i32>* %p) nounwind {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movdqa (%rdi), %xmm0
; CHECK-NEXT: movdqa 16(%rdi), %xmm1
; CHECK-NEXT: pslld $16, %xmm1
@@ -21,7 +21,7 @@ define void @foo(<8 x i32>* %p) nounwind {
define void @bar(<4 x i32>* %p) nounwind {
; CHECK-LABEL: bar:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/test/CodeGen/X86/vec3.ll b/test/CodeGen/X86/vec3.ll
index e9c47ffd21c6..6d3f71f47505 100644
--- a/test/CodeGen/X86/vec3.ll
+++ b/test/CodeGen/X86/vec3.ll
@@ -3,7 +3,7 @@
define <3 x float> @fadd(<3 x float> %v, float %d) {
; CHECK-LABEL: fadd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,3]
; CHECK-NEXT: addps %xmm1, %xmm0
; CHECK-NEXT: retq
@@ -17,7 +17,7 @@ define <3 x float> @fadd(<3 x float> %v, float %d) {
define <3 x float> @fdiv(<3 x float> %v, float %d) {
; CHECK-LABEL: fdiv:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,3]
; CHECK-NEXT: divps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vec_add.ll b/test/CodeGen/X86/vec_add.ll
index 7c77d11a7b54..3d144e8ea3e9 100644
--- a/test/CodeGen/X86/vec_add.ll
+++ b/test/CodeGen/X86/vec_add.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define <2 x i64> @test(<2 x i64> %a, <2 x i64> %b) {
entry:
diff --git a/test/CodeGen/X86/vec_anyext.ll b/test/CodeGen/X86/vec_anyext.ll
index c088d7f57b1a..b42fbb55b94e 100644
--- a/test/CodeGen/X86/vec_anyext.ll
+++ b/test/CodeGen/X86/vec_anyext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
; PR 9267
define<4 x i16> @func_16_32() {
diff --git a/test/CodeGen/X86/vec_call.ll b/test/CodeGen/X86/vec_call.ll
index e0862ca8d1c4..8c2d8ce66b74 100644
--- a/test/CodeGen/X86/vec_call.ll
+++ b/test/CodeGen/X86/vec_call.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
+; RUN: llc < %s -mcpu=generic -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
; RUN: grep "subl.*60"
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
+; RUN: llc < %s -mcpu=generic -mattr=+sse2 -mtriple=i686-apple-darwin8 | \
; RUN: grep "movaps.*32"
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index c03b330b88e0..262c29fb6296 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -4,7 +4,7 @@
define <8 x float> @foo1_8(<8 x i8> %src) {
; CHECK-LABEL: foo1_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
@@ -16,7 +16,7 @@ define <8 x float> @foo1_8(<8 x i8> %src) {
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: foo1_8:
-; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm1
; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
@@ -29,14 +29,14 @@ define <8 x float> @foo1_8(<8 x i8> %src) {
define <4 x float> @foo1_4(<4 x i8> %src) {
; CHECK-LABEL: foo1_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpslld $24, %xmm0, %xmm0
; CHECK-NEXT: vpsrad $24, %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: foo1_4:
-; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-WIDE-NEXT: retl
@@ -46,7 +46,7 @@ define <4 x float> @foo1_4(<4 x i8> %src) {
define <8 x float> @foo2_8(<8 x i8> %src) {
; CHECK-LABEL: foo2_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vpand LCPI2_0, %xmm0, %xmm0
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -56,7 +56,7 @@ define <8 x float> @foo2_8(<8 x i8> %src) {
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: foo2_8:
-; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -69,13 +69,13 @@ define <8 x float> @foo2_8(<8 x i8> %src) {
define <4 x float> @foo2_4(<4 x i8> %src) {
; CHECK-LABEL: foo2_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vandps LCPI3_0, %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: foo2_4:
-; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-WIDE-NEXT: retl
@@ -85,18 +85,15 @@ define <4 x float> @foo2_4(<4 x i8> %src) {
define <8 x i8> @foo3_8(<8 x float> %src) {
; CHECK-LABEL: foo3_8:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; CHECK-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: foo3_8:
-; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
@@ -128,12 +125,12 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
define <4 x i8> @foo3_4(<4 x float> %src) {
; CHECK-LABEL: foo3_4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: retl
;
; CHECK-WIDE-LABEL: foo3_4:
-; CHECK-WIDE: ## BB#0:
+; CHECK-WIDE: ## %bb.0:
; CHECK-WIDE-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; CHECK-WIDE-NEXT: vcvttss2si %xmm1, %eax
; CHECK-WIDE-NEXT: vcvttss2si %xmm0, %ecx
diff --git a/test/CodeGen/X86/vec_cmp_sint-128.ll b/test/CodeGen/X86/vec_cmp_sint-128.ll
index 1407f71de714..a1b60f8fb0e6 100644
--- a/test/CodeGen/X86/vec_cmp_sint-128.ll
+++ b/test/CodeGen/X86/vec_cmp_sint-128.ll
@@ -15,29 +15,29 @@
define <2 x i64> @eq_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: eq_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: eq_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: eq_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: eq_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <2 x i64> %a, %b
@@ -47,17 +47,17 @@ define <2 x i64> @eq_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @eq_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: eq_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: eq_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <4 x i32> %a, %b
@@ -67,17 +67,17 @@ define <4 x i32> @eq_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @eq_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: eq_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: eq_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <8 x i16> %a, %b
@@ -87,17 +87,17 @@ define <8 x i16> @eq_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @eq_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: eq_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: eq_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <16 x i8> %a, %b
@@ -111,7 +111,7 @@ define <16 x i8> @eq_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: ne_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
@@ -120,28 +120,28 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ne_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: ne_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: ne_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <2 x i64> %a, %b
@@ -151,21 +151,21 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: ne_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ne_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <4 x i32> %a, %b
@@ -175,21 +175,21 @@ define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: ne_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ne_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <8 x i16> %a, %b
@@ -199,21 +199,21 @@ define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: ne_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ne_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <16 x i8> %a, %b
@@ -227,7 +227,7 @@ define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: ge_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm1
@@ -244,7 +244,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ge_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm1
@@ -261,21 +261,21 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: ge_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm0, %xmm1
; SSE42-NEXT: pcmpeqd %xmm0, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: ge_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgeq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sge <2 x i64> %a, %b
@@ -285,21 +285,21 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: ge_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ge_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomged %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sge <4 x i32> %a, %b
@@ -309,21 +309,21 @@ define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @ge_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: ge_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm0, %xmm1
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ge_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgew %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sge <8 x i16> %a, %b
@@ -333,21 +333,21 @@ define <8 x i16> @ge_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @ge_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: ge_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm0, %xmm1
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ge_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgeb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sge <16 x i8> %a, %b
@@ -361,7 +361,7 @@ define <16 x i8> @ge_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: gt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -376,7 +376,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: gt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm0
@@ -391,17 +391,17 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: gt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: gt_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sgt <2 x i64> %a, %b
@@ -411,17 +411,17 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: gt_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: gt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sgt <4 x i32> %a, %b
@@ -431,17 +431,17 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: gt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: gt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sgt <8 x i16> %a, %b
@@ -451,17 +451,17 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: gt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: gt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sgt <16 x i8> %a, %b
@@ -475,7 +475,7 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: le_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -492,7 +492,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: le_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm0
@@ -509,21 +509,21 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: le_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: le_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomleq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sle <2 x i64> %a, %b
@@ -533,21 +533,21 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: le_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: le_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomled %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sle <4 x i32> %a, %b
@@ -557,21 +557,21 @@ define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @le_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: le_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: le_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomlew %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sle <8 x i16> %a, %b
@@ -581,21 +581,21 @@ define <8 x i16> @le_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @le_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: le_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: le_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomleb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp sle <16 x i8> %a, %b
@@ -609,7 +609,7 @@ define <16 x i8> @le_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: lt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm1
@@ -624,7 +624,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: lt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm1
@@ -639,18 +639,18 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: lt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: lt_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp slt <2 x i64> %a, %b
@@ -660,18 +660,18 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: lt_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: lt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp slt <4 x i32> %a, %b
@@ -681,18 +681,18 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: lt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: lt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp slt <8 x i16> %a, %b
@@ -702,18 +702,18 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: lt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: lt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp slt <16 x i8> %a, %b
diff --git a/test/CodeGen/X86/vec_cmp_uint-128.ll b/test/CodeGen/X86/vec_cmp_uint-128.ll
index cad7991c4f3b..4dbe444e1387 100644
--- a/test/CodeGen/X86/vec_cmp_uint-128.ll
+++ b/test/CodeGen/X86/vec_cmp_uint-128.ll
@@ -15,29 +15,29 @@
define <2 x i64> @eq_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: eq_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: eq_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: eq_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: eq_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <2 x i64> %a, %b
@@ -47,17 +47,17 @@ define <2 x i64> @eq_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @eq_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: eq_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: eq_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <4 x i32> %a, %b
@@ -67,17 +67,17 @@ define <4 x i32> @eq_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @eq_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: eq_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: eq_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <8 x i16> %a, %b
@@ -87,17 +87,17 @@ define <8 x i16> @eq_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @eq_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: eq_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: eq_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: eq_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomeqb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp eq <16 x i8> %a, %b
@@ -111,7 +111,7 @@ define <16 x i8> @eq_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: ne_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
@@ -120,28 +120,28 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ne_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: ne_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; SSE42-NEXT: pxor %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: ne_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <2 x i64> %a, %b
@@ -151,21 +151,21 @@ define <2 x i64> @ne_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: ne_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ne_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <4 x i32> %a, %b
@@ -175,21 +175,21 @@ define <4 x i32> @ne_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: ne_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ne_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <8 x i16> %a, %b
@@ -199,21 +199,21 @@ define <8 x i16> @ne_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: ne_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ne_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ne_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomneqb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ne <16 x i8> %a, %b
@@ -227,7 +227,7 @@ define <16 x i8> @ne_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: ge_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm1
@@ -244,7 +244,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ge_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm1
@@ -261,7 +261,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: ge_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm2, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm2
@@ -270,20 +270,39 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm2, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: ge_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: ge_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: ge_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: ge_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgeuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: ge_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp uge <2 x i64> %a, %b
%2 = sext <2 x i1> %1 to <2 x i64>
ret <2 x i64> %2
@@ -291,7 +310,7 @@ define <2 x i64> @ge_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: ge_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
@@ -301,25 +320,25 @@ define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ge_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: ge_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxud %xmm0, %xmm1
; SSE42-NEXT: pcmpeqd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: ge_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgeud %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp uge <4 x i32> %a, %b
@@ -329,32 +348,32 @@ define <4 x i32> @ge_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @ge_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: ge_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: ge_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxuw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: ge_v8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxuw %xmm0, %xmm1
; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: ge_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgeuw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp uge <8 x i16> %a, %b
@@ -364,19 +383,19 @@ define <8 x i16> @ge_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @ge_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: ge_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ge_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: ge_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgeub %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp uge <16 x i8> %a, %b
@@ -390,7 +409,7 @@ define <16 x i8> @ge_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: gt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -405,7 +424,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: gt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm0
@@ -420,7 +439,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: gt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm2, %xmm1
; SSE42-NEXT: pxor %xmm2, %xmm0
@@ -428,7 +447,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: retq
;
; AVX-LABEL: gt_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -436,7 +455,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ugt <2 x i64> %a, %b
@@ -446,7 +465,7 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: gt_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm0
@@ -454,7 +473,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: gt_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -462,7 +481,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: gt_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -470,12 +489,12 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX2-NEXT: retq
;
; XOP-LABEL: gt_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtud %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: gt_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -488,7 +507,7 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: gt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm0
@@ -496,7 +515,7 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: gt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -504,7 +523,7 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ugt <8 x i16> %a, %b
@@ -514,7 +533,7 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: gt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: pxor %xmm2, %xmm1
; SSE-NEXT: pxor %xmm2, %xmm0
@@ -522,7 +541,7 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: gt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -530,7 +549,7 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: gt_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomgtub %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ugt <16 x i8> %a, %b
@@ -544,7 +563,7 @@ define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: le_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -561,7 +580,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: le_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm0
@@ -578,7 +597,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: le_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm2, %xmm1
; SSE42-NEXT: pxor %xmm2, %xmm0
@@ -587,20 +606,39 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: pxor %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: le_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: le_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: le_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: le_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomleuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: le_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ule <2 x i64> %a, %b
%2 = sext <2 x i1> %1 to <2 x i64>
ret <2 x i64> %2
@@ -608,7 +646,7 @@ define <2 x i64> @le_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: le_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -618,25 +656,25 @@ define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: le_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: le_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminud %xmm0, %xmm1
; SSE42-NEXT: pcmpeqd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: le_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomleud %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ule <4 x i32> %a, %b
@@ -646,32 +684,32 @@ define <4 x i32> @le_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @le_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: le_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psubusw %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: le_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminuw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: le_v8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminuw %xmm0, %xmm1
; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: le_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomleuw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ule <8 x i16> %a, %b
@@ -681,19 +719,19 @@ define <8 x i16> @le_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @le_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: le_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: le_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: le_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomleub %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ule <16 x i8> %a, %b
@@ -707,7 +745,7 @@ define <16 x i8> @le_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: lt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm1
@@ -722,7 +760,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: lt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm1
@@ -737,7 +775,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; SSE42-LABEL: lt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm2, %xmm0
; SSE42-NEXT: pxor %xmm1, %xmm2
@@ -746,7 +784,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-NEXT: retq
;
; AVX-LABEL: lt_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -754,7 +792,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ult <2 x i64> %a, %b
@@ -764,7 +802,7 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-LABEL: lt_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm2
@@ -773,7 +811,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: lt_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -781,7 +819,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: lt_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -789,12 +827,12 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX2-NEXT: retq
;
; XOP-LABEL: lt_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: lt_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -807,7 +845,7 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-LABEL: lt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm2
@@ -816,7 +854,7 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: lt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -824,7 +862,7 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ult <8 x i16> %a, %b
@@ -834,7 +872,7 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: lt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm2
@@ -843,7 +881,7 @@ define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: lt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
@@ -851,7 +889,7 @@ define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: lt_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = icmp ult <16 x i8> %a, %b
diff --git a/test/CodeGen/X86/vec_compare-sse4.ll b/test/CodeGen/X86/vec_compare-sse4.ll
index 714701897918..bd9ac20e8eeb 100644
--- a/test/CodeGen/X86/vec_compare-sse4.ll
+++ b/test/CodeGen/X86/vec_compare-sse4.ll
@@ -5,7 +5,7 @@
define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
; SSE2-LABEL: test1:
-; SSE2: ## BB#0:
+; SSE2: ## %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -20,7 +20,7 @@ define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
; SSE2-NEXT: retl
;
; SSE41-LABEL: test1:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm0
@@ -35,7 +35,7 @@ define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
; SSE41-NEXT: retl
;
; SSE42-LABEL: test1:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: retl
%C = icmp sgt <2 x i64> %A, %B
@@ -45,19 +45,19 @@ define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
define <2 x i64> @test2(<2 x i64> %A, <2 x i64> %B) nounwind {
; SSE2-LABEL: test2:
-; SSE2: ## BB#0:
+; SSE2: ## %bb.0:
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: retl
;
; SSE41-LABEL: test2:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: pcmpeqq %xmm1, %xmm0
; SSE41-NEXT: retl
;
; SSE42-LABEL: test2:
-; SSE42: ## BB#0:
+; SSE42: ## %bb.0:
; SSE42-NEXT: pcmpeqq %xmm1, %xmm0
; SSE42-NEXT: retl
%C = icmp eq <2 x i64> %A, %B
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index df3eae3399f3..ed81cb5a78d5 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
diff --git a/test/CodeGen/X86/vec_ctbits.ll b/test/CodeGen/X86/vec_ctbits.ll
index 65279f7c8494..781c61b5789e 100644
--- a/test/CodeGen/X86/vec_ctbits.ll
+++ b/test/CodeGen/X86/vec_ctbits.ll
@@ -7,7 +7,7 @@ declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
define <2 x i64> @footz(<2 x i64> %a) nounwind {
; CHECK-LABEL: footz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: psubq %xmm0, %xmm2
@@ -36,7 +36,7 @@ define <2 x i64> @footz(<2 x i64> %a) nounwind {
}
define <2 x i64> @foolz(<2 x i64> %a) nounwind {
; CHECK-LABEL: foolz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $1, %xmm1
; CHECK-NEXT: por %xmm0, %xmm1
@@ -81,7 +81,7 @@ define <2 x i64> @foolz(<2 x i64> %a) nounwind {
define <2 x i64> @foopop(<2 x i64> %a) nounwind {
; CHECK-LABEL: foopop:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlq $1, %xmm1
; CHECK-NEXT: pand {{.*}}(%rip), %xmm1
@@ -110,7 +110,7 @@ declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
define <2 x i32> @promtz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promtz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: por {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm2, %xmm2
@@ -140,7 +140,7 @@ define <2 x i32> @promtz(<2 x i32> %a) nounwind {
}
define <2 x i32> @promlz(<2 x i32> %a) nounwind {
; CHECK-LABEL: promlz:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movdqa %xmm0, %xmm2
@@ -187,7 +187,7 @@ define <2 x i32> @promlz(<2 x i32> %a) nounwind {
define <2 x i32> @prompop(<2 x i32> %a) nounwind {
; CHECK-LABEL: prompop:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: movdqa %xmm0, %xmm1
diff --git a/test/CodeGen/X86/vec_ext_inreg.ll b/test/CodeGen/X86/vec_ext_inreg.ll
index 1ee4b24b62f2..157f2cad6fa5 100644
--- a/test/CodeGen/X86/vec_ext_inreg.ll
+++ b/test/CodeGen/X86/vec_ext_inreg.ll
@@ -5,7 +5,7 @@
define <8 x i32> @a(<8 x i32> %a) nounwind {
; SSE-LABEL: a:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: pslld $16, %xmm1
@@ -13,7 +13,7 @@ define <8 x i32> @a(<8 x i32> %a) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -23,7 +23,7 @@ define <8 x i32> @a(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: a:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $16, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -34,13 +34,13 @@ define <8 x i32> @a(<8 x i32> %a) nounwind {
define <3 x i32> @b(<3 x i32> %a) nounwind {
; SSE-LABEL: b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -51,7 +51,7 @@ define <3 x i32> @b(<3 x i32> %a) nounwind {
define <1 x i32> @c(<1 x i32> %a) nounwind {
; ALL-LABEL: c:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movswl %di, %eax
; ALL-NEXT: retq
%b = trunc <1 x i32> %a to <1 x i16>
@@ -61,20 +61,20 @@ define <1 x i32> @c(<1 x i32> %a) nounwind {
define <8 x i32> @d(<8 x i32> %a) nounwind {
; SSE-LABEL: d:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: d:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: d:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
%b = trunc <8 x i32> %a to <8 x i16>
@@ -84,12 +84,12 @@ define <8 x i32> @d(<8 x i32> %a) nounwind {
define <3 x i32> @e(<3 x i32> %a) nounwind {
; SSE-LABEL: e:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: e:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT: retq
@@ -100,7 +100,7 @@ define <3 x i32> @e(<3 x i32> %a) nounwind {
define <1 x i32> @f(<1 x i32> %a) nounwind {
; ALL-LABEL: f:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movzwl %di, %eax
; ALL-NEXT: retq
%b = trunc <1 x i32> %a to <1 x i16>
diff --git a/test/CodeGen/X86/vec_extract-avx.ll b/test/CodeGen/X86/vec_extract-avx.ll
index deaa7c7e4b03..3a9d8348ad54 100644
--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@@ -10,14 +10,14 @@
; Extracting the low elements only requires using the right kind of store.
define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; X32-LABEL: low_v8f32_to_v4f32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: low_v8f32_to_v4f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %xmm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -36,14 +36,14 @@ define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; Extracting the high elements requires just one AVX instruction.
define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; X32-LABEL: high_v8f32_to_v4f32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: high_v8f32_to_v4f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -64,14 +64,14 @@ define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
; have AVX2, we should generate vextracti128 (the int version).
define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
; X32-LABEL: high_v8i32_to_v4i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: high_v8i32_to_v4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -90,14 +90,14 @@ define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
; Make sure that element size doesn't alter the codegen.
define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
; X32-LABEL: high_v4f64_to_v2f64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vextractf128 $1, %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: high_v4f64_to_v2f64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vextractf128 $1, %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
@@ -114,20 +114,20 @@ define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
; X32-LABEL: legal_vzmovl_2i32_8i32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; X32-NEXT: vmovaps %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2i32_8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
@@ -141,20 +141,20 @@ define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) {
define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
; X32-LABEL: legal_vzmovl_2i64_4i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovupd (%ecx), %xmm0
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X32-NEXT: vmovapd %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2i64_4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovupd (%rdi), %xmm0
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X64-NEXT: vmovapd %ymm0, (%rsi)
; X64-NEXT: vzeroupper
@@ -168,7 +168,7 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) {
define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
; X32-LABEL: legal_vzmovl_2f32_8f32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -177,9 +177,9 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2f32_8f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; X64-NEXT: vmovaps %ymm0, (%rsi)
; X64-NEXT: vzeroupper
@@ -193,20 +193,20 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) {
define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) {
; X32-LABEL: legal_vzmovl_2f64_4f64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovupd (%ecx), %xmm0
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X32-NEXT: vmovapd %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: legal_vzmovl_2f64_4f64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovupd (%rdi), %xmm0
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X64-NEXT: vmovapd %ymm0, (%rsi)
; X64-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/vec_extract-mmx.ll b/test/CodeGen/X86/vec_extract-mmx.ll
index a137d052d296..d8502d831fd0 100644
--- a/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/test/CodeGen/X86/vec_extract-mmx.ll
@@ -4,7 +4,7 @@
define i32 @test0(<1 x i64>* %v4) nounwind {
; X32-LABEL: test0:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-8, %esp
@@ -22,7 +22,7 @@ define i32 @test0(<1 x i64>* %v4) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test0:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufw $238, (%rdi), %mm0 # mm0 = mem[2,3,2,3]
; X64-NEXT: movd %mm0, %eax
; X64-NEXT: addl $32, %eax
@@ -43,7 +43,7 @@ entry:
define i32 @test1(i32* nocapture readonly %ptr) nounwind {
; X32-LABEL: test1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd (%eax), %mm0
; X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
@@ -52,7 +52,7 @@ define i32 @test1(i32* nocapture readonly %ptr) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd (%rdi), %mm0
; X64-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
; X64-NEXT: movd %mm0, %eax
@@ -78,7 +78,7 @@ entry:
define i32 @test2(i32* nocapture readonly %ptr) nounwind {
; X32-LABEL: test2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
; X32-NEXT: movd %mm0, %eax
@@ -86,7 +86,7 @@ define i32 @test2(i32* nocapture readonly %ptr) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufw $232, (%rdi), %mm0 # mm0 = mem[0,2,2,3]
; X64-NEXT: movd %mm0, %eax
; X64-NEXT: emms
@@ -106,12 +106,12 @@ entry:
define i32 @test3(x86_mmx %a) nounwind {
; X32-LABEL: test3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movd %mm0, %eax
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %mm0, %eax
; X64-NEXT: retq
%tmp0 = bitcast x86_mmx %a to <2 x i32>
@@ -122,7 +122,7 @@ define i32 @test3(x86_mmx %a) nounwind {
; Verify we don't muck with extractelts from the upper lane.
define i32 @test4(x86_mmx %a) nounwind {
; X32-LABEL: test4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-8, %esp
@@ -136,7 +136,7 @@ define i32 @test4(x86_mmx %a) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index f073f1538d2e..2d9eb7c5daa9 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -4,7 +4,7 @@
define void @t1(float* %R, <4 x float>* %P1) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -12,7 +12,7 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss %xmm0, (%rdi)
; X64-NEXT: retq
@@ -24,7 +24,7 @@ define void @t1(float* %R, <4 x float>* %P1) nounwind {
define float @t2(<4 x float>* %P1) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
@@ -34,7 +34,7 @@ define float @t2(<4 x float>* %P1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
%X = load <4 x float>, <4 x float>* %P1
@@ -44,7 +44,7 @@ define float @t2(<4 x float>* %P1) nounwind {
define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
; X32-LABEL: t3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl 12(%ecx), %ecx
@@ -52,7 +52,7 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl 12(%rsi), %eax
; X64-NEXT: movl %eax, (%rdi)
; X64-NEXT: retq
@@ -64,13 +64,13 @@ define void @t3(i32* %R, <4 x i32>* %P1) nounwind {
define i32 @t4(<4 x i32>* %P1) nounwind {
; X32-LABEL: t4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl 12(%eax), %eax
; X32-NEXT: retl
;
; X64-LABEL: t4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl 12(%rdi), %eax
; X64-NEXT: retq
%X = load <4 x i32>, <4 x i32>* %P1
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 58d8392b235a..7c1a532ab7cc 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -4,7 +4,7 @@
define void @test1(<4 x float>* %F, float* %f) nounwind {
; X32-LABEL: test1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movaps (%ecx), %xmm0
@@ -13,7 +13,7 @@ define void @test1(<4 x float>* %F, float* %f) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: movss %xmm0, (%rsi)
@@ -28,7 +28,7 @@ entry:
define float @test2(<4 x float>* %F, float* %f) nounwind {
; X32-LABEL: test2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
@@ -40,7 +40,7 @@ define float @test2(<4 x float>* %F, float* %f) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: addps %xmm0, %xmm0
; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -54,7 +54,7 @@ entry:
define void @test3(float* %R, <4 x float>* %P1) nounwind {
; X32-LABEL: test3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -62,7 +62,7 @@ define void @test3(float* %R, <4 x float>* %P1) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss %xmm0, (%rdi)
; X64-NEXT: retq
@@ -75,7 +75,7 @@ entry:
define double @test4(double %A) nounwind {
; X32-LABEL: test4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: calll foo
; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -86,7 +86,7 @@ define double @test4(double %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pushq %rax
; X64-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; X64-NEXT: callq foo
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index aef62774e177..892599a3d7f9 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -11,32 +11,32 @@
define <2 x double> @fabs_v2f64(<2 x double> %p) {
; X32_AVX-LABEL: fabs_v2f64:
-; X32_AVX: # BB#0:
+; X32_AVX: # %bb.0:
; X32_AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X32_AVX512VL-LABEL: fabs_v2f64:
-; X32_AVX512VL: # BB#0:
+; X32_AVX512VL: # %bb.0:
; X32_AVX512VL-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0
; X32_AVX512VL-NEXT: retl
;
; X32_AVX512VLDQ-LABEL: fabs_v2f64:
-; X32_AVX512VLDQ: # BB#0:
+; X32_AVX512VLDQ: # %bb.0:
; X32_AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32_AVX512VLDQ-NEXT: retl
;
; X64_AVX-LABEL: fabs_v2f64:
-; X64_AVX: # BB#0:
+; X64_AVX: # %bb.0:
; X64_AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: retq
;
; X64_AVX512VL-LABEL: fabs_v2f64:
-; X64_AVX512VL: # BB#0:
+; X64_AVX512VL: # %bb.0:
; X64_AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512VL-NEXT: retq
;
; X64_AVX512VLDQ-LABEL: fabs_v2f64:
-; X64_AVX512VLDQ: # BB#0:
+; X64_AVX512VLDQ: # %bb.0:
; X64_AVX512VLDQ-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512VLDQ-NEXT: retq
%t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
@@ -46,32 +46,32 @@ declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
define <4 x float> @fabs_v4f32(<4 x float> %p) {
; X32_AVX-LABEL: fabs_v4f32:
-; X32_AVX: # BB#0:
+; X32_AVX: # %bb.0:
; X32_AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X32_AVX512VL-LABEL: fabs_v4f32:
-; X32_AVX512VL: # BB#0:
+; X32_AVX512VL: # %bb.0:
; X32_AVX512VL-NEXT: vpandd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0
; X32_AVX512VL-NEXT: retl
;
; X32_AVX512VLDQ-LABEL: fabs_v4f32:
-; X32_AVX512VLDQ: # BB#0:
+; X32_AVX512VLDQ: # %bb.0:
; X32_AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}{1to4}, %xmm0, %xmm0
; X32_AVX512VLDQ-NEXT: retl
;
; X64_AVX-LABEL: fabs_v4f32:
-; X64_AVX: # BB#0:
+; X64_AVX: # %bb.0:
; X64_AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: retq
;
; X64_AVX512VL-LABEL: fabs_v4f32:
-; X64_AVX512VL: # BB#0:
+; X64_AVX512VL: # %bb.0:
; X64_AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
; X64_AVX512VL-NEXT: retq
;
; X64_AVX512VLDQ-LABEL: fabs_v4f32:
-; X64_AVX512VLDQ: # BB#0:
+; X64_AVX512VLDQ: # %bb.0:
; X64_AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to4}, %xmm0, %xmm0
; X64_AVX512VLDQ-NEXT: retq
%t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
@@ -81,32 +81,32 @@ declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
define <4 x double> @fabs_v4f64(<4 x double> %p) {
; X32_AVX-LABEL: fabs_v4f64:
-; X32_AVX: # BB#0:
+; X32_AVX: # %bb.0:
; X32_AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32_AVX-NEXT: retl
;
; X32_AVX512VL-LABEL: fabs_v4f64:
-; X32_AVX512VL: # BB#0:
+; X32_AVX512VL: # %bb.0:
; X32_AVX512VL-NEXT: vpandq {{\.LCPI.*}}{1to4}, %ymm0, %ymm0
; X32_AVX512VL-NEXT: retl
;
; X32_AVX512VLDQ-LABEL: fabs_v4f64:
-; X32_AVX512VLDQ: # BB#0:
+; X32_AVX512VLDQ: # %bb.0:
; X32_AVX512VLDQ-NEXT: vandpd {{\.LCPI.*}}{1to4}, %ymm0, %ymm0
; X32_AVX512VLDQ-NEXT: retl
;
; X64_AVX-LABEL: fabs_v4f64:
-; X64_AVX: # BB#0:
+; X64_AVX: # %bb.0:
; X64_AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64_AVX-NEXT: retq
;
; X64_AVX512VL-LABEL: fabs_v4f64:
-; X64_AVX512VL: # BB#0:
+; X64_AVX512VL: # %bb.0:
; X64_AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm0
; X64_AVX512VL-NEXT: retq
;
; X64_AVX512VLDQ-LABEL: fabs_v4f64:
-; X64_AVX512VLDQ: # BB#0:
+; X64_AVX512VLDQ: # %bb.0:
; X64_AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
; X64_AVX512VLDQ-NEXT: retq
%t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
@@ -116,32 +116,32 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
define <8 x float> @fabs_v8f32(<8 x float> %p) {
; X32_AVX-LABEL: fabs_v8f32:
-; X32_AVX: # BB#0:
+; X32_AVX: # %bb.0:
; X32_AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32_AVX-NEXT: retl
;
; X32_AVX512VL-LABEL: fabs_v8f32:
-; X32_AVX512VL: # BB#0:
+; X32_AVX512VL: # %bb.0:
; X32_AVX512VL-NEXT: vpandd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
; X32_AVX512VL-NEXT: retl
;
; X32_AVX512VLDQ-LABEL: fabs_v8f32:
-; X32_AVX512VLDQ: # BB#0:
+; X32_AVX512VLDQ: # %bb.0:
; X32_AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}{1to8}, %ymm0, %ymm0
; X32_AVX512VLDQ-NEXT: retl
;
; X64_AVX-LABEL: fabs_v8f32:
-; X64_AVX: # BB#0:
+; X64_AVX: # %bb.0:
; X64_AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64_AVX-NEXT: retq
;
; X64_AVX512VL-LABEL: fabs_v8f32:
-; X64_AVX512VL: # BB#0:
+; X64_AVX512VL: # %bb.0:
; X64_AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
; X64_AVX512VL-NEXT: retq
;
; X64_AVX512VLDQ-LABEL: fabs_v8f32:
-; X64_AVX512VLDQ: # BB#0:
+; X64_AVX512VLDQ: # %bb.0:
; X64_AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to8}, %ymm0, %ymm0
; X64_AVX512VLDQ-NEXT: retq
%t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
@@ -151,36 +151,36 @@ declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
define <8 x double> @fabs_v8f64(<8 x double> %p) {
; X32_AVX-LABEL: fabs_v8f64:
-; X32_AVX: # BB#0:
+; X32_AVX: # %bb.0:
; X32_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}}]
; X32_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X32_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X32_AVX-NEXT: retl
;
; X32_AVX512VL-LABEL: fabs_v8f64:
-; X32_AVX512VL: # BB#0:
+; X32_AVX512VL: # %bb.0:
; X32_AVX512VL-NEXT: vpandq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X32_AVX512VL-NEXT: retl
;
; X32_AVX512VLDQ-LABEL: fabs_v8f64:
-; X32_AVX512VLDQ: # BB#0:
+; X32_AVX512VLDQ: # %bb.0:
; X32_AVX512VLDQ-NEXT: vandpd {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
; X32_AVX512VLDQ-NEXT: retl
;
; X64_AVX-LABEL: fabs_v8f64:
-; X64_AVX: # BB#0:
+; X64_AVX: # %bb.0:
; X64_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}}]
; X64_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64_AVX-NEXT: retq
;
; X64_AVX512VL-LABEL: fabs_v8f64:
-; X64_AVX512VL: # BB#0:
+; X64_AVX512VL: # %bb.0:
; X64_AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64_AVX512VL-NEXT: retq
;
; X64_AVX512VLDQ-LABEL: fabs_v8f64:
-; X64_AVX512VLDQ: # BB#0:
+; X64_AVX512VLDQ: # %bb.0:
; X64_AVX512VLDQ-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
; X64_AVX512VLDQ-NEXT: retq
%t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
@@ -190,36 +190,36 @@ declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
define <16 x float> @fabs_v16f32(<16 x float> %p) {
; X32_AVX-LABEL: fabs_v16f32:
-; X32_AVX: # BB#0:
+; X32_AVX: # %bb.0:
; X32_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}}]
; X32_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X32_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X32_AVX-NEXT: retl
;
; X32_AVX512VL-LABEL: fabs_v16f32:
-; X32_AVX512VL: # BB#0:
+; X32_AVX512VL: # %bb.0:
; X32_AVX512VL-NEXT: vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X32_AVX512VL-NEXT: retl
;
; X32_AVX512VLDQ-LABEL: fabs_v16f32:
-; X32_AVX512VLDQ: # BB#0:
+; X32_AVX512VLDQ: # %bb.0:
; X32_AVX512VLDQ-NEXT: vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
; X32_AVX512VLDQ-NEXT: retl
;
; X64_AVX-LABEL: fabs_v16f32:
-; X64_AVX: # BB#0:
+; X64_AVX: # %bb.0:
; X64_AVX-NEXT: vmovaps {{.*#+}} ymm2 = [{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}},{{(nan|1\.#QNAN0e\+00)}}]
; X64_AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; X64_AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64_AVX-NEXT: retq
;
; X64_AVX512VL-LABEL: fabs_v16f32:
-; X64_AVX512VL: # BB#0:
+; X64_AVX512VL: # %bb.0:
; X64_AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64_AVX512VL-NEXT: retq
;
; X64_AVX512VLDQ-LABEL: fabs_v16f32:
-; X64_AVX512VLDQ: # BB#0:
+; X64_AVX512VLDQ: # %bb.0:
; X64_AVX512VLDQ-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
; X64_AVX512VLDQ-NEXT: retq
%t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
@@ -244,13 +244,13 @@ declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
define i64 @fabs_v2f32_1() {
; X32-LABEL: fabs_v2f32_1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF
; X32-NEXT: retl
;
; X64-LABEL: fabs_v2f32_1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movabsq $9223372032559808512, %rax # imm = 0x7FFFFFFF00000000
; X64-NEXT: retq
%bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
@@ -261,13 +261,13 @@ define i64 @fabs_v2f32_1() {
define i64 @fabs_v2f32_2() {
; X32-LABEL: fabs_v2f32_2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
; X32-NEXT: xorl %edx, %edx
; X32-NEXT: retl
;
; X64-LABEL: fabs_v2f32_2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
; X64-NEXT: retq
%bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
index 4fa79bc7fa8b..d01c6f6ea904 100644
--- a/test/CodeGen/X86/vec_floor.ll
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -1,17 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512
define <2 x double> @floor_v2f64(<2 x double> %p) {
; SSE41-LABEL: floor_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $9, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: floor_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $9, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
ret <2 x double> %t
}
@@ -19,14 +25,19 @@ declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
define <4 x float> @floor_v4f32(<4 x float> %p) {
; SSE41-LABEL: floor_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $9, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: floor_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $9, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
ret <4 x float> %t
}
@@ -34,15 +45,20 @@ declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
define <4 x double> @floor_v4f64(<4 x double> %p){
; SSE41-LABEL: floor_v4f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v4f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: floor_v4f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $9, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
ret <4 x double> %t
}
@@ -50,30 +66,88 @@ declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
define <8 x float> @floor_v8f32(<8 x float> %p) {
; SSE41-LABEL: floor_v8f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $9, %xmm0, %xmm0
; SSE41-NEXT: roundps $9, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: floor_v8f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $9, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: floor_v8f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $9, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
+define <8 x double> @floor_v8f64(<8 x double> %p){
+; SSE41-LABEL: floor_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $9, %xmm1, %xmm1
+; SSE41-NEXT: roundpd $9, %xmm2, %xmm2
+; SSE41-NEXT: roundpd $9, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v8f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $9, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: floor_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <8 x double> @llvm.floor.v8f64(<8 x double> %p)
+ ret <8 x double> %t
+}
+declare <8 x double> @llvm.floor.v8f64(<8 x double> %p)
+
+define <16 x float> @floor_v16f32(<16 x float> %p) {
+; SSE41-LABEL: floor_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundps $9, %xmm0, %xmm0
+; SSE41-NEXT: roundps $9, %xmm1, %xmm1
+; SSE41-NEXT: roundps $9, %xmm2, %xmm2
+; SSE41-NEXT: roundps $9, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: floor_v16f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX-NEXT: vroundps $9, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: floor_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <16 x float> @llvm.floor.v16f32(<16 x float> %p)
+ ret <16 x float> %t
+}
+declare <16 x float> @llvm.floor.v16f32(<16 x float> %p)
+
define <2 x double> @ceil_v2f64(<2 x double> %p) {
; SSE41-LABEL: ceil_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $10, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: ceil_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $10, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
ret <2 x double> %t
}
@@ -81,14 +155,19 @@ declare <2 x double> @llvm.ceil.v2f64(<2 x double> %p)
define <4 x float> @ceil_v4f32(<4 x float> %p) {
; SSE41-LABEL: ceil_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $10, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: ceil_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $10, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
ret <4 x float> %t
}
@@ -96,15 +175,20 @@ declare <4 x float> @llvm.ceil.v4f32(<4 x float> %p)
define <4 x double> @ceil_v4f64(<4 x double> %p) {
; SSE41-LABEL: ceil_v4f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v4f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: ceil_v4f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $10, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
ret <4 x double> %t
}
@@ -112,30 +196,88 @@ declare <4 x double> @llvm.ceil.v4f64(<4 x double> %p)
define <8 x float> @ceil_v8f32(<8 x float> %p) {
; SSE41-LABEL: ceil_v8f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $10, %xmm0, %xmm0
; SSE41-NEXT: roundps $10, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: ceil_v8f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $10, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: ceil_v8f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $10, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.ceil.v8f32(<8 x float> %p)
+define <8 x double> @ceil_v8f64(<8 x double> %p){
+; SSE41-LABEL: ceil_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $10, %xmm1, %xmm1
+; SSE41-NEXT: roundpd $10, %xmm2, %xmm2
+; SSE41-NEXT: roundpd $10, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v8f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $10, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: ceil_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
+ ret <8 x double> %t
+}
+declare <8 x double> @llvm.ceil.v8f64(<8 x double> %p)
+
+define <16 x float> @ceil_v16f32(<16 x float> %p) {
+; SSE41-LABEL: ceil_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundps $10, %xmm0, %xmm0
+; SSE41-NEXT: roundps $10, %xmm1, %xmm1
+; SSE41-NEXT: roundps $10, %xmm2, %xmm2
+; SSE41-NEXT: roundps $10, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: ceil_v16f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX-NEXT: vroundps $10, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: ceil_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
+ ret <16 x float> %t
+}
+declare <16 x float> @llvm.ceil.v16f32(<16 x float> %p)
+
define <2 x double> @trunc_v2f64(<2 x double> %p) {
; SSE41-LABEL: trunc_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: trunc_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $11, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
ret <2 x double> %t
}
@@ -143,14 +285,19 @@ declare <2 x double> @llvm.trunc.v2f64(<2 x double> %p)
define <4 x float> @trunc_v4f32(<4 x float> %p) {
; SSE41-LABEL: trunc_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $11, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: trunc_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $11, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
ret <4 x float> %t
}
@@ -158,15 +305,20 @@ declare <4 x float> @llvm.trunc.v4f32(<4 x float> %p)
define <4 x double> @trunc_v4f64(<4 x double> %p) {
; SSE41-LABEL: trunc_v4f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v4f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: trunc_v4f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $11, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
ret <4 x double> %t
}
@@ -174,30 +326,88 @@ declare <4 x double> @llvm.trunc.v4f64(<4 x double> %p)
define <8 x float> @trunc_v8f32(<8 x float> %p) {
; SSE41-LABEL: trunc_v8f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $11, %xmm0, %xmm0
; SSE41-NEXT: roundps $11, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v8f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $11, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: trunc_v8f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $11, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.trunc.v8f32(<8 x float> %p)
+define <8 x double> @trunc_v8f64(<8 x double> %p){
+; SSE41-LABEL: trunc_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE41-NEXT: roundpd $11, %xmm2, %xmm2
+; SSE41-NEXT: roundpd $11, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v8f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: trunc_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
+ ret <8 x double> %t
+}
+declare <8 x double> @llvm.trunc.v8f64(<8 x double> %p)
+
+define <16 x float> @trunc_v16f32(<16 x float> %p) {
+; SSE41-LABEL: trunc_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundps $11, %xmm0, %xmm0
+; SSE41-NEXT: roundps $11, %xmm1, %xmm1
+; SSE41-NEXT: roundps $11, %xmm2, %xmm2
+; SSE41-NEXT: roundps $11, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: trunc_v16f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: trunc_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
+ ret <16 x float> %t
+}
+declare <16 x float> @llvm.trunc.v16f32(<16 x float> %p)
+
define <2 x double> @rint_v2f64(<2 x double> %p) {
; SSE41-LABEL: rint_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $4, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: rint_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $4, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.rint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
@@ -205,14 +415,19 @@ declare <2 x double> @llvm.rint.v2f64(<2 x double> %p)
define <4 x float> @rint_v4f32(<4 x float> %p) {
; SSE41-LABEL: rint_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $4, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $4, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: rint_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $4, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.rint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
@@ -220,15 +435,20 @@ declare <4 x float> @llvm.rint.v4f32(<4 x float> %p)
define <4 x double> @rint_v4f64(<4 x double> %p) {
; SSE41-LABEL: rint_v4f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
; SSE41-NEXT: roundpd $4, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v4f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: rint_v4f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $4, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <4 x double> @llvm.rint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
@@ -236,30 +456,88 @@ declare <4 x double> @llvm.rint.v4f64(<4 x double> %p)
define <8 x float> @rint_v8f32(<8 x float> %p) {
; SSE41-LABEL: rint_v8f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $4, %xmm0, %xmm0
; SSE41-NEXT: roundps $4, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: rint_v8f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $4, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: rint_v8f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $4, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <8 x float> @llvm.rint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.rint.v8f32(<8 x float> %p)
+define <8 x double> @rint_v8f64(<8 x double> %p){
+; SSE41-LABEL: rint_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $4, %xmm1, %xmm1
+; SSE41-NEXT: roundpd $4, %xmm2, %xmm2
+; SSE41-NEXT: roundpd $4, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v8f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: rint_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <8 x double> @llvm.rint.v8f64(<8 x double> %p)
+ ret <8 x double> %t
+}
+declare <8 x double> @llvm.rint.v8f64(<8 x double> %p)
+
+define <16 x float> @rint_v16f32(<16 x float> %p) {
+; SSE41-LABEL: rint_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundps $4, %xmm0, %xmm0
+; SSE41-NEXT: roundps $4, %xmm1, %xmm1
+; SSE41-NEXT: roundps $4, %xmm2, %xmm2
+; SSE41-NEXT: roundps $4, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: rint_v16f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: rint_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <16 x float> @llvm.rint.v16f32(<16 x float> %p)
+ ret <16 x float> %t
+}
+declare <16 x float> @llvm.rint.v16f32(<16 x float> %p)
+
define <2 x double> @nearbyint_v2f64(<2 x double> %p) {
; SSE41-LABEL: nearbyint_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $12, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: nearbyint_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $12, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
ret <2 x double> %t
}
@@ -267,14 +545,19 @@ declare <2 x double> @llvm.nearbyint.v2f64(<2 x double> %p)
define <4 x float> @nearbyint_v4f32(<4 x float> %p) {
; SSE41-LABEL: nearbyint_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $12, %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $12, %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: nearbyint_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $12, %xmm0, %xmm0
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
ret <4 x float> %t
}
@@ -282,15 +565,20 @@ declare <4 x float> @llvm.nearbyint.v4f32(<4 x float> %p)
define <4 x double> @nearbyint_v4f64(<4 x double> %p) {
; SSE41-LABEL: nearbyint_v4f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
; SSE41-NEXT: roundpd $12, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v4f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: nearbyint_v4f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $12, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
ret <4 x double> %t
}
@@ -298,104 +586,187 @@ declare <4 x double> @llvm.nearbyint.v4f64(<4 x double> %p)
define <8 x float> @nearbyint_v8f32(<8 x float> %p) {
; SSE41-LABEL: nearbyint_v8f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: roundps $12, %xmm0, %xmm0
; SSE41-NEXT: roundps $12, %xmm1, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: nearbyint_v8f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vroundps $12, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; AVX512-LABEL: nearbyint_v8f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $12, %ymm0, %ymm0
+; AVX512-NEXT: retq
%t = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
ret <8 x float> %t
}
declare <8 x float> @llvm.nearbyint.v8f32(<8 x float> %p)
+define <8 x double> @nearbyint_v8f64(<8 x double> %p){
+; SSE41-LABEL: nearbyint_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE41-NEXT: roundpd $12, %xmm1, %xmm1
+; SSE41-NEXT: roundpd $12, %xmm2, %xmm2
+; SSE41-NEXT: roundpd $12, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v8f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $12, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: nearbyint_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
+ ret <8 x double> %t
+}
+declare <8 x double> @llvm.nearbyint.v8f64(<8 x double> %p)
+
+define <16 x float> @nearbyint_v16f32(<16 x float> %p) {
+; SSE41-LABEL: nearbyint_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: roundps $12, %xmm0, %xmm0
+; SSE41-NEXT: roundps $12, %xmm1, %xmm1
+; SSE41-NEXT: roundps $12, %xmm2, %xmm2
+; SSE41-NEXT: roundps $12, %xmm3, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: nearbyint_v16f32:
+; AVX: ## %bb.0:
+; AVX-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX-NEXT: vroundps $12, %ymm1, %ymm1
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: nearbyint_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %t = call <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
+ ret <16 x float> %t
+}
+declare <16 x float> @llvm.nearbyint.v16f32(<16 x float> %p)
+
;
; Constant Folding
;
define <2 x double> @const_floor_v2f64() {
; SSE41-LABEL: const_floor_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_floor_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: const_floor_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.floor.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_floor_v4f32() {
; SSE41-LABEL: const_floor_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_floor_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: const_floor_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.floor.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
define <2 x double> @const_ceil_v2f64() {
; SSE41-LABEL: const_ceil_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_ceil_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: const_ceil_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,3.000000e+00]
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.ceil.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_ceil_v4f32() {
; SSE41-LABEL: const_ceil_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_ceil_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: const_ceil_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,3.000000e+00]
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.ceil.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
define <2 x double> @const_trunc_v2f64() {
; SSE41-LABEL: const_trunc_v2f64:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_trunc_v2f64:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: const_trunc_v2f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-1.000000e+00,2.000000e+00]
+; AVX512-NEXT: retq
%t = call <2 x double> @llvm.trunc.v2f64(<2 x double> <double -1.5, double 2.5>)
ret <2 x double> %t
}
define <4 x float> @const_trunc_v4f32() {
; SSE41-LABEL: const_trunc_v4f32:
-; SSE41: ## BB#0:
+; SSE41: ## %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
; SSE41-NEXT: retq
;
; AVX-LABEL: const_trunc_v4f32:
-; AVX: ## BB#0:
+; AVX: ## %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
; AVX-NEXT: retq
+;
+; AVX512-LABEL: const_trunc_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [-3.000000e+00,6.000000e+00,-9.000000e+00,2.000000e+00]
+; AVX512-NEXT: retq
%t = call <4 x float> @llvm.trunc.v4f32(<4 x float> <float -3.5, float 6.0, float -9.0, float 2.5>)
ret <4 x float> %t
}
diff --git a/test/CodeGen/X86/vec_fneg.ll b/test/CodeGen/X86/vec_fneg.ll
index 9804f0ef983b..d198964bf1d7 100644
--- a/test/CodeGen/X86/vec_fneg.ll
+++ b/test/CodeGen/X86/vec_fneg.ll
@@ -9,12 +9,12 @@
; This test verifies that we use an xor with a constant to flip the sign bits; no subtraction needed.
define <4 x float> @t1(<4 x float> %Q) nounwind {
; X32-SSE-LABEL: t1:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: t1:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: retq
%tmp = fsub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %Q
@@ -24,14 +24,14 @@ define <4 x float> @t1(<4 x float> %Q) nounwind {
; This test verifies that we generate an FP subtraction because "0.0 - x" is not an fneg.
define <4 x float> @t2(<4 x float> %Q) nounwind {
; X32-SSE-LABEL: t2:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm1, %xmm1
; X32-SSE-NEXT: subps %xmm0, %xmm1
; X32-SSE-NEXT: movaps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: t2:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorps %xmm1, %xmm1
; X64-SSE-NEXT: subps %xmm0, %xmm1
; X64-SSE-NEXT: movaps %xmm1, %xmm0
@@ -53,7 +53,7 @@ define <4 x float> @t2(<4 x float> %Q) nounwind {
define <2 x float> @fneg_bitcast(i64 %i) nounwind {
; X32-SSE1-LABEL: fneg_bitcast:
-; X32-SSE1: # BB#0:
+; X32-SSE1: # %bb.0:
; X32-SSE1-NEXT: pushl %ebp
; X32-SSE1-NEXT: movl %esp, %ebp
; X32-SSE1-NEXT: andl $-16, %esp
@@ -70,7 +70,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind {
; X32-SSE1-NEXT: retl
;
; X32-SSE2-LABEL: fneg_bitcast:
-; X32-SSE2: # BB#0:
+; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl $-2147483648, %eax # imm = 0x80000000
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: xorl %eax, %ecx
@@ -81,7 +81,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind {
; X32-SSE2-NEXT: retl
;
; X64-SSE1-LABEL: fneg_bitcast:
-; X64-SSE1: # BB#0:
+; X64-SSE1: # %bb.0:
; X64-SSE1-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
; X64-SSE1-NEXT: xorq %rdi, %rax
; X64-SSE1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
@@ -89,7 +89,7 @@ define <2 x float> @fneg_bitcast(i64 %i) nounwind {
; X64-SSE1-NEXT: retq
;
; X64-SSE2-LABEL: fneg_bitcast:
-; X64-SSE2: # BB#0:
+; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movabsq $-9223372034707292160, %rax # imm = 0x8000000080000000
; X64-SSE2-NEXT: xorq %rdi, %rax
; X64-SSE2-NEXT: movq %rax, %xmm0
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 6cfe41ac503d..bdfc96ba97d5 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -18,7 +18,7 @@
define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -29,7 +29,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f64_to_2i64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vcvttsd2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm1
; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -39,7 +39,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f64_to_2i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -49,7 +49,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f64_to_2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -59,15 +59,15 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f64_to_2i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i64>
@@ -76,12 +76,12 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i32>
@@ -91,13 +91,13 @@ define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
@@ -107,15 +107,15 @@ define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm0, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_2i32:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -126,7 +126,7 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
@@ -144,7 +144,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_4f64_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vcvttsd2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
@@ -162,7 +162,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_4f64_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vcvttsd2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
@@ -180,7 +180,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_4f64_to_4i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
@@ -198,7 +198,7 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_4f64_to_4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
@@ -216,14 +216,14 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = fptosi <4 x double> %a to <4 x i64>
@@ -232,14 +232,14 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -253,7 +253,7 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
@@ -278,7 +278,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_2i64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttsd2si %xmm2, %rax
@@ -300,7 +300,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -310,7 +310,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -320,15 +320,15 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_2i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i64>
@@ -337,7 +337,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
@@ -364,7 +364,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_4i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttsd2si %xmm2, %rax
@@ -387,28 +387,28 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i32>
@@ -418,7 +418,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: subsd %xmm1, %xmm2
@@ -443,7 +443,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f64_to_2i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttsd2si %xmm2, %rax
@@ -466,28 +466,28 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f64_to_2i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f64_to_2i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f64_to_2i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i32>
@@ -497,7 +497,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm0, %xmm1
; SSE-NEXT: subsd %xmm2, %xmm1
@@ -529,7 +529,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f64_to_2i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; VEX-NEXT: vcvttsd2si %xmm1, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
@@ -541,31 +541,31 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f64_to_2i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_2i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
@@ -576,7 +576,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm2
; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: subsd %xmm3, %xmm0
@@ -619,7 +619,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f64_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3
@@ -659,7 +659,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_4f64_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3
@@ -699,7 +699,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f64_to_4i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
@@ -717,7 +717,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
@@ -735,14 +735,14 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x double> %a to <4 x i64>
@@ -751,7 +751,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movapd %xmm1, %xmm3
; SSE-NEXT: subsd %xmm2, %xmm3
@@ -795,7 +795,7 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f64_to_4i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; VEX-NEXT: vcvttsd2si %xmm1, %rax
; VEX-NEXT: vcvttsd2si %xmm0, %rcx
@@ -811,29 +811,29 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f64_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f64_to_4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
@@ -847,13 +847,13 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f32_to_2i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
@@ -863,12 +863,12 @@ define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f32_to_4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = fptosi <4 x float> %a to <4 x i32>
@@ -877,7 +877,7 @@ define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptosi_2f32_to_2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -888,7 +888,7 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f32_to_2i64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vcvttss2si %xmm0, %rax
; VEX-NEXT: vmovq %rax, %xmm1
; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -898,7 +898,7 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_2f32_to_2i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttss2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -908,7 +908,7 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_2f32_to_2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -918,7 +918,7 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm1
; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -928,7 +928,7 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
@@ -938,7 +938,7 @@ define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -949,7 +949,7 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_4f32_to_2i64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm1, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
@@ -959,7 +959,7 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptosi_4f32_to_2i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
@@ -969,7 +969,7 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_4f32_to_2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
@@ -979,17 +979,17 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f32_to_2i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
-; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptosi <4 x float> %a to <4 x i64>
@@ -999,13 +999,13 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttps2dq %xmm0, %xmm0
; SSE-NEXT: cvttps2dq %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_8f32_to_8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
; AVX-NEXT: retq
%cvt = fptosi <8 x float> %a to <8 x i32>
@@ -1014,7 +1014,7 @@ define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptosi_4f32_to_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
@@ -1034,7 +1034,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_4f32_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
@@ -1052,7 +1052,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_4f32_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
@@ -1070,7 +1070,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_4f32_to_4i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
@@ -1088,7 +1088,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_4f32_to_4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
@@ -1106,13 +1106,13 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
; AVX512VLDQ-NEXT: retq
%shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1122,7 +1122,7 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm1
@@ -1142,7 +1142,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptosi_8f32_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX1-NEXT: vcvttss2si %xmm1, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
@@ -1160,7 +1160,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptosi_8f32_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX2-NEXT: vcvttss2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
@@ -1178,7 +1178,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptosi_8f32_to_4i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2si %xmm1, %rax
; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
@@ -1196,7 +1196,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptosi_8f32_to_4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
@@ -1214,15 +1214,15 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512VLDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VLDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptosi <8 x float> %a to <8 x i64>
%shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1235,7 +1235,7 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: subss %xmm2, %xmm1
@@ -1260,7 +1260,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f32_to_2i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttss2si %xmm2, %rax
@@ -1282,29 +1282,29 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX512VLDQ-NEXT: retq
@@ -1314,7 +1314,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
@@ -1335,7 +1335,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f32_to_4i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vcvttss2si %xmm1, %rax
; VEX-NEXT: vcvttss2si %xmm0, %rcx
@@ -1350,28 +1350,28 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f32_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x float> %a to <4 x i32>
@@ -1380,7 +1380,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptoui_2f32_to_2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: subss %xmm2, %xmm1
@@ -1405,7 +1405,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_2f32_to_2i64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
; VEX-NEXT: vcvttss2si %xmm2, %rax
@@ -1427,7 +1427,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_2f32_to_2i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -1437,7 +1437,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_2f32_to_2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -1447,7 +1447,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttss2usi %xmm0, %rax
; AVX512DQ-NEXT: vmovq %rax, %xmm1
; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
@@ -1457,7 +1457,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
@@ -1467,7 +1467,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: subss %xmm2, %xmm1
@@ -1492,7 +1492,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: fptoui_4f32_to_2i64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3
@@ -1514,7 +1514,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f32_to_2i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
@@ -1524,7 +1524,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
@@ -1534,17 +1534,17 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_2i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
-; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <4 x float> %a to <4 x i64>
@@ -1554,7 +1554,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; SSE-LABEL: fptoui_8f32_to_8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
@@ -1591,7 +1591,7 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_8f32_to_8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vcvttss2si %xmm2, %rax
@@ -1619,7 +1619,7 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_8f32_to_8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vcvttss2si %xmm2, %rax
@@ -1647,26 +1647,26 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_8f32_to_8i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_8f32_to_8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_8f32_to_8i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <8 x float> %a to <8 x i32>
@@ -1675,7 +1675,7 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm1, %xmm2
@@ -1721,7 +1721,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_4f32_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -1761,7 +1761,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_4f32_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -1801,7 +1801,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_4f32_to_4i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
@@ -1819,7 +1819,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_4f32_to_4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
@@ -1837,13 +1837,13 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
; AVX512VLDQ-NEXT: retq
%shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1853,7 +1853,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-LABEL: fptoui_8f32_to_4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm1, %xmm2
@@ -1899,7 +1899,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: fptoui_8f32_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -1939,7 +1939,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: fptoui_8f32_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
@@ -1979,7 +1979,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: fptoui_8f32_to_4i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
@@ -1997,7 +1997,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fptoui_8f32_to_4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
@@ -2015,15 +2015,15 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512VLDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VLDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VLDQ-NEXT: retq
%cvt = fptoui <8 x float> %a to <8 x i64>
%shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2036,12 +2036,12 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
define <2 x i64> @fptosi_2f64_to_2i64_const() {
; SSE-LABEL: fptosi_2f64_to_2i64_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i64_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
; AVX-NEXT: retq
%cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
@@ -2050,12 +2050,12 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() {
define <4 x i32> @fptosi_2f64_to_2i32_const() {
; SSE-LABEL: fptosi_2f64_to_2i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u>
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f64_to_2i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
; AVX-NEXT: retq
%cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
@@ -2065,13 +2065,13 @@ define <4 x i32> @fptosi_2f64_to_2i32_const() {
define <4 x i64> @fptosi_4f64_to_4i64_const() {
; SSE-LABEL: fptosi_4f64_to_4i64_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_4i64_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
; AVX-NEXT: retq
%cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
@@ -2080,12 +2080,12 @@ define <4 x i64> @fptosi_4f64_to_4i64_const() {
define <4 x i32> @fptosi_4f64_to_4i32_const() {
; SSE-LABEL: fptosi_4f64_to_4i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f64_to_4i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
; AVX-NEXT: retq
%cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
@@ -2094,12 +2094,12 @@ define <4 x i32> @fptosi_4f64_to_4i32_const() {
define <2 x i64> @fptoui_2f64_to_2i64_const() {
; SSE-LABEL: fptoui_2f64_to_2i64_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i64_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4]
; AVX-NEXT: retq
%cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
@@ -2108,12 +2108,12 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() {
define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
; SSE-LABEL: fptoui_2f64_to_2i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u>
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_2f64_to_2i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
; AVX-NEXT: retq
%cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
@@ -2123,13 +2123,13 @@ define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i64_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f64_to_4i64_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8]
; AVX-NEXT: retq
%cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
@@ -2138,12 +2138,12 @@ define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
; SSE-LABEL: fptoui_4f64_to_4i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f64_to_4i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8]
; AVX-NEXT: retq
%cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
@@ -2152,12 +2152,12 @@ define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
define <4 x i32> @fptosi_4f32_to_4i32_const() {
; SSE-LABEL: fptosi_4f32_to_4i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f32_to_4i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; AVX-NEXT: retq
%cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
@@ -2166,13 +2166,13 @@ define <4 x i32> @fptosi_4f32_to_4i32_const() {
define <4 x i64> @fptosi_4f32_to_4i64_const() {
; SSE-LABEL: fptosi_4f32_to_4i64_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_4f32_to_4i64_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
; AVX-NEXT: retq
%cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
@@ -2181,13 +2181,13 @@ define <4 x i64> @fptosi_4f32_to_4i64_const() {
define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
; SSE-LABEL: fptosi_8f32_to_8i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_8f32_to_8i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
; AVX-NEXT: retq
%cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
@@ -2196,12 +2196,12 @@ define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
; SSE-LABEL: fptoui_4f32_to_4i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f32_to_4i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6]
; AVX-NEXT: retq
%cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
@@ -2210,13 +2210,13 @@ define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
define <4 x i64> @fptoui_4f32_to_4i64_const() {
; SSE-LABEL: fptoui_4f32_to_4i64_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_4f32_to_4i64_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
; AVX-NEXT: retq
%cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
@@ -2225,13 +2225,13 @@ define <4 x i64> @fptoui_4f32_to_4i64_const() {
define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
; SSE-LABEL: fptoui_8f32_to_8i32_const:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1]
; SSE-NEXT: retq
;
; AVX-LABEL: fptoui_8f32_to_8i32_const:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
; AVX-NEXT: retq
%cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
@@ -2244,7 +2244,7 @@ define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; SSE-LABEL: fptosi_2f16_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %rax
; SSE-NEXT: movss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
; SSE-NEXT: callq __gnu_f2h_ieee
@@ -2267,7 +2267,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; SSE-NEXT: retq
;
; VEX-LABEL: fptosi_2f16_to_4i32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: pushq %rax
; VEX-NEXT: vmovss %xmm1, {{[0-9]+}}(%rsp) # 4-byte Spill
; VEX-NEXT: callq __gnu_f2h_ieee
@@ -2288,67 +2288,19 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
; VEX-NEXT: popq %rax
; VEX-NEXT: retq
;
-; AVX512F-LABEL: fptosi_2f16_to_4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f16_to_4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512DQ-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512DQ-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512DQ-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm1
-; AVX512VLDQ-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VLDQ-NEXT: vmovq %rax, %xmm0
-; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
-; AVX512VLDQ-NEXT: retq
+; AVX512-LABEL: fptosi_2f16_to_4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm1, %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512-NEXT: retq
%cvt = fptosi <2 x half> %a to <2 x i32>
%ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x i32> %ext
@@ -2356,7 +2308,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
; SSE-LABEL: fptosi_2f80_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: fldt {{[0-9]+}}(%rsp)
; SSE-NEXT: fldt {{[0-9]+}}(%rsp)
; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
@@ -2373,22 +2325,22 @@ define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: fistpll -{{[0-9]+}}(%rsp)
; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f80_to_4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
; AVX-NEXT: fisttpll -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
%cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
@@ -2398,7 +2350,7 @@ define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-LABEL: fptosi_2f128_to_4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: subq $24, %rsp
@@ -2423,7 +2375,7 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: fptosi_2f128_to_4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
; AVX-NEXT: subq $24, %rsp
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 609ed0882092..6b546ea9e128 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -9,7 +9,7 @@
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
; X32-SSE-LABEL: fpext_frommem:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01]
@@ -17,7 +17,7 @@ define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
; X32-SSE-NEXT: retl # encoding: [0xc3]
;
; X32-AVX-LABEL: fpext_frommem:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX-NEXT: vcvtps2pd (%ecx), %xmm0 # encoding: [0xc5,0xf8,0x5a,0x01]
@@ -25,7 +25,7 @@ define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
; X32-AVX-NEXT: retl # encoding: [0xc3]
;
; X32-AVX512VL-LABEL: fpext_frommem:
-; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL: # %bb.0: # %entry
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0x01]
@@ -33,19 +33,19 @@ define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07]
; X64-SSE-NEXT: movups %xmm0, (%rsi) # encoding: [0x0f,0x11,0x06]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX-LABEL: fpext_frommem:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtps2pd (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x5a,0x07]
; X64-AVX-NEXT: vmovups %xmm0, (%rsi) # encoding: [0xc5,0xf8,0x11,0x06]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: fpext_frommem:
-; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL: # %bb.0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %xmm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x06]
; X64-AVX512VL-NEXT: retq # encoding: [0xc3]
@@ -58,7 +58,7 @@ entry:
define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
; X32-SSE-LABEL: fpext_frommem4:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01]
@@ -68,7 +68,7 @@ define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
; X32-SSE-NEXT: retl # encoding: [0xc3]
;
; X32-AVX-LABEL: fpext_frommem4:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX-NEXT: vcvtps2pd (%ecx), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x01]
@@ -77,7 +77,7 @@ define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
; X32-AVX-NEXT: retl # encoding: [0xc3]
;
; X32-AVX512VL-LABEL: fpext_frommem4:
-; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL: # %bb.0: # %entry
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01]
@@ -86,7 +86,7 @@ define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem4:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07]
; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08]
; X64-SSE-NEXT: movups %xmm1, 16(%rsi) # encoding: [0x0f,0x11,0x4e,0x10]
@@ -94,14 +94,14 @@ define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX-LABEL: fpext_frommem4:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtps2pd (%rdi), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x07]
; X64-AVX-NEXT: vmovups %ymm0, (%rsi) # encoding: [0xc5,0xfc,0x11,0x06]
; X64-AVX-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: fpext_frommem4:
-; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL: # %bb.0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06]
; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -115,7 +115,7 @@ entry:
define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
; X32-SSE-LABEL: fpext_frommem8:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01]
@@ -129,7 +129,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
; X32-SSE-NEXT: retl # encoding: [0xc3]
;
; X32-AVX-LABEL: fpext_frommem8:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX-NEXT: vcvtps2pd (%ecx), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x01]
@@ -140,7 +140,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
; X32-AVX-NEXT: retl # encoding: [0xc3]
;
; X32-AVX512VL-LABEL: fpext_frommem8:
-; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL: # %bb.0: # %entry
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
; X32-AVX512VL-NEXT: vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01]
@@ -149,7 +149,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_frommem8:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07]
; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08]
; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x10]
@@ -161,7 +161,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX-LABEL: fpext_frommem8:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtps2pd (%rdi), %ymm0 # encoding: [0xc5,0xfc,0x5a,0x07]
; X64-AVX-NEXT: vcvtps2pd 16(%rdi), %ymm1 # encoding: [0xc5,0xfc,0x5a,0x4f,0x10]
; X64-AVX-NEXT: vmovups %ymm1, 32(%rsi) # encoding: [0xc5,0xfc,0x11,0x4e,0x20]
@@ -170,7 +170,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: fpext_frommem8:
-; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL: # %bb.0: # %entry
; X64-AVX512VL-NEXT: vcvtps2pd (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x07]
; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06]
; X64-AVX512VL-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
@@ -184,42 +184,42 @@ entry:
define <2 x double> @fpext_fromconst() {
; X32-SSE-LABEL: fpext_fromconst:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
; X32-SSE-NEXT: # encoding: [0x0f,0x28,0x05,A,A,A,A]
; X32-SSE-NEXT: # fixup A - offset: 3, value: {{\.LCPI.*}}, kind: FK_Data_4
; X32-SSE-NEXT: retl # encoding: [0xc3]
;
; X32-AVX-LABEL: fpext_fromconst:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
; X32-AVX-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
; X32-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
; X32-AVX-NEXT: retl # encoding: [0xc3]
;
; X32-AVX512VL-LABEL: fpext_fromconst:
-; X32-AVX512VL: # BB#0: # %entry
+; X32-AVX512VL: # %bb.0: # %entry
; X32-AVX512VL-NEXT: vmovaps {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [1.000000e+00,-2.000000e+00]
; X32-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
; X32-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
; X32-AVX512VL-NEXT: retl # encoding: [0xc3]
;
; X64-SSE-LABEL: fpext_fromconst:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
; X64-SSE-NEXT: # encoding: [0x0f,0x28,0x05,A,A,A,A]
; X64-SSE-NEXT: # fixup A - offset: 3, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX-LABEL: fpext_fromconst:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1.000000e+00,-2.000000e+00]
; X64-AVX-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
; X64-AVX-NEXT: retq # encoding: [0xc3]
;
; X64-AVX512VL-LABEL: fpext_fromconst:
-; X64-AVX512VL: # BB#0: # %entry
+; X64-AVX512VL: # %bb.0: # %entry
; X64-AVX512VL-NEXT: vmovaps {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [1.000000e+00,-2.000000e+00]
; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll
index e6a0d52c5ae8..79abeb0c59f7 100644
--- a/test/CodeGen/X86/vec_fptrunc.ll
+++ b/test/CodeGen/X86/vec_fptrunc.ll
@@ -6,7 +6,7 @@
define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
; X32-SSE-LABEL: fptrunc_frommem2:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm0
@@ -15,7 +15,7 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: fptrunc_frommem2:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vcvtpd2psx (%ecx), %xmm0
@@ -24,13 +24,13 @@ define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: fptrunc_frommem2:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm0
; X64-SSE-NEXT: movlpd %xmm0, (%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: fptrunc_frommem2:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtpd2psx (%rdi), %xmm0
; X64-AVX-NEXT: vmovlpd %xmm0, (%rsi)
; X64-AVX-NEXT: retq
@@ -43,7 +43,7 @@ entry:
define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
; X32-SSE-LABEL: fptrunc_frommem4:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0
@@ -53,7 +53,7 @@ define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: fptrunc_frommem4:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0
@@ -61,7 +61,7 @@ define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: fptrunc_frommem4:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0
; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -69,7 +69,7 @@ define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: fptrunc_frommem4:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0
; X64-AVX-NEXT: vmovupd %xmm0, (%rsi)
; X64-AVX-NEXT: retq
@@ -82,7 +82,7 @@ entry:
define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
; X32-SSE-LABEL: fptrunc_frommem8:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT: cvtpd2ps 16(%ecx), %xmm0
@@ -96,7 +96,7 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: fptrunc_frommem8:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vcvtpd2psy (%ecx), %xmm0
@@ -107,7 +107,7 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: fptrunc_frommem8:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtpd2ps 16(%rdi), %xmm0
; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm1
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -119,7 +119,7 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: fptrunc_frommem8:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtpd2psy (%rdi), %xmm0
; X64-AVX-NEXT: vcvtpd2psy 32(%rdi), %xmm1
; X64-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -135,24 +135,24 @@ entry:
define <4 x float> @fptrunc_frommem2_zext(<2 x double> * %ld) {
; X32-SSE-LABEL: fptrunc_frommem2_zext:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: cvtpd2ps (%eax), %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: fptrunc_frommem2_zext:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vcvtpd2psx (%eax), %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: fptrunc_frommem2_zext:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: cvtpd2ps (%rdi), %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: fptrunc_frommem2_zext:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vcvtpd2psx (%rdi), %xmm0
; X64-AVX-NEXT: retq
%arg = load <2 x double>, <2 x double> * %ld, align 16
@@ -163,22 +163,22 @@ define <4 x float> @fptrunc_frommem2_zext(<2 x double> * %ld) {
define <4 x float> @fptrunc_fromreg2_zext(<2 x double> %arg) {
; X32-SSE-LABEL: fptrunc_fromreg2_zext:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: cvtpd2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: fptrunc_fromreg2_zext:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vcvtpd2ps %xmm0, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: fptrunc_fromreg2_zext:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: cvtpd2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: fptrunc_fromreg2_zext:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vcvtpd2ps %xmm0, %xmm0
; X64-AVX-NEXT: retq
%cvt = fptrunc <2 x double> %arg to <2 x float>
@@ -189,26 +189,26 @@ define <4 x float> @fptrunc_fromreg2_zext(<2 x double> %arg) {
; FIXME: For exact truncations we should be able to fold this.
define <4 x float> @fptrunc_fromconst() {
; X32-SSE-LABEL: fptrunc_fromconst:
-; X32-SSE: # BB#0: # %entry
+; X32-SSE: # %bb.0: # %entry
; X32-SSE-NEXT: cvtpd2ps {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvtpd2ps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: fptrunc_fromconst:
-; X32-AVX: # BB#0: # %entry
+; X32-AVX: # %bb.0: # %entry
; X32-AVX-NEXT: vcvtpd2psy {{\.LCPI.*}}, %xmm0
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: fptrunc_fromconst:
-; X64-SSE: # BB#0: # %entry
+; X64-SSE: # %bb.0: # %entry
; X64-SSE-NEXT: cvtpd2ps {{.*}}(%rip), %xmm1
; X64-SSE-NEXT: cvtpd2ps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: fptrunc_fromconst:
-; X64-AVX: # BB#0: # %entry
+; X64-AVX: # %bb.0: # %entry
; X64-AVX-NEXT: vcvtpd2psy {{.*}}(%rip), %xmm0
; X64-AVX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vec_i64.ll b/test/CodeGen/X86/vec_i64.ll
index 03d378896806..f9666a0cdef9 100644
--- a/test/CodeGen/X86/vec_i64.ll
+++ b/test/CodeGen/X86/vec_i64.ll
@@ -6,13 +6,13 @@
define <2 x i64> @foo1(i64* %y) nounwind {
; X32-LABEL: foo1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: retl
;
; X64-LABEL: foo1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
entry:
@@ -25,13 +25,13 @@ entry:
define <4 x float> @foo2(i64* %p) nounwind {
; X32-LABEL: foo2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: retl
;
; X64-LABEL: foo2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vec_ins_extract-1.ll b/test/CodeGen/X86/vec_ins_extract-1.ll
index 1dc8b7abd207..949ef569f65b 100644
--- a/test/CodeGen/X86/vec_ins_extract-1.ll
+++ b/test/CodeGen/X86/vec_ins_extract-1.ll
@@ -7,7 +7,7 @@
define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-LABEL: t0:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -21,8 +21,8 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t0:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $3, %edi
; X64-NEXT: movl $76, -24(%rsp,%rdi,4)
@@ -35,7 +35,7 @@ define i32 @t0(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -50,8 +50,8 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movl $76, %eax
; X64-NEXT: pinsrd $0, %eax, %xmm0
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -65,7 +65,7 @@ define i32 @t1(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -78,8 +78,8 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $3, %edi
; X64-NEXT: pinsrd $0, -24(%rsp,%rdi,4), %xmm0
@@ -91,7 +91,7 @@ define <4 x i32> @t2(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-LABEL: t3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -105,8 +105,8 @@ define <4 x i32> @t3(i32 inreg %t7, <4 x i32> inreg %t8) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $3, %edi
; X64-NEXT: movss %xmm0, -24(%rsp,%rdi,4)
diff --git a/test/CodeGen/X86/vec_ins_extract.ll b/test/CodeGen/X86/vec_ins_extract.ll
index 5ff49eff6df3..e05c99778129 100644
--- a/test/CodeGen/X86/vec_ins_extract.ll
+++ b/test/CodeGen/X86/vec_ins_extract.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt < %s -sroa -instcombine | \
-; RUN: llc -march=x86 -mcpu=yonah | not grep sub.*esp
+; RUN: llc -mtriple=i686-- -mcpu=yonah | not grep sub.*esp
; This checks that various insert/extract idiom work without going to the
; stack.
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll
index 5604049d49ab..9fb0dc54f2a4 100644
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -4,14 +4,14 @@
define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; X64-NEXT: movaps %xmm1, %xmm0
@@ -22,14 +22,14 @@ define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
define <4 x i32> @t2(i32 %s, <4 x i32> %tmp) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
@@ -40,14 +40,14 @@ define <4 x i32> @t2(i32 %s, <4 x i32> %tmp) nounwind {
define <2 x double> @t3(double %s, <2 x double> %tmp) nounwind {
; X32-LABEL: t3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0:
-; X64-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-NEXT: movapd %xmm1, %xmm0
+; X64: # %bb.0:
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; X64-NEXT: movaps %xmm1, %xmm0
; X64-NEXT: retq
%tmp1 = insertelement <2 x double> %tmp, double %s, i32 1
ret <2 x double> %tmp1
@@ -55,12 +55,12 @@ define <2 x double> @t3(double %s, <2 x double> %tmp) nounwind {
define <8 x i16> @t4(i16 %s, <8 x i16> %tmp) nounwind {
; X32-LABEL: t4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pinsrw $5, %edi, %xmm0
; X64-NEXT: retq
%tmp1 = insertelement <8 x i16> %tmp, i16 %s, i32 5
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index ff8b1f14c52d..8ec6fa1cf067 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -4,7 +4,7 @@
define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
@@ -14,7 +14,7 @@ define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %xmm1
; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_insert-4.ll b/test/CodeGen/X86/vec_insert-4.ll
index 82627c54e663..060216596302 100644
--- a/test/CodeGen/X86/vec_insert-4.ll
+++ b/test/CodeGen/X86/vec_insert-4.ll
@@ -4,7 +4,7 @@
define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X32-LABEL: f:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-32, %esp
@@ -21,12 +21,12 @@ define <8 x float> @f(<8 x float> %a, i32 %b) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: f:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: pushq %rbp
; X64-NEXT: movq %rsp, %rbp
; X64-NEXT: andq $-32, %rsp
; X64-NEXT: subq $64, %rsp
-; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT: ## kill: def %edi killed %edi def %rdi
; X64-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
; X64-NEXT: movaps %xmm0, (%rsp)
; X64-NEXT: andl $7, %edi
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index e7c06a99df9c..d4a0c82e793a 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -6,7 +6,7 @@
define void @t1(i32 %a, x86_mmx* %P) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: shll $12, %ecx
@@ -16,8 +16,8 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: shll $12, %edi
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
@@ -34,7 +34,7 @@ define void @t1(i32 %a, x86_mmx* %P) nounwind {
define <4 x float> @t2(<4 x float>* %P) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm1
; X32-NEXT: xorps %xmm0, %xmm0
@@ -43,7 +43,7 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm1
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
@@ -56,14 +56,14 @@ define <4 x float> @t2(<4 x float>* %P) nounwind {
define <4 x float> @t3(<4 x float>* %P) nounwind {
; X32-LABEL: t3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; X64-NEXT: retq
@@ -74,7 +74,7 @@ define <4 x float> @t3(<4 x float>* %P) nounwind {
define <4 x float> @t4(<4 x float>* %P) nounwind {
; X32-LABEL: t4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
@@ -83,7 +83,7 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
@@ -96,12 +96,12 @@ define <4 x float> @t4(<4 x float>* %P) nounwind {
define <16 x i8> @t5(<16 x i8> %x) nounwind {
; X32-LABEL: t5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlw $8, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlw $8, %xmm0
; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
@@ -110,12 +110,12 @@ define <16 x i8> @t5(<16 x i8> %x) nounwind {
define <16 x i8> @t6(<16 x i8> %x) nounwind {
; X32-LABEL: t6:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlw $8, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t6:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlw $8, %xmm0
; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -124,12 +124,12 @@ define <16 x i8> @t6(<16 x i8> %x) nounwind {
define <16 x i8> @t7(<16 x i8> %x) nounwind {
; X32-LABEL: t7:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
; X32-NEXT: retl
;
; X64-LABEL: t7:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2>
@@ -138,12 +138,12 @@ define <16 x i8> @t7(<16 x i8> %x) nounwind {
define <16 x i8> @t8(<16 x i8> %x) nounwind {
; X32-LABEL: t8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; X32-NEXT: retl
;
; X64-LABEL: t8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 17>
@@ -152,12 +152,12 @@ define <16 x i8> @t8(<16 x i8> %x) nounwind {
define <16 x i8> @t9(<16 x i8> %x) nounwind {
; X32-LABEL: t9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; X32-NEXT: retl
;
; X64-LABEL: t9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; X64-NEXT: retq
%s = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 02db6e6d8751..bfced4b3877d 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -2,12 +2,12 @@
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+mmx,+sse4.2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+mmx,+sse4.2 | FileCheck %s --check-prefix=X64
-; MMX insertelement is not available; these are promoted to XMM.
+; MMX insertelement is not available; these are promoted to xmm.
; (Without SSE they are split to two ints, and the code is much better.)
define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X32-LABEL: mmx_movzl:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: subl $20, %esp
; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
; X32-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
@@ -21,7 +21,7 @@ define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: mmx_movzl:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: movq %mm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index 4074b6d32353..a421ff292633 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -6,7 +6,7 @@
define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
; X32-LABEL: var_insert:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -22,8 +22,8 @@ define <4 x i32> @var_insert(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: var_insert:
-; X64: # BB#0: # %entry
-; X64-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %esi killed %esi def %rsi
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $3, %esi
; X64-NEXT: movl %edi, -24(%rsp,%rsi,4)
@@ -36,7 +36,7 @@ entry:
define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
; X32-LABEL: var_extract:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: movl %esp, %ebp
; X32-NEXT: andl $-16, %esp
@@ -50,8 +50,8 @@ define i32 @var_extract(<4 x i32> %x, i32 %idx) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: var_extract:
-; X64: # BB#0: # %entry
-; X64-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: # %bb.0: # %entry
+; X64-NEXT: # kill: def %edi killed %edi def %rdi
; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: andl $3, %edi
; X64-NEXT: movl -24(%rsp,%rdi,4), %eax
diff --git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll
index ec4a0288e107..a750c6faac81 100644
--- a/test/CodeGen/X86/vec_insert-9.ll
+++ b/test/CodeGen/X86/vec_insert-9.ll
@@ -4,13 +4,13 @@
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
; X32-LABEL: var_insert2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: var_insert2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: pinsrd $3, %esi, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_insert-mmx.ll b/test/CodeGen/X86/vec_insert-mmx.ll
index fffafe7697da..39e21e90f01f 100644
--- a/test/CodeGen/X86/vec_insert-mmx.ll
+++ b/test/CodeGen/X86/vec_insert-mmx.ll
@@ -2,10 +2,10 @@
; RUN: llc < %s -mtriple=i686-darwin -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-darwin -mattr=+mmx,+sse4.1 | FileCheck %s --check-prefix=X64
-; This is not an MMX operation; promoted to XMM.
+; This is not an MMX operation; promoted to xmm.
define x86_mmx @t0(i32 %A) nounwind {
; X32-LABEL: t0:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: subl $12, %esp
; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
@@ -15,8 +15,8 @@ define x86_mmx @t0(i32 %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: t0:
-; X64: ## BB#0:
-; X64-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64: ## %bb.0:
+; X64-NEXT: ## kill: def %edi killed %edi def %rdi
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -28,12 +28,12 @@ define x86_mmx @t0(i32 %A) nounwind {
define <8 x i8> @t1(i8 zeroext %x) nounwind {
; X32-LABEL: t1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: retq
%r = insertelement <8 x i8> undef, i8 %x, i32 0
@@ -43,12 +43,12 @@ define <8 x i8> @t1(i8 zeroext %x) nounwind {
; PR2574
define <2 x float> @t2(<2 x float> %a0) {
; X32-LABEL: t2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: retq
%v1 = insertelement <2 x float> %a0, float 0.000000e+00, i32 0
@@ -62,7 +62,7 @@ define <2 x float> @t2(<2 x float> %a0) {
; PR2562
define void @t3() {
; X32-LABEL: t3:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl L_g0$non_lazy_ptr, %eax
; X32-NEXT: movl L_g1$non_lazy_ptr, %ecx
; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@@ -77,7 +77,7 @@ define void @t3() {
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: movq _g0@{{.*}}(%rip), %rax
; X64-NEXT: movq _g1@{{.*}}(%rip), %rcx
; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
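The MMX notes in vec_insert-7.ll and vec_insert-mmx.ll above describe the same lowering: there is no MMX insertelement, so the value is built in an XMM register and only materialized as an MMX value at the end. A hedged sketch of the IR shape involved (function name illustrative; it assumes the x86_mmx bitcast form these tests already use):

define x86_mmx @build_mmx_via_xmm(i32 %a) nounwind {
  ; The insert is performed on an ordinary <2 x i32> vector (handled in XMM),
  ; and the result is then reinterpreted as an MMX value.
  %v = insertelement <2 x i32> undef, i32 %a, i32 0
  %m = bitcast <2 x i32> %v to x86_mmx
  ret x86_mmx %m
}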
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 7cb1c95cb01a..30ba72760435 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -18,54 +18,54 @@
define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_2i64_to_2f64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x double>
@@ -74,12 +74,12 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_2i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i32_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
%shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
@@ -89,14 +89,14 @@ define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
@@ -106,14 +106,14 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_2i16_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i16_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
@@ -124,33 +124,33 @@ define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
@@ -160,7 +160,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_2i8_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
@@ -168,7 +168,7 @@ define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_2i8_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
@@ -179,7 +179,7 @@ define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
@@ -187,26 +187,26 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
@@ -216,98 +216,98 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm3
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: movapd %xmm3, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_4i64_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <4 x i64> %a to <4 x double>
@@ -316,7 +316,7 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
@@ -324,7 +324,7 @@ define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x double>
@@ -333,7 +333,7 @@ define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
@@ -342,7 +342,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i16_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -353,7 +353,7 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
@@ -362,22 +362,22 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -386,7 +386,7 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
@@ -396,7 +396,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i8_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -407,7 +407,7 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $24, %xmm1
@@ -417,22 +417,22 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -445,7 +445,7 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -461,45 +461,45 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; VEX: # %bb.0:
+; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <2 x i64> %a to <2 x double>
@@ -508,7 +508,7 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_2i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
@@ -519,7 +519,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i32_to_2f64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
@@ -530,28 +530,28 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i32_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i32_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i32_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
@@ -561,7 +561,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
@@ -572,7 +572,7 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
@@ -580,51 +580,51 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i32_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
-; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
@@ -634,14 +634,14 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_2i16_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i16_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
@@ -652,33 +652,33 @@ define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
@@ -688,7 +688,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_2i8_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -696,7 +696,7 @@ define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_2i8_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
; AVX-NEXT: retq
@@ -707,7 +707,7 @@ define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -715,26 +715,26 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_2f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
@@ -744,7 +744,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
@@ -769,87 +769,67 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
-; AVX1-LABEL: uitofp_4i64_to_4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_4i64_to_4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; VEX-LABEL: uitofp_4i64_to_4f64:
+; VEX: # %bb.0:
+; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1
+; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
+; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0
+; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
+; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2pd %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i64> %a to <4 x double>
@@ -858,7 +838,7 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
@@ -879,7 +859,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
@@ -890,10 +870,10 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -902,26 +882,26 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i32_to_4f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_4f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x double>
@@ -930,7 +910,7 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
@@ -940,7 +920,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i16_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -951,7 +931,7 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
@@ -961,22 +941,22 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x double>
%shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -985,7 +965,7 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -996,7 +976,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i8_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -1007,7 +987,7 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1018,22 +998,22 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x double>
%shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1046,7 +1026,7 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1058,7 +1038,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_2i64_to_4f32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
@@ -1069,7 +1049,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1080,7 +1060,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1091,15 +1071,15 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x float>
@@ -1109,7 +1089,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; SSE-LABEL: sitofp_2i64_to_4f32_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
@@ -1117,12 +1097,12 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_2i64_to_4f32_zero:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
@@ -1131,7 +1111,7 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1140,7 +1120,7 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1150,15 +1130,15 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = sitofp <2 x i64> %a to <2 x float>
@@ -1168,7 +1148,7 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1183,7 +1163,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_4i64_to_4f32_undef:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
@@ -1194,7 +1174,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1205,7 +1185,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1216,16 +1196,16 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
@@ -1236,12 +1216,12 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: sitofp_4i32_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i32_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a to <4 x float>
@@ -1250,14 +1230,14 @@ define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_4i16_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i16_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1268,36 +1248,36 @@ define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <8 x i16> %a to <8 x float>
@@ -1307,7 +1287,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_4i8_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
@@ -1315,7 +1295,7 @@ define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_4i8_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1326,7 +1306,7 @@ define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm0
@@ -1334,29 +1314,29 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
@@ -1366,7 +1346,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
@@ -1382,12 +1362,12 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
@@ -1404,7 +1384,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_4i64_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: vmovq %xmm0, %rax
@@ -1421,7 +1401,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1438,7 +1418,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1455,15 +1435,15 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
@@ -1473,13 +1453,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: sitofp_8i32_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_8i32_to_8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX-NEXT: retq
%cvt = sitofp <8 x i32> %a to <8 x float>
@@ -1488,7 +1468,7 @@ define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: sitofp_8i16_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
@@ -1499,7 +1479,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i16_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
@@ -1508,13 +1488,13 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i16_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i16_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -1524,7 +1504,7 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_8i8_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
@@ -1538,7 +1518,7 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_8i8_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
@@ -1547,13 +1527,13 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_8i8_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_8i8_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -1564,7 +1544,7 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: sitofp_16i8_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $24, %xmm1
@@ -1578,7 +1558,7 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_16i8_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
@@ -1587,16 +1567,16 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_16i8_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_16i8_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%cvt = sitofp <16 x i8> %a to <16 x float>
%shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1609,12 +1589,12 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_1
-; SSE-NEXT: # BB#2:
+; SSE-NEXT: # %bb.2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB39_3
@@ -1631,7 +1611,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB39_4
-; SSE-NEXT: # BB#5:
+; SSE-NEXT: # %bb.5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -1648,11 +1628,11 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_4f32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_1
-; VEX-NEXT: # BB#2:
+; VEX-NEXT: # %bb.2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB39_3
; VEX-NEXT: .LBB39_1:
@@ -1666,7 +1646,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB39_4
-; VEX-NEXT: # BB#5:
+; VEX-NEXT: # %bb.5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB39_6
; VEX-NEXT: .LBB39_4:
@@ -1678,17 +1658,17 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB39_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VEX-NEXT: testq %rax, %rax
+; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VEX-NEXT: js .LBB39_8
-; VEX-NEXT: # BB#7:
+; VEX-NEXT: # %bb.7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB39_8:
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1699,7 +1679,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1710,15 +1690,15 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <2 x i64> %a to <2 x float>
@@ -1728,12 +1708,12 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE-LABEL: uitofp_2i64_to_2f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB40_1
-; SSE-NEXT: # BB#2:
+; SSE-NEXT: # %bb.2:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB40_3
@@ -1749,7 +1729,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB40_4
-; SSE-NEXT: # BB#5:
+; SSE-NEXT: # %bb.5:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB40_6
@@ -1767,11 +1747,11 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_2i64_to_2f32:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB40_1
-; VEX-NEXT: # BB#2:
+; VEX-NEXT: # %bb.2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB40_3
; VEX-NEXT: .LBB40_1:
@@ -1785,7 +1765,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB40_4
-; VEX-NEXT: # BB#5:
+; VEX-NEXT: # %bb.5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; VEX-NEXT: retq
@@ -1800,7 +1780,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_2f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1809,7 +1789,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_2f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1819,15 +1799,15 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_2f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <2 x i64> %a to <2 x float>
@@ -1837,12 +1817,12 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB41_1
-; SSE-NEXT: # BB#2:
+; SSE-NEXT: # %bb.2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB41_3
@@ -1859,7 +1839,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB41_4
-; SSE-NEXT: # BB#5:
+; SSE-NEXT: # %bb.5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB41_6
@@ -1876,7 +1856,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: js .LBB41_8
-; SSE-NEXT: # BB#7:
+; SSE-NEXT: # %bb.7:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: .LBB41_8:
@@ -1884,11 +1864,11 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_4i64_to_4f32_undef:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB41_1
-; VEX-NEXT: # BB#2:
+; VEX-NEXT: # %bb.2:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; VEX-NEXT: jmp .LBB41_3
; VEX-NEXT: .LBB41_1:
@@ -1902,7 +1882,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: testq %rax, %rax
; VEX-NEXT: js .LBB41_4
-; VEX-NEXT: # BB#5:
+; VEX-NEXT: # %bb.5:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: jmp .LBB41_6
; VEX-NEXT: .LBB41_4:
@@ -1914,17 +1894,17 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
; VEX-NEXT: .LBB41_6:
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VEX-NEXT: testq %rax, %rax
+; VEX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; VEX-NEXT: js .LBB41_8
-; VEX-NEXT: # BB#7:
+; VEX-NEXT: # %bb.7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB41_8:
; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -1935,7 +1915,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -1946,16 +1926,16 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
@@ -1966,7 +1946,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-LABEL: uitofp_4i32_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: por {{.*}}(%rip), %xmm1
@@ -1977,7 +1957,7 @@ define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i32_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
@@ -1986,40 +1966,40 @@ define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i32_to_4f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i32_to_4f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i32_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i32_to_4f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <4 x i32> %a to <4 x float>
@@ -2028,14 +2008,14 @@ define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_4i16_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i16_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -2046,36 +2026,36 @@ define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <8 x i16> %a to <8 x float>
@@ -2085,7 +2065,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_4i8_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -2093,7 +2073,7 @@ define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_4i8_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -2104,7 +2084,7 @@ define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -2112,29 +2092,29 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_4f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
@@ -2144,11 +2124,11 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-LABEL: uitofp_4i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_1
-; SSE-NEXT: # BB#2:
+; SSE-NEXT: # %bb.2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB47_3
; SSE-NEXT: .LBB47_1:
@@ -2163,7 +2143,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_4
-; SSE-NEXT: # BB#5:
+; SSE-NEXT: # %bb.5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB47_6
; SSE-NEXT: .LBB47_4:
@@ -2177,7 +2157,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_7
-; SSE-NEXT: # BB#8:
+; SSE-NEXT: # %bb.8:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB47_9
@@ -2195,7 +2175,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_10
-; SSE-NEXT: # BB#11:
+; SSE-NEXT: # %bb.11:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB47_12
@@ -2209,16 +2189,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB47_12:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_1
-; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB47_3
; AVX1-NEXT: .LBB47_1:
@@ -2232,7 +2212,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_4
-; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB47_6
; AVX1-NEXT: .LBB47_4:
@@ -2248,7 +2228,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_7
-; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: # %bb.8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB47_9
; AVX1-NEXT: .LBB47_7:
@@ -2263,7 +2243,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB47_10
-; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: # %bb.11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@@ -2280,11 +2260,11 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i64_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_1
-; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: # %bb.2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB47_3
; AVX2-NEXT: .LBB47_1:
@@ -2298,7 +2278,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_4
-; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: # %bb.5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB47_6
; AVX2-NEXT: .LBB47_4:
@@ -2314,7 +2294,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_7
-; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: # %bb.8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB47_9
; AVX2-NEXT: .LBB47_7:
@@ -2329,7 +2309,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB47_10
-; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: # %bb.11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@@ -2346,7 +2326,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
@@ -2363,7 +2343,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
@@ -2380,15 +2360,15 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
@@ -2398,7 +2378,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-LABEL: uitofp_8i32_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
@@ -2419,7 +2399,7 @@ define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i32_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
@@ -2432,38 +2412,38 @@ define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i32_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_8i32_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_8i32_to_8f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_8i32_to_8f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2ps %ymm0, %ymm0
; AVX512VLDQ-NEXT: retq
%cvt = uitofp <8 x i32> %a to <8 x float>
@@ -2472,7 +2452,7 @@ define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-LABEL: uitofp_8i16_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -2483,7 +2463,7 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i16_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -2492,13 +2472,13 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i16_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i16_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -2508,7 +2488,7 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_8i8_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -2520,7 +2500,7 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_8i8_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -2529,13 +2509,13 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_8i8_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_8i8_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -2546,7 +2526,7 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-LABEL: uitofp_16i8_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -2558,7 +2538,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_16i8_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -2567,16 +2547,16 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_16i8_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_16i8_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%cvt = uitofp <16 x i8> %a to <16 x float>
%shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2589,7 +2569,7 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-LABEL: sitofp_load_2i64_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: cvtsi2sdq %rax, %xmm0
@@ -2597,49 +2577,49 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; VEX-LABEL: sitofp_load_2i64_to_2f64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovdqa (%rdi), %xmm0
; VEX-NEXT: vpextrq $1, %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; VEX-NEXT: vmovq %xmm0, %rax
; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
%ld = load <2 x i64>, <2 x i64> *%a
@@ -2649,38 +2629,14 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE-LABEL: sitofp_load_2i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: sitofp_load_2i32_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vcvtdq2pd (%rdi), %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vcvtdq2pd (%rdi), %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vcvtdq2pd (%rdi), %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_2i32_to_2f64:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: sitofp_load_2i32_to_2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
+; AVX-NEXT: retq
%ld = load <2 x i32>, <2 x i32> *%a
%cvt = sitofp <2 x i32> %ld to <2 x double>
ret <2 x double> %cvt
@@ -2688,7 +2644,7 @@ define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
; SSE-LABEL: sitofp_load_2i16_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE-NEXT: psrad $16, %xmm0
@@ -2696,7 +2652,7 @@ define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i16_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
@@ -2708,7 +2664,7 @@ define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE-LABEL: sitofp_load_2i8_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -2718,7 +2674,7 @@ define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_2i8_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
@@ -2730,7 +2686,7 @@ define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-LABEL: sitofp_load_4i64_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movq %xmm1, %rax
@@ -2739,7 +2695,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2sdq %rax, %xmm1
@@ -2747,86 +2703,86 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2sdq %rax, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_4i64_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_4i64_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
%ld = load <4 x i64>, <4 x i64> *%a
@@ -2836,7 +2792,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-LABEL: sitofp_load_4i32_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
@@ -2844,7 +2800,7 @@ define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i32_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0
; AVX-NEXT: retq
%ld = load <4 x i32>, <4 x i32> *%a
@@ -2854,7 +2810,7 @@ define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE-LABEL: sitofp_load_4i16_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: psrad $16, %xmm1
@@ -2864,7 +2820,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i16_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -2875,7 +2831,7 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE-LABEL: sitofp_load_4i8_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2886,7 +2842,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i8_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -2901,7 +2857,7 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-LABEL: uitofp_load_2i64_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
@@ -2918,48 +2874,48 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_load_2i64_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; VEX: # %bb.0:
+; VEX-NEXT: vmovapd (%rdi), %xmm0
+; VEX-NEXT: vmovapd {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vmovapd {{.*#+}} xmm3 = [4.503600e+15,1.934281e+25]
; VEX-NEXT: vsubpd %xmm3, %xmm2, %xmm2
-; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; VEX-NEXT: vsubpd %xmm3, %xmm0, %xmm0
; VEX-NEXT: vhaddpd %xmm0, %xmm2, %xmm0
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm1, %xmm1
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
%ld = load <2 x i64>, <2 x i64> *%a
@@ -2969,7 +2925,7 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE-LABEL: uitofp_load_2i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -2981,7 +2937,7 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_load_2i32_to_2f64:
-; VEX: # BB#0:
+; VEX: # %bb.0:
; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
@@ -2993,33 +2949,29 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
%ld = load <2 x i32>, <2 x i32> *%a
%cvt = uitofp <2 x i32> %ld to <2 x double>
@@ -3028,51 +2980,19 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
; SSE-LABEL: uitofp_load_2i16_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: uitofp_load_2i16_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: uitofp_load_2i16_to_2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
%ld = load <2 x i16>, <2 x i16> *%a
%cvt = uitofp <2 x i16> %ld to <2 x double>
ret <2 x double> %cvt
@@ -3080,7 +3000,7 @@ define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE-LABEL: uitofp_load_2i8_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
@@ -3089,45 +3009,12 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
; SSE-NEXT: retq
;
-; VEX-LABEL: uitofp_load_2i8_to_2f64:
-; VEX: # BB#0:
-; VEX-NEXT: movzwl (%rdi), %eax
-; VEX-NEXT: vmovd %eax, %xmm0
-; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_2i8_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movzwl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_2i8_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i8_to_2f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: movzwl (%rdi), %eax
-; AVX512DQ-NEXT: vmovd %eax, %xmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i8_to_2f64:
-; AVX512VLDQ: # BB#0:
-; AVX512VLDQ-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VLDQ-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
+; AVX-LABEL: uitofp_load_2i8_to_2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT: retq
%ld = load <2 x i8>, <2 x i8> *%a
%cvt = uitofp <2 x i8> %ld to <2 x double>
ret <2 x double> %cvt
@@ -3135,7 +3022,7 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-LABEL: uitofp_load_4i64_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
@@ -3162,91 +3049,70 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: retq
;
-; AVX1-LABEL: uitofp_load_4i64_to_4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovdqa (%rdi), %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_4i64_to_4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
-; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; VEX-LABEL: uitofp_load_4i64_to_4f64:
+; VEX: # %bb.0:
+; VEX-NEXT: vmovapd (%rdi), %ymm0
+; VEX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; VEX-NEXT: vmovapd {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25]
+; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; VEX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1
+; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1
+; VEX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3
+; VEX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; VEX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0
+; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0
+; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2
; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm2
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
%ld = load <4 x i64>, <4 x i64> *%a
@@ -3256,7 +3122,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrld $16, %xmm1
@@ -3278,7 +3144,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
@@ -3290,11 +3156,11 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i32_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65536,65536,65536,65536]
; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
@@ -3303,26 +3169,26 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2pd (%rdi), %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
%ld = load <4 x i32>, <4 x i32> *%a
@@ -3332,7 +3198,7 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE-LABEL: uitofp_load_4i16_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -3342,7 +3208,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i16_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -3353,7 +3219,7 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE-LABEL: uitofp_load_4i8_to_4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3364,7 +3230,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i8_to_4f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX-NEXT: retq
@@ -3379,7 +3245,7 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-LABEL: sitofp_load_4i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movq %xmm0, %rax
@@ -3397,11 +3263,11 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_4i64_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -3419,7 +3285,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_4i64_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -3437,7 +3303,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -3455,7 +3321,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
@@ -3473,15 +3339,15 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
%ld = load <4 x i64>, <4 x i64> *%a
@@ -3491,12 +3357,12 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-LABEL: sitofp_load_4i32_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i32_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0
; AVX-NEXT: retq
%ld = load <4 x i32>, <4 x i32> *%a
@@ -3506,7 +3372,7 @@ define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-LABEL: sitofp_load_4i16_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
@@ -3514,7 +3380,7 @@ define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i16_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -3525,7 +3391,7 @@ define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-LABEL: sitofp_load_4i8_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -3534,7 +3400,7 @@ define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_4i8_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -3545,7 +3411,7 @@ define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: sitofp_load_8i64_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
@@ -3565,7 +3431,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE-NEXT: movq %xmm3, %rax
; SSE-NEXT: xorps %xmm4, %xmm4
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
@@ -3582,11 +3448,11 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i64_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
@@ -3617,7 +3483,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_8i64_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
@@ -3648,7 +3514,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
@@ -3668,7 +3534,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
@@ -3679,7 +3545,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
@@ -3699,7 +3565,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
@@ -3710,12 +3576,12 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvtqq2ps (%rdi), %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtqq2ps (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
%ld = load <8 x i64>, <8 x i64> *%a
@@ -3725,13 +3591,13 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: sitofp_load_8i32_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_load_8i32_to_8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0
; AVX-NEXT: retq
%ld = load <8 x i32>, <8 x i32> *%a
@@ -3741,7 +3607,7 @@ define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: sitofp_load_8i16_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: psrad $16, %xmm0
@@ -3753,7 +3619,7 @@ define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i16_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -3761,13 +3627,13 @@ define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_8i16_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_load_8i16_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -3778,7 +3644,7 @@ define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: sitofp_load_8i8_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -3792,7 +3658,7 @@ define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_load_8i8_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -3802,13 +3668,13 @@ define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sitofp_load_8i8_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sitofp_load_8i8_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -3823,13 +3689,13 @@ define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-LABEL: uitofp_load_4i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_1
-; SSE-NEXT: # BB#2:
+; SSE-NEXT: # %bb.2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB76_3
; SSE-NEXT: .LBB76_1:
@@ -3844,7 +3710,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_4
-; SSE-NEXT: # BB#5:
+; SSE-NEXT: # %bb.5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB76_6
; SSE-NEXT: .LBB76_4:
@@ -3858,7 +3724,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_7
-; SSE-NEXT: # BB#8:
+; SSE-NEXT: # %bb.8:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB76_9
@@ -3876,7 +3742,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB76_10
-; SSE-NEXT: # BB#11:
+; SSE-NEXT: # %bb.11:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB76_12
@@ -3890,16 +3756,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB76_12:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i64_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_1
-; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB76_3
; AVX1-NEXT: .LBB76_1:
@@ -3913,7 +3779,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_4
-; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX1-NEXT: jmp .LBB76_6
; AVX1-NEXT: .LBB76_4:
@@ -3929,7 +3795,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_7
-; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: # %bb.8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX1-NEXT: jmp .LBB76_9
; AVX1-NEXT: .LBB76_7:
@@ -3944,7 +3810,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB76_10
-; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: # %bb.11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT: vzeroupper
@@ -3961,12 +3827,12 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i64_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_1
-; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: # %bb.2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB76_3
; AVX2-NEXT: .LBB76_1:
@@ -3980,7 +3846,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_4
-; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: # %bb.5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
; AVX2-NEXT: jmp .LBB76_6
; AVX2-NEXT: .LBB76_4:
@@ -3996,7 +3862,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_7
-; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: # %bb.8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
; AVX2-NEXT: jmp .LBB76_9
; AVX2-NEXT: .LBB76_7:
@@ -4011,7 +3877,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB76_10
-; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: # %bb.11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT: vzeroupper
@@ -4028,7 +3894,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
@@ -4046,7 +3912,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
@@ -4064,15 +3930,15 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2psy (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
%ld = load <4 x i64>, <4 x i64> *%a
@@ -4082,7 +3948,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-LABEL: uitofp_load_4i32_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -4094,7 +3960,7 @@ define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_4i32_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -4104,41 +3970,41 @@ define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i32_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %xmm2
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps (%rdi), %xmm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %xmm0
; AVX512VLDQ-NEXT: retq
%ld = load <4 x i32>, <4 x i32> *%a
@@ -4148,7 +4014,7 @@ define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-LABEL: uitofp_load_4i16_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -4156,7 +4022,7 @@ define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i16_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -4167,7 +4033,7 @@ define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-LABEL: uitofp_load_4i8_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -4176,7 +4042,7 @@ define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX-LABEL: uitofp_load_4i8_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; AVX-NEXT: retq
@@ -4187,7 +4053,7 @@ define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-LABEL: uitofp_load_8i64_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm5
; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
@@ -4195,7 +4061,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_1
-; SSE-NEXT: # BB#2:
+; SSE-NEXT: # %bb.2:
; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB80_3
; SSE-NEXT: .LBB80_1:
@@ -4210,7 +4076,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_4
-; SSE-NEXT: # BB#5:
+; SSE-NEXT: # %bb.5:
; SSE-NEXT: cvtsi2ssq %rax, %xmm4
; SSE-NEXT: jmp .LBB80_6
; SSE-NEXT: .LBB80_4:
@@ -4224,7 +4090,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_7
-; SSE-NEXT: # BB#8:
+; SSE-NEXT: # %bb.8:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: jmp .LBB80_9
@@ -4241,7 +4107,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm5, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_10
-; SSE-NEXT: # BB#11:
+; SSE-NEXT: # %bb.11:
; SSE-NEXT: cvtsi2ssq %rax, %xmm6
; SSE-NEXT: jmp .LBB80_12
; SSE-NEXT: .LBB80_10:
@@ -4255,7 +4121,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_13
-; SSE-NEXT: # BB#14:
+; SSE-NEXT: # %bb.14:
; SSE-NEXT: xorps %xmm5, %xmm5
; SSE-NEXT: cvtsi2ssq %rax, %xmm5
; SSE-NEXT: jmp .LBB80_15
@@ -4272,7 +4138,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_16
-; SSE-NEXT: # BB#17:
+; SSE-NEXT: # %bb.17:
; SSE-NEXT: cvtsi2ssq %rax, %xmm7
; SSE-NEXT: jmp .LBB80_18
; SSE-NEXT: .LBB80_16:
@@ -4288,7 +4154,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_19
-; SSE-NEXT: # BB#20:
+; SSE-NEXT: # %bb.20:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: jmp .LBB80_21
@@ -4301,13 +4167,13 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
; SSE-NEXT: .LBB80_21:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE-NEXT: movq %xmm2, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB80_22
-; SSE-NEXT: # BB#23:
+; SSE-NEXT: # %bb.23:
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB80_24
@@ -4321,17 +4187,17 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB80_24:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i64_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_1
-; AVX1-NEXT: # BB#2:
+; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX1-NEXT: jmp .LBB80_3
; AVX1-NEXT: .LBB80_1:
@@ -4345,7 +4211,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_4
-; AVX1-NEXT: # BB#5:
+; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX1-NEXT: jmp .LBB80_6
; AVX1-NEXT: .LBB80_4:
@@ -4360,7 +4226,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vmovq %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_7
-; AVX1-NEXT: # BB#8:
+; AVX1-NEXT: # %bb.8:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX1-NEXT: jmp .LBB80_9
; AVX1-NEXT: .LBB80_7:
@@ -4374,7 +4240,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_10
-; AVX1-NEXT: # BB#11:
+; AVX1-NEXT: # %bb.11:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX1-NEXT: jmp .LBB80_12
; AVX1-NEXT: .LBB80_10:
@@ -4388,7 +4254,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_13
-; AVX1-NEXT: # BB#14:
+; AVX1-NEXT: # %bb.14:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX1-NEXT: jmp .LBB80_15
; AVX1-NEXT: .LBB80_13:
@@ -4403,7 +4269,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_16
-; AVX1-NEXT: # BB#17:
+; AVX1-NEXT: # %bb.17:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX1-NEXT: jmp .LBB80_18
; AVX1-NEXT: .LBB80_16:
@@ -4420,7 +4286,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vmovq %xmm4, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_19
-; AVX1-NEXT: # BB#20:
+; AVX1-NEXT: # %bb.20:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX1-NEXT: jmp .LBB80_21
; AVX1-NEXT: .LBB80_19:
@@ -4436,7 +4302,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB80_22
-; AVX1-NEXT: # BB#23:
+; AVX1-NEXT: # %bb.23:
; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX1-NEXT: jmp .LBB80_24
; AVX1-NEXT: .LBB80_22:
@@ -4452,13 +4318,13 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i64_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_1
-; AVX2-NEXT: # BB#2:
+; AVX2-NEXT: # %bb.2:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
; AVX2-NEXT: jmp .LBB80_3
; AVX2-NEXT: .LBB80_1:
@@ -4472,7 +4338,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_4
-; AVX2-NEXT: # BB#5:
+; AVX2-NEXT: # %bb.5:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
; AVX2-NEXT: jmp .LBB80_6
; AVX2-NEXT: .LBB80_4:
@@ -4487,7 +4353,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vmovq %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_7
-; AVX2-NEXT: # BB#8:
+; AVX2-NEXT: # %bb.8:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4
; AVX2-NEXT: jmp .LBB80_9
; AVX2-NEXT: .LBB80_7:
@@ -4501,7 +4367,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_10
-; AVX2-NEXT: # BB#11:
+; AVX2-NEXT: # %bb.11:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2
; AVX2-NEXT: jmp .LBB80_12
; AVX2-NEXT: .LBB80_10:
@@ -4515,7 +4381,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_13
-; AVX2-NEXT: # BB#14:
+; AVX2-NEXT: # %bb.14:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5
; AVX2-NEXT: jmp .LBB80_15
; AVX2-NEXT: .LBB80_13:
@@ -4530,7 +4396,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_16
-; AVX2-NEXT: # BB#17:
+; AVX2-NEXT: # %bb.17:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3
; AVX2-NEXT: jmp .LBB80_18
; AVX2-NEXT: .LBB80_16:
@@ -4547,7 +4413,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vmovq %xmm4, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_19
-; AVX2-NEXT: # BB#20:
+; AVX2-NEXT: # %bb.20:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5
; AVX2-NEXT: jmp .LBB80_21
; AVX2-NEXT: .LBB80_19:
@@ -4563,7 +4429,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: vpextrq $1, %xmm4, %rax
; AVX2-NEXT: testq %rax, %rax
; AVX2-NEXT: js .LBB80_22
-; AVX2-NEXT: # BB#23:
+; AVX2-NEXT: # %bb.23:
; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2
; AVX2-NEXT: jmp .LBB80_24
; AVX2-NEXT: .LBB80_22:
@@ -4579,7 +4445,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
@@ -4599,7 +4465,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
@@ -4610,7 +4476,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
@@ -4630,7 +4496,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
@@ -4641,12 +4507,12 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcvtuqq2ps (%rdi), %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtuqq2ps (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
%ld = load <8 x i64>, <8 x i64> *%a
@@ -4656,7 +4522,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-LABEL: uitofp_load_8i32_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
@@ -4679,7 +4545,7 @@ define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i32_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -4693,39 +4559,39 @@ define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i32_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT: vbroadcastss {{.*}}(%rip), %ymm2
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps (%rdi), %ymm0
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512VLDQ: # BB#0:
+; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %ymm0
; AVX512VLDQ-NEXT: retq
%ld = load <8 x i32>, <8 x i32> *%a
@@ -4735,7 +4601,7 @@ define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-LABEL: uitofp_load_8i16_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm0
@@ -4746,7 +4612,7 @@ define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i16_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -4754,13 +4620,13 @@ define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i16_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_load_8i16_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -4771,7 +4637,7 @@ define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-LABEL: uitofp_load_8i8_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: pxor %xmm2, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -4783,7 +4649,7 @@ define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_load_8i8_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -4791,13 +4657,13 @@ define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_8i8_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: uitofp_load_8i8_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -4813,7 +4679,7 @@ define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-LABEL: aggregate_sitofp_8i16_to_8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq 24(%rdi), %rax
; SSE-NEXT: movdqu 8(%rdi), %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -4827,7 +4693,7 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; SSE-NEXT: retq
;
; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movq 24(%rdi), %rax
; AVX1-NEXT: vmovdqu 8(%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
@@ -4840,7 +4706,7 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movq 24(%rdi), %rax
; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -4849,7 +4715,7 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: movq 24(%rdi), %rax
; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0
; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
@@ -4866,12 +4732,12 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
; SSE-LABEL: sitofp_i32_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtsi2sdl %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i32_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = sitofp i32 %a1 to double
@@ -4881,12 +4747,12 @@ define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
; SSE-LABEL: sitofp_i32_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtsi2ssl %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i32_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = sitofp i32 %a1 to float
@@ -4896,12 +4762,12 @@ define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
; SSE-LABEL: sitofp_i64_to_2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtsi2sdq %rdi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i64_to_2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = sitofp i64 %a1 to double
@@ -4911,12 +4777,12 @@ define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
; SSE-LABEL: sitofp_i64_to_4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cvtsi2ssq %rdi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sitofp_i64_to_4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = sitofp i64 %a1 to float
diff --git a/test/CodeGen/X86/vec_loadsingles.ll b/test/CodeGen/X86/vec_loadsingles.ll
index b0d95c5d00da..8859270c5be9 100644
--- a/test/CodeGen/X86/vec_loadsingles.ll
+++ b/test/CodeGen/X86/vec_loadsingles.ll
@@ -4,7 +4,7 @@
define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
; ALL-LABEL: merge_2_floats:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
%tmp1 = load float, float* %p
@@ -19,7 +19,7 @@ define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
; two i64s of a <4 x i64> as a load of two i32s.
define <4 x i64> @merge_2_floats_into_4() {
; ALL-LABEL: merge_2_floats_into_4:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movq (%rax), %rax
; ALL-NEXT: vmovups (%rax), %xmm0
; ALL-NEXT: retq
@@ -37,7 +37,7 @@ define <4 x i64> @merge_2_floats_into_4() {
define <4 x float> @merge_4_floats(float* %ptr) {
; ALL-LABEL: merge_4_floats:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups (%rdi), %xmm0
; ALL-NEXT: retq
%a = load float, float* %ptr, align 8
@@ -61,12 +61,12 @@ define <4 x float> @merge_4_floats(float* %ptr) {
define <8 x float> @merge_8_floats(float* %ptr) {
; FAST32-LABEL: merge_8_floats:
-; FAST32: # BB#0:
+; FAST32: # %bb.0:
; FAST32-NEXT: vmovups (%rdi), %ymm0
; FAST32-NEXT: retq
;
; SLOW32-LABEL: merge_8_floats:
-; SLOW32: # BB#0:
+; SLOW32: # %bb.0:
; SLOW32-NEXT: vmovups (%rdi), %xmm0
; SLOW32-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; SLOW32-NEXT: retq
@@ -98,12 +98,12 @@ define <8 x float> @merge_8_floats(float* %ptr) {
define <4 x double> @merge_4_doubles(double* %ptr) {
; FAST32-LABEL: merge_4_doubles:
-; FAST32: # BB#0:
+; FAST32: # %bb.0:
; FAST32-NEXT: vmovups (%rdi), %ymm0
; FAST32-NEXT: retq
;
; SLOW32-LABEL: merge_4_doubles:
-; SLOW32: # BB#0:
+; SLOW32: # %bb.0:
; SLOW32-NEXT: vmovups (%rdi), %xmm0
; SLOW32-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
; SLOW32-NEXT: retq
@@ -126,12 +126,12 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
; first of the combined loads is offset from the base address.
define <4 x double> @merge_4_doubles_offset(double* %ptr) {
; FAST32-LABEL: merge_4_doubles_offset:
-; FAST32: # BB#0:
+; FAST32: # %bb.0:
; FAST32-NEXT: vmovups 32(%rdi), %ymm0
; FAST32-NEXT: retq
;
; SLOW32-LABEL: merge_4_doubles_offset:
-; SLOW32: # BB#0:
+; SLOW32: # %bb.0:
; SLOW32-NEXT: vmovups 32(%rdi), %xmm0
; SLOW32-NEXT: vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
; SLOW32-NEXT: retq
diff --git a/test/CodeGen/X86/vec_logical.ll b/test/CodeGen/X86/vec_logical.ll
index 92ec76009f6a..ec29d4886a2b 100644
--- a/test/CodeGen/X86/vec_logical.ll
+++ b/test/CodeGen/X86/vec_logical.ll
@@ -4,13 +4,13 @@
define void @t(<4 x float> %A) {
; SSE-LABEL: t:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps {{\.LCPI.*}}, %xmm0
; SSE-NEXT: movaps %xmm0, 0
; SSE-NEXT: retl
;
; AVX-LABEL: t:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps {{\.LCPI.*}}, %xmm0, %xmm0
; AVX-NEXT: vmovaps %xmm0, 0
; AVX-NEXT: retl
@@ -21,12 +21,12 @@ define void @t(<4 x float> %A) {
define <4 x float> @t1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: t1:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: xorps %xmm1, %xmm0
; SSE-NEXT: retl
;
; AVX-LABEL: t1:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
entry:
@@ -39,12 +39,12 @@ entry:
define <2 x double> @t2(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: t2:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: retl
;
; AVX-LABEL: t2:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
entry:
@@ -57,7 +57,7 @@ entry:
define void @t3(<4 x float> %a, <4 x float> %b, <4 x float>* %c, <4 x float>* %d) {
; SSE-LABEL: t3:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SSE-NEXT: andnps %xmm1, %xmm0
@@ -66,7 +66,7 @@ define void @t3(<4 x float> %a, <4 x float> %b, <4 x float>* %c, <4 x float>* %d
; SSE-NEXT: retl
;
; AVX-LABEL: t3:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
@@ -88,13 +88,13 @@ entry:
define <2 x i64> @andn_double_xor(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
; SSE-LABEL: andn_double_xor:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm2, %xmm1
; SSE-NEXT: andnps %xmm1, %xmm0
; SSE-NEXT: retl
;
; AVX-LABEL: andn_double_xor:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm2, %xmm1, %xmm1
; AVX-NEXT: vandnps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retl
diff --git a/test/CodeGen/X86/vec_minmax_match.ll b/test/CodeGen/X86/vec_minmax_match.ll
index 98f77912779f..a3cef49c6a42 100644
--- a/test/CodeGen/X86/vec_minmax_match.ll
+++ b/test/CodeGen/X86/vec_minmax_match.ll
@@ -6,7 +6,7 @@
define <4 x i32> @smin_vec1(<4 x i32> %x) {
; CHECK-LABEL: smin_vec1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -19,7 +19,7 @@ define <4 x i32> @smin_vec1(<4 x i32> %x) {
define <4 x i32> @smin_vec2(<4 x i32> %x) {
; CHECK-LABEL: smin_vec2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -34,7 +34,7 @@ define <4 x i32> @smin_vec2(<4 x i32> %x) {
; (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
define <4 x i32> @smin_vec3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: smin_vec3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -49,7 +49,7 @@ define <4 x i32> @smin_vec3(<4 x i32> %x, <4 x i32> %y) {
; (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
define <4 x i32> @smin_vec4(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: smin_vec4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpminsd %xmm1, %xmm0, %xmm0
@@ -62,7 +62,7 @@ define <4 x i32> @smin_vec4(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @smax_vec1(<4 x i32> %x) {
; CHECK-LABEL: smax_vec1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -75,7 +75,7 @@ define <4 x i32> @smax_vec1(<4 x i32> %x) {
define <4 x i32> @smax_vec2(<4 x i32> %x) {
; CHECK-LABEL: smax_vec2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -90,7 +90,7 @@ define <4 x i32> @smax_vec2(<4 x i32> %x) {
; (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
define <4 x i32> @smax_vec3(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: smax_vec3:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -105,7 +105,7 @@ define <4 x i32> @smax_vec3(<4 x i32> %x, <4 x i32> %y) {
; (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
define <4 x i32> @smax_vec4(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: smax_vec4:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
@@ -118,7 +118,7 @@ define <4 x i32> @smax_vec4(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @umax_vec1(<4 x i32> %x) {
; CHECK-LABEL: umax_vec1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %x, zeroinitializer
@@ -128,7 +128,7 @@ define <4 x i32> @umax_vec1(<4 x i32> %x) {
define <4 x i32> @umax_vec2(<4 x i32> %x) {
; CHECK-LABEL: umax_vec2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -138,7 +138,7 @@ define <4 x i32> @umax_vec2(<4 x i32> %x) {
define <4 x i32> @umin_vec1(<4 x i32> %x) {
; CHECK-LABEL: umin_vec1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%cmp = icmp slt <4 x i32> %x, zeroinitializer
@@ -148,7 +148,7 @@ define <4 x i32> @umin_vec1(<4 x i32> %x) {
define <4 x i32> @umin_vec2(<4 x i32> %x) {
; CHECK-LABEL: umin_vec2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
%cmp = icmp sgt <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -163,7 +163,7 @@ define <4 x i32> @umin_vec2(<4 x i32> %x) {
define <4 x i32> @clamp_signed1(<4 x i32> %x) {
; CHECK-LABEL: clamp_signed1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -178,7 +178,7 @@ define <4 x i32> @clamp_signed1(<4 x i32> %x) {
define <4 x i32> @clamp_signed2(<4 x i32> %x) {
; CHECK-LABEL: clamp_signed2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxsd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpminsd {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -193,7 +193,7 @@ define <4 x i32> @clamp_signed2(<4 x i32> %x) {
define <4 x i32> @clamp_unsigned1(<4 x i32> %x) {
; CHECK-LABEL: clamp_unsigned1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -208,7 +208,7 @@ define <4 x i32> @clamp_unsigned1(<4 x i32> %x) {
define <4 x i32> @clamp_unsigned2(<4 x i32> %x) {
; CHECK-LABEL: clamp_unsigned2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: retq
@@ -219,3 +219,33 @@ define <4 x i32> @clamp_unsigned2(<4 x i32> %x) {
ret <4 x i32> %r
}
+define <4 x i32> @wrong_pred_for_smin_with_not(<4 x i32> %x) {
+; CHECK-LABEL: wrong_pred_for_smin_with_not:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; CHECK-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4294967291,4294967291,4294967291,4294967291]
+; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %not_x = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %cmp = icmp ugt <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+ %sel = select <4 x i1> %cmp, <4 x i32> %not_x, <4 x i32> <i32 -5, i32 -5, i32 -5, i32 -5>
+ ret <4 x i32> %sel
+}
+
+define <4 x i32> @wrong_pred_for_smin_with_subnsw(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: wrong_pred_for_smin_with_subnsw:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %sub = sub nsw <4 x i32> %x, %y
+ %cmp = icmp ugt <4 x i32> %x, %y
+ %sel = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %sub
+ ret <4 x i32> %sel
+}
+
diff --git a/test/CodeGen/X86/vec_minmax_sint.ll b/test/CodeGen/X86/vec_minmax_sint.ll
index 5999116deb9c..df1699aa00d4 100644
--- a/test/CodeGen/X86/vec_minmax_sint.ll
+++ b/test/CodeGen/X86/vec_minmax_sint.ll
@@ -13,7 +13,7 @@
define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: max_gt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -32,7 +32,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
@@ -51,18 +51,33 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: max_gt_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: max_gt_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: max_gt_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: max_gt_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp sgt <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -70,7 +85,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: max_gt_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -104,7 +119,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm3, %xmm5
@@ -138,7 +153,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm1, %xmm5
; SSE42-NEXT: pcmpgtq %xmm3, %xmm5
@@ -151,7 +166,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -161,15 +176,17 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp sgt <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -178,7 +195,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: max_gt_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -188,17 +205,17 @@ define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_gt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <4 x i32> %a, %b
@@ -208,7 +225,7 @@ define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_gt_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -224,19 +241,19 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
; SSE41-NEXT: pmaxsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm2, %xmm0
; SSE42-NEXT: pmaxsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
@@ -245,12 +262,12 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <8 x i32> %a, %b
@@ -260,12 +277,12 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: max_gt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <8 x i16> %a, %b
@@ -275,13 +292,13 @@ define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: max_gt_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: max_gt_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
@@ -290,12 +307,12 @@ define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <16 x i16> %a, %b
@@ -305,7 +322,7 @@ define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @max_gt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: max_gt_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -315,17 +332,17 @@ define <16 x i8> @max_gt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v16i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_gt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <16 x i8> %a, %b
@@ -335,7 +352,7 @@ define <16 x i8> @max_gt_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: max_gt_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -351,19 +368,19 @@ define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v32i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm2, %xmm0
; SSE41-NEXT: pmaxsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v32i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm2, %xmm0
; SSE42-NEXT: pmaxsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
@@ -372,12 +389,12 @@ define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <32 x i8> %a, %b
@@ -391,7 +408,7 @@ define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: max_ge_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -413,7 +430,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -434,7 +451,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pcmpgtq %xmm2, %xmm3
@@ -444,13 +461,30 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: max_ge_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: max_ge_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: max_ge_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: max_ge_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp sge <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -458,7 +492,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: max_ge_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -498,7 +532,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm1, %xmm5
@@ -535,7 +569,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm3, %xmm5
; SSE42-NEXT: pcmpgtq %xmm1, %xmm5
@@ -552,7 +586,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -565,7 +599,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
@@ -573,11 +607,11 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp sge <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -586,7 +620,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: max_ge_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
@@ -598,17 +632,17 @@ define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_ge_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sge <4 x i32> %a, %b
@@ -618,7 +652,7 @@ define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_ge_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
@@ -638,19 +672,19 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
; SSE41-NEXT: pmaxsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsd %xmm2, %xmm0
; SSE42-NEXT: pmaxsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
@@ -659,12 +693,12 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sge <8 x i32> %a, %b
@@ -674,12 +708,12 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: max_ge_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sge <8 x i16> %a, %b
@@ -689,13 +723,13 @@ define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: max_ge_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: max_ge_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
@@ -704,12 +738,12 @@ define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sge <16 x i16> %a, %b
@@ -719,7 +753,7 @@ define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @max_ge_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: max_ge_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
@@ -731,17 +765,17 @@ define <16 x i8> @max_ge_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v16i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_ge_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sge <16 x i8> %a, %b
@@ -751,7 +785,7 @@ define <16 x i8> @max_ge_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: max_ge_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pcmpgtb %xmm1, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
@@ -771,19 +805,19 @@ define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v32i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxsb %xmm2, %xmm0
; SSE41-NEXT: pmaxsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v32i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxsb %xmm2, %xmm0
; SSE42-NEXT: pmaxsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
@@ -792,12 +826,12 @@ define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sge <32 x i8> %a, %b
@@ -811,7 +845,7 @@ define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: min_lt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -830,7 +864,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -849,7 +883,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
@@ -857,11 +891,26 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: min_lt_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: min_lt_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: min_lt_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: min_lt_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp slt <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -869,7 +918,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: min_lt_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -903,7 +952,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm1, %xmm5
@@ -937,7 +986,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm3, %xmm5
; SSE42-NEXT: pcmpgtq %xmm1, %xmm5
@@ -951,7 +1000,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -961,15 +1010,17 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp slt <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -978,7 +1029,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: min_lt_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -987,17 +1038,17 @@ define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_lt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp slt <4 x i32> %a, %b
@@ -1007,7 +1058,7 @@ define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: min_lt_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -1021,19 +1072,19 @@ define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm2, %xmm0
; SSE41-NEXT: pminsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm2, %xmm0
; SSE42-NEXT: pminsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
@@ -1042,12 +1093,12 @@ define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <8 x i32> %a, %b
@@ -1057,12 +1108,12 @@ define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: min_lt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp slt <8 x i16> %a, %b
@@ -1072,13 +1123,13 @@ define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: min_lt_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: min_lt_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
@@ -1087,12 +1138,12 @@ define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <16 x i16> %a, %b
@@ -1102,7 +1153,7 @@ define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @min_lt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: min_lt_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1111,17 +1162,17 @@ define <16 x i8> @min_lt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v16i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_lt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp slt <16 x i8> %a, %b
@@ -1131,7 +1182,7 @@ define <16 x i8> @min_lt_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: min_lt_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtb %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -1145,19 +1196,19 @@ define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v32i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm2, %xmm0
; SSE41-NEXT: pminsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v32i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm2, %xmm0
; SSE42-NEXT: pminsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
@@ -1166,12 +1217,12 @@ define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp slt <32 x i8> %a, %b
@@ -1185,7 +1236,7 @@ define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: min_le_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -1207,7 +1258,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
@@ -1228,7 +1279,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: pcmpeqd %xmm3, %xmm3
@@ -1237,13 +1288,30 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: min_le_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: min_le_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: min_le_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: min_le_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp sle <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -1251,7 +1319,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: min_le_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -1291,7 +1359,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,0,2147483648,0]
; SSE41-NEXT: movdqa %xmm3, %xmm5
@@ -1328,7 +1396,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa %xmm1, %xmm5
; SSE42-NEXT: pcmpgtq %xmm3, %xmm5
@@ -1344,7 +1412,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -1357,7 +1425,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
@@ -1365,11 +1433,11 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpminsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp sle <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -1378,7 +1446,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: min_le_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
@@ -1390,17 +1458,17 @@ define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_le_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sle <4 x i32> %a, %b
@@ -1410,7 +1478,7 @@ define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: min_le_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
@@ -1430,19 +1498,19 @@ define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsd %xmm2, %xmm0
; SSE41-NEXT: pminsd %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsd %xmm2, %xmm0
; SSE42-NEXT: pminsd %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
@@ -1451,12 +1519,12 @@ define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sle <8 x i32> %a, %b
@@ -1466,12 +1534,12 @@ define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: min_le_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sle <8 x i16> %a, %b
@@ -1481,13 +1549,13 @@ define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: min_le_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: min_le_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
@@ -1496,12 +1564,12 @@ define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sle <16 x i16> %a, %b
@@ -1511,7 +1579,7 @@ define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @min_le_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: min_le_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
@@ -1523,17 +1591,17 @@ define <16 x i8> @min_le_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v16i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_le_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sle <16 x i8> %a, %b
@@ -1543,7 +1611,7 @@ define <16 x i8> @min_le_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: min_le_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pcmpgtb %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
@@ -1563,19 +1631,19 @@ define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v32i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminsb %xmm2, %xmm0
; SSE41-NEXT: pminsb %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v32i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminsb %xmm2, %xmm0
; SSE42-NEXT: pminsb %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
@@ -1584,12 +1652,12 @@ define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sle <32 x i8> %a, %b
@@ -1603,12 +1671,12 @@ define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @max_gt_v2i64c() {
; SSE-LABEL: max_gt_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -1620,13 +1688,13 @@ define <2 x i64> @max_gt_v2i64c() {
define <4 x i64> @max_gt_v4i64c() {
; SSE-LABEL: max_gt_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,7]
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -1638,12 +1706,12 @@ define <4 x i64> @max_gt_v4i64c() {
define <4 x i32> @max_gt_v4i32c() {
; SSE-LABEL: max_gt_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -1655,13 +1723,13 @@ define <4 x i32> @max_gt_v4i32c() {
define <8 x i32> @max_gt_v8i32c() {
; SSE-LABEL: max_gt_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967293,4294967293,4294967295]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -1673,12 +1741,12 @@ define <8 x i32> @max_gt_v8i32c() {
define <8 x i16> @max_gt_v8i16c() {
; SSE-LABEL: max_gt_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -1690,13 +1758,13 @@ define <8 x i16> @max_gt_v8i16c() {
define <16 x i16> @max_gt_v16i16c() {
; SSE-LABEL: max_gt_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65534,65533,65532,65533,65534,65535,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -1708,12 +1776,12 @@ define <16 x i16> @max_gt_v16i16c() {
define <16 x i8> @max_gt_v16i8c() {
; SSE-LABEL: max_gt_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
@@ -1725,12 +1793,12 @@ define <16 x i8> @max_gt_v16i8c() {
define <2 x i64> @max_ge_v2i64c() {
; SSE-LABEL: max_ge_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -1742,13 +1810,13 @@ define <2 x i64> @max_ge_v2i64c() {
define <4 x i64> @max_ge_v4i64c() {
; SSE-LABEL: max_ge_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,7]
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -1760,12 +1828,12 @@ define <4 x i64> @max_ge_v4i64c() {
define <4 x i32> @max_ge_v4i32c() {
; SSE-LABEL: max_ge_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -1777,13 +1845,13 @@ define <4 x i32> @max_ge_v4i32c() {
define <8 x i32> @max_ge_v8i32c() {
; SSE-LABEL: max_ge_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967293,4294967293,4294967295]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -1795,12 +1863,12 @@ define <8 x i32> @max_ge_v8i32c() {
define <8 x i16> @max_ge_v8i16c() {
; SSE-LABEL: max_ge_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -1812,13 +1880,13 @@ define <8 x i16> @max_ge_v8i16c() {
define <16 x i16> @max_ge_v16i16c() {
; SSE-LABEL: max_ge_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65534,65533,65532,65533,65534,65535,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -1830,12 +1898,12 @@ define <16 x i16> @max_ge_v16i16c() {
define <16 x i8> @max_ge_v16i8c() {
; SSE-LABEL: max_ge_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
@@ -1847,12 +1915,12 @@ define <16 x i8> @max_ge_v16i8c() {
define <2 x i64> @min_lt_v2i64c() {
; SSE-LABEL: min_lt_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -1864,13 +1932,13 @@ define <2 x i64> @min_lt_v2i64c() {
define <4 x i64> @min_lt_v4i64c() {
; SSE-LABEL: min_lt_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,18446744073709551609]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -1882,12 +1950,12 @@ define <4 x i64> @min_lt_v4i64c() {
define <4 x i32> @min_lt_v4i32c() {
; SSE-LABEL: min_lt_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -1899,13 +1967,13 @@ define <4 x i32> @min_lt_v4i32c() {
define <8 x i32> @min_lt_v8i32c() {
; SSE-LABEL: min_lt_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967291,4294967291,4294967289]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -1917,12 +1985,12 @@ define <8 x i32> @min_lt_v8i32c() {
define <8 x i16> @min_lt_v8i16c() {
; SSE-LABEL: min_lt_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -1934,13 +2002,13 @@ define <8 x i16> @min_lt_v8i16c() {
define <16 x i16> @min_lt_v16i16c() {
; SSE-LABEL: min_lt_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65530,65531,65532,65531,65530,65529,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -1952,12 +2020,12 @@ define <16 x i16> @min_lt_v16i16c() {
define <16 x i8> @min_lt_v16i8c() {
; SSE-LABEL: min_lt_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
@@ -1969,12 +2037,12 @@ define <16 x i8> @min_lt_v16i8c() {
define <2 x i64> @min_le_v2i64c() {
; SSE-LABEL: min_le_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -1986,13 +2054,13 @@ define <2 x i64> @min_le_v2i64c() {
define <4 x i64> @min_le_v4i64c() {
; SSE-LABEL: min_le_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,18446744073709551609]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -2004,12 +2072,12 @@ define <4 x i64> @min_le_v4i64c() {
define <4 x i32> @min_le_v4i32c() {
; SSE-LABEL: min_le_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -2021,13 +2089,13 @@ define <4 x i32> @min_le_v4i32c() {
define <8 x i32> @min_le_v8i32c() {
; SSE-LABEL: min_le_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967291,4294967291,4294967289]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -2039,12 +2107,12 @@ define <8 x i32> @min_le_v8i32c() {
define <8 x i16> @min_le_v8i16c() {
; SSE-LABEL: min_le_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -2056,13 +2124,13 @@ define <8 x i16> @min_le_v8i16c() {
define <16 x i16> @min_le_v16i16c() {
; SSE-LABEL: min_le_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65530,65531,65532,65531,65530,65529,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -2074,12 +2142,12 @@ define <16 x i16> @min_le_v16i16c() {
define <16 x i8> @min_le_v16i8c() {
; SSE-LABEL: min_le_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
diff --git a/test/CodeGen/X86/vec_minmax_uint.ll b/test/CodeGen/X86/vec_minmax_uint.ll
index ec5f83ea396c..294d10c1cee9 100644
--- a/test/CodeGen/X86/vec_minmax_uint.ll
+++ b/test/CodeGen/X86/vec_minmax_uint.ll
@@ -13,7 +13,7 @@
define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: max_gt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -32,7 +32,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
@@ -51,7 +51,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm3
@@ -62,14 +62,32 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: max_gt_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: max_gt_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: max_gt_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: max_gt_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ugt <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -77,7 +95,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: max_gt_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -111,7 +129,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm3, %xmm5
@@ -145,7 +163,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm3, %xmm6
@@ -165,7 +183,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -180,7 +198,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
@@ -189,12 +207,11 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm3
-; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp ugt <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -203,7 +220,7 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: max_gt_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -215,17 +232,17 @@ define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxud %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_gt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ugt <4 x i32> %a, %b
@@ -235,7 +252,7 @@ define <4 x i32> @max_gt_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_gt_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm6
@@ -256,19 +273,19 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxud %xmm2, %xmm0
; SSE41-NEXT: pmaxud %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxud %xmm2, %xmm0
; SSE42-NEXT: pmaxud %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
@@ -277,12 +294,12 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ugt <8 x i32> %a, %b
@@ -292,7 +309,7 @@ define <8 x i32> @max_gt_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: max_gt_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -304,17 +321,17 @@ define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxuw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxuw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_gt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ugt <8 x i16> %a, %b
@@ -324,7 +341,7 @@ define <8 x i16> @max_gt_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: max_gt_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm6
@@ -345,19 +362,19 @@ define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_gt_v16i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxuw %xmm2, %xmm0
; SSE41-NEXT: pmaxuw %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_gt_v16i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxuw %xmm2, %xmm0
; SSE42-NEXT: pmaxuw %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_gt_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
@@ -366,12 +383,12 @@ define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ugt <16 x i16> %a, %b
@@ -381,12 +398,12 @@ define <16 x i16> @max_gt_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @max_gt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: max_gt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ugt <16 x i8> %a, %b
@@ -396,13 +413,13 @@ define <16 x i8> @max_gt_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: max_gt_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxub %xmm2, %xmm0
; SSE-NEXT: pmaxub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: max_gt_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
@@ -411,12 +428,12 @@ define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_gt_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_gt_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ugt <32 x i8> %a, %b
@@ -430,7 +447,7 @@ define <32 x i8> @max_gt_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: max_ge_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -452,7 +469,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -473,7 +490,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: pxor %xmm3, %xmm0
@@ -485,16 +502,36 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: max_ge_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: max_ge_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: max_ge_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: max_ge_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp uge <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -502,7 +539,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: max_ge_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -542,7 +579,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm5
@@ -579,7 +616,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm6
@@ -602,7 +639,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -620,7 +657,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
@@ -631,14 +668,11 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm2
-; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp uge <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -647,7 +681,7 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: max_ge_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
@@ -662,17 +696,17 @@ define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxud %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxud %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_ge_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp uge <4 x i32> %a, %b
@@ -682,7 +716,7 @@ define <4 x i32> @max_ge_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: max_ge_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
@@ -708,19 +742,19 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxud %xmm2, %xmm0
; SSE41-NEXT: pmaxud %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxud %xmm2, %xmm0
; SSE42-NEXT: pmaxud %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
@@ -729,12 +763,12 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp uge <8 x i32> %a, %b
@@ -744,7 +778,7 @@ define <8 x i32> @max_ge_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: max_ge_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psubusw %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -755,17 +789,17 @@ define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxuw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxuw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: max_ge_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp uge <8 x i16> %a, %b
@@ -775,7 +809,7 @@ define <8 x i16> @max_ge_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: max_ge_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psubusw %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
@@ -792,19 +826,19 @@ define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: max_ge_v16i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxuw %xmm2, %xmm0
; SSE41-NEXT: pmaxuw %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: max_ge_v16i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmaxuw %xmm2, %xmm0
; SSE42-NEXT: pmaxuw %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: max_ge_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
@@ -813,12 +847,12 @@ define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp uge <16 x i16> %a, %b
@@ -828,12 +862,12 @@ define <16 x i16> @max_ge_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @max_ge_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: max_ge_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp uge <16 x i8> %a, %b
@@ -843,13 +877,13 @@ define <16 x i8> @max_ge_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: max_ge_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxub %xmm2, %xmm0
; SSE-NEXT: pmaxub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: max_ge_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
@@ -858,12 +892,12 @@ define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: max_ge_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: max_ge_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp uge <32 x i8> %a, %b
@@ -877,7 +911,7 @@ define <32 x i8> @max_ge_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: min_lt_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -896,7 +930,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -915,7 +949,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm2, %xmm3
@@ -926,14 +960,32 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: min_lt_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: min_lt_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: min_lt_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: min_lt_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ult <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -941,7 +993,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: min_lt_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -975,7 +1027,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm5
@@ -1009,7 +1061,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm6
@@ -1029,7 +1081,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -1044,7 +1096,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
@@ -1053,12 +1105,11 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm2
-; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp ult <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -1067,7 +1118,7 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: min_lt_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -1079,17 +1130,17 @@ define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminud %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminud %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_lt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ult <4 x i32> %a, %b
@@ -1099,7 +1150,7 @@ define <4 x i32> @min_lt_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: min_lt_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -1119,19 +1170,19 @@ define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminud %xmm2, %xmm0
; SSE42-NEXT: pminud %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
@@ -1140,12 +1191,12 @@ define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ult <8 x i32> %a, %b
@@ -1155,7 +1206,7 @@ define <8 x i32> @min_lt_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: min_lt_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -1167,17 +1218,17 @@ define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminuw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminuw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_lt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ult <8 x i16> %a, %b
@@ -1187,7 +1238,7 @@ define <8 x i16> @min_lt_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: min_lt_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -1207,19 +1258,19 @@ define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_lt_v16i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminuw %xmm2, %xmm0
; SSE41-NEXT: pminuw %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_lt_v16i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminuw %xmm2, %xmm0
; SSE42-NEXT: pminuw %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_lt_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
@@ -1228,12 +1279,12 @@ define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ult <16 x i16> %a, %b
@@ -1243,12 +1294,12 @@ define <16 x i16> @min_lt_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @min_lt_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: min_lt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ult <16 x i8> %a, %b
@@ -1258,13 +1309,13 @@ define <16 x i8> @min_lt_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: min_lt_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: pminub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: min_lt_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
@@ -1273,12 +1324,12 @@ define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_lt_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_lt_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ult <32 x i8> %a, %b
@@ -1292,7 +1343,7 @@ define <32 x i8> @min_lt_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: min_le_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -1314,7 +1365,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm3
@@ -1335,7 +1386,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm0
@@ -1348,16 +1399,36 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: min_le_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: min_le_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: min_le_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: min_le_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ule <2 x i64> %a, %b
%2 = select <2 x i1> %1, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %2
@@ -1365,7 +1436,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: min_le_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -1405,7 +1476,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm8
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm3, %xmm5
@@ -1442,7 +1513,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm4
; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm3, %xmm6
@@ -1465,7 +1536,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -1483,7 +1554,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
@@ -1494,14 +1565,11 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
-; AVX512-NEXT: vpxor %ymm2, %ymm1, %ymm3
-; AVX512-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX512-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
-; AVX512-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX512-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
%1 = icmp ule <4 x i64> %a, %b
%2 = select <4 x i1> %1, <4 x i64> %a, <4 x i64> %b
@@ -1510,7 +1578,7 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: min_le_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
@@ -1525,17 +1593,17 @@ define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminud %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v4i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminud %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_le_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ule <4 x i32> %a, %b
@@ -1545,7 +1613,7 @@ define <4 x i32> @min_le_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: min_le_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
@@ -1571,19 +1639,19 @@ define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v8i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: pminud %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v8i32:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminud %xmm2, %xmm0
; SSE42-NEXT: pminud %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
@@ -1592,12 +1660,12 @@ define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ule <8 x i32> %a, %b
@@ -1607,7 +1675,7 @@ define <8 x i32> @min_le_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: min_le_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubusw %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -1618,17 +1686,17 @@ define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminuw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminuw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: min_le_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ule <8 x i16> %a, %b
@@ -1638,7 +1706,7 @@ define <8 x i16> @min_le_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: min_le_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubusw %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm6
@@ -1657,19 +1725,19 @@ define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: min_le_v16i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminuw %xmm2, %xmm0
; SSE41-NEXT: pminuw %xmm3, %xmm1
; SSE41-NEXT: retq
;
; SSE42-LABEL: min_le_v16i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pminuw %xmm2, %xmm0
; SSE42-NEXT: pminuw %xmm3, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: min_le_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
@@ -1678,12 +1746,12 @@ define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ule <16 x i16> %a, %b
@@ -1693,12 +1761,12 @@ define <16 x i16> @min_le_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <16 x i8> @min_le_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: min_le_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp ule <16 x i8> %a, %b
@@ -1708,13 +1776,13 @@ define <16 x i8> @min_le_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: min_le_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: pminub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: min_le_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
@@ -1723,12 +1791,12 @@ define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: min_le_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: min_le_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp ule <32 x i8> %a, %b
@@ -1742,12 +1810,12 @@ define <32 x i8> @min_le_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <2 x i64> @max_gt_v2i64c() {
; SSE-LABEL: max_gt_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -1759,13 +1827,13 @@ define <2 x i64> @max_gt_v2i64c() {
define <4 x i64> @max_gt_v4i64c() {
; SSE-LABEL: max_gt_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,7]
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -1777,12 +1845,12 @@ define <4 x i64> @max_gt_v4i64c() {
define <4 x i32> @max_gt_v4i32c() {
; SSE-LABEL: max_gt_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -1794,13 +1862,13 @@ define <4 x i32> @max_gt_v4i32c() {
define <8 x i32> @max_gt_v8i32c() {
; SSE-LABEL: max_gt_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967293,4294967293,4294967295]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -1812,12 +1880,12 @@ define <8 x i32> @max_gt_v8i32c() {
define <8 x i16> @max_gt_v8i16c() {
; SSE-LABEL: max_gt_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -1829,13 +1897,13 @@ define <8 x i16> @max_gt_v8i16c() {
define <16 x i16> @max_gt_v16i16c() {
; SSE-LABEL: max_gt_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65534,65533,65532,65533,65534,65535,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -1847,12 +1915,12 @@ define <16 x i16> @max_gt_v16i16c() {
define <16 x i8> @max_gt_v16i8c() {
; SSE-LABEL: max_gt_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_gt_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
@@ -1864,12 +1932,12 @@ define <16 x i8> @max_gt_v16i8c() {
define <2 x i64> @max_ge_v2i64c() {
; SSE-LABEL: max_ge_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551615,7]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -1881,13 +1949,13 @@ define <2 x i64> @max_ge_v2i64c() {
define <4 x i64> @max_ge_v4i64c() {
; SSE-LABEL: max_ge_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,7]
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551615,18446744073709551615,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -1899,12 +1967,12 @@ define <4 x i64> @max_ge_v4i64c() {
define <4 x i32> @max_ge_v4i32c() {
; SSE-LABEL: max_ge_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,7,7]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -1916,13 +1984,13 @@ define <4 x i32> @max_ge_v4i32c() {
define <8 x i32> @max_ge_v8i32c() {
; SSE-LABEL: max_ge_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,4294967293,4294967293,4294967295]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967293,4294967293,4294967295,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -1934,12 +2002,12 @@ define <8 x i32> @max_ge_v8i32c() {
define <8 x i16> @max_ge_v8i16c() {
; SSE-LABEL: max_ge_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65535,65533,65533,65535,7,5,5,7]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -1951,13 +2019,13 @@ define <8 x i16> @max_ge_v8i16c() {
define <16 x i16> @max_ge_v16i16c() {
; SSE-LABEL: max_ge_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65535,65534,65533,65532,65533,65534,65535,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65534,65533,65532,65533,65534,65535,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -1969,12 +2037,12 @@ define <16 x i16> @max_ge_v16i16c() {
define <16 x i8> @max_ge_v16i8c() {
; SSE-LABEL: max_ge_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; SSE-NEXT: retq
;
; AVX-LABEL: max_ge_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [255,254,253,252,253,254,255,0,7,6,5,4,5,6,7,8]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
@@ -1986,12 +2054,12 @@ define <16 x i8> @max_ge_v16i8c() {
define <2 x i64> @min_lt_v2i64c() {
; SSE-LABEL: min_lt_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -2003,13 +2071,13 @@ define <2 x i64> @min_lt_v2i64c() {
define <4 x i64> @min_lt_v4i64c() {
; SSE-LABEL: min_lt_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,18446744073709551609]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -2021,12 +2089,12 @@ define <4 x i64> @min_lt_v4i64c() {
define <4 x i32> @min_lt_v4i32c() {
; SSE-LABEL: min_lt_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -2038,13 +2106,13 @@ define <4 x i32> @min_lt_v4i32c() {
define <8 x i32> @min_lt_v8i32c() {
; SSE-LABEL: min_lt_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967291,4294967291,4294967289]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -2056,12 +2124,12 @@ define <8 x i32> @min_lt_v8i32c() {
define <8 x i16> @min_lt_v8i16c() {
; SSE-LABEL: min_lt_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,65531,65531,65529,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -2073,13 +2141,13 @@ define <8 x i16> @min_lt_v8i16c() {
define <16 x i16> @min_lt_v16i16c() {
; SSE-LABEL: min_lt_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,65530,65531,65532,65531,65530,65529,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -2091,12 +2159,12 @@ define <16 x i16> @min_lt_v16i16c() {
define <16 x i8> @min_lt_v16i8c() {
; SSE-LABEL: min_lt_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_lt_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
@@ -2108,12 +2176,12 @@ define <16 x i8> @min_lt_v16i8c() {
define <2 x i64> @min_le_v2i64c() {
; SSE-LABEL: min_le_v2i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v2i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18446744073709551609,1]
; AVX-NEXT: retq
%1 = insertelement <2 x i64> <i64 -7, i64 7>, i64 -7, i32 0
@@ -2125,13 +2193,13 @@ define <2 x i64> @min_le_v2i64c() {
define <4 x i64> @min_le_v4i64c() {
; SSE-LABEL: min_le_v4i64c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551609,18446744073709551609]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v4i64c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18446744073709551609,18446744073709551609,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i64> <i64 -7, i64 -1, i64 1, i64 7>, i64 -7, i32 0
@@ -2143,12 +2211,12 @@ define <4 x i64> @min_le_v4i64c() {
define <4 x i32> @min_le_v4i32c() {
; SSE-LABEL: min_le_v4i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v4i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967289,4294967289,1,1]
; AVX-NEXT: retq
%1 = insertelement <4 x i32> <i32 -7, i32 -1, i32 1, i32 7>, i32 -7, i32 0
@@ -2160,13 +2228,13 @@ define <4 x i32> @min_le_v4i32c() {
define <8 x i32> @min_le_v8i32c() {
; SSE-LABEL: min_le_v8i32c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967289,4294967291,4294967291,4294967289]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i32c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [4294967289,4294967291,4294967291,4294967289,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i32> <i32 -7, i32 -5, i32 -3, i32 -1, i32 1, i32 3, i32 5, i32 7>, i32 -7, i32 0
@@ -2178,12 +2246,12 @@ define <8 x i32> @min_le_v8i32c() {
define <8 x i16> @min_le_v8i16c() {
; SSE-LABEL: min_le_v8i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v8i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [65529,65531,65531,65529,1,3,3,1]
; AVX-NEXT: retq
%1 = insertelement <8 x i16> <i16 -7, i16 -5, i16 -3, i16 -1, i16 1, i16 3, i16 5, i16 7>, i16 -7, i32 0
@@ -2195,13 +2263,13 @@ define <8 x i16> @min_le_v8i16c() {
define <16 x i16> @min_le_v16i16c() {
; SSE-LABEL: min_le_v16i16c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [65529,65530,65531,65532,65531,65530,65529,0]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i16c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65529,65530,65531,65532,65531,65530,65529,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i16> <i16 -7, i16 -6, i16 -5, i16 -4, i16 -3, i16 -2, i16 -1, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i16 -7, i32 0
@@ -2213,12 +2281,12 @@ define <16 x i16> @min_le_v16i16c() {
define <16 x i8> @min_le_v16i8c() {
; SSE-LABEL: min_le_v16i8c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: min_le_v16i8c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [249,250,251,252,251,250,249,0,1,2,3,4,3,2,1,0]
; AVX-NEXT: retq
%1 = insertelement <16 x i8> <i8 -7, i8 -6, i8 -5, i8 -4, i8 -3, i8 -2, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i8 -7, i32 0
diff --git a/test/CodeGen/X86/vec_partial.ll b/test/CodeGen/X86/vec_partial.ll
index ee15c2af6dd2..a9044c6ffb50 100644
--- a/test/CodeGen/X86/vec_partial.ll
+++ b/test/CodeGen/X86/vec_partial.ll
@@ -5,12 +5,12 @@
; PR11580
define <3 x float> @addf3(<3 x float> %x) {
; X86-LABEL: addf3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: addps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: addf3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: addps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
entry:
@@ -21,11 +21,11 @@ entry:
; PR11580
define <4 x float> @cvtf3_f4(<3 x float> %x) {
; X86-LABEL: cvtf3_f4:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: retl
;
; X64-LABEL: cvtf3_f4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%extractVec = shufflevector <3 x float> %x, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
@@ -35,11 +35,11 @@ entry:
; PR11580
define <3 x float> @cvtf4_f3(<4 x float> %x) {
; X86-LABEL: cvtf4_f3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: retl
;
; X64-LABEL: cvtf4_f3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: retq
entry:
%extractVec = shufflevector <4 x float> %x, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
diff --git a/test/CodeGen/X86/vec_reassociate.ll b/test/CodeGen/X86/vec_reassociate.ll
index 5234b0c8a77c..c8b61809d31a 100644
--- a/test/CodeGen/X86/vec_reassociate.ll
+++ b/test/CodeGen/X86/vec_reassociate.ll
@@ -4,12 +4,12 @@
define <4 x i32> @add_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: add_4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: paddd %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: add_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddd %xmm1, %xmm0
; X64-NEXT: retq
%1 = add <4 x i32> %a0, <i32 1, i32 -2, i32 3, i32 -4>
@@ -20,12 +20,12 @@ define <4 x i32> @add_4i32(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @add_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: add_4i32_commute:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: paddd %xmm1, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: add_4i32_commute:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: paddd %xmm1, %xmm0
; X64-NEXT: retq
%1 = add <4 x i32> <i32 1, i32 -2, i32 3, i32 -4>, %a0
@@ -36,13 +36,13 @@ define <4 x i32> @add_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: mul_4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pmulld %xmm1, %xmm0
; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -54,13 +54,13 @@ define <4 x i32> @mul_4i32(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: mul_4i32_commute:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pmulld %xmm1, %xmm0
; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: mul_4i32_commute:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmulld %xmm1, %xmm0
; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -72,13 +72,13 @@ define <4 x i32> @mul_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @and_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: and_4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: andps %xmm1, %xmm0
; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: and_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: andps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -90,13 +90,13 @@ define <4 x i32> @and_4i32(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @and_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: and_4i32_commute:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: andps %xmm1, %xmm0
; X86-NEXT: andps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: and_4i32_commute:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andps %xmm1, %xmm0
; X64-NEXT: andps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -108,13 +108,13 @@ define <4 x i32> @and_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @or_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: or_4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: orps %xmm1, %xmm0
; X86-NEXT: orps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: or_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: orps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -126,13 +126,13 @@ define <4 x i32> @or_4i32(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @or_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: or_4i32_commute:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: orps %xmm1, %xmm0
; X86-NEXT: orps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: or_4i32_commute:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: orps %xmm1, %xmm0
; X64-NEXT: orps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -144,13 +144,13 @@ define <4 x i32> @or_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @xor_4i32(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: xor_4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: xorps %xmm1, %xmm0
; X86-NEXT: xorps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: xor_4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: xorps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -162,13 +162,13 @@ define <4 x i32> @xor_4i32(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i32> @xor_4i32_commute(<4 x i32> %a0, <4 x i32> %a1) {
; X86-LABEL: xor_4i32_commute:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: xorps %xmm1, %xmm0
; X86-NEXT: xorps {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: xor_4i32_commute:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm1, %xmm0
; X64-NEXT: xorps {{.*}}(%rip), %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_return.ll b/test/CodeGen/X86/vec_return.ll
index 556e32d0c87b..f33b6a1c126a 100644
--- a/test/CodeGen/X86/vec_return.ll
+++ b/test/CodeGen/X86/vec_return.ll
@@ -4,7 +4,7 @@
; Without any typed operations, always use the smaller xorps.
define <2 x double> @test() {
; CHECK-LABEL: test:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retl
ret <2 x double> zeroinitializer
@@ -13,7 +13,7 @@ define <2 x double> @test() {
; Prefer a constant pool load here.
define <4 x i32> @test2() nounwind {
; CHECK-LABEL: test2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,0,1,0]
; CHECK-NEXT: retl
ret <4 x i32> < i32 0, i32 0, i32 1, i32 0 >
diff --git a/test/CodeGen/X86/vec_sdiv_to_shift.ll b/test/CodeGen/X86/vec_sdiv_to_shift.ll
index f0c9069d8c79..b8c3bfcd9960 100644
--- a/test/CodeGen/X86/vec_sdiv_to_shift.ll
+++ b/test/CodeGen/X86/vec_sdiv_to_shift.ll
@@ -5,7 +5,7 @@
define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) {
; SSE-LABEL: sdiv_vec8x16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
; SSE-NEXT: psrlw $11, %xmm1
@@ -15,7 +15,7 @@ define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) {
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_vec8x16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $11, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -28,7 +28,7 @@ entry:
define <8 x i16> @sdiv_vec8x16_minsize(<8 x i16> %var) minsize {
; SSE-LABEL: sdiv_vec8x16_minsize:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psraw $15, %xmm1
; SSE-NEXT: psrlw $11, %xmm1
@@ -38,7 +38,7 @@ define <8 x i16> @sdiv_vec8x16_minsize(<8 x i16> %var) minsize {
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_vec8x16_minsize:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $11, %xmm1, %xmm1
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -51,7 +51,7 @@ entry:
define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) {
; SSE-LABEL: sdiv_vec4x32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $28, %xmm1
@@ -61,7 +61,7 @@ define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) {
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_vec4x32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -74,7 +74,7 @@ ret <4 x i32> %0
define <4 x i32> @sdiv_negative(<4 x i32> %var) {
; SSE-LABEL: sdiv_negative:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $28, %xmm1
@@ -85,7 +85,7 @@ define <4 x i32> @sdiv_negative(<4 x i32> %var) {
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_negative:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -100,7 +100,7 @@ ret <4 x i32> %0
define <8 x i32> @sdiv8x32(<8 x i32> %var) {
; SSE-LABEL: sdiv8x32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: psrld $26, %xmm2
@@ -116,7 +116,7 @@ define <8 x i32> @sdiv8x32(<8 x i32> %var) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sdiv8x32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $26, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
@@ -130,7 +130,7 @@ define <8 x i32> @sdiv8x32(<8 x i32> %var) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sdiv8x32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $26, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
@@ -143,7 +143,7 @@ ret <8 x i32> %0
define <16 x i16> @sdiv16x16(<16 x i16> %var) {
; SSE-LABEL: sdiv16x16:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psraw $15, %xmm2
; SSE-NEXT: psrlw $14, %xmm2
@@ -159,7 +159,7 @@ define <16 x i16> @sdiv16x16(<16 x i16> %var) {
; SSE-NEXT: retq
;
; AVX1-LABEL: sdiv16x16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $14, %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
@@ -173,7 +173,7 @@ define <16 x i16> @sdiv16x16(<16 x i16> %var) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sdiv16x16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $14, %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
@@ -188,11 +188,11 @@ entry:
define <4 x i32> @sdiv_non_splat(<4 x i32> %x) {
; SSE-LABEL: sdiv_non_splat:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: sdiv_non_splat:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%y = sdiv <4 x i32> %x, <i32 2, i32 0, i32 0, i32 0>
ret <4 x i32> %y
diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll
index 51c8b2111107..058e924aa4fc 100644
--- a/test/CodeGen/X86/vec_set-2.ll
+++ b/test/CodeGen/X86/vec_set-2.ll
@@ -4,12 +4,12 @@
define <4 x float> @test1(float %a) nounwind {
; X86-LABEL: test1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: movaps %xmm1, %xmm0
@@ -23,12 +23,12 @@ define <4 x float> @test1(float %a) nounwind {
define <2 x i64> @test(i32 %a) nounwind {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: retq
%tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0
diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll
index b34f30924a8d..14f1587a6d46 100644
--- a/test/CodeGen/X86/vec_set-3.ll
+++ b/test/CodeGen/X86/vec_set-3.ll
@@ -4,12 +4,12 @@
define <4 x float> @test(float %a) {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; X64-NEXT: retq
%tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1
@@ -20,13 +20,13 @@ define <4 x float> @test(float %a) {
define <2 x i64> @test2(i32 %a) {
; X86-LABEL: test2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X86-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd %edi, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; X64-NEXT: retq
@@ -38,12 +38,12 @@ define <2 x i64> @test2(i32 %a) {
define <4 x float> @test3(<4 x float> %A) {
; X86-LABEL: test3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; X86-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; X64-NEXT: retq
%tmp0 = extractelement <4 x float> %A, i32 0
diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll
index 09142e16aa6e..d01a913ea8a4 100644
--- a/test/CodeGen/X86/vec_set-4.ll
+++ b/test/CodeGen/X86/vec_set-4.ll
@@ -4,13 +4,13 @@
define <2 x i64> @test(i16 %a) nounwind {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pxor %xmm0, %xmm0
; X86-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pinsrw $3, %edi, %xmm0
; X64-NEXT: retq
@@ -25,14 +25,14 @@ define <2 x i64> @test(i16 %a) nounwind {
define <2 x i64> @test2(i8 %a) nounwind {
; X86-LABEL: test2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: pxor %xmm0, %xmm0
; X86-NEXT: pinsrw $5, %eax, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movzbl %dil, %eax
; X64-NEXT: pxor %xmm0, %xmm0
; X64-NEXT: pinsrw $5, %eax, %xmm0
diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll
index 3c9aca3a02da..3f8997faf392 100644
--- a/test/CodeGen/X86/vec_set-6.ll
+++ b/test/CodeGen/X86/vec_set-6.ll
@@ -4,14 +4,14 @@
define <4 x float> @test(float %a, float %b, float %c) nounwind {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll
index 757a0d44cd43..fced7e4c0792 100644
--- a/test/CodeGen/X86/vec_set-7.ll
+++ b/test/CodeGen/X86/vec_set-7.ll
@@ -4,13 +4,13 @@
define <2 x i64> @test(<2 x i64>* %p) nounwind {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: retq
%tmp = bitcast <2 x i64>* %p to double*
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index a9dceb90855a..e8cded6b2168 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -4,12 +4,12 @@
define <2 x i64> @test(i64 %i) nounwind {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
%tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll
index 259ace98d362..9c0e9388b25c 100644
--- a/test/CodeGen/X86/vec_set-A.ll
+++ b/test/CodeGen/X86/vec_set-A.ll
@@ -4,13 +4,13 @@
define <2 x i64> @test1() nounwind {
; X86-LABEL: test1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $1, %eax
; X86-NEXT: movd %eax, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $1, %eax
; X64-NEXT: movq %rax, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll
index ecd9b57cfd0c..cd5ce9fbb07e 100644
--- a/test/CodeGen/X86/vec_set-B.ll
+++ b/test/CodeGen/X86/vec_set-B.ll
@@ -11,14 +11,14 @@
define <2 x i64> @test3(i64 %arg) nounwind {
; X86-LABEL: test3:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $1234567, %eax # imm = 0x12D687
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd %eax, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $1234567, %edi # imm = 0x12D687
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
@@ -29,14 +29,14 @@ define <2 x i64> @test3(i64 %arg) nounwind {
define <2 x i64> @test2(i64 %arg) nounwind {
; X86-LABEL: test2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl $1234567, %eax # imm = 0x12D687
; X86-NEXT: andl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd %eax, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: andl $1234567, %edi # imm = 0x12D687
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index 865e2fb83f17..877d99abbb95 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -4,12 +4,12 @@
define <2 x i64> @t1(i64 %x) nounwind {
; X86-LABEL: t1:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq %rdi, %xmm0
; X64-NEXT: retq
%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
diff --git a/test/CodeGen/X86/vec_set-D.ll b/test/CodeGen/X86/vec_set-D.ll
index 56499412d7d8..3dde040d9bad 100644
--- a/test/CodeGen/X86/vec_set-D.ll
+++ b/test/CodeGen/X86/vec_set-D.ll
@@ -3,7 +3,7 @@
define <4 x i32> @t(i32 %x, i32 %y) nounwind {
; CHECK-LABEL: t:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retl
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %x, i32 0
diff --git a/test/CodeGen/X86/vec_set-F.ll b/test/CodeGen/X86/vec_set-F.ll
index a5239914b441..75ec319c103d 100644
--- a/test/CodeGen/X86/vec_set-F.ll
+++ b/test/CodeGen/X86/vec_set-F.ll
@@ -3,7 +3,7 @@
define <2 x i64> @t1(<2 x i64>* %ptr) nounwind {
; CHECK-LABEL: t1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retl
@@ -16,7 +16,7 @@ define <2 x i64> @t1(<2 x i64>* %ptr) nounwind {
define <2 x i64> @t2(i64 %x) nounwind {
; CHECK-LABEL: t2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: retl
%tmp717 = bitcast i64 %x to double
diff --git a/test/CodeGen/X86/vec_set-H.ll b/test/CodeGen/X86/vec_set-H.ll
index af8ac70c5b3d..03324f02a4fe 100644
--- a/test/CodeGen/X86/vec_set-H.ll
+++ b/test/CodeGen/X86/vec_set-H.ll
@@ -3,7 +3,7 @@
define <2 x i64> @doload64(i16 signext %x) nounwind {
; CHECK-LABEL: doload64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll
index 918430efea1d..5e1f09c9bae6 100644
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -4,7 +4,7 @@
define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
; X86-LABEL: test:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -25,7 +25,7 @@ define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i1
; X86-NEXT: retl
;
; X64-LABEL: test:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index 1eef0be2dbbb..e9494d845b74 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -5,13 +5,13 @@
define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
; SSE-LABEL: v16i8_icmp_uge:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmaxub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v16i8_icmp_uge:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -22,13 +22,13 @@ define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone s
define <16 x i8> @v16i8_icmp_ule(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
; SSE-LABEL: v16i8_icmp_ule:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pminub %xmm0, %xmm1
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v16i8_icmp_ule:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -39,20 +39,20 @@ define <16 x i8> @v16i8_icmp_ule(<16 x i8> %a, <16 x i8> %b) nounwind readnone s
define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
; SSE2-LABEL: v8i16_icmp_uge:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psubusw %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: v8i16_icmp_uge:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxuw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: v8i16_icmp_uge:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -63,20 +63,20 @@ define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone ssp uwtable {
; SSE2-LABEL: v8i16_icmp_ule:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psubusw %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpeqw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: v8i16_icmp_ule:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminuw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: v8i16_icmp_ule:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -87,7 +87,7 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
define <4 x i32> @v4i32_icmp_uge(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
; SSE2-LABEL: v4i32_icmp_uge:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
@@ -97,13 +97,13 @@ define <4 x i32> @v4i32_icmp_uge(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
; SSE2-NEXT: retq
;
; SSE41-LABEL: v4i32_icmp_uge:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmaxud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: v4i32_icmp_uge:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -114,7 +114,7 @@ define <4 x i32> @v4i32_icmp_uge(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone ssp uwtable {
; SSE2-LABEL: v4i32_icmp_ule:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -124,13 +124,13 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
; SSE2-NEXT: retq
;
; SSE41-LABEL: v4i32_icmp_ule:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pminud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: v4i32_icmp_ule:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -144,12 +144,12 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
; should set all bits to 1.
define <16 x i8> @test_setcc_constfold_vi8(<16 x i8> %l, <16 x i8> %r) {
; SSE-LABEL: test_setcc_constfold_vi8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_setcc_constfold_vi8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%test1 = icmp eq <16 x i8> %l, %r
@@ -163,12 +163,12 @@ define <16 x i8> @test_setcc_constfold_vi8(<16 x i8> %l, <16 x i8> %r) {
; Make sure sensible results come from doing extension afterwards
define <16 x i8> @test_setcc_constfold_vi1(<16 x i8> %l, <16 x i8> %r) {
; SSE-LABEL: test_setcc_constfold_vi1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_setcc_constfold_vi1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%test1 = icmp eq <16 x i8> %l, %r
@@ -182,12 +182,12 @@ define <16 x i8> @test_setcc_constfold_vi1(<16 x i8> %l, <16 x i8> %r) {
; just 32-bits wide.
define <2 x i64> @test_setcc_constfold_vi64(<2 x i64> %l, <2 x i64> %r) {
; SSE-LABEL: test_setcc_constfold_vi64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_setcc_constfold_vi64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%test1 = icmp eq <2 x i64> %l, %r
diff --git a/test/CodeGen/X86/vec_shift.ll b/test/CodeGen/X86/vec_shift.ll
index 55b55936634d..66cf8a9c3dc6 100644
--- a/test/CodeGen/X86/vec_shift.ll
+++ b/test/CodeGen/X86/vec_shift.ll
@@ -4,12 +4,12 @@
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: psllw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllw %xmm1, %xmm0
; X64-NEXT: retq
entry:
@@ -22,13 +22,13 @@ entry:
define <2 x i64> @t3(<2 x i64> %b1, i32 %c) nounwind {
; X32-LABEL: t3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: psraw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: psraw %xmm1, %xmm0
; X64-NEXT: retq
@@ -45,12 +45,12 @@ declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: psrlq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrlq %xmm1, %xmm0
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vec_shift2.ll b/test/CodeGen/X86/vec_shift2.ll
index 21d599fead08..7a1ade72a385 100644
--- a/test/CodeGen/X86/vec_shift2.ll
+++ b/test/CodeGen/X86/vec_shift2.ll
@@ -4,14 +4,14 @@
define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $14, %eax
; X32-NEXT: movd %eax, %xmm1
; X32-NEXT: psrlw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $14, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: psrlw %xmm1, %xmm0
@@ -24,14 +24,14 @@ define <2 x i64> @t1(<2 x i64> %b1, <2 x i64> %c) nounwind {
define <4 x i32> @t2(<2 x i64> %b1, <2 x i64> %c) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $14, %eax
; X32-NEXT: movd %eax, %xmm1
; X32-NEXT: pslld %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movl $14, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: pslld %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vec_shift3.ll b/test/CodeGen/X86/vec_shift3.ll
index 071f0d38b96d..b5fc1fafb61f 100644
--- a/test/CodeGen/X86/vec_shift3.ll
+++ b/test/CodeGen/X86/vec_shift3.ll
@@ -4,13 +4,13 @@
define <2 x i64> @t1(<2 x i64> %x1, i32 %bits) nounwind {
; X32-LABEL: t1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: psllq %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: psllq %xmm1, %xmm0
; X64-NEXT: retq
@@ -21,12 +21,12 @@ entry:
define <2 x i64> @t2(<2 x i64> %x1) nounwind {
; X32-LABEL: t2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: psllq $10, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllq $10, %xmm0
; X64-NEXT: retq
entry:
@@ -36,13 +36,13 @@ entry:
define <2 x i64> @t3(<2 x i64> %x1, i32 %bits) nounwind {
; X32-LABEL: t3:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: psraw %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: t3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %edi, %xmm1
; X64-NEXT: psraw %xmm1, %xmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index bef2438aecd1..04b4cb658f15 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -4,7 +4,7 @@
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
; X32-LABEL: shl1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pslld $23, %xmm1
; X32-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-NEXT: cvttps2dq %xmm1, %xmm1
@@ -12,7 +12,7 @@ define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: shl1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pslld $23, %xmm1
; X64-NEXT: paddd {{.*}}(%rip), %xmm1
; X64-NEXT: cvttps2dq %xmm1, %xmm1
@@ -32,7 +32,7 @@ entry:
define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
; X32-LABEL: shl2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm2
; X32-NEXT: psllw $5, %xmm1
; X32-NEXT: movdqa %xmm2, %xmm3
@@ -55,7 +55,7 @@ define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
; X32-NEXT: retl
;
; X64-LABEL: shl2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: psllw $5, %xmm1
; X64-NEXT: movdqa %xmm2, %xmm3
diff --git a/test/CodeGen/X86/vec_shift5.ll b/test/CodeGen/X86/vec_shift5.ll
index c0226d0a4c09..873de4b08349 100644
--- a/test/CodeGen/X86/vec_shift5.ll
+++ b/test/CodeGen/X86/vec_shift5.ll
@@ -8,12 +8,12 @@
define <8 x i16> @test1() {
; X32-LABEL: test1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
@@ -22,12 +22,12 @@ define <8 x i16> @test1() {
define <8 x i16> @test2() {
; X32-LABEL: test2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
@@ -36,12 +36,12 @@ define <8 x i16> @test2() {
define <8 x i16> @test3() {
; X32-LABEL: test3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
@@ -50,12 +50,12 @@ define <8 x i16> @test3() {
define <4 x i32> @test4() {
; X32-LABEL: test4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16,32,64]
; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
@@ -64,12 +64,12 @@ define <4 x i32> @test4() {
define <4 x i32> @test5() {
; X32-LABEL: test5:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
; X32-NEXT: retl
;
; X64-LABEL: test5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
@@ -78,12 +78,12 @@ define <4 x i32> @test5() {
define <4 x i32> @test6() {
; X32-LABEL: test6:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
; X32-NEXT: retl
;
; X64-LABEL: test6:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [0,1,2,4]
; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
@@ -92,12 +92,12 @@ define <4 x i32> @test6() {
define <2 x i64> @test7() {
; X32-LABEL: test7:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0]
; X32-NEXT: retl
;
; X64-LABEL: test7:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [8,16]
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
@@ -106,12 +106,12 @@ define <2 x i64> @test7() {
define <2 x i64> @test8() {
; X32-LABEL: test8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = [1,0,2,0]
; X32-NEXT: retl
;
; X64-LABEL: test8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [1,2]
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
@@ -120,12 +120,12 @@ define <2 x i64> @test8() {
define <8 x i16> @test9() {
; X32-LABEL: test9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
; X32-NEXT: retl
;
; X64-LABEL: test9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
@@ -134,12 +134,12 @@ define <8 x i16> @test9() {
define <4 x i32> @test10() {
; X32-LABEL: test10:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
; X32-NEXT: retl
;
; X64-LABEL: test10:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
@@ -148,12 +148,12 @@ define <4 x i32> @test10() {
define <2 x i64> @test11() {
; X32-LABEL: test11:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <u,u,3,0>
; X32-NEXT: retl
;
; X64-LABEL: test11:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <u,3>
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
@@ -162,12 +162,12 @@ define <2 x i64> @test11() {
define <8 x i16> @test12() {
; X32-LABEL: test12:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
; X32-NEXT: retl
;
; X64-LABEL: test12:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
@@ -176,12 +176,12 @@ define <8 x i16> @test12() {
define <4 x i32> @test13() {
; X32-LABEL: test13:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
; X32-NEXT: retl
;
; X64-LABEL: test13:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <u,1,u,4>
; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
@@ -190,12 +190,12 @@ define <4 x i32> @test13() {
define <8 x i16> @test14() {
; X32-LABEL: test14:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
; X32-NEXT: retl
;
; X64-LABEL: test14:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <1,1,u,u,3,u,8,16>
; X64-NEXT: retq
%1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
@@ -204,12 +204,12 @@ define <8 x i16> @test14() {
define <4 x i32> @test15() {
; X32-LABEL: test15:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <u,64,u,256>
; X32-NEXT: retl
;
; X64-LABEL: test15:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <u,64,u,256>
; X64-NEXT: retq
%1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
@@ -218,12 +218,12 @@ define <4 x i32> @test15() {
define <2 x i64> @test16() {
; X32-LABEL: test16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movaps {{.*#+}} xmm0 = <u,u,248,0>
; X32-NEXT: retl
;
; X64-LABEL: test16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = <u,248>
; X64-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
index 731760a4ea55..db8ef0b213c6 100644
--- a/test/CodeGen/X86/vec_shift6.ll
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -9,17 +9,17 @@
define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: test1:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test1:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
@@ -28,17 +28,17 @@ define <8 x i16> @test1(<8 x i16> %a) {
define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: test2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test2:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
@@ -51,17 +51,17 @@ define <8 x i16> @test2(<8 x i16> %a) {
define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: test3:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test3:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
@@ -70,17 +70,17 @@ define <4 x i32> @test3(<4 x i32> %a) {
define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX2-LABEL: test4:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test4:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
@@ -93,19 +93,19 @@ define <4 x i32> @test4(<4 x i32> %a) {
define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX2-LABEL: test5:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test5:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
%shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
@@ -118,19 +118,19 @@ define <16 x i16> @test5(<16 x i16> %a) {
define <8 x i32> @test6(<8 x i32> %a) {
; SSE-LABEL: test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE-NEXT: pmulld %xmm2, %xmm0
; SSE-NEXT: pmulld %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX2-LABEL: test6:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test6:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
%shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
@@ -143,7 +143,7 @@ define <8 x i32> @test6(<8 x i32> %a) {
define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT: pmullw %xmm4, %xmm0
; SSE-NEXT: pmullw %xmm4, %xmm1
@@ -152,7 +152,7 @@ define <32 x i16> @test7(<32 x i16> %a) {
; SSE-NEXT: retq
;
; AVX2-LABEL: test7:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -160,7 +160,7 @@ define <32 x i16> @test7(<32 x i16> %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test7:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
@@ -175,7 +175,7 @@ define <32 x i16> @test7(<32 x i16> %a) {
define <16 x i32> @test8(<16 x i32> %a) {
; SSE-LABEL: test8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE-NEXT: pmulld %xmm4, %xmm0
; SSE-NEXT: pmulld %xmm4, %xmm1
@@ -184,7 +184,7 @@ define <16 x i32> @test8(<16 x i32> %a) {
; SSE-NEXT: retq
;
; AVX2-LABEL: test8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0
@@ -192,7 +192,7 @@ define <16 x i32> @test8(<16 x i32> %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
@@ -203,7 +203,7 @@ define <16 x i32> @test8(<16 x i32> %a) {
define <8 x i64> @test9(<8 x i64> %a) {
; SSE-LABEL: test9:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psllq $3, %xmm4
; SSE-NEXT: psllq $2, %xmm1
@@ -217,14 +217,14 @@ define <8 x i64> @test9(<8 x i64> %a) {
; SSE-NEXT: retq
;
; AVX2-LABEL: test9:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT: vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test9:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
diff --git a/test/CodeGen/X86/vec_shift7.ll b/test/CodeGen/X86/vec_shift7.ll
index c13299b9cb38..1624ae7346ce 100644
--- a/test/CodeGen/X86/vec_shift7.ll
+++ b/test/CodeGen/X86/vec_shift7.ll
@@ -6,7 +6,7 @@
define i64 @test1(<2 x i64> %a) {
; X32-LABEL: test1:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psllq $2, %xmm1
; X32-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -16,7 +16,7 @@ define i64 @test1(<2 x i64> %a) {
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %xmm0, %rax
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vec_split.ll b/test/CodeGen/X86/vec_split.ll
index 1df4cf2b2325..9ce9b5b15c09 100644
--- a/test/CodeGen/X86/vec_split.ll
+++ b/test/CodeGen/X86/vec_split.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=x86-64 -mattr=sse4.1 < %s | FileCheck %s -check-prefix=SSE4
-; RUN: llc -march=x86-64 -mattr=avx < %s | FileCheck %s -check-prefix=AVX1
-; RUN: llc -march=x86-64 -mattr=avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-- -mattr=sse4.1 < %s | FileCheck %s -check-prefix=SSE4
+; RUN: llc -mtriple=x86_64-- -mattr=avx < %s | FileCheck %s -check-prefix=AVX1
+; RUN: llc -mtriple=x86_64-- -mattr=avx2 < %s | FileCheck %s -check-prefix=AVX2
define <16 x i16> @split16(<16 x i16> %a, <16 x i16> %b, <16 x i8> %__mask) {
; SSE4-LABEL: split16:
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index a74a4ed36d70..87634a9c708a 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -8,7 +8,7 @@
define i16 @test1(float %f) nounwind {
; X32-LABEL: test1:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: addss LCPI0_0, %xmm0
; X32-NEXT: mulss LCPI0_1, %xmm0
@@ -17,11 +17,11 @@ define i16 @test1(float %f) nounwind {
; X32-NEXT: minss LCPI0_2, %xmm0
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: cvttss2si %xmm0, %eax
-; X32-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: ## kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
@@ -29,57 +29,57 @@ define i16 @test1(float %f) nounwind {
; X64-NEXT: minss {{.*}}(%rip), %xmm0
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
-; X64-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: ## kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
;
; X32_AVX1-LABEL: test1:
-; X32_AVX1: ## BB#0:
+; X32_AVX1: ## %bb.0:
; X32_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX1-NEXT: vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX1-NEXT: vmulss LCPI0_1, %xmm0, %xmm0
+; X32_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX1-NEXT: vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX1-NEXT: vcvttss2si %xmm0, %eax
-; X32_AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X32_AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
; X32_AVX1-NEXT: retl
;
; X64_AVX1-LABEL: test1:
-; X64_AVX1: ## BB#0:
-; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64_AVX1: ## %bb.0:
; X64_AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX1-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX1-NEXT: vcvttss2si %xmm0, %eax
-; X64_AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64_AVX1-NEXT: ## kill: def %ax killed %ax killed %eax
; X64_AVX1-NEXT: retq
;
; X32_AVX512-LABEL: test1:
-; X32_AVX512: ## BB#0:
+; X32_AVX512: ## %bb.0:
; X32_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX512-NEXT: vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX512-NEXT: vmulss LCPI0_1, %xmm0, %xmm0
+; X32_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX512-NEXT: vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX512-NEXT: vcvttss2si %xmm0, %eax
-; X32_AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X32_AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: test1:
-; X64_AVX512: ## BB#0:
-; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64_AVX512: ## %bb.0:
; X64_AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX512-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX512-NEXT: vcvttss2si %xmm0, %eax
-; X64_AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64_AVX512-NEXT: ## kill: def %ax killed %ax killed %eax
; X64_AVX512-NEXT: retq
%tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
%tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
@@ -96,7 +96,7 @@ define i16 @test1(float %f) nounwind {
define i16 @test2(float %f) nounwind {
; X32-LABEL: test2:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: addss LCPI1_0, %xmm0
; X32-NEXT: mulss LCPI1_1, %xmm0
@@ -104,22 +104,22 @@ define i16 @test2(float %f) nounwind {
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: cvttss2si %xmm0, %eax
-; X32-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X32-NEXT: ## kill: def %ax killed %ax killed %eax
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
; X64-NEXT: minss {{.*}}(%rip), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
-; X64-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT: ## kill: def %ax killed %ax killed %eax
; X64-NEXT: retq
;
; X32_AVX-LABEL: test2:
-; X32_AVX: ## BB#0:
+; X32_AVX: ## %bb.0:
; X32_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT: vaddss LCPI1_0, %xmm0, %xmm0
; X32_AVX-NEXT: vmulss LCPI1_1, %xmm0, %xmm0
@@ -127,18 +127,18 @@ define i16 @test2(float %f) nounwind {
; X32_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT: vcvttss2si %xmm0, %eax
-; X32_AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X32_AVX-NEXT: ## kill: def %ax killed %ax killed %eax
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: test2:
-; X64_AVX: ## BB#0:
+; X64_AVX: ## %bb.0:
; X64_AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT: vcvttss2si %xmm0, %eax
-; X64_AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64_AVX-NEXT: ## kill: def %ax killed %ax killed %eax
; X64_AVX-NEXT: retq
%tmp28 = fsub float %f, 1.000000e+00 ; <float> [#uses=1]
%tmp37 = fmul float %tmp28, 5.000000e-01 ; <float> [#uses=1]
@@ -166,26 +166,37 @@ declare <4 x float> @f()
define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-LABEL: test3:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: roundss $4, (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: roundss $4, (%rdi), %xmm0
; X64-NEXT: retq
;
-; X32_AVX-LABEL: test3:
-; X32_AVX: ## BB#0:
-; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32_AVX-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
-; X32_AVX-NEXT: retl
+; X32_AVX1-LABEL: test3:
+; X32_AVX1: ## %bb.0:
+; X32_AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32_AVX1-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
+; X32_AVX1-NEXT: retl
;
-; X64_AVX-LABEL: test3:
-; X64_AVX: ## BB#0:
-; X64_AVX-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
-; X64_AVX-NEXT: retq
+; X64_AVX1-LABEL: test3:
+; X64_AVX1: ## %bb.0:
+; X64_AVX1-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
+; X64_AVX1-NEXT: retq
+;
+; X32_AVX512-LABEL: test3:
+; X32_AVX512: ## %bb.0:
+; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32_AVX512-NEXT: vrndscaless $4, (%eax), %xmm0, %xmm0
+; X32_AVX512-NEXT: retl
+;
+; X64_AVX512-LABEL: test3:
+; X64_AVX512: ## %bb.0:
+; X64_AVX512-NEXT: vrndscaless $4, (%rdi), %xmm0, %xmm0
+; X64_AVX512-NEXT: retq
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
@@ -194,7 +205,7 @@ define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-LABEL: test4:
-; X32: ## BB#0:
+; X32: ## %bb.0:
; X32-NEXT: subl $28, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -205,7 +216,7 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: ## BB#0:
+; X64: ## %bb.0:
; X64-NEXT: subq $24, %rsp
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
@@ -214,26 +225,49 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
-; X32_AVX-LABEL: test4:
-; X32_AVX: ## BB#0:
-; X32_AVX-NEXT: subl $28, %esp
-; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32_AVX-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
-; X32_AVX-NEXT: calll _f
-; X32_AVX-NEXT: vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
-; X32_AVX-NEXT: addl $28, %esp
-; X32_AVX-NEXT: retl
+; X32_AVX1-LABEL: test4:
+; X32_AVX1: ## %bb.0:
+; X32_AVX1-NEXT: subl $28, %esp
+; X32_AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32_AVX1-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
+; X32_AVX1-NEXT: calll _f
+; X32_AVX1-NEXT: vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
+; X32_AVX1-NEXT: addl $28, %esp
+; X32_AVX1-NEXT: retl
;
-; X64_AVX-LABEL: test4:
-; X64_AVX: ## BB#0:
-; X64_AVX-NEXT: subq $24, %rsp
-; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64_AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
-; X64_AVX-NEXT: callq _f
-; X64_AVX-NEXT: vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
-; X64_AVX-NEXT: addq $24, %rsp
-; X64_AVX-NEXT: retq
+; X64_AVX1-LABEL: test4:
+; X64_AVX1: ## %bb.0:
+; X64_AVX1-NEXT: subq $24, %rsp
+; X64_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64_AVX1-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; X64_AVX1-NEXT: callq _f
+; X64_AVX1-NEXT: vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
+; X64_AVX1-NEXT: addq $24, %rsp
+; X64_AVX1-NEXT: retq
+;
+; X32_AVX512-LABEL: test4:
+; X32_AVX512: ## %bb.0:
+; X32_AVX512-NEXT: subl $28, %esp
+; X32_AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32_AVX512-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
+; X32_AVX512-NEXT: calll _f
+; X32_AVX512-NEXT: vmovaps (%esp), %xmm1 ## 16-byte Reload
+; X32_AVX512-NEXT: vrndscaless $4, %xmm1, %xmm0, %xmm0
+; X32_AVX512-NEXT: addl $28, %esp
+; X32_AVX512-NEXT: retl
+;
+; X64_AVX512-LABEL: test4:
+; X64_AVX512: ## %bb.0:
+; X64_AVX512-NEXT: subq $24, %rsp
+; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64_AVX512-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
+; X64_AVX512-NEXT: callq _f
+; X64_AVX512-NEXT: vmovaps (%rsp), %xmm1 ## 16-byte Reload
+; X64_AVX512-NEXT: vrndscaless $4, %xmm1, %xmm0, %xmm0
+; X64_AVX512-NEXT: addq $24, %rsp
+; X64_AVX512-NEXT: retq
%a = load float , float *%b
%B = insertelement <4 x float> undef, float %a, i32 0
%q = call <4 x float> @f()
@@ -244,28 +278,28 @@ define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; PR13576
define <2 x double> @test5() nounwind uwtable readnone noinline {
; X32-LABEL: test5:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X32-NEXT: movl $128, %eax
; X32-NEXT: cvtsi2sdl %eax, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test5:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X64-NEXT: movl $128, %eax
; X64-NEXT: cvtsi2sdl %eax, %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: test5:
-; X32_AVX: ## BB#0: ## %entry
+; X32_AVX: ## %bb.0: ## %entry
; X32_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X32_AVX-NEXT: movl $128, %eax
; X32_AVX-NEXT: vcvtsi2sdl %eax, %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: test5:
-; X64_AVX: ## BB#0: ## %entry
+; X64_AVX: ## %bb.0: ## %entry
; X64_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X64_AVX-NEXT: movl $128, %eax
; X64_AVX-NEXT: vcvtsi2sdl %eax, %xmm0, %xmm0
@@ -279,24 +313,24 @@ declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnon
define <4 x float> @minss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: minss_fold:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: minss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: minss_fold:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: minss (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: minss_fold:
-; X32_AVX: ## BB#0: ## %entry
+; X32_AVX: ## %bb.0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vminss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: minss_fold:
-; X64_AVX: ## BB#0: ## %entry
+; X64_AVX: ## %bb.0: ## %entry
; X64_AVX-NEXT: vminss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
@@ -311,24 +345,24 @@ entry:
define <4 x float> @maxss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: maxss_fold:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: maxss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: maxss_fold:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: maxss (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: maxss_fold:
-; X32_AVX: ## BB#0: ## %entry
+; X32_AVX: ## %bb.0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vmaxss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: maxss_fold:
-; X64_AVX: ## BB#0: ## %entry
+; X64_AVX: ## %bb.0: ## %entry
; X64_AVX-NEXT: vmaxss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
@@ -343,24 +377,24 @@ entry:
define <4 x float> @cmpss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: cmpss_fold:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cmpeqss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: cmpss_fold:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: cmpeqss (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: cmpss_fold:
-; X32_AVX: ## BB#0: ## %entry
+; X32_AVX: ## %bb.0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vcmpeqss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: cmpss_fold:
-; X64_AVX: ## BB#0: ## %entry
+; X64_AVX: ## %bb.0: ## %entry
; X64_AVX-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
@@ -377,7 +411,7 @@ declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind
define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X32-LABEL: double_fold:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movaps %xmm0, %xmm2
@@ -387,7 +421,7 @@ define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X32-NEXT: retl
;
; X64-LABEL: double_fold:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movaps %xmm0, %xmm2
; X64-NEXT: minss %xmm1, %xmm2
@@ -396,7 +430,7 @@ define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X64-NEXT: retq
;
; X32_AVX-LABEL: double_fold:
-; X32_AVX: ## BB#0: ## %entry
+; X32_AVX: ## %bb.0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX-NEXT: vminss %xmm1, %xmm0, %xmm2
@@ -405,7 +439,7 @@ define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: double_fold:
-; X64_AVX: ## BB#0: ## %entry
+; X64_AVX: ## %bb.0: ## %entry
; X64_AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64_AVX-NEXT: vminss %xmm1, %xmm0, %xmm2
; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vec_trunc_sext.ll b/test/CodeGen/X86/vec_trunc_sext.ll
index 66af87c78187..1e2de8e20c3d 100644
--- a/test/CodeGen/X86/vec_trunc_sext.ll
+++ b/test/CodeGen/X86/vec_trunc_sext.ll
@@ -10,7 +10,7 @@
define <4 x i32> @trunc_sext(<4 x i16>* %in) {
; NO_SSE_41-LABEL: trunc_sext:
-; NO_SSE_41: # BB#0:
+; NO_SSE_41: # %bb.0:
; NO_SSE_41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; NO_SSE_41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; NO_SSE_41-NEXT: pslld $24, %xmm0
@@ -18,7 +18,7 @@ define <4 x i32> @trunc_sext(<4 x i16>* %in) {
; NO_SSE_41-NEXT: retq
;
; SSE_41-LABEL: trunc_sext:
-; SSE_41: # BB#0:
+; SSE_41: # %bb.0:
; SSE_41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE_41-NEXT: pslld $24, %xmm0
; SSE_41-NEXT: psrad $24, %xmm0
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index 7df3c3070422..bf4689953072 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -28,7 +28,7 @@
define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; SSE2-LABEL: test_uitofp_v4i32_to_v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535]
; SSE2-NEXT: andps %xmm0, %xmm1
; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
@@ -39,7 +39,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_uitofp_v4i32_to_v4f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
@@ -50,7 +50,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; SSE41-NEXT: retq
;
; AVX-LABEL: test_uitofp_v4i32_to_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT: vcvtdq2ps %xmm1, %xmm1
@@ -61,7 +61,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; AVX-NEXT: retq
;
; AVX2-LABEL: test_uitofp_v4i32_to_v4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT: vcvtdq2ps %xmm1, %xmm1
; AVX2-NEXT: vbroadcastss [[FPMASKCSTADDR]](%rip), %xmm2
@@ -73,7 +73,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill
@@ -81,7 +81,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; AVX512VL-NEXT: retq
%tmp = uitofp <4 x i32> %arg to <4 x float>
@@ -105,7 +105,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
; SSE2-LABEL: test_uitofp_v8i32_to_v8f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $16, %xmm2
; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
@@ -125,7 +125,7 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_uitofp_v8i32_to_v8f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: cvtdq2ps %xmm2, %xmm2
@@ -145,7 +145,7 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
; SSE41-NEXT: retq
;
; AVX-LABEL: test_uitofp_v8i32_to_v8f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
@@ -158,26 +158,26 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
; AVX-NEXT: retq
;
; AVX2-LABEL: test_uitofp_v8i32_to_v8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm1
; AVX2-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX2-NEXT: vbroadcastss [[FPMASKCSTADDR_v8]](%rip), %ymm2
; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill
; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
; AVX512F-NEXT: # kill
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0
; AVX512VL-NEXT: retq
%tmp = uitofp <8 x i32> %arg to <8 x float>
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
index 745316effc98..340177ec49ae 100644
--- a/test/CodeGen/X86/vec_unsafe-fp-math.ll
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -6,7 +6,7 @@
; Subtracting zero is free.
define <4 x float> @vec_fsub_zero(<4 x float> %x) {
; CHECK-LABEL: vec_fsub_zero:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: retq
%sub = fsub <4 x float> %x, zeroinitializer
ret <4 x float> %sub
@@ -15,7 +15,7 @@ define <4 x float> @vec_fsub_zero(<4 x float> %x) {
; Negating doesn't require subtraction.
define <4 x float> @vec_fneg(<4 x float> %x) {
; CHECK-LABEL: vec_fneg:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: xorps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
%sub = fsub <4 x float> zeroinitializer, %x
diff --git a/test/CodeGen/X86/vec_zero-2.ll b/test/CodeGen/X86/vec_zero-2.ll
index cdb030eb38bf..dce924cc6d4c 100644
--- a/test/CodeGen/X86/vec_zero-2.ll
+++ b/test/CodeGen/X86/vec_zero-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
define i32 @t() {
entry:
diff --git a/test/CodeGen/X86/vec_zero.ll b/test/CodeGen/X86/vec_zero.ll
index 1d900a0919f2..e95e61b97fea 100644
--- a/test/CodeGen/X86/vec_zero.ll
+++ b/test/CodeGen/X86/vec_zero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2 | FileCheck %s
; CHECK: foo
; CHECK: xorps
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index 75e85348ba8d..b868cc014074 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -12,7 +12,7 @@
define void @test1() {
; X32-LABEL: test1:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $0, M1+4
; X32-NEXT: movl $0, M1
; X32-NEXT: xorps %xmm0, %xmm0
@@ -20,7 +20,7 @@ define void @test1() {
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq $0, {{.*}}(%rip)
; X64-NEXT: movq $0, {{.*}}(%rip)
; X64-NEXT: retq
@@ -31,7 +31,7 @@ define void @test1() {
define void @test2() {
; X32-LABEL: test2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl $-1, M1+4
; X32-NEXT: movl $-1, M1
; X32-NEXT: pcmpeqd %xmm0, %xmm0
@@ -39,7 +39,7 @@ define void @test2() {
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq $-1, {{.*}}(%rip)
; X64-NEXT: movq {{.*}}(%rip), %rax
; X64-NEXT: movq %rax, {{.*}}(%rip)
@@ -51,14 +51,14 @@ define void @test2() {
define void @test3() {
; X32-LABEL: test3:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movaps %xmm0, S1
; X32-NEXT: movaps %xmm0, S2
; X32-NEXT: retl
;
; X64-LABEL: test3:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, {{.*}}(%rip)
; X64-NEXT: movaps %xmm0, {{.*}}(%rip)
@@ -70,14 +70,14 @@ define void @test3() {
define void @test4() {
; X32-LABEL: test4:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pcmpeqd %xmm0, %xmm0
; X32-NEXT: movdqa %xmm0, S1
; X32-NEXT: movdqa %xmm0, S2
; X32-NEXT: retl
;
; X64-LABEL: test4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqd %xmm0, %xmm0
; X64-NEXT: movdqa %xmm0, {{.*}}(%rip)
; X64-NEXT: movdqa %xmm0, {{.*}}(%rip)
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index 2fb821555dba..1e8a693054d1 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -8,9 +8,12 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
+; Make sure we don't crash with avx512bw and xop
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw
+
define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: rolb $4, %dil
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andb $51, %al
@@ -28,7 +31,7 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: rolb $4, %dil
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andb $51, %al
@@ -46,11 +49,11 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vpextrb $0, %xmm0, %eax
-; XOP-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; XOP-NEXT: # kill: def %al killed %al killed %eax
; XOP-NEXT: retq
%b = call i8 @llvm.bitreverse.i8(i8 %a)
ret i8 %b
@@ -58,8 +61,8 @@ define i8 @test_bitreverse_i8(i8 %a) nounwind {
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
-; SSE: # BB#0:
-; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE: # %bb.0:
+; SSE-NEXT: # kill: def %edi killed %edi def %rdi
; SSE-NEXT: rolw $8, %di
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $3855, %eax # imm = 0xF0F
@@ -77,12 +80,12 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-NEXT: andl $43690, %eax # imm = 0xAAAA
; SSE-NEXT: shrl %eax
; SSE-NEXT: leal (%rax,%rcx,2), %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i16:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: rolw $8, %di
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $3855, %eax # imm = 0xF0F
@@ -100,15 +103,15 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
; AVX-NEXT: andl $43690, %eax # imm = 0xAAAA
; AVX-NEXT: shrl %eax
; AVX-NEXT: leal (%rax,%rcx,2), %eax
-; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %ax killed %ax killed %eax
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
-; XOP-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; XOP-NEXT: # kill: def %ax killed %ax killed %eax
; XOP-NEXT: retq
%b = call i16 @llvm.bitreverse.i16(i16 %a)
ret i16 %b
@@ -116,8 +119,8 @@ define i16 @test_bitreverse_i16(i16 %a) nounwind {
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
-; SSE: # BB#0:
-; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE: # %bb.0:
+; SSE-NEXT: # kill: def %edi killed %edi def %rdi
; SSE-NEXT: bswapl %edi
; SSE-NEXT: movl %edi, %eax
; SSE-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
@@ -138,8 +141,8 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i32:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: bswapl %edi
; AVX-NEXT: movl %edi, %eax
; AVX-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
@@ -160,7 +163,7 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovd %edi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovd %xmm0, %eax
@@ -171,7 +174,7 @@ define i32 @test_bitreverse_i32(i32 %a) nounwind {
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: bswapq %rdi
; SSE-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT: andq %rdi, %rax
@@ -195,7 +198,7 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_bitreverse_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: bswapq %rdi
; AVX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT: andq %rdi, %rax
@@ -219,7 +222,7 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm0
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: vmovq %xmm0, %rax
@@ -230,7 +233,7 @@ define i64 @test_bitreverse_i64(i64 %a) nounwind {
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm2
@@ -259,7 +262,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
@@ -274,7 +277,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
@@ -287,7 +290,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
%b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
@@ -296,7 +299,7 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -334,7 +337,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
@@ -350,7 +353,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
@@ -364,7 +367,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
%b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
@@ -373,7 +376,7 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -411,7 +414,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
@@ -427,7 +430,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
@@ -441,7 +444,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
%b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
@@ -450,7 +453,7 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -490,7 +493,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
@@ -506,7 +509,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX-LABEL: test_bitreverse_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
@@ -520,7 +523,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: test_bitreverse_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT: retq
%b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
@@ -529,7 +532,7 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
@@ -586,7 +589,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm2
@@ -611,7 +614,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -632,7 +635,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
@@ -645,7 +648,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
@@ -658,7 +661,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -667,7 +670,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -680,7 +683,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
@@ -754,7 +757,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -782,7 +785,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -806,7 +809,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
@@ -820,7 +823,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
@@ -834,7 +837,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -843,7 +846,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -856,7 +859,7 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
@@ -930,7 +933,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -958,7 +961,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -982,7 +985,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
@@ -996,7 +999,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
@@ -1010,7 +1013,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -1019,7 +1022,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -1032,7 +1035,7 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
@@ -1110,7 +1113,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v4i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1138,7 +1141,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1162,7 +1165,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
@@ -1176,7 +1179,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_bitreverse_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2
@@ -1190,7 +1193,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -1199,7 +1202,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm1
@@ -1212,7 +1215,7 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v64i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm13 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: pand %xmm13, %xmm5
@@ -1315,7 +1318,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v64i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm0, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: pand %xmm8, %xmm0
@@ -1359,7 +1362,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
@@ -1394,7 +1397,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
@@ -1413,7 +1416,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
@@ -1432,20 +1435,20 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v64i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -1458,7 +1461,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v64i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -1475,7 +1478,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
@@ -1611,7 +1614,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v32i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
@@ -1661,7 +1664,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -1701,7 +1704,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1723,7 +1726,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1745,21 +1748,21 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -1772,7 +1775,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -1789,7 +1792,7 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
@@ -1925,7 +1928,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v16i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
@@ -1975,7 +1978,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -2015,7 +2018,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -2037,7 +2040,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v16i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1
; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
@@ -2051,35 +2054,35 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT: vpslld $2, %zmm1, %zmm1
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $2, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512F-NEXT: vpslld $1, %zmm1, %zmm1
; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v16i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -2092,7 +2095,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -2109,7 +2112,7 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm14[8],xmm4[9],xmm14[9],xmm4[10],xmm14[10],xmm4[11],xmm14[11],xmm4[12],xmm14[12],xmm4[13],xmm14[13],xmm4[14],xmm14[14],xmm4[15],xmm14[15]
@@ -2253,7 +2256,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa %xmm1, %xmm5
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
@@ -2303,7 +2306,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -2343,7 +2346,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -2365,7 +2368,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm1
; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm2
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
@@ -2405,21 +2408,21 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -2432,7 +2435,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm2
@@ -2453,7 +2456,7 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
define i32 @fold_bitreverse_i32() nounwind {
; ALL-LABEL: fold_bitreverse_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT: retq
%b = call i32 @llvm.bitreverse.i32(i32 4278255360)
@@ -2462,17 +2465,17 @@ define i32 @fold_bitreverse_i32() nounwind {
define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT: retq
%b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
@@ -2481,18 +2484,18 @@ define <16 x i8> @fold_bitreverse_v16i8() nounwind {
define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT: retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT: retq
%b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
@@ -2501,7 +2504,7 @@ define <16 x i16> @fold_bitreverse_v16i16() nounwind {
define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT: movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT: movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
@@ -2509,24 +2512,24 @@ define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT: retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT: retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT: retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT: vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT: retq
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index ab5fac59ebd1..30c72f760295 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -9,24 +9,24 @@
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
; SSE2-LABEL: vsel_float:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_float:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_float:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_float:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
entry:
@@ -36,24 +36,24 @@ entry:
define <4 x float> @vsel_float2(<4 x float> %v1, <4 x float> %v2) {
; SSE2-LABEL: vsel_float2:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_float2:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_float2:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_float2:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
entry:
@@ -63,30 +63,30 @@ entry:
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; SSE2-LABEL: vsel_4xi8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_4xi8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_4xi8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_4xi8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: vsel_4xi8:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
@@ -95,32 +95,32 @@ entry:
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
; SSE2-LABEL: vsel_4xi16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_4xi16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_4xi16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_4xi16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: vsel_4xi16:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
@@ -129,32 +129,32 @@ entry:
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: vsel_i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: vsel_i32:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
entry:
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -163,24 +163,24 @@ entry:
define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
; SSE2-LABEL: vsel_double:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_double:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_double:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_double:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
entry:
@@ -190,30 +190,30 @@ entry:
define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
; SSE2-LABEL: vsel_i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: vsel_i64:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
entry:
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
@@ -222,7 +222,7 @@ entry:
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
; SSE2-LABEL: vsel_8xi16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
@@ -231,7 +231,7 @@ define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_8xi16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535]
; SSSE3-NEXT: andps %xmm2, %xmm1
; SSSE3-NEXT: andnps %xmm0, %xmm2
@@ -240,12 +240,12 @@ define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_8xi16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_8xi16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
entry:
@@ -255,7 +255,7 @@ entry:
define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
; SSE2-LABEL: vsel_i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
@@ -264,14 +264,14 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[12],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3],zero,xmm1[5,6,7],zero,xmm1[9,10,11],zero,xmm1[13,14,15]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
@@ -279,7 +279,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_i8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -293,7 +293,7 @@ entry:
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
; SSE2-LABEL: vsel_float8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
@@ -301,7 +301,7 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_float8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSSE3-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; SSSE3-NEXT: movaps %xmm2, %xmm0
@@ -309,13 +309,13 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_float8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_float8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX-NEXT: retq
entry:
@@ -325,7 +325,7 @@ entry:
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
; SSE2-LABEL: vsel_i328:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
@@ -333,7 +333,7 @@ define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_i328:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSSE3-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; SSSE3-NEXT: movaps %xmm2, %xmm0
@@ -341,20 +341,15 @@ define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_i328:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: vsel_i328:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: vsel_i328:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX2-NEXT: retq
+; AVX-LABEL: vsel_i328:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
+; AVX-NEXT: retq
entry:
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
ret <8 x i32> %vsel
@@ -362,7 +357,7 @@ entry:
define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
; SSE2-LABEL: vsel_double8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
; SSE2-NEXT: movapd %xmm4, %xmm0
@@ -372,7 +367,7 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_double8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
; SSSE3-NEXT: movapd %xmm4, %xmm0
@@ -382,7 +377,7 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_double8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm4[1]
; SSE41-NEXT: blendpd {{.*#+}} xmm2 = xmm2[0],xmm6[1]
; SSE41-NEXT: movaps %xmm5, %xmm1
@@ -390,7 +385,7 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_double8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
; AVX-NEXT: retq
@@ -401,7 +396,7 @@ entry:
define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
; SSE2-LABEL: vsel_i648:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
; SSE2-NEXT: movapd %xmm4, %xmm0
@@ -411,7 +406,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_i648:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm6 = xmm2[0],xmm6[1]
; SSSE3-NEXT: movapd %xmm4, %xmm0
@@ -421,7 +416,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_i648:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; SSE41-NEXT: movaps %xmm5, %xmm1
@@ -429,15 +424,15 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: vsel_i648:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3]
; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm3[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: vsel_i648:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
; AVX2-NEXT: retq
entry:
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -446,7 +441,7 @@ entry:
define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
; SSE2-LABEL: vsel_double4:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
@@ -454,7 +449,7 @@ define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: vsel_double4:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSSE3-NEXT: movapd %xmm2, %xmm0
@@ -462,13 +457,13 @@ define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: vsel_double4:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: vsel_double4:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX-NEXT: retq
entry:
@@ -478,7 +473,7 @@ entry:
define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
; SSE2-LABEL: testa:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movapd %xmm1, %xmm2
; SSE2-NEXT: cmplepd %xmm0, %xmm2
; SSE2-NEXT: andpd %xmm2, %xmm0
@@ -487,7 +482,7 @@ define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: testa:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movapd %xmm1, %xmm2
; SSSE3-NEXT: cmplepd %xmm0, %xmm2
; SSSE3-NEXT: andpd %xmm2, %xmm0
@@ -496,7 +491,7 @@ define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testa:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movapd %xmm0, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: cmplepd %xmm2, %xmm0
@@ -505,7 +500,7 @@ define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
; SSE41-NEXT: retq
;
; AVX-LABEL: testa:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -517,7 +512,7 @@ entry:
define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
; SSE2-LABEL: testb:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movapd %xmm1, %xmm2
; SSE2-NEXT: cmpnlepd %xmm0, %xmm2
; SSE2-NEXT: andpd %xmm2, %xmm0
@@ -526,7 +521,7 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: testb:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movapd %xmm1, %xmm2
; SSSE3-NEXT: cmpnlepd %xmm0, %xmm2
; SSSE3-NEXT: andpd %xmm2, %xmm0
@@ -535,7 +530,7 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testb:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movapd %xmm0, %xmm2
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: cmpnlepd %xmm2, %xmm0
@@ -544,7 +539,7 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
; SSE41-NEXT: retq
;
; AVX-LABEL: testb:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vcmpnlepd %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
@@ -558,27 +553,27 @@ entry:
; blend instruction with an immediate mask
define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
; SSE2-LABEL: constant_blendvpd_avx:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: movapd %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: constant_blendvpd_avx:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
; SSSE3-NEXT: movaps %xmm2, %xmm0
; SSSE3-NEXT: movapd %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: constant_blendvpd_avx:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm3[1]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_blendvpd_avx:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
; AVX-NEXT: retq
entry:
@@ -588,7 +583,7 @@ entry:
define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
; SSE2-LABEL: constant_blendvps_avx:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
@@ -598,7 +593,7 @@ define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: constant_blendvps_avx:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm2[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm3[2,0]
@@ -608,13 +603,13 @@ define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: constant_blendvps_avx:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_blendvps_avx:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7]
; AVX-NEXT: retq
entry:
@@ -624,7 +619,7 @@ entry:
define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE2-LABEL: constant_pblendvb_avx2:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps {{.*#+}} xmm4 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; SSE2-NEXT: movaps %xmm4, %xmm5
; SSE2-NEXT: andnps %xmm0, %xmm5
@@ -638,7 +633,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: constant_pblendvb_avx2:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [128,128,2,128,4,5,6,128,128,128,10,128,12,13,14,128]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [0,1,128,3,128,128,128,7,8,9,128,11,128,128,128,15]
@@ -650,7 +645,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: constant_pblendvb_avx2:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm2
@@ -660,7 +655,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_pblendvb_avx2:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -668,7 +663,7 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_pblendvb_avx2:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -683,24 +678,24 @@ declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4
;; 4 tests for shufflevectors that optimize to blend + immediate
define <4 x float> @blend_shufflevector_4xfloat(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: blend_shufflevector_4xfloat:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_shufflevector_4xfloat:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_shufflevector_4xfloat:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_shufflevector_4xfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
entry:
@@ -710,7 +705,7 @@ entry:
define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: blend_shufflevector_8xfloat:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
@@ -719,7 +714,7 @@ define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_shufflevector_8xfloat:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2]
@@ -728,13 +723,13 @@ define <8 x float> @blend_shufflevector_8xfloat(<8 x float> %a, <8 x float> %b)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_shufflevector_8xfloat:
-; SSE41: # BB#0: # %entry
-; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_shufflevector_8xfloat:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7]
; AVX-NEXT: retq
entry:
@@ -744,24 +739,24 @@ entry:
define <4 x double> @blend_shufflevector_4xdouble(<4 x double> %a, <4 x double> %b) {
; SSE2-LABEL: blend_shufflevector_4xdouble:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_shufflevector_4xdouble:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT: movapd %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_shufflevector_4xdouble:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_shufflevector_4xdouble:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX-NEXT: retq
entry:
@@ -771,31 +766,31 @@ entry:
define <4 x i64> @blend_shufflevector_4xi64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: blend_shufflevector_4xi64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: movaps %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_shufflevector_4xi64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSSE3-NEXT: movaps %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_shufflevector_4xi64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: movaps %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: blend_shufflevector_4xi64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: blend_shufflevector_4xi64:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
entry:
%select = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
@@ -804,7 +799,7 @@ entry:
define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
; SSE2-LABEL: blend_logic_v4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm0
@@ -812,7 +807,7 @@ define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_logic_v4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm0
@@ -820,14 +815,14 @@ define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_logic_v4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_logic_v4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; AVX-NEXT: retq
@@ -843,7 +838,7 @@ entry:
define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSE2-LABEL: blend_logic_v8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm3
@@ -855,7 +850,7 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_logic_v8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm3
@@ -867,7 +862,7 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_logic_v8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm4
@@ -878,7 +873,7 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: blend_logic_v8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
@@ -889,7 +884,7 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: blend_logic_v8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0
; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT: retq
@@ -905,14 +900,14 @@ entry:
define <4 x i32> @blend_neg_logic_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: blend_neg_logic_v4i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: blend_neg_logic_v4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -929,7 +924,7 @@ entry:
define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: blend_neg_logic_v8i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: psrad $31, %xmm3
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: pxor %xmm2, %xmm0
@@ -939,7 +934,7 @@ define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: blend_neg_logic_v8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
@@ -955,7 +950,7 @@ define <8 x i32> @blend_neg_logic_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: blend_neg_logic_v8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrad $31, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
@@ -972,7 +967,7 @@ entry:
define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
; SSE2-LABEL: blend_neg_logic_v4i32_2:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: psubd %xmm0, %xmm1
@@ -980,7 +975,7 @@ define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: blend_neg_logic_v4i32_2:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: psubd %xmm0, %xmm1
@@ -988,19 +983,17 @@ define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: blend_neg_logic_v4i32_2:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: psubd %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm3
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_neg_logic_v4i32_2:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll
index 202b8f7786b8..a055307c5df6 100644
--- a/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/test/CodeGen/X86/vector-compare-all_of.ll
@@ -6,7 +6,7 @@
define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: pand %xmm1, %xmm0
@@ -14,7 +14,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0
@@ -22,7 +22,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -40,7 +40,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
; SSE-NEXT: andpd %xmm3, %xmm2
@@ -50,7 +50,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovmskpd %ymm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
@@ -61,7 +61,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
@@ -84,10 +84,10 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: cmpl $15, %eax
@@ -97,10 +97,10 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_legal_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: cmpl $15, %eax
@@ -111,7 +111,7 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -136,7 +136,7 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_v4f32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm0, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: xorl %ecx, %ecx
@@ -146,7 +146,7 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f32_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
@@ -156,7 +156,7 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -178,7 +178,7 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: test_v8f32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm1, %xmm3
; SSE-NEXT: cmpltps %xmm0, %xmm2
; SSE-NEXT: andps %xmm3, %xmm2
@@ -190,7 +190,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovmskps %ymm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
@@ -201,7 +201,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -228,10 +228,10 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: test_v8f32_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm1, %xmm3
; SSE-NEXT: cmpltps %xmm0, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: pmovmskb %xmm2, %eax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
@@ -240,10 +240,10 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32_legal_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
@@ -253,7 +253,7 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -281,7 +281,7 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_v2i64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: pand %xmm0, %xmm1
@@ -289,7 +289,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -297,7 +297,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -315,7 +315,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-LABEL: test_v4i64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm3, %xmm1
; SSE-NEXT: pcmpgtq %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
@@ -325,7 +325,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -340,7 +340,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
; AVX2-NEXT: xorl %ecx, %ecx
@@ -351,7 +351,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
@@ -374,10 +374,10 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-LABEL: test_v4i64_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm3, %xmm1
; SSE-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: cmpl $15, %eax
@@ -387,12 +387,12 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_legal_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: cmpl $15, %eax
@@ -403,10 +403,10 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64_legal_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskps %xmm0, %eax
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: cmpl $15, %eax
@@ -417,7 +417,7 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -442,7 +442,7 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_v4i32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: xorl %ecx, %ecx
@@ -452,7 +452,7 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
@@ -462,7 +462,7 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -484,7 +484,7 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: test_v8i32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
@@ -496,7 +496,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
@@ -511,7 +511,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: xorl %ecx, %ecx
@@ -522,7 +522,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -549,10 +549,10 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: test_v8i32_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
@@ -561,12 +561,12 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32_legal_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
@@ -576,10 +576,10 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32_legal_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: xorl %ecx, %ecx
; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
@@ -589,7 +589,7 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -617,29 +617,29 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_v8i16_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE-NEXT: movl $-1, %eax
; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; AVX-NEXT: movl $-1, %eax
; AVX-NEXT: cmovnel %ecx, %eax
-; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %ax killed %ax killed %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i16_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -649,7 +649,7 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512-NEXT: retq
%c = icmp sgt <8 x i16> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
@@ -665,7 +665,7 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
; SSE-LABEL: test_v16i16_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm3, %xmm1
; SSE-NEXT: pcmpgtw %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
@@ -674,41 +674,41 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE-NEXT: movl $-1, %eax
; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %ecx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: cmpl $-1, %ecx
; AVX2-NEXT: cmovel %ecx, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -720,7 +720,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
@@ -739,7 +739,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; SSE-LABEL: test_v16i16_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm3, %xmm1
; SSE-NEXT: pcmpgtw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
@@ -748,11 +748,11 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE-NEXT: movl $-1, %eax
; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16_legal_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
@@ -763,12 +763,12 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX1-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: movl $-1, %eax
; AVX1-NEXT: cmovnel %ecx, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16_legal_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -777,12 +777,12 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; AVX2-NEXT: movl $-1, %eax
; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -795,7 +795,7 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: movsbl %al, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
@@ -815,29 +815,29 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_v16i8_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: xorl %ecx, %ecx
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE-NEXT: movl $-1, %eax
; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16i8_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: xorl %ecx, %ecx
; AVX-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; AVX-NEXT: movl $-1, %eax
; AVX-NEXT: cmovnel %ecx, %eax
-; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %al killed %al killed %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16i8_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -849,7 +849,7 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %al killed %al killed %eax
; AVX512-NEXT: retq
%c = icmp sgt <16 x i8> %a0, %a1
%s = sext <16 x i1> %c to <16 x i8>
@@ -867,7 +867,7 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
; SSE-LABEL: test_v32i8_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm3, %xmm1
; SSE-NEXT: pcmpgtb %xmm2, %xmm0
; SSE-NEXT: pand %xmm1, %xmm0
@@ -876,43 +876,43 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
; SSE-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE-NEXT: movl $-1, %eax
; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i8_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %ecx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: cmpl $-1, %ecx
; AVX2-NEXT: cmovel %ecx, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -926,7 +926,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %al killed %al killed %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <32 x i8> %a0, %a1
diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll
index 043ba28e8fa4..54d01461c142 100644
--- a/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/test/CodeGen/X86/vector-compare-any_of.ll
@@ -6,7 +6,7 @@
define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_v2f64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: por %xmm1, %xmm0
@@ -14,7 +14,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2f64_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0
@@ -22,7 +22,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2f64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -40,7 +40,7 @@ define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
; SSE-NEXT: orpd %xmm3, %xmm2
@@ -50,7 +50,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovmskpd %ymm0, %eax
; AVX-NEXT: negl %eax
@@ -59,7 +59,7 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
@@ -82,10 +82,10 @@ define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-LABEL: test_v4f64_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: movmskps %xmm2, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
@@ -93,10 +93,10 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f64_legal_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: negl %eax
; AVX-NEXT: sbbl %eax, %eax
@@ -105,7 +105,7 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f64_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -130,7 +130,7 @@ define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_v4f32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm0, %xmm1
; SSE-NEXT: movmskps %xmm1, %eax
; SSE-NEXT: negl %eax
@@ -138,7 +138,7 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4f32_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: negl %eax
@@ -146,7 +146,7 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4f32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %xmm0, %xmm1, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -168,7 +168,7 @@ define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: test_v8f32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm1, %xmm3
; SSE-NEXT: cmpltps %xmm0, %xmm2
; SSE-NEXT: orps %xmm3, %xmm2
@@ -178,7 +178,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovmskps %ymm0, %eax
; AVX-NEXT: negl %eax
@@ -187,7 +187,7 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -214,20 +214,20 @@ define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
; SSE-LABEL: test_v8f32_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm1, %xmm3
; SSE-NEXT: cmpltps %xmm0, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: pmovmskb %xmm2, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8f32_legal_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: negl %eax
; AVX-NEXT: sbbl %eax, %eax
@@ -235,7 +235,7 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8f32_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -263,7 +263,7 @@ define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_v2i64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT: por %xmm0, %xmm1
@@ -271,7 +271,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v2i64_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -279,7 +279,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v2i64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
@@ -297,7 +297,7 @@ define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-LABEL: test_v4i64_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm3, %xmm1
; SSE-NEXT: pcmpgtq %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
@@ -307,7 +307,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -320,7 +320,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskpd %ymm0, %eax
; AVX2-NEXT: negl %eax
@@ -329,7 +329,7 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
@@ -352,10 +352,10 @@ define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-LABEL: test_v4i64_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtq %xmm3, %xmm1
; SSE-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
@@ -363,12 +363,12 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v4i64_legal_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
; AVX1-NEXT: negl %eax
; AVX1-NEXT: sbbl %eax, %eax
@@ -377,10 +377,10 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v4i64_legal_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovmskps %xmm0, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: sbbl %eax, %eax
@@ -389,7 +389,7 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4i64_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -414,7 +414,7 @@ define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-LABEL: test_v4i32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: movmskps %xmm0, %eax
; SSE-NEXT: negl %eax
@@ -422,7 +422,7 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX-LABEL: test_v4i32_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovmskps %xmm0, %eax
; AVX-NEXT: negl %eax
@@ -430,7 +430,7 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v4i32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
@@ -452,7 +452,7 @@ define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: test_v8i32_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
@@ -462,7 +462,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
@@ -475,7 +475,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %eax
; AVX2-NEXT: negl %eax
@@ -484,7 +484,7 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -511,22 +511,22 @@ define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; SSE-LABEL: test_v8i32_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v8i32_legal_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: negl %eax
; AVX1-NEXT: sbbl %eax, %eax
@@ -534,10 +534,10 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v8i32_legal_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: sbbl %eax, %eax
@@ -545,7 +545,7 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8i32_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -573,25 +573,25 @@ define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_v8i16_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v8i16_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: negl %eax
; AVX-NEXT: sbbl %eax, %eax
-; AVX-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %ax killed %ax killed %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v8i16_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtw %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -601,7 +601,7 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512-NEXT: retq
%c = icmp sgt <8 x i16> %a0, %a1
%s = sext <8 x i1> %c to <8 x i16>
@@ -617,47 +617,47 @@ define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
; SSE-LABEL: test_v16i16_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm3, %xmm1
; SSE-NEXT: pcmpgtw %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: sbbl %eax, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2w %k0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -669,7 +669,7 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
@@ -688,18 +688,18 @@ define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; SSE-LABEL: test_v16i16_legal_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm3, %xmm1
; SSE-NEXT: pcmpgtw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
-; SSE-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %ax killed %ax killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v16i16_legal_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
@@ -708,24 +708,24 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX1-NEXT: vpmovmskb %xmm0, %eax
; AVX1-NEXT: negl %eax
; AVX1-NEXT: sbbl %eax, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %ax killed %ax killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v16i16_legal_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpmovmskb %xmm0, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: sbbl %eax, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %ax killed %ax killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16i16_legal_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtw %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -738,7 +738,7 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
; AVX512-NEXT: movsbl %al, %eax
-; AVX512-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <16 x i16> %a0, %a1
@@ -758,25 +758,25 @@ define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_v16i8_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX-LABEL: test_v16i8_sext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpmovmskb %xmm0, %eax
; AVX-NEXT: negl %eax
; AVX-NEXT: sbbl %eax, %eax
-; AVX-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX-NEXT: # kill: def %al killed %al killed %eax
; AVX-NEXT: retq
;
; AVX512-LABEL: test_v16i8_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -788,7 +788,7 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %al killed %al killed %eax
; AVX512-NEXT: retq
%c = icmp sgt <16 x i8> %a0, %a1
%s = sext <16 x i1> %c to <16 x i8>
@@ -806,49 +806,49 @@ define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
; SSE-LABEL: test_v32i8_sext:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm3, %xmm1
; SSE-NEXT: pcmpgtb %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pmovmskb %xmm0, %eax
; SSE-NEXT: negl %eax
; SSE-NEXT: sbbl %eax, %eax
-; SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; SSE-NEXT: # kill: def %al killed %al killed %eax
; SSE-NEXT: retq
;
; AVX1-LABEL: test_v32i8_sext:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX1-NEXT: # kill: def %al killed %al killed %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_v32i8_sext:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmovmskb %ymm0, %eax
; AVX2-NEXT: negl %eax
; AVX2-NEXT: sbbl %eax, %eax
-; AVX2-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX2-NEXT: # kill: def %al killed %al killed %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v32i8_sext:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %k0
; AVX512-NEXT: vpmovm2b %k0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -862,7 +862,7 @@ define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT: # kill: def %al killed %al killed %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%c = icmp sgt <32 x i8> %a0, %a1
diff --git a/test/CodeGen/X86/vector-compare-combines.ll b/test/CodeGen/X86/vector-compare-combines.ll
index bd7cbfb4bac0..722de1009489 100644
--- a/test/CodeGen/X86/vector-compare-combines.ll
+++ b/test/CodeGen/X86/vector-compare-combines.ll
@@ -9,12 +9,12 @@ declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>)
define <4 x i32> @PR27924_cmpeq(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: PR27924_cmpeq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR27924_cmpeq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%cmp = icmp sgt <4 x i32> %a, %b
@@ -27,12 +27,12 @@ define <4 x i32> @PR27924_cmpeq(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @PR27924_cmpgt(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: PR27924_cmpgt:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR27924_cmpgt:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%cmp = icmp sgt <4 x i32> %a, %b
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index ce0b067f5043..12530adf15cb 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -13,13 +13,13 @@
define <2 x i1> @test_cmp_v2f64(<2 x double> %a0, <2 x double> %a1) nounwind {
; SSE-LABEL: test_cmp_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_cmp_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fcmp ogt <2 x double> %a0, %a1
@@ -28,13 +28,13 @@ define <2 x i1> @test_cmp_v2f64(<2 x double> %a0, <2 x double> %a1) nounwind {
define <4 x i1> @test_cmp_v4f32(<4 x float> %a0, <4 x float> %a1) nounwind {
; SSE-LABEL: test_cmp_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_cmp_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
%1 = fcmp ogt <4 x float> %a0, %a1
@@ -43,7 +43,7 @@ define <4 x i1> @test_cmp_v4f32(<4 x float> %a0, <4 x float> %a1) nounwind {
define <2 x i1> @test_cmp_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-LABEL: test_cmp_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -58,12 +58,12 @@ define <2 x i1> @test_cmp_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: test_cmp_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <2 x i64> %a0, %a1
@@ -72,12 +72,12 @@ define <2 x i1> @test_cmp_v2i64(<2 x i64> %a0, <2 x i64> %a1) nounwind {
define <4 x i1> @test_cmp_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: test_cmp_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_cmp_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <4 x i32> %a0, %a1
@@ -86,12 +86,12 @@ define <4 x i1> @test_cmp_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
define <8 x i1> @test_cmp_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
; SSE-LABEL: test_cmp_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_cmp_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <8 x i16> %a0, %a1
@@ -100,12 +100,12 @@ define <8 x i1> @test_cmp_v8i16(<8 x i16> %a0, <8 x i16> %a1) nounwind {
define <16 x i1> @test_cmp_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
; SSE-LABEL: test_cmp_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_cmp_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = icmp sgt <16 x i8> %a0, %a1
@@ -118,34 +118,34 @@ define <16 x i1> @test_cmp_v16i8(<16 x i8> %a0, <16 x i8> %a1) nounwind {
define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind {
; SSE-LABEL: test_cmp_v4f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm1, %xmm3
; SSE-NEXT: cmpltpd %xmm0, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_cmp_v4f64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fcmp ogt <4 x double> %a0, %a1
@@ -154,34 +154,34 @@ define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind {
define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind {
; SSE-LABEL: test_cmp_v8f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm1, %xmm3
; SSE-NEXT: cmpltps %xmm0, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v8f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_cmp_v8f32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fcmp ogt <8 x float> %a0, %a1
@@ -190,7 +190,7 @@ define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind {
define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-LABEL: test_cmp_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm4, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm1
@@ -212,39 +212,39 @@ define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: packsswb %xmm3, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm3, %xmm1
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT: packsswb %xmm1, %xmm0
+; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_cmp_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <4 x i64> %a0, %a1
@@ -253,35 +253,35 @@ define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: test_cmp_v8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_cmp_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp sgt <8 x i32> %a0, %a1
@@ -290,14 +290,14 @@ define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: test_cmp_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm3, %xmm1
; SSE-NEXT: pcmpgtw %xmm2, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
@@ -307,7 +307,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
@@ -315,7 +315,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v16i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -323,7 +323,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -331,10 +331,10 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i16> %a0, %a1
@@ -343,7 +343,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-LABEL: test_cmp_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtb %xmm2, %xmm0
; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
@@ -448,7 +448,7 @@ define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v32i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtb %xmm2, %xmm0
; SSE42-NEXT: pcmpgtb %xmm3, %xmm1
; SSE42-NEXT: pextrb $15, %xmm1, %eax
@@ -551,7 +551,7 @@ define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
@@ -560,12 +560,12 @@ define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_cmp_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%1 = icmp sgt <32 x i8> %a0, %a1
@@ -578,61 +578,63 @@ define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
; SSE-LABEL: test_cmp_v8f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltpd %xmm3, %xmm7
; SSE-NEXT: cmpltpd %xmm2, %xmm6
-; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: packssdw %xmm7, %xmm6
; SSE-NEXT: cmpltpd %xmm1, %xmm5
; SSE-NEXT: cmpltpd %xmm0, %xmm4
-; SSE-NEXT: packsswb %xmm5, %xmm4
-; SSE-NEXT: packsswb %xmm6, %xmm4
+; SSE-NEXT: packssdw %xmm5, %xmm4
+; SSE-NEXT: packssdw %xmm6, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v8f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v8f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltpd %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltpd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v8f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v8f64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v8f64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <8 x double> %a0, %a1
@@ -641,31 +643,31 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; SSE-LABEL: test_cmp_v16f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: cmpltps %xmm3, %xmm7
; SSE-NEXT: cmpltps %xmm2, %xmm6
-; SSE-NEXT: packsswb %xmm7, %xmm6
+; SSE-NEXT: packssdw %xmm7, %xmm6
; SSE-NEXT: cmpltps %xmm1, %xmm5
; SSE-NEXT: cmpltps %xmm0, %xmm4
-; SSE-NEXT: packsswb %xmm5, %xmm4
+; SSE-NEXT: packssdw %xmm5, %xmm4
; SSE-NEXT: packsswb %xmm6, %xmm4
; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v16f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v16f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -676,7 +678,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v16f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -684,7 +686,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16f32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -692,10 +694,10 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16f32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x float> %a0, %a1
@@ -704,7 +706,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-LABEL: test_cmp_v8i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm8, %xmm7
; SSE2-NEXT: pxor %xmm8, %xmm3
@@ -726,7 +728,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: packsswb %xmm7, %xmm2
+; SSE2-NEXT: packssdw %xmm7, %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
@@ -747,69 +749,71 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: packsswb %xmm3, %xmm0
-; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v8i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm7, %xmm3
; SSE42-NEXT: pcmpgtq %xmm6, %xmm2
-; SSE42-NEXT: packsswb %xmm3, %xmm2
+; SSE42-NEXT: packssdw %xmm3, %xmm2
; SSE42-NEXT: pcmpgtq %xmm5, %xmm1
; SSE42-NEXT: pcmpgtq %xmm4, %xmm0
-; SSE42-NEXT: packsswb %xmm1, %xmm0
-; SSE42-NEXT: packsswb %xmm2, %xmm0
+; SSE42-NEXT: packssdw %xmm1, %xmm0
+; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v8i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v8i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v8i64:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v8i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <8 x i64> %a0, %a1
@@ -818,34 +822,34 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: test_cmp_v16i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm7, %xmm3
; SSE-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: pcmpgtd %xmm5, %xmm1
; SSE-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -856,7 +860,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v16i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -864,7 +868,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -872,10 +876,10 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i32> %a0, %a1
@@ -884,18 +888,13 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-LABEL: test_cmp_v32i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtw %xmm5, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pcmpgtw %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm1, %xmm0
; SSE2-NEXT: pcmpgtw %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: pcmpgtw %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packsswb %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
@@ -998,7 +997,7 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v32i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtw %xmm4, %xmm0
; SSE42-NEXT: pcmpgtw %xmm5, %xmm1
; SSE42-NEXT: pcmpgtw %xmm6, %xmm2
@@ -1103,7 +1102,7 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm5, %xmm4
@@ -1118,7 +1117,7 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -1126,7 +1125,7 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -1137,7 +1136,7 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -1148,10 +1147,10 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
%1 = icmp sgt <32 x i16> %a0, %a1
ret <32 x i1> %1
@@ -1159,7 +1158,7 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-LABEL: test_cmp_v64i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtb %xmm4, %xmm0
; SSE2-NEXT: pcmpgtb %xmm5, %xmm1
; SSE2-NEXT: pcmpgtb %xmm6, %xmm2
@@ -1364,7 +1363,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v64i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtb %xmm4, %xmm0
; SSE42-NEXT: pcmpgtb %xmm5, %xmm1
; SSE42-NEXT: pcmpgtb %xmm6, %xmm2
@@ -1565,7 +1564,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1771,7 +1770,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -1973,29 +1972,29 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm3
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vmovdqa %xmm4, %xmm2
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm4
; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm3
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm2
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
@@ -2009,7 +2008,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind {
; SSE-LABEL: test_cmp_v16f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd %xmm0, %xmm8
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11
@@ -2021,50 +2020,50 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm15
; SSE-NEXT: cmpltpd %xmm7, %xmm15
; SSE-NEXT: cmpltpd %xmm6, %xmm14
-; SSE-NEXT: packsswb %xmm15, %xmm14
+; SSE-NEXT: packssdw %xmm15, %xmm14
; SSE-NEXT: cmpltpd %xmm5, %xmm13
; SSE-NEXT: cmpltpd %xmm4, %xmm9
-; SSE-NEXT: packsswb %xmm13, %xmm9
-; SSE-NEXT: packsswb %xmm14, %xmm9
+; SSE-NEXT: packssdw %xmm13, %xmm9
+; SSE-NEXT: packssdw %xmm14, %xmm9
; SSE-NEXT: cmpltpd %xmm3, %xmm12
; SSE-NEXT: cmpltpd %xmm2, %xmm10
-; SSE-NEXT: packsswb %xmm12, %xmm10
+; SSE-NEXT: packssdw %xmm12, %xmm10
; SSE-NEXT: cmpltpd %xmm1, %xmm11
; SSE-NEXT: cmpltpd %xmm8, %xmm0
-; SSE-NEXT: packsswb %xmm11, %xmm0
-; SSE-NEXT: packsswb %xmm10, %xmm0
+; SSE-NEXT: packssdw %xmm11, %xmm0
+; SSE-NEXT: packssdw %xmm10, %xmm0
; SSE-NEXT: packsswb %xmm9, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_cmp_v16f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT: vpacksswb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackssdw %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpacksswb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v16f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltpd %ymm3, %ymm7, %ymm3
; AVX2-NEXT: vcmpltpd %ymm2, %ymm6, %ymm2
-; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vcmpltpd %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vcmpltpd %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
@@ -2074,350 +2073,32 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v16f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm5
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movq $-1, %rcx
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512F-NEXT: vucomisd %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm7
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512F-NEXT: vucomisd %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm6
-; AVX512F-NEXT: vucomisd %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm7
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512F-NEXT: vucomisd %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512F-NEXT: vucomisd %xmm2, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vucomisd %xmm2, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm4
-; AVX512F-NEXT: vucomisd %xmm2, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vucomisd %xmm2, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm5
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512F-NEXT: vucomisd %xmm3, %xmm1
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vucomisd %xmm3, %xmm1
-; AVX512F-NEXT: cmovaq %rcx, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm2, %k0
+; AVX512F-NEXT: vcmpltpd %zmm1, %zmm3, %k1
+; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm5
-; AVX512DQ-NEXT: xorl %eax, %eax
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movq $-1, %rcx
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm6
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm7
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm5
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm6
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm7
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm4
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm5
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm5
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm3, %xmm1
-; AVX512DQ-NEXT: cmovaq %rcx, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0
+; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm3, %k1
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16f64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm5
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movq $-1, %rcx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm6
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm7
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm6
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm7
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm5
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512BW-NEXT: vucomisd %xmm3, %xmm1
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512BW-NEXT: vucomisd %xmm3, %xmm1
-; AVX512BW-NEXT: cmovaq %rcx, %rax
-; AVX512BW-NEXT: vmovq %rax, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm2, %k0
+; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm3, %k1
+; AVX512BW-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = fcmp ogt <16 x double> %a0, %a1
@@ -2426,67 +2107,30 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; SSE2-LABEL: test_cmp_v32f32:
-; SSE2: # BB#0:
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
; SSE2-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
; SSE2-NEXT: cmpltps %xmm3, %xmm15
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: cmpltps %xmm2, %xmm14
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: packssdw %xmm15, %xmm14
; SSE2-NEXT: cmpltps %xmm1, %xmm13
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: cmpltps %xmm0, %xmm12
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: cmpltps %xmm7, %xmm11
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: cmpltps %xmm0, %xmm8
+; SSE2-NEXT: packssdw %xmm13, %xmm8
+; SSE2-NEXT: packsswb %xmm14, %xmm8
+; SSE2-NEXT: cmpltps %xmm7, %xmm12
; SSE2-NEXT: cmpltps %xmm6, %xmm10
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: cmpltps %xmm5, %xmm9
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: cmpltps %xmm4, %xmm8
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: packssdw %xmm12, %xmm10
+; SSE2-NEXT: cmpltps %xmm5, %xmm11
+; SSE2-NEXT: cmpltps %xmm4, %xmm9
+; SSE2-NEXT: packssdw %xmm11, %xmm9
+; SSE2-NEXT: packsswb %xmm10, %xmm9
+; SSE2-NEXT: movdqa %xmm9, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
; SSE2-NEXT: movb %al, 2(%rdi)
@@ -2535,7 +2179,7 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; SSE2-NEXT: movb %cl, 2(%rdi)
; SSE2-NEXT: andb $1, %al
; SSE2-NEXT: movb %al, 2(%rdi)
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
; SSE2-NEXT: movb %al, (%rdi)
@@ -2588,135 +2232,114 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v32f32:
-; SSE42: # BB#0:
-; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm12
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
-; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
-; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14
; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT: cmpltps %xmm1, %xmm15
-; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT: pshufb %xmm1, %xmm15
; SSE42-NEXT: cmpltps %xmm0, %xmm8
-; SSE42-NEXT: pshufb %xmm1, %xmm8
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0]
-; SSE42-NEXT: psllw $15, %xmm8
-; SSE42-NEXT: psraw $15, %xmm8
-; SSE42-NEXT: cmpltps %xmm3, %xmm14
-; SSE42-NEXT: pshufb %xmm1, %xmm14
-; SSE42-NEXT: cmpltps %xmm2, %xmm9
-; SSE42-NEXT: pshufb %xmm1, %xmm9
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm14[0]
-; SSE42-NEXT: psllw $15, %xmm9
-; SSE42-NEXT: psraw $15, %xmm9
+; SSE42-NEXT: cmpltps %xmm1, %xmm9
+; SSE42-NEXT: cmpltps %xmm2, %xmm10
+; SSE42-NEXT: cmpltps %xmm3, %xmm11
+; SSE42-NEXT: cmpltps %xmm4, %xmm12
; SSE42-NEXT: cmpltps %xmm5, %xmm13
-; SSE42-NEXT: pshufb %xmm1, %xmm13
-; SSE42-NEXT: cmpltps %xmm4, %xmm10
-; SSE42-NEXT: pshufb %xmm1, %xmm10
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm13[0]
-; SSE42-NEXT: psllw $15, %xmm10
-; SSE42-NEXT: psraw $15, %xmm10
-; SSE42-NEXT: cmpltps %xmm7, %xmm12
-; SSE42-NEXT: pshufb %xmm1, %xmm12
-; SSE42-NEXT: cmpltps %xmm6, %xmm11
-; SSE42-NEXT: pshufb %xmm1, %xmm11
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0]
-; SSE42-NEXT: psllw $15, %xmm11
-; SSE42-NEXT: psraw $15, %xmm11
-; SSE42-NEXT: pextrb $14, %xmm11, %eax
+; SSE42-NEXT: cmpltps %xmm6, %xmm14
+; SSE42-NEXT: cmpltps %xmm7, %xmm15
+; SSE42-NEXT: pextrb $12, %xmm15, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $12, %xmm11, %eax
+; SSE42-NEXT: pextrb $8, %xmm15, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $10, %xmm11, %eax
+; SSE42-NEXT: pextrb $4, %xmm15, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $8, %xmm11, %eax
+; SSE42-NEXT: pextrb $0, %xmm15, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $6, %xmm11, %eax
+; SSE42-NEXT: pextrb $12, %xmm14, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $4, %xmm11, %eax
+; SSE42-NEXT: pextrb $8, %xmm14, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $2, %xmm11, %eax
+; SSE42-NEXT: pextrb $4, %xmm14, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $0, %xmm11, %eax
+; SSE42-NEXT: pextrb $0, %xmm14, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm10, %eax
+; SSE42-NEXT: pextrb $12, %xmm13, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $12, %xmm10, %eax
+; SSE42-NEXT: pextrb $8, %xmm13, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $10, %xmm10, %eax
+; SSE42-NEXT: pextrb $4, %xmm13, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $8, %xmm10, %eax
+; SSE42-NEXT: pextrb $0, %xmm13, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $6, %xmm10, %eax
+; SSE42-NEXT: pextrb $12, %xmm12, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $4, %xmm10, %eax
+; SSE42-NEXT: pextrb $8, %xmm12, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $2, %xmm10, %eax
+; SSE42-NEXT: pextrb $4, %xmm12, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $0, %xmm10, %eax
+; SSE42-NEXT: pextrb $0, %xmm12, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm9, %eax
+; SSE42-NEXT: pextrb $12, %xmm11, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $12, %xmm9, %eax
+; SSE42-NEXT: pextrb $8, %xmm11, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $10, %xmm9, %eax
+; SSE42-NEXT: pextrb $4, %xmm11, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $8, %xmm9, %eax
+; SSE42-NEXT: pextrb $0, %xmm11, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $6, %xmm9, %eax
+; SSE42-NEXT: pextrb $12, %xmm10, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $4, %xmm9, %eax
+; SSE42-NEXT: pextrb $8, %xmm10, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $2, %xmm9, %eax
+; SSE42-NEXT: pextrb $4, %xmm10, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $0, %xmm9, %eax
+; SSE42-NEXT: pextrb $0, %xmm10, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $14, %xmm8, %eax
+; SSE42-NEXT: pextrb $12, %xmm9, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $12, %xmm8, %eax
+; SSE42-NEXT: pextrb $8, %xmm9, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $10, %xmm8, %eax
+; SSE42-NEXT: pextrb $4, %xmm9, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $8, %xmm8, %eax
+; SSE42-NEXT: pextrb $0, %xmm9, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $6, %xmm8, %eax
+; SSE42-NEXT: pextrb $12, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $4, %xmm8, %eax
+; SSE42-NEXT: pextrb $8, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $2, %xmm8, %eax
+; SSE42-NEXT: pextrb $4, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
; SSE42-NEXT: pextrb $0, %xmm8, %eax
@@ -2726,643 +2349,77 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v32f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcmpltps %ymm3, %ymm7, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT: vpacksswb %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackssdw %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vcmpltps %ymm2, %ymm6, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpacksswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcmpltps %ymm1, %ymm5, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltps %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v32f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcmpltps %ymm3, %ymm7, %ymm3
; AVX2-NEXT: vcmpltps %ymm2, %ymm6, %ymm2
-; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vcmpltps %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vcmpltps %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v32f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm6
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: vucomiss %xmm5, %xmm7
-; AVX512F-NEXT: movl $-1, %ecx
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm4, %xmm6
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm5
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512F-NEXT: vucomiss %xmm7, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm4, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
-; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm7
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm5, %xmm7
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm4
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
-; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm7
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm5, %xmm7
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm4
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm2, %xmm0
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm5
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm7
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm2, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm5
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm4, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm2, %xmm5
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm4
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm7
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm2, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
-; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm6
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm5, %xmm7
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm4, %xmm6
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm5
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
-; AVX512F-NEXT: vucomiss %xmm7, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm4, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm0, %xmm5
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm4
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512F-NEXT: vucomiss %xmm6, %xmm7
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm0, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vucomiss %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vucomiss %xmm3, %xmm1
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmoval %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm4
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX512F-NEXT: vucomiss %xmm5, %xmm6
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmoval %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vucomiss %xmm3, %xmm1
-; AVX512F-NEXT: cmoval %ecx, %eax
-; AVX512F-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltps %zmm0, %zmm2, %k1
+; AVX512F-NEXT: movl {{.*}}(%rip), %eax
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512F-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512DQ-NEXT: vextractf32x4 $3, %zmm0, %xmm6
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512DQ-NEXT: xorl %eax, %eax
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
-; AVX512DQ-NEXT: movl $-1, %ecx
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm5
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm7, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
-; AVX512DQ-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, %xmm7
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm4
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
-; AVX512DQ-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512DQ-NEXT: vextractf32x4 $1, %zmm0, %xmm7
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm4
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm5
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm2, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512DQ-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; AVX512DQ-NEXT: vextractf32x4 $3, %zmm1, %xmm5
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm4
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm2, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
-; AVX512DQ-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512DQ-NEXT: vextractf32x4 $2, %zmm1, %xmm6
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm7
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm5
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm7, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vextractf32x4 $1, %zmm3, %xmm0
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512DQ-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm4
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm6, %xmm7
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm0, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmoval %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm4
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX512DQ-NEXT: vucomiss %xmm5, %xmm6
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmoval %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1
-; AVX512DQ-NEXT: cmoval %ecx, %eax
-; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcmpltps %zmm0, %zmm2, %k1
+; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm6
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
-; AVX512BW-NEXT: movl $-1, %ecx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm8
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512BW-NEXT: vucomiss %xmm7, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm8, %xmm5
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm8
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm7
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm4
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm4, %ymm8
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm7
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm4
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm9
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm5[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm7[1,0]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm9, %xmm4
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm0
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm0[1,0]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm5
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm5
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm4
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm2[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm2, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm2
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm6
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm7
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
-; AVX512BW-NEXT: vucomiss %xmm7, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm0
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm0
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm0, %xmm5
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm4
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512BW-NEXT: vucomiss %xmm6, %xmm7
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm0, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm4, %xmm0
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512BW-NEXT: vucomiss %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vucomiss %xmm3, %xmm1
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmoval %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm4
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX512BW-NEXT: vucomiss %xmm5, %xmm6
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmoval %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BW-NEXT: vucomiss %xmm3, %xmm1
-; AVX512BW-NEXT: cmoval %ecx, %eax
-; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm4, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltps %zmm0, %zmm2, %k0
+; AVX512BW-NEXT: vcmpltps %zmm1, %zmm3, %k1
+; AVX512BW-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
%1 = fcmp ogt <32 x float> %a0, %a1
ret <32 x i1> %1
@@ -3370,7 +2427,7 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE2-LABEL: test_cmp_v16i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm8, %xmm7
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
@@ -3394,7 +2451,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm11, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm10
-; SSE2-NEXT: packsswb %xmm9, %xmm10
+; SSE2-NEXT: packssdw %xmm9, %xmm10
; SSE2-NEXT: pxor %xmm8, %xmm5
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: pxor %xmm8, %xmm7
@@ -3417,8 +2474,8 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm5, %xmm4
-; SSE2-NEXT: packsswb %xmm6, %xmm4
-; SSE2-NEXT: packsswb %xmm10, %xmm4
+; SSE2-NEXT: packssdw %xmm6, %xmm4
+; SSE2-NEXT: packssdw %xmm10, %xmm4
; SSE2-NEXT: pxor %xmm8, %xmm3
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm5
@@ -3441,7 +2498,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: packsswb %xmm5, %xmm2
+; SSE2-NEXT: packssdw %xmm5, %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: pxor %xmm8, %xmm3
@@ -3463,67 +2520,67 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: packsswb %xmm3, %xmm0
-; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm3, %xmm0
+; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: packsswb %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v16i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT: packsswb %xmm7, %xmm6
+; SSE42-NEXT: packssdw %xmm7, %xmm6
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT: packsswb %xmm5, %xmm4
-; SSE42-NEXT: packsswb %xmm6, %xmm4
+; SSE42-NEXT: packssdw %xmm5, %xmm4
+; SSE42-NEXT: packssdw %xmm6, %xmm4
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT: packsswb %xmm3, %xmm2
+; SSE42-NEXT: packssdw %xmm3, %xmm2
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT: packsswb %xmm1, %xmm0
-; SSE42-NEXT: packsswb %xmm2, %xmm0
+; SSE42-NEXT: packssdw %xmm1, %xmm0
+; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm4, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v16i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v16i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpcmpgtq %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
@@ -3533,398 +2590,32 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v16i64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: cmpq %rcx, %rdx
-; AVX512F-NEXT: movq $-1, %rcx
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm7
-; AVX512F-NEXT: vmovq %xmm5, %rdx
-; AVX512F-NEXT: vmovq %xmm6, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512F-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm7
-; AVX512F-NEXT: vmovq %xmm5, %rdx
-; AVX512F-NEXT: vmovq %xmm6, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vmovq %xmm2, %rdx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vmovq %xmm2, %rdx
-; AVX512F-NEXT: vmovq %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm6
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vmovq %xmm3, %rdx
-; AVX512F-NEXT: vmovq %xmm1, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: cmovgq %rcx, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtq %zmm2, %zmm0, %k0
+; AVX512F-NEXT: vpcmpgtq %zmm3, %zmm1, %k1
+; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v16i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm5
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT: xorl %eax, %eax
-; AVX512DQ-NEXT: cmpq %rcx, %rdx
-; AVX512DQ-NEXT: movq $-1, %rcx
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm6
-; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm7
-; AVX512DQ-NEXT: vmovq %xmm5, %rdx
-; AVX512DQ-NEXT: vmovq %xmm6, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm5
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm6
-; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm7
-; AVX512DQ-NEXT: vmovq %xmm5, %rdx
-; AVX512DQ-NEXT: vmovq %xmm6, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vmovq %xmm2, %rdx
-; AVX512DQ-NEXT: vmovq %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm2, %rdx
-; AVX512DQ-NEXT: vmovq %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm5
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm5
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm6
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rdx
-; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm3, %rdx
-; AVX512DQ-NEXT: vmovq %xmm1, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: cmovgq %rcx, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpcmpgtq %zmm2, %zmm0, %k0
+; AVX512DQ-NEXT: vpcmpgtq %zmm3, %zmm1, %k1
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v16i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: cmpq %rcx, %rdx
-; AVX512BW-NEXT: movq $-1, %rcx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm7
-; AVX512BW-NEXT: vmovq %xmm5, %rdx
-; AVX512BW-NEXT: vmovq %xmm6, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512BW-NEXT: vpextrq $1, %xmm6, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm7
-; AVX512BW-NEXT: vmovq %xmm5, %rdx
-; AVX512BW-NEXT: vmovq %xmm6, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vmovq %xmm2, %rdx
-; AVX512BW-NEXT: vmovq %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vmovq %xmm2, %rdx
-; AVX512BW-NEXT: vmovq %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm6
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vmovq %xmm3, %rdx
-; AVX512BW-NEXT: vmovq %xmm1, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: cmovgq %rcx, %rax
-; AVX512BW-NEXT: vmovq %rax, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtq %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm1, %k1
+; AVX512BW-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%1 = icmp sgt <16 x i64> %a0, %a1
@@ -3933,58 +2624,21 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; SSE2-LABEL: test_cmp_v32i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm2, %xmm0
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm2
+; SSE2-NEXT: packssdw %xmm7, %xmm6
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm1[0]
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm2, %xmm4
+; SSE2-NEXT: packssdw %xmm5, %xmm4
+; SSE2-NEXT: packsswb %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
@@ -4087,127 +2741,106 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v32i32:
-; SSE42: # BB#0:
-; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT: pshufb %xmm8, %xmm1
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT: pshufb %xmm8, %xmm0
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT: psllw $15, %xmm0
-; SSE42-NEXT: psraw $15, %xmm0
-; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT: pshufb %xmm8, %xmm3
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT: pshufb %xmm8, %xmm2
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE42-NEXT: psllw $15, %xmm2
-; SSE42-NEXT: psraw $15, %xmm2
-; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT: pshufb %xmm8, %xmm5
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT: pshufb %xmm8, %xmm4
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE42-NEXT: psllw $15, %xmm4
-; SSE42-NEXT: psraw $15, %xmm4
-; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT: pshufb %xmm8, %xmm7
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT: pshufb %xmm8, %xmm6
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSE42-NEXT: psllw $15, %xmm6
-; SSE42-NEXT: psraw $15, %xmm6
-; SSE42-NEXT: pextrb $14, %xmm6, %eax
+; SSE42-NEXT: pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pextrb $12, %xmm7, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $12, %xmm6, %eax
+; SSE42-NEXT: pextrb $8, %xmm7, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $10, %xmm6, %eax
+; SSE42-NEXT: pextrb $4, %xmm7, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $8, %xmm6, %eax
+; SSE42-NEXT: pextrb $0, %xmm7, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $6, %xmm6, %eax
+; SSE42-NEXT: pextrb $12, %xmm6, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $4, %xmm6, %eax
+; SSE42-NEXT: pextrb $8, %xmm6, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $2, %xmm6, %eax
+; SSE42-NEXT: pextrb $4, %xmm6, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
; SSE42-NEXT: pextrb $0, %xmm6, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: pextrb $12, %xmm5, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: pextrb $8, %xmm5, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: pextrb $4, %xmm5, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: pextrb $0, %xmm5, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: pextrb $12, %xmm4, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: pextrb $8, %xmm4, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: pextrb $4, %xmm4, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
; SSE42-NEXT: pextrb $0, %xmm4, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm2, %eax
+; SSE42-NEXT: pextrb $12, %xmm3, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $12, %xmm2, %eax
+; SSE42-NEXT: pextrb $8, %xmm3, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $10, %xmm2, %eax
+; SSE42-NEXT: pextrb $4, %xmm3, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $8, %xmm2, %eax
+; SSE42-NEXT: pextrb $0, %xmm3, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $6, %xmm2, %eax
+; SSE42-NEXT: pextrb $12, %xmm2, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $4, %xmm2, %eax
+; SSE42-NEXT: pextrb $8, %xmm2, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $2, %xmm2, %eax
+; SSE42-NEXT: pextrb $4, %xmm2, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
; SSE42-NEXT: pextrb $0, %xmm2, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: pextrb $12, %xmm1, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: pextrb $8, %xmm1, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: pextrb $4, %xmm1, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: pextrb $0, %xmm1, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
; SSE42-NEXT: pextrb $0, %xmm0, %eax
@@ -4217,699 +2850,85 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v32i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpgtd %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v32i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpcmpgtd %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v32i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm4, %ecx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: cmpl %ecx, %edx
-; AVX512F-NEXT: movl $-1, %ecx
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm4, %esi
-; AVX512F-NEXT: vmovd %xmm5, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm6
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512F-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm5, %esi
-; AVX512F-NEXT: vmovd %xmm6, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm7
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512F-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512F-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm5, %esi
-; AVX512F-NEXT: vmovd %xmm6, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm7
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512F-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512F-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512F-NEXT: vpextrd $1, %xmm0, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm2, %esi
-; AVX512F-NEXT: vmovd %xmm0, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm6
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm0, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm0, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; AVX512F-NEXT: movl {{.*}}(%rip), %eax
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm4, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm2, %esi
-; AVX512F-NEXT: vmovd %xmm4, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm5
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm4, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm4, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
-; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512F-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm4, %esi
-; AVX512F-NEXT: vmovd %xmm5, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm6
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512F-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm4, %esi
-; AVX512F-NEXT: vmovd %xmm5, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm6
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512F-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512F-NEXT: vpextrd $1, %xmm3, %edx
-; AVX512F-NEXT: vpextrd $1, %xmm1, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vmovd %xmm3, %esi
-; AVX512F-NEXT: vmovd %xmm1, %edi
-; AVX512F-NEXT: cmpl %esi, %edi
-; AVX512F-NEXT: movl $0, %esi
-; AVX512F-NEXT: cmovgl %ecx, %esi
-; AVX512F-NEXT: vmovd %esi, %xmm5
-; AVX512F-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpextrd $2, %xmm3, %edx
-; AVX512F-NEXT: vpextrd $2, %xmm1, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgl %ecx, %edx
-; AVX512F-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512F-NEXT: vpextrd $3, %xmm3, %edx
-; AVX512F-NEXT: vpextrd $3, %xmm1, %esi
-; AVX512F-NEXT: cmpl %edx, %esi
-; AVX512F-NEXT: cmovgl %ecx, %eax
-; AVX512F-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT: vpextrd $1, %xmm4, %ecx
-; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512DQ-NEXT: xorl %eax, %eax
-; AVX512DQ-NEXT: cmpl %ecx, %edx
-; AVX512DQ-NEXT: movl $-1, %ecx
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm4, %esi
-; AVX512DQ-NEXT: vmovd %xmm5, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm6
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm5, %esi
-; AVX512DQ-NEXT: vmovd %xmm6, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm7
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512DQ-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512DQ-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm5, %esi
-; AVX512DQ-NEXT: vmovd %xmm6, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm7
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512DQ-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512DQ-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512DQ-NEXT: vpextrd $1, %xmm0, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm2, %esi
-; AVX512DQ-NEXT: vmovd %xmm0, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm6
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm0, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm0, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpcmpgtd %zmm2, %zmm0, %k1
+; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512DQ-NEXT: vpextrd $1, %xmm4, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm2, %esi
-; AVX512DQ-NEXT: vmovd %xmm4, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm5
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm4, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm4, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm4, %esi
-; AVX512DQ-NEXT: vmovd %xmm5, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm6
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512DQ-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512DQ-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512DQ-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm4, %esi
-; AVX512DQ-NEXT: vmovd %xmm5, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm6
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512DQ-NEXT: vpextrd $1, %xmm3, %edx
-; AVX512DQ-NEXT: vpextrd $1, %xmm1, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vmovd %xmm3, %esi
-; AVX512DQ-NEXT: vmovd %xmm1, %edi
-; AVX512DQ-NEXT: cmpl %esi, %edi
-; AVX512DQ-NEXT: movl $0, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %esi
-; AVX512DQ-NEXT: vmovd %esi, %xmm5
-; AVX512DQ-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpextrd $2, %xmm3, %edx
-; AVX512DQ-NEXT: vpextrd $2, %xmm1, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgl %ecx, %edx
-; AVX512DQ-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpextrd $3, %xmm3, %edx
-; AVX512DQ-NEXT: vpextrd $3, %xmm1, %esi
-; AVX512DQ-NEXT: cmpl %edx, %esi
-; AVX512DQ-NEXT: cmovgl %ecx, %eax
-; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vpextrd $1, %xmm4, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: cmpl %ecx, %edx
-; AVX512BW-NEXT: movl $-1, %ecx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm4, %esi
-; AVX512BW-NEXT: vmovd %xmm5, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm5, %esi
-; AVX512BW-NEXT: vmovd %xmm6, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm7
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrd $1, %xmm5, %edx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512BW-NEXT: vpextrd $1, %xmm6, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm5, %esi
-; AVX512BW-NEXT: vmovd %xmm6, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm7
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $2, %xmm5, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm6, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrd $3, %xmm5, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm6, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm7, %xmm5
-; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512BW-NEXT: vpextrd $1, %xmm0, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm2, %esi
-; AVX512BW-NEXT: vmovd %xmm0, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm0, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm0, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vpextrd $1, %xmm2, %edx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrd $1, %xmm4, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm2, %esi
-; AVX512BW-NEXT: vmovd %xmm4, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrd $2, %xmm2, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm4, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrd $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm4, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm5, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm4, %esi
-; AVX512BW-NEXT: vmovd %xmm5, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrd $1, %xmm4, %edx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT: vpextrd $1, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm4, %esi
-; AVX512BW-NEXT: vmovd %xmm5, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $2, %xmm4, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrd $3, %xmm4, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm5, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $3, %edx, %xmm6, %xmm4
-; AVX512BW-NEXT: vpextrd $1, %xmm3, %edx
-; AVX512BW-NEXT: vpextrd $1, %xmm1, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vmovd %xmm3, %esi
-; AVX512BW-NEXT: vmovd %xmm1, %edi
-; AVX512BW-NEXT: cmpl %esi, %edi
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %esi
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrd $2, %xmm3, %edx
-; AVX512BW-NEXT: vpextrd $2, %xmm1, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgl %ecx, %edx
-; AVX512BW-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrd $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrd $3, %xmm1, %esi
-; AVX512BW-NEXT: cmpl %edx, %esi
-; AVX512BW-NEXT: cmovgl %ecx, %eax
-; AVX512BW-NEXT: vpinsrd $3, %eax, %xmm5, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpcmpgtd %zmm3, %zmm1, %k1
+; AVX512BW-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
%1 = icmp sgt <32 x i32> %a0, %a1
ret <32 x i1> %1
@@ -4917,28 +2936,19 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
; SSE2-LABEL: test_cmp_v64i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm0
-; SSE2-NEXT: pand %xmm8, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm1, %xmm0
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packsswb %xmm3, %xmm2
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: pand %xmm8, %xmm4
-; SSE2-NEXT: packuswb %xmm5, %xmm4
+; SSE2-NEXT: packsswb %xmm5, %xmm4
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT: pand %xmm8, %xmm7
; SSE2-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: packuswb %xmm7, %xmm6
+; SSE2-NEXT: packsswb %xmm7, %xmm6
; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
@@ -5139,7 +3149,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v64i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pcmpgtw {{[0-9]+}}(%rsp), %xmm2
@@ -5344,7 +3354,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v64i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtw %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -5558,7 +3568,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v64i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
@@ -5764,987 +3774,73 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v64i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3
-; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm3
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT: vpslld $31, %zmm2, %zmm2
-; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: kshiftlw $14, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: kshiftlw $15, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $13, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $12, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $11, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $10, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $9, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $8, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $7, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $6, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $5, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $4, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $3, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $2, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftlw $1, %k0, %k1
-; AVX512F-NEXT: kshiftrw $15, %k1, %k1
-; AVX512F-NEXT: kmovw %k1, %eax
-; AVX512F-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: kshiftrw $15, %k0, %k0
-; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512F-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512F-NEXT: # kill: def %xmm2 killed %xmm2 killed %ymm2
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v64i16:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpmovsxwd %ymm3, %zmm3
-; AVX512DQ-NEXT: vpslld $31, %zmm3, %zmm3
-; AVX512DQ-NEXT: vptestmd %zmm3, %zmm3, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm3
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtw %ymm6, %ymm2, %ymm2
; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm2
-; AVX512DQ-NEXT: vpslld $31, %zmm2, %zmm2
-; AVX512DQ-NEXT: vptestmd %zmm2, %zmm2, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm2
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2
+; AVX512DQ-NEXT: vpcmpgtw %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpmovsxwd %ymm3, %zmm3
+; AVX512DQ-NEXT: vpmovdb %zmm3, %xmm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512DQ-NEXT: vpsllw $7, %ymm2, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpgtb %ymm2, %ymm6, %ymm2
-; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512DQ-NEXT: vpslld $31, %zmm1, %zmm1
-; AVX512DQ-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm1
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX512DQ-NEXT: vpcmpgtw %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0
-; AVX512DQ-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512DQ-NEXT: kshiftlw $14, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %ecx
-; AVX512DQ-NEXT: vmovd %ecx, %xmm0
-; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $13, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $12, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $11, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $10, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $9, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $8, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $7, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $6, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $5, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $4, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $3, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $2, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftlw $1, %k0, %k1
-; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
-; AVX512DQ-NEXT: kmovw %k1, %eax
-; AVX512DQ-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpcmpgtw %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm6, %ymm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512DQ-NEXT: # kill: def %xmm2 killed %xmm2 killed %ymm2
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v64i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %ecx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm5
-; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: cmpw %cx, %dx
-; AVX512BW-NEXT: movw $-1, %cx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm4, %esi
-; AVX512BW-NEXT: vmovd %xmm5, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm6
-; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm5, %esi
-; AVX512BW-NEXT: vmovd %xmm6, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm7
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrw $1, %xmm5, %edx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm6
-; AVX512BW-NEXT: vpextrw $1, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm5, %esi
-; AVX512BW-NEXT: vmovd %xmm6, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm7
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $2, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $3, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $4, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $5, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $6, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm7, %xmm7
-; AVX512BW-NEXT: vpextrw $7, %xmm5, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm6, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm7, %xmm5
-; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $1, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm2, %esi
-; AVX512BW-NEXT: vmovd %xmm0, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm0, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vpextrw $1, %xmm2, %edx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm2, %esi
-; AVX512BW-NEXT: vmovd %xmm4, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $3, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $5, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $7, %xmm2, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm5, %xmm2
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm5
-; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm4, %esi
-; AVX512BW-NEXT: vmovd %xmm5, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm4, %edx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT: vpextrw $1, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm4, %esi
-; AVX512BW-NEXT: vmovd %xmm5, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm6
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $2, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $3, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $4, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $5, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $6, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm6, %xmm6
-; AVX512BW-NEXT: vpextrw $7, %xmm4, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm5, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $7, %edx, %xmm6, %xmm4
-; AVX512BW-NEXT: vpextrw $1, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vmovd %xmm3, %esi
-; AVX512BW-NEXT: vmovd %xmm1, %edi
-; AVX512BW-NEXT: cmpw %si, %di
-; AVX512BW-NEXT: movl $0, %esi
-; AVX512BW-NEXT: cmovgw %cx, %si
-; AVX512BW-NEXT: vmovd %esi, %xmm5
-; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $2, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $2, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $3, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $3, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $4, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $4, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $4, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $5, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $5, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $6, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $6, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgw %cx, %dx
-; AVX512BW-NEXT: vpinsrw $6, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT: vpextrw $7, %xmm3, %edx
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %esi
-; AVX512BW-NEXT: cmpw %dx, %si
-; AVX512BW-NEXT: cmovgw %cx, %ax
-; AVX512BW-NEXT: vpinsrw $7, %eax, %xmm5, %xmm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtw %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm1, %k1
+; AVX512BW-NEXT: kunpckdq %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
%1 = icmp sgt <64 x i16> %a0, %a1
ret <64 x i1> %1
@@ -6752,7 +3848,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; SSE2-LABEL: test_cmp_v128i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm1
@@ -7159,7 +4255,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v128i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pcmpgtb {{[0-9]+}}(%rsp), %xmm2
@@ -7556,7 +4652,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v128i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtb %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -7962,7 +5058,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v128i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
@@ -8360,7 +5456,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v128i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
@@ -8406,7 +5502,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v128i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpcmpgtb %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpcmpgtb %ymm6, %ymm2, %ymm2
@@ -8452,7 +5548,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v128i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm1, %k0
; AVX512BW-NEXT: vpcmpgtb %zmm2, %zmm0, %k1
; AVX512BW-NEXT: vpmovm2b %k1, %zmm0
@@ -8468,7 +5564,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind {
; SSE2-LABEL: test_cmp_v32f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: cmpltpd %xmm7, %xmm8
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
@@ -8479,96 +5575,43 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: cmpltpd %xmm4, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE2-NEXT: pslld $31, %xmm7
-; SSE2-NEXT: psrad $31, %xmm7
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
-; SSE2-NEXT: pslld $31, %xmm5
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: cmpltpd %xmm3, %xmm5
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE2-NEXT: cmpltpd %xmm2, %xmm6
+; SSE2-NEXT: packssdw %xmm7, %xmm5
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: cmpltpd %xmm3, %xmm4
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: cmpltpd %xmm2, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm5[0,2]
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: cmpltpd %xmm1, %xmm5
+; SSE2-NEXT: cmpltpd %xmm1, %xmm2
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: cmpltpd %xmm0, %xmm1
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm1
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: psllw $15, %xmm4
-; SSE2-NEXT: psraw $15, %xmm4
-; SSE2-NEXT: pslld $31, %xmm6
-; SSE2-NEXT: psrad $31, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
-; SSE2-NEXT: pslld $31, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,2]
+; SSE2-NEXT: packsswb %xmm5, %xmm1
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: pslld $31, %xmm5
-; SSE2-NEXT: psrad $31, %xmm5
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
-; SSE2-NEXT: pslld $31, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT: pslld $31, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; SSE2-NEXT: psllw $15, %xmm3
-; SSE2-NEXT: psraw $15, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: psllw $15, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE2-NEXT: packssdw %xmm4, %xmm0
+; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
; SSE2-NEXT: movb %al, 2(%rdi)
@@ -8617,7 +5660,7 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; SSE2-NEXT: movb %cl, 2(%rdi)
; SSE2-NEXT: andb $1, %al
; SSE2-NEXT: movb %al, 2(%rdi)
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
; SSE2-NEXT: andb $1, %al
; SSE2-NEXT: movb %al, (%rdi)
@@ -8670,7 +5713,7 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v32f64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pushq %rbp
; SSE42-NEXT: pushq %r15
; SSE42-NEXT: pushq %r14
@@ -8678,178 +5721,147 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; SSE42-NEXT: pushq %r12
; SSE42-NEXT: pushq %rbx
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT: cmpltpd %xmm3, %xmm8
+; SSE42-NEXT: cmpltpd %xmm7, %xmm8
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: cmpltpd %xmm6, %xmm7
+; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm8[0,2]
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: cmpltpd %xmm5, %xmm6
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: cmpltpd %xmm4, %xmm5
+; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
+; SSE42-NEXT: packssdw %xmm7, %xmm5
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: cmpltpd %xmm3, %xmm4
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: cmpltpd %xmm2, %xmm3
-; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2]
+; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
; SSE42-NEXT: cmpltpd %xmm1, %xmm2
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: cmpltpd %xmm0, %xmm1
-; SSE42-NEXT: pslld $31, %xmm3
-; SSE42-NEXT: psrad $31, %xmm3
-; SSE42-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE42-NEXT: movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT: pshufb %xmm8, %xmm3
-; SSE42-NEXT: pslld $31, %xmm1
-; SSE42-NEXT: psrad $31, %xmm1
-; SSE42-NEXT: pshufb %xmm8, %xmm1
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT: cmpltpd %xmm7, %xmm0
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT: cmpltpd %xmm6, %xmm7
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2]
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT: cmpltpd %xmm5, %xmm6
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT: cmpltpd %xmm4, %xmm0
+; SSE42-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
+; SSE42-NEXT: packssdw %xmm3, %xmm1
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: packsswb %xmm5, %xmm1
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT: pslld $31, %xmm7
-; SSE42-NEXT: psrad $31, %xmm7
-; SSE42-NEXT: pshufb %xmm8, %xmm7
-; SSE42-NEXT: pslld $31, %xmm0
-; SSE42-NEXT: psrad $31, %xmm0
-; SSE42-NEXT: pshufb %xmm8, %xmm0
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
-; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT: pslld $31, %xmm4
-; SSE42-NEXT: psrad $31, %xmm4
-; SSE42-NEXT: pshufb %xmm8, %xmm4
-; SSE42-NEXT: pslld $31, %xmm2
-; SSE42-NEXT: psrad $31, %xmm2
-; SSE42-NEXT: pshufb %xmm8, %xmm2
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,2]
+; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: packssdw %xmm3, %xmm2
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; SSE42-NEXT: pslld $31, %xmm5
-; SSE42-NEXT: psrad $31, %xmm5
-; SSE42-NEXT: pshufb %xmm8, %xmm5
-; SSE42-NEXT: pslld $31, %xmm3
-; SSE42-NEXT: psrad $31, %xmm3
-; SSE42-NEXT: pshufb %xmm8, %xmm3
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSE42-NEXT: psllw $15, %xmm3
-; SSE42-NEXT: psraw $15, %xmm3
-; SSE42-NEXT: pextrb $14, %xmm3, %ecx
-; SSE42-NEXT: pextrb $12, %xmm3, %edx
-; SSE42-NEXT: pextrb $10, %xmm3, %r8d
-; SSE42-NEXT: pextrb $8, %xmm3, %r10d
-; SSE42-NEXT: pextrb $6, %xmm3, %r14d
-; SSE42-NEXT: pextrb $4, %xmm3, %r12d
-; SSE42-NEXT: pextrb $2, %xmm3, %ebx
-; SSE42-NEXT: pextrb $0, %xmm3, %eax
-; SSE42-NEXT: psllw $15, %xmm2
-; SSE42-NEXT: psraw $15, %xmm2
-; SSE42-NEXT: andb $1, %cl
-; SSE42-NEXT: movb %cl, 2(%rdi)
-; SSE42-NEXT: andb $1, %dl
-; SSE42-NEXT: movb %dl, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm2, %edx
-; SSE42-NEXT: pextrb $12, %xmm2, %esi
-; SSE42-NEXT: pextrb $10, %xmm2, %r9d
-; SSE42-NEXT: pextrb $8, %xmm2, %r11d
-; SSE42-NEXT: pextrb $6, %xmm2, %r15d
-; SSE42-NEXT: pextrb $4, %xmm2, %r13d
-; SSE42-NEXT: pextrb $2, %xmm2, %ebp
-; SSE42-NEXT: pextrb $0, %xmm2, %ecx
-; SSE42-NEXT: psllw $15, %xmm0
-; SSE42-NEXT: psraw $15, %xmm0
+; SSE42-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; SSE42-NEXT: packssdw %xmm4, %xmm0
+; SSE42-NEXT: packsswb %xmm2, %xmm0
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: pextrb $13, %xmm0, %r8d
+; SSE42-NEXT: pextrb $12, %xmm0, %r9d
+; SSE42-NEXT: pextrb $11, %xmm0, %r10d
+; SSE42-NEXT: pextrb $10, %xmm0, %r11d
+; SSE42-NEXT: pextrb $9, %xmm0, %r14d
+; SSE42-NEXT: pextrb $8, %xmm0, %r15d
+; SSE42-NEXT: pextrb $7, %xmm0, %r12d
+; SSE42-NEXT: pextrb $6, %xmm0, %r13d
+; SSE42-NEXT: pextrb $5, %xmm0, %ebx
+; SSE42-NEXT: pextrb $4, %xmm0, %ebp
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
+; SSE42-NEXT: pextrb $2, %xmm0, %ecx
+; SSE42-NEXT: pextrb $1, %xmm0, %edx
+; SSE42-NEXT: pextrb $0, %xmm0, %esi
; SSE42-NEXT: andb $1, %r8b
; SSE42-NEXT: movb %r8b, 2(%rdi)
+; SSE42-NEXT: andb $1, %r9b
+; SSE42-NEXT: movb %r9b, 2(%rdi)
; SSE42-NEXT: andb $1, %r10b
; SSE42-NEXT: movb %r10b, 2(%rdi)
+; SSE42-NEXT: andb $1, %r11b
+; SSE42-NEXT: movb %r11b, 2(%rdi)
; SSE42-NEXT: andb $1, %r14b
; SSE42-NEXT: movb %r14b, 2(%rdi)
+; SSE42-NEXT: andb $1, %r15b
+; SSE42-NEXT: movb %r15b, 2(%rdi)
; SSE42-NEXT: andb $1, %r12b
; SSE42-NEXT: movb %r12b, 2(%rdi)
+; SSE42-NEXT: andb $1, %r13b
+; SSE42-NEXT: movb %r13b, 2(%rdi)
; SSE42-NEXT: andb $1, %bl
; SSE42-NEXT: movb %bl, 2(%rdi)
+; SSE42-NEXT: andb $1, %bpl
+; SSE42-NEXT: movb %bpl, 2(%rdi)
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
+; SSE42-NEXT: andb $1, %cl
+; SSE42-NEXT: movb %cl, 2(%rdi)
; SSE42-NEXT: andb $1, %dl
; SSE42-NEXT: movb %dl, 2(%rdi)
; SSE42-NEXT: andb $1, %sil
; SSE42-NEXT: movb %sil, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm0, %esi
-; SSE42-NEXT: pextrb $12, %xmm0, %edx
-; SSE42-NEXT: pextrb $10, %xmm0, %r8d
-; SSE42-NEXT: pextrb $8, %xmm0, %r10d
-; SSE42-NEXT: pextrb $6, %xmm0, %r14d
-; SSE42-NEXT: pextrb $4, %xmm0, %r12d
-; SSE42-NEXT: pextrb $2, %xmm0, %ebx
-; SSE42-NEXT: pextrb $0, %xmm0, %eax
-; SSE42-NEXT: psllw $15, %xmm1
-; SSE42-NEXT: psraw $15, %xmm1
-; SSE42-NEXT: andb $1, %r9b
-; SSE42-NEXT: movb %r9b, 2(%rdi)
-; SSE42-NEXT: andb $1, %r11b
-; SSE42-NEXT: movb %r11b, 2(%rdi)
-; SSE42-NEXT: andb $1, %r15b
-; SSE42-NEXT: movb %r15b, 2(%rdi)
-; SSE42-NEXT: andb $1, %r13b
-; SSE42-NEXT: movb %r13b, 2(%rdi)
-; SSE42-NEXT: andb $1, %bpl
-; SSE42-NEXT: movb %bpl, 2(%rdi)
-; SSE42-NEXT: andb $1, %cl
-; SSE42-NEXT: movb %cl, 2(%rdi)
-; SSE42-NEXT: andb $1, %sil
-; SSE42-NEXT: movb %sil, (%rdi)
-; SSE42-NEXT: andb $1, %dl
-; SSE42-NEXT: movb %dl, (%rdi)
-; SSE42-NEXT: pextrb $14, %xmm1, %r9d
-; SSE42-NEXT: pextrb $12, %xmm1, %r11d
-; SSE42-NEXT: pextrb $10, %xmm1, %r15d
-; SSE42-NEXT: pextrb $8, %xmm1, %r13d
-; SSE42-NEXT: pextrb $6, %xmm1, %ecx
-; SSE42-NEXT: pextrb $4, %xmm1, %edx
-; SSE42-NEXT: pextrb $2, %xmm1, %esi
-; SSE42-NEXT: pextrb $0, %xmm1, %ebp
+; SSE42-NEXT: pextrb $15, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $14, %xmm1, %eax
+; SSE42-NEXT: andb $1, %al
+; SSE42-NEXT: movb %al, (%rdi)
+; SSE42-NEXT: pextrb $13, %xmm1, %r8d
+; SSE42-NEXT: pextrb $12, %xmm1, %r9d
+; SSE42-NEXT: pextrb $11, %xmm1, %r10d
+; SSE42-NEXT: pextrb $10, %xmm1, %r11d
+; SSE42-NEXT: pextrb $9, %xmm1, %r14d
+; SSE42-NEXT: pextrb $8, %xmm1, %r15d
+; SSE42-NEXT: pextrb $7, %xmm1, %r12d
+; SSE42-NEXT: pextrb $6, %xmm1, %r13d
+; SSE42-NEXT: pextrb $5, %xmm1, %ebx
+; SSE42-NEXT: pextrb $4, %xmm1, %ebp
+; SSE42-NEXT: pextrb $3, %xmm1, %eax
+; SSE42-NEXT: pextrb $2, %xmm1, %ecx
+; SSE42-NEXT: pextrb $1, %xmm1, %edx
+; SSE42-NEXT: pextrb $0, %xmm1, %esi
; SSE42-NEXT: andb $1, %r8b
; SSE42-NEXT: movb %r8b, (%rdi)
+; SSE42-NEXT: andb $1, %r9b
+; SSE42-NEXT: movb %r9b, (%rdi)
; SSE42-NEXT: andb $1, %r10b
; SSE42-NEXT: movb %r10b, (%rdi)
+; SSE42-NEXT: andb $1, %r11b
+; SSE42-NEXT: movb %r11b, (%rdi)
; SSE42-NEXT: andb $1, %r14b
; SSE42-NEXT: movb %r14b, (%rdi)
+; SSE42-NEXT: andb $1, %r15b
+; SSE42-NEXT: movb %r15b, (%rdi)
; SSE42-NEXT: andb $1, %r12b
; SSE42-NEXT: movb %r12b, (%rdi)
+; SSE42-NEXT: andb $1, %r13b
+; SSE42-NEXT: movb %r13b, (%rdi)
; SSE42-NEXT: andb $1, %bl
; SSE42-NEXT: movb %bl, (%rdi)
+; SSE42-NEXT: andb $1, %bpl
+; SSE42-NEXT: movb %bpl, (%rdi)
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: andb $1, %r9b
-; SSE42-NEXT: movb %r9b, (%rdi)
-; SSE42-NEXT: andb $1, %r11b
-; SSE42-NEXT: movb %r11b, (%rdi)
-; SSE42-NEXT: andb $1, %r15b
-; SSE42-NEXT: movb %r15b, (%rdi)
-; SSE42-NEXT: andb $1, %r13b
-; SSE42-NEXT: movb %r13b, (%rdi)
; SSE42-NEXT: andb $1, %cl
; SSE42-NEXT: movb %cl, (%rdi)
; SSE42-NEXT: andb $1, %dl
; SSE42-NEXT: movb %dl, (%rdi)
; SSE42-NEXT: andb $1, %sil
; SSE42-NEXT: movb %sil, (%rdi)
-; SSE42-NEXT: andb $1, %bpl
-; SSE42-NEXT: movb %bpl, (%rdi)
; SSE42-NEXT: movq %rdi, %rax
; SSE42-NEXT: popq %rbx
; SSE42-NEXT: popq %r12
@@ -8860,7 +5872,7 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v32f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
@@ -8875,33 +5887,33 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX1-NEXT: vmovapd 240(%rbp), %ymm15
; AVX1-NEXT: vcmpltpd %ymm7, %ymm15, %ymm15
; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm7
-; AVX1-NEXT: vpacksswb %xmm7, %xmm15, %xmm15
+; AVX1-NEXT: vpackssdw %xmm7, %xmm15, %xmm15
; AVX1-NEXT: vcmpltpd %ymm6, %ymm14, %ymm6
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT: vpacksswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpacksswb %xmm15, %xmm6, %xmm6
+; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpackssdw %xmm15, %xmm6, %xmm6
; AVX1-NEXT: vcmpltpd %ymm5, %ymm13, %ymm5
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7
-; AVX1-NEXT: vpacksswb %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vpackssdw %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vcmpltpd %ymm4, %ymm12, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7
-; AVX1-NEXT: vpacksswb %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vcmpltpd %ymm3, %ymm11, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vcmpltpd %ymm2, %ymm10, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpacksswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vcmpltpd %ymm1, %ymm9, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vcmpltpd %ymm0, %ymm8, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
@@ -8909,7 +5921,7 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v32f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
@@ -8924,23 +5936,23 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX2-NEXT: vmovapd 240(%rbp), %ymm15
; AVX2-NEXT: vcmpltpd %ymm7, %ymm15, %ymm7
; AVX2-NEXT: vcmpltpd %ymm6, %ymm14, %ymm6
-; AVX2-NEXT: vpacksswb %ymm7, %ymm6, %ymm6
+; AVX2-NEXT: vpackssdw %ymm7, %ymm6, %ymm6
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3]
; AVX2-NEXT: vcmpltpd %ymm5, %ymm13, %ymm5
; AVX2-NEXT: vcmpltpd %ymm4, %ymm12, %ymm4
-; AVX2-NEXT: vpacksswb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-NEXT: vpacksswb %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpackssdw %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-NEXT: vcmpltpd %ymm3, %ymm11, %ymm3
; AVX2-NEXT: vcmpltpd %ymm2, %ymm10, %ymm2
-; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vcmpltpd %ymm1, %ymm9, %ymm1
; AVX2-NEXT: vcmpltpd %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
@@ -8949,682 +5961,56 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v32f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm8
-; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm9
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: vucomisd %xmm8, %xmm9
-; AVX512F-NEXT: movq $-1, %rcx
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512F-NEXT: vucomisd %xmm8, %xmm9
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm9
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10
-; AVX512F-NEXT: vucomisd %xmm9, %xmm10
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm11
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; AVX512F-NEXT: vucomisd %xmm9, %xmm10
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512F-NEXT: vextractf32x4 $1, %zmm4, %xmm9
-; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm10
-; AVX512F-NEXT: vucomisd %xmm9, %xmm10
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm11
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; AVX512F-NEXT: vucomisd %xmm9, %xmm10
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm8
-; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm4
-; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm0
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm9
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm4
-; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm0
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512F-NEXT: vextractf32x4 $1, %zmm5, %xmm4
-; AVX512F-NEXT: vextractf32x4 $1, %zmm1, %xmm0
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm0
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512F-NEXT: vucomisd %xmm5, %xmm1
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vucomisd %xmm5, %xmm1
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm1
-; AVX512F-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT: vucomisd %xmm1, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vucomisd %xmm1, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm4
-; AVX512F-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT: vucomisd %xmm4, %xmm5
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: vextractf32x4 $1, %zmm6, %xmm1
-; AVX512F-NEXT: vextractf32x4 $1, %zmm2, %xmm4
-; AVX512F-NEXT: vucomisd %xmm1, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vucomisd %xmm1, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512F-NEXT: vucomisd %xmm6, %xmm2
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT: vucomisd %xmm5, %xmm2
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1
-; AVX512F-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT: vucomisd %xmm1, %xmm2
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT: vucomisd %xmm1, %xmm2
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm2
-; AVX512F-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT: vucomisd %xmm2, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vucomisd %xmm2, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vextractf32x4 $1, %zmm7, %xmm2
-; AVX512F-NEXT: vextractf32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT: vucomisd %xmm2, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT: vucomisd %xmm2, %xmm4
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512F-NEXT: vucomisd %xmm7, %xmm3
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovaq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0]
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512F-NEXT: vucomisd %xmm5, %xmm3
-; AVX512F-NEXT: cmovaq %rcx, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcmpltpd %zmm0, %zmm4, %k0
+; AVX512F-NEXT: vcmpltpd %zmm1, %zmm5, %k1
+; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT: movl {{.*}}(%rip), %eax
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512F-NEXT: vcmpltpd %zmm2, %zmm6, %k0
+; AVX512F-NEXT: vcmpltpd %zmm3, %zmm7, %k1
+; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32f64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm4, %xmm8
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm0, %xmm9
-; AVX512DQ-NEXT: xorl %eax, %eax
-; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9
-; AVX512DQ-NEXT: movq $-1, %rcx
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm8, %xmm9
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm8
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm4, %xmm9
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm0, %xmm10
-; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm11
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm9
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm4, %xmm9
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm0, %xmm10
-; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm11
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm9, %xmm10
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm9
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm5, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm1, %xmm0
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm9
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0]
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm5, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm1, %xmm0
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm5, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm1, %xmm0
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm0
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm1
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm1
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm6, %xmm1
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm6, %xmm4
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm6, %xmm1
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm2, %xmm4
-; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm1, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512DQ-NEXT: vucomisd %xmm6, %xmm2
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm2
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm7, %xmm1
-; AVX512DQ-NEXT: vextractf64x2 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT: vucomisd %xmm1, %xmm2
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm1, %xmm2
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm7, %xmm2
-; AVX512DQ-NEXT: vextractf64x2 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm7, %xmm2
-; AVX512DQ-NEXT: vextractf64x2 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512DQ-NEXT: vucomisd %xmm7, %xmm3
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovaq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0]
-; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512DQ-NEXT: vucomisd %xmm5, %xmm3
-; AVX512DQ-NEXT: cmovaq %rcx, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm3
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcmpltpd %zmm0, %zmm4, %k0
+; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm5, %k1
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: vcmpltpd %zmm2, %zmm6, %k0
+; AVX512DQ-NEXT: vcmpltpd %zmm3, %zmm7, %k1
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32f64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm4, %xmm8
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm9
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: vucomisd %xmm8, %xmm9
-; AVX512BW-NEXT: movq $-1, %rcx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512BW-NEXT: vucomisd %xmm8, %xmm9
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm4, %xmm9
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm10
-; AVX512BW-NEXT: vucomisd %xmm9, %xmm10
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm11
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; AVX512BW-NEXT: vucomisd %xmm9, %xmm10
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm4, %xmm9
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm0, %xmm10
-; AVX512BW-NEXT: vucomisd %xmm9, %xmm10
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm11
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0]
-; AVX512BW-NEXT: vucomisd %xmm9, %xmm10
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm8
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm5, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm1, %xmm0
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm9
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm5, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm1, %xmm0
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm5, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm1, %xmm0
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm0
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm1
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm1
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm6, %xmm1
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vucomisd %xmm1, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vucomisd %xmm1, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm6, %xmm4
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT: vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm6, %xmm1
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm2, %xmm4
-; AVX512BW-NEXT: vucomisd %xmm1, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vucomisd %xmm1, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512BW-NEXT: vucomisd %xmm6, %xmm2
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm2
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm7, %xmm1
-; AVX512BW-NEXT: vextractf32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vucomisd %xmm1, %xmm2
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT: vucomisd %xmm1, %xmm2
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm7, %xmm2
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm7, %xmm2
-; AVX512BW-NEXT: vextractf32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT: vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512BW-NEXT: vucomisd %xmm7, %xmm3
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovaq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm7[1,0]
-; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512BW-NEXT: vucomisd %xmm5, %xmm3
-; AVX512BW-NEXT: cmovaq %rcx, %rax
-; AVX512BW-NEXT: vmovq %rax, %xmm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm4, %k0
+; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm5, %k1
+; AVX512BW-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512BW-NEXT: vcmpltpd %zmm2, %zmm6, %k1
+; AVX512BW-NEXT: vcmpltpd %zmm3, %zmm7, %k2
+; AVX512BW-NEXT: kunpckbw %k1, %k2, %k1
+; AVX512BW-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
%1 = fcmp ogt <32 x double> %a0, %a1
ret <32 x i1> %1
@@ -9632,7 +6018,7 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-LABEL: test_cmp_v32i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm8, %xmm7
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
@@ -9657,8 +6043,6 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm7
; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm9[0,2]
-; SSE2-NEXT: pslld $31, %xmm7
-; SSE2-NEXT: psrad $31, %xmm7
; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,2,2,3]
@@ -9685,14 +6069,10 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE2-NEXT: pslld $31, %xmm5
-; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0]
-; SSE2-NEXT: psllw $15, %xmm5
-; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm9, %xmm5
; SSE2-NEXT: pxor %xmm8, %xmm3
@@ -9718,8 +6098,6 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -9746,14 +6124,10 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSE2-NEXT: pslld $31, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: psllw $15, %xmm0
-; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: pand %xmm9, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
@@ -9781,8 +6155,6 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
-; SSE2-NEXT: pslld $31, %xmm3
-; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
@@ -9811,14 +6183,10 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
-; SSE2-NEXT: pslld $31, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: psllw $15, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: pxor %xmm8, %xmm2
@@ -9845,8 +6213,6 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
-; SSE2-NEXT: pslld $31, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -9874,14 +6240,10 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,2]
-; SSE2-NEXT: pslld $31, %xmm5
-; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: psllw $15, %xmm3
-; SSE2-NEXT: psraw $15, %xmm3
; SSE2-NEXT: pand %xmm9, %xmm3
; SSE2-NEXT: packuswb %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
@@ -9986,167 +6348,136 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_cmp_v32i64:
-; SSE42: # BB#0:
-; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm11
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm10
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm12
-; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE42-NEXT: pslld $31, %xmm2
-; SSE42-NEXT: psrad $31, %xmm2
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT: pshufb %xmm3, %xmm2
-; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE42-NEXT: pslld $31, %xmm0
-; SSE42-NEXT: psrad $31, %xmm0
-; SSE42-NEXT: pshufb %xmm3, %xmm0
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE42-NEXT: psllw $15, %xmm0
-; SSE42-NEXT: psraw $15, %xmm0
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm7
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm6
; SSE42-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
-; SSE42-NEXT: pslld $31, %xmm6
-; SSE42-NEXT: psrad $31, %xmm6
-; SSE42-NEXT: pshufb %xmm3, %xmm6
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm4
; SSE42-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; SSE42-NEXT: pslld $31, %xmm4
-; SSE42-NEXT: psrad $31, %xmm4
-; SSE42-NEXT: pshufb %xmm3, %xmm4
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE42-NEXT: psllw $15, %xmm4
-; SSE42-NEXT: psraw $15, %xmm4
+; SSE42-NEXT: packssdw %xmm6, %xmm4
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE42-NEXT: packssdw %xmm2, %xmm0
+; SSE42-NEXT: packsswb %xmm4, %xmm0
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm15
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm13
; SSE42-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2]
-; SSE42-NEXT: pslld $31, %xmm13
-; SSE42-NEXT: psrad $31, %xmm13
-; SSE42-NEXT: pshufb %xmm3, %xmm13
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm14
-; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[0,2]
-; SSE42-NEXT: pslld $31, %xmm8
-; SSE42-NEXT: psrad $31, %xmm8
-; SSE42-NEXT: pshufb %xmm3, %xmm8
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm13[0]
-; SSE42-NEXT: psllw $15, %xmm8
-; SSE42-NEXT: psraw $15, %xmm8
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2]
+; SSE42-NEXT: packssdw %xmm13, %xmm9
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm12
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10
; SSE42-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2]
-; SSE42-NEXT: pslld $31, %xmm10
-; SSE42-NEXT: psrad $31, %xmm10
-; SSE42-NEXT: pshufb %xmm3, %xmm10
; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11
-; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
-; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
-; SSE42-NEXT: pslld $31, %xmm9
-; SSE42-NEXT: psrad $31, %xmm9
-; SSE42-NEXT: pshufb %xmm3, %xmm9
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
-; SSE42-NEXT: psllw $15, %xmm9
-; SSE42-NEXT: psraw $15, %xmm9
-; SSE42-NEXT: pextrb $14, %xmm9, %eax
+; SSE42-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[0,2]
+; SSE42-NEXT: packssdw %xmm10, %xmm8
+; SSE42-NEXT: packsswb %xmm9, %xmm8
+; SSE42-NEXT: pextrb $15, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $12, %xmm9, %eax
+; SSE42-NEXT: pextrb $14, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $10, %xmm9, %eax
+; SSE42-NEXT: pextrb $13, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $8, %xmm9, %eax
+; SSE42-NEXT: pextrb $12, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $6, %xmm9, %eax
+; SSE42-NEXT: pextrb $11, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $4, %xmm9, %eax
+; SSE42-NEXT: pextrb $10, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $2, %xmm9, %eax
+; SSE42-NEXT: pextrb $9, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $0, %xmm9, %eax
+; SSE42-NEXT: pextrb $8, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm8, %eax
+; SSE42-NEXT: pextrb $7, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $12, %xmm8, %eax
+; SSE42-NEXT: pextrb $6, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $10, %xmm8, %eax
+; SSE42-NEXT: pextrb $5, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $8, %xmm8, %eax
+; SSE42-NEXT: pextrb $4, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $6, %xmm8, %eax
+; SSE42-NEXT: pextrb $3, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $4, %xmm8, %eax
+; SSE42-NEXT: pextrb $2, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $2, %xmm8, %eax
+; SSE42-NEXT: pextrb $1, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
; SSE42-NEXT: pextrb $0, %xmm8, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, 2(%rdi)
-; SSE42-NEXT: pextrb $14, %xmm4, %eax
+; SSE42-NEXT: pextrb $15, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $12, %xmm4, %eax
+; SSE42-NEXT: pextrb $14, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $10, %xmm4, %eax
+; SSE42-NEXT: pextrb $13, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $8, %xmm4, %eax
+; SSE42-NEXT: pextrb $12, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $6, %xmm4, %eax
+; SSE42-NEXT: pextrb $11, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $4, %xmm4, %eax
+; SSE42-NEXT: pextrb $10, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $2, %xmm4, %eax
+; SSE42-NEXT: pextrb $9, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $0, %xmm4, %eax
+; SSE42-NEXT: pextrb $8, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $14, %xmm0, %eax
+; SSE42-NEXT: pextrb $7, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $12, %xmm0, %eax
+; SSE42-NEXT: pextrb $6, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $10, %xmm0, %eax
+; SSE42-NEXT: pextrb $5, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $8, %xmm0, %eax
+; SSE42-NEXT: pextrb $4, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $6, %xmm0, %eax
+; SSE42-NEXT: pextrb $3, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $4, %xmm0, %eax
+; SSE42-NEXT: pextrb $2, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
-; SSE42-NEXT: pextrb $2, %xmm0, %eax
+; SSE42-NEXT: pextrb $1, %xmm0, %eax
; SSE42-NEXT: andb $1, %al
; SSE42-NEXT: movb %al, (%rdi)
; SSE42-NEXT: pextrb $0, %xmm0, %eax
@@ -10156,7 +6487,7 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_cmp_v32i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
@@ -10167,53 +6498,53 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm9
; AVX1-NEXT: vmovdqa 208(%rbp), %ymm10
; AVX1-NEXT: vpcmpgtq %xmm8, %xmm7, %xmm7
-; AVX1-NEXT: vpacksswb %xmm9, %xmm7, %xmm8
+; AVX1-NEXT: vpackssdw %xmm9, %xmm7, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm9
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm9, %xmm7, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm10, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa 176(%rbp), %ymm9
-; AVX1-NEXT: vpacksswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpacksswb %xmm8, %xmm6, %xmm8
+; AVX1-NEXT: vpackssdw %xmm7, %xmm6, %xmm6
+; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vmovdqa 144(%rbp), %ymm10
; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm5
-; AVX1-NEXT: vpacksswb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm10, %xmm4, %xmm4
-; AVX1-NEXT: vpacksswb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vmovdqa 112(%rbp), %ymm6
-; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vmovdqa 80(%rbp), %ymm7
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa 48(%rbp), %ymm6
-; AVX1-NEXT: vpacksswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vmovdqa 16(%rbp), %ymm5
; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
@@ -10221,30 +6552,30 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_cmp_v32i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $32, %rsp
; AVX2-NEXT: vpcmpgtq 240(%rbp), %ymm7, %ymm7
; AVX2-NEXT: vpcmpgtq 208(%rbp), %ymm6, %ymm6
-; AVX2-NEXT: vpacksswb %ymm7, %ymm6, %ymm6
+; AVX2-NEXT: vpackssdw %ymm7, %ymm6, %ymm6
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,1,3]
; AVX2-NEXT: vpcmpgtq 176(%rbp), %ymm5, %ymm5
; AVX2-NEXT: vpcmpgtq 144(%rbp), %ymm4, %ymm4
-; AVX2-NEXT: vpacksswb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpackssdw %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-NEXT: vpacksswb %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpackssdw %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-NEXT: vpcmpgtq 112(%rbp), %ymm3, %ymm3
; AVX2-NEXT: vpcmpgtq 80(%rbp), %ymm2, %ymm2
-; AVX2-NEXT: vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpcmpgtq 48(%rbp), %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq 16(%rbp), %ymm0, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpacksswb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
@@ -10253,778 +6584,56 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_cmp_v32i64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $3, %zmm4, %xmm8
-; AVX512F-NEXT: vpextrq $1, %xmm8, %rcx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm9
-; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: cmpq %rcx, %rdx
-; AVX512F-NEXT: movq $-1, %rcx
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vmovq %xmm8, %rdx
-; AVX512F-NEXT: vmovq %xmm9, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm8
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm4, %xmm9
-; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm10
-; AVX512F-NEXT: vpextrq $1, %xmm10, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm11
-; AVX512F-NEXT: vmovq %xmm9, %rdx
-; AVX512F-NEXT: vmovq %xmm10, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512F-NEXT: vextracti32x4 $1, %zmm4, %xmm9
-; AVX512F-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm10
-; AVX512F-NEXT: vpextrq $1, %xmm10, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm11
-; AVX512F-NEXT: vmovq %xmm9, %rdx
-; AVX512F-NEXT: vmovq %xmm10, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm9
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm8
-; AVX512F-NEXT: vextracti32x4 $3, %zmm5, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm9
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm5, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512F-NEXT: vextracti32x4 $1, %zmm5, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm10
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vmovq %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vmovq %xmm5, %rdx
-; AVX512F-NEXT: vmovq %xmm1, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512F-NEXT: vextracti32x4 $3, %zmm6, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vmovq %xmm1, %rdx
-; AVX512F-NEXT: vmovq %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm6, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vmovq %xmm4, %rdx
-; AVX512F-NEXT: vmovq %xmm5, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512F-NEXT: vextracti32x4 $1, %zmm6, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm2, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vmovq %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX512F-NEXT: vpextrq $1, %xmm6, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vmovq %xmm6, %rdx
-; AVX512F-NEXT: vmovq %xmm2, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512F-NEXT: vextracti32x4 $3, %zmm7, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vmovq %xmm2, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512F-NEXT: vextracti32x4 $2, %zmm7, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vmovq %xmm2, %rdx
-; AVX512F-NEXT: vmovq %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti32x4 $1, %zmm7, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm5
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vmovq %xmm4, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX512F-NEXT: vpextrq $1, %xmm7, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: cmovgq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm4
-; AVX512F-NEXT: vmovq %xmm7, %rdx
-; AVX512F-NEXT: vmovq %xmm3, %rsi
-; AVX512F-NEXT: cmpq %rdx, %rsi
-; AVX512F-NEXT: cmovgq %rcx, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
+; AVX512F-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
+; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT: movl {{.*}}(%rip), %eax
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512F-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
+; AVX512F-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
+; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_cmp_v32i64:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm4, %xmm8
-; AVX512DQ-NEXT: vpextrq $1, %xmm8, %rcx
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm0, %xmm9
-; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512DQ-NEXT: xorl %eax, %eax
-; AVX512DQ-NEXT: cmpq %rcx, %rdx
-; AVX512DQ-NEXT: movq $-1, %rcx
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vmovq %xmm8, %rdx
-; AVX512DQ-NEXT: vmovq %xmm9, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm8
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm4, %xmm9
-; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm0, %xmm10
-; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm11
-; AVX512DQ-NEXT: vmovq %xmm9, %rdx
-; AVX512DQ-NEXT: vmovq %xmm10, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm9
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm4, %xmm9
-; AVX512DQ-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm10
-; AVX512DQ-NEXT: vpextrq $1, %xmm10, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm11
-; AVX512DQ-NEXT: vmovq %xmm9, %rdx
-; AVX512DQ-NEXT: vmovq %xmm10, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm9
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm8
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm5, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm1, %xmm0
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm9
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0]
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm5, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm1, %xmm0
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm0, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm5, %xmm0
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm1, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm10
-; AVX512DQ-NEXT: vmovq %xmm0, %rdx
-; AVX512DQ-NEXT: vmovq %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vmovq %xmm5, %rdx
-; AVX512DQ-NEXT: vmovq %xmm1, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm8, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm8
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm6, %xmm1
-; AVX512DQ-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm1, %rdx
-; AVX512DQ-NEXT: vmovq %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm1
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm6, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm4, %rdx
-; AVX512DQ-NEXT: vmovq %xmm5, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm6, %xmm0
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm2, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm0, %rdx
-; AVX512DQ-NEXT: vmovq %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX512DQ-NEXT: vpextrq $1, %xmm6, %rdx
-; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vmovq %xmm6, %rdx
-; AVX512DQ-NEXT: vmovq %xmm2, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm7, %xmm0
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vmovq %xmm0, %rdx
-; AVX512DQ-NEXT: vmovq %xmm2, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm7, %xmm2
-; AVX512DQ-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm2, %rdx
-; AVX512DQ-NEXT: vmovq %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm2
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm7, %xmm0
-; AVX512DQ-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT: vextracti64x2 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm0, %rdx
-; AVX512DQ-NEXT: vmovq %xmm4, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm0
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX512DQ-NEXT: vpextrq $1, %xmm7, %rdx
-; AVX512DQ-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: movl $0, %edx
-; AVX512DQ-NEXT: cmovgq %rcx, %rdx
-; AVX512DQ-NEXT: vmovq %rdx, %xmm4
-; AVX512DQ-NEXT: vmovq %xmm7, %rdx
-; AVX512DQ-NEXT: vmovq %xmm3, %rsi
-; AVX512DQ-NEXT: cmpq %rdx, %rsi
-; AVX512DQ-NEXT: cmovgq %rcx, %rax
-; AVX512DQ-NEXT: vmovq %rax, %xmm3
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
+; AVX512DQ-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512DQ-NEXT: movl {{.*}}(%rip), %eax
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: vpcmpgtq %zmm6, %zmm2, %k0
+; AVX512DQ-NEXT: vpcmpgtq %zmm7, %zmm3, %k1
+; AVX512DQ-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512DQ-NEXT: vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: test_cmp_v32i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm4, %xmm8
-; AVX512BW-NEXT: vpextrq $1, %xmm8, %rcx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm9
-; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512BW-NEXT: xorl %eax, %eax
-; AVX512BW-NEXT: cmpq %rcx, %rdx
-; AVX512BW-NEXT: movq $-1, %rcx
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vmovq %xmm8, %rdx
-; AVX512BW-NEXT: vmovq %xmm9, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm8
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm4, %xmm9
-; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm10
-; AVX512BW-NEXT: vpextrq $1, %xmm10, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm11
-; AVX512BW-NEXT: vmovq %xmm9, %rdx
-; AVX512BW-NEXT: vmovq %xmm10, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm4, %xmm9
-; AVX512BW-NEXT: vpextrq $1, %xmm9, %rdx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm10
-; AVX512BW-NEXT: vpextrq $1, %xmm10, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm11
-; AVX512BW-NEXT: vmovq %xmm9, %rdx
-; AVX512BW-NEXT: vmovq %xmm10, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm9
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm8
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm5, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm9
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm5, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm0, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm5, %xmm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm1, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm10
-; AVX512BW-NEXT: vmovq %xmm0, %rdx
-; AVX512BW-NEXT: vmovq %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vmovq %xmm5, %rdx
-; AVX512BW-NEXT: vmovq %xmm1, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm8
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm6, %xmm1
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vmovq %xmm1, %rdx
-; AVX512BW-NEXT: vmovq %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm6, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT: vpextrq $1, %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vmovq %xmm4, %rdx
-; AVX512BW-NEXT: vmovq %xmm5, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm6, %xmm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm2, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vmovq %xmm0, %rdx
-; AVX512BW-NEXT: vmovq %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm6, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vmovq %xmm6, %rdx
-; AVX512BW-NEXT: vmovq %xmm2, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm7, %xmm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vmovq %xmm0, %rdx
-; AVX512BW-NEXT: vmovq %xmm2, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm7, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vmovq %xmm2, %rdx
-; AVX512BW-NEXT: vmovq %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm2
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm2
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm7, %xmm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT: vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm5
-; AVX512BW-NEXT: vmovq %xmm0, %rdx
-; AVX512BW-NEXT: vmovq %xmm4, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm7, %rdx
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: movl $0, %edx
-; AVX512BW-NEXT: cmovgq %rcx, %rdx
-; AVX512BW-NEXT: vmovq %rdx, %xmm4
-; AVX512BW-NEXT: vmovq %xmm7, %rdx
-; AVX512BW-NEXT: vmovq %xmm3, %rsi
-; AVX512BW-NEXT: cmpq %rdx, %rsi
-; AVX512BW-NEXT: cmovgq %rcx, %rax
-; AVX512BW-NEXT: vmovq %rax, %xmm3
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpcmpgtq %zmm4, %zmm0, %k0
+; AVX512BW-NEXT: vpcmpgtq %zmm5, %zmm1, %k1
+; AVX512BW-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512BW-NEXT: vpcmpgtq %zmm6, %zmm2, %k1
+; AVX512BW-NEXT: vpcmpgtq %zmm7, %zmm3, %k2
+; AVX512BW-NEXT: kunpckbw %k1, %k2, %k1
+; AVX512BW-NEXT: kunpckwd %k0, %k1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
%1 = icmp sgt <32 x i64> %a0, %a1
ret <32 x i1> %1
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
index a8db0d4cd9d8..8d55e5da05da 100644
--- a/test/CodeGen/X86/vector-extend-inreg.ll
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -6,7 +6,7 @@
define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) nounwind {
; X32-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
; X32-SSE-NEXT: andl $-128, %esp
@@ -42,12 +42,12 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pushq %rbp
; X64-SSE-NEXT: movq %rsp, %rbp
; X64-SSE-NEXT: andq $-128, %rsp
; X64-SSE-NEXT: subq $256, %rsp # imm = 0x100
-; X64-SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SSE-NEXT: # kill: def %edi killed %edi def %rdi
; X64-SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; X64-SSE-NEXT: xorps %xmm0, %xmm0
; X64-SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
@@ -65,14 +65,14 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X64-SSE-NEXT: retq
;
; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
; X32-AVX-NEXT: andl $-128, %esp
; X32-AVX-NEXT: subl $384, %esp # imm = 0x180
; X32-AVX-NEXT: movl 40(%ebp), %ecx
; X32-AVX-NEXT: vbroadcastsd 32(%ebp), %ymm0
-; X32-AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
; X32-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%esp)
@@ -94,14 +94,14 @@ define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) noun
; X32-AVX-NEXT: retl
;
; X64-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
-; X64-AVX: # BB#0:
+; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbp
; X64-AVX-NEXT: movq %rsp, %rbp
; X64-AVX-NEXT: andq $-128, %rsp
; X64-AVX-NEXT: subq $256, %rsp # imm = 0x100
-; X64-AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
-; X64-AVX-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-AVX-NEXT: # kill: def %edi killed %edi def %rdi
+; X64-AVX-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,1,2,3]
+; X64-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X64-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovapd %ymm1, {{[0-9]+}}(%rsp)
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index a2a7363d7894..44fe38fa86b9 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
;
@@ -9,35 +9,12 @@
;
define float @cvt_i16_to_f32(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f32:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to float
ret float %2
@@ -45,13 +22,13 @@ define float @cvt_i16_to_f32(i16 %a0) nounwind {
define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
-; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: # kill: def %eax killed %eax killed %rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
@@ -72,13 +49,13 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4i16_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
-; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: # kill: def %eax killed %eax killed %rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
@@ -99,41 +76,40 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4i16_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
-; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512F-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
-; AVX512VL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512VL-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
@@ -159,12 +135,12 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
-; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: # kill: def %eax killed %eax killed %rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
@@ -185,12 +161,12 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
-; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: # kill: def %eax killed %eax killed %rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
@@ -211,41 +187,40 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
-; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512F-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
-; AVX512VL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512VL-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
@@ -271,201 +246,54 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
}
define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
-; AVX1-LABEL: cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
-; AVX1-NEXT: movq %rdx, %r10
-; AVX1-NEXT: movswl %dx, %r9d
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX1-NEXT: shrl $16, %edx
-; AVX1-NEXT: shrq $32, %r8
-; AVX1-NEXT: shrq $48, %r10
-; AVX1-NEXT: vmovq %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movq %rdi, %rsi
-; AVX1-NEXT: movswl %di, %ecx
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX1-NEXT: shrl $16, %edi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrq $48, %rsi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: cwtl
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl %r10w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl %r8w, %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %dx, %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: vmovd %r9d, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movswl %dx, %r9d
-; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX2-NEXT: shrl $16, %edx
-; AVX2-NEXT: shrq $32, %r8
-; AVX2-NEXT: shrq $48, %r10
-; AVX2-NEXT: vmovq %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: movswl %di, %ecx
-; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX2-NEXT: shrl $16, %edi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrq $48, %rsi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: cwtl
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl %r10w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl %r8w, %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %dx, %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: vmovd %r9d, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movq %rdx, %r9
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512F-NEXT: shrl $16, %edx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrq $48, %r9
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq %rdi, %rcx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512F-NEXT: shrl $16, %edi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrq $48, %rcx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vmovd %r10d, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movq %rdx, %r10
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<kill>
-; AVX512VL-NEXT: shrl $16, %edx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrq $48, %r10
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rdi, %rsi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<kill>
-; AVX512VL-NEXT: shrl $16, %edi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrq $48, %rsi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %ecx, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: vmovd %r9d, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8i16_to_8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vpextrq $1, %xmm0, %rdx
+; ALL-NEXT: movq %rdx, %r8
+; ALL-NEXT: movq %rdx, %r10
+; ALL-NEXT: movswl %dx, %r9d
+; ALL-NEXT: # kill: def %edx killed %edx killed %rdx
+; ALL-NEXT: shrl $16, %edx
+; ALL-NEXT: shrq $32, %r8
+; ALL-NEXT: shrq $48, %r10
+; ALL-NEXT: vmovq %xmm0, %rdi
+; ALL-NEXT: movq %rdi, %rax
+; ALL-NEXT: movq %rdi, %rsi
+; ALL-NEXT: movswl %di, %ecx
+; ALL-NEXT: # kill: def %edi killed %edi killed %rdi
+; ALL-NEXT: shrl $16, %edi
+; ALL-NEXT: shrq $32, %rax
+; ALL-NEXT: shrq $48, %rsi
+; ALL-NEXT: movswl %si, %esi
+; ALL-NEXT: vmovd %esi, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: cwtl
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: vmovd %ecx, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl %r10w, %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl %r8w, %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl %dx, %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: vmovd %r9d, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x float>
ret <8 x float> %2
@@ -473,7 +301,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind {
define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_16i16_to_16f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vmovq %xmm4, %rax
; AVX1-NEXT: movq %rax, %rcx
@@ -485,7 +313,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm9
; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: # kill: def %eax killed %eax killed %rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm10
@@ -500,7 +328,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm13
; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: # kill: def %eax killed %eax killed %rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm14
@@ -515,7 +343,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %cx, %ecx
; AVX1-NEXT: vmovd %ecx, %xmm3
; AVX1-NEXT: movswl %ax, %ecx
-; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: # kill: def %eax killed %eax killed %rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: cwtl
; AVX1-NEXT: vmovd %eax, %xmm4
@@ -568,7 +396,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16i16_to_16f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vmovq %xmm4, %rax
; AVX2-NEXT: movq %rax, %rcx
@@ -580,7 +408,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm9
; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: # kill: def %eax killed %eax killed %rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm10
@@ -595,7 +423,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm13
; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: # kill: def %eax killed %eax killed %rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm14
@@ -610,7 +438,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %cx, %ecx
; AVX2-NEXT: vmovd %ecx, %xmm3
; AVX2-NEXT: movswl %ax, %ecx
-; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: # kill: def %eax killed %eax killed %rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: cwtl
; AVX2-NEXT: vmovd %eax, %xmm4
@@ -663,103 +491,103 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_16i16_to_16f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm2
+; AVX512F-NEXT: vmovd %ecx, %xmm8
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm3
+; AVX512F-NEXT: vmovd %ecx, %xmm9
; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512F-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vmovd %eax, %xmm11
; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm0
+; AVX512F-NEXT: vmovd %ecx, %xmm12
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm5
+; AVX512F-NEXT: vmovd %ecx, %xmm13
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm6
+; AVX512F-NEXT: vmovd %ecx, %xmm14
; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512F-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm8
+; AVX512F-NEXT: vmovd %eax, %xmm15
+; AVX512F-NEXT: vmovq %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm2
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm9
+; AVX512F-NEXT: vmovd %ecx, %xmm3
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm10
+; AVX512F-NEXT: vmovd %ecx, %xmm1
; AVX512F-NEXT: movswl %ax, %ecx
-; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512F-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vmovd %ecx, %xmm1
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vpextrq $1, %xmm10, %rax
+; AVX512F-NEXT: vmovd %ecx, %xmm10
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $48, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm12
+; AVX512F-NEXT: vmovd %ecx, %xmm5
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm13
+; AVX512F-NEXT: vmovd %ecx, %xmm6
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: shrl $16, %ecx
; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm14
+; AVX512F-NEXT: vmovd %ecx, %xmm7
; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm16
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm13[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm9[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm6[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm16[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8
+; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm9
+; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm11
+; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm12
+; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm13
+; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm14
+; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm15
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm10
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1],xmm1[0],xmm4[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm14[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_16i16_to_16f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: movq %rax, %rcx
@@ -771,7 +599,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm9
; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512VL-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm11
@@ -786,7 +614,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm14
; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512VL-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm15
@@ -801,7 +629,7 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
; AVX512VL-NEXT: movswl %cx, %ecx
; AVX512VL-NEXT: vmovd %ecx, %xmm18
; AVX512VL-NEXT: movswl %ax, %ecx
-; AVX512VL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512VL-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: cwtl
; AVX512VL-NEXT: vmovd %eax, %xmm19
@@ -863,35 +691,12 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind {
;
define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f32:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to float
@@ -899,82 +704,24 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
}
define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f32:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x float>
@@ -983,12 +730,12 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movswl %ax, %esi
-; AVX1-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX1-NEXT: # kill: def %eax killed %eax killed %rax
; AVX1-NEXT: shrl $16, %eax
; AVX1-NEXT: shrq $32, %rcx
; AVX1-NEXT: shrq $48, %rdx
@@ -1009,12 +756,12 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movswl %ax, %esi
-; AVX2-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX2-NEXT: # kill: def %eax killed %eax killed %rax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: shrq $32, %rcx
; AVX2-NEXT: shrq $48, %rdx
@@ -1035,41 +782,40 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movq (%rdi), %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movq %rax, %rdx
; AVX512F-NEXT: movswl %ax, %esi
-; AVX512F-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512F-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: shrq $32, %rcx
; AVX512F-NEXT: shrq $48, %rdx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: vmovd %esi, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
; AVX512VL-NEXT: movq %rax, %rdx
; AVX512VL-NEXT: movswl %ax, %esi
-; AVX512VL-NEXT: # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; AVX512VL-NEXT: # kill: def %eax killed %eax killed %rax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: shrq $32, %rcx
; AVX512VL-NEXT: shrq $48, %rdx
@@ -1096,145 +842,40 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
}
define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_8i16_to_8f32:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: movswl 14(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm4
-; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX1-NEXT: movswl 12(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm5
-; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl 8(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm6
-; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX1-NEXT: movswl 10(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm7
-; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX1-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_8i16_to_8f32:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: movswl 14(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm4
-; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX2-NEXT: movswl 12(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm5
-; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl 8(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm6
-; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX2-NEXT: movswl 10(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm7
-; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX2-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_8i16_to_8f32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_8i16_to_8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: movswl 14(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm4
+; ALL-NEXT: vcvtph2ps %xmm4, %xmm4
+; ALL-NEXT: movswl 12(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm5
+; ALL-NEXT: vcvtph2ps %xmm5, %xmm5
+; ALL-NEXT: movswl 8(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm6
+; ALL-NEXT: vcvtph2ps %xmm6, %xmm6
+; ALL-NEXT: movswl 10(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm7
+; ALL-NEXT: vcvtph2ps %xmm7, %xmm7
+; ALL-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; ALL-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x float>
@@ -1243,7 +884,7 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind {
define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_16i16_to_16f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movswl 22(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm8
@@ -1309,7 +950,7 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_16i16_to_16f32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movswl 22(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm8
@@ -1375,74 +1016,74 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_16i16_to_16f32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movswl 6(%rdi), %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm16
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm8
; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm17
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm9
; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm10
; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm11
; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12
; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13
; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm14
; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm15
; AVX512F-NEXT: movswl 22(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm8
-; AVX512F-NEXT: vcvtph2ps %ymm8, %zmm8
+; AVX512F-NEXT: vmovd %eax, %xmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: movswl 20(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm9
-; AVX512F-NEXT: vcvtph2ps %ymm9, %zmm9
+; AVX512F-NEXT: vmovd %eax, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl 16(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm10
-; AVX512F-NEXT: vcvtph2ps %ymm10, %zmm10
+; AVX512F-NEXT: vmovd %eax, %xmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: movswl 18(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm11
-; AVX512F-NEXT: vcvtph2ps %ymm11, %zmm11
+; AVX512F-NEXT: vmovd %eax, %xmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: movswl 30(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm12
-; AVX512F-NEXT: vcvtph2ps %ymm12, %zmm12
+; AVX512F-NEXT: vmovd %eax, %xmm4
+; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4
; AVX512F-NEXT: movswl 28(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm13
-; AVX512F-NEXT: vcvtph2ps %ymm13, %zmm13
+; AVX512F-NEXT: vmovd %eax, %xmm5
+; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5
; AVX512F-NEXT: movswl 24(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm14
-; AVX512F-NEXT: vcvtph2ps %ymm14, %zmm14
+; AVX512F-NEXT: vmovd %eax, %xmm6
+; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX512F-NEXT: movswl 26(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm15
-; AVX512F-NEXT: vcvtph2ps %ymm15, %zmm15
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm14[0],xmm15[0],xmm14[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm13[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; AVX512F-NEXT: vmovd %eax, %xmm7
+; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm6[0,1],xmm5[0],xmm6[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0]
; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm17[0],xmm2[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm16[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0],xmm15[0],xmm14[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm13[0],xmm1[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm12[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0],xmm11[0],xmm10[2,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm9[0],xmm2[3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm8[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_16i16_to_16f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movswl 6(%rdi), %eax
; AVX512VL-NEXT: vmovd %eax, %xmm0
; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm8
@@ -1518,38 +1159,13 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind {
;
define double @cvt_i16_to_f64(i16 %a0) nounwind {
-; AVX1-LABEL: cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl %di, %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl %di, %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_i16_to_f64:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl %di, %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = bitcast i16 %a0 to half
%2 = fpext half %1 to double
ret double %2
@@ -1557,7 +1173,7 @@ define double @cvt_i16_to_f64(i16 %a0) nounwind {
define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_2i16_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, %eax
@@ -1570,11 +1186,11 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_2i16_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vmovd %xmm0, %eax
@@ -1587,11 +1203,11 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_2i16_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovd %xmm0, %eax
@@ -1599,17 +1215,16 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_2i16_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; AVX512VL-NEXT: movswl %ax, %ecx
@@ -1621,7 +1236,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%1 = bitcast <2 x i16> %a0 to <2 x half>
%2 = fpext <2 x half> %1 to <2 x double>
@@ -1630,7 +1245,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_4i16_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
@@ -1652,15 +1267,15 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4i16_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
@@ -1682,15 +1297,15 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4i16_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
@@ -1701,26 +1316,26 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4i16_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %rax, %rcx
@@ -1742,10 +1357,10 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = bitcast <4 x i16> %a0 to <4 x half>
@@ -1755,7 +1370,7 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind {
define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: movswl %ax, %ecx
; AVX1-NEXT: shrl $16, %eax
@@ -1766,11 +1381,11 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %xmm0, %eax
; AVX2-NEXT: movswl %ax, %ecx
; AVX2-NEXT: shrl $16, %eax
@@ -1781,27 +1396,26 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_2f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: movswl %ax, %ecx
; AVX512F-NEXT: shrl $16, %eax
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %ecx, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX512VL-NEXT: vpmovqw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl -{{[0-9]+}}(%rsp), %eax
@@ -1814,7 +1428,7 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = bitcast <2 x i16> %1 to <2 x half>
@@ -1824,7 +1438,7 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
@@ -1845,15 +1459,15 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
@@ -1874,15 +1488,15 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_8i16_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movl %eax, %edx
@@ -1892,26 +1506,26 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_8i16_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
@@ -1934,10 +1548,10 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1948,27 +1562,27 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind {
define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-LABEL: cvt_8i16_to_8f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rdx
-; AVX1-NEXT: movq %rdx, %r8
+; AVX1-NEXT: movq %rdx, %r9
; AVX1-NEXT: movl %edx, %r10d
-; AVX1-NEXT: movswl %dx, %r9d
+; AVX1-NEXT: movswl %dx, %r8d
; AVX1-NEXT: shrq $48, %rdx
-; AVX1-NEXT: shrq $32, %r8
+; AVX1-NEXT: shrq $32, %r9
; AVX1-NEXT: shrl $16, %r10d
; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: movl %edi, %esi
+; AVX1-NEXT: movq %rdi, %rsi
+; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: movswl %di, %ecx
; AVX1-NEXT: shrq $48, %rdi
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: shrl $16, %esi
-; AVX1-NEXT: movswl %si, %esi
-; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: shrq $32, %rsi
+; AVX1-NEXT: shrl $16, %eax
+; AVX1-NEXT: cwtl
+; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX1-NEXT: vmovd %ecx, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT: cwtl
+; AVX1-NEXT: movswl %si, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX1-NEXT: movswl %di, %eax
@@ -1977,9 +1591,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: movswl %r10w, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vmovd %r9d, %xmm5
+; AVX1-NEXT: vmovd %r8d, %xmm5
; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT: movswl %r8w, %eax
+; AVX1-NEXT: movswl %r9w, %eax
; AVX1-NEXT: vmovd %eax, %xmm6
; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX1-NEXT: movswl %dx, %eax
@@ -1987,42 +1601,42 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8i16_to_8f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: movq %rdx, %r9
; AVX2-NEXT: movl %edx, %r10d
-; AVX2-NEXT: movswl %dx, %r9d
+; AVX2-NEXT: movswl %dx, %r8d
; AVX2-NEXT: shrq $48, %rdx
-; AVX2-NEXT: shrq $32, %r8
+; AVX2-NEXT: shrq $32, %r9
; AVX2-NEXT: shrl $16, %r10d
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: movl %edi, %esi
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: movl %edi, %eax
; AVX2-NEXT: movswl %di, %ecx
; AVX2-NEXT: shrq $48, %rdi
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: shrl $16, %esi
-; AVX2-NEXT: movswl %si, %esi
-; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: shrq $32, %rsi
+; AVX2-NEXT: shrl $16, %eax
+; AVX2-NEXT: cwtl
+; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
; AVX2-NEXT: vmovd %ecx, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT: cwtl
+; AVX2-NEXT: movswl %si, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm3
; AVX2-NEXT: movswl %di, %eax
@@ -2031,9 +1645,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: movswl %r10w, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovd %r9d, %xmm5
+; AVX2-NEXT: vmovd %r8d, %xmm5
; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT: movswl %r8w, %eax
+; AVX2-NEXT: movswl %r9w, %eax
; AVX2-NEXT: vmovd %eax, %xmm6
; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6
; AVX2-NEXT: movswl %dx, %eax
@@ -2041,129 +1655,74 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm5[0],xmm0[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm5[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: movl %edx, %r9d
-; AVX512F-NEXT: movswl %dx, %r10d
-; AVX512F-NEXT: shrq $48, %rdx
-; AVX512F-NEXT: shrq $32, %r8
-; AVX512F-NEXT: shrl $16, %r9d
-; AVX512F-NEXT: vmovq %xmm0, %rdi
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movl %edi, %ecx
-; AVX512F-NEXT: movswl %di, %esi
-; AVX512F-NEXT: shrq $48, %rdi
-; AVX512F-NEXT: shrq $32, %rax
-; AVX512F-NEXT: shrl $16, %ecx
-; AVX512F-NEXT: movswl %cx, %ecx
-; AVX512F-NEXT: vmovd %ecx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: cwtl
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl %di, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl %r9w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: vmovd %r10d, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl %r8w, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl %dx, %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: movl %edx, %r10d
-; AVX512VL-NEXT: movswl %dx, %r9d
-; AVX512VL-NEXT: shrq $48, %rdx
-; AVX512VL-NEXT: shrq $32, %r8
-; AVX512VL-NEXT: shrl $16, %r10d
-; AVX512VL-NEXT: vmovq %xmm0, %rdi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %edi, %esi
-; AVX512VL-NEXT: movswl %di, %ecx
-; AVX512VL-NEXT: shrq $48, %rdi
-; AVX512VL-NEXT: shrq $32, %rax
-; AVX512VL-NEXT: shrl $16, %esi
-; AVX512VL-NEXT: movswl %si, %esi
-; AVX512VL-NEXT: vmovd %esi, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %ecx, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: cwtl
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl %di, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl %r10w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: vmovd %r9d, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl %r8w, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl %dx, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm5[0],xmm4[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_8i16_to_8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: movq %rdx, %r9
+; AVX512-NEXT: movl %edx, %r10d
+; AVX512-NEXT: movswl %dx, %r8d
+; AVX512-NEXT: shrq $48, %rdx
+; AVX512-NEXT: shrq $32, %r9
+; AVX512-NEXT: shrl $16, %r10d
+; AVX512-NEXT: vmovq %xmm0, %rdi
+; AVX512-NEXT: movq %rdi, %rsi
+; AVX512-NEXT: movl %edi, %eax
+; AVX512-NEXT: movswl %di, %ecx
+; AVX512-NEXT: shrq $48, %rdi
+; AVX512-NEXT: shrq $32, %rsi
+; AVX512-NEXT: shrl $16, %eax
+; AVX512-NEXT: cwtl
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: vmovd %ecx, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl %si, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl %di, %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl %r10w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r8d, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl %r9w, %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl %dx, %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm5[0],xmm4[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
ret <8 x double> %2
@@ -2174,38 +1733,13 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
;
define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
-; AVX1-LABEL: load_cvt_i16_to_f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_i16_to_f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_i16_to_f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_i16_to_f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_i16_to_f64:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: retq
%1 = load i16, i16* %a0
%2 = bitcast i16 %1 to half
%3 = fpext half %2 to double
@@ -2213,58 +1747,18 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
}
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_2i16_to_2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_2i16_to_2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_2i16_to_2f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_2i16_to_2f64:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
%3 = fpext <2 x half> %2 to <2 x double>
@@ -2272,97 +1766,28 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
}
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
-; AVX1-LABEL: load_cvt_4i16_to_4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movswl (%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: movswl 2(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: movswl 4(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: movswl 6(%rdi), %eax
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_cvt_4i16_to_4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movswl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: movswl 2(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: movswl 4(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: movswl 6(%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_cvt_4i16_to_4f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_4i16_to_4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: load_cvt_4i16_to_4f64:
+; ALL: # %bb.0:
+; ALL-NEXT: movswl (%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm0
+; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
+; ALL-NEXT: movswl 2(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm1
+; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
+; ALL-NEXT: movswl 4(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm2
+; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
+; ALL-NEXT: movswl 6(%rdi), %eax
+; ALL-NEXT: vmovd %eax, %xmm3
+; ALL-NEXT: vcvtph2ps %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
%3 = fpext <4 x half> %2 to <4 x double>
@@ -2371,7 +1796,7 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movl %eax, %edx
@@ -2392,15 +1817,15 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movl %eax, %edx
@@ -2421,15 +1846,15 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cvt_8i16_to_4f64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movq (%rdi), %rax
; AVX512F-NEXT: movq %rax, %rcx
; AVX512F-NEXT: movl %eax, %edx
@@ -2439,26 +1864,26 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX512F-NEXT: shrl $16, %edx
; AVX512F-NEXT: movswl %dx, %edx
; AVX512F-NEXT: vmovd %edx, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
; AVX512F-NEXT: vmovd %esi, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX512F-NEXT: movswl %cx, %ecx
; AVX512F-NEXT: vmovd %ecx, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
+; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2
; AVX512F-NEXT: cwtl
; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
+; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: load_cvt_8i16_to_4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
@@ -2481,10 +1906,10 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
@@ -2496,7 +1921,7 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movswl 8(%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vcvtph2ps %xmm0, %xmm1
@@ -2523,22 +1948,22 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX1-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX1-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX1-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX1-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movswl 8(%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
@@ -2565,105 +1990,62 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7
; AVX2-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
; AVX2-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX2-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: load_cvt_8i16_to_8f64:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movswl (%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm0
-; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
-; AVX512F-NEXT: movswl 2(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm1
-; AVX512F-NEXT: vcvtph2ps %ymm1, %zmm1
-; AVX512F-NEXT: movswl 4(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm2
-; AVX512F-NEXT: vcvtph2ps %ymm2, %zmm2
-; AVX512F-NEXT: movswl 6(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vcvtph2ps %ymm3, %zmm3
-; AVX512F-NEXT: movswl 8(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm4
-; AVX512F-NEXT: vcvtph2ps %ymm4, %zmm4
-; AVX512F-NEXT: movswl 10(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm5
-; AVX512F-NEXT: vcvtph2ps %ymm5, %zmm5
-; AVX512F-NEXT: movswl 12(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm6
-; AVX512F-NEXT: vcvtph2ps %ymm6, %zmm6
-; AVX512F-NEXT: movswl 14(%rdi), %eax
-; AVX512F-NEXT: vmovd %eax, %xmm7
-; AVX512F-NEXT: vcvtph2ps %ymm7, %zmm7
-; AVX512F-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512F-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512F-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512F-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512F-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512F-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: load_cvt_8i16_to_8f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512VL-NEXT: movswl 2(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512VL-NEXT: movswl 4(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm2
-; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512VL-NEXT: movswl 6(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512VL-NEXT: movswl 8(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm4
-; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4
-; AVX512VL-NEXT: movswl 10(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm5
-; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5
-; AVX512VL-NEXT: movswl 12(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm6
-; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6
-; AVX512VL-NEXT: movswl 14(%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm7
-; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
-; AVX512VL-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
-; AVX512VL-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512VL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: load_cvt_8i16_to_8f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movswl (%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512-NEXT: movswl 2(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
+; AVX512-NEXT: movswl 4(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX512-NEXT: movswl 6(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
+; AVX512-NEXT: movswl 8(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm4
+; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4
+; AVX512-NEXT: movswl 10(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm5
+; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5
+; AVX512-NEXT: movswl 12(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm6
+; AVX512-NEXT: vcvtph2ps %xmm6, %xmm6
+; AVX512-NEXT: movswl 14(%rdi), %eax
+; AVX512-NEXT: vmovd %eax, %xmm7
+; AVX512-NEXT: vcvtph2ps %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm7, %xmm7, %xmm7
+; AVX512-NEXT: vcvtss2sd %xmm6, %xmm6, %xmm6
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; AVX512-NEXT: vcvtss2sd %xmm5, %xmm5, %xmm5
+; AVX512-NEXT: vcvtss2sd %xmm4, %xmm4, %xmm4
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
+; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
%3 = fpext <8 x half> %2 to <8 x double>
@@ -2675,138 +2057,41 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
;
define i16 @cvt_f32_to_i16(float %a0) nounwind {
-; AVX1-LABEL: cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_f32_to_i16:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: # kill: def %ax killed %ax killed %eax
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
ret i16 %2
}
define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: orq %rcx, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %eax, %edx
-; AVX512F-NEXT: shlq $32, %rdx
-; AVX512F-NEXT: orq %rcx, %rdx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %eax, %edx
-; AVX512VL-NEXT: shlq $32, %rdx
-; AVX512VL-NEXT: orq %rcx, %rdx
-; AVX512VL-NEXT: vmovq %rdx, %xmm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_4f32_to_4i16:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %eax, %edx
+; ALL-NEXT: shlq $32, %rdx
+; ALL-NEXT: orq %rcx, %rdx
+; ALL-NEXT: vmovq %rdx, %xmm0
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
@@ -2814,7 +2099,7 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; AVX1-LABEL: cvt_4f32_to_8i16_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm1, %eax
@@ -2839,7 +2124,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f32_to_8i16_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT: vmovd %xmm1, %eax
@@ -2864,34 +2149,32 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovd %xmm1, %eax
@@ -2923,7 +2206,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; AVX1-LABEL: cvt_4f32_to_8i16_zero:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm1, %eax
@@ -2948,7 +2231,7 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f32_to_8i16_zero:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT: vmovd %xmm1, %eax
@@ -2973,34 +2256,32 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovd %xmm1, %eax
@@ -3033,194 +2314,52 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
}
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
-; AVX1-LABEL: cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: shll $16, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: movzwl %cx, %ecx
-; AVX1-NEXT: orl %eax, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: shll $16, %edx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: movzwl %ax, %eax
-; AVX1-NEXT: orl %edx, %eax
-; AVX1-NEXT: shlq $32, %rax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: movzwl %dx, %edx
-; AVX1-NEXT: orl %ecx, %edx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: shll $16, %ecx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movzwl %si, %esi
-; AVX1-NEXT: orl %ecx, %esi
-; AVX1-NEXT: shlq $32, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
-; AVX1-NEXT: vmovq %rsi, %xmm0
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: shll $16, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: movzwl %cx, %ecx
-; AVX2-NEXT: orl %eax, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: shll $16, %edx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: movzwl %ax, %eax
-; AVX2-NEXT: orl %edx, %eax
-; AVX2-NEXT: shlq $32, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: movzwl %dx, %edx
-; AVX2-NEXT: orl %ecx, %edx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: shll $16, %ecx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movzwl %si, %esi
-; AVX2-NEXT: orl %ecx, %esi
-; AVX2-NEXT: shlq $32, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: vmovq %rsi, %xmm0
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
-; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %edx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: shll $16, %eax
-; AVX512F-NEXT: orl %edx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %rcx, %rax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: shll $16, %edx
-; AVX512F-NEXT: orl %ecx, %edx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: movzwl %cx, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: shll $16, %esi
-; AVX512F-NEXT: orl %ecx, %esi
-; AVX512F-NEXT: shlq $32, %rsi
-; AVX512F-NEXT: orq %rdx, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: shll $16, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: movzwl %cx, %ecx
-; AVX512VL-NEXT: orl %eax, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: shll $16, %edx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %edx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %rcx, %rax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: movzwl %dx, %edx
-; AVX512VL-NEXT: orl %ecx, %edx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: shll $16, %ecx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movzwl %si, %esi
-; AVX512VL-NEXT: orl %ecx, %esi
-; AVX512VL-NEXT: shlq $32, %rsi
-; AVX512VL-NEXT: orq %rdx, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: cvt_8f32_to_8i16:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: shll $16, %eax
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: movzwl %cx, %ecx
+; ALL-NEXT: orl %eax, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: shll $16, %edx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: movzwl %ax, %eax
+; ALL-NEXT: orl %edx, %eax
+; ALL-NEXT: shlq $32, %rax
+; ALL-NEXT: orq %rcx, %rax
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: movzwl %dx, %edx
+; ALL-NEXT: orl %ecx, %edx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: shll $16, %ecx
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movzwl %si, %esi
+; ALL-NEXT: orl %ecx, %esi
+; ALL-NEXT: shlq $32, %rsi
+; ALL-NEXT: orq %rdx, %rsi
+; ALL-NEXT: vmovq %rsi, %xmm0
+; ALL-NEXT: vmovq %rax, %xmm1
+; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
@@ -3228,7 +2367,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX1-LABEL: cvt_16f32_to_16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm2
; AVX1-NEXT: vmovd %xmm2, %eax
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
@@ -3295,7 +2434,7 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16f32_to_16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm2
; AVX2-NEXT: vmovd %xmm2, %eax
; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
@@ -3361,141 +2500,73 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm1
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %eax, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm1
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %eax, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_16f32_to_16i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm2
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm3, %xmm2
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vmovd %eax, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $4, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $5, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrw $6, %eax, %xmm3, %xmm1
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
ret <16 x i16> %2
@@ -3506,35 +2577,12 @@ define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
;
define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
-; AVX1-LABEL: store_cvt_f32_to_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: movw %ax, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_f32_to_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: movw %ax, (%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_f32_to_i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_f32_to_i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_f32_to_i16:
+; ALL: # %bb.0:
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %eax
+; ALL-NEXT: movw %ax, (%rdi)
+; ALL-NEXT: retq
%1 = fptrunc float %a0 to half
%2 = bitcast half %1 to i16
store i16 %2, i16* %a1
@@ -3542,83 +2590,24 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
}
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_4f32_to_4i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %eax
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, (%rdi)
-; AVX1-NEXT: movw %dx, 6(%rdi)
-; AVX1-NEXT: movw %cx, 4(%rdi)
-; AVX1-NEXT: movw %ax, 2(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_4f32_to_4i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %eax
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, (%rdi)
-; AVX2-NEXT: movw %dx, 6(%rdi)
-; AVX2-NEXT: movw %cx, 4(%rdi)
-; AVX2-NEXT: movw %ax, 2(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_4f32_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, (%rdi)
-; AVX512F-NEXT: movw %dx, 6(%rdi)
-; AVX512F-NEXT: movw %cx, 4(%rdi)
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %ecx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, (%rdi)
-; AVX512VL-NEXT: movw %dx, 6(%rdi)
-; AVX512VL-NEXT: movw %cx, 4(%rdi)
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_4f32_to_4i16:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %eax
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %ecx
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, (%rdi)
+; ALL-NEXT: movw %dx, 6(%rdi)
+; ALL-NEXT: movw %cx, 4(%rdi)
+; ALL-NEXT: movw %ax, 2(%rdi)
+; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
@@ -3627,7 +2616,7 @@ define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f32_to_8i16_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm1, %eax
@@ -3653,7 +2642,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f32_to_8i16_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT: vmovd %xmm1, %eax
@@ -3679,35 +2668,33 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_undef:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovd %xmm1, %eax
@@ -3741,7 +2728,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm1, %eax
@@ -3767,7 +2754,7 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX2-NEXT: vmovd %xmm1, %eax
@@ -3793,35 +2780,33 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %ecx
-; AVX512F-NEXT: shll $16, %ecx
+; AVX512F-NEXT: movzwl %cx, %ecx
; AVX512F-NEXT: orl %eax, %ecx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: shll $16, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: shll $16, %edx
+; AVX512F-NEXT: movzwl %dx, %edx
; AVX512F-NEXT: orl %eax, %edx
; AVX512F-NEXT: shlq $32, %rdx
; AVX512F-NEXT: orq %rcx, %rdx
; AVX512F-NEXT: vmovq %rdx, %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
; AVX512VL-NEXT: vmovd %xmm1, %eax
@@ -3856,150 +2841,41 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
}
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
-; AVX1-LABEL: store_cvt_8f32_to_8i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r8d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r9d
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %r10d
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r11d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %eax
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %ecx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edx
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %esi
-; AVX1-NEXT: movw %si, 8(%rdi)
-; AVX1-NEXT: movw %dx, (%rdi)
-; AVX1-NEXT: movw %cx, 14(%rdi)
-; AVX1-NEXT: movw %ax, 12(%rdi)
-; AVX1-NEXT: movw %r11w, 10(%rdi)
-; AVX1-NEXT: movw %r10w, 6(%rdi)
-; AVX1-NEXT: movw %r9w, 4(%rdi)
-; AVX1-NEXT: movw %r8w, 2(%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_cvt_8f32_to_8i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r8d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r9d
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %r10d
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r11d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %eax
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %ecx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edx
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %esi
-; AVX2-NEXT: movw %si, 8(%rdi)
-; AVX2-NEXT: movw %dx, (%rdi)
-; AVX2-NEXT: movw %cx, 14(%rdi)
-; AVX2-NEXT: movw %ax, 12(%rdi)
-; AVX2-NEXT: movw %r11w, 10(%rdi)
-; AVX2-NEXT: movw %r10w, 6(%rdi)
-; AVX2-NEXT: movw %r9w, 4(%rdi)
-; AVX2-NEXT: movw %r8w, 2(%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_cvt_8f32_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r8d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r9d
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: vmovd %xmm1, %r10d
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %r11d
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vmovd %xmm2, %ecx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %edx
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm0
-; AVX512F-NEXT: vmovd %xmm0, %esi
-; AVX512F-NEXT: movw %si, 8(%rdi)
-; AVX512F-NEXT: movw %dx, (%rdi)
-; AVX512F-NEXT: movw %cx, 14(%rdi)
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: movw %r11w, 10(%rdi)
-; AVX512F-NEXT: movw %r10w, 6(%rdi)
-; AVX512F-NEXT: movw %r9w, 4(%rdi)
-; AVX512F-NEXT: movw %r8w, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r8d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r9d
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovd %xmm1, %r10d
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %r11d
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovd %xmm2, %ecx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %edx
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %esi
-; AVX512VL-NEXT: movw %si, 8(%rdi)
-; AVX512VL-NEXT: movw %dx, (%rdi)
-; AVX512VL-NEXT: movw %cx, 14(%rdi)
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: movw %r11w, 10(%rdi)
-; AVX512VL-NEXT: movw %r10w, 6(%rdi)
-; AVX512VL-NEXT: movw %r9w, 4(%rdi)
-; AVX512VL-NEXT: movw %r8w, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; ALL-LABEL: store_cvt_8f32_to_8i16:
+; ALL: # %bb.0:
+; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r8d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r9d
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; ALL-NEXT: vmovd %xmm1, %r10d
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %r11d
+; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %eax
+; ALL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; ALL-NEXT: vmovd %xmm2, %ecx
+; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; ALL-NEXT: vmovd %xmm0, %edx
+; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm0
+; ALL-NEXT: vmovd %xmm0, %esi
+; ALL-NEXT: movw %si, 8(%rdi)
+; ALL-NEXT: movw %dx, (%rdi)
+; ALL-NEXT: movw %cx, 14(%rdi)
+; ALL-NEXT: movw %ax, 12(%rdi)
+; ALL-NEXT: movw %r11w, 10(%rdi)
+; ALL-NEXT: movw %r10w, 6(%rdi)
+; ALL-NEXT: movw %r9w, 4(%rdi)
+; ALL-NEXT: movw %r8w, 2(%rdi)
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
@@ -4008,7 +2884,7 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm4
@@ -4075,7 +2951,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm4
@@ -4141,141 +3017,73 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_16f32_to_16i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4
-; AVX512F-NEXT: movw %ax, 24(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
-; AVX512F-NEXT: movw %ax, 16(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
-; AVX512F-NEXT: movw %ax, 8(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, (%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, 30(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 28(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 26(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 22(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 20(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 18(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: movw %ax, 14(%rdi)
-; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: movw %ax, 10(%rdi)
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, 6(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: movw %ax, 4(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: movw %ax, 2(%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
-; AVX512VL-NEXT: movw %ax, 24(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4
-; AVX512VL-NEXT: movw %ax, 16(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
-; AVX512VL-NEXT: movw %ax, 8(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, (%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, 30(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 28(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 26(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 22(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 20(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 18(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: movw %ax, 14(%rdi)
-; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: movw %ax, 10(%rdi)
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, 6(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: movw %ax, 4(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_16f32_to_16i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm4
+; AVX512-NEXT: movw %ax, 24(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm4
+; AVX512-NEXT: movw %ax, 16(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512-NEXT: movw %ax, 8(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, (%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: movw %ax, 30(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 28(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 26(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: movw %ax, 22(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
+; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 20(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: movw %ax, 18(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
+; AVX512-NEXT: movw %ax, 14(%rdi)
+; AVX512-NEXT: vmovd %xmm1, %eax
+; AVX512-NEXT: movw %ax, 12(%rdi)
+; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: movw %ax, 10(%rdi)
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: movw %ax, 6(%rdi)
+; AVX512-NEXT: vmovd %xmm3, %eax
+; AVX512-NEXT: movw %ax, 4(%rdi)
+; AVX512-NEXT: vmovd %xmm4, %eax
+; AVX512-NEXT: movw %ax, 2(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
store <16 x i16> %2, <16 x i16>* %a1
@@ -4288,7 +3096,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
define i16 @cvt_f64_to_i16(double %a0) nounwind {
; ALL-LABEL: cvt_f64_to_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: jmp __truncdfhf2 # TAILCALL
%1 = fptrunc double %a0 to half
%2 = bitcast half %1 to i16
@@ -4297,13 +3105,13 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind {
define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
; ALL-LABEL: cvt_2f64_to_2i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $16, %rsp
; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; ALL-NEXT: callq __truncdfhf2
-; ALL-NEXT: movw %ax, %bx
+; ALL-NEXT: movl %eax, %ebx
; ALL-NEXT: shll $16, %ebx
; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; ALL-NEXT: callq __truncdfhf2
@@ -4320,7 +3128,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind {
define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_4i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $40, %rsp
@@ -4328,10 +3136,10 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %r14d
@@ -4342,7 +3150,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -4357,7 +3165,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_4i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $40, %rsp
@@ -4365,10 +3173,10 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r14d
@@ -4379,7 +3187,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -4393,79 +3201,42 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX2-NEXT: popq %r14
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_4f64_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $40, %rsp
-; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %r14d
-; AVX512F-NEXT: orl %ebx, %r14d
-; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: orl %ebx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %r14, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: addq $40, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_4f64_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $40, %rsp
-; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %r14d
-; AVX512VL-NEXT: orl %ebx, %r14d
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %ebx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %r14, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: addq $40, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_4f64_to_4i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $40, %rsp
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r14, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: addq $40, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
ret <4 x i16> %2
@@ -4473,7 +3244,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $40, %rsp
@@ -4481,10 +3252,10 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %r14d
@@ -4495,7 +3266,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -4511,7 +3282,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $40, %rsp
@@ -4519,10 +3290,10 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r14d
@@ -4533,7 +3304,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -4549,7 +3320,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4f64_to_8i16_undef:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: subq $40, %rsp
@@ -4557,10 +3328,10 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
@@ -4571,7 +3342,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
@@ -4587,7 +3358,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f64_to_8i16_undef:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: subq $40, %rsp
@@ -4595,21 +3366,21 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
@@ -4632,7 +3403,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-LABEL: cvt_4f64_to_8i16_zero:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: subq $40, %rsp
@@ -4640,10 +3411,10 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %r14d
@@ -4654,7 +3425,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -4670,7 +3441,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_4f64_to_8i16_zero:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $40, %rsp
@@ -4678,10 +3449,10 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r14d
@@ -4692,7 +3463,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -4708,7 +3479,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: cvt_4f64_to_8i16_zero:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: subq $40, %rsp
@@ -4716,10 +3487,10 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %r14d
@@ -4730,7 +3501,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
+; AVX512F-NEXT: movl %eax, %ebx
; AVX512F-NEXT: shll $16, %ebx
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
@@ -4746,7 +3517,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: cvt_4f64_to_8i16_zero:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: subq $40, %rsp
@@ -4754,21 +3525,21 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %r14d
; AVX512VL-NEXT: orl %ebx, %r14d
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
+; AVX512VL-NEXT: movl %eax, %ebx
; AVX512VL-NEXT: shll $16, %ebx
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
@@ -4793,7 +3564,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-LABEL: cvt_8f64_to_8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
@@ -4803,10 +3574,10 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %r15d
@@ -4817,7 +3588,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -4828,10 +3599,10 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: # xmm0 = mem[1,0]
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %r15d
@@ -4842,7 +3613,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bx
+; AVX1-NEXT: movl %eax, %ebx
; AVX1-NEXT: shll $16, %ebx
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -4860,7 +3631,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_8f64_to_8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
@@ -4870,10 +3641,10 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r15d
@@ -4884,7 +3655,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -4895,10 +3666,10 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,0]
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %r15d
@@ -4909,7 +3680,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bx
+; AVX2-NEXT: movl %eax, %ebx
; AVX2-NEXT: shll $16, %ebx
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -4926,143 +3697,74 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: cvt_8f64_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $96, %rsp
-; AVX512F-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %r15d
-; AVX512F-NEXT: orl %ebx, %r15d
-; AVX512F-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %r14d
-; AVX512F-NEXT: orl %ebx, %r14d
-; AVX512F-NEXT: shlq $32, %r14
-; AVX512F-NEXT: orq %r15, %r14
-; AVX512F-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %r15d
-; AVX512F-NEXT: orl %ebx, %r15d
-; AVX512F-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bx
-; AVX512F-NEXT: shll $16, %ebx
-; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movzwl %ax, %eax
-; AVX512F-NEXT: orl %ebx, %eax
-; AVX512F-NEXT: shlq $32, %rax
-; AVX512F-NEXT: orq %r15, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vmovq %r14, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: addq $96, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: cvt_8f64_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $96, %rsp
-; AVX512VL-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %r15d
-; AVX512VL-NEXT: orl %ebx, %r15d
-; AVX512VL-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %r14d
-; AVX512VL-NEXT: orl %ebx, %r14d
-; AVX512VL-NEXT: shlq $32, %r14
-; AVX512VL-NEXT: orq %r15, %r14
-; AVX512VL-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512VL-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %r15d
-; AVX512VL-NEXT: orl %ebx, %r15d
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bx
-; AVX512VL-NEXT: shll $16, %ebx
-; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movzwl %ax, %eax
-; AVX512VL-NEXT: orl %ebx, %eax
-; AVX512VL-NEXT: shlq $32, %rax
-; AVX512VL-NEXT: orq %r15, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vmovq %r14, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: addq $96, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: cvt_8f64_to_8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $96, %rsp
+; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r15d
+; AVX512-NEXT: orl %ebx, %r15d
+; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r14d
+; AVX512-NEXT: orl %ebx, %r14d
+; AVX512-NEXT: shlq $32, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %r15d
+; AVX512-NEXT: orl %ebx, %r15d
+; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: shll $16, %ebx
+; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movzwl %ax, %eax
+; AVX512-NEXT: orl %ebx, %eax
+; AVX512-NEXT: shlq $32, %rax
+; AVX512-NEXT: orq %r15, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovq %r14, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512-NEXT: addq $96, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
ret <8 x i16> %2
@@ -5074,7 +3776,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
; ALL-LABEL: store_cvt_f64_to_i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rbx
; ALL-NEXT: movq %rdi, %rbx
; ALL-NEXT: callq __truncdfhf2
@@ -5089,7 +3791,7 @@ define void @store_cvt_f64_to_i16(double %a0, i16* %a1) nounwind {
define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_2f64_to_2i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: pushq %rbx
; ALL-NEXT: subq $24, %rsp
@@ -5114,7 +3816,7 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind {
define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_4i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
@@ -5134,7 +3836,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
@@ -5152,7 +3854,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_4i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
@@ -5172,7 +3874,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
@@ -5189,81 +3891,43 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_4f64_to_4i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $88, %rsp
-; AVX512F-NEXT: movq %rdi, %rbx
-; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %r14d
-; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %r15d
-; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebp
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, 4(%rbx)
-; AVX512F-NEXT: movw %bp, (%rbx)
-; AVX512F-NEXT: movw %r15w, 6(%rbx)
-; AVX512F-NEXT: movw %r14w, 2(%rbx)
-; AVX512F-NEXT: addq $88, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_4f64_to_4i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $88, %rsp
-; AVX512VL-NEXT: movq %rdi, %rbx
-; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %r14d
-; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %r15d
-; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebp
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, 4(%rbx)
-; AVX512VL-NEXT: movw %bp, (%rbx)
-; AVX512VL-NEXT: movw %r15w, 6(%rbx)
-; AVX512VL-NEXT: movw %r14w, 2(%rbx)
-; AVX512VL-NEXT: addq $88, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_4f64_to_4i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $88, %rsp
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, 4(%rbx)
+; AVX512-NEXT: movw %bp, (%rbx)
+; AVX512-NEXT: movw %r15w, 6(%rbx)
+; AVX512-NEXT: movw %r14w, 2(%rbx)
+; AVX512-NEXT: addq $88, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = fptrunc <4 x double> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
store <4 x i16> %2, <4 x i16>* %a1
@@ -5272,7 +3936,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_8i16_undef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
@@ -5282,10 +3946,10 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %ebx
@@ -5296,7 +3960,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -5314,7 +3978,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_8i16_undef:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
@@ -5324,10 +3988,10 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %ebx
@@ -5338,7 +4002,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -5356,7 +4020,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_cvt_4f64_to_8i16_undef:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
@@ -5366,10 +4030,10 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %ebx
@@ -5380,7 +4044,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
@@ -5398,7 +4062,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f64_to_8i16_undef:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbp
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
@@ -5408,21 +4072,21 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5448,7 +4112,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %rbx
@@ -5458,10 +4122,10 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movzwl %ax, %ebx
@@ -5472,7 +4136,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
-; AVX1-NEXT: movw %ax, %bp
+; AVX1-NEXT: movl %eax, %ebp
; AVX1-NEXT: shll $16, %ebp
; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT: callq __truncdfhf2
@@ -5490,7 +4154,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
@@ -5500,10 +4164,10 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movzwl %ax, %ebx
@@ -5514,7 +4178,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
-; AVX2-NEXT: movw %ax, %bp
+; AVX2-NEXT: movl %eax, %ebp
; AVX2-NEXT: shll $16, %ebp
; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX2-NEXT: callq __truncdfhf2
@@ -5532,7 +4196,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %rbx
@@ -5542,10 +4206,10 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
; AVX512F-NEXT: movzwl %ax, %ebx
@@ -5556,7 +4220,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, %bp
+; AVX512F-NEXT: movl %eax, %ebp
; AVX512F-NEXT: shll $16, %ebp
; AVX512F-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512F-NEXT: callq __truncdfhf2
@@ -5574,7 +4238,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: pushq %rbp
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %rbx
@@ -5584,21 +4248,21 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
; AVX512VL-NEXT: movzwl %ax, %ebx
; AVX512VL-NEXT: orl %ebp, %ebx
-; AVX512VL-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX512VL-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, %bp
+; AVX512VL-NEXT: movl %eax, %ebp
; AVX512VL-NEXT: shll $16, %ebp
; AVX512VL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX512VL-NEXT: callq __truncdfhf2
@@ -5626,7 +4290,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_8f64_to_8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
@@ -5660,7 +4324,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r13d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %ebp
@@ -5668,7 +4332,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r14d
; AVX1-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: callq __truncdfhf2
; AVX1-NEXT: movl %eax, %r15d
@@ -5694,7 +4358,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_8f64_to_8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
@@ -5728,7 +4392,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r13d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %ebp
@@ -5736,7 +4400,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r14d
; AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2
; AVX2-NEXT: movl %eax, %r15d
@@ -5761,145 +4425,75 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: store_cvt_8f64_to_8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $200, %rsp
-; AVX512F-NEXT: movq %rdi, %rbx
-; AVX512F-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %r12d
-; AVX512F-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %r13d
-; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %ebp
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %r14d
-; AVX512F-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movl %eax, %r15d
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512F-NEXT: callq __truncdfhf2
-; AVX512F-NEXT: movw %ax, 12(%rbx)
-; AVX512F-NEXT: movw %r15w, 8(%rbx)
-; AVX512F-NEXT: movw %r14w, 4(%rbx)
-; AVX512F-NEXT: movw %bp, (%rbx)
-; AVX512F-NEXT: movw %r13w, 14(%rbx)
-; AVX512F-NEXT: movw %r12w, 10(%rbx)
-; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
-; AVX512F-NEXT: movw %ax, 6(%rbx)
-; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
-; AVX512F-NEXT: movw %ax, 2(%rbx)
-; AVX512F-NEXT: addq $200, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_cvt_8f64_to_8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $200, %rsp
-; AVX512VL-NEXT: movq %rdi, %rbx
-; AVX512VL-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512VL-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512VL-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %r12d
-; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %r13d
-; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %ebp
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %r14d
-; AVX512VL-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
-; AVX512VL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movl %eax, %r15d
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; AVX512VL-NEXT: callq __truncdfhf2
-; AVX512VL-NEXT: movw %ax, 12(%rbx)
-; AVX512VL-NEXT: movw %r15w, 8(%rbx)
-; AVX512VL-NEXT: movw %r14w, 4(%rbx)
-; AVX512VL-NEXT: movw %bp, (%rbx)
-; AVX512VL-NEXT: movw %r13w, 14(%rbx)
-; AVX512VL-NEXT: movw %r12w, 10(%rbx)
-; AVX512VL-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
-; AVX512VL-NEXT: movw %ax, 6(%rbx)
-; AVX512VL-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
-; AVX512VL-NEXT: movw %ax, 2(%rbx)
-; AVX512VL-NEXT: addq $200, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: store_cvt_8f64_to_8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $200, %rsp
+; AVX512-NEXT: movq %rdi, %rbx
+; AVX512-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r12d
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r13d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, 12(%rbx)
+; AVX512-NEXT: movw %r15w, 8(%rbx)
+; AVX512-NEXT: movw %r14w, 4(%rbx)
+; AVX512-NEXT: movw %bp, (%rbx)
+; AVX512-NEXT: movw %r13w, 14(%rbx)
+; AVX512-NEXT: movw %r12w, 10(%rbx)
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX512-NEXT: movw %ax, 6(%rbx)
+; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
+; AVX512-NEXT: movw %ax, 2(%rbx)
+; AVX512-NEXT: addq $200, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: retq
%1 = fptrunc <8 x double> %a0 to <8 x half>
%2 = bitcast <8 x half> %1 to <8 x i16>
store <8 x i16> %2, <8 x i16>* %a1
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 87cf2026d1ef..61787fc19dfa 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW
;
; sdiv by 7
@@ -10,7 +11,7 @@
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT: imulq %rcx
@@ -32,7 +33,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rax
; SSE41-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT: imulq %rcx
@@ -52,7 +53,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rax
; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT: imulq %rcx
@@ -76,7 +77,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
@@ -102,7 +103,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -119,7 +120,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -134,7 +135,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -153,7 +154,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmulhw {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrlw $15, %xmm1
@@ -162,7 +163,7 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
@@ -174,7 +175,7 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
@@ -201,7 +202,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
@@ -225,7 +226,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
@@ -246,24 +247,42 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_div7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_16i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <16 x i8> %res
}
@@ -274,7 +293,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT: movq %rcx, %rax
@@ -304,7 +323,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
; SSE41-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT: movq %rcx, %rax
@@ -332,7 +351,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem7_2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT: movq %rcx, %rax
@@ -364,7 +383,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
@@ -397,7 +416,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -415,7 +434,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -432,7 +451,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -454,7 +473,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT: pmulhw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm2
@@ -466,7 +485,7 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT: vpsraw $1, %xmm1, %xmm1
@@ -480,7 +499,7 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
@@ -519,7 +538,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; SSE41-NEXT: pmullw %xmm2, %xmm1
@@ -553,7 +572,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
@@ -585,32 +604,54 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_rem7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsubb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_16i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <16 x i8> %res
}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index ce0ec6c3875a..a9d5976ee7d5 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
;
; sdiv by 7
@@ -8,7 +9,7 @@
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rax
; AVX1-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
@@ -45,7 +46,7 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rax
; AVX2-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
@@ -86,7 +87,7 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
@@ -114,7 +115,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
@@ -133,7 +134,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm1
@@ -148,7 +149,7 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm1
; AVX2-NEXT: vpsraw $1, %ymm0, %ymm0
@@ -160,7 +161,7 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmovsxbw %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
@@ -201,32 +202,46 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_div7_32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
-; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
-; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_32i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
+; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: retq
%res = sdiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
}
@@ -237,7 +252,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
@@ -290,7 +305,7 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
@@ -347,7 +362,7 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
@@ -380,7 +395,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
@@ -402,7 +417,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18725,18725,18725,18725,18725,18725,18725,18725]
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm3
@@ -422,7 +437,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $15, %ymm1, %ymm2
; AVX2-NEXT: vpsraw $1, %ymm1, %ymm1
@@ -436,7 +451,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65427,65427,65427,65427,65427,65427,65427,65427]
@@ -497,49 +512,67 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_rem7_32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
-; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3
-; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsubb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_32i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm3
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
+; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsubb %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: retq
%res = srem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
}
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index c954daa33630..d01c79f4c05f 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -8,7 +8,7 @@
define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT: vpextrq $1, %xmm1, %rax
; AVX-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
@@ -43,7 +43,7 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rax
; AVX-NEXT: imulq %rcx
; AVX-NEXT: movq %rdx, %rax
@@ -83,173 +83,18 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_div7_16i32:
-; AVX: # BB#0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrd $1, %xmm1, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm2
-; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $2, %xmm1, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $3, %xmm1, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: shrl $31, %ecx
-; AVX-NEXT: sarl $2, %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
+; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
+; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
+; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; AVX-NEXT: vpsrld $31, %zmm0, %zmm1
+; AVX-NEXT: vpsrad $2, %zmm0, %zmm0
+; AVX-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = sdiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -257,7 +102,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm3
@@ -270,7 +115,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $15, %zmm0, %zmm1
; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm0
@@ -282,764 +127,71 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
-; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm0
-; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpxor %ymm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
-; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
-; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm7
+; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm2
-; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %ymm6, %ymm1, %ymm1
+; AVX512F-NEXT: vpsubb %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: movsbl %cl, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movl %ecx, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm2
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: movsbl %cl, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movl %ecx, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: movsbl %cl, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movl %ecx, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BW-NEXT: movsbl %cl, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movl %ecx, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %eax
-; AVX512BW-NEXT: imull $-109, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512BW-NEXT: vpmovsxbw %ymm3, %zmm3
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
%res = sdiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
@@ -1051,7 +203,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_rem7_8i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT: vpextrq $1, %xmm1, %rcx
; AVX-NEXT: movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
@@ -1102,7 +254,7 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: vmovq %rcx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
@@ -1158,221 +310,20 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_rem7_16i32:
-; AVX: # BB#0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrd $1, %xmm1, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: movl %edx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $2, %edx
-; AVX-NEXT: addl %esi, %edx
-; AVX-NEXT: leal (,%rdx,8), %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: subl %esi, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm2
-; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $2, %xmm1, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $3, %xmm1, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: movl %edx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $2, %edx
-; AVX-NEXT: addl %esi, %edx
-; AVX-NEXT: leal (,%rdx,8), %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: subl %esi, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: movl %edx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $2, %edx
-; AVX-NEXT: addl %esi, %edx
-; AVX-NEXT: leal (,%rdx,8), %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: subl %esi, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movslq %ecx, %rcx
-; AVX-NEXT: imulq $-1840700269, %rcx, %rdx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: movl %edx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $2, %edx
-; AVX-NEXT: addl %esi, %edx
-; AVX-NEXT: leal (,%rdx,8), %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: subl %esi, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
-; AVX-NEXT: cltq
-; AVX-NEXT: imulq $-1840700269, %rax, %rcx # imm = 0x92492493
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: addl %eax, %ecx
-; AVX-NEXT: movl %ecx, %edx
-; AVX-NEXT: shrl $31, %edx
-; AVX-NEXT: sarl $2, %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: leal (,%rcx,8), %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: subl %edx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX-NEXT: vpmuldq %zmm1, %zmm0, %zmm2
+; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpmuldq %zmm1, %zmm3, %zmm1
+; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
+; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; AVX-NEXT: vpaddd %zmm0, %zmm3, %zmm1
+; AVX-NEXT: vpsrld $31, %zmm1, %zmm2
+; AVX-NEXT: vpsrad $2, %zmm1, %zmm1
+; AVX-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; AVX-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = srem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -1380,7 +331,7 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725]
; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4
@@ -1398,7 +349,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $15, %zmm1, %zmm2
; AVX512BW-NEXT: vpsraw $1, %zmm1, %zmm1
@@ -1412,980 +363,104 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147,147]
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
-; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vpmovsxbw %xmm4, %ymm4
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
+; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm4
; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
-; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm0, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $7, %ymm4, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT: vpand %ymm10, %ymm6, %ymm8
-; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpxor %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpsubb %ymm7, %ymm4, %ymm4
-; AVX512F-NEXT: vpaddb %ymm8, %ymm4, %ymm8
-; AVX512F-NEXT: vpmovsxbw %xmm8, %ymm9
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpmullw %ymm4, %ymm9, %ymm9
-; AVX512F-NEXT: vpmovsxwd %ymm9, %zmm9
-; AVX512F-NEXT: vpmovdb %zmm9, %xmm9
-; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm5
-; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
-; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm5
-; AVX512F-NEXT: vpmovsxwd %ymm5, %zmm5
-; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm9, %ymm5
-; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX512F-NEXT: vpmovsxbw %xmm5, %ymm5
-; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm0, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm7
+; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpxor %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpaddb %ymm7, %ymm3, %ymm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
+; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
+; AVX512F-NEXT: vpmovdb %zmm8, %xmm8
+; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
+; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
+; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
+; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
+; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
+; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm7
+; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm7
+; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm8
+; AVX512F-NEXT: vpmullw %ymm2, %ymm8, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm5
-; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
-; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm3[2,3],ymm2[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm2[2,3],ymm7[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm8, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm3
-; AVX512F-NEXT: vpand %ymm10, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm7
+; AVX512F-NEXT: vpand %ymm4, %ymm7, %ymm4
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm3
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovsxwd %ymm3, %zmm3
-; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
+; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
+; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
+; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %edx
-; AVX512BW-NEXT: imull $-109, %edx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movb $7, %dil
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %esi
-; AVX512BW-NEXT: imull $-109, %esi, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %sil
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %edx
-; AVX512BW-NEXT: imull $-109, %edx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %edx
-; AVX512BW-NEXT: imull $-109, %edx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %esi
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %edx
-; AVX512BW-NEXT: imull $-109, %edx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrb $7, %cl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %esi, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: movsbl %al, %ecx
-; AVX512BW-NEXT: imull $-109, %ecx, %eax
-; AVX512BW-NEXT: shrl $8, %eax
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: movl %eax, %edx
-; AVX512BW-NEXT: shrb $7, %dl
-; AVX512BW-NEXT: sarb $2, %al
-; AVX512BW-NEXT: addb %dl, %al
-; AVX512BW-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; AVX512BW-NEXT: mulb %dil
-; AVX512BW-NEXT: subb %al, %cl
-; AVX512BW-NEXT: movzbl %cl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427,65427]
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512BW-NEXT: vpmovsxbw %ymm3, %zmm3
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsubb %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = srem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 8138442b3eaf..9788cc037d41 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -2,7 +2,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW
;
; udiv by 7
@@ -10,7 +11,7 @@
define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT: movq %rcx, %rax
@@ -34,7 +35,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT: movq %rcx, %rax
@@ -56,7 +57,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test_div7_2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT: movq %rcx, %rax
@@ -82,7 +83,7 @@ define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
@@ -99,7 +100,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -114,7 +115,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -129,7 +130,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -148,7 +149,7 @@ define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: psubw %xmm1, %xmm0
@@ -158,7 +159,7 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_div7_8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
@@ -171,7 +172,7 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_div7_16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -192,7 +193,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_div7_16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; SSE41-NEXT: pmullw %xmm2, %xmm1
@@ -211,7 +212,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
@@ -229,21 +230,36 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_div7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_16i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%res = udiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <16 x i8> %res
}
@@ -254,7 +270,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE2-NEXT: movq %rcx, %rax
@@ -286,7 +302,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pextrq $1, %xmm0, %rcx
; SSE41-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; SSE41-NEXT: movq %rcx, %rax
@@ -316,7 +332,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: test_rem7_2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
; AVX-NEXT: movq %rcx, %rax
@@ -350,7 +366,7 @@ define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
@@ -376,7 +392,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -394,7 +410,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -411,7 +427,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -433,7 +449,7 @@ define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; SSE-NEXT: pmulhuw %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
@@ -446,7 +462,7 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: test_rem7_8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlw $1, %xmm2, %xmm2
@@ -461,7 +477,7 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_rem7_16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -496,7 +512,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test_rem7_16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; SSE41-NEXT: pmullw %xmm2, %xmm1
@@ -527,7 +543,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
@@ -556,29 +572,48 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_rem7_16i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_16i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vzeroupper
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%res = urem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <16 x i8> %res
}
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
index b0433110f181..602f050935d6 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX2NOBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX2 --check-prefix=AVX512BW
;
; udiv by 7
@@ -8,7 +9,7 @@
define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_div7_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
@@ -49,7 +50,7 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
@@ -94,7 +95,7 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_div7_8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -122,7 +123,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
@@ -141,7 +142,7 @@ define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_div7_16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm3
@@ -158,7 +159,7 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_div7_16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
@@ -171,7 +172,7 @@ define <16 x i16> @test_div7_16i16(<16 x i16> %a) nounwind {
define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_div7_32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
@@ -207,29 +208,40 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_div7_32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_div7_32i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
+; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_div7_32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: retq
%res = udiv <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
}
@@ -240,7 +252,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: test_rem7_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpextrq $1, %xmm1, %rcx
; AVX1-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
@@ -297,7 +309,7 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
; AVX2-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
@@ -358,7 +370,7 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: test_rem7_8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
@@ -391,7 +403,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
@@ -413,7 +425,7 @@ define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: test_rem7_16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9363,9363,9363,9363,9363,9363,9363,9363]
; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3
@@ -435,7 +447,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_rem7_16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmulhuw {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
@@ -450,7 +462,7 @@ define <16 x i16> @test_rem7_16i16(<16 x i16> %a) nounwind {
define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: test_rem7_32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
@@ -506,46 +518,61 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: test_rem7_32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlw $2, %ymm1, %ymm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
-; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2NOBW-LABEL: test_rem7_32i8:
+; AVX2NOBW: # %bb.0:
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX2NOBW-NEXT: vpmullw %ymm2, %ymm3, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm2[2,3],ymm1[2,3]
+; AVX2NOBW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
+; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2NOBW-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; AVX2NOBW-NEXT: vpmovsxbw %xmm1, %ymm1
+; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2NOBW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX2NOBW-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX2NOBW-NEXT: retq
+;
+; AVX512BW-LABEL: test_rem7_32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: retq
%res = urem <32 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <32 x i8> %res
}
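
The rem7 variants above extend the same pattern: after forming the quotient, the generated code multiplies it back by 7 in widened i16 lanes, truncates the result (vpshufb plus vpunpcklqdq without AVX512BW, vpmovwb with it), and subtracts from the input. As before, this is a scalar sketch under the same assumptions, with urem7_u8 as a hypothetical name; it is a model of the checked arithmetic, not the compiler's implementation.

    #include <stdint.h>

    /* Scalar model of the urem-by-7 checks: quotient as in udiv7_u8,
       then x - 7*(x/7). */
    static uint8_t urem7_u8(uint8_t x) {
        uint8_t q = (uint8_t)(((uint16_t)x * 37) >> 8);               /* widen, *37, high byte */
        q = (uint8_t)(((uint8_t)((uint8_t)(x - q) >> 1) + q) >> 2);   /* x / 7 fixup           */
        return (uint8_t)(x - (uint8_t)(q * 7));                       /* *7, truncate, vpsubb  */
    }
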
diff --git a/test/CodeGen/X86/vector-idiv-udiv-512.ll b/test/CodeGen/X86/vector-idiv-udiv-512.ll
index f457bb0d6a42..cd38e37fc254 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -8,7 +8,7 @@
define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_div7_8i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT: vpextrq $1, %xmm1, %rcx
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
@@ -47,7 +47,7 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: vmovq %rcx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -91,141 +91,18 @@ define <8 x i64> @test_div7_8i64(<8 x i64> %a) nounwind {
define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_div7_16i32:
-; AVX: # BB#0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrd $1, %xmm1, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm2
-; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $2, %xmm1, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $3, %xmm1, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: shrl %ecx
-; AVX-NEXT: addl %edx, %ecx
-; AVX-NEXT: shrl $2, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: shrl %eax
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: shrl $2, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
+; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
+; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
+; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm0
+; AVX-NEXT: vpsrld $1, %zmm0, %zmm0
+; AVX-NEXT: vpaddd %zmm3, %zmm0, %zmm0
+; AVX-NEXT: vpsrld $2, %zmm0, %zmm0
; AVX-NEXT: retq
%res = udiv <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -233,7 +110,7 @@ define <16 x i32> @test_div7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_div7_32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm0
@@ -248,7 +125,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
@@ -261,631 +138,63 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_div7_64i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vpmullw %ymm2, %ymm5, %ymm5
-; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm5[2,3],ymm4[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm4, %ymm4
-; AVX512F-NEXT: vpsubb %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm2[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
+; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm5
+; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT: vpmullw %ymm2, %ymm6, %ymm2
-; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm3[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm5[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_div7_64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %ecx
-; AVX512BW-NEXT: imull $37, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: subb %dl, %cl
-; AVX512BW-NEXT: shrb %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: shrb $2, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm2
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: imull $37, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: subb %dl, %cl
-; AVX512BW-NEXT: shrb %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: shrb $2, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %ecx
-; AVX512BW-NEXT: imull $37, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: subb %dl, %cl
-; AVX512BW-NEXT: shrb %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: shrb $2, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %ecx
-; AVX512BW-NEXT: imull $37, %ecx, %edx
-; AVX512BW-NEXT: shrl $8, %edx
-; AVX512BW-NEXT: subb %dl, %cl
-; AVX512BW-NEXT: shrb %cl
-; AVX512BW-NEXT: addb %dl, %cl
-; AVX512BW-NEXT: shrb $2, %cl
-; AVX512BW-NEXT: movzbl %cl, %ecx
-; AVX512BW-NEXT: vmovd %ecx, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %eax
-; AVX512BW-NEXT: imull $37, %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movzbl %al, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = udiv <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
@@ -897,7 +206,7 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-LABEL: test_rem7_8i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
; AVX-NEXT: vpextrq $1, %xmm1, %rcx
; AVX-NEXT: movabsq $2635249153387078803, %rsi # imm = 0x2492492492492493
@@ -952,7 +261,7 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
; AVX-NEXT: vmovq %rcx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
@@ -1012,205 +321,20 @@ define <8 x i64> @test_rem7_8i64(<8 x i64> %a) nounwind {
define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
; AVX-LABEL: test_rem7_16i32:
-; AVX: # BB#0:
-; AVX-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX-NEXT: vpextrd $1, %xmm1, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm1, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: shrl %esi
-; AVX-NEXT: addl %edx, %esi
-; AVX-NEXT: shrl $2, %esi
-; AVX-NEXT: leal (,%rsi,8), %edx
-; AVX-NEXT: subl %esi, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm2
-; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $2, %xmm1, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2
-; AVX-NEXT: vpextrd $3, %xmm1, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1
-; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: shrl %esi
-; AVX-NEXT: addl %edx, %esi
-; AVX-NEXT: shrl $2, %esi
-; AVX-NEXT: leal (,%rsi,8), %edx
-; AVX-NEXT: subl %esi, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm2, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: shrl %esi
-; AVX-NEXT: addl %edx, %esi
-; AVX-NEXT: shrl $2, %esi
-; AVX-NEXT: leal (,%rsi,8), %edx
-; AVX-NEXT: subl %esi, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm2, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2
-; AVX-NEXT: vpextrd $1, %xmm0, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: imulq $613566757, %rcx, %rdx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rdx
-; AVX-NEXT: movl %ecx, %esi
-; AVX-NEXT: subl %edx, %esi
-; AVX-NEXT: shrl %esi
-; AVX-NEXT: addl %edx, %esi
-; AVX-NEXT: shrl $2, %esi
-; AVX-NEXT: leal (,%rsi,8), %edx
-; AVX-NEXT: subl %esi, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm3
-; AVX-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
-; AVX-NEXT: vpextrd $3, %xmm0, %eax
-; AVX-NEXT: imulq $613566757, %rax, %rcx # imm = 0x24924925
-; AVX-NEXT: shrq $32, %rcx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $2, %edx
-; AVX-NEXT: leal (,%rdx,8), %ecx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX: # %bb.0:
+; AVX-NEXT: vpbroadcastd {{.*#+}} zmm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
+; AVX-NEXT: vpshufd {{.*#+}} zmm1 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX-NEXT: vpmuludq %zmm1, %zmm3, %zmm1
+; AVX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
+; AVX-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
+; AVX-NEXT: vpsubd %zmm3, %zmm0, %zmm1
+; AVX-NEXT: vpsrld $1, %zmm1, %zmm1
+; AVX-NEXT: vpaddd %zmm3, %zmm1, %zmm1
+; AVX-NEXT: vpsrld $2, %zmm1, %zmm1
+; AVX-NEXT: vpmulld {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX-NEXT: retq
%res = urem <16 x i32> %a, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
ret <16 x i32> %res
@@ -1218,7 +342,7 @@ define <16 x i32> @test_rem7_16i32(<16 x i32> %a) nounwind {
define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: test_rem7_32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363]
; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpsubw %ymm3, %ymm0, %ymm4
@@ -1238,7 +362,7 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmulhuw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
@@ -1253,57 +377,54 @@ define <32 x i16> @test_rem7_32i16(<32 x i16> %a) nounwind {
define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: test_rem7_64i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
-; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm2
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm5
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm5[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpackuswb %ymm6, %ymm3, %ymm3
-; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm5
-; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpand %ymm5, %ymm6, %ymm6
-; AVX512F-NEXT: vpaddb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm4
+; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm4
+; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm4, %ymm5, %ymm5
+; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7
-; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm8
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6
+; AVX512F-NEXT: vpmovsxbw %xmm6, %ymm7
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512F-NEXT: vpmullw %ymm3, %ymm8, %ymm8
-; AVX512F-NEXT: vpmovsxwd %ymm8, %zmm8
-; AVX512F-NEXT: vpmovdb %zmm8, %xmm8
-; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512F-NEXT: vpmovsxbw %xmm7, %ymm7
; AVX512F-NEXT: vpmullw %ymm3, %ymm7, %ymm7
; AVX512F-NEXT: vpmovsxwd %ymm7, %zmm7
; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
-; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpsubb %ymm7, %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
+; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512F-NEXT: vpmovsxbw %xmm6, %ymm6
+; AVX512F-NEXT: vpmullw %ymm3, %ymm6, %ymm6
+; AVX512F-NEXT: vpmovsxwd %ymm6, %zmm6
+; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm7, %ymm6
+; AVX512F-NEXT: vpsubb %ymm6, %ymm0, %ymm0
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
+; AVX512F-NEXT: vpmullw %ymm2, %ymm6, %ymm6
+; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6
+; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmullw %ymm2, %ymm7, %ymm2
; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT: vpmullw %ymm4, %ymm7, %ymm4
-; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm4
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm2[2,3]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[2,3],ymm6[2,3]
+; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm7, %ymm2, %ymm2
-; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm4
-; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4
-; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm6
+; AVX512F-NEXT: vpsrlw $1, %ymm6, %ymm6
+; AVX512F-NEXT: vpand %ymm4, %ymm6, %ymm4
; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4
; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
@@ -1318,782 +439,34 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_rem7_64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrb $1, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %cl, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %cl, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: movb $7, %cl
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm1, %esi
-; AVX512BW-NEXT: imull $37, %esi, %edi
-; AVX512BW-NEXT: shrl $8, %edi
-; AVX512BW-NEXT: movl %esi, %eax
-; AVX512BW-NEXT: subb %dil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %dil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %sil
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm2
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $2, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $3, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $4, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $5, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $6, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $7, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $8, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $9, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $10, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $11, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $12, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $13, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $14, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpextrb $15, %xmm1, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %esi
-; AVX512BW-NEXT: imull $37, %esi, %edi
-; AVX512BW-NEXT: shrl $8, %edi
-; AVX512BW-NEXT: movl %esi, %eax
-; AVX512BW-NEXT: subb %dil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %dil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %sil
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm2, %esi
-; AVX512BW-NEXT: imull $37, %esi, %edi
-; AVX512BW-NEXT: shrl $8, %edi
-; AVX512BW-NEXT: movl %esi, %eax
-; AVX512BW-NEXT: subb %dil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %dil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %sil
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm2, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm2
-; AVX512BW-NEXT: vpextrb $1, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %edx
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %esi
-; AVX512BW-NEXT: imull $37, %esi, %edi
-; AVX512BW-NEXT: shrl $8, %edi
-; AVX512BW-NEXT: movl %esi, %eax
-; AVX512BW-NEXT: subb %dil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %dil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %sil
-; AVX512BW-NEXT: movzbl %sil, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm3
-; AVX512BW-NEXT: vpinsrb $1, %edx, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $2, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $3, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $4, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $5, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $6, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $7, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $9, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $10, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $11, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $12, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $13, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $14, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
-; AVX512BW-NEXT: vpextrb $15, %xmm0, %edx
-; AVX512BW-NEXT: imull $37, %edx, %esi
-; AVX512BW-NEXT: shrl $8, %esi
-; AVX512BW-NEXT: movl %edx, %eax
-; AVX512BW-NEXT: subb %sil, %al
-; AVX512BW-NEXT: shrb %al
-; AVX512BW-NEXT: addb %sil, %al
-; AVX512BW-NEXT: shrb $2, %al
-; AVX512BW-NEXT: mulb %cl
-; AVX512BW-NEXT: subb %al, %dl
-; AVX512BW-NEXT: movzbl %dl, %eax
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37]
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero,ymm3[16],zero,ymm3[17],zero,ymm3[18],zero,ymm3[19],zero,ymm3[20],zero,ymm3[21],zero,ymm3[22],zero,ymm3[23],zero,ymm3[24],zero,ymm3[25],zero,ymm3[26],zero,ymm3[27],zero,ymm3[28],zero,ymm3[29],zero,ymm3[30],zero,ymm3[31],zero
+; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
+; AVX512BW-NEXT: vpmullw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%res = urem <64 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
ret <64 x i8> %res
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
index c65c3e7fd004..e2f769761e17 100644
--- a/test/CodeGen/X86/vector-idiv.ll
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -6,12 +6,12 @@
define <2 x i16> @test_urem_unary_v2i16() nounwind {
; SSE-LABEL: test_urem_unary_v2i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_urem_unary_v2i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
@@ -22,7 +22,7 @@ define <2 x i16> @test_urem_unary_v2i16() nounwind {
define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
; SSE2-LABEL: PR20355:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
@@ -45,7 +45,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: PR20355:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -59,7 +59,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR20355:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -72,7 +72,7 @@ define <4 x i32> @PR20355(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR20355:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1431655766,1431655766,1431655766,1431655766]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
diff --git a/test/CodeGen/X86/vector-interleave.ll b/test/CodeGen/X86/vector-interleave.ll
index cadb02c6b5f3..04e6ccc00034 100644
--- a/test/CodeGen/X86/vector-interleave.ll
+++ b/test/CodeGen/X86/vector-interleave.ll
@@ -9,7 +9,7 @@
; PR21281
define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d, <8 x i16> %e, <8 x i16> %f, <8 x i16> %h, <8 x i16> %g) {
; SSE-LABEL: interleave8x8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -58,23 +58,23 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave8x8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
@@ -90,7 +90,7 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave8x8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
diff --git a/test/CodeGen/X86/vector-intrinsics.ll b/test/CodeGen/X86/vector-intrinsics.ll
index c140468d3006..436644e9f9cc 100644
--- a/test/CodeGen/X86/vector-intrinsics.ll
+++ b/test/CodeGen/X86/vector-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep call | count 43
+; RUN: llc < %s -mtriple=x86_64-- | grep call | count 43
declare <4 x double> @llvm.sin.v4f64(<4 x double> %p)
declare <4 x double> @llvm.cos.v4f64(<4 x double> %p)
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index e3261d15538f..4abace0d9386 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -7,15 +7,15 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -55,7 +55,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
@@ -95,7 +95,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
@@ -131,7 +131,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
@@ -167,7 +167,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -197,10 +197,10 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv2i64:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
@@ -227,19 +227,20 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv2i64:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv2i64:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
@@ -280,7 +281,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -320,7 +321,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
@@ -360,7 +361,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
@@ -396,7 +397,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
@@ -432,7 +433,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -462,10 +463,10 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv2i64u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
@@ -492,19 +493,20 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv2i64u:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv2i64u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv2i64u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
@@ -545,7 +547,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -587,7 +589,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
@@ -629,7 +631,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
@@ -659,7 +661,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
@@ -689,7 +691,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -714,10 +716,10 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv4i32:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
@@ -739,19 +741,20 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i32:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i32:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
@@ -786,7 +789,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -828,7 +831,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
@@ -870,7 +873,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
@@ -900,7 +903,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
@@ -930,7 +933,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv4i32u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -955,10 +958,10 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv4i32u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
@@ -980,19 +983,20 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i32u:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i32u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv4i32u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
@@ -1027,7 +1031,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -1063,7 +1067,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
@@ -1099,7 +1103,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
@@ -1123,7 +1127,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
@@ -1147,7 +1151,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -1167,10 +1171,10 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv8i16:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
@@ -1187,23 +1191,25 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLCD-NEXT: vzeroupper
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i16:
-; AVX512CD: # BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
@@ -1231,7 +1237,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -1267,7 +1273,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: por %xmm0, %xmm1
@@ -1303,7 +1309,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
@@ -1327,7 +1333,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm1
@@ -1351,7 +1357,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -1371,10 +1377,10 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv8i16u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1
@@ -1391,23 +1397,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i16u:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VLCD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VLCD-NEXT: vzeroupper
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i16u:
-; AVX512CD: # BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
; AVX512CD-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512CD-NEXT: vzeroupper
; AVX512CD-NEXT: retq
;
; X32-SSE-LABEL: testv8i16u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm2, %xmm1
@@ -1435,7 +1443,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@@ -1468,7 +1476,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
@@ -1501,7 +1509,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
@@ -1519,7 +1527,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
@@ -1537,7 +1545,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -1552,10 +1560,10 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv16i8:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1567,15 +1575,16 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512-LABEL: testv16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: testv16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: pand %xmm2, %xmm3
@@ -1597,7 +1606,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@@ -1630,7 +1639,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
@@ -1663,7 +1672,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
@@ -1681,7 +1690,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
@@ -1699,7 +1708,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
@@ -1714,10 +1723,10 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv16i8u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1729,15 +1738,16 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512-LABEL: testv16i8u:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; X32-SSE-LABEL: testv16i8u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: pand %xmm2, %xmm3
@@ -1759,25 +1769,25 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $55, %eax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv2i64:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: movl $55, %eax
; NOBW-NEXT: vmovq %rax, %xmm0
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv2i64:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: movl $55, %eax
; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl $55, %eax
; X32-SSE-NEXT: movd %eax, %xmm0
; X32-SSE-NEXT: retl
@@ -1787,25 +1797,25 @@ define <2 x i64> @foldv2i64() nounwind {
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $55, %eax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv2i64u:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: movl $55, %eax
; NOBW-NEXT: vmovq %rax, %xmm0
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv2i64u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: movl $55, %eax
; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv2i64u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl $55, %eax
; X32-SSE-NEXT: movd %eax, %xmm0
; X32-SSE-NEXT: retl
@@ -1815,22 +1825,22 @@ define <2 x i64> @foldv2i64u() nounwind {
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv4i32:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv4i32:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
@@ -1839,22 +1849,22 @@ define <4 x i32> @foldv4i32() nounwind {
define <4 x i32> @foldv4i32u() nounwind {
; SSE-LABEL: foldv4i32u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv4i32u:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv4i32u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24]
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv4i32u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24]
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
@@ -1863,22 +1873,22 @@ define <4 x i32> @foldv4i32u() nounwind {
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv8i16:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv8i16:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; AVX512VLBWDQ: # %bb.0:
+; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
@@ -1887,22 +1897,22 @@ define <8 x i16> @foldv8i16() nounwind {
define <8 x i16> @foldv8i16u() nounwind {
; SSE-LABEL: foldv8i16u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv8i16u:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv8i16u:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
+; AVX512VLBWDQ: # %bb.0:
+; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv8i16u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9]
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
@@ -1911,22 +1921,22 @@ define <8 x i16> @foldv8i16u() nounwind {
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv16i8:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv16i8:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; AVX512VLBWDQ: # %bb.0:
+; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
@@ -1935,22 +1945,22 @@ define <16 x i8> @foldv16i8() nounwind {
define <16 x i8> @foldv16i8u() nounwind {
; SSE-LABEL: foldv16i8u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; SSE-NEXT: retq
;
; NOBW-LABEL: foldv16i8u:
-; NOBW: # BB#0:
+; NOBW: # %bb.0:
; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; NOBW-NEXT: retq
;
; AVX512VLBWDQ-LABEL: foldv16i8u:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
+; AVX512VLBWDQ: # %bb.0:
+; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; AVX512VLBWDQ-NEXT: retq
;
; X32-SSE-LABEL: foldv16i8u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2]
; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index 185e1f4865ea..73f7b3c2ad8b 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -3,15 +3,15 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD
;
; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1
@@ -66,14 +66,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -96,14 +96,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -126,14 +126,14 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv4i64:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -156,26 +156,26 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv4i64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
-; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -203,7 +203,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm1
@@ -258,14 +258,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -288,14 +288,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv4i64u:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -318,14 +318,14 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv4i64u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -348,26 +348,26 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv4i64u:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv4i64u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
-; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -395,7 +395,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -440,14 +440,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -465,14 +465,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -490,14 +490,14 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv8i32:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -515,26 +515,26 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv8i32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
-; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -557,7 +557,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -602,14 +602,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -627,14 +627,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv8i32u:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -652,14 +652,14 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv8i32u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -677,26 +677,26 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512VLCD-LABEL: testv8i32u:
-; AVX512VLCD: # BB#0:
+; AVX512VLCD: # %bb.0:
; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0
; AVX512VLCD-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
-; AVX512CD-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512CD-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512CD-NEXT: retq
;
; X32-AVX-LABEL: testv8i32u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
-; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -719,7 +719,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -754,14 +754,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -774,14 +774,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -794,14 +794,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv16i16:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -814,7 +814,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512-LABEL: testv16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
@@ -822,14 +822,14 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512-NEXT: retq
;
; X32-AVX-LABEL: testv16i16:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
-; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -846,7 +846,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -881,14 +881,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -901,14 +901,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv16i16u:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -921,14 +921,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv16i16u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -941,7 +941,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512-LABEL: testv16i16u:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: vplzcntd %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
@@ -949,14 +949,14 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512-NEXT: retq
;
; X32-AVX-LABEL: testv16i16u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm4
; X32-AVX-NEXT: vpand %ymm1, %ymm4, %ymm1
-; X32-AVX-NEXT: vpxor %ymm4, %ymm4, %ymm4
+; X32-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; X32-AVX-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5
; X32-AVX-NEXT: vpand %ymm5, %ymm2, %ymm2
; X32-AVX-NEXT: vpshufb %ymm1, %ymm3, %ymm1
@@ -973,7 +973,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -998,14 +998,14 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1013,14 +1013,14 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv32i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1028,14 +1028,14 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv32i8:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1043,7 +1043,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512-LABEL: testv32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm1, %zmm1
@@ -1058,14 +1058,14 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512-NEXT: retq
;
; X32-AVX-LABEL: testv32i8:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1077,7 +1077,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -1102,14 +1102,14 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1117,14 +1117,14 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: testv32i8u:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1132,14 +1132,14 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512VLBWDQ-LABEL: testv32i8u:
-; AVX512VLBWDQ: # BB#0:
+; AVX512VLBWDQ: # %bb.0:
; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1147,7 +1147,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512VLBWDQ-NEXT: retq
;
; AVX512-LABEL: testv32i8u:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vplzcntd %zmm1, %zmm1
@@ -1162,14 +1162,14 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512-NEXT: retq
;
; X32-AVX-LABEL: testv32i8u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2
; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; X32-AVX-NEXT: vpsrlw $4, %ymm0, %ymm0
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm2, %ymm1
; X32-AVX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
@@ -1181,12 +1181,12 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
define <4 x i64> @foldv4i64() nounwind {
; X64-LABEL: foldv4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv4i64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0]
; X32-AVX-NEXT: retl
%out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
@@ -1195,12 +1195,12 @@ define <4 x i64> @foldv4i64() nounwind {
define <4 x i64> @foldv4i64u() nounwind {
; X64-LABEL: foldv4i64u:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56]
; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv4i64u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0]
; X32-AVX-NEXT: retl
%out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
@@ -1209,12 +1209,12 @@ define <4 x i64> @foldv4i64u() nounwind {
define <8 x i32> @foldv8i32() nounwind {
; X64-LABEL: foldv8i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv8i32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; X32-AVX-NEXT: retl
%out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
@@ -1223,12 +1223,12 @@ define <8 x i32> @foldv8i32() nounwind {
define <8 x i32> @foldv8i32u() nounwind {
; X64-LABEL: foldv8i32u:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv8i32u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25]
; X32-AVX-NEXT: retl
%out = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
@@ -1236,18 +1236,13 @@ define <8 x i32> @foldv8i32u() nounwind {
}
define <16 x i16> @foldv16i16() nounwind {
-; NOBW-LABEL: foldv16i16:
-; NOBW: # BB#0:
-; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; NOBW-NEXT: retq
-;
-; AVX512VLBWDQ-LABEL: foldv16i16:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; AVX512VLBWDQ-NEXT: retq
+; X64-LABEL: foldv16i16:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv16i16:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; X32-AVX-NEXT: retl
%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
@@ -1255,18 +1250,13 @@ define <16 x i16> @foldv16i16() nounwind {
}
define <16 x i16> @foldv16i16u() nounwind {
-; NOBW-LABEL: foldv16i16u:
-; NOBW: # BB#0:
-; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; NOBW-NEXT: retq
-;
-; AVX512VLBWDQ-LABEL: foldv16i16u:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
-; AVX512VLBWDQ-NEXT: retq
+; X64-LABEL: foldv16i16u:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv16i16u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10]
; X32-AVX-NEXT: retl
%out = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
@@ -1274,18 +1264,13 @@ define <16 x i16> @foldv16i16u() nounwind {
}
define <32 x i8> @foldv32i8() nounwind {
-; NOBW-LABEL: foldv32i8:
-; NOBW: # BB#0:
-; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; NOBW-NEXT: retq
-;
-; AVX512VLBWDQ-LABEL: foldv32i8:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; AVX512VLBWDQ-NEXT: retq
+; X64-LABEL: foldv32i8:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv32i8:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; X32-AVX-NEXT: retl
%out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
@@ -1293,18 +1278,13 @@ define <32 x i8> @foldv32i8() nounwind {
}
define <32 x i8> @foldv32i8u() nounwind {
-; NOBW-LABEL: foldv32i8u:
-; NOBW: # BB#0:
-; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; NOBW-NEXT: retq
-;
-; AVX512VLBWDQ-LABEL: foldv32i8u:
-; AVX512VLBWDQ: # BB#0:
-; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
-; AVX512VLBWDQ-NEXT: retq
+; X64-LABEL: foldv32i8u:
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
+; X64-NEXT: retq
;
; X32-AVX-LABEL: foldv32i8u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1]
; X32-AVX-NEXT: retl
%out = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
diff --git a/test/CodeGen/X86/vector-lzcnt-512.ll b/test/CodeGen/X86/vector-lzcnt-512.ll
index 88378eb51a27..9b66983cae3f 100644
--- a/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -1,22 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,-avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512DQ
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: ## BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $1, %zmm0, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlq $2, %zmm0, %zmm1
@@ -31,7 +31,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
@@ -39,12 +39,12 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv8i64:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlq $1, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrlq $2, %zmm0, %zmm1
@@ -68,7 +68,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
@@ -85,17 +85,17 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlq $1, %zmm0, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrlq $2, %zmm0, %zmm1
@@ -110,7 +110,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
@@ -118,12 +118,12 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv8i64u:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlq $1, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrlq $2, %zmm0, %zmm1
@@ -147,7 +147,7 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm5
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
@@ -164,19 +164,19 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: ## BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -187,7 +187,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
@@ -195,7 +195,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -204,9 +204,9 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv16i32:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -226,7 +226,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
; AVX512DQ-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
@@ -251,19 +251,19 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32u:
-; AVX512BW: ## BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $2, %zmm0, %zmm1
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -274,7 +274,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
@@ -282,7 +282,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -291,9 +291,9 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv16i32u:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrld $1, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $2, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $4, %zmm0, %zmm1
@@ -313,7 +313,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512DQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
; AVX512DQ-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
@@ -338,7 +338,7 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
@@ -351,7 +351,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1
@@ -366,39 +366,36 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
+; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv32i16:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
@@ -428,7 +425,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0
@@ -441,7 +438,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CDBW-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512CDBW-NEXT: vplzcntd %zmm1, %zmm1
@@ -456,39 +453,36 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16u:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm4
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm4, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
+; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm4, %zmm1
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpcmpeqb %zmm2, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv32i16u:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm5
; AVX512DQ-NEXT: vpand %ymm2, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpxor %ymm6, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6
; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm7
; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm4, %ymm5
@@ -518,7 +512,7 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
@@ -543,7 +537,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CDBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
@@ -570,37 +564,29 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
+; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %ymm5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
@@ -620,7 +606,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8u:
-; AVX512CD: ## BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512CD-NEXT: vplzcntd %zmm2, %zmm2
@@ -645,7 +631,7 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8u:
-; AVX512CDBW: ## BB#0:
+; AVX512CDBW: # %bb.0:
; AVX512CDBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512CDBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512CDBW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
@@ -672,37 +658,29 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8u:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandnq %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX512BW-NEXT: vpshufb %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4
-; AVX512BW-NEXT: vpxorq %zmm4, %zmm0, %zmm0
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm0, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
+; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8u:
-; AVX512DQ: ## BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %ymm5, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm6
; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
diff --git a/test/CodeGen/X86/vector-merge-store-fp-constants.ll b/test/CodeGen/X86/vector-merge-store-fp-constants.ll
index 94fd4df09cf5..c7b3a89e9ff1 100644
--- a/test/CodeGen/X86/vector-merge-store-fp-constants.ll
+++ b/test/CodeGen/X86/vector-merge-store-fp-constants.ll
@@ -4,7 +4,7 @@
define void @merge_8_float_zero_stores(float* %ptr) {
; DEFAULTCPU-LABEL: merge_8_float_zero_stores:
-; DEFAULTCPU: # BB#0:
+; DEFAULTCPU: # %bb.0:
; DEFAULTCPU-NEXT: movq $0, (%rdi)
; DEFAULTCPU-NEXT: movq $0, 8(%rdi)
; DEFAULTCPU-NEXT: movq $0, 16(%rdi)
@@ -12,7 +12,7 @@ define void @merge_8_float_zero_stores(float* %ptr) {
; DEFAULTCPU-NEXT: retq
;
; X64CPU-LABEL: merge_8_float_zero_stores:
-; X64CPU: # BB#0:
+; X64CPU: # %bb.0:
; X64CPU-NEXT: xorps %xmm0, %xmm0
; X64CPU-NEXT: movups %xmm0, (%rdi)
; X64CPU-NEXT: movups %xmm0, 16(%rdi)
diff --git a/test/CodeGen/X86/vector-mul.ll b/test/CodeGen/X86/vector-mul.ll
new file mode 100644
index 000000000000..642da7c0137c
--- /dev/null
+++ b/test/CodeGen/X86/vector-mul.ll
@@ -0,0 +1,1090 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-XOP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX --check-prefix=X64-AVX2
+
+;
+; PowOf2 (uniform)
+;
+
+define <2 x i64> @mul_v2i64_8(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_8:
+; X86: # %bb.0:
+; X86-NEXT: psllq $3, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_8:
+; X64: # %bb.0:
+; X64-NEXT: psllq $3, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_8:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllq $3, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 8, i64 8>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @mul_v4i32_8(<4 x i32> %a0) nounwind {
+; X86-LABEL: mul_v4i32_8:
+; X86: # %bb.0:
+; X86-NEXT: pslld $3, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v4i32_8:
+; X64: # %bb.0:
+; X64-NEXT: pslld $3, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v4i32_8:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpslld $3, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @mul_v8i16_8(<8 x i16> %a0) nounwind {
+; X86-LABEL: mul_v8i16_8:
+; X86: # %bb.0:
+; X86-NEXT: psllw $3, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v8i16_8:
+; X64: # %bb.0:
+; X64-NEXT: psllw $3, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v8i16_8:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsllw $3, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @mul_v16i8_32(<16 x i8> %a0) nounwind {
+; X86-LABEL: mul_v16i8_32:
+; X86: # %bb.0:
+; X86-NEXT: psllw $5, %xmm0
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v16i8_32:
+; X64: # %bb.0:
+; X64-NEXT: psllw $5, %xmm0
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v16i8_32:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v16i8_32:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllw $5, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <16 x i8> %a0, <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
+ ret <16 x i8> %1
+}
+
+;
+; PowOf2 (non-uniform)
+;
+
+define <2 x i64> @mul_v2i64_32_8(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_32_8:
+; X86: # %bb.0:
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: psllq $3, %xmm1
+; X86-NEXT: psllq $5, %xmm0
+; X86-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_32_8:
+; X64: # %bb.0:
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psllq $3, %xmm1
+; X64-NEXT: psllq $5, %xmm0
+; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v2i64_32_8:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v2i64_32_8:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 32, i64 8>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @mul_v4i32_1_2_4_8(<4 x i32> %a0) nounwind {
+; X86-LABEL: mul_v4i32_1_2_4_8:
+; X86: # %bb.0:
+; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v4i32_1_2_4_8:
+; X64: # %bb.0:
+; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v4i32_1_2_4_8:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v4i32_1_2_4_8:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 1, i32 2, i32 4, i32 8>
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @mul_v8i16_1_2_4_8_16_32_64_128(<8 x i16> %a0) nounwind {
+; X86-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
+; X86: # %bb.0:
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
+; X64: # %bb.0:
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v8i16_1_2_4_8_16_32_64_128:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <8 x i16> %a0, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounwind {
+; X86-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
+; X86: # %bb.0:
+; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: psllw $4, %xmm2
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm2
+; X86-NEXT: movdqa {{.*#+}} xmm0 = [8192,24640,8192,24640,8192,24640,8192,24640]
+; X86-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: psllw $2, %xmm2
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm2
+; X86-NEXT: paddb %xmm0, %xmm0
+; X86-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm2
+; X86-NEXT: paddb %xmm2, %xmm2
+; X86-NEXT: paddb %xmm0, %xmm0
+; X86-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
+; X64: # %bb.0:
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: psllw $4, %xmm2
+; X64-NEXT: pand {{.*}}(%rip), %xmm2
+; X64-NEXT: movdqa {{.*#+}} xmm0 = [8192,24640,8192,24640,8192,24640,8192,24640]
+; X64-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: psllw $2, %xmm2
+; X64-NEXT: pand {{.*}}(%rip), %xmm2
+; X64-NEXT: paddb %xmm0, %xmm0
+; X64-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm2
+; X64-NEXT: paddb %xmm2, %xmm2
+; X64-NEXT: paddb %xmm0, %xmm0
+; X64-NEXT: pblendvb %xmm0, %xmm2, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpsllw $4, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,8192,24640,8192,24640,8192,24640]
+; X64-AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpsllw $2, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; X64-AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm1
+; X64-AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <16 x i8> %a0, <i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8, i8 1, i8 2, i8 4, i8 8>
+ ret <16 x i8> %1
+}
+
+;
+; PowOf2 + 1 (uniform)
+;
+
+define <2 x i64> @mul_v2i64_17(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_17:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,17,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_17:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [17,17]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_17:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 17, i64 17>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @mul_v4i32_17(<4 x i32> %a0) nounwind {
+; X86-LABEL: mul_v4i32_17:
+; X86: # %bb.0:
+; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v4i32_17:
+; X64: # %bb.0:
+; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v4i32_17:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v4i32_17:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17]
+; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @mul_v8i16_17(<8 x i16> %a0) nounwind {
+; X86-LABEL: mul_v8i16_17:
+; X86: # %bb.0:
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v8i16_17:
+; X64: # %bb.0:
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v8i16_17:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <8 x i16> %a0, <i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17, i16 17>
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @mul_v16i8_17(<16 x i8> %a0) nounwind {
+; X86-LABEL: mul_v16i8_17:
+; X86: # %bb.0:
+; X86-NEXT: pmovsxbw %xmm0, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X86-NEXT: pmullw %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-NEXT: pand %xmm3, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-NEXT: pmovsxbw %xmm0, %xmm0
+; X86-NEXT: pmullw %xmm2, %xmm0
+; X86-NEXT: pand %xmm3, %xmm0
+; X86-NEXT: packuswb %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v16i8_17:
+; X64: # %bb.0:
+; X64-NEXT: pmovsxbw %xmm0, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-NEXT: pmullw %xmm2, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X64-NEXT: pand %xmm3, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pmovsxbw %xmm0, %xmm0
+; X64-NEXT: pmullw %xmm2, %xmm0
+; X64-NEXT: pand %xmm3, %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v16i8_17:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1
+; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17]
+; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0
+; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14]
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v16i8_17:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+ %1 = mul <16 x i8> %a0, <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>
+ ret <16 x i8> %1
+}
+
+;
+; PowOf2 + 1 (non-uniform)
+;
+
+define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_17_65:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_17_65:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [17,65]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_17_65:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,65]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 17, i64 65>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @mul_v4i32_5_17_33_65(<4 x i32> %a0) nounwind {
+; X86-LABEL: mul_v4i32_5_17_33_65:
+; X86: # %bb.0:
+; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v4i32_5_17_33_65:
+; X64: # %bb.0:
+; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v4i32_5_17_33_65:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 5, i32 17, i32 33, i32 65>
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @mul_v8i16_2_3_9_17_33_65_129_257(<8 x i16> %a0) nounwind {
+; X86-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
+; X86: # %bb.0:
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
+; X64: # %bb.0:
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v8i16_2_3_9_17_33_65_129_257:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <8 x i16> %a0, <i16 2, i16 3, i16 9, i16 17, i16 33, i16 65, i16 129, i16 257>
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8> %a0) nounwind {
+; X86-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
+; X86: # %bb.0:
+; X86-NEXT: pmovsxbw %xmm0, %xmm1
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X86-NEXT: pand %xmm2, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-NEXT: pmovsxbw %xmm0, %xmm0
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pand %xmm2, %xmm0
+; X86-NEXT: packuswb %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
+; X64: # %bb.0:
+; X64-NEXT: pmovsxbw %xmm0, %xmm1
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; X64-NEXT: pand %xmm2, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pmovsxbw %xmm0, %xmm0
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: pand %xmm2, %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1
+; X64-XOP-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
+; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0
+; X64-XOP-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14]
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+ %1 = mul <16 x i8> %a0, <i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3, i8 9, i8 17, i8 33, i8 65, i8 129, i8 2, i8 3>
+ ret <16 x i8> %1
+}
+
+;
+; PowOf2 - 1 (uniform)
+;
+
+define <2 x i64> @mul_v2i64_7(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_7:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [7,0,7,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_7:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [7,7]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 7, i64 7>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @mul_v4i32_7(<4 x i32> %a0) nounwind {
+; X86-LABEL: mul_v4i32_7:
+; X86: # %bb.0:
+; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v4i32_7:
+; X64: # %bb.0:
+; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v4i32_7:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v4i32_7:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
+; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @mul_v8i16_7(<8 x i16> %a0) nounwind {
+; X86-LABEL: mul_v8i16_7:
+; X86: # %bb.0:
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v8i16_7:
+; X64: # %bb.0:
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v8i16_7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <8 x i16> %a0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @mul_v16i8_31(<16 x i8> %a0) nounwind {
+; X86-LABEL: mul_v16i8_31:
+; X86: # %bb.0:
+; X86-NEXT: pmovsxbw %xmm0, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31]
+; X86-NEXT: pmullw %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-NEXT: pand %xmm3, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-NEXT: pmovsxbw %xmm0, %xmm0
+; X86-NEXT: pmullw %xmm2, %xmm0
+; X86-NEXT: pand %xmm3, %xmm0
+; X86-NEXT: packuswb %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v16i8_31:
+; X64: # %bb.0:
+; X64-NEXT: pmovsxbw %xmm0, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31]
+; X64-NEXT: pmullw %xmm2, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X64-NEXT: pand %xmm3, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pmovsxbw %xmm0, %xmm0
+; X64-NEXT: pmullw %xmm2, %xmm0
+; X64-NEXT: pand %xmm3, %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v16i8_31:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1
+; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31]
+; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0
+; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14]
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v16i8_31:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+ %1 = mul <16 x i8> %a0, <i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31, i8 31>
+ ret <16 x i8> %1
+}
+
+;
+; PowOf2 - 1 (non-uniform)
+;
+
+define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_15_63:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_15_63:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [15,63]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_15_63:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,63]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 15, i64 63>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @mul_v2i64_neg_15_63(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_neg_15_63:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967281,4294967295,4294967233,4294967295]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: psrlq $32, %xmm3
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: paddq %xmm3, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_neg_15_63:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: psrlq $32, %xmm3
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; X64-NEXT: paddq %xmm3, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_neg_15_63:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551601,18446744073709551553]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm3
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 -15, i64 -63>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @mul_v2i64_neg_17_65(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_neg_17_65:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967279,4294967295,4294967231,4294967295]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: psrlq $32, %xmm3
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: paddq %xmm3, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_neg_17_65:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm3
+; X64-NEXT: psrlq $32, %xmm3
+; X64-NEXT: pmuludq %xmm1, %xmm3
+; X64-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; X64-NEXT: paddq %xmm3, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_neg_17_65:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551599,18446744073709551551]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm3
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
+; X64-AVX-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 -17, i64 -65>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_0_1:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_0_1:
+; X64: # %bb.0:
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: movq %rax, %xmm1
+; X64-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_0_1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: movl $1, %eax
+; X64-AVX-NEXT: vmovq %rax, %xmm1
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 0, i64 1>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @mul_v2i64_neg_0_1(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_neg_0_1:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,4294967295,4294967295]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: psrlq $32, %xmm3
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: paddq %xmm3, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_neg_0_1:
+; X64: # %bb.0:
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: movq $-1, %rax
+; X64-NEXT: movq %rax, %xmm2
+; X64-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; X64-NEXT: pmuludq %xmm2, %xmm1
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: movq %rax, %xmm3
+; X64-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; X64-NEXT: pmuludq %xmm0, %xmm3
+; X64-NEXT: paddq %xmm1, %xmm3
+; X64-NEXT: psllq $32, %xmm3
+; X64-NEXT: pmuludq %xmm2, %xmm0
+; X64-NEXT: paddq %xmm3, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_neg_0_1:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm1
+; X64-AVX-NEXT: movq $-1, %rax
+; X64-AVX-NEXT: vmovq %rax, %xmm2
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; X64-AVX-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; X64-AVX-NEXT: vpsllq $32, %xmm1, %xmm1
+; X64-AVX-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 0, i64 -1>
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @mul_v2i64_15_neg_63(<2 x i64> %a0) nounwind {
+; X86-LABEL: mul_v2i64_15_neg_63:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,4294967233,4294967295]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: movdqa %xmm0, %xmm3
+; X86-NEXT: psrlq $32, %xmm3
+; X86-NEXT: pmuludq %xmm1, %xmm3
+; X86-NEXT: pmuludq {{\.LCPI.*}}, %xmm0
+; X86-NEXT: paddq %xmm3, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_15_neg_63:
+; X64: # %bb.0:
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrlq $32, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [15,18446744073709551553]
+; X64-NEXT: pmuludq %xmm2, %xmm1
+; X64-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-NEXT: movq %rax, %xmm3
+; X64-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; X64-NEXT: pmuludq %xmm0, %xmm3
+; X64-NEXT: paddq %xmm1, %xmm3
+; X64-NEXT: psllq $32, %xmm3
+; X64-NEXT: pmuludq %xmm2, %xmm0
+; X64-NEXT: paddq %xmm3, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_15_neg_63:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm1
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,18446744073709551553]
+; X64-AVX-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; X64-AVX-NEXT: vmovq %rax, %xmm3
+; X64-AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; X64-AVX-NEXT: vpaddq %xmm1, %xmm3, %xmm1
+; X64-AVX-NEXT: vpsllq $32, %xmm1, %xmm1
+; X64-AVX-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <2 x i64> %a0, <i64 15, i64 -63>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @mul_v4i32_0_15_31_7(<4 x i32> %a0) nounwind {
+; X86-LABEL: mul_v4i32_0_15_31_7:
+; X86: # %bb.0:
+; X86-NEXT: pmulld {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v4i32_0_15_31_7:
+; X64: # %bb.0:
+; X64-NEXT: pmulld {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v4i32_0_15_31_7:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <4 x i32> %a0, <i32 0, i32 15, i32 31, i32 7>
+ ret <4 x i32> %1
+}
+
+define <8 x i16> @mul_v8i16_0_1_7_15_31_63_127_255(<8 x i16> %a0) nounwind {
+; X86-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
+; X86: # %bb.0:
+; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
+; X64: # %bb.0:
+; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v8i16_0_1_7_15_31_63_127_255:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT: retq
+ %1 = mul <8 x i16> %a0, <i16 0, i16 1, i16 7, i16 15, i16 31, i16 63, i16 127, i16 255>
+ ret <8 x i16> %1
+}
+
+define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8> %a0) nounwind {
+; X86-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
+; X86: # %bb.0:
+; X86-NEXT: pmovsxbw %xmm0, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
+; X86-NEXT: pmullw %xmm2, %xmm1
+; X86-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-NEXT: pand %xmm3, %xmm1
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-NEXT: pmovsxbw %xmm0, %xmm0
+; X86-NEXT: pmullw %xmm2, %xmm0
+; X86-NEXT: pand %xmm3, %xmm0
+; X86-NEXT: packuswb %xmm0, %xmm1
+; X86-NEXT: movdqa %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
+; X64: # %bb.0:
+; X64-NEXT: pmovsxbw %xmm0, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
+; X64-NEXT: pmullw %xmm2, %xmm1
+; X64-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X64-NEXT: pand %xmm3, %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT: pmovsxbw %xmm0, %xmm0
+; X64-NEXT: pmullw %xmm2, %xmm0
+; X64-NEXT: pand %xmm3, %xmm0
+; X64-NEXT: packuswb %xmm0, %xmm1
+; X64-NEXT: movdqa %xmm1, %xmm0
+; X64-NEXT: retq
+;
+; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
+; X64-XOP: # %bb.0:
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm1
+; X64-XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
+; X64-XOP-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; X64-XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-XOP-NEXT: vpmovsxbw %xmm0, %xmm0
+; X64-XOP-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,2,4,6,8,10,12,14],xmm0[0,2,4,6,8,10,12,14]
+; X64-XOP-NEXT: retq
+;
+; X64-AVX2-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-AVX2-NEXT: vzeroupper
+; X64-AVX2-NEXT: retq
+ %1 = mul <16 x i8> %a0, <i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127, i8 0, i8 1, i8 3, i8 7, i8 15, i8 31, i8 63, i8 127>
+ ret <16 x i8> %1
+}
+
+define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind {
+; X86-LABEL: mul_v2i64_68_132:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_68_132:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [68,132]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_68_132:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [68,132]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %mul = mul <2 x i64> %x, <i64 68, i64 132>
+ ret <2 x i64> %mul
+}
+
+define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind {
+; X86-LABEL: mul_v2i64_60_120:
+; X86: # %bb.0:
+; X86-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0]
+; X86-NEXT: movdqa %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm1, %xmm2
+; X86-NEXT: psrlq $32, %xmm0
+; X86-NEXT: pmuludq %xmm1, %xmm0
+; X86-NEXT: psllq $32, %xmm0
+; X86-NEXT: paddq %xmm2, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: mul_v2i64_60_120:
+; X64: # %bb.0:
+; X64-NEXT: movdqa {{.*#+}} xmm1 = [60,124]
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: pmuludq %xmm1, %xmm2
+; X64-NEXT: psrlq $32, %xmm0
+; X64-NEXT: pmuludq %xmm1, %xmm0
+; X64-NEXT: psllq $32, %xmm0
+; X64-NEXT: paddq %xmm2, %xmm0
+; X64-NEXT: retq
+;
+; X64-AVX-LABEL: mul_v2i64_60_120:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [60,124]
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
+; X64-AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vpsllq $32, %xmm0, %xmm0
+; X64-AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; X64-AVX-NEXT: retq
+ %mul = mul <2 x i64> %x, <i64 60, i64 124>
+ ret <2 x i64> %mul
+}
diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll
index 4d183f3172b3..9b05ce4485ed 100644
--- a/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/test/CodeGen/X86/vector-narrow-binop.ll
@@ -11,7 +11,7 @@
define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
; SSE-LABEL: PR32790:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pand %xmm5, %xmm1
@@ -21,7 +21,7 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d
; SSE-NEXT: retq
;
; AVX1-LABEL: PR32790:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -36,14 +36,14 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR32790:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR32790:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0
@@ -59,14 +59,14 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d
define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; SSE-LABEL: do_not_use_256bit_op:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: do_not_use_256bit_op:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll
index 99a05c3d49c0..782c72e2a4d4 100644
--- a/test/CodeGen/X86/vector-pcmp.ll
+++ b/test/CodeGen/X86/vector-pcmp.ll
@@ -9,13 +9,13 @@
define <16 x i8> @test_pcmpgtb(<16 x i8> %x) {
; SSE-LABEL: test_pcmpgtb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pcmpgtb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_pcmpgtb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -26,13 +26,13 @@ define <16 x i8> @test_pcmpgtb(<16 x i8> %x) {
define <8 x i16> @test_pcmpgtw(<8 x i16> %x) {
; SSE-LABEL: test_pcmpgtw:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_pcmpgtw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -43,13 +43,13 @@ define <8 x i16> @test_pcmpgtw(<8 x i16> %x) {
define <4 x i32> @test_pcmpgtd(<4 x i32> %x) {
; SSE-LABEL: test_pcmpgtd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test_pcmpgtd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -60,7 +60,7 @@ define <4 x i32> @test_pcmpgtd(<4 x i32> %x) {
define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
; SSE2-LABEL: test_pcmpgtq:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
@@ -68,13 +68,13 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_pcmpgtq:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: test_pcmpgtq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -85,7 +85,7 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %x) {
define <1 x i128> @test_strange_type(<1 x i128> %x) {
; SSE2-LABEL: test_strange_type:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: sarq $63, %rsi
; SSE2-NEXT: movq %rsi, %xmm0
; SSE2-NEXT: notq %rsi
@@ -97,7 +97,7 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_strange_type:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: movq %rsi, %xmm0
; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
@@ -108,7 +108,7 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_strange_type:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
@@ -119,7 +119,7 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_strange_type:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
@@ -135,26 +135,26 @@ define <1 x i128> @test_strange_type(<1 x i128> %x) {
define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) {
; SSE-LABEL: test_pcmpgtb_256:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: pcmpgtb %xmm2, %xmm0
; SSE-NEXT: pcmpgtb %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test_pcmpgtb_256:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_pcmpgtb_256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -165,25 +165,25 @@ define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) {
define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) {
; SSE-LABEL: test_pcmpgtw_256:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: pcmpgtw %xmm2, %xmm0
; SSE-NEXT: pcmpgtw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test_pcmpgtw_256:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_pcmpgtw_256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -194,25 +194,25 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) {
define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
; SSE-LABEL: test_pcmpgtd_256:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm2, %xmm2
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: pcmpgtd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test_pcmpgtd_256:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_pcmpgtd_256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -223,7 +223,7 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) {
define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
; SSE2-LABEL: test_pcmpgtq_256:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: psrad $31, %xmm0
@@ -234,26 +234,26 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
; SSE2-NEXT: retq
;
; SSE42-LABEL: test_pcmpgtq_256:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqd %xmm2, %xmm2
; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: test_pcmpgtq_256:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_pcmpgtq_256:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -264,13 +264,13 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) {
define <16 x i8> @cmpeq_zext_v16i8(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: cmpeq_zext_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqb %xmm1, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: cmpeq_zext_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -281,7 +281,7 @@ define <16 x i8> @cmpeq_zext_v16i8(<16 x i8> %a, <16 x i8> %b) {
define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: cmpeq_zext_v16i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: pcmpeqw %xmm3, %xmm1
@@ -289,7 +289,7 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm3, %xmm2
@@ -299,7 +299,7 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cmpeq_zext_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -310,13 +310,13 @@ define <16 x i16> @cmpeq_zext_v16i16(<16 x i16> %a, <16 x i16> %b) {
define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: cmpeq_zext_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: cmpeq_zext_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -327,7 +327,7 @@ define <4 x i32> @cmpeq_zext_v4i32(<4 x i32> %a, <4 x i32> %b) {
define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: cmpeq_zext_v4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1]
@@ -340,7 +340,7 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE42-LABEL: cmpeq_zext_v4i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
; SSE42-NEXT: psrlq $63, %xmm0
; SSE42-NEXT: pcmpeqq %xmm3, %xmm1
@@ -348,7 +348,7 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE42-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm3, %xmm2
@@ -358,7 +358,7 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cmpeq_zext_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -369,7 +369,7 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x i64> %a, <4 x i64> %b) {
define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: cmpgt_zext_v32i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtb %xmm2, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE-NEXT: pand %xmm2, %xmm0
@@ -378,7 +378,7 @@ define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpgt_zext_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
@@ -388,7 +388,7 @@ define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cmpgt_zext_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -399,13 +399,13 @@ define <32 x i8> @cmpgt_zext_v32i8(<32 x i8> %a, <32 x i8> %b) {
define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: cmpgt_zext_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: cmpgt_zext_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -416,7 +416,7 @@ define <8 x i16> @cmpgt_zext_v8i16(<8 x i16> %a, <8 x i16> %b) {
define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: cmpgt_zext_v8i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
@@ -424,7 +424,7 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpgt_zext_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
@@ -434,7 +434,7 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: cmpgt_zext_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -445,7 +445,7 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x i32> %a, <8 x i32> %b) {
define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: cmpgt_zext_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: pxor %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm0
@@ -461,13 +461,13 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE42-LABEL: cmpgt_zext_v2i64:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: psrlq $63, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: cmpgt_zext_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll
index d2f33785530b..e3cb8f5b46a0 100644
--- a/test/CodeGen/X86/vector-popcnt-128.ll
+++ b/test/CodeGen/X86/vector-popcnt-128.ll
@@ -6,10 +6,13 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlq $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@@ -30,7 +33,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlq $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
@@ -51,7 +54,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
@@ -67,7 +70,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
@@ -83,7 +86,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -97,7 +100,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -111,19 +114,52 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv2i64:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv2i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv2i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: retq
%out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in)
ret <2 x i64> %out
}
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@@ -149,7 +185,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrld $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
@@ -175,7 +211,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
@@ -197,7 +233,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
@@ -217,7 +253,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -235,7 +271,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -253,19 +289,60 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv4i32:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv4i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm2
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; BITALG_NOVLX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv4i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm2
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; BITALG-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: retq
%out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in)
ret <4 x i32> %out
}
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@@ -287,7 +364,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
@@ -309,7 +386,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm2
@@ -327,7 +404,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
@@ -345,7 +422,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -360,7 +437,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -375,19 +452,41 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv8i16:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vzeroupper
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv8i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv8i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpopcntw %xmm0, %xmm0
+; BITALG-NEXT: retq
%out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %in)
ret <8 x i16> %out
}
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
@@ -406,7 +505,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movdqa %xmm0, %xmm1
; SSE3-NEXT: psrlw $1, %xmm1
; SSE3-NEXT: pand {{.*}}(%rip), %xmm1
@@ -425,7 +524,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
@@ -440,7 +539,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pand %xmm2, %xmm3
@@ -455,7 +554,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -467,7 +566,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -479,68 +578,129 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv16i8:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vzeroupper
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv16i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv16i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
+; BITALG-NEXT: retq
%out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %in)
ret <16 x i8> %out
}
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,64]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
; AVX-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv2i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv2i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,64]
+; BITALG-NEXT: retq
%out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> <i64 256, i64 -1>)
ret <2 x i64> %out
}
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,32,0,8]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
; AVX-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv4i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv4i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,32,0,8]
+; BITALG-NEXT: retq
%out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>)
ret <4 x i32> %out
}
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
; AVX-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv8i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv8i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [1,16,0,8,0,3,2,3]
+; BITALG-NEXT: retq
%out = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>)
ret <8 x i16> %out
}
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
; AVX-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv16i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv16i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1]
+; BITALG-NEXT: retq
%out = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>)
ret <16 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index 4c5de2fed385..0f09b4f337d9 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -2,10 +2,13 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=BITALG_NOVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -28,7 +31,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -37,23 +40,56 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv4i64:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv4i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv4i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: retq
%out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in)
ret <4 x i64> %out
}
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -84,7 +120,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -93,7 +129,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
@@ -102,18 +138,59 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv8i32:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv8i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm2
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; BITALG_NOVLX-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv8i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm2
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; BITALG-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: retq
%out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in)
ret <8 x i32> %out
}
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -139,7 +216,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -154,18 +231,37 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i16:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv16i16:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv16i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv16i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpopcntw %ymm0, %ymm0
+; BITALG-NEXT: retq
%out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in)
ret <16 x i16> %out
}
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
@@ -185,7 +281,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -197,7 +293,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i8:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2
; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -207,13 +303,37 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; AVX512VPOPCNTDQVL-LABEL: testv32i8:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv32i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv32i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
+; BITALG-NEXT: retq
%out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in)
ret <32 x i8> %out
}
define <4 x i64> @foldv4i64() nounwind {
; ALL-LABEL: foldv4i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,64,0,8]
; ALL-NEXT: retq
%out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>)
@@ -222,7 +342,7 @@ define <4 x i64> @foldv4i64() nounwind {
define <8 x i32> @foldv8i32() nounwind {
; ALL-LABEL: foldv8i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,32,0,8,16,3,2,3]
; ALL-NEXT: retq
%out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>)
@@ -231,7 +351,7 @@ define <8 x i32> @foldv8i32() nounwind {
define <16 x i16> @foldv16i16() nounwind {
; ALL-LABEL: foldv16i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [1,16,0,8,0,3,2,3,15,7,1,1,1,1,1,1]
; ALL-NEXT: retq
%out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>)
@@ -240,7 +360,7 @@ define <16 x i16> @foldv16i16() nounwind {
define <32 x i8> @foldv32i8() nounwind {
; ALL-LABEL: foldv32i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} ymm0 = [0,8,0,8,0,3,2,3,7,7,1,1,1,1,1,1,1,1,0,0,1,2,3,4,5,6,7,8,2,2,3,7]
; ALL-NEXT: retq
%out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>)
diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll
index a6f4e3342897..df5edc13c3ea 100644
--- a/test/CodeGen/X86/vector-popcnt-512.ll
+++ b/test/CodeGen/X86/vector-popcnt-512.ll
@@ -3,10 +3,11 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ --check-prefix=AVX512VPOPCNTDQ-BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=AVX512 --check-prefix=BITALG
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-LABEL: testv8i64:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -16,7 +17,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm5
; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm5
@@ -29,30 +30,44 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv8i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in)
ret <8 x i64> %out
}
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-LABEL: testv16i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
@@ -62,7 +77,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
; AVX512F-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
; AVX512F-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
@@ -83,16 +98,16 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; AVX512BW-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
@@ -101,16 +116,34 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv16i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm2
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpshufb %zmm0, %zmm3, %zmm0
+; BITALG-NEXT: vpaddb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in)
ret <16 x i32> %out
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-LABEL: testv32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -134,10 +167,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -149,7 +182,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: testv32i16:
-; AVX512VPOPCNTDQ-NOBW: # BB#0:
+; AVX512VPOPCNTDQ-NOBW: # %bb.0:
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0
@@ -159,10 +192,10 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NOBW-NEXT: retq
;
; AVX512VPOPCNTDQ-BW-LABEL: testv32i16:
-; AVX512VPOPCNTDQ-BW: # BB#0:
+; AVX512VPOPCNTDQ-BW: # %bb.0:
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -172,13 +205,18 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
+;
+; BITALG-LABEL: testv32i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-LABEL: testv64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -196,10 +234,10 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -208,7 +246,7 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8:
-; AVX512VPOPCNTDQ-NOBW: # BB#0:
+; AVX512VPOPCNTDQ-NOBW: # %bb.0:
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm0, %ymm3
; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
@@ -226,16 +264,21 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NOBW-NEXT: retq
;
; AVX512VPOPCNTDQ-BW-LABEL: testv64i8:
-; AVX512VPOPCNTDQ-BW: # BB#0:
+; AVX512VPOPCNTDQ-BW: # %bb.0:
; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512VPOPCNTDQ-BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512VPOPCNTDQ-BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512VPOPCNTDQ-BW-NEXT: retq
+;
+; BITALG-LABEL: testv64i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-rem.ll b/test/CodeGen/X86/vector-rem.ll
index 3e3e93a7d5b0..3f57bd833c08 100644
--- a/test/CodeGen/X86/vector-rem.ll
+++ b/test/CodeGen/X86/vector-rem.ll
@@ -3,7 +3,7 @@
define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) nounwind {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; CHECK-NEXT: movd %xmm2, %eax
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
@@ -41,7 +41,7 @@ define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) nounwind {
define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) nounwind {
; CHECK-LABEL: bar:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; CHECK-NEXT: movd %xmm2, %eax
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
@@ -79,7 +79,7 @@ define <4 x i32> @bar(<4 x i32> %t, <4 x i32> %u) nounwind {
define <4 x float> @qux(<4 x float> %t, <4 x float> %u) nounwind {
; CHECK-LABEL: qux:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
@@ -108,7 +108,7 @@ define <4 x float> @qux(<4 x float> %t, <4 x float> %u) nounwind {
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
-; CHECK-NEXT: movapd %xmm1, %xmm0
+; CHECK-NEXT: movaps %xmm1, %xmm0
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: retq
%m = frem <4 x float> %t, %u
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index 04378ee2ee01..b40c9eddd46b 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -17,26 +17,25 @@
define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_rotate_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
; SSE2-NEXT: psubq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psllq %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psllq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllq %xmm1, %xmm4
; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq %xmm3, %xmm1
+; SSE2-NEXT: psrlq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT: psrlq %xmm2, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: orpd %xmm4, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64]
; SSE41-NEXT: psubq %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -54,7 +53,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3
@@ -69,7 +68,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1
@@ -78,41 +77,40 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v2i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0]
; X32-SSE-NEXT: psubq %xmm1, %xmm2
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psllq %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psllq %xmm1, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: psllq %xmm1, %xmm4
; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq %xmm3, %xmm1
+; X32-SSE-NEXT: psrlq %xmm2, %xmm1
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm2, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: orpd %xmm4, %xmm1
-; X32-SSE-NEXT: movapd %xmm1, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: orpd %xmm4, %xmm0
; X32-SSE-NEXT: retl
%b64 = sub <2 x i64> <i64 64, i64 64>, %b
%shl = shl <2 x i64> %a, %b
@@ -123,7 +121,7 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_rotate_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE2-NEXT: psubd %xmm1, %xmm2
; SSE2-NEXT: pslld $23, %xmm1
@@ -137,30 +135,30 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlq $32, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: psrlq $32, %xmm3
+; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm3, %xmm5
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
+; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrld %xmm5, %xmm6
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; SSE2-NEXT: psrld %xmm2, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm1, %xmm2
; SSE41-NEXT: pslld $23, %xmm1
@@ -188,7 +186,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
@@ -211,7 +209,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [32,32,32,32]
; AVX2-NEXT: vpsubd %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm1
@@ -220,26 +218,26 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
; X32-SSE-NEXT: psubd %xmm1, %xmm2
; X32-SSE-NEXT: pslld $23, %xmm1
@@ -253,24 +251,24 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlq $32, %xmm3
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm3, %xmm4
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: psrlq $32, %xmm3
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrld %xmm3, %xmm5
-; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,3,2,3]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm4[0],xmm5[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3]
; X32-SSE-NEXT: pxor %xmm4, %xmm4
; X32-SSE-NEXT: movdqa %xmm2, %xmm5
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm6
; X32-SSE-NEXT: psrld %xmm5, %xmm6
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm6 = xmm0[0],xmm6[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-SSE-NEXT: por %xmm1, %xmm0
; X32-SSE-NEXT: retl
@@ -283,7 +281,7 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_rotate_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; SSE2-NEXT: psubw %xmm1, %xmm3
; SSE2-NEXT: psllw $12, %xmm1
@@ -352,7 +350,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; SSE41-NEXT: psubw %xmm1, %xmm2
@@ -410,7 +408,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_rotate_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm3
@@ -445,7 +443,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
@@ -463,9 +461,9 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
@@ -475,8 +473,8 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v8i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX512VL-NEXT: vpsllvw %xmm1, %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlvw %xmm2, %xmm0, %xmm0
@@ -484,12 +482,12 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX512VL-NEXT: retq
;
; XOP-LABEL: var_rotate_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; X32-SSE-NEXT: psubw %xmm1, %xmm3
; X32-SSE-NEXT: psllw $12, %xmm1
@@ -565,7 +563,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_rotate_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; SSE2-NEXT: psubb %xmm1, %xmm4
; SSE2-NEXT: psllw $5, %xmm1
@@ -625,7 +623,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_rotate_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
@@ -672,7 +670,7 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: var_rotate_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX-NEXT: vpsubb %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
@@ -701,43 +699,28 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
-; AVX512BW-LABEL: var_rotate_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512BW-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: var_rotate_v16i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: var_rotate_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
;
; XOP-LABEL: var_rotate_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: var_rotate_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; X32-SSE-NEXT: psubb %xmm1, %xmm4
; X32-SSE-NEXT: psllw $5, %xmm1
@@ -808,22 +791,21 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_rotate_v2i64:
-; SSE2: # BB#0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq $14, %xmm2
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psllq $4, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psllq $14, %xmm2
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $50, %xmm1
-; SSE2-NEXT: psrlq $60, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: orpd %xmm2, %xmm1
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: psrlq $60, %xmm1
+; SSE2-NEXT: psrlq $50, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq $14, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
@@ -837,7 +819,7 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -848,44 +830,43 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v2i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14]
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: constant_rotate_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v2i64:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq $14, %xmm2
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $4, %xmm1
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psllq $14, %xmm2
; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $50, %xmm1
-; X32-SSE-NEXT: psrlq $60, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: orpd %xmm2, %xmm1
-; X32-SSE-NEXT: movapd %xmm1, %xmm0
+; X32-SSE-NEXT: psrlq $60, %xmm1
+; X32-SSE-NEXT: psrlq $50, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; X32-SSE-NEXT: orpd %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shl = shl <2 x i64> %a, <i64 4, i64 14>
%lshr = lshr <2 x i64> %a, <i64 60, i64 50>
@@ -895,7 +876,7 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_rotate_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm1, %xmm2
@@ -906,22 +887,22 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $25, %xmm1
+; SSE2-NEXT: psrld $27, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrld $27, %xmm3
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrld $25, %xmm3
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrld $26, %xmm3
-; SSE2-NEXT: psrld $28, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: psrld $28, %xmm3
+; SSE2-NEXT: psrld $26, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE41-NEXT: pmulld %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
@@ -938,7 +919,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrld $25, %xmm0, %xmm2
; AVX1-NEXT: vpsrld $27, %xmm0, %xmm3
@@ -951,33 +932,33 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7]
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: constant_rotate_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
; X32-SSE-NEXT: pmuludq %xmm1, %xmm2
@@ -988,16 +969,16 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $25, %xmm1
+; X32-SSE-NEXT: psrld $27, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrld $27, %xmm3
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT: psrld $25, %xmm3
+; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrld $26, %xmm3
-; X32-SSE-NEXT: psrld $28, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; X32-SSE-NEXT: psrld $28, %xmm3
+; X32-SSE-NEXT: psrld $26, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: por %xmm2, %xmm0
; X32-SSE-NEXT: retl
@@ -1009,7 +990,7 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_rotate_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; SSE2-NEXT: pmullw %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -1041,7 +1022,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
@@ -1060,7 +1041,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_rotate_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
@@ -1074,7 +1055,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
@@ -1085,8 +1066,8 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
@@ -1096,19 +1077,19 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm1
; AVX512VL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: constant_rotate_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; X32-SSE-NEXT: pmullw %xmm0, %xmm2
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
@@ -1146,7 +1127,7 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_rotate_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,57600,41152,24704,8256]
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm1
@@ -1204,7 +1185,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_rotate_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psllw $4, %xmm3
@@ -1241,7 +1222,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_rotate_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,57600,41152,24704,8256]
@@ -1269,7 +1250,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; AVX512-LABEL: constant_rotate_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm1
; AVX512-NEXT: vpmovdb %zmm1, %xmm1
@@ -1280,12 +1261,12 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOP-LABEL: constant_rotate_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: constant_rotate_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,57600,41152,24704,8256]
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm1, %xmm1
@@ -1353,7 +1334,7 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllq $14, %xmm1
; SSE-NEXT: psrlq $50, %xmm0
@@ -1361,32 +1342,32 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v2i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotq $14, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllq $14, %xmm1
; X32-SSE-NEXT: psrlq $50, %xmm0
@@ -1400,7 +1381,7 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $4, %xmm1
; SSE-NEXT: psrld $28, %xmm0
@@ -1408,32 +1389,32 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $4, %xmm0, %xmm1
; AVX-NEXT: vpsrld $28, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pslld $4, %xmm1
; X32-SSE-NEXT: psrld $28, %xmm0
@@ -1447,7 +1428,7 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $7, %xmm1
; SSE-NEXT: psrlw $9, %xmm0
@@ -1455,26 +1436,26 @@ define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $7, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $9, %xmm0, %xmm0
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $7, %xmm0, %xmm1
; AVX512-NEXT: vpsrlw $9, %xmm0, %xmm0
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotw $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $7, %xmm1
; X32-SSE-NEXT: psrlw $9, %xmm0
@@ -1488,7 +1469,7 @@ define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
@@ -1498,7 +1479,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
@@ -1507,7 +1488,7 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
@@ -1516,12 +1497,12 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
@@ -1541,46 +1522,41 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $49, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $15, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotq $15, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v2i64:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $15, %xmm1
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq $49, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: por %xmm0, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shl = shl <2 x i64> %a, <i64 15, i64 15>
%lshr = lshr <2 x i64> %a, <i64 49, i64 49>
@@ -1592,51 +1568,48 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pslld $4, %xmm1
; SSE-NEXT: psrld $28, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: por %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $4, %xmm0, %xmm1
; AVX-NEXT: vpsrld $28, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotd $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: pslld $4, %xmm1
; X32-SSE-NEXT: psrld $28, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: por %xmm0, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm0
@@ -1651,7 +1624,7 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $5, %xmm1
; SSE-NEXT: psrlw $11, %xmm0
@@ -1662,7 +1635,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX-NEXT: vpsrlw $11, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -1671,7 +1644,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -1680,13 +1653,13 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotw $5, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: psrlw $11, %xmm0
@@ -1705,7 +1678,7 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psllw $4, %xmm1
; SSE-NEXT: pand {{.*}}(%rip), %xmm1
@@ -1718,7 +1691,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
@@ -1729,7 +1702,7 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
@@ -1740,13 +1713,13 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vprotb $4, %xmm0, %xmm0
; XOP-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_rotate_mask_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psllw $4, %xmm1
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm1
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index 3b65b68352b5..46bac267185e 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -12,7 +12,7 @@
define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -40,7 +40,7 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1
@@ -49,20 +49,20 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v4i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2
@@ -71,7 +71,7 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2
@@ -87,7 +87,7 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -131,7 +131,7 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32]
; AVX2-NEXT: vpsubd %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm1
@@ -140,20 +140,20 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v8i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2
@@ -162,7 +162,7 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2
@@ -178,7 +178,7 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -246,10 +246,10 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: vpsubw %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15]
; AVX2-NEXT: vpsllvd %ymm4, %ymm5, %ymm4
@@ -270,9 +270,9 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
@@ -281,8 +281,8 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v16i16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm1, %ymm2, %ymm2
; AVX512VL-NEXT: vpsllvw %ymm1, %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0
@@ -290,7 +290,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2
@@ -299,7 +299,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2
@@ -315,7 +315,7 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
@@ -378,7 +378,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_rotate_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX2-NEXT: vpsubb %ymm1, %ymm2, %ymm2
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
@@ -407,36 +407,22 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
-; AVX512BW-LABEL: var_rotate_v32i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: var_rotate_v32i8:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %ymm1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VL-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VL-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
-; AVX512VL-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: var_rotate_v32i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512-NEXT: vpsubb %ymm1, %ymm2, %ymm2
+; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: var_rotate_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2
@@ -445,7 +431,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2
@@ -465,7 +451,7 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2
; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3
@@ -485,27 +471,27 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v4i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60]
; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
@@ -513,7 +499,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0
@@ -527,7 +513,7 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm3
@@ -551,27 +537,27 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm1
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v8i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
@@ -579,7 +565,7 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0
@@ -593,7 +579,7 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm2, %xmm3
@@ -619,9 +605,9 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
@@ -636,8 +622,8 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
@@ -646,14 +632,14 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm1
; AVX512VL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
@@ -661,7 +647,7 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0
@@ -675,7 +661,7 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -728,7 +714,7 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_rotate_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
@@ -756,7 +742,7 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: constant_rotate_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512-NEXT: vpmovwb %zmm1, %ymm1
@@ -766,7 +752,7 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1
@@ -775,7 +761,7 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1
@@ -794,7 +780,7 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
@@ -806,26 +792,26 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
@@ -833,7 +819,7 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
@@ -847,7 +833,7 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $4, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm2, %xmm3
@@ -859,26 +845,26 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $4, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
@@ -886,7 +872,7 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
@@ -900,7 +886,7 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
@@ -912,21 +898,21 @@ define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $7, %ymm0, %ymm1
; AVX512-NEXT: vpsrlw $9, %ymm0, %ymm0
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
@@ -934,7 +920,7 @@ define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
@@ -948,7 +934,7 @@ define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -966,7 +952,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
@@ -975,7 +961,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
@@ -984,7 +970,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
@@ -992,7 +978,7 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
@@ -1010,35 +996,35 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $49, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
@@ -1047,7 +1033,7 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
@@ -1064,7 +1050,7 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $4, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm2, %xmm3
@@ -1078,7 +1064,7 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $4, %ymm0, %ymm1
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -1087,20 +1073,20 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
@@ -1109,7 +1095,7 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
@@ -1126,7 +1112,7 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm2, %xmm3
@@ -1140,7 +1126,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -1149,7 +1135,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $5, %ymm0, %ymm1
; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
@@ -1158,7 +1144,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
@@ -1167,7 +1153,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
@@ -1184,7 +1170,7 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -1204,7 +1190,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0
@@ -1215,7 +1201,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0
@@ -1226,7 +1212,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
@@ -1235,7 +1221,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-rotate-512.ll b/test/CodeGen/X86/vector-rotate-512.ll
index fa1b5c1c0cb4..8941be35c05a 100644
--- a/test/CodeGen/X86/vector-rotate-512.ll
+++ b/test/CodeGen/X86/vector-rotate-512.ll
@@ -10,7 +10,7 @@
define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; AVX512-LABEL: var_rotate_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
@@ -22,7 +22,7 @@ define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; AVX512-LABEL: var_rotate_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
%b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
@@ -34,7 +34,7 @@ define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-LABEL: var_rotate_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512F-NEXT: vpsubw %ymm2, %ymm4, %ymm5
; AVX512F-NEXT: vpsubw %ymm3, %ymm4, %ymm4
@@ -57,7 +57,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: vpsubw %ymm2, %ymm4, %ymm5
; AVX512VL-NEXT: vpsubw %ymm3, %ymm4, %ymm4
@@ -80,8 +80,8 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
@@ -89,8 +89,8 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v32i16:
-; AVX512VLBW: # BB#0:
-; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
@@ -105,7 +105,7 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-LABEL: var_rotate_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512F-NEXT: vpsubb %ymm2, %ymm5, %ymm4
; AVX512F-NEXT: vpsubb %ymm3, %ymm5, %ymm5
@@ -165,7 +165,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_rotate_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %ymm2, %ymm5, %ymm4
; AVX512VL-NEXT: vpsubb %ymm3, %ymm5, %ymm5
@@ -225,8 +225,8 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -260,8 +260,8 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: var_rotate_v64i8:
-; AVX512VLBW: # BB#0:
-; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -306,7 +306,7 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: constant_rotate_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolvq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
@@ -317,7 +317,7 @@ define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: constant_rotate_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -328,7 +328,7 @@ define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2
@@ -344,7 +344,7 @@ define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
; AVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
@@ -360,14 +360,14 @@ define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i16:
-; AVX512VLBW: # BB#0:
+; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
@@ -380,7 +380,7 @@ define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: constant_rotate_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -431,7 +431,7 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: constant_rotate_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -482,8 +482,8 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: constant_rotate_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -496,7 +496,7 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -515,8 +515,8 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: constant_rotate_v64i8:
-; AVX512VLBW: # BB#0:
-; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -529,7 +529,7 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
-; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -558,7 +558,7 @@ define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
@@ -569,7 +569,7 @@ define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -580,7 +580,7 @@ define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $7, %ymm1, %ymm2
; AVX512F-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlw $9, %ymm1, %ymm1
@@ -590,7 +590,7 @@ define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $7, %ymm1, %ymm2
; AVX512VL-NEXT: vpsllw $7, %ymm0, %ymm3
; AVX512VL-NEXT: vpsrlw $9, %ymm1, %ymm1
@@ -600,14 +600,14 @@ define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $9, %zmm0, %zmm0
; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
-; AVX512VLBW: # BB#0:
+; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $7, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $9, %zmm0, %zmm0
; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
@@ -620,7 +620,7 @@ define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -636,7 +636,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -652,7 +652,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -661,7 +661,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
-; AVX512VLBW: # BB#0:
+; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -680,7 +680,7 @@ define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
@@ -694,9 +694,9 @@ define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vprold $4, %zmm0, %zmm0
-; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: retq
%shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
@@ -708,7 +708,7 @@ define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
@@ -724,7 +724,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm3
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
@@ -740,7 +740,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
@@ -749,7 +749,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
-; AVX512VLBW: # BB#0:
+; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
@@ -766,7 +766,7 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT: vpsllw $4, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
@@ -784,7 +784,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm3
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
@@ -802,7 +802,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
@@ -813,7 +813,7 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
-; AVX512VLBW: # BB#0:
+; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 392c0de95f24..d46514a6dc7d 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -12,29 +12,29 @@
define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_16i8_to_8i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_8i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: retl
entry:
@@ -45,7 +45,7 @@ entry:
define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_16i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -54,7 +54,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSSE3-NEXT: psraw $8, %xmm2
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -63,7 +63,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_16i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
@@ -71,7 +71,7 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_16i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
@@ -79,17 +79,17 @@ define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ss
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_16i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_16i8_to_16i16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_16i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1
@@ -102,7 +102,7 @@ entry:
define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_32i8_to_32i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
@@ -116,7 +116,7 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_32i8_to_32i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSSE3-NEXT: psraw $8, %xmm4
; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
@@ -130,7 +130,7 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_32i8_to_32i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm5
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -142,7 +142,7 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_32i8_to_32i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
@@ -156,7 +156,7 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_32i8_to_32i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
@@ -164,7 +164,7 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sext_32i8_to_32i16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
@@ -172,12 +172,12 @@ define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ss
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sext_32i8_to_32i16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: sext_32i8_to_32i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5
; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -194,31 +194,31 @@ entry:
define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_16i8_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_4i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0
; X32-SSE41-NEXT: retl
entry:
@@ -229,7 +229,7 @@ entry:
define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: psrad $24, %xmm2
@@ -241,7 +241,7 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -251,7 +251,7 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
@@ -259,7 +259,7 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
@@ -267,17 +267,17 @@ define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_16i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_8i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm1
@@ -291,7 +291,7 @@ entry:
define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_16i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE2-NEXT: psrad $24, %xmm4
@@ -310,7 +310,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -326,7 +326,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_16i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
@@ -338,7 +338,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_16i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
@@ -352,7 +352,7 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_16i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
@@ -360,12 +360,12 @@ define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_16i8_to_16i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_16i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1
@@ -382,7 +382,7 @@ entry:
define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
@@ -392,7 +392,7 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
@@ -402,17 +402,17 @@ define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_16i8_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0
; X32-SSE41-NEXT: retl
entry:
@@ -423,7 +423,7 @@ entry:
define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
@@ -441,7 +441,7 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -457,7 +457,7 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
@@ -465,7 +465,7 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
@@ -473,17 +473,17 @@ define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_16i8_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
; X32-SSE41-NEXT: psrld $16, %xmm0
; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1
@@ -497,7 +497,7 @@ entry:
define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i8_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -529,7 +529,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <u,u,u,2,u,u,u,3,u,u,u,255,u,u,u,255>
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
@@ -558,7 +558,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq %xmm0, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
@@ -571,7 +571,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
@@ -585,7 +585,7 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1
@@ -593,12 +593,12 @@ define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_16i8_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i8_to_8i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4
; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
; X32-SSE41-NEXT: psrld $16, %xmm1
@@ -617,29 +617,29 @@ entry:
define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_8i16_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i16_to_4i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0
; X32-SSE41-NEXT: retl
entry:
@@ -650,7 +650,7 @@ entry:
define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -659,7 +659,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -668,7 +668,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
@@ -676,7 +676,7 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_8i16_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
@@ -684,17 +684,17 @@ define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_8i16_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_8i16_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i16_to_8i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
@@ -707,7 +707,7 @@ entry:
define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_16i16_to_16i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
@@ -721,7 +721,7 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i16_to_16i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSSE3-NEXT: psrad $16, %xmm4
; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
@@ -735,7 +735,7 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i16_to_16i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd %xmm0, %xmm5
; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -747,7 +747,7 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i16_to_16i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
@@ -761,7 +761,7 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i16_to_16i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm1
@@ -769,12 +769,12 @@ define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_16i16_to_16i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_16i16_to_16i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5
; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -791,7 +791,7 @@ entry:
define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -800,7 +800,7 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
@@ -809,17 +809,17 @@ define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwq %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_8i16_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxwq %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i16_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0
; X32-SSE41-NEXT: retl
entry:
@@ -830,7 +830,7 @@ entry:
define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -846,7 +846,7 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
@@ -862,7 +862,7 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm0, %xmm1
@@ -870,7 +870,7 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_8i16_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
@@ -878,17 +878,17 @@ define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_8i16_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_8i16_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i16_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm1
@@ -902,7 +902,7 @@ entry:
define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i16_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -929,7 +929,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i16_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
@@ -956,7 +956,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i16_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwq %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
@@ -968,7 +968,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_8i16_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
@@ -982,7 +982,7 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_8i16_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1
@@ -990,12 +990,12 @@ define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_8i16_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i16_to_8i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1
@@ -1012,31 +1012,31 @@ entry:
define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_4i32_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i32_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i32_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_4i32_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i32_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0
; X32-SSE41-NEXT: retl
entry:
@@ -1047,7 +1047,7 @@ entry:
define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_4i32_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1058,7 +1058,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i32_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1069,7 +1069,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i32_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
@@ -1077,7 +1077,7 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i32_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -1085,17 +1085,17 @@ define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i32_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_4i32_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i32_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
@@ -1108,7 +1108,7 @@ entry:
define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_8i32_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
@@ -1127,7 +1127,7 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_8i32_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: psrad $31, %xmm3
@@ -1146,7 +1146,7 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_8i32_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq %xmm0, %xmm5
; SSE41-NEXT: pmovsxdq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1158,7 +1158,7 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_8i32_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
@@ -1172,7 +1172,7 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_8i32_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
@@ -1180,12 +1180,12 @@ define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_8i32_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_8i32_to_8i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5
; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2
; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1202,7 +1202,7 @@ entry:
define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; SSE-LABEL: load_sext_2i1_to_2i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movzbl (%rdi), %eax
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shlq $62, %rcx
@@ -1215,7 +1215,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; SSE-NEXT: retq
;
; AVX1-LABEL: load_sext_2i1_to_2i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $62, %rcx
@@ -1228,7 +1228,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_2i1_to_2i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movzbl (%rdi), %eax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $62, %rcx
@@ -1241,25 +1241,25 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_2i1_to_2i64:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movzbl (%rdi), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_2i1_to_2i64:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movzbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -1280,7 +1280,7 @@ entry:
define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
; SSE2-LABEL: load_sext_2i8_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1292,7 +1292,7 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_2i8_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1304,17 +1304,17 @@ define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_2i8_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_2i8_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -1326,7 +1326,7 @@ entry:
define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; SSE2-LABEL: load_sext_4i1_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shlq $60, %rcx
@@ -1349,7 +1349,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i1_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movq %rax, %rcx
; SSSE3-NEXT: shlq $60, %rcx
@@ -1372,7 +1372,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i1_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $62, %rcx
@@ -1392,7 +1392,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i1_to_4i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $62, %rcx
@@ -1412,7 +1412,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i1_to_4i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movzbl (%rdi), %eax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $62, %rcx
@@ -1432,27 +1432,25 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_4i1_to_4i32:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movzbl (%rdi), %eax
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_4i1_to_4i32:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -1479,7 +1477,7 @@ entry:
define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -1487,7 +1485,7 @@ define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -1495,17 +1493,17 @@ define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i8_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_4i8_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -1517,7 +1515,7 @@ entry:
define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSE2-LABEL: load_sext_4i1_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $3, %ecx
@@ -1543,7 +1541,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i1_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $3, %ecx
@@ -1569,7 +1567,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i1_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movl (%rdi), %eax
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl %ecx
@@ -1592,7 +1590,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i1_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $62, %rcx
@@ -1616,7 +1614,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i1_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movzbl (%rdi), %eax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $60, %rcx
@@ -1639,23 +1637,23 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_4i1_to_4i64:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movzbl (%rdi), %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_4i1_to_4i64:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movzbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -1685,7 +1683,7 @@ entry:
define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsbq 1(%rdi), %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movsbq (%rdi), %rax
@@ -1699,7 +1697,7 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsbq 1(%rdi), %rax
; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movsbq (%rdi), %rax
@@ -1713,13 +1711,13 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i8_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i8_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1728,17 +1726,17 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i8_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_4i8_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
@@ -1751,7 +1749,7 @@ entry:
define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsbq 3(%rdi), %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movsbq 2(%rdi), %rax
@@ -1760,7 +1758,7 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsbq 3(%rdi), %rax
; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movsbq 2(%rdi), %rax
@@ -1769,33 +1767,33 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -1807,7 +1805,7 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; SSE2-LABEL: load_sext_8i1_to_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsbq (%rdi), %rax
; SSE2-NEXT: movq %rax, %rcx
; SSE2-NEXT: shrq $7, %rcx
@@ -1849,7 +1847,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i1_to_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsbq (%rdi), %rax
; SSSE3-NEXT: movq %rax, %rcx
; SSSE3-NEXT: shrq $7, %rcx
@@ -1891,7 +1889,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i1_to_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movsbq (%rdi), %rax
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $62, %rcx
@@ -1926,7 +1924,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_8i1_to_8i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movsbq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $62, %rcx
@@ -1961,7 +1959,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_8i1_to_8i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movsbq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $62, %rcx
@@ -1996,25 +1994,26 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_8i1_to_8i16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movzbl (%rdi), %eax
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_8i1_to_8i16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movsbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -2056,31 +2055,31 @@ entry:
define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_8i8_to_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i8_to_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i8_to_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_8i8_to_8i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -2092,7 +2091,7 @@ entry:
define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_8i8_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsbq 1(%rdi), %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movsbq (%rdi), %rax
@@ -2116,7 +2115,7 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i8_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsbq 1(%rdi), %rax
; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movsbq (%rdi), %rax
@@ -2140,7 +2139,7 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i8_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2
@@ -2148,7 +2147,7 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_8i8_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -2162,18 +2161,18 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_8i8_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_8i8_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
@@ -2188,7 +2187,7 @@ entry:
define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSE2-LABEL: load_sext_8i1_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $3, %ecx
@@ -2237,7 +2236,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i1_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $3, %ecx
@@ -2286,7 +2285,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i1_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl %ecx
@@ -2327,7 +2326,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_8i1_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movsbq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $58, %rcx
@@ -2363,7 +2362,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_8i1_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movsbq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $58, %rcx
@@ -2399,23 +2398,23 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_8i1_to_8i32:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: movzbl (%rdi), %eax
; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_8i1_to_8i32:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: movzbl (%rdi), %eax
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i1_to_8i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movzbl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -2462,7 +2461,7 @@ entry:
define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-LABEL: load_sext_8i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -2474,7 +2473,7 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -2486,13 +2485,13 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_8i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -2501,17 +2500,17 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_8i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_8i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxbd 4(%eax), %xmm1
@@ -2524,7 +2523,7 @@ entry:
define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; SSE2-LABEL: load_sext_16i1_to_16i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
@@ -2618,7 +2617,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_16i1_to_16i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
@@ -2712,7 +2711,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_16i1_to_16i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movswq (%rdi), %rax
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $62, %rcx
@@ -2778,7 +2777,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_16i1_to_16i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movswq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shlq $62, %rcx
@@ -2844,7 +2843,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_16i1_to_16i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movswq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shlq $62, %rcx
@@ -2910,7 +2909,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_16i1_to_16i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: kmovw (%rdi), %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -2918,15 +2917,15 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_16i1_to_16i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movswl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -2999,7 +2998,7 @@ entry:
define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; SSE2-LABEL: load_sext_16i1_to_16i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movl %eax, %ecx
; SSE2-NEXT: shrl $7, %ecx
@@ -3088,7 +3087,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_16i1_to_16i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movl %eax, %ecx
; SSSE3-NEXT: shrl $7, %ecx
@@ -3177,7 +3176,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_16i1_to_16i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movzwl (%rdi), %eax
; SSE41-NEXT: movl %eax, %ecx
; SSE41-NEXT: shrl %ecx
@@ -3250,36 +3249,24 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_16i1_to_16i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: .Lcfi0:
; AVX1-NEXT: .cfi_def_cfa_offset 16
; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: .Lcfi1:
; AVX1-NEXT: .cfi_def_cfa_offset 24
; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: .Lcfi2:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: pushq %r13
-; AVX1-NEXT: .Lcfi3:
; AVX1-NEXT: .cfi_def_cfa_offset 40
; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: .Lcfi4:
; AVX1-NEXT: .cfi_def_cfa_offset 48
; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: .Lcfi5:
; AVX1-NEXT: .cfi_def_cfa_offset 56
-; AVX1-NEXT: .Lcfi6:
; AVX1-NEXT: .cfi_offset %rbx, -56
-; AVX1-NEXT: .Lcfi7:
; AVX1-NEXT: .cfi_offset %r12, -48
-; AVX1-NEXT: .Lcfi8:
; AVX1-NEXT: .cfi_offset %r13, -40
-; AVX1-NEXT: .Lcfi9:
; AVX1-NEXT: .cfi_offset %r14, -32
-; AVX1-NEXT: .Lcfi10:
; AVX1-NEXT: .cfi_offset %r15, -24
-; AVX1-NEXT: .Lcfi11:
; AVX1-NEXT: .cfi_offset %rbp, -16
; AVX1-NEXT: movswq (%rdi), %rax
; AVX1-NEXT: movq %rax, %rcx
@@ -3353,36 +3340,24 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_16i1_to_16i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: .Lcfi0:
; AVX2-NEXT: .cfi_def_cfa_offset 16
; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: .Lcfi1:
; AVX2-NEXT: .cfi_def_cfa_offset 24
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: .Lcfi2:
; AVX2-NEXT: .cfi_def_cfa_offset 32
; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: .Lcfi3:
; AVX2-NEXT: .cfi_def_cfa_offset 40
; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: .Lcfi4:
; AVX2-NEXT: .cfi_def_cfa_offset 48
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: .Lcfi5:
; AVX2-NEXT: .cfi_def_cfa_offset 56
-; AVX2-NEXT: .Lcfi6:
; AVX2-NEXT: .cfi_offset %rbx, -56
-; AVX2-NEXT: .Lcfi7:
; AVX2-NEXT: .cfi_offset %r12, -48
-; AVX2-NEXT: .Lcfi8:
; AVX2-NEXT: .cfi_offset %r13, -40
-; AVX2-NEXT: .Lcfi9:
; AVX2-NEXT: .cfi_offset %r14, -32
-; AVX2-NEXT: .Lcfi10:
; AVX2-NEXT: .cfi_offset %r15, -24
-; AVX2-NEXT: .Lcfi11:
; AVX2-NEXT: .cfi_offset %rbp, -16
; AVX2-NEXT: movswq (%rdi), %rax
; AVX2-NEXT: movq %rax, %rcx
@@ -3456,21 +3431,21 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_16i1_to_16i16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: kmovw (%rdi), %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_16i1_to_16i16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movzwl (%eax), %eax
; X32-SSE41-NEXT: movl %eax, %ecx
@@ -3549,7 +3524,7 @@ entry:
define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; SSE2-LABEL: load_sext_32i1_to_32i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pushq %rbp
; SSE2-NEXT: pushq %r15
; SSE2-NEXT: pushq %r14
@@ -3721,7 +3696,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_32i1_to_32i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pushq %rbp
; SSSE3-NEXT: pushq %r15
; SSSE3-NEXT: pushq %r14
@@ -3893,7 +3868,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_32i1_to_32i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movswq (%rdi), %rax
; SSE41-NEXT: movq %rax, %rcx
; SSE41-NEXT: shlq $62, %rcx
@@ -4021,7 +3996,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_32i1_to_32i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
@@ -4164,7 +4139,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_32i1_to_32i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
@@ -4307,7 +4282,7 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_sext_32i1_to_32i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: kmovw (%rdi), %k1
; AVX512F-NEXT: kmovw 2(%rdi), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
@@ -4318,14 +4293,14 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_sext_32i1_to_32i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pushl %esi
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: movswl (%eax), %ecx
@@ -4462,7 +4437,7 @@ entry:
define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_sext_16i8_to_16i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
@@ -4472,7 +4447,7 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_16i8_to_16i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
@@ -4482,30 +4457,30 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_16i8_to_16i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_16i8_to_16i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_16i8_to_16i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_16i8_to_16i16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
@@ -4518,7 +4493,7 @@ entry:
define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
; SSE2-LABEL: load_sext_2i16_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSE2-NEXT: movdqa %xmm0, %xmm1
@@ -4528,7 +4503,7 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_2i16_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm0, %xmm1
@@ -4538,17 +4513,17 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_2i16_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_2i16_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -4560,31 +4535,31 @@ entry:
define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_4i16_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i16_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i16_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_4i16_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -4596,7 +4571,7 @@ entry:
define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_4i16_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movswq 2(%rdi), %rax
; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: movswq (%rdi), %rax
@@ -4610,7 +4585,7 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i16_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movswq 2(%rdi), %rax
; SSSE3-NEXT: movq %rax, %xmm1
; SSSE3-NEXT: movswq (%rdi), %rax
@@ -4624,13 +4599,13 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i16_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i16_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -4639,17 +4614,17 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i16_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_4i16_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
@@ -4662,7 +4637,7 @@ entry:
define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-LABEL: load_sext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
@@ -4672,7 +4647,7 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: psrad $16, %xmm0
@@ -4682,30 +4657,30 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_8i16_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_8i16_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0
; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_8i16_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_8i16_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxwd 8(%eax), %xmm1
@@ -4718,7 +4693,7 @@ entry:
define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
; SSE2-LABEL: load_sext_2i32_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
@@ -4726,7 +4701,7 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_2i32_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrad $31, %xmm1
@@ -4734,17 +4709,17 @@ define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_2i32_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: load_sext_2i32_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
; X32-SSE41-NEXT: retl
@@ -4756,7 +4731,7 @@ entry:
define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-LABEL: load_sext_4i32_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
@@ -4768,7 +4743,7 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i32_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: psrad $31, %xmm2
@@ -4780,30 +4755,30 @@ define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i32_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i32_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovsxdq (%rdi), %xmm0
; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_4i32_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_sext_4i32_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
; X32-SSE41-NEXT: pmovsxdq 8(%eax), %xmm1
@@ -4816,35 +4791,34 @@ entry:
define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: sext_2i8_to_i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_2i8_to_i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm0
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_2i8_to_i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: movd %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: sext_2i8_to_i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
;
; X32-SSE41-LABEL: sext_2i8_to_i32:
-; X32-SSE41: # BB#0: # %entry
+; X32-SSE41: # %bb.0: # %entry
; X32-SSE41-NEXT: pushl %eax
-; X32-SSE41-NEXT: .Lcfi0:
; X32-SSE41-NEXT: .cfi_def_cfa_offset 8
; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; X32-SSE41-NEXT: movd %xmm0, %eax
@@ -4859,7 +4833,7 @@ entry:
define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE2-LABEL: sext_4i1_to_4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pslld $31, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -4872,7 +4846,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i1_to_4i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pslld $31, %xmm0
; SSSE3-NEXT: psrad $31, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
@@ -4885,7 +4859,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i1_to_4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pslld $31, %xmm0
; SSE41-NEXT: psrad $31, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
@@ -4895,7 +4869,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i1_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
@@ -4905,21 +4879,21 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i1_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_4i1_to_4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i1_to_4i64:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: pslld $31, %xmm0
; X32-SSE41-NEXT: psrad $31, %xmm0
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
@@ -4933,7 +4907,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE2-LABEL: sext_4i8_to_4i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pslld $24, %xmm0
; SSE2-NEXT: psrad $24, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -4946,7 +4920,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_4i8_to_4i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pslld $24, %xmm0
; SSSE3-NEXT: psrad $24, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
@@ -4959,7 +4933,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_4i8_to_4i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pslld $24, %xmm0
; SSE41-NEXT: psrad $24, %xmm0
; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
@@ -4969,7 +4943,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_4i8_to_4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $24, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
@@ -4979,21 +4953,21 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_4i8_to_4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $24, %xmm0, %xmm0
; AVX2-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sext_4i8_to_4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $24, %xmm0, %xmm0
; AVX512-NEXT: vpsrad $24, %xmm0, %xmm0
; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512-NEXT: retq
;
; X32-SSE41-LABEL: sext_4i8_to_4i64:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: pslld $24, %xmm0
; X32-SSE41-NEXT: psrad $24, %xmm0
; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
@@ -5007,7 +4981,7 @@ define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
; SSE-LABEL: sext_32xi1_to_32xi8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pcmpeqw %xmm5, %xmm1
; SSE-NEXT: pcmpeqw %xmm4, %xmm0
; SSE-NEXT: packsswb %xmm1, %xmm0
@@ -5018,7 +4992,7 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: sext_32xi1_to_32xi8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
@@ -5033,7 +5007,7 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_32xi1_to_32xi8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
@@ -5041,7 +5015,7 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: sext_32xi1_to_32xi8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -5052,14 +5026,14 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: sext_32xi1_to_32xi8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; X32-SSE41-LABEL: sext_32xi1_to_32xi8:
-; X32-SSE41: # BB#0:
+; X32-SSE41: # %bb.0:
; X32-SSE41-NEXT: pushl %ebp
; X32-SSE41-NEXT: movl %esp, %ebp
; X32-SSE41-NEXT: andl $-16, %esp
@@ -5079,3 +5053,51 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
%b = sext <32 x i1> %a to <32 x i8>
ret <32 x i8> %b
}
+
+define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
+; SSE2-LABEL: sext_2i8_to_2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movzwl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: sext_2i8_to_2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movzwl (%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT: psrad $24, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSSE3-NEXT: paddq %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: sext_2i8_to_2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: paddq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: sext_2i8_to_2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
+; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+;
+; X32-SSE41-LABEL: sext_2i8_to_2i32:
+; X32-SSE41: # %bb.0:
+; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: paddq %xmm0, %xmm0
+; X32-SSE41-NEXT: retl
+ %x = load <2 x i8>, <2 x i8>* %addr, align 1
+ %y = sext <2 x i8> %x to <2 x i32>
+ %z = add <2 x i32>%y, %y
+ ret <2 x i32>%z
+}
+
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index a5e2cb66eba8..ea33f22cc07a 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -5,10 +5,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -19,24 +19,23 @@
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrlq %xmm3, %xmm4
-; SSE2-NEXT: psrlq %xmm1, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm3, %xmm2
-; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: xorpd %xmm4, %xmm2
-; SSE2-NEXT: psubq %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psrlq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: psrlq %xmm4, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrlq %xmm1, %xmm3
+; SSE2-NEXT: psrlq %xmm4, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; SSE2-NEXT: xorpd %xmm2, %xmm0
+; SSE2-NEXT: psubq %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrlq %xmm1, %xmm3
@@ -52,7 +51,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
@@ -66,7 +65,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
@@ -75,40 +74,40 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i64:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
-; X32-SSE-NEXT: movdqa %xmm2, %xmm4
-; X32-SSE-NEXT: psrlq %xmm3, %xmm4
-; X32-SSE-NEXT: psrlq %xmm1, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm3, %xmm2
-; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X32-SSE-NEXT: xorpd %xmm4, %xmm2
-; X32-SSE-NEXT: psubq %xmm4, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm0
+; X32-SSE-NEXT: movdqa %xmm2, %xmm3
+; X32-SSE-NEXT: psrlq %xmm1, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; X32-SSE-NEXT: psrlq %xmm4, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm3
+; X32-SSE-NEXT: psrlq %xmm1, %xmm3
+; X32-SSE-NEXT: psrlq %xmm4, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
+; X32-SSE-NEXT: xorpd %xmm2, %xmm0
+; X32-SSE-NEXT: psubq %xmm2, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i64> %a, %b
ret <2 x i64> %shift
@@ -116,31 +115,31 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrad %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad %xmm2, %xmm4
-; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrad %xmm4, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: psrad %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -161,7 +160,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
@@ -177,53 +176,53 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlq $32, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrad %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrlq $32, %xmm2
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrad %xmm2, %xmm4
-; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrad %xmm4, %xmm5
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; X32-SSE-NEXT: psrad %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: retl
%shift = ashr <4 x i32> %a, %b
@@ -232,7 +231,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
@@ -267,7 +266,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
@@ -297,7 +296,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
@@ -315,55 +314,57 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v8i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
@@ -402,7 +403,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
@@ -460,7 +461,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
@@ -496,7 +497,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -524,30 +525,32 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
@@ -613,7 +616,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE-NEXT: psrlq %xmm1, %xmm2
; SSE-NEXT: psrlq %xmm1, %xmm0
@@ -622,7 +625,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
@@ -631,7 +634,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v2i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
@@ -639,7 +642,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v2i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm1
@@ -647,19 +650,20 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT: psrlq %xmm1, %xmm2
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
@@ -673,44 +677,44 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: psrad %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: psrad %xmm2, %xmm0
@@ -722,44 +726,44 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT: psraw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
; X32-SSE-NEXT: movd %eax, %xmm1
; X32-SSE-NEXT: psraw %xmm1, %xmm0
@@ -771,7 +775,7 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
@@ -832,7 +836,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
@@ -870,7 +874,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
@@ -900,7 +904,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
@@ -929,7 +933,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -937,7 +941,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -945,25 +949,27 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
@@ -1033,19 +1039,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $7, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd {{.*#+}} xmm0 = [4611686018427387904,72057594037927936]
-; SSE2-NEXT: xorpd %xmm0, %xmm1
-; SSE2-NEXT: psubq %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrlq $1, %xmm1
+; SSE2-NEXT: psrlq $7, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
+; SSE2-NEXT: xorpd %xmm1, %xmm0
+; SSE2-NEXT: psubq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $7, %xmm1
; SSE41-NEXT: psrlq $1, %xmm0
@@ -1056,7 +1061,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
@@ -1066,7 +1071,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
@@ -1074,39 +1079,39 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [1,7]
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648]
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrlq $7, %xmm2
-; X32-SSE-NEXT: psrlq $1, %xmm1
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
+; X32-SSE-NEXT: psrlq $1, %xmm2
; X32-SSE-NEXT: psrlq $7, %xmm1
-; X32-SSE-NEXT: psrlq $1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: xorpd %xmm2, %xmm1
-; X32-SSE-NEXT: psubq %xmm2, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
+; X32-SSE-NEXT: movdqa %xmm0, %xmm2
+; X32-SSE-NEXT: psrlq $1, %xmm2
+; X32-SSE-NEXT: psrlq $7, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: xorpd %xmm1, %xmm0
+; X32-SSE-NEXT: psubq %xmm1, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift
@@ -1114,23 +1119,23 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $7, %xmm1
+; SSE2-NEXT: psrad $5, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $5, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $7, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad $6, %xmm2
-; SSE2-NEXT: psrad $4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: psrad $4, %xmm2
+; SSE2-NEXT: psrad $6, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $7, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
@@ -1144,7 +1149,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -1155,43 +1160,43 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $7, %xmm1
+; X32-SSE-NEXT: psrad $5, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $5, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT: psrad $7, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad $6, %xmm2
-; X32-SSE-NEXT: psrad $4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: psrad $4, %xmm2
+; X32-SSE-NEXT: psrad $6, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
%shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
@@ -1200,7 +1205,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -1217,7 +1222,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psraw $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
@@ -1230,7 +1235,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
@@ -1240,52 +1245,54 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psraw $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -1306,7 +1313,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -1364,7 +1371,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
@@ -1400,7 +1407,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1428,28 +1435,30 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [8192,24640,41088,57536,49376,32928,16480,32]
; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
@@ -1515,7 +1524,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $7, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
@@ -1525,7 +1534,7 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatconstant_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrad $7, %xmm1
; SSE41-NEXT: psrlq $7, %xmm0
@@ -1533,40 +1542,41 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatconstant_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX2-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $7, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrad $7, %xmm1
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
@@ -1580,32 +1590,32 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrad $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsrad $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrad $5, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
@@ -1614,32 +1624,32 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psraw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psraw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -1648,7 +1658,7 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1657,7 +1667,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1666,14 +1676,14 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1682,7 +1692,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1691,7 +1701,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index 6bb57d8f5f71..a99c70ebd7d9 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -3,10 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
@@ -18,7 +18,7 @@
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
@@ -44,7 +44,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -53,7 +53,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
@@ -65,7 +65,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3
; XOPAVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -74,20 +74,20 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
@@ -113,7 +113,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3
; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
@@ -126,7 +126,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -156,12 +156,12 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
@@ -173,22 +173,22 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -218,7 +218,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = ashr <8 x i32> %a, %b
@@ -227,7 +227,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
@@ -262,8 +262,8 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
@@ -276,7 +276,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
@@ -288,7 +288,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
@@ -300,7 +300,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
@@ -308,15 +308,15 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
@@ -324,12 +324,12 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
@@ -364,8 +364,8 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v16i16:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3
@@ -382,7 +382,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
@@ -437,7 +437,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -465,7 +465,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
@@ -477,7 +477,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
@@ -489,7 +489,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -517,7 +517,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
@@ -525,7 +525,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -553,7 +553,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0
@@ -561,7 +561,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2
; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
@@ -616,7 +616,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -652,7 +652,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -666,7 +666,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
@@ -675,7 +675,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
@@ -686,7 +686,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
@@ -695,21 +695,19 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpsraq %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v4i64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -723,9 +721,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v4i64:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2
; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
@@ -739,7 +735,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
@@ -748,13 +744,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
@@ -763,25 +759,25 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X32-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2
@@ -790,7 +786,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X32-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
@@ -801,7 +797,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
@@ -810,13 +806,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
@@ -825,25 +821,25 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X32-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2
@@ -852,7 +848,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X32-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
@@ -863,7 +859,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
@@ -912,7 +908,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
@@ -941,7 +937,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -952,7 +948,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
@@ -965,7 +961,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
@@ -994,7 +990,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
@@ -1003,7 +999,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
@@ -1032,7 +1028,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
@@ -1041,7 +1037,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
@@ -1090,7 +1086,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
@@ -1128,7 +1124,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
@@ -1146,7 +1142,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
@@ -1154,7 +1150,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1165,7 +1161,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4611686018427387904,72057594037927936,4294967296,2]
; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
@@ -1173,20 +1169,20 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,31,62]
; AVX512-NEXT: vpsravq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
@@ -1213,7 +1209,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3
@@ -1227,7 +1223,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -1245,12 +1241,12 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
@@ -1258,22 +1254,22 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1
; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -1291,7 +1287,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = ashr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
@@ -1300,7 +1296,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
@@ -1319,8 +1315,8 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
@@ -1334,7 +1330,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1345,7 +1341,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
@@ -1356,34 +1352,34 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsraw $4, %xmm1, %xmm2
@@ -1402,8 +1398,8 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v16i16:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
@@ -1421,7 +1417,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1468,7 +1464,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -1496,7 +1492,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1506,7 +1502,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1516,7 +1512,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -1544,14 +1540,14 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -1579,14 +1575,14 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32]
; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1633,7 +1629,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -1669,7 +1665,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
@@ -1681,14 +1677,14 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1698,26 +1694,26 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpsraq $7, %zmm0, %zmm0
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2
; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1
@@ -1729,7 +1725,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1
; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
@@ -1740,7 +1736,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
@@ -1748,12 +1744,12 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrad $5, %xmm0, %xmm0
@@ -1761,22 +1757,22 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0
@@ -1784,7 +1780,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = ashr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -1793,7 +1789,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
@@ -1801,12 +1797,12 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsraw $3, %xmm0, %xmm0
@@ -1814,22 +1810,22 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0
@@ -1837,7 +1833,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = ashr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -1846,7 +1842,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
@@ -1862,7 +1858,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1871,7 +1867,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1881,7 +1877,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1890,7 +1886,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1899,7 +1895,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -1908,7 +1904,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
@@ -1924,7 +1920,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
diff --git a/test/CodeGen/X86/vector-shift-ashr-512.ll b/test/CodeGen/X86/vector-shift-ashr-512.ll
index 4d4b7f4e8223..77fb34a95a39 100644
--- a/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Variable Shifts
@@ -8,7 +8,7 @@
define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsravq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = ashr <8 x i64> %a, %b
@@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = ashr <16 x i32> %a, %b
@@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
@@ -38,7 +38,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <32 x i16> %a, %b
@@ -47,7 +47,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -99,7 +99,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
@@ -141,7 +141,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsraq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -151,7 +151,7 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT: vpsrad %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
@@ -162,14 +162,14 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512DQ-NEXT: vpsraw %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsraw %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -180,7 +180,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpsllw $5, %ymm2, %ymm2
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
@@ -226,7 +226,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
@@ -270,7 +270,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsravq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
%shift = ashr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
@@ -279,7 +279,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
%shift = ashr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
@@ -288,7 +288,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpsravd %zmm2, %zmm0, %zmm0
@@ -299,7 +299,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
@@ -308,7 +308,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
@@ -353,10 +353,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpsraw $4, %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm3[8],zmm0[9],zmm3[9],zmm0[10],zmm3[10],zmm0[11],zmm3[11],zmm0[12],zmm3[12],zmm0[13],zmm3[13],zmm0[14],zmm3[14],zmm0[15],zmm3[15],zmm0[24],zmm3[24],zmm0[25],zmm3[25],zmm0[26],zmm3[26],zmm0[27],zmm3[27],zmm0[28],zmm3[28],zmm0[29],zmm3[29],zmm0[30],zmm3[30],zmm0[31],zmm3[31],zmm0[40],zmm3[40],zmm0[41],zmm3[41],zmm0[42],zmm3[42],zmm0[43],zmm3[43],zmm0[44],zmm3[44],zmm0[45],zmm3[45],zmm0[46],zmm3[46],zmm0[47],zmm3[47],zmm0[56],zmm3[56],zmm0[57],zmm3[57],zmm0[58],zmm3[58],zmm0[59],zmm3[59],zmm0[60],zmm3[60],zmm0[61],zmm3[61],zmm0[62],zmm3[62],zmm0[63],zmm3[63]
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1}
@@ -395,7 +395,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsraq $7, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = ashr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
@@ -404,7 +404,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrad $5, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = ashr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -413,13 +413,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsraw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsraw $3, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -428,7 +428,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -442,7 +442,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -455,15 +455,15 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
define <64 x i8> @ashr_const7_v64i8(<64 x i8> %a) {
; AVX512DQ-LABEL: ashr_const7_v64i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: ashr_const7_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpcmpgtb %zmm0, %zmm1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 9b44ad1dac30..307cf287219d 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -5,10 +5,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -19,17 +19,16 @@
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrlq %xmm3, %xmm2
+; SSE2-NEXT: psrlq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psrlq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psrlq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
@@ -38,7 +37,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
@@ -46,40 +45,39 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i64:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrlq %xmm3, %xmm2
+; X32-SSE-NEXT: psrlq %xmm1, %xmm2
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X32-SSE-NEXT: movapd %xmm2, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
%shift = lshr <2 x i64> %a, %b
ret <2 x i64> %shift
@@ -87,31 +85,31 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psrld %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrlq $32, %xmm2
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrld %xmm2, %xmm4
-; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: psrld %xmm4, %xmm5
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: psrld %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm3
@@ -132,7 +130,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
@@ -148,53 +146,53 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlq $32, %xmm2
; X32-SSE-NEXT: movdqa %xmm0, %xmm3
; X32-SSE-NEXT: psrld %xmm2, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psrlq $32, %xmm2
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-SSE-NEXT: movdqa %xmm0, %xmm4
; X32-SSE-NEXT: psrld %xmm2, %xmm4
-; X32-SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3]
; X32-SSE-NEXT: pxor %xmm3, %xmm3
; X32-SSE-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; X32-SSE-NEXT: movdqa %xmm0, %xmm5
; X32-SSE-NEXT: psrld %xmm4, %xmm5
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X32-SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; X32-SSE-NEXT: psrld %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-SSE-NEXT: retl
%shift = lshr <4 x i32> %a, %b
@@ -203,7 +201,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
@@ -238,7 +236,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
@@ -268,7 +266,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
@@ -286,55 +284,58 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v8i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
@@ -373,7 +374,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -404,7 +405,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -428,7 +429,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
@@ -444,30 +445,32 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -506,32 +509,32 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -541,44 +544,44 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: psrld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: psrld %xmm2, %xmm0
@@ -590,44 +593,44 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT: psrlw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
; X32-SSE-NEXT: movd %eax, %xmm1
; X32-SSE-NEXT: psrlw %xmm1, %xmm0
@@ -639,7 +642,7 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
@@ -673,7 +676,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
@@ -700,7 +703,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
@@ -718,7 +721,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm2
@@ -735,7 +738,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -743,7 +746,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -751,25 +754,27 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
@@ -812,16 +817,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlq $7, %xmm1
-; SSE2-NEXT: psrlq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: psrlq $1, %xmm1
+; SSE2-NEXT: psrlq $7, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlq $7, %xmm1
; SSE41-NEXT: psrlq $1, %xmm0
@@ -829,46 +833,45 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlq $7, %xmm1
-; X32-SSE-NEXT: psrlq $1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: movapd %xmm1, %xmm0
+; X32-SSE-NEXT: psrlq $1, %xmm1
+; X32-SSE-NEXT: psrlq $7, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
%shift = lshr <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift
@@ -876,23 +879,23 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $7, %xmm1
+; SSE2-NEXT: psrld $5, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $5, %xmm2
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrld $7, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld $6, %xmm2
-; SSE2-NEXT: psrld $4, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: psrld $4, %xmm2
+; SSE2-NEXT: psrld $6, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $7, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
@@ -906,7 +909,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -917,43 +920,43 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $7, %xmm1
+; X32-SSE-NEXT: psrld $5, %xmm1
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld $5, %xmm2
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; X32-SSE-NEXT: psrld $7, %xmm2
+; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld $6, %xmm2
-; X32-SSE-NEXT: psrld $4, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: psrld $4, %xmm2
+; X32-SSE-NEXT: psrld $6, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-SSE-NEXT: retl
%shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
@@ -962,7 +965,7 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrlw $4, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -979,7 +982,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrlw $4, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
@@ -992,7 +995,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
@@ -1002,52 +1005,55 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
; X32-SSE-NEXT: psrlw $4, %xmm1
; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
@@ -1068,7 +1074,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -1099,7 +1105,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrlw $4, %xmm2
@@ -1120,7 +1126,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
@@ -1136,28 +1142,30 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -1196,32 +1204,32 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsrlq $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $7, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlq $7, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <2 x i64> %a, <i64 7, i64 7>
@@ -1230,32 +1238,32 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrld $5, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
@@ -1264,32 +1272,32 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -1298,38 +1306,38 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psrlw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll
index 46be36b76e98..0192c8ac05df 100644
--- a/test/CodeGen/X86/vector-shift-lshr-256.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-256.ll
@@ -3,10 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
@@ -18,7 +18,7 @@
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
@@ -33,12 +33,12 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
@@ -50,22 +50,22 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4
@@ -80,7 +80,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <4 x i64> %a, %b
@@ -89,7 +89,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -119,12 +119,12 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
@@ -136,22 +136,22 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -181,7 +181,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <8 x i32> %a, %b
@@ -190,7 +190,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
@@ -225,8 +225,8 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
@@ -239,7 +239,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubw %xmm2, %xmm3, %xmm2
@@ -251,7 +251,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubw %xmm2, %xmm3, %xmm2
@@ -263,7 +263,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
@@ -271,15 +271,15 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
@@ -287,12 +287,12 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
@@ -327,8 +327,8 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v16i16:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3
@@ -345,7 +345,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -379,7 +379,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -395,7 +395,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpsubb %xmm2, %xmm3, %xmm2
@@ -407,7 +407,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT: vpsubb %xmm2, %xmm3, %xmm2
@@ -419,7 +419,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -435,7 +435,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
@@ -443,7 +443,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -459,7 +459,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
@@ -467,7 +467,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -501,7 +501,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
@@ -525,7 +525,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
@@ -533,12 +533,12 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
@@ -546,24 +546,22 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v4i64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2
; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
@@ -571,9 +569,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v4i64:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -583,7 +579,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
@@ -592,13 +588,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
@@ -607,25 +603,25 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X32-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2
@@ -634,7 +630,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X32-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
@@ -645,7 +641,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
@@ -654,13 +650,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
@@ -669,25 +665,25 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
@@ -696,7 +692,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
@@ -707,7 +703,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -739,7 +735,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -756,7 +752,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
@@ -767,7 +763,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
@@ -780,7 +776,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -797,7 +793,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
@@ -806,7 +802,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -823,7 +819,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
@@ -832,7 +828,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -864,7 +860,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
@@ -890,7 +886,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm2
; AVX1-NEXT: vpsrlq $31, %xmm1, %xmm1
@@ -902,12 +898,12 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubq {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -918,22 +914,22 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -949,7 +945,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
@@ -958,7 +954,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -976,12 +972,12 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
@@ -989,22 +985,22 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1
; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2
; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
@@ -1022,7 +1018,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
@@ -1031,7 +1027,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
@@ -1050,8 +1046,8 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
@@ -1065,7 +1061,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -1076,7 +1072,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsubw {{.*}}(%rip), %xmm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
@@ -1087,34 +1083,34 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
@@ -1133,8 +1129,8 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v16i16:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
@@ -1152,7 +1148,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1182,7 +1178,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1198,7 +1194,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1208,7 +1204,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX2-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1218,7 +1214,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1234,14 +1230,14 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1257,14 +1253,14 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1294,7 +1290,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1318,7 +1314,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
@@ -1326,12 +1322,12 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
@@ -1339,22 +1335,22 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0
@@ -1362,7 +1358,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
@@ -1371,7 +1367,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
@@ -1379,12 +1375,12 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrld $5, %xmm0, %xmm0
@@ -1392,22 +1388,22 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0
@@ -1415,7 +1411,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -1424,7 +1420,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
@@ -1432,12 +1428,12 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
@@ -1445,22 +1441,22 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0
@@ -1468,7 +1464,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -1477,7 +1473,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
@@ -1488,13 +1484,13 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOPAVX1-NEXT: vpsubb {{.*}}(%rip), %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1504,25 +1500,25 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
@@ -1533,7 +1529,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
diff --git a/test/CodeGen/X86/vector-shift-lshr-512.ll b/test/CodeGen/X86/vector-shift-lshr-512.ll
index c269f8159517..1157c0a66968 100644
--- a/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Variable Shifts
@@ -8,7 +8,7 @@
define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrlvq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = lshr <8 x i64> %a, %b
@@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = lshr <16 x i32> %a, %b
@@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
@@ -38,7 +38,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = lshr <32 x i16> %a, %b
@@ -47,7 +47,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
@@ -78,7 +78,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
@@ -105,7 +105,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -115,7 +115,7 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT: vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
@@ -126,14 +126,14 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -144,7 +144,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -173,7 +173,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -202,7 +202,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
%shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
@@ -211,7 +211,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
%shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
@@ -220,7 +220,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0
@@ -231,7 +231,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
@@ -240,7 +240,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -268,8 +268,8 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -295,7 +295,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrlq $7, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
@@ -304,7 +304,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsrld $5, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -313,13 +313,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -328,7 +328,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -337,7 +337,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index 568bf6e974f7..b518ad5fcffd 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -5,10 +5,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
@@ -19,17 +19,16 @@
define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE2-LABEL: var_shift_v2i64:
-; SSE2: # BB#0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psllq %xmm3, %xmm2
+; SSE2-NEXT: psllq %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: psllq %xmm1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllq %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
@@ -38,7 +37,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
@@ -46,38 +45,37 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v2i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v2i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v2i64:
-; X32-SSE: # BB#0:
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psllq %xmm3, %xmm2
+; X32-SSE-NEXT: psllq %xmm1, %xmm2
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-SSE-NEXT: psllq %xmm1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; X32-SSE-NEXT: movapd %xmm2, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X32-SSE-NEXT: retl
%shift = shl <2 x i64> %a, %b
ret <2 x i64> %shift
@@ -85,7 +83,7 @@ define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: var_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pslld $23, %xmm1
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
@@ -100,7 +98,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pslld $23, %xmm1
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
@@ -108,7 +106,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
@@ -116,32 +114,32 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pslld $23, %xmm1
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
@@ -160,7 +158,7 @@ define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: var_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psllw $12, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
@@ -195,7 +193,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: psllw $12, %xmm0
@@ -225,7 +223,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: var_shift_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
@@ -243,53 +241,56 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; XOP-LABEL: var_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm1 killed %xmm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v8i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $12, %xmm1
; X32-SSE-NEXT: movdqa %xmm1, %xmm2
; X32-SSE-NEXT: psraw $15, %xmm2
@@ -328,7 +329,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: var_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psllw $5, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -358,7 +359,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: var_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psllw $5, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm3
@@ -381,7 +382,7 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
@@ -396,28 +397,30 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: var_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: var_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: var_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $5, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -455,32 +458,32 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE-LABEL: splatvar_shift_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllq %xmm1, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
@@ -490,44 +493,44 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pslld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm2, %xmm2
; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X32-SSE-NEXT: pslld %xmm2, %xmm0
@@ -539,44 +542,44 @@ define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pextrw $0, %xmm1, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT: psllw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: splatvar_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatvar_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pextrw $0, %xmm1, %eax
; X32-SSE-NEXT: movd %eax, %xmm1
; X32-SSE-NEXT: psllw %xmm1, %xmm0
@@ -588,7 +591,7 @@ define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: splatvar_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
@@ -621,7 +624,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pshufb %xmm0, %xmm1
@@ -647,7 +650,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_shift_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
@@ -664,7 +667,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX2-NEXT: vpsllw $4, %xmm0, %xmm2
@@ -680,38 +683,40 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatvar_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
@@ -753,16 +758,15 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllq $7, %xmm1
-; SSE2-NEXT: psllq $1, %xmm0
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: psllq $1, %xmm1
+; SSE2-NEXT: psllq $7, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psllq $7, %xmm1
; SSE41-NEXT: psllq $1, %xmm0
@@ -770,44 +774,43 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v2i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v2i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psllq $7, %xmm1
-; X32-SSE-NEXT: psllq $1, %xmm0
-; X32-SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-SSE-NEXT: movapd %xmm1, %xmm0
+; X32-SSE-NEXT: psllq $1, %xmm1
+; X32-SSE-NEXT: psllq $7, %xmm0
+; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X32-SSE-NEXT: retl
%shift = shl <2 x i64> %a, <i64 1, i64 7>
ret <2 x i64> %shift
@@ -815,7 +818,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
@@ -827,42 +830,42 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
@@ -878,45 +881,46 @@ define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: constant_shift_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: constant_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
@@ -925,7 +929,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -955,7 +959,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $4, %xmm2
@@ -975,7 +979,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: constant_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $4, %xmm0, %xmm1
; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
@@ -990,26 +994,28 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: retq
;
; XOP-LABEL: constant_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: constant_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: constant_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [8192,24640,41088,57536,49376,32928,16480,32]
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm3, %xmm3
@@ -1047,32 +1053,32 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $7, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsllq $7, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq $7, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllq $7, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <2 x i64> %a, <i64 7, i64 7>
@@ -1081,32 +1087,32 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpslld $5, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pslld $5, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
@@ -1115,32 +1121,32 @@ define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: retl
%shift = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -1149,36 +1155,36 @@ define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllw $3, %xmm0
; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: splatconstant_shift_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; X32-SSE-LABEL: splatconstant_shift_v16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: psllw $3, %xmm0
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: retl
diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll
index 4a134f440a78..04713881271d 100644
--- a/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -3,10 +3,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1
@@ -18,7 +18,7 @@
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
@@ -33,12 +33,12 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpshlq %xmm2, %xmm3, %xmm2
@@ -47,22 +47,22 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4
@@ -77,7 +77,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <4 x i64> %a, %b
@@ -86,7 +86,7 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
@@ -102,12 +102,12 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpshld %xmm2, %xmm3, %xmm2
@@ -116,22 +116,22 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: var_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vpslld $23, %xmm2, %xmm2
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
@@ -147,7 +147,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <8 x i32> %a, %b
@@ -156,7 +156,7 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
@@ -191,8 +191,8 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
@@ -205,7 +205,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpshlw %xmm2, %xmm3, %xmm2
@@ -214,7 +214,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vpshlw %xmm2, %xmm3, %xmm2
@@ -223,7 +223,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
@@ -231,15 +231,15 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
@@ -247,12 +247,12 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3
; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2
@@ -287,8 +287,8 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v16i16:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X32-AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3
@@ -305,7 +305,7 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -336,7 +336,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -351,7 +351,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm3, %xmm2
@@ -360,7 +360,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT: vpshlb %xmm2, %xmm3, %xmm2
@@ -369,7 +369,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -384,7 +384,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
@@ -392,7 +392,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -407,7 +407,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
@@ -415,7 +415,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: var_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -446,7 +446,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: var_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1
; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
@@ -469,7 +469,7 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
@@ -477,12 +477,12 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
@@ -490,24 +490,22 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v4i64:
-; X32-AVX1: # BB#0:
-; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax
-; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
@@ -515,9 +513,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v4i64:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -527,7 +523,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
@@ -536,13 +532,13 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
@@ -551,25 +547,25 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X32-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
@@ -578,7 +574,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X32-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
@@ -589,7 +585,7 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
@@ -598,13 +594,13 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
@@ -613,25 +609,25 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; XOPAVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatvar_shift_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatvar_shift_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X32-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2
@@ -640,7 +636,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; X32-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
@@ -651,7 +647,7 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind
define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -680,7 +676,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -696,7 +692,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -706,7 +702,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatvar_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
@@ -716,7 +712,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: splatvar_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -732,7 +728,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
@@ -741,7 +737,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: splatvar_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
@@ -757,7 +753,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %ymm1
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
@@ -766,7 +762,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: splatvar_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -795,7 +791,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatvar_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1
; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2
@@ -820,7 +816,7 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $62, %xmm1, %xmm2
; AVX1-NEXT: vpsllq $31, %xmm1, %xmm1
@@ -832,12 +828,12 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshlq {{.*}}(%rip), %xmm0, %xmm0
@@ -845,22 +841,22 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0]
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -876,7 +872,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllvq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
@@ -885,7 +881,7 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -893,12 +889,12 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
@@ -906,22 +902,22 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: constant_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: constant_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
@@ -929,7 +925,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
@@ -938,7 +934,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
@@ -946,12 +942,12 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
@@ -959,35 +955,35 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v16i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v16i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v16i16:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v16i16:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm0
@@ -995,7 +991,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
@@ -1004,7 +1000,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -1031,7 +1027,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1046,7 +1042,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: constant_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
@@ -1055,7 +1051,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: constant_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1
@@ -1064,7 +1060,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX2-NEXT: retq
;
; AVX512DQ-LABEL: constant_shift_v32i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1079,14 +1075,14 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
-; AVX512DQVL: # BB#0:
+; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512DQVL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1101,14 +1097,14 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: constant_shift_v32i8:
-; AVX512BWVL: # BB#0:
+; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; X32-AVX1-LABEL: constant_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm2
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -1135,7 +1131,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: constant_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
@@ -1158,7 +1154,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllq $7, %xmm0, %xmm0
@@ -1166,12 +1162,12 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v4i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v4i64:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsllq $7, %xmm0, %xmm0
@@ -1179,22 +1175,22 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v4i64:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllq $7, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v4i64:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v4i64:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm0
@@ -1202,7 +1198,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v4i64:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllq $7, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
@@ -1211,7 +1207,7 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $5, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $5, %xmm0, %xmm0
@@ -1219,12 +1215,12 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $5, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v8i32:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpslld $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpslld $5, %xmm0, %xmm0
@@ -1232,22 +1228,22 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v8i32:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpslld $5, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v8i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v8i32:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm0
@@ -1255,7 +1251,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v8i32:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpslld $5, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -1264,7 +1260,7 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
@@ -1272,12 +1268,12 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v16i16:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vpsllw $3, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vpsllw $3, %xmm0, %xmm0
@@ -1285,22 +1281,22 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v16i16:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v16i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v16i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v16i16:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm1
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0
@@ -1308,7 +1304,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v16i16:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
%shift = shl <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -1317,7 +1313,7 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_shift_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
@@ -1328,13 +1324,13 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_shift_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_shift_v32i8:
-; XOPAVX1: # BB#0:
+; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1
@@ -1343,25 +1339,25 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_shift_v32i8:
-; XOPAVX2: # BB#0:
+; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_shift_v32i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_shift_v32i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; X32-AVX1-LABEL: splatconstant_shift_v32i8:
-; X32-AVX1: # BB#0:
+; X32-AVX1: # %bb.0:
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
@@ -1372,7 +1368,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: splatconstant_shift_v32i8:
-; X32-AVX2: # BB#0:
+; X32-AVX2: # %bb.0:
; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
diff --git a/test/CodeGen/X86/vector-shift-shl-512.ll b/test/CodeGen/X86/vector-shift-shl-512.ll
index 39f8fe2f05dc..807319a4b247 100644
--- a/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
;
; Variable Shifts
@@ -8,7 +8,7 @@
define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsllvq %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = shl <8 x i64> %a, %b
@@ -17,7 +17,7 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = shl <16 x i32> %a, %b
@@ -26,7 +26,7 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpsllvd %zmm2, %zmm0, %zmm0
@@ -38,7 +38,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = shl <32 x i16> %a, %b
@@ -47,7 +47,7 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm4
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4
@@ -75,7 +75,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: var_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
@@ -100,7 +100,7 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
%splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -110,7 +110,7 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT: vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT: retq
@@ -121,14 +121,14 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind
define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512DQ-NEXT: vpsllw %xmm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
@@ -139,7 +139,7 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
@@ -165,7 +165,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -192,7 +192,7 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
%shift = shl <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
@@ -201,7 +201,7 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT: retq
%shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
@@ -210,14 +210,14 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
@@ -226,7 +226,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $4, %ymm0, %ymm2
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
@@ -251,8 +251,8 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -276,7 +276,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpsllq $7, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
@@ -285,7 +285,7 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpslld $5, %zmm0, %zmm0
; ALL-NEXT: retq
%shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -294,13 +294,13 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vpsllw $3, %ymm1, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
@@ -309,7 +309,7 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsllw $3, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248]
; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0
@@ -318,7 +318,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 9f1ed021992d..2fcbd89b857e 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -8,32 +8,32 @@
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -42,29 +42,29 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -75,7 +75,7 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -84,22 +84,22 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -110,13 +110,13 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX-NEXT: retq
@@ -126,13 +126,13 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(
define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
@@ -142,7 +142,7 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(
define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -152,17 +152,17 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
@@ -171,12 +171,12 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(
define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
@@ -185,19 +185,19 @@ define <16 x i8> @shuffle_v16i8_00_00_01_01_02_02_03_03_04_04_05_05_06_06_07_07(
define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_0101010101010101:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_0101010101010101:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -206,12 +206,12 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -220,12 +220,12 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(
define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -234,7 +234,7 @@ define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(
define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
@@ -244,7 +244,7 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -252,7 +252,7 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -260,14 +260,14 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2OR512VL-NEXT: retq
@@ -277,7 +277,7 @@ define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -290,17 +290,17 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
@@ -309,7 +309,7 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_11_10_09_08_15_14_13_12(
define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
@@ -321,19 +321,19 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,4,2,0,14,12,10,8,7,5,3,1,15,13,11,9]
; AVX-NEXT: retq
@@ -343,7 +343,7 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_07_06_05_04_19_18_17_16_23_22_21_20(
define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -363,21 +363,21 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,14,13,12,7,6,5,4,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,11,10,9,8,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -388,7 +388,7 @@ define <16 x i8> @shuffle_v16i8_03_02_01_00_31_30_29_28_11_10_09_08_23_22_21_20(
define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
@@ -396,14 +396,14 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
@@ -411,13 +411,13 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
@@ -428,7 +428,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
@@ -436,14 +436,14 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6],zero,xmm0[8,9,10],zero,xmm0[12,13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
@@ -451,13 +451,13 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
@@ -468,17 +468,17 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
@@ -487,7 +487,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_zz_04_05_06_zz_08_09_10_zz_12_13_14_zz(
define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
@@ -495,14 +495,14 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[4],zero,zero,xmm1[7],zero,zero,zero,zero,xmm1[12],zero,zero,xmm1[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,xmm0[5,6],zero,xmm0[8,9,10,11],zero,xmm0[13,14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
@@ -510,13 +510,13 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
@@ -527,7 +527,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE2-NEXT: andps %xmm2, %xmm1
; SSE2-NEXT: andnps %xmm0, %xmm2
@@ -536,14 +536,14 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[4,5,6,7],zero,zero,xmm0[10,11],zero,xmm0[13],zero,xmm0[15]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,2,3],zero,zero,zero,zero,xmm1[8,9],zero,zero,xmm1[12],zero,xmm1[14],zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
@@ -551,13 +551,13 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
@@ -568,24 +568,24 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
; SSE2-LABEL: trunc_v4i32_shuffle:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_v4i32_shuffle:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_v4i32_shuffle:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_v4i32_shuffle:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -617,7 +617,7 @@ define <16 x i8> @undef_test1(<16 x i8> %s.0.5, <16 x i8> %s.0.8, <16 x i8> %s.0
; them because the result is 'undef'.
;
; ALL-LABEL: undef_test1:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: retq
entry:
%s.1.8 = shufflevector <16 x i8> %s.0.8, <16 x i8> undef, <16 x i32> <i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 6, i32 undef, i32 6, i32 undef, i32 14, i32 14, i32 undef, i32 undef, i32 0>
@@ -639,24 +639,24 @@ entry:
define <16 x i8> @PR20540(<8 x i8> %a) {
; SSE2-LABEL: PR20540:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR20540:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR20540:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: PR20540:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -665,13 +665,13 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzbl %dil, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movzbl %dil, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
@@ -682,27 +682,27 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $2, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $2, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $5, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -713,27 +713,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shll $8, %edi
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $7, %edi, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shll $8, %edi
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $7, %edi, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $15, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -744,27 +744,27 @@ define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(
define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzbl %dil, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: pinsrw $1, %eax, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movzbl %dil, %eax
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: pinsrw $1, %eax, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $2, %edi, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -775,12 +775,12 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 undef, i32 18, i32 undef>
@@ -789,12 +789,12 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_16_uu_18_uu(
define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 undef, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 09, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -803,24 +803,24 @@ define <16 x i8> @shuffle_v16i8_28_uu_30_31_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -829,7 +829,7 @@ define <16 x i8> @shuffle_v16i8_31_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(
define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -837,17 +837,17 @@ define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -856,24 +856,24 @@ define <16 x i8> @shuffle_v16i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14(
define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
@@ -882,26 +882,26 @@ define <16 x i8> @shuffle_v16i8_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_00(
define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
@@ -910,7 +910,7 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(
define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
@@ -918,17 +918,17 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
@@ -937,26 +937,26 @@ define <16 x i8> @shuffle_v16i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(
define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -966,7 +966,7 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(
; PR31151
define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) {
; SSE2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
@@ -979,19 +979,19 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
@@ -1001,24 +1001,24 @@ define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(
define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1027,7 +1027,7 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_uu_uu_uu_uu_01_uu_uu_uu_uu_uu_uu_uu(
define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1035,17 +1035,17 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 1, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1054,24 +1054,24 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_zz_zz_zz_zz_01_zz_zz_zz_zz_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef>
@@ -1080,26 +1080,26 @@ define <16 x i8> @shuffle_v16i8_00_uu_uu_uu_01_uu_uu_uu_02_uu_uu_uu_03_uu_uu_uu(
define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 1, i32 21, i32 22, i32 23, i32 2, i32 25, i32 26, i32 27, i32 3, i32 29, i32 30, i32 31>
@@ -1108,22 +1108,22 @@ define <16 x i8> @shuffle_v16i8_00_zz_zz_zz_01_zz_zz_zz_02_zz_zz_zz_03_zz_zz_zz(
define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7, i32 undef>
@@ -1132,24 +1132,24 @@ define <16 x i8> @shuffle_v16i8_00_uu_01_uu_02_uu_03_uu_04_uu_05_uu_06_uu_07_uu(
define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(<16 x i8> %a) {
; SSE2-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 1, i32 19, i32 2, i32 21, i32 3, i32 23, i32 4, i32 25, i32 5, i32 27, i32 6, i32 29, i32 7, i32 31>
@@ -1158,7 +1158,7 @@ define <16 x i8> @shuffle_v16i8_00_zz_01_zz_02_zz_03_zz_04_zz_05_zz_06_zz_07_zz(
define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
@@ -1190,21 +1190,21 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -1215,6 +1215,28 @@ entry:
ret <16 x i8> %shuffle
}
+define <16 x i8> @shuffle_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: shuffle_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:
+; SSE: # %bb.0:
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_v16i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30:

+; AVX: # %bb.0:
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = bitcast <8 x i16> %1 to <16 x i8>
+ %4 = bitcast <8 x i16> %2 to <16 x i8>
+ %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %5
+}
+
define <16 x i8> @stress_test2(<16 x i8> %s.0.0, <16 x i8> %s.0.1, <16 x i8> %s.0.2) {
; Nothing interesting to test here. Just make sure we didn't crash.
; ALL-LABEL: stress_test2:
@@ -1229,21 +1251,21 @@ entry:
define void @constant_gets_selected(<4 x i32>* %ptr1, <4 x i32>* %ptr2) {
; SSE-LABEL: constant_gets_selected:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movaps %xmm0, (%rdi)
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: constant_gets_selected:
-; AVX1OR2: # BB#0: # %entry
+; AVX1OR2: # %bb.0: # %entry
; AVX1OR2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX1OR2-NEXT: vmovaps %xmm0, (%rdi)
; AVX1OR2-NEXT: vmovaps %xmm0, (%rsi)
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: constant_gets_selected:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa %xmm0, (%rdi)
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
@@ -1263,12 +1285,12 @@ entry:
define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
@@ -1277,12 +1299,12 @@ define <16 x i8> @shuffle_v16i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(
define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $24, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $24, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
@@ -1291,12 +1313,12 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(
define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $56, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $56, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 8>
@@ -1305,12 +1327,12 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_zz_zz_00_zz_zz_zz_zz_zz_zz_zz_08(
define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 16, i32 0, i32 undef, i32 2, i32 3, i32 undef, i32 5, i32 6, i32 16, i32 8, i32 9, i32 undef, i32 11, i32 12, i32 13, i32 14>
@@ -1319,12 +1341,12 @@ define <16 x i8> @shuffle_v16i8_zz_00_uu_02_03_uu_05_06_zz_08_09_uu_11_12_13_14(
define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 undef, i32 16, i32 undef, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
@@ -1333,12 +1355,12 @@ define <16 x i8> @shuffle_v16i8_01_uu_uu_uu_uu_zz_uu_zz_uu_zz_11_zz_13_zz_15_zz(
define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 14, i32 15, i32 16, i32 16>
@@ -1347,12 +1369,12 @@ define <16 x i8> @shuffle_v16i8_02_03_zz_zz_06_07_uu_uu_uu_uu_uu_uu_14_15_zz_zz(
define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $56, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $56, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32><i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16>
@@ -1361,7 +1383,7 @@ define <16 x i8> @shuffle_v16i8_07_zz_zz_zz_zz_zz_uu_uu_15_uu_uu_uu_uu_uu_zz_zz(
define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; SSE2-LABEL: PR12412:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1369,7 +1391,7 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR12412:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
@@ -1377,28 +1399,20 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR12412:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
-; AVX1OR2-LABEL: PR12412:
-; AVX1OR2: # BB#0: # %entry
-; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: PR12412:
-; AVX512VL: # BB#0: # %entry
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: PR12412:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
entry:
%0 = shufflevector <16 x i8> %inval1, <16 x i8> %inval2, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
ret <16 x i8> %0
@@ -1406,12 +1420,12 @@ entry:
define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(<16 x i8> %a) {
; SSE-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $8, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
@@ -1420,12 +1434,12 @@ define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(
define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle8 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
@@ -1439,7 +1453,7 @@ define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_mem_v16i8_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1447,28 +1461,28 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v16i8_i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v16i8_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -1480,7 +1494,7 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(i32* %ptr) {
define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1489,7 +1503,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
@@ -1497,7 +1511,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
@@ -1505,7 +1519,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1513,7 +1527,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v16i8_sext_i8:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
@@ -1526,7 +1540,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
@@ -1534,25 +1548,25 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v16i8_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i8_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -1564,7 +1578,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(i32* %ptr) {
define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
@@ -1572,25 +1586,25 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt2_mem_v16i8_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt2_mem_v16i8_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -1602,7 +1616,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(i32* %ptr) {
define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1611,28 +1625,28 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
@@ -1640,7 +1654,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v16i8_sext_i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $8, %eax
; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
@@ -1655,7 +1669,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(i8* %ptr) {
define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1664,28 +1678,28 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
@@ -1693,7 +1707,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt2_mem_v16i8_sext_i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: vpbroadcastb %eax, %xmm0
@@ -1708,7 +1722,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(i8* %ptr) {
define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) {
; SSE2-LABEL: PR31364:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movzbl (%rsi), %ecx
; SSE2-NEXT: shll $8, %ecx
@@ -1727,7 +1741,7 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31364:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movzbl (%rsi), %ecx
; SSSE3-NEXT: shll $8, %ecx
@@ -1738,7 +1752,7 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31364:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0
@@ -1746,7 +1760,7 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
; SSE41-NEXT: retq
;
; AVX-LABEL: PR31364:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0
@@ -1762,7 +1776,7 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) {
; SSE2-LABEL: PR31301:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1777,7 +1791,7 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR31301:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
@@ -1789,7 +1803,7 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR31301:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
@@ -1801,7 +1815,7 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR31301:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -1813,7 +1827,7 @@ define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y)
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR31301:
-; AVX2OR512VL: # BB#0: # %entry
+; AVX2OR512VL: # %bb.0: # %entry
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %xmm0
; AVX2OR512VL-NEXT: vpbroadcastb (%rsi), %xmm1
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 669aee42fe48..4e8f8aeec28b 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -9,22 +9,22 @@
define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_00:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_00:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_00:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 0>
@@ -32,48 +32,48 @@ define <2 x i64> @shuffle_v2i64_00(<2 x i64> %a, <2 x i64> %b) {
}
define <2 x i64> @shuffle_v2i64_10(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_10:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_10:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_11(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_11:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_11:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_22:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_22:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_22:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_22:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 2>
@@ -81,26 +81,26 @@ define <2 x i64> @shuffle_v2i64_22(<2 x i64> %a, <2 x i64> %b) {
}
define <2 x i64> @shuffle_v2i64_32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_32:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_33:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_33:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 3>
ret <2 x i64> %shuffle
@@ -108,27 +108,27 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: shuffle_v2f64_00:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2f64_00:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2f64_00:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2f64_00:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_00:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
@@ -136,12 +136,12 @@ define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: shuffle_v2f64_10:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_10:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT: retq
@@ -150,12 +150,12 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: shuffle_v2f64_11:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_11:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
@@ -163,28 +163,28 @@ define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: shuffle_v2f64_22:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2f64_22:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2f64_22:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2f64_22:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_22:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
@@ -192,13 +192,13 @@ define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: shuffle_v2f64_32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
; AVX-NEXT: retq
@@ -207,13 +207,13 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: shuffle_v2f64_33:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_33:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
@@ -221,40 +221,40 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: shuffle_v2f64_03:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2f64_03:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2f64_03:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2f64_03:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2f64_03:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2f64_03:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2f64_03:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
@@ -262,37 +262,37 @@ define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: shuffle_v2f64_21:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2f64_21:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2f64_21:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2f64_21:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2f64_21:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2f64_21:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2f64_21:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
@@ -302,139 +302,139 @@ define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
define <2 x i64> @shuffle_v2i64_02(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_02:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_02:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_02_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_02_copy:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_02_copy:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm2[0]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_03:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_03:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_03:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_03:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_03:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_03:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_03:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_03_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_03_copy:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_03_copy:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE3-NEXT: movapd %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_03_copy:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSSE3-NEXT: movapd %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_03_copy:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_03_copy:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_03_copy:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_03_copy:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_12:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_12:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_12:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_12:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
@@ -442,31 +442,31 @@ define <2 x i64> @shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b) {
}
define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_12_copy:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_12_copy:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm2[0]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_12_copy:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_12_copy:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm2 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_12_copy:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
@@ -474,164 +474,164 @@ define <2 x i64> @shuffle_v2i64_12_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
}
define <2 x i64> @shuffle_v2i64_13(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_13:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_13:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_13_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_13_copy:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_13_copy:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm2[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm2[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_20(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_20:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_20:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_20_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_20_copy:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_20_copy:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_21:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_21:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_21:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_21:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_21:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_21:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_21:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_21_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_21_copy:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_21_copy:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_21_copy:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_21_copy:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_21_copy:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_21_copy:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_21_copy:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 2, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_30:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_30:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_30:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_30:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_30:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
@@ -639,31 +639,31 @@ define <2 x i64> @shuffle_v2i64_30(<2 x i64> %a, <2 x i64> %b) {
}
define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: shuffle_v2i64_30_copy:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_30_copy:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
; SSE3-NEXT: movapd %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_30_copy:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_30_copy:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_30_copy:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 0>
@@ -671,28 +671,28 @@ define <2 x i64> @shuffle_v2i64_30_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
}
define <2 x i64> @shuffle_v2i64_31(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_31:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_31:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
ret <2 x i64> %shuffle
}
define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: shuffle_v2i64_31_copy:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_31_copy:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm1[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm1[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
ret <2 x i64> %shuffle
@@ -700,12 +700,12 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
; SSE-LABEL: shuffle_v2i64_0z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_0z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -714,12 +714,12 @@ define <2 x i64> @shuffle_v2i64_0z(<2 x i64> %a) {
define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
; SSE-LABEL: shuffle_v2i64_1z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_1z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 1, i32 3>
@@ -728,12 +728,12 @@ define <2 x i64> @shuffle_v2i64_1z(<2 x i64> %a) {
define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
; SSE-LABEL: shuffle_v2i64_z0:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2i64_z0:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32> <i32 2, i32 0>
@@ -742,43 +742,43 @@ define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
; SSE2-LABEL: shuffle_v2i64_z1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_z1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorpd %xmm1, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_z1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorpd %xmm1, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_z1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_z1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_z1:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_z1:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: retq
@@ -788,12 +788,12 @@ define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
; SSE-LABEL: shuffle_v2f64_0z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_0z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 0, i32 3>
@@ -802,27 +802,27 @@ define <2 x double> @shuffle_v2f64_0z(<2 x double> %a) {
define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
; SSE-LABEL: shuffle_v2f64_1z:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v2f64_1z:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2f64_1z:
-; AVX2: # BB#0:
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2f64_1z:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
ret <2 x double> %shuffle
@@ -830,28 +830,28 @@ define <2 x double> @shuffle_v2f64_1z(<2 x double> %a) {
define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
; SSE-LABEL: shuffle_v2f64_z0:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm1, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v2f64_z0:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2f64_z0:
-; AVX2: # BB#0:
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2f64_z0:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 0>
ret <2 x double> %shuffle
@@ -859,43 +859,43 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
; SSE2-LABEL: shuffle_v2f64_z1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorpd %xmm1, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2f64_z1:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorpd %xmm1, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2f64_z1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorpd %xmm1, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2f64_z1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2f64_z1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2f64_z1:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2f64_z1:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX512VL-NEXT: retq
@@ -905,27 +905,27 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
; SSE-LABEL: shuffle_v2f64_bitcast_1z:
-; SSE: # BB#0:
-; SSE-NEXT: xorpd %xmm1, %xmm1
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v2f64_bitcast_1z:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2f64_bitcast_1z:
-; AVX2: # BB#0:
-; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2f64_bitcast_1z:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX512VL-NEXT: retq
%shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
%bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float>
@@ -936,40 +936,40 @@ define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
; SSE2-LABEL: shuffle_v2i64_bitcast_z123:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v2i64_bitcast_z123:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v2i64_bitcast_z123:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v2i64_bitcast_z123:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v2i64_bitcast_z123:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v2i64_bitcast_z123:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX512VL-NEXT: retq
@@ -982,12 +982,12 @@ define <2 x i64> @shuffle_v2i64_bitcast_z123(<2 x i64> %x) {
define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
; SSE-LABEL: insert_reg_and_zero_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_and_zero_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq %rdi, %xmm0
; AVX-NEXT: retq
%v = insertelement <2 x i64> undef, i64 %a, i32 0
@@ -997,12 +997,12 @@ define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
; SSE-LABEL: insert_mem_and_zero_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_and_zero_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
%a = load i64, i64* %ptr
@@ -1013,12 +1013,12 @@ define <2 x i64> @insert_mem_and_zero_v2i64(i64* %ptr) {
define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
; SSE-LABEL: insert_reg_and_zero_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_and_zero_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1028,12 +1028,12 @@ define <2 x double> @insert_reg_and_zero_v2f64(double %a) {
define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
; SSE-LABEL: insert_mem_and_zero_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_and_zero_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
%a = load double, double* %ptr
@@ -1044,46 +1044,32 @@ define <2 x double> @insert_mem_and_zero_v2f64(double* %ptr) {
define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
; SSE2-LABEL: insert_reg_lo_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_lo_v2i64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movq %rdi, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_lo_v2i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_lo_v2i64:
-; SSE41: # BB#0:
-; SSE41-NEXT: movq %rdi, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41: # %bb.0:
+; SSE41-NEXT: pinsrq $0, %rdi, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: insert_reg_lo_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq %rdi, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_reg_lo_v2i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq %rdi, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_reg_lo_v2i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq %rdi, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_reg_lo_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
@@ -1091,43 +1077,29 @@ define <2 x i64> @insert_reg_lo_v2i64(i64 %a, <2 x i64> %b) {
define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
; SSE2-LABEL: insert_mem_lo_v2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_mem_lo_v2i64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_mem_lo_v2i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_mem_lo_v2i64:
-; SSE41: # BB#0:
-; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41: # %bb.0:
+; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: insert_mem_lo_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_lo_v2i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_lo_v2i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_lo_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
@@ -1135,16 +1107,32 @@ define <2 x i64> @insert_mem_lo_v2i64(i64* %ptr, <2 x i64> %b) {
}
define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
-; SSE-LABEL: insert_reg_hi_v2i64:
-; SSE: # BB#0:
-; SSE-NEXT: movq %rdi, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_reg_hi_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_hi_v2i64:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movq %rdi, %xmm1
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_hi_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %rdi, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_hi_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pinsrq $1, %rdi, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_reg_hi_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovq %rdi, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
@@ -1152,16 +1140,32 @@ define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
}
define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
-; SSE-LABEL: insert_mem_hi_v2i64:
-; SSE: # BB#0:
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_mem_hi_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_mem_hi_v2i64:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_mem_hi_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_mem_hi_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v2i64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
@@ -1171,40 +1175,40 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
; SSE2-LABEL: insert_reg_lo_v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_lo_v2f64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_lo_v2f64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_lo_v2f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_reg_lo_v2f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_reg_lo_v2f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_reg_lo_v2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1214,12 +1218,12 @@ define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
; SSE-LABEL: insert_mem_lo_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; AVX-NEXT: retq
%a = load double, double* %ptr
@@ -1230,14 +1234,14 @@ define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) {
define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
; SSE-LABEL: insert_reg_hi_v2f64:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_hi_v2f64:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
%shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32> <i32 2, i32 0>
@@ -1246,12 +1250,12 @@ define <2 x double> @insert_reg_hi_v2f64(double %a, <2 x double> %b) {
define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
; SSE-LABEL: insert_mem_hi_v2f64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
%a = load double, double* %ptr
@@ -1262,27 +1266,27 @@ define <2 x double> @insert_mem_hi_v2f64(double* %ptr, <2 x double> %b) {
define <2 x double> @insert_dup_reg_v2f64(double %a) {
; SSE2-LABEL: insert_dup_reg_v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_dup_reg_v2f64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_dup_reg_v2f64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_reg_v2f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_reg_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1292,28 +1296,28 @@ define <2 x double> @insert_dup_reg_v2f64(double %a) {
define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
; SSE2-LABEL: insert_dup_mem_v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_dup_mem_v2f64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v2f64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v2f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_mem_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
%a = load double, double* %ptr
@@ -1324,28 +1328,28 @@ define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
define <2 x double> @insert_dup_mem128_v2f64(<2 x double>* %ptr) nounwind {
; SSE2-LABEL: insert_dup_mem128_v2f64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_dup_mem128_v2f64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem128_v2f64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem128_v2f64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: insert_dup_mem128_v2f64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
%v = load <2 x double>, <2 x double>* %ptr
@@ -1356,24 +1360,24 @@ define <2 x double> @insert_dup_mem128_v2f64(<2 x double>* %ptr) nounwind {
define <2 x i64> @insert_dup_mem_v2i64(i64* %ptr) {
; SSE-LABEL: insert_dup_mem_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_mem_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX512VL-NEXT: retq
%tmp = load i64, i64* %ptr, align 1
@@ -1384,13 +1388,13 @@ define <2 x i64> @insert_dup_mem_v2i64(i64* %ptr) {
define <2 x double> @shuffle_mem_v2f64_10(<2 x double>* %ptr) {
; SSE-LABEL: shuffle_mem_v2f64_10:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movapd (%rdi), %xmm0
; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_mem_v2f64_10:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 7214b3ed326b..ba478e9e28d6 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -9,104 +9,104 @@
define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0001:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0001:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0020:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0020:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0112:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0112:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0300:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0300:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_1000:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_1000:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_2200:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_2200:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,0,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_3330:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_3330:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_3210:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_3210:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x i32> %shuffle
@@ -114,13 +114,13 @@ define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_2121:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_2121:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1>
ret <4 x i32> %shuffle
@@ -128,12 +128,12 @@ define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0001:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0001:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -141,12 +141,12 @@ define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0020:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0020:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -154,12 +154,12 @@ define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0300:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0300:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -167,12 +167,12 @@ define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_1000:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_1000:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -180,12 +180,12 @@ define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_2200:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_2200:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -193,12 +193,12 @@ define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_3330:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_3330:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -206,12 +206,12 @@ define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_3210:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_3210:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -219,12 +219,12 @@ define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0011:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0011:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
@@ -232,12 +232,12 @@ define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_2233:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_2233:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
@@ -245,27 +245,27 @@ define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0022:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0022:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0022:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0022:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0022:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -273,27 +273,27 @@ define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_1133:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_1133:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_1133:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_1133:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_1133:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -302,13 +302,13 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_0145:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0145:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shuffle
@@ -316,12 +316,12 @@ define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_6723:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_6723:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
@@ -330,78 +330,78 @@ define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0124:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0124:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0124:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0124:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_0124:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0124:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0142:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0142:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0142:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0142:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_0142:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0142:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
@@ -411,139 +411,139 @@ define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0412:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0412:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0412:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0412:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_0412:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0412:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,2]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_4012:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_4012:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_4012:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_4012:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_4012:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_4012:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,2]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2]
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0145:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0145:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0451:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0451:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0451:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0451:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_0451:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0451:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
@@ -553,53 +553,53 @@ define <4 x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) {
}
define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_4501:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_4501:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x i32> %shuffle
}
define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_4015:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_4015:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_4015:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_4015:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_4015:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_4015:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -610,40 +610,40 @@ define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_4zzz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_4zzz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_4zzz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_4zzz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4f32_4zzz:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f32_4zzz:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT: retq
@@ -653,33 +653,33 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_z4zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_z4zz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_z4zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_z4zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_z4zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
@@ -688,7 +688,7 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zz4z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
@@ -696,7 +696,7 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_zz4z:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
@@ -704,7 +704,7 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_zz4z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
@@ -712,12 +712,12 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_zz4z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_zz4z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
@@ -726,33 +726,33 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zuu4:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_zuu4:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_zuu4:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_zuu4:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_zuu4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
@@ -761,7 +761,7 @@ define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_zzz7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -769,7 +769,7 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_zzz7:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -777,7 +777,7 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_zzz7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -785,13 +785,13 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_zzz7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_zzz7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT: retq
@@ -801,33 +801,33 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_z6zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_z6zz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_z6zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_z6zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_z6zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
@@ -836,7 +836,7 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_0z23:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
@@ -844,7 +844,7 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0z23:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
@@ -852,7 +852,7 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0z23:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
@@ -860,13 +860,13 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0z23:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z23:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT: retq
@@ -876,34 +876,34 @@ define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_01z3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_01z3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_01z3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_01z3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_01z3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
@@ -913,34 +913,34 @@ define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_012z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_012z:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_012z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_012z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_012z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT: retq
@@ -950,34 +950,34 @@ define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
; SSE2-LABEL: shuffle_v4f32_0zz3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0zz3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0zz3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0zz3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT: retq
@@ -987,34 +987,34 @@ define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
; SSE2-LABEL: shuffle_v4f32_0z2z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0z2z:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0z2z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0z2z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z2z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
@@ -1024,13 +1024,13 @@ define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_u051:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_u051:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1>
@@ -1039,7 +1039,7 @@ define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0zz4:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
@@ -1048,7 +1048,7 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0zz4:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm2, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
@@ -1057,7 +1057,7 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0zz4:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm2, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
@@ -1066,12 +1066,12 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0zz4:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0>
@@ -1081,7 +1081,7 @@ define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0zz6:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
@@ -1090,7 +1090,7 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0zz6:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
@@ -1099,7 +1099,7 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0zz6:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3]
@@ -1108,12 +1108,12 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0zz6:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0zz6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
@@ -1123,7 +1123,7 @@ define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: shuffle_v4f32_0z24:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: xorps %xmm1, %xmm1
@@ -1133,7 +1133,7 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0z24:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT: xorps %xmm1, %xmm1
@@ -1143,7 +1143,7 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0z24:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: xorps %xmm1, %xmm1
@@ -1153,12 +1153,12 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4f32_0z24:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_0z24:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
@@ -1168,40 +1168,40 @@ define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_4zzz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_4zzz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_4zzz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_4zzz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4i32_4zzz:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_4zzz:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT: retq
@@ -1211,45 +1211,45 @@ define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_z4zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_z4zz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_z4zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_z4zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4i32_z4zz:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_z4zz:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0>
ret <4 x i32> %shuffle
@@ -1257,45 +1257,45 @@ define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_zz4z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_zz4z:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_zz4z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_zz4z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4i32_zz4z:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i32_zz4z:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0>
ret <4 x i32> %shuffle
@@ -1303,12 +1303,12 @@ define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
; SSE-LABEL: shuffle_v4i32_zuu4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_zuu4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4>
@@ -1317,45 +1317,45 @@ define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_z6zz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_z6zz:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_z6zz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_z6zz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_z6zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_z6zz:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
ret <4 x i32> %shuffle
@@ -1363,31 +1363,31 @@ define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_7012:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_7012:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_7012:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_7012:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_7012:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2>
@@ -1396,29 +1396,29 @@ define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_6701:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_6701:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_6701:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_6701:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_6701:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -1427,31 +1427,31 @@ define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_5670:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_5670:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_5670:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_5670:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_5670:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0>
@@ -1460,31 +1460,31 @@ define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_1234:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_1234:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_1234:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_1234:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_1234:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -1493,29 +1493,29 @@ define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_2345:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_2345:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_2345:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_2345:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_2345:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1524,14 +1524,14 @@ define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_40u1:
-; SSE: # BB#0:
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_40u1:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1>
ret <4 x i32> %shuffle
@@ -1539,31 +1539,31 @@ define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_3456:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_3456:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_3456:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_3456:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_3456:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
@@ -1572,27 +1572,27 @@ define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: shuffle_v4i32_0u1u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0u1u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0u1u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0u1u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0u1u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
@@ -1601,30 +1601,30 @@ define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_0z1z:
-; SSE2: # BB#0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0z1z:
-; SSE3: # BB#0:
-; SSE3-NEXT: pxor %xmm1, %xmm1
-; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE3: # %bb.0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0z1z:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0z1z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_0z1z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
@@ -1633,12 +1633,12 @@ define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
; SSE-LABEL: shuffle_v4i32_01zu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_01zu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef>
@@ -1647,36 +1647,36 @@ define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_0z23:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0z23:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0z23:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0z23:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_0z23:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0z23:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
ret <4 x i32> %shuffle
@@ -1684,36 +1684,36 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_01z3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_01z3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_01z3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_01z3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_01z3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_01z3:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
ret <4 x i32> %shuffle
@@ -1721,36 +1721,36 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_012z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_012z:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_012z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_012z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_012z:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_012z:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
ret <4 x i32> %shuffle
@@ -1758,36 +1758,36 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_0zz3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4i32_0zz3:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4i32_0zz3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v4i32_0zz3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v4i32_0zz3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v4i32_0zz3:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
ret <4 x i32> %shuffle
@@ -1795,13 +1795,13 @@ define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_bitcast_0415:
-; SSE: # BB#0:
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE: # %bb.0:
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_bitcast_0415:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: retq
%shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
%bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
@@ -1812,20 +1812,20 @@ define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4f32_bitcast_4401:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512VL-NEXT: retq
%1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%2 = bitcast <4 x i32> %1 to <2 x double>
@@ -1837,12 +1837,12 @@ define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4f32_bitcast_0045:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_bitcast_0045:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
@@ -1853,33 +1853,33 @@ define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: mask_v4f32_4127:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: mask_v4f32_4127:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: mask_v4f32_4127:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_v4f32_4127:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: mask_v4f32_4127:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX-NEXT: retq
%1 = bitcast <4 x float> %a to <4 x i32>
@@ -1893,33 +1893,33 @@ define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: mask_v4f32_0127:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: mask_v4f32_0127:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: mask_v4f32_0127:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_v4f32_0127:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: mask_v4f32_0127:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT: retq
%1 = bitcast <4 x float> %a to <2 x i64>
@@ -1933,39 +1933,39 @@ define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: mask_v4i32_0127:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: mask_v4i32_0127:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: mask_v4i32_0127:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_v4i32_0127:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: mask_v4i32_0127:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: mask_v4i32_0127:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX2OR512VL-NEXT: retq
%1 = bitcast <4 x i32> %a to <2 x i64>
%2 = bitcast <4 x i32> %b to <2 x i64>
@@ -1978,28 +1978,28 @@ define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
%1 = load <2 x float>, <2 x float>* %x, align 1
@@ -2009,12 +2009,12 @@ define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
; SSE-LABEL: insert_reg_and_zero_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_and_zero_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovd %edi, %xmm0
; AVX-NEXT: retq
%v = insertelement <4 x i32> undef, i32 %a, i32 0
@@ -2024,12 +2024,12 @@ define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
; SSE-LABEL: insert_mem_and_zero_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_and_zero_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
%a = load i32, i32* %ptr
@@ -2040,40 +2040,40 @@ define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
; SSE2-LABEL: insert_reg_and_zero_v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_and_zero_v4f32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT: movaps %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_and_zero_v4f32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_and_zero_v4f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: insert_reg_and_zero_v4f32:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: insert_reg_and_zero_v4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512VL-NEXT: retq
@@ -2084,12 +2084,12 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
; SSE-LABEL: insert_mem_and_zero_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_and_zero_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
%a = load float, float* %ptr
@@ -2100,37 +2100,37 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
; SSE2-LABEL: insert_reg_lo_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movq %rdi, %xmm1
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_lo_v4i32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movq %rdi, %xmm1
; SSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_lo_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movq %rdi, %xmm1
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_lo_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movq %rdi, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_reg_lo_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovq %rdi, %xmm1
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2OR512VL-NEXT: retq
@@ -2142,43 +2142,37 @@ define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
; SSE2-LABEL: insert_mem_lo_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_mem_lo_v4i32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_mem_lo_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_mem_lo_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_mem_lo_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: insert_mem_lo_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_lo_v4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: insert_mem_lo_v4i32:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2OR512VL-NEXT: retq
%a = load <2 x i32>, <2 x i32>* %ptr
%v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -2187,13 +2181,13 @@ define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
; SSE-LABEL: insert_reg_hi_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_hi_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq %rdi, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
@@ -2205,22 +2199,16 @@ define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
; SSE-LABEL: insert_mem_hi_v4i32:
-; SSE: # BB#0:
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: insert_mem_hi_v4i32:
-; AVX1OR2: # BB#0:
-; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_hi_v4i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: retq
%a = load <2 x i32>, <2 x i32>* %ptr
%v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
%shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
@@ -2229,35 +2217,35 @@ define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
; SSE2-LABEL: insert_reg_lo_v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_reg_lo_v4f32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE3-NEXT: movapd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_reg_lo_v4f32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_reg_lo_v4f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: insert_reg_lo_v4f32:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: insert_reg_lo_v4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: retq
%a.cast = bitcast double %a to <2 x float>
@@ -2268,12 +2256,12 @@ define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_lo_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_lo_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovlpd {{.*#+}} xmm0 = mem[0],xmm0[1]
; AVX-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
@@ -2284,14 +2272,14 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
; SSE-LABEL: insert_reg_hi_v4f32:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: insert_reg_hi_v4f32:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
%a.cast = bitcast double %a to <2 x float>
%v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -2301,12 +2289,12 @@ define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_hi_v4f32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
@@ -2317,13 +2305,13 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
; SSE-LABEL: shuffle_mem_v4f32_3210:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_mem_v4f32_3210:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
; AVX-NEXT: retq
%a = load <4 x float>, <4 x float>* %ptr
@@ -2333,13 +2321,13 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
; SSE-LABEL: insert_dup_mem_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: insert_dup_mem_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss (%rdi), %xmm0
; AVX-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2354,12 +2342,12 @@ define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
; SSE-LABEL: shuffle_v4i32_z0zX:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_z0zX:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
@@ -2368,12 +2356,12 @@ define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
; SSE-LABEL: shuffle_v4i32_1z3z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4i32_1z3z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 1cf8453fc6ad..60bc36948d23 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -8,50 +8,50 @@
define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_01012323:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_01012323:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_67452301(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_67452301:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_67452301:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_456789AB:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_456789AB:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_456789AB:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_456789AB:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -60,19 +60,19 @@ define <8 x i16> @shuffle_v8i16_456789AB(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00000000:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_00000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_00000000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -80,13 +80,13 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_00004444:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_00004444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX-NEXT: retq
@@ -95,12 +95,12 @@ define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_u0u1u2u3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u0u1u2u3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3>
@@ -108,12 +108,12 @@ define <8 x i16> @shuffle_v8i16_u0u1u2u3(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_u4u5u6u7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u4u5u6u7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 7>
@@ -121,13 +121,13 @@ define <8 x i16> @shuffle_v8i16_u4u5u6u7(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_31206745:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_31206745:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX-NEXT: retq
@@ -136,24 +136,24 @@ define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_44440000:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_44440000:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_44440000:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_44440000:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -161,26 +161,26 @@ define <8 x i16> @shuffle_v8i16_44440000(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_23016745(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_23016745:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23016745:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
ret <8 x i16> %shuffle
}
define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_23026745:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23026745:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
; AVX-NEXT: retq
@@ -189,13 +189,13 @@ define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_23016747:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23016747:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
; AVX-NEXT: retq
@@ -204,24 +204,24 @@ define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_75643120:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_75643120:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_75643120:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_75643120:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 0>
@@ -230,24 +230,24 @@ define <8 x i16> @shuffle_v8i16_75643120(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_10545410:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_10545410:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_10545410:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_10545410:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 0>
@@ -255,24 +255,24 @@ define <8 x i16> @shuffle_v8i16_10545410(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_54105410:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_54105410:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_54105410:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_54105410:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 0>
@@ -280,24 +280,24 @@ define <8 x i16> @shuffle_v8i16_54105410(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_54101054:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_54101054:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_54101054:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_54101054:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 4>
@@ -305,24 +305,24 @@ define <8 x i16> @shuffle_v8i16_54101054(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_04400440:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,4,6]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_04400440:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_04400440:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04400440:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 0>
@@ -330,24 +330,24 @@ define <8 x i16> @shuffle_v8i16_04400440(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_40044004:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,0,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_40044004:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_40044004:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_40044004:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 4>
@@ -356,7 +356,7 @@ define <8 x i16> @shuffle_v8i16_40044004(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_26405173:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
@@ -365,17 +365,17 @@ define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_26405173:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_26405173:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_26405173:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 3>
@@ -383,7 +383,7 @@ define <8 x i16> @shuffle_v8i16_26405173(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_20645173:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
@@ -392,17 +392,17 @@ define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_20645173:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_20645173:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_20645173:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 3>
@@ -410,7 +410,7 @@ define <8 x i16> @shuffle_v8i16_20645173(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_26401375:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
@@ -418,17 +418,17 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_26401375:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_26401375:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_26401375:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
@@ -437,7 +437,7 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_66751643:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,0]
@@ -446,17 +446,17 @@ define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_66751643:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_66751643:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_66751643:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 3>
@@ -465,7 +465,7 @@ define <8 x i16> @shuffle_v8i16_66751643(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_60514754:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
@@ -473,17 +473,17 @@ define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_60514754:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_60514754:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_60514754:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,0,1,10,11,2,3,8,9,14,15,10,11,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 6, i32 0, i32 5, i32 1, i32 4, i32 7, i32 5, i32 4>
@@ -492,24 +492,24 @@ define <8 x i16> @shuffle_v8i16_60514754(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_00444444:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_00444444:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_00444444:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_00444444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -517,24 +517,24 @@ define <8 x i16> @shuffle_v8i16_00444444(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_44004444:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,0,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_44004444:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_44004444:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_44004444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -542,24 +542,24 @@ define <8 x i16> @shuffle_v8i16_44004444(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_04404444:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_04404444:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_04404444:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04404444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -567,24 +567,24 @@ define <8 x i16> @shuffle_v8i16_04404444(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_04400000:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_04400000:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_04400000:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04400000:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -592,13 +592,13 @@ define <8 x i16> @shuffle_v8i16_04400000(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_04404567:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_04404567:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; AVX-NEXT: retq
@@ -608,24 +608,24 @@ define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_0X444444:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,2,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0X444444:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0X444444:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0X444444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
@@ -633,24 +633,24 @@ define <8 x i16> @shuffle_v8i16_0X444444(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_44X04444:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_44X04444:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_44X04444:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_44X04444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -658,24 +658,24 @@ define <8 x i16> @shuffle_v8i16_44X04444(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_X4404444:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_X4404444:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_X4404444:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_X4404444:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -684,29 +684,29 @@ define <8 x i16> @shuffle_v8i16_X4404444(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_0127XXXX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0127XXXX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0127XXXX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_0127XXXX:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_0127XXXX:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -717,29 +717,29 @@ define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XXXX4563:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XXXX4563:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XXXX4563:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_XXXX4563:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_XXXX4563:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
@@ -750,29 +750,29 @@ define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_4563XXXX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_4563XXXX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_4563XXXX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_4563XXXX:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_4563XXXX:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
@@ -783,29 +783,29 @@ define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_01274563:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_01274563:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_01274563:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_01274563:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_01274563:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
@@ -816,29 +816,29 @@ define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_45630127:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_45630127:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_45630127:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; SSE41-NEXT: retq
;
; AVX1OR2-LABEL: shuffle_v8i16_45630127:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i16_45630127:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
@@ -849,7 +849,7 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_37102735:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,4]
@@ -859,17 +859,17 @@ define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_37102735:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_37102735:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_37102735:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 5>
@@ -878,12 +878,12 @@ define <8 x i16> @shuffle_v8i16_37102735(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_08192a3b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_08192a3b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
@@ -892,13 +892,13 @@ define <8 x i16> @shuffle_v8i16_08192a3b(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0c1d2e3f:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
@@ -908,12 +908,12 @@ define <8 x i16> @shuffle_v8i16_0c1d2e3f(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_4c5d6e7f:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_4c5d6e7f:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
@@ -922,13 +922,13 @@ define <8 x i16> @shuffle_v8i16_4c5d6e7f(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_48596a7b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_48596a7b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
@@ -938,14 +938,14 @@ define <8 x i16> @shuffle_v8i16_48596a7b(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_08196e7f:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_08196e7f:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -956,14 +956,14 @@ define <8 x i16> @shuffle_v8i16_08196e7f(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0c1d6879:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0c1d6879:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -974,14 +974,14 @@ define <8 x i16> @shuffle_v8i16_0c1d6879(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_109832ba:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_109832ba:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
@@ -992,13 +992,13 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_8091a2b3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_8091a2b3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3>
@@ -1006,13 +1006,13 @@ define <8 x i16> @shuffle_v8i16_8091a2b3(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_c4d5e6f7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_c4d5e6f7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
@@ -1021,7 +1021,7 @@ define <8 x i16> @shuffle_v8i16_c4d5e6f7(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_0213cedf:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
@@ -1029,7 +1029,7 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0213cedf:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
@@ -1041,7 +1041,7 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_443aXXXX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
@@ -1051,21 +1051,21 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_443aXXXX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[4,5,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_443aXXXX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_443aXXXX:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
@@ -1076,7 +1076,7 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_032dXXXX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
@@ -1085,26 +1085,26 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_032dXXXX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_032dXXXX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_032dXXXX:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_032dXXXX:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3]
; AVX2OR512VL-NEXT: retq
@@ -1113,13 +1113,13 @@ define <8 x i16> @shuffle_v8i16_032dXXXX(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_XXXdXXXX:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_XXXdXXXX:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i16> %shuffle
@@ -1127,7 +1127,7 @@ define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_012dXXXX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
@@ -1136,20 +1136,20 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_012dXXXX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_012dXXXX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_012dXXXX:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT: retq
@@ -1159,7 +1159,7 @@ define <8 x i16> @shuffle_v8i16_012dXXXX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XXXXcde3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
@@ -1169,26 +1169,26 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XXXXcde3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm0[6,7]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,8,9,10,11,12,13],zero,zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XXXXcde3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; AVX2OR512VL-NEXT: retq
@@ -1198,7 +1198,7 @@ define <8 x i16> @shuffle_v8i16_XXXXcde3(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_cde3XXXX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1208,20 +1208,20 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_cde3XXXX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[6,7,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_cde3XXXX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
; AVX-NEXT: retq
@@ -1231,7 +1231,7 @@ define <8 x i16> @shuffle_v8i16_cde3XXXX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_012dcde3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
@@ -1243,26 +1243,26 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_012dcde3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[10,11,8,9,10,11,12,13],zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[6,7]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_012dcde3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_012dcde3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_012dcde3:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
; AVX2OR512VL-NEXT: retq
@@ -1272,7 +1272,7 @@ define <8 x i16> @shuffle_v8i16_012dcde3(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_0923cde7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,0,0,0,65535]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
@@ -1280,7 +1280,7 @@ define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0923cde7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,65535,0,0,0,65535]
; SSSE3-NEXT: andps %xmm2, %xmm0
; SSSE3-NEXT: andnps %xmm1, %xmm2
@@ -1288,12 +1288,12 @@ define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0923cde7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0923cde7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6],xmm0[7]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 9, i32 2, i32 3, i32 12, i32 13, i32 14, i32 7>
@@ -1302,7 +1302,7 @@ define <8 x i16> @shuffle_v8i16_0923cde7(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XXX1X579:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,2,0]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
@@ -1314,14 +1314,14 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XXX1X579:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,xmm1[u,u],zero,zero,zero,zero,xmm1[2,3]
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,u,u,10,11,14,15],zero,zero
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XXX1X579:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; SSE41-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
@@ -1329,7 +1329,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XXX1X579:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
@@ -1337,7 +1337,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XXX1X579:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
@@ -1349,7 +1349,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_XX4X8acX:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
@@ -1358,28 +1358,28 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_XX4X8acX:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,u,u],zero,zero,zero,zero,zero,zero,xmm0[u,u]
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[u,u,0,1,4,5,8,9,u,u]
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_XX4X8acX:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v8i16_XX4X8acX:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i16_XX4X8acX:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -1390,13 +1390,13 @@ define <8 x i16> @shuffle_v8i16_XX4X8acX(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_8zzzzzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movzwl %di, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_8zzzzzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movzwl %di, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
@@ -1407,13 +1407,13 @@ define <8 x i16> @shuffle_v8i16_8zzzzzzz(i16 %i) {
define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_z8zzzzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $1, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z8zzzzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1424,13 +1424,13 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) {
define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zzzzz8zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $5, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzzzz8zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1441,13 +1441,13 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) {
define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zuuzuuz8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $7, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zuuzuuz8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1458,13 +1458,13 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) {
define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
; SSE-LABEL: shuffle_v8i16_zzBzzzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: pinsrw $2, %edi, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzBzzzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -1475,24 +1475,24 @@ define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) {
define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_def01234:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_def01234:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_def01234:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_def01234:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4>
@@ -1501,24 +1501,24 @@ define <8 x i16> @shuffle_v8i16_def01234(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_ueuu123u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_ueuu123u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_ueuu123u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_ueuu123u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
@@ -1527,7 +1527,7 @@ define <8 x i16> @shuffle_v8i16_ueuu123u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_56701234:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
@@ -1535,17 +1535,17 @@ define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_56701234:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_56701234:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_56701234:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4>
@@ -1554,7 +1554,7 @@ define <8 x i16> @shuffle_v8i16_56701234(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u6uu123u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
@@ -1562,17 +1562,17 @@ define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u6uu123u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u6uu123u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u6uu123u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
@@ -1581,12 +1581,12 @@ define <8 x i16> @shuffle_v8i16_u6uu123u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_uuuu123u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_uuuu123u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef>
@@ -1595,24 +1595,24 @@ define <8 x i16> @shuffle_v8i16_uuuu123u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_bcdef012:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_bcdef012:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_bcdef012:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_bcdef012:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2>
@@ -1621,24 +1621,24 @@ define <8 x i16> @shuffle_v8i16_bcdef012(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_ucdeuu1u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_ucdeuu1u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_ucdeuu1u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_ucdeuu1u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 1, i32 undef>
@@ -1647,7 +1647,7 @@ define <8 x i16> @shuffle_v8i16_ucdeuu1u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_34567012:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
@@ -1655,17 +1655,17 @@ define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_34567012:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_34567012:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_34567012:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2>
@@ -1674,7 +1674,7 @@ define <8 x i16> @shuffle_v8i16_34567012(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u456uu1u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
@@ -1682,17 +1682,17 @@ define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u456uu1u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u456uu1u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u456uu1u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef>
@@ -1701,12 +1701,12 @@ define <8 x i16> @shuffle_v8i16_u456uu1u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_u456uuuu:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u456uuuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -1715,26 +1715,26 @@ define <8 x i16> @shuffle_v8i16_u456uuuu(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_3456789a:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_3456789a:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_3456789a:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_3456789a:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
@@ -1743,26 +1743,26 @@ define <8 x i16> @shuffle_v8i16_3456789a(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u456uu9u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u456uu9u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u456uu9u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u456uu9u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 9, i32 undef>
@@ -1771,26 +1771,26 @@ define <8 x i16> @shuffle_v8i16_u456uu9u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_56789abc:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_56789abc:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_56789abc:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_56789abc:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
@@ -1799,26 +1799,26 @@ define <8 x i16> @shuffle_v8i16_56789abc(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: shuffle_v8i16_u6uu9abu:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_u6uu9abu:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_u6uu9abu:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_u6uu9abu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
@@ -1827,24 +1827,24 @@ define <8 x i16> @shuffle_v8i16_u6uu9abu(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0uuu1uuu:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0uuu1uuu:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0uuu1uuu:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0uuu1uuu:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
@@ -1853,26 +1853,26 @@ define <8 x i16> @shuffle_v8i16_0uuu1uuu(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0zzz1zzz:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0zzz1zzz:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0zzz1zzz:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0zzz1zzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
@@ -1881,22 +1881,22 @@ define <8 x i16> @shuffle_v8i16_0zzz1zzz(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0u1u2u3u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0u1u2u3u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0u1u2u3u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0u1u2u3u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
@@ -1905,41 +1905,119 @@ define <8 x i16> @shuffle_v8i16_0u1u2u3u(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_0z1z2z3z(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0z1z2z3z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0z1z2z3z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0z1z2z3z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0z1z2z3z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x i16> %shuffle
}
+define <8 x i16> @shuffle_v8i16_01100110(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_01100110:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_01100110:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_01100110:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_01100110:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,0,1,2,3,2,3,0,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 1, i32 0>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_01u0u110(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_01u0u110:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_01u0u110:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_01u0u110:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_01u0u110:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,0,1,0,1,2,3,2,3,0,1]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 undef, i32 0, i32 undef, i32 1, i32 1, i32 0>
+ ret <8 x i16> %shuffle
+}
+
+define <8 x i16> @shuffle_v8i16_467uu675(<8 x i16> %a) {
+; SSE2-LABEL: shuffle_v8i16_467uu675:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v8i16_467uu675:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v8i16_467uu675:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: shuffle_v8i16_467uu675:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
+; AVX-NEXT: retq
+ %shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5>
+ ret <8 x i16> %shuffle
+}
+
;
; Shuffle to logical bit shifts
;
define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_z0z2z4z6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z0z2z4z6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6>
@@ -1948,12 +2026,12 @@ define <8 x i16> @shuffle_v8i16_z0z2z4z6(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_zzz0zzz4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $48, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zzz0zzz4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $48, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 8, i32 0, i32 8, i32 8, i32 8, i32 4>
@@ -1962,12 +2040,12 @@ define <8 x i16> @shuffle_v8i16_zzz0zzz4(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_zz01zX4X:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_zz01zX4X:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 8, i32 0, i32 1, i32 8, i32 undef, i32 4, i32 undef>
@@ -1976,12 +2054,12 @@ define <8 x i16> @shuffle_v8i16_zz01zX4X(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_z0X2z456:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psllq $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_z0X2z456:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsllq $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 8, i32 0, i32 undef, i32 2, i32 8, i32 4, i32 5, i32 6>
@@ -1990,12 +2068,12 @@ define <8 x i16> @shuffle_v8i16_z0X2z456(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_1z3zXz7z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_1z3zXz7z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 8, i32 undef, i32 8, i32 7, i32 8>
@@ -2004,12 +2082,12 @@ define <8 x i16> @shuffle_v8i16_1z3zXz7z(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_1X3z567z:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $16, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_1X3z567z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $16, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 1, i32 undef, i32 3, i32 8, i32 5, i32 6, i32 7, i32 8>
@@ -2018,12 +2096,12 @@ define <8 x i16> @shuffle_v8i16_1X3z567z(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_23zz67zz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_23zz67zz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 2, i32 3, i32 8, i32 8, i32 6, i32 7, i32 8, i32 8>
@@ -2032,12 +2110,12 @@ define <8 x i16> @shuffle_v8i16_23zz67zz(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_3zXXXzzz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_3zXXXzzz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32><i32 3, i32 8, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8>
@@ -2046,12 +2124,12 @@ define <8 x i16> @shuffle_v8i16_3zXXXzzz(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_01u3zzuz(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_01u3zzuz:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_01u3zzuz:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 undef, i32 3, i32 8, i32 8, i32 undef, i32 8>
@@ -2060,23 +2138,23 @@ define <8 x i16> @shuffle_v8i16_01u3zzuz(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0z234567:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0z234567:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0z234567:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0z234567:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: retq
@@ -2086,23 +2164,23 @@ define <8 x i16> @shuffle_v8i16_0z234567(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0zzzz5z7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0zzzz5z7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0zzzz5z7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0zzzz5z7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4],xmm0[5],xmm1[6],xmm0[7]
; AVX-NEXT: retq
@@ -2112,23 +2190,23 @@ define <8 x i16> @shuffle_v8i16_0zzzz5z7(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) {
; SSE2-LABEL: shuffle_v8i16_0123456z:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuffle_v8i16_0123456z:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v8i16_0123456z:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_0123456z:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6],xmm1[7]
; AVX-NEXT: retq
@@ -2138,7 +2216,7 @@ define <8 x i16> @shuffle_v8i16_0123456z(<8 x i16> %a) {
define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: shuffle_v8i16_fu3ucc5u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2146,7 +2224,7 @@ define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_fu3ucc5u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,4,4]
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2157,12 +2235,12 @@ define <8 x i16> @shuffle_v8i16_fu3ucc5u(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
; SSE-LABEL: shuffle_v8i16_8012345u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v8i16_8012345u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef>
@@ -2172,32 +2250,32 @@ define <8 x i16> @shuffle_v8i16_8012345u(<8 x i16> %a) {
define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: mask_v8i16_012345ef:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: mask_v8i16_012345ef:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_v8i16_012345ef:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: mask_v8i16_012345ef:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: mask_v8i16_012345ef:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX2OR512VL-NEXT: retq
%1 = bitcast <8 x i16> %a to <2 x i64>
%2 = bitcast <8 x i16> %b to <2 x i64>
@@ -2210,21 +2288,21 @@ define <8 x i16> @mask_v8i16_012345ef(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
; SSE-LABEL: insert_dup_mem_v8i16_i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2236,7 +2314,7 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_mem_v8i16_sext_i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -2244,35 +2322,35 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_mem_v8i16_sext_i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT: retq
@@ -2286,21 +2364,21 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2312,7 +2390,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
@@ -2320,25 +2398,25 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt3_mem_v8i16_i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %xmm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2350,7 +2428,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(i32* %ptr) {
define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
@@ -2358,28 +2436,28 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
@@ -2387,7 +2465,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_sext_i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
@@ -2402,7 +2480,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(i16* %ptr) {
define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movswl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
@@ -2411,28 +2489,28 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movswl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movswl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: shrl $16, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
@@ -2440,7 +2518,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_sext_i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: shrl $16, %eax
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index ba7c0894b932..11f25a2d687d 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -5,14 +5,14 @@
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -21,7 +21,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
@@ -29,7 +29,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -39,7 +39,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
@@ -47,7 +47,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -57,7 +57,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
@@ -65,7 +65,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -75,7 +75,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
@@ -83,7 +83,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -93,7 +93,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
@@ -101,7 +101,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -111,7 +111,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
@@ -119,7 +119,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -129,7 +129,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
@@ -137,7 +137,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -147,7 +147,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
@@ -157,9 +157,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -167,8 +167,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -177,7 +177,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1]
@@ -187,16 +187,16 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 9, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -205,7 +205,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
@@ -215,15 +215,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -232,7 +232,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
@@ -242,15 +242,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -259,7 +259,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
@@ -269,14 +269,14 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 12, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -285,7 +285,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
@@ -295,14 +295,14 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -311,7 +311,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
@@ -321,14 +321,14 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -337,7 +337,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
@@ -347,13 +347,13 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movl $15, %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
@@ -364,7 +364,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -374,7 +374,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2OR512VL-NEXT: retq
@@ -384,7 +384,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -394,7 +394,7 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2OR512VL-NEXT: retq
@@ -404,7 +404,7 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -414,7 +414,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2OR512VL-NEXT: retq
@@ -424,7 +424,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1
define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -434,7 +434,7 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
; AVX2OR512VL-NEXT: retq
@@ -444,7 +444,7 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1
define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -454,7 +454,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14]
; AVX2OR512VL-NEXT: retq
@@ -464,7 +464,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1
define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,7,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -474,7 +474,7 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15]
; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15]
; AVX2OR512VL-NEXT: retq
@@ -484,161 +484,119 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -646,7 +604,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -655,7 +613,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_3
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -663,7 +621,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
@@ -671,28 +629,23 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_1
}
define <16 x i16> @shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
-; AVX1-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
-; AVX1: # BB#0:
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v16i16_00_01_18_19_04_05_22_23_08_09_26_27_12_13_30_31:
+; ALL: # %bb.0:
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_15:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
@@ -700,7 +653,7 @@ define <16 x i16> @shuffle_v16i16_16_17_18_19_04_05_06_07_24_25_26_27_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -708,13 +661,13 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
@@ -725,7 +678,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -733,13 +686,13 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
@@ -750,7 +703,7 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,0,65535,0,65535,0,0,65535,0,65535,0,65535,0,65535]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -758,13 +711,13 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
@@ -775,7 +728,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,0,65535,0,65535,0,65535,65535,0,65535,0,65535,0,65535,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -783,13 +736,13 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
@@ -799,29 +752,24 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3
}
define <16 x i16> @shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31(<16 x i16> %a, <16 x i16> %b) {
-; AVX1-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
-; AVX1: # BB#0:
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v16i16_00_01_18_19_20_21_06_07_08_09_26_27_12_13_30_31:
+; ALL: # %bb.0:
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7]
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 6, i32 7, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -831,7 +779,7 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_00_16_00_16_00_16_00_1
define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -842,7 +790,7 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
@@ -850,8 +798,8 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,0,16,0,16,0,16,8,24,8,24,8,24,8,24]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
@@ -860,7 +808,7 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2
define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
@@ -871,14 +819,14 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,20,21,22,23,8,8,8,8,28,29,30,31]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -888,7 +836,7 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
@@ -903,15 +851,15 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,23,22,21,20,11,10,9,8,31,30,29,28]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -921,7 +869,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -933,7 +881,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12]
@@ -941,8 +889,8 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,19,18,17,16,11,10,9,8,27,26,25,24]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -952,7 +900,7 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -961,7 +909,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
@@ -970,7 +918,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -979,7 +927,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
@@ -988,7 +936,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -997,7 +945,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
@@ -1006,7 +954,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1015,7 +963,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
@@ -1024,7 +972,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1033,7 +981,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -1042,7 +990,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1051,7 +999,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -1060,7 +1008,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1069,7 +1017,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -1078,7 +1026,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -1087,7 +1035,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_2
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
@@ -1096,7 +1044,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_2
define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -1105,7 +1053,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_3
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1114,7 +1062,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_3
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -1123,15 +1071,15 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -1140,7 +1088,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_3
define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -1149,15 +1097,15 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
@@ -1166,7 +1114,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_2
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
@@ -1174,7 +1122,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -1183,7 +1131,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
@@ -1191,7 +1139,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -1200,7 +1148,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
@@ -1208,7 +1156,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
@@ -1217,7 +1165,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
@@ -1225,7 +1173,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
@@ -1234,7 +1182,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_0
define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
@@ -1242,7 +1190,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
@@ -1251,7 +1199,7 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_0
define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
@@ -1259,7 +1207,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
@@ -1268,7 +1216,7 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_0
define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
@@ -1276,7 +1224,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
@@ -1285,7 +1233,7 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_1
define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1294,7 +1242,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
@@ -1303,7 +1251,7 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_0
define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1312,7 +1260,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
@@ -1321,7 +1269,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_1
define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1330,7 +1278,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
@@ -1339,7 +1287,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0
define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
@@ -1347,7 +1295,7 @@ define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
@@ -1356,7 +1304,7 @@ define <16 x i16> @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_1
define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,2,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,6]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1365,7 +1313,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
@@ -1374,7 +1322,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_0
define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1384,7 +1332,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
@@ -1393,7 +1341,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
@@ -1402,15 +1350,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,16,16,16,16,20,20,20,20]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
@@ -1419,7 +1367,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2
define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
@@ -1429,15 +1377,15 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20>
@@ -1446,7 +1394,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2
define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
@@ -1457,15 +1405,15 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,12,12,12,12,24,24,24,24,28,28,28,28]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
@@ -1474,7 +1422,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -1484,15 +1432,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,4,4,4,4,24,24,24,24,28,28,28,28]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
@@ -1501,22 +1449,22 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_23:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -1525,7 +1473,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_19_04_20_05_21_06_22_07_2
define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1]
@@ -1533,7 +1481,7 @@ define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_2
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_24:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24>
@@ -1542,7 +1490,7 @@ define <16 x i16> @shuffle_v16i16_zz_zz_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_2
define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
@@ -1550,7 +1498,7 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_z
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0>
@@ -1559,7 +1507,7 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_zz_25_26_27_28_29_30_31_z
define <16 x i16> @shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1568,7 +1516,7 @@ define <16 x i16> @shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_13:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,14,15,2,3,4,5,14,15,0,1,8,9,10,11,28,29,30,31,18,19,20,21,30,31,16,17,24,25,26,27]
; AVX2OR512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 6, i32 7, i32 1, i32 2, i32 7, i32 0, i32 4, i32 5, i32 14, i32 15, i32 9, i32 10, i32 15, i32 8, i32 12, i32 13>
@@ -1581,7 +1529,7 @@ define <16 x i16> @shuffle_v16i16_06_07_01_02_07_00_04_05_14_15_09_10_15_08_12_1
define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
@@ -1589,7 +1537,7 @@ define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 0, i32 16, i32 2, i32 16, i32 4, i32 16, i32 6, i32 16, i32 8, i32 16, i32 10, i32 16, i32 12, i32 16, i32 14>
@@ -1598,7 +1546,7 @@ define <16 x i16> @shuffle_v16i16_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_1
define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
@@ -1606,7 +1554,7 @@ define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_12:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 0, i32 16, i32 16, i32 16, i32 4, i32 16, i32 16, i32 16, i32 8, i32 16, i32 16, i32 16, i32 12>
@@ -1615,7 +1563,7 @@ define <16 x i16> @shuffle_v16i16_zz_zz_zz_00_zz_zz_zz_04_zz_zz_zz_08_zz_zz_zz_1
define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -1623,7 +1571,7 @@ define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_z
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 3, i32 16, i32 5, i32 16, i32 7, i32 16, i32 9, i32 16, i32 11, i32 16, i32 13, i32 16, i32 15, i32 16>
@@ -1632,14 +1580,14 @@ define <16 x i16> @shuffle_v16i16_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_z
define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 2, i32 3, i32 16, i32 16, i32 6, i32 7, i32 16, i32 16, i32 10, i32 11, i32 16, i32 16, i32 14, i32 15, i32 16, i32 16>
@@ -1648,7 +1596,7 @@ define <16 x i16> @shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_z
define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -1656,7 +1604,7 @@ define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_z
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0>
@@ -1665,7 +1613,7 @@ define <16 x i16> @shuffle_v16i16_16_zz_zz_zz_17_zz_zz_zz_18_zz_zz_zz_19_zz_zz_z
define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1673,7 +1621,7 @@ define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_z
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
@@ -1682,7 +1630,7 @@ define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_z
define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
@@ -1692,21 +1640,21 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpermt2w %ymm0, %ymm2, %ymm1
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
@@ -1716,7 +1664,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z
define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
@@ -1725,7 +1673,7 @@ define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@@ -1734,7 +1682,7 @@ define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_1
define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
@@ -1743,7 +1691,7 @@ define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_2
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_24:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24>
@@ -1752,7 +1700,7 @@ define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_16_09_10_11_12_13_14_15_2
define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1]
@@ -1761,7 +1709,7 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 00, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 8>
@@ -1770,7 +1718,7 @@ define <16 x i16> @shuffle_v16i16_17_18_19_20_21_22_23_00_25_26_27_28_29_30_31_8
define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
@@ -1779,7 +1727,7 @@ define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_3
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm0[30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -1788,21 +1736,21 @@ define <16 x i16> @shuffle_v16i16_07_16_17_18_19_20_21_22_15_24_25_26_27_28_29_3
define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_16:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4,5,6,7,0,17,18,19,20,21,22,23,16]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 16>
@@ -1811,21 +1759,21 @@ define <16 x i16> @shuffle_v16i16_01_02_03_04_05_06_07_00_17_18_19_20_21_22_23_1
define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,30,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_22:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [7,0,1,2,3,4,5,6,23,16,17,18,19,20,21,22]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 23, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
@@ -1834,7 +1782,7 @@ define <16 x i16> @shuffle_v16i16_07_00_01_02_03_04_05_06_23_16_17_18_19_20_21_2
define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
@@ -1843,22 +1791,27 @@ define <16 x i16> @shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,2,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_01_00_01_02_03_02_11_08_09_08_09_10_11_10_11:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,2,3,2,11,8,9,8,9,10,11,10,11]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 2, i32 3, i32 2, i32 11, i32 8, i32 9, i32 8, i32 9, i32 10, i32 11, i32 10, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
@@ -1866,21 +1819,26 @@ define <16 x i16> @shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,4,5,2,3,0,9,14,15,12,13,10,11,8,9]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 9, i32 14, i32 15, i32 12, i32 13, i32 10, i32 11, i32 8, i32 9>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
@@ -1891,18 +1849,17 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_27:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7,16,17,18,27,12,13,14,15,24,25,26,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 27, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27>
@@ -1911,7 +1868,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_07_16_17_18_27_12_13_14_15_24_25_26_2
define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
@@ -1920,21 +1877,28 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3]
-; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1945,24 +1909,28 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,7]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,4,4,4,12,8,8,8,8,12,12,12,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1971,22 +1939,26 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,0,u,1,u,2,u,11,u,8,u,9,u,10,u,11>
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 11, i32 undef, i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1995,22 +1967,25 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,4,u,5,u,6,u,15,u,12,u,13,u,14,u,15>
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 undef, i32 5, i32 undef, i32 6, i32 undef, i32 15, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
@@ -2020,23 +1995,27 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,1,2,0,6,7,4,13,11,9,10,8,14,15,12,13]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 13, i32 11, i32 9, i32 10, i32 8, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
@@ -2045,22 +2024,27 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,24,25,24,25,24,25,24,25,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,0,0,0,8,12,12,12,12,8,8,8,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 8, i32 12, i32 12, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
@@ -2068,21 +2052,26 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,3,2]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,13,10,11,8,9,14,15,12,13]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
@@ -2092,23 +2081,27 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,2,6,7,4,13,10,11,8,10,14,15,12,13]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 13, i32 10, i32 11, i32 8, i32 10, i32 14, i32 15, i32 12, i32 13>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
@@ -2117,22 +2110,27 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,2,3]
-; AVX2OR512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,7]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,6,7,4,15,10,11,8,9,14,15,12,15]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 15, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2142,23 +2140,17 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1,30,31,26,27,28,29,24,25,22,23,18,19,20,21,16,17]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_08:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [14,15,10,11,12,13,8,9,6,7,2,3,4,5,0,1]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,5,6,4,3,1,2,8,15,13,14,12,11,9,10,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 5, i32 6, i32 4, i32 3, i32 1, i32 2, i32 8, i32 15, i32 13, i32 14, i32 12, i32 11, i32 9, i32 10, i32 8>
ret <16 x i16> %shuffle
@@ -2166,7 +2158,7 @@ define <16 x i16> @shuffle_v16i16_07_05_06_04_03_01_02_08_15_13_14_12_11_09_10_0
define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
@@ -2175,22 +2167,27 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,2,3]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,10,11,8,9,10,11,8,9,2,3,0,1,18,19,16,17,26,27,24,25,26,27,24,25,18,19,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,5,4,5,4,1,8,9,8,13,12,13,12,9,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 5, i32 4, i32 5, i32 4, i32 1, i32 8, i32 9, i32 8, i32 13, i32 12, i32 13, i32 12, i32 9, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
@@ -2199,22 +2196,27 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,2,3]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,10,11,8,9,2,3,0,1,26,27,24,25,18,19,16,17,26,27,24,25,18,19,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,1,0,5,4,1,8,13,12,9,8,13,12,9,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 5, i32 4, i32 1, i32 8, i32 13, i32 12, i32 9, i32 8, i32 13, i32 12, i32 9, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
@@ -2223,22 +2225,27 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,2,3]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[10,11,8,9,2,3,0,1,2,3,0,1,10,11,8,9,26,27,24,25,18,19,16,17,18,19,16,17,26,27,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,4,1,0,1,0,5,12,13,12,9,8,9,8,13,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 4, i32 1, i32 0, i32 1, i32 0, i32 5, i32 12, i32 13, i32 12, i32 9, i32 8, i32 9, i32 8, i32 13, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
@@ -2247,22 +2254,27 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,2,3]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,8,9,8,9,0,1,16,17,24,25,24,25,16,17,16,17,24,25,24,25,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,4,4,8,8,12,12,8,8,12,12,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
@@ -2271,22 +2283,27 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,2,3]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,0,1,8,9,8,9,0,1,0,1,8,9,24,25,16,17,16,17,24,25,24,25,16,17,16,17,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,0,0,4,4,0,0,12,12,8,8,12,12,8,8,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 0, i32 0, i32 4, i32 4, i32 0, i32 0, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2296,23 +2313,17 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7,20,21,28,29,24,25,16,17,26,27,18,19,30,31,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,10,11,2,3,14,15,6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,4,0,5,1,7,11,10,14,12,8,13,9,15,11]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 5, i32 1, i32 7, i32 11, i32 10, i32 14, i32 12, i32 8, i32 13, i32 9, i32 15, i32 11>
ret <16 x i16> %shuffle
@@ -2320,7 +2331,7 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_05_01_07_11_10_14_12_08_13_09_15_1
define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2330,23 +2341,17 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7,20,21,16,17,28,29,24,25,26,27,18,19,30,31,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,0,1,12,13,8,9,10,11,2,3,14,15,6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,4,5,1,7,11,10,8,14,12,13,9,15,11]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 0, i32 6, i32 4, i32 5, i32 1, i32 7, i32 11, i32 10, i32 8, i32 14, i32 12, i32 13, i32 9, i32 15, i32 11>
ret <16 x i16> %shuffle
@@ -2354,7 +2359,7 @@ define <16 x i16> @shuffle_v16i16_02_00_06_04_05_01_07_11_10_08_14_12_13_09_15_1
define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2364,23 +2369,17 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11,20,21,28,29,24,25,16,17,18,19,22,23,30,31,26,27]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_13:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [4,5,12,13,8,9,0,1,2,3,6,7,14,15,10,11]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,6,4,0,1,3,7,13,10,14,12,8,9,11,15,13]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 13, i32 10, i32 14, i32 12, i32 8, i32 9, i32 11, i32 15, i32 13>
ret <16 x i16> %shuffle
@@ -2388,7 +2387,7 @@ define <16 x i16> @shuffle_v16i16_02_06_04_00_01_03_07_13_10_14_12_08_09_11_15_1
define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2398,23 +2397,16 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,7,5,1,6,4,11,14,14,15,13,9,14,12,11]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 6, i32 6, i32 7, i32 5, i32 1, i32 6, i32 4, i32 11, i32 14, i32 14, i32 15, i32 13, i32 9, i32 14, i32 12, i32 11>
ret <16 x i16> %shuffle
@@ -2422,7 +2414,7 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1
define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
@@ -2431,22 +2423,27 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,8,9,8,9,8,9,8,9,8,9,8,9,16,17,16,17,24,25,24,25,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,4,4,4,4,4,12,8,8,12,12,12,12,12,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
@@ -2455,22 +2452,27 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,0,1,0,1,8,9,8,9,8,9,8,9,24,25,24,25,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,0,0,4,4,4,12,12,12,8,8,12,12,12,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 0, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
@@ -2479,22 +2481,27 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,4,4,12,8,12,12,8,12,12,12,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 8, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
@@ -2503,22 +2510,27 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpbroadcastw %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,0,1,0,1,0,1,0,1,16,17,24,25,24,25,16,17,16,17,16,17,16,17,16,17]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,0,0,0,8,8,12,12,8,8,8,8,8]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 12, i32 12, i32 8, i32 8, i32 8, i32 8, i32 8>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
@@ -2528,23 +2540,27 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,4,0,4,5,6,15,8,12,12,8,12,13,14,15]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 15, i32 8, i32 12, i32 12, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
@@ -2553,22 +2569,27 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,8,9,8,9,8,9,8,9,8,9,8,9,16,17,18,19,24,25,24,25,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,255,255,255,255,0,0,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,4,4,4,4,4,12,8,u,12,12,12,12,12,12>
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 4, i32 4, i32 4, i32 4, i32 4, i32 12, i32 8, i32 undef, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
@@ -2577,22 +2598,27 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,8,9,0,1,8,9,8,9,8,9,8,9,24,25,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,255,255,255,255,255,255,255,255,0,0,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <4,4,u,0,4,4,4,12,12,12,u,8,12,12,12,12>
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 undef, i32 0, i32 4, i32 4, i32 4, i32 12, i32 12, i32 12, i32 undef, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllq $48, %xmm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
@@ -2601,22 +2627,27 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpsllq $48, %xmm1, %xmm2
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,14,15]
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,8,9,0,1,8,9,8,9,8,9,8,9,16,17,24,25,24,25,16,17,24,25,24,25,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,255,255,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,4,4,0,4,4,4,12,u,12,12,8,12,12,12,12>
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 4, i32 0, i32 4, i32 4, i32 4, i32 12, i32 undef, i32 12, i32 12, i32 8, i32 12, i32 12, i32 12, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -2625,12 +2656,12 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -2641,7 +2672,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_u
define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
@@ -2650,23 +2681,15 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm1, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7,28,29,22,23,20,21,22,23,24,25,26,27,28,29,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm2
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,u,4,5,6,11,u,u,u,u,12,13,14,11>
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
@@ -2674,7 +2697,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_04_05_06_11_uu_uu_uu_uu_12_13_14_1
define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -2683,12 +2706,12 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4]
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
@@ -2699,7 +2722,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u
define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15(<16 x i16> %a) {
; AVX1-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
@@ -2707,7 +2730,7 @@ define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_1
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,ymm0[4,5],zero,zero,ymm0[8,9,u,u,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 2, i32 16, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -2716,7 +2739,7 @@ define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_1
define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2726,26 +2749,17 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7,16,17,18,19,20,21,30,31,24,25,26,27,28,29,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,1,2]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,7,4,5,6,11,8,9,10,15,12,13,14,11]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 11, i32 8, i32 9, i32 10, i32 15, i32 12, i32 13, i32 14, i32 11>
ret <16 x i16> %shuffle
@@ -2753,7 +2767,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_04_05_06_11_08_09_10_15_12_13_14_1
define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
@@ -2762,23 +2776,17 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15,24,25,26,27,28,29,22,23,16,17,18,19,20,21,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,2,3]
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
-; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,0,3,1]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,3,0,1,2,15,12,13,14,11,8,9,10,15]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 15, i32 12, i32 13, i32 14, i32 11, i32 8, i32 9, i32 10, i32 15>
ret <16 x i16> %shuffle
@@ -2786,7 +2794,7 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_00_01_02_15_12_13_14_11_08_09_10_1
define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
@@ -2796,23 +2804,16 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm3
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,1,0,2,7,3,13,11,15,9,8,10,15,11,13]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 7, i32 1, i32 0, i32 2, i32 7, i32 3, i32 13, i32 11, i32 15, i32 9, i32 8, i32 10, i32 15, i32 11, i32 13>
ret <16 x i16> %shuffle
@@ -2820,7 +2821,7 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1
define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -2831,20 +2832,16 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_27:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 27, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
@@ -2853,7 +2850,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_02_18_03_27_08_24_09_25_10_26_11_2
define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
@@ -2865,19 +2862,17 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15]
-; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm0[7]
-; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_31:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,2,22,3,31,8,28,9,29,10,30,11,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 2, i32 22, i32 3, i32 31, i32 8, i32 28, i32 9, i32 29, i32 10, i32 30, i32 11, i32 31>
@@ -2886,7 +2881,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_02_22_03_31_08_28_09_29_10_30_11_3
define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -2897,20 +2892,17 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_31:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 31, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
@@ -2919,7 +2911,7 @@ define <16 x i16> @shuffle_v16i16_04_20_05_21_06_22_07_31_12_28_13_29_14_30_15_3
define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
@@ -2931,19 +2923,17 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,0,1,10,11,2,3,12,13,4,5,14,15,6,7,24,25,16,17,26,27,18,19,28,29,20,21,30,31,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_27:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,16,5,17,6,18,7,27,12,24,13,25,14,26,15,27]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 16, i32 5, i32 17, i32 6, i32 18, i32 7, i32 27, i32 12, i32 24, i32 13, i32 25, i32 14, i32 26, i32 15, i32 27>
@@ -2952,7 +2942,7 @@ define <16 x i16> @shuffle_v16i16_04_16_05_17_06_18_07_27_12_24_13_25_14_26_15_2
define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
@@ -2968,22 +2958,19 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,6,22,7,31,8,24,9,25,14,30,15,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 6, i32 22, i32 7, i32 31, i32 8, i32 24, i32 9, i32 25, i32 14, i32 30, i32 15, i32 31>
@@ -2992,7 +2979,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
@@ -3006,21 +2993,19 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,20,1,21,6,16,7,25,8,28,9,29,14,24,15,25]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 20, i32 1, i32 21, i32 6, i32 16, i32 7, i32 25, i32 8, i32 28, i32 9, i32 29, i32 14, i32 24, i32 15, i32 25>
@@ -3029,7 +3014,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,12,13,10,11,8,9,10,11,12,13,10,11]
@@ -3043,20 +3028,18 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,2,3,0,1,8,9,10,11,6,7,4,5,16,17,18,19,18,19,16,17,24,25,26,27,22,23,20,21]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,4,5,6,7,6,7,4,5,4,5,6,7,18,19,16,17,20,21,22,23,22,23,20,21,20,21,22,23]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_26:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,17,16,3,2,19,26,9,8,25,24,11,10,27,26]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 0, i32 17, i32 16, i32 3, i32 2, i32 19, i32 26, i32 9, i32 8, i32 25, i32 24, i32 11, i32 10, i32 27, i32 26>
@@ -3065,7 +3048,7 @@ define <16 x i16> @shuffle_v16i16_01_00_17_16_03_02_19_26_09_08_25_24_11_10_27_2
define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
@@ -3076,20 +3059,16 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,16,1,17,2,18,3,27,8,24,9,25,10,26,11,27]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3099,7 +3078,7 @@ define <16 x i16> @shuffle_v16i16_16_00_17_01_18_02_19_11_24_08_25_09_26_10_27_1
define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -3110,20 +3089,17 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,20,5,21,6,22,7,31,12,28,13,29,14,30,15,31]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3133,7 +3109,7 @@ define <16 x i16> @shuffle_v16i16_20_04_21_05_22_06_23_15_28_12_29_13_30_14_31_1
define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
@@ -3148,20 +3124,18 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,1,3,20,22,21,31,8,10,9,11,28,30,29,31]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 1, i32 3, i32 20, i32 22, i32 21, i32 31, i32 8, i32 10, i32 9, i32 11, i32 28, i32 30, i32 29, i32 31>
@@ -3170,7 +3144,7 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
@@ -3183,15 +3157,15 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 3, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 12, i32 11, i32 26, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -3200,7 +3174,7 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u
define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
@@ -3212,14 +3186,14 @@ define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,10,11,0,1,10,11,0,1,2,3,16,17,22,23,20,21,26,27,16,17,26,27,16,17,18,19]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 3, i32 2, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 11, i32 10, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -3227,22 +3201,17 @@ define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_u
}
define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
-; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,2,2,3,4,6,6,7]
+; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
@@ -3253,14 +3222,14 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 21, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 29, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -3269,7 +3238,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_u
define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
@@ -3279,14 +3248,14 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,2]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7],ymm1[8,9,10,11,12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3296,7 +3265,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1
define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
@@ -3307,14 +3276,14 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7,8,9,10],ymm0[11],ymm1[12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3324,7 +3293,7 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u
define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
@@ -3336,19 +3305,17 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,u,u>
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,10,11,8,9,10,11,12,13,6,7,16,17,18,19,20,21,26,27,24,25,26,27,28,29,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_11:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,20,21,22,11,8,9,10,29,28,29,30,11]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 21, i32 20, i32 21, i32 22, i32 11, i32 8, i32 9, i32 10, i32 29, i32 28, i32 29, i32 30, i32 11>
@@ -3357,7 +3324,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_20_21_22_11_08_09_10_29_28_29_30_1
define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
@@ -3367,14 +3334,14 @@ define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4,5,6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12,13,14],ymm0[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_15:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,2,3,20,21,22,15,8,25,10,11,28,29,30,15]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 20, i32 21, i32 22, i32 15, i32 8, i32 25, i32 10, i32 11, i32 28, i32 29, i32 30, i32 15>
@@ -3383,7 +3350,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_03_20_21_22_15_08_25_10_11_28_29_30_1
define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -3397,7 +3364,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15]
@@ -3406,8 +3373,8 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25>
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 25, i32 undef, i32 undef, i32 undef, i32 9, i32 undef, i32 13, i32 15, i32 25>
@@ -3416,7 +3383,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2
define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -3430,15 +3397,15 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,4,5,16,17,20,21,20,21,22,23,16,17,20,21,24,25,20,21]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3448,7 +3415,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u
define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
@@ -3458,18 +3425,17 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_12:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 12, i32 29, i32 30, i32 31, i32 8, i32 9, i32 10, i32 11, i32 12>
@@ -3478,7 +3444,7 @@ define <16 x i16> @shuffle_v16i16_21_22_23_00_01_02_03_12_29_30_31_08_09_10_11_1
define <16 x i16> @shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
@@ -3487,7 +3453,7 @@ define <16 x i16> @shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9],ymm1[26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 22, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 30, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
@@ -3496,7 +3462,7 @@ define <16 x i16> @shuffle_v16i16_uu_22_uu_uu_01_02_03_uu_uu_30_uu_uu_09_10_11_u
define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
@@ -3504,21 +3470,26 @@ define <16 x i16> @shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_05_06_07_00_01_02_03_12_13_14_15_08_09_10_11_12:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,7,0,1,2,3,12,13,14,15,8,9,10,11,12]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
@@ -3526,7 +3497,7 @@ define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 14, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
@@ -3535,7 +3506,7 @@ define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_01_02_03_uu_uu_14_uu_uu_09_10_11_u
define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
@@ -3543,7 +3514,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 9, i32 10, i32 11, i32 undef>
@@ -3552,7 +3523,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_01_02_03_uu_uu_uu_uu_uu_09_10_11_u
define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
@@ -3564,18 +3535,17 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_10:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3585,7 +3555,7 @@ define <16 x i16> @shuffle_v16i16_19_20_21_22_23_00_01_10_27_28_29_30_31_08_09_1
define <16 x i16> @shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
@@ -3594,7 +3564,7 @@ define <16 x i16> @shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5],ymm1[22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 20, i32 21, i32 22, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 undef, i32 undef, i32 9, i32 undef>
@@ -3603,7 +3573,7 @@ define <16 x i16> @shuffle_v16i16_uu_20_21_22_uu_uu_01_uu_uu_28_29_30_uu_uu_09_u
define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
@@ -3611,21 +3581,26 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2OR512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2OR512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_10:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,4,5,6,7,0,1,10,11,12,13,14,15,8,9,10]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10>
ret <16 x i16> %shuffle
}
define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
@@ -3633,7 +3608,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 9, i32 undef>
@@ -3642,7 +3617,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_01_uu_uu_12_13_14_uu_uu_09_u
define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
@@ -3650,7 +3625,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -3659,7 +3634,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_uu_uu_uu_12_13_14_uu_uu_uu_u
define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
@@ -3671,18 +3646,17 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_26:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,4,5,6,7,16,17,26,11,12,13,14,15,24,25,26]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 26, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26>
@@ -3691,7 +3665,7 @@ define <16 x i16> @shuffle_v16i16_03_04_05_06_07_16_17_26_11_12_13_14_15_24_25_2
define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
@@ -3700,7 +3674,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5],ymm0[22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 17, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 undef, i32 undef, i32 25, i32 undef>
@@ -3709,7 +3683,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_05_06_uu_uu_17_uu_uu_12_13_14_uu_uu_25_u
define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
@@ -3719,18 +3693,17 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_28:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [21,22,23,0,1,2,3,12,29,30,31,8,9,10,11,12]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3740,7 +3713,7 @@ define <16 x i16> @shuffle_v16i16_05_06_07_16_17_18_19_28_13_14_15_24_25_26_27_2
define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9]
@@ -3749,7 +3722,7 @@ define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_u
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9],ymm0[26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 14, i32 undef, i32 undef, i32 25, i32 26, i32 27, i32 undef>
@@ -3758,7 +3731,7 @@ define <16 x i16> @shuffle_v16i16_uu_06_uu_uu_17_18_19_uu_uu_14_uu_uu_25_26_27_u
define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
@@ -3771,14 +3744,14 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8,9,10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,6,7,6,7,8,9,8,9,10,11,14,15,30,31,30,31,22,23,22,23,24,25,24,25,26,27,30,31]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u>
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -3788,13 +3761,13 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -3804,14 +3777,14 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_0_16_1_17_2_18_3_19(<16 x i16>
define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
@@ -3822,7 +3795,7 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,
define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -3830,7 +3803,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -3840,7 +3813,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a,
define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -3849,7 +3822,7 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>
define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; ALL-NEXT: retq
@@ -3859,14 +3832,14 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a,
define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -3877,7 +3850,7 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
; AVX1-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -3888,15 +3861,15 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>
@@ -3905,7 +3878,7 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_2
define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25(<16 x i16> %a0, <16 x i16> %a1) {
; AVX1-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
@@ -3924,7 +3897,7 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
@@ -3932,8 +3905,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25]
; AVX512VL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
; AVX512VL-NEXT: retq
@@ -3944,9 +3917,27 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2
ret <16 x i16> %4
}
+define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11]
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,12,13,14,15,14,15,8,9,12,13,14,15,10,11,24,25,28,29,30,31,30,31,24,25,28,29,30,31,26,27]
+; AVX2OR512VL-NEXT: retq
+ %shuffle = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 4, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7, i32 5, i32 12, i32 14, i32 15, i32 undef, i32 undef, i32 14, i32 15, i32 13>
+ ret <16 x i16> %shuffle
+}
+
define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
; ALL-LABEL: insert_v16i16_0elt_into_zero_vector:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: movzwl (%rdi), %eax
; ALL-NEXT: vmovd %eax, %xmm0
; ALL-NEXT: retq
@@ -3957,13 +3948,13 @@ define <16 x i16> @insert_v16i16_0elt_into_zero_vector(i16* %ptr) {
define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2OR512VL-NEXT: retq
%alo = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -3972,10 +3963,20 @@ define <16 x i16> @concat_v16i16_0_1_2_3_4_5_6_7_24_25_26_27_28_29_30_31(<16 x i
}
define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc(<16 x i16> %a, <16 x i16> %b) {
-; ALL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_bc:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%ahi = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bhi = shufflevector <16 x i16> %b, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <8 x i16> %ahi to <16 x i8>
@@ -3987,7 +3988,7 @@ define <16 x i16> @concat_v16i16_8_9_10_11_12_13_14_15_24_25_26_27_28_29_30_31_b
define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: PR24935:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,1,1]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
@@ -4006,24 +4007,24 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR24935:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5,6,7,8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: PR24935:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [11,10,17,13,10,7,27,0,17,25,0,12,29,20,16,8]
; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -4031,16 +4032,58 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {
ret <16 x i16> %shuffle
}
+define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
+; AVX1-LABEL: PR34369:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,0,1,6,7,10,11,4,5,4,5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6],xmm3[7]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[14,15,0,1,12,13,0,1,2,3,4,5,8,9,8,9]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR34369:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: PR34369:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,0,13,5,2,2,10,15,8,14,8,9,10,12,12]
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm1, %k1
+; AVX512VL-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: retq
+ %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 9, i32 10, i32 12, i32 12>
+ %cmp = icmp eq <16 x i16> %mask, zeroinitializer
+ %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
+ ret <16 x i16> %res
+}
+
define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -4052,7 +4095,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(i32* %ptr) {
define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movswl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
@@ -4060,14 +4103,14 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movswl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movswl (%rdi), %eax
; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
; AVX512VL-NEXT: retq
@@ -4081,14 +4124,14 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v16i16_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -4100,14 +4143,14 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(i32* %ptr) #0 {
define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v16i16_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d51b69415b93..01c7fc466eb8 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -5,14 +5,14 @@
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -21,7 +21,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
@@ -29,7 +29,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -39,7 +39,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
@@ -47,7 +47,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -57,7 +57,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
@@ -65,7 +65,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -75,7 +75,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
@@ -83,7 +83,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -93,7 +93,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
@@ -101,7 +101,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -111,7 +111,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
@@ -119,7 +119,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -129,7 +129,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
@@ -137,7 +137,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -147,7 +147,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
@@ -155,7 +155,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -165,7 +165,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
@@ -173,7 +173,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -183,7 +183,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
@@ -191,7 +191,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,10,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -201,7 +201,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
@@ -209,7 +209,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -219,7 +219,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -227,7 +227,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,12,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -237,7 +237,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -245,7 +245,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -255,7 +255,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -263,7 +263,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -273,7 +273,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
@@ -283,7 +283,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,15,0,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -293,7 +293,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -303,9 +303,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
@@ -313,9 +313,9 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF
@@ -329,7 +329,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -339,16 +339,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: movl $1, %eax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
@@ -360,7 +360,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -370,16 +370,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
@@ -391,7 +391,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -401,16 +401,16 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: movw $1, %ax
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
@@ -422,7 +422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -432,8 +432,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
@@ -443,7 +443,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -453,8 +453,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
@@ -464,7 +464,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -474,8 +474,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
@@ -485,7 +485,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -495,8 +495,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
@@ -506,7 +506,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -517,14 +517,14 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -534,7 +534,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -545,14 +545,14 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -562,7 +562,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -573,14 +573,14 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -590,7 +590,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -601,14 +601,14 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -618,7 +618,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -629,14 +629,14 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -646,7 +646,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -657,14 +657,14 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -674,7 +674,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -685,14 +685,14 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VL-NEXT: retq
@@ -702,7 +702,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: movl $128, %eax
@@ -715,7 +715,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1]
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
@@ -723,8 +723,8 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512VL-NEXT: movl $15, %eax
; AVX512VL-NEXT: vmovd %eax, %xmm1
@@ -736,7 +736,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -745,8 +745,8 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2OR512VL-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -755,7 +755,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -764,7 +764,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
@@ -773,7 +773,7 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -782,7 +782,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
@@ -791,7 +791,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -800,7 +800,7 @@ define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_23_23_23_23_23_23_23_23_31_31_31_31_31_31_31_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,7,7,7,7,7,7,7,15,15,15,15,15,15,15,15,23,23,23,23,23,23,23,23,31,31,31,31,31,31,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
@@ -809,7 +809,7 @@ define <32 x i8> @shuffle_v32i8_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15_
define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -818,7 +818,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20_24_24_24_24_28_28_28_28:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,16,16,16,16,20,20,20,20,24,24,24,24,28,28,28,28]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 16, i32 16, i32 16, i32 16, i32 20, i32 20, i32 20, i32 20, i32 24, i32 24, i32 24, i32 24, i32 28, i32 28, i32 28, i32 28>
@@ -827,7 +827,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_
define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -836,7 +836,7 @@ define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_19_19_19_19_23_23_23_23_27_27_27_27_31_31_31_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15,19,19,19,19,23,23,23,23,27,27,27,27,31,31,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15, i32 19, i32 19, i32 19, i32 19, i32 23, i32 23, i32 23, i32 23, i32 27, i32 27, i32 27, i32 27, i32 31, i32 31, i32 31, i32 31>
@@ -845,7 +845,7 @@ define <32 x i8> @shuffle_v32i8_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15_
define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -854,7 +854,7 @@ define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_16_16_18_18_20_20_22_22_24_24_26_26_28_28_30_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,24,26,26,28,28,30,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
@@ -863,7 +863,7 @@ define <32 x i8> @shuffle_v32i8_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14_
define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -872,7 +872,7 @@ define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_17_17_19_19_21_21_23_23_25_25_27_27_29_29_31_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15,17,17,19,19,21,21,23,23,25,25,27,27,29,29,31,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15, i32 17, i32 17, i32 19, i32 19, i32 21, i32 21, i32 23, i32 23, i32 25, i32 25, i32 27, i32 27, i32 29, i32 29, i32 31, i32 31>
@@ -881,145 +881,107 @@ define <32 x i8> @shuffle_v32i8_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2: # BB#0:
-; AVX2-NEXT: movl $15, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: movl $15, %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: movl $15, %eax
+; AVX2OR512VL-NEXT: vmovd %eax, %xmm1
+; AVX2OR512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <32 x i8> %shuffle
}
define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -1027,13 +989,13 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
@@ -1044,7 +1006,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_
define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -1052,13 +1014,13 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
@@ -1069,12 +1031,12 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_
define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31(<32 x i8> %a) {
; AVX1OR2-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
@@ -1085,13 +1047,13 @@ define <32 x i8> @shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_
define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,ymm0[2],zero,ymm0[4,u,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 2, i32 32, i32 4, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -1100,7 +1062,7 @@ define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -1108,7 +1070,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -1118,7 +1080,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
@@ -1131,8 +1093,8 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
@@ -1141,10 +1103,10 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
@@ -1155,7 +1117,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_
define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1168,8 +1130,8 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2OR512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: retq
@@ -1179,7 +1141,7 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -1194,7 +1156,7 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; AVX2OR512VL-NEXT: retq
@@ -1204,7 +1166,7 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -1216,7 +1178,7 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
@@ -1227,7 +1189,7 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1236,7 +1198,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_17_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 17, i32 16>
@@ -1245,7 +1207,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1254,7 +1216,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_18_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,18,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 18, i32 16, i32 16>
@@ -1263,7 +1225,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1272,7 +1234,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1281,7 +1243,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1290,7 +1252,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1299,7 +1261,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1308,7 +1270,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,30,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 30, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1317,7 +1279,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1327,7 +1289,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,31,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 31, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1336,7 +1298,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -1345,7 +1307,7 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
@@ -1354,7 +1316,7 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
@@ -1363,7 +1325,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
@@ -1372,7 +1334,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
@@ -1381,7 +1343,7 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
@@ -1389,7 +1351,7 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
@@ -1401,7 +1363,7 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -1410,7 +1372,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
@@ -1418,7 +1380,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; AVX512VL-NEXT: kmovd %eax, %k1
@@ -1430,7 +1392,7 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -1438,7 +1400,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_16_17_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,17,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 16, i32 17, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1447,7 +1409,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0]
@@ -1455,7 +1417,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_16_16_18_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,18,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 16, i32 16, i32 18, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1464,7 +1426,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0]
@@ -1472,7 +1434,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_16_16_16_16_16_16_16_23_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,23,16,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 23, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1481,7 +1443,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0]
@@ -1489,7 +1451,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,16,16,16,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -1498,7 +1460,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0]
@@ -1506,7 +1468,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_30_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,30,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 14, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
@@ -1515,7 +1477,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $15, %eax
; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
@@ -1525,7 +1487,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_31:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,31]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 31>
@@ -1534,7 +1496,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
@@ -1542,7 +1504,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_28_28_28_28_24_24_24_24_20_20_20_20_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12,28,28,28,28,24,24,24,24,20,20,20,20,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12, i32 28, i32 28, i32 28, i32 28, i32 24, i32 24, i32 24, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
@@ -1551,7 +1513,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12_
define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
@@ -1559,7 +1521,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
@@ -1568,14 +1530,14 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,0,0,0,u,u,u,u,u,0,0,0,0,0,14,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_16_uu_uu_uu_uu_uu_16_16_16_16_16_30_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,16,16,16,u,u,u,u,u,16,16,16,16,16,30,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
@@ -1584,7 +1546,7 @@ define <32 x i8> @shuffle_v32i8_00_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,14,1,1,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,u,0,u,u,u,u,0,0,0,0,0,0,14,0]
@@ -1592,7 +1554,7 @@ define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_16_16_uu_16_uu_uu_uu_uu_16_16_16_16_16_16_30_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,14,u,u,0,0,0,0,0,0,0,0,0,0,0,0,16,16,u,16,u,u,u,u,16,16,16,16,16,16,30,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 14, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 16, i32 undef, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 30, i32 16>
@@ -1601,7 +1563,7 @@ define <32 x i8> @shuffle_v32i8_uu_14_uu_uu_00_00_00_00_00_00_00_00_00_00_00_00_
define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,12,12,12,8,8,8,8,4,4,4,4,0,0,0,0]
@@ -1609,7 +1571,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_28_28_28_28_uu_uu_uu_24_20_20_20_20_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,u,u,u,4,u,8,8,8,8,u,u,12,u,28,28,28,28,u,u,u,24,20,20,20,20,16,16,16,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 undef, i32 28, i32 28, i32 28, i32 28, i32 undef, i32 undef, i32 undef, i32 24, i32 20, i32 20, i32 20, i32 20, i32 16, i32 16, i32 16, i32 16>
@@ -1618,7 +1580,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_uu_uu_uu_04_uu_08_08_08_08_uu_uu_12_uu_
define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1627,7 +1589,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_16_16_16_uu_uu_uu_uu_uu_uu_uu_24_24_24_24_24_24:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,8,8,8,8,8,8,8,u,u,u,u,u,u,u,u,16,16,16,u,u,u,u,u,u,u,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
@@ -1636,7 +1598,7 @@ define <32 x i8> @shuffle_v32i8_08_08_08_08_08_08_08_08_uu_uu_uu_uu_uu_uu_uu_uu_
define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,u,1,6],zero,zero,xmm2[0],zero,xmm2[11,u],zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u],zero,xmm1[u],zero,zero,xmm1[5,0],zero,xmm1[10],zero,xmm1[u,4,2,4,7]
@@ -1658,31 +1620,31 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C
; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
@@ -1694,7 +1656,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -1702,7 +1664,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
@@ -1712,7 +1674,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1721,7 +1683,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
@@ -1731,7 +1693,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1741,7 +1703,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
@@ -1751,7 +1713,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_
define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -1760,7 +1722,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24]
; AVX2OR512VL-NEXT: retq
@@ -1770,14 +1732,14 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_
define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
@@ -1788,7 +1750,7 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_
define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
@@ -1796,7 +1758,7 @@ define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_48:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
@@ -1805,7 +1767,7 @@ define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_32_
define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -1813,7 +1775,7 @@ define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_63_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1826,7 +1788,7 @@ define <32 x i8> @shuffle_v32i8_47_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_
define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
@@ -1834,7 +1796,7 @@ define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_zz_16_zz_18_zz_20_zz_22_zz_24_zz_26_zz_28_zz_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 0, i32 32, i32 2, i32 32, i32 4, i32 32, i32 6, i32 32, i32 8, i32 32, i32 10, i32 32, i32 12, i32 32, i32 14, i32 32, i32 16, i32 32, i32 18, i32 32, i32 20, i32 32, i32 22, i32 32, i32 24, i32 32, i32 26, i32 32, i32 28, i32 32, i32 30>
@@ -1843,7 +1805,7 @@ define <32 x i8> @shuffle_v32i8_zz_00_zz_02_zz_04_zz_06_zz_08_zz_10_zz_12_zz_14_
define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslld $16, %xmm0, %xmm0
@@ -1851,7 +1813,7 @@ define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_zz_zz_16_17_zz_zz_20_21_zz_zz_24_25_zz_zz_28_29:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpslld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 12, i32 13, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 20, i32 21, i32 32, i32 32, i32 24, i32 25, i32 32, i32 32, i32 28, i32 29>
@@ -1860,7 +1822,7 @@ define <32 x i8> @shuffle_v32i8_zz_zz_00_01_zz_zz_04_05_zz_zz_08_09_zz_zz_12_13_
define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsllq $48, %xmm0, %xmm0
@@ -1868,7 +1830,7 @@ define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_zz_zz_zz_zz_zz_zz_16_17_zz_zz_zz_zz_zz_zz_24_25:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsllq $48, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25>
@@ -1877,7 +1839,7 @@ define <32 x i8> @shuffle_v32i8_zz_zz_zz_zz_zz_zz_00_01_zz_zz_zz_zz_zz_zz_08_09_
define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
@@ -1885,7 +1847,7 @@ define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 32, i32 3, i32 32, i32 5, i32 32, i32 7, i32 32, i32 9, i32 32, i32 11, i32 32, i32 13, i32 32, i32 15, i32 32, i32 17, i32 32, i32 19, i32 32, i32 21, i32 32, i32 23, i32 32, i32 25, i32 32, i32 27, i32 32, i32 29, i32 32, i32 31, i32 32>
@@ -1894,7 +1856,7 @@ define <32 x i8> @shuffle_v32i8_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_
define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
@@ -1902,7 +1864,7 @@ define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_18_19_zz_zz_22_23_zz_zz_26_27_zz_zz_30_31_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 2, i32 3, i32 32, i32 32, i32 6, i32 7, i32 32, i32 32, i32 10, i32 11, i32 32, i32 32, i32 14, i32 15, i32 32, i32 32, i32 18, i32 19, i32 32, i32 32, i32 22, i32 23, i32 32, i32 32, i32 26, i32 27, i32 32, i32 32, i32 30, i32 31, i32 32, i32 32>
@@ -1911,7 +1873,7 @@ define <32 x i8> @shuffle_v32i8_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz_
define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $56, %xmm0, %xmm0
@@ -1919,7 +1881,7 @@ define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_2
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_23_zz_zz_zz_zz_zz_zz_zz_31_zz_zz_zz_zz_zz_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrlq $56, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 23, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
@@ -1928,7 +1890,7 @@ define <32 x i8> @shuffle_v32i8_07_zz_zz_zz_zz_zz_zz_zz_15_zz_zz_zz_zz_z_zz_zz_2
define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
@@ -1936,7 +1898,7 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_34_zz_zz_zz_zz_zz_zz_zz_35_zz_zz_zz_zz_zz_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1945,7 +1907,7 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_zz_zz_zz_zz_33_zz_zz_zz_zz_zz_zz_zz_
define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -1953,7 +1915,7 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0>
@@ -1962,7 +1924,7 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_
define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1970,7 +1932,7 @@ define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0>
@@ -1979,7 +1941,7 @@ define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_
define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz(<32 x i8> %a) {
; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -1989,7 +1951,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -2000,7 +1962,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -2017,7 +1979,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz
define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -2026,7 +1988,7 @@ define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -2035,7 +1997,7 @@ define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -2044,7 +2006,7 @@ define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -2053,7 +2015,7 @@ define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -2061,7 +2023,7 @@ define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_uu_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 47, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 undef, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -2070,7 +2032,7 @@ define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -2078,7 +2040,7 @@ define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2087,7 +2049,7 @@ define <32 x i8> @shuffle_v32i8_uu_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -2095,7 +2057,7 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 63, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -2104,7 +2066,7 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
@@ -2113,7 +2075,7 @@ define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_48:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
@@ -2122,7 +2084,7 @@ define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_32_
define <32 x i8> @shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0]
@@ -2131,7 +2093,7 @@ define <32 x i8> @shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0],ymm1[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 00, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 16>
@@ -2140,7 +2102,7 @@ define <32 x i8> @shuffle_v32i8_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47_00_
define <32 x i8> @shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -2149,7 +2111,7 @@ define <32 x i8> @shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_31_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm0[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
@@ -2158,7 +2120,7 @@ define <32 x i8> @shuffle_v32i8_15_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_
define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
@@ -2166,7 +2128,7 @@ define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,16]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16>
@@ -2175,7 +2137,7 @@ define <32 x i8> @shuffle_v32i8_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_
define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
@@ -2183,7 +2145,7 @@ define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,31,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
@@ -2192,19 +2154,19 @@ define <32 x i8> @shuffle_v32i8_15_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -2216,7 +2178,7 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_
define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -2224,7 +2186,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -2234,7 +2196,7 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16_
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
@@ -2242,14 +2204,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastb %xmm1, %xmm1
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
@@ -2262,12 +2224,12 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
@@ -2278,19 +2240,19 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_
define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) {
; AVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
@@ -2300,9 +2262,38 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
ret <32 x i8> %shuffle
}
+define <32 x i8> @shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62(<16 x i16> %a0, <16 x i16> %a1) {
+; AVX1-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2OR512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2OR512VL-NEXT: retq
+ %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = bitcast <16 x i16> %1 to <32 x i8>
+ %4 = bitcast <16 x i16> %2 to <32 x i8>
+ %5 = shufflevector <32 x i8> %3, <32 x i8> %4, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
+ ret <32 x i8> %5
+}
+
define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-LABEL: PR28136:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,10,10,12,12,14,14,9,9,11,11,13,13,15,15]
@@ -2320,7 +2311,7 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR28136:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2OR512VL-NEXT: retq
@@ -2332,7 +2323,7 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
@@ -2340,7 +2331,7 @@ define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v32i8_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2352,7 +2343,7 @@ define <32 x i8> @insert_dup_mem_v32i8_i32(i32* %ptr) {
define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -2361,7 +2352,7 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_mem_v32i8_sext_i8:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
@@ -2374,14 +2365,14 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
define <32 x i8> @insert_dup_elt1_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_elt1_mem_v32i8_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v32i8_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2393,14 +2384,14 @@ define <32 x i8> @insert_dup_elt1_mem_v32i8_i32(i32* %ptr) {
define <32 x i8> @insert_dup_elt3_mem_v32i8_i32(i32* %ptr) {
; AVX1-LABEL: insert_dup_elt3_mem_v32i8_i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v32i8_i32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX2OR512VL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2412,7 +2403,7 @@ define <32 x i8> @insert_dup_elt3_mem_v32i8_i32(i32* %ptr) {
define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movsbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
@@ -2420,7 +2411,7 @@ define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movsbl (%rdi), %eax
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
@@ -2428,7 +2419,7 @@ define <32 x i8> @insert_dup_elt1_mem_v32i8_sext_i8(i8* %ptr) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_elt1_mem_v32i8_sext_i8:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movsbl (%rdi), %eax
; AVX512VL-NEXT: shrl $8, %eax
; AVX512VL-NEXT: vpbroadcastb %eax, %ymm0
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 80579f625278..41dcb5032ee2 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -5,18 +5,18 @@
define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0000:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0000:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -25,18 +25,18 @@ define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0001:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0001:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0001:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -45,7 +45,7 @@ define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0020:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
@@ -53,12 +53,12 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0020:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0020:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -67,19 +67,19 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0300:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0300:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0300:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -88,19 +88,19 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_1000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_1000:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_1000:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -109,18 +109,18 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2200:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_2200:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_2200:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -129,18 +129,18 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_2222(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_2222:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_2222:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_2222:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
@@ -149,18 +149,18 @@ define <4 x double> @shuffle_v4f64_2222(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4f64_2222_bc:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_2222_bc:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_2222_bc:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX512VL-NEXT: retq
%tmp0 = bitcast <4 x i64> %a to <4 x double>
@@ -171,19 +171,19 @@ define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3330:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3330:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_3330:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -192,18 +192,18 @@ define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3210:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3210:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_3210:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -212,7 +212,7 @@ define <4 x double> @shuffle_v4f64_3210(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0023:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
; ALL-NEXT: retq
 %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
@@ -222,7 +222,7 @@ define <4 x double> @shuffle_v4f64_0023(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0022:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -231,7 +231,7 @@ define <4 x double> @shuffle_v4f64_0022(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64mem_0022:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; ALL-NEXT: retq
%a = load <4 x double>, <4 x double>* %ptr
@@ -241,7 +241,7 @@ define <4 x double> @shuffle_v4f64mem_0022(<4 x double>* %ptr, <4 x double> %b)
define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1032:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -250,7 +250,7 @@ define <4 x double> @shuffle_v4f64_1032(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1133:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -259,7 +259,7 @@ define <4 x double> @shuffle_v4f64_1133(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1023:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
@@ -268,16 +268,37 @@ define <4 x double> @shuffle_v4f64_1023(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1022(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1022:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 2, i32 2>
ret <4 x double> %shuffle
}
+define <4 x double> @shuffle_v4f64_0213(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0213:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4f64_0213:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4f64_0213:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %shuffle
+}
+
define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0423:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; ALL-NEXT: retq
@@ -287,7 +308,7 @@ define <4 x double> @shuffle_v4f64_0423(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0462:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
@@ -298,7 +319,7 @@ define <4 x double> @shuffle_v4f64_0462(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0426:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -307,7 +328,7 @@ define <4 x double> @shuffle_v4f64_0426(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1537:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -316,7 +337,7 @@ define <4 x double> @shuffle_v4f64_1537(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_4062:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
@@ -325,7 +346,7 @@ define <4 x double> @shuffle_v4f64_4062(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_5173:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 7, i32 3>
@@ -334,7 +355,7 @@ define <4 x double> @shuffle_v4f64_5173(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_5163:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[2],ymm0[3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
@@ -343,7 +364,7 @@ define <4 x double> @shuffle_v4f64_5163(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0527:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -352,7 +373,7 @@ define <4 x double> @shuffle_v4f64_0527(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_4163:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -361,7 +382,7 @@ define <4 x double> @shuffle_v4f64_4163(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0145:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -370,7 +391,7 @@ define <4 x double> @shuffle_v4f64_0145(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_4501:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
@@ -379,7 +400,7 @@ define <4 x double> @shuffle_v4f64_4501(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_0167:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -388,7 +409,7 @@ define <4 x double> @shuffle_v4f64_0167(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1054:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT: retq
@@ -398,7 +419,7 @@ define <4 x double> @shuffle_v4f64_1054(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_3254:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT: retq
@@ -408,7 +429,7 @@ define <4 x double> @shuffle_v4f64_3254(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_3276:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT: retq
@@ -418,7 +439,7 @@ define <4 x double> @shuffle_v4f64_3276(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_1076:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; ALL-NEXT: retq
@@ -428,21 +449,21 @@ define <4 x double> @shuffle_v4f64_1076(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0415:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0415:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0415:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovapd {{.*#+}} ymm2 = [0,4,1,5]
; AVX512VL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -452,7 +473,7 @@ define <4 x double> @shuffle_v4f64_0415(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_u062:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 undef, i32 0, i32 6, i32 2>
@@ -461,7 +482,7 @@ define <4 x double> @shuffle_v4f64_u062(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_15uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
@@ -470,7 +491,7 @@ define <4 x double> @shuffle_v4f64_15uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
; ALL-LABEL: shuffle_v4f64_11uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
@@ -479,18 +500,18 @@ define <4 x double> @shuffle_v4f64_11uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_22uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_22uu:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_22uu:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef>
@@ -499,18 +520,18 @@ define <4 x double> @shuffle_v4f64_22uu(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_3333:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_3333:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_3333:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -519,23 +540,23 @@ define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_0z3z:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0z3z:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_0z3z:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>
@@ -544,8 +565,8 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: shuffle_v4f64_1z2z:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -554,15 +575,15 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_1z2z:
-; AVX2: # BB#0:
-; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4f64_1z2z:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX512VL-NEXT: retq
@@ -572,18 +593,18 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0000:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0000:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -592,19 +613,19 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0001:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0001:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0001:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
ret <4 x i64> %shuffle
@@ -612,21 +633,21 @@ define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0020:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0020:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0020:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
ret <4 x i64> %shuffle
@@ -634,20 +655,20 @@ define <4 x i64> @shuffle_v4i64_0020(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0112:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0112:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0112:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2>
ret <4 x i64> %shuffle
@@ -655,20 +676,20 @@ define <4 x i64> @shuffle_v4i64_0112(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0300:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0300:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0300:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
ret <4 x i64> %shuffle
@@ -676,20 +697,20 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1000:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1000:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1000:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
ret <4 x i64> %shuffle
@@ -697,19 +718,19 @@ define <4 x i64> @shuffle_v4i64_1000(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_2200:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_2200:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_2200:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
ret <4 x i64> %shuffle
@@ -717,20 +738,20 @@ define <4 x i64> @shuffle_v4i64_2200(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3330:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3330:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_3330:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
ret <4 x i64> %shuffle
@@ -738,42 +759,63 @@ define <4 x i64> @shuffle_v4i64_3330(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3210:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_3210:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x i64> %shuffle
}
+define <4 x i64> @shuffle_v4i64_0213(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_0213:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v4i64_0213:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4i64_0213:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x i64> %shuffle
+}
+
define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0124:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0124:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0124:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x i64> %shuffle
@@ -781,24 +823,24 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0142:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0142:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0142:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
ret <4 x i64> %shuffle
@@ -806,23 +848,23 @@ define <4 x i64> @shuffle_v4i64_0142(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0412:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0412:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0412:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
@@ -833,7 +875,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_4012:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
@@ -842,103 +884,83 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_4012:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_4012:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,2]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2]
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
ret <4 x i64> %shuffle
}
define <4 x i64> @shuffle_v4i64_0145(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_0145:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i64_0145:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_0145:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4i64_0145:
+; ALL: # %bb.0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x i64> %shuffle
}
define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0451:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm1[1],xmm0[1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0451:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0451:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>
ret <4 x i64> %shuffle
}
define <4 x i64> @shuffle_v4i64_4501(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_4501:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i64_4501:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_4501:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4i64_4501:
+; ALL: # %bb.0:
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
ret <4 x i64> %shuffle
}
define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_4015:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_4015:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_4015:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>
ret <4 x i64> %shuffle
@@ -946,22 +968,22 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_2u35:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_2u35:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_2u35:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,1]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>
ret <4 x i64> %shuffle
@@ -969,7 +991,7 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1251:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
@@ -978,17 +1000,17 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1251:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1251:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1]
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>
ret <4 x i64> %shuffle
@@ -996,21 +1018,21 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1054:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1054:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1054:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 5, i32 4>
ret <4 x i64> %shuffle
@@ -1018,19 +1040,19 @@ define <4 x i64> @shuffle_v4i64_1054(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3254:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3254:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_3254:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
@@ -1040,19 +1062,19 @@ define <4 x i64> @shuffle_v4i64_3254(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3276:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3276:
-; AVX2: # BB#0:
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_3276:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
@@ -1062,21 +1084,21 @@ define <4 x i64> @shuffle_v4i64_3276(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1076:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1076:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1076:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 7, i32 6>
ret <4 x i64> %shuffle
@@ -1084,24 +1106,24 @@ define <4 x i64> @shuffle_v4i64_1076(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_0415:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0415:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_0415:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
ret <4 x i64> %shuffle
@@ -1109,18 +1131,18 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_z4z6:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_z4z6:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_z4z6:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 0, i32 4, i32 0, i32 6>
@@ -1129,18 +1151,18 @@ define <4 x i64> @shuffle_v4i64_z4z6(<4 x i64> %a) {
define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_5zuz:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_5zuz:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_5zuz:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> zeroinitializer, <4 x i64> %a, <4 x i32> <i32 5, i32 0, i32 undef, i32 0>
@@ -1148,28 +1170,18 @@ define <4 x i64> @shuffle_v4i64_5zuz(<4 x i64> %a) {
}
define <4 x i64> @shuffle_v4i64_40u2(<4 x i64> %a, <4 x i64> %b) {
-; AVX1-LABEL: shuffle_v4i64_40u2:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i64_40u2:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v4i64_40u2:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: shuffle_v4i64_40u2:
+; ALL: # %bb.0:
+; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
+; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 2>
ret <4 x i64> %shuffle
}
define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_15uu:
-; ALL: # BB#0:
-; ALL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; ALL: # %bb.0:
+; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 undef, i32 undef>
ret <4 x i64> %shuffle
@@ -1177,8 +1189,8 @@ define <4 x i64> @shuffle_v4i64_15uu(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
; ALL-LABEL: shuffle_v4i64_11uu:
-; ALL: # BB#0:
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; ALL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
ret <4 x i64> %shuffle
@@ -1186,19 +1198,19 @@ define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_22uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_22uu:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_22uu:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 undef, i32 undef>
ret <4 x i64> %shuffle
@@ -1206,19 +1218,19 @@ define <4 x i64> @shuffle_v4i64_22uu(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_3333:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_3333:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_3333:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
ret <4 x i64> %shuffle
@@ -1226,8 +1238,8 @@ define <4 x i64> @shuffle_v4i64_3333(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1z3z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z3z:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],xmm0[0]
@@ -1236,12 +1248,12 @@ define <4 x i64> @shuffle_v4i64_1z3z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1z3z:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1z3z:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
@@ -1261,7 +1273,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
; ALL-LABEL: insert_reg_and_zero_v4i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovq %rdi, %xmm0
; ALL-NEXT: retq
%v = insertelement <4 x i64> undef, i64 %a, i64 0
@@ -1271,7 +1283,7 @@ define <4 x i64> @insert_reg_and_zero_v4i64(i64 %a) {
define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
%a = load i64, i64* %ptr
@@ -1282,21 +1294,22 @@ define <4 x i64> @insert_mem_and_zero_v4i64(i64* %ptr) {
define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
; AVX1-LABEL: insert_reg_and_zero_v4f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_reg_and_zero_v4f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_reg_and_zero_v4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512VL-NEXT: retq
@@ -1307,7 +1320,7 @@ define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v4f64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
%a = load double, double* %ptr
@@ -1318,7 +1331,7 @@ define <4 x double> @insert_mem_and_zero_v4f64(double* %ptr) {
define <4 x double> @splat_mem_v4f64(double* %ptr) {
; ALL-LABEL: splat_mem_v4f64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%a = load double, double* %ptr
@@ -1329,7 +1342,7 @@ define <4 x double> @splat_mem_v4f64(double* %ptr) {
define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
; ALL-LABEL: splat_mem_v4i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%a = load i64, i64* %ptr
@@ -1340,7 +1353,7 @@ define <4 x i64> @splat_mem_v4i64(i64* %ptr) {
define <4 x double> @splat_mem_v4f64_2(double* %p) {
; ALL-LABEL: splat_mem_v4f64_2:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%1 = load double, double* %p
@@ -1351,18 +1364,18 @@ define <4 x double> @splat_mem_v4f64_2(double* %p) {
define <4 x double> @splat_v4f64(<2 x double> %r) {
; AVX1-LABEL: splat_v4f64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat_v4f64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: splat_v4f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
@@ -1371,7 +1384,7 @@ define <4 x double> @splat_v4f64(<2 x double> %r) {
define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; ALL-LABEL: splat_mem_v4i64_from_v2i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%v = load <2 x i64>, <2 x i64>* %ptr
@@ -1381,7 +1394,7 @@ define <4 x i64> @splat_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; ALL-LABEL: splat_mem_v4f64_from_v2f64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%v = load <2 x double>, <2 x double>* %ptr
@@ -1391,17 +1404,17 @@ define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1-LABEL: splat128_mem_v4i64_from_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat128_mem_v4i64_from_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT: retq
%v = load <2 x i64>, <2 x i64>* %ptr
@@ -1411,7 +1424,7 @@ define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; ALL-LABEL: splat128_mem_v4f64_from_v2f64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; ALL-NEXT: retq
%v = load <2 x double>, <2 x double>* %ptr
@@ -1421,18 +1434,18 @@ define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: broadcast_v4f64_0000_from_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: broadcast_v4f64_0000_from_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX512VL-NEXT: retq
%1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1442,20 +1455,10 @@ define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
}
define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: bitcast_v4f64_0426:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: bitcast_v4f64_0426:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: bitcast_v4f64_0426:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX512VL-NEXT: retq
+; ALL-LABEL: bitcast_v4f64_0426:
+; ALL: # %bb.0:
+; ALL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; ALL-NEXT: retq
%shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 4, i32 0, i32 6, i32 2>
%bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
%shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -1467,18 +1470,18 @@ define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
; AVX1-LABEL: concat_v4i64_0167:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: concat_v4i64_0167:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: concat_v4i64_0167:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512VL-NEXT: retq
%a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
%a1hi = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 6, i32 7>
@@ -1487,20 +1490,10 @@ define <4 x i64> @concat_v4i64_0167(<4 x i64> %a0, <4 x i64> %a1) {
}
define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX1-LABEL: concat_v4i64_0145_bc:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: concat_v4i64_0145_bc:
-; AVX2: # BB#0:
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: concat_v4i64_0145_bc:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: concat_v4i64_0145_bc:
+; ALL: # %bb.0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: retq
%a0lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 0, i32 1>
%a1lo = shufflevector <4 x i64> %a0, <4 x i64> %a1, <2 x i32> <i32 4, i32 5>
%bc0lo = bitcast <2 x i64> %a0lo to <4 x i32>
@@ -1512,7 +1505,7 @@ define <4 x i64> @concat_v4i64_0145_bc(<4 x i64> %a0, <4 x i64> %a1) {
define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) {
; ALL-LABEL: insert_dup_mem_v4i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
; ALL-NEXT: retq
%tmp = load i64, i64* %ptr, align 1
@@ -1523,20 +1516,20 @@ define <4 x i64> @insert_dup_mem_v4i64(i64* %ptr) {
define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1234:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1234:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1234:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: valignq {{.*#+}} ymm0 = ymm0[1,2,3],ymm1[0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -1545,19 +1538,19 @@ define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
; AVX1-LABEL: shuffle_v4i64_1230:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1230:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1230:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
ret <4 x i64> %shuffle
@@ -1565,23 +1558,23 @@ define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_z0z3:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_z0z3:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_z0z3:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX512VL-NEXT: retq
%1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
@@ -1590,8 +1583,8 @@ define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_1z2z:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -1600,15 +1593,15 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_1z2z:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v4i64_1z2z:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
; AVX512VL-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 4c9ab578c906..44d0217f5295 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -5,13 +5,13 @@
define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00000000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -20,14 +20,14 @@ define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000010:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00000010:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
@@ -37,14 +37,14 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00000200:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00000200:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -54,14 +54,14 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00003000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00003000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
@@ -71,7 +71,7 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00040000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
@@ -79,7 +79,7 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00040000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -89,14 +89,14 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00500000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00500000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -106,14 +106,14 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_06000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_06000000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -123,17 +123,17 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_70000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_70000000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: movl $7, %eax
; AVX2OR512VL-NEXT: vmovd %eax, %xmm1
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x float> %shuffle
@@ -141,7 +141,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_01014545:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
@@ -150,14 +150,14 @@ define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00112233:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00112233:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -167,14 +167,14 @@ define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_00001111:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00001111:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2OR512VL-NEXT: retq
@@ -184,7 +184,7 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_81a3c5e7:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
@@ -193,14 +193,14 @@ define <8 x float> @shuffle_v8f32_81a3c5e7(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08080808:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_08080808:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -210,7 +210,7 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_08084c4c:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; ALL-NEXT: retq
@@ -220,7 +220,7 @@ define <8 x float> @shuffle_v8f32_08084c4c(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_8823cc67:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
@@ -229,7 +229,7 @@ define <8 x float> @shuffle_v8f32_8823cc67(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_9832dc76:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
@@ -238,7 +238,7 @@ define <8 x float> @shuffle_v8f32_9832dc76(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_9810dc54:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
@@ -247,7 +247,7 @@ define <8 x float> @shuffle_v8f32_9810dc54(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_08194c5d:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -256,7 +256,7 @@ define <8 x float> @shuffle_v8f32_08194c5d(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_2a3b6e7f:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -265,14 +265,14 @@ define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
; AVX1OR2-LABEL: shuffle_v8f32_08192a3b:
-; AVX1OR2: # BB#0:
+; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_08192a3b:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,8,1,9,2,10,3,11]
; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -282,7 +282,7 @@ define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_08991abb:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
@@ -291,7 +291,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_08991abb:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
@@ -300,7 +300,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_08991abb:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
; AVX512VL-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0
@@ -311,7 +311,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_091b2d3f:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
@@ -319,14 +319,14 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_091b2d3f:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -336,14 +336,14 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_09ab1def:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_09ab1def:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
@@ -354,7 +354,7 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00014445:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
@@ -363,7 +363,7 @@ define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00204464:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
@@ -372,7 +372,7 @@ define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_03004744:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
@@ -381,7 +381,7 @@ define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10005444:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
@@ -390,7 +390,7 @@ define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_22006644:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
@@ -399,7 +399,7 @@ define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_33307774:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
@@ -408,7 +408,7 @@ define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_32107654:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -417,7 +417,7 @@ define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00234467:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
@@ -426,7 +426,7 @@ define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00224466:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -435,7 +435,7 @@ define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10325476:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -444,7 +444,7 @@ define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_11335577:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -453,7 +453,7 @@ define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10235467:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
@@ -462,7 +462,7 @@ define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10225466:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
@@ -471,7 +471,7 @@ define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00015444:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
@@ -480,7 +480,7 @@ define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00204644:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
@@ -489,7 +489,7 @@ define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_03004474:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
@@ -498,7 +498,7 @@ define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10004444:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -507,7 +507,7 @@ define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_22006446:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
@@ -516,7 +516,7 @@ define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_33307474:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
@@ -525,7 +525,7 @@ define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_32104567:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -534,7 +534,7 @@ define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00236744:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
@@ -543,7 +543,7 @@ define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00226644:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
@@ -552,7 +552,7 @@ define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_10324567:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
@@ -561,7 +561,7 @@ define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_11334567:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -570,7 +570,7 @@ define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_01235467:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
@@ -579,7 +579,7 @@ define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_01235466:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
@@ -588,7 +588,7 @@ define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_002u6u44:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
@@ -597,7 +597,7 @@ define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_00uu66uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
@@ -606,7 +606,7 @@ define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_103245uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
@@ -615,7 +615,7 @@ define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_1133uu67:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
@@ -624,7 +624,7 @@ define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_0uu354uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
@@ -633,7 +633,7 @@ define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_uuu3uu66:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
@@ -642,7 +642,7 @@ define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_c348cda0:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[0,0],ymm0[4,7],ymm2[4,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
@@ -652,7 +652,7 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_c348cda0:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u>
; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4]
@@ -661,7 +661,7 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_c348cda0:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [4,11,12,0,4,5,2,8]
; AVX512VL-NEXT: vpermi2ps %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovaps %ymm2, %ymm0
@@ -672,7 +672,7 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_f511235a:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
@@ -682,7 +682,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_f511235a:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7]
@@ -691,7 +691,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8f32_f511235a:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [15,5,1,1,2,3,5,10]
; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
@@ -701,49 +701,55 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_32103210:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8f32_32103210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8f32_32103210:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8f32_32103210:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_76547654:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_76547654:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: shuffle_v8f32_76547654:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_76543210:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_76543210:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: shuffle_v8f32_76543210:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_3210ba98:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
@@ -753,7 +759,7 @@ define <8 x float> @shuffle_v8f32_3210ba98(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_3210fedc:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
@@ -763,7 +769,7 @@ define <8 x float> @shuffle_v8f32_3210fedc(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_7654fedc:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
@@ -773,7 +779,7 @@ define <8 x float> @shuffle_v8f32_7654fedc(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_fedc7654:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
@@ -783,7 +789,7 @@ define <8 x float> @shuffle_v8f32_fedc7654(<8 x float> %a, <8 x float> %b) {
define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
; AVX1-LABEL: PR21138:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -793,7 +799,7 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: PR21138:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2OR512VL-NEXT: retq
@@ -803,7 +809,7 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {
define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_ba987654:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
@@ -813,7 +819,7 @@ define <8 x float> @shuffle_v8f32_ba987654(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_ba983210:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; ALL-NEXT: retq
@@ -823,7 +829,7 @@ define <8 x float> @shuffle_v8f32_ba983210(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_80u1c4u5(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_80u1c4u5:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 5>
@@ -832,7 +838,7 @@ define <8 x float> @shuffle_v8f32_80u1c4u5(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_a2u3e6f7:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 10, i32 2, i32 undef, i32 3, i32 14, i32 6, i32 15, i32 7>
@@ -841,7 +847,7 @@ define <8 x float> @shuffle_v8f32_a2u3e6f7(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_uuuu1111:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: retq
@@ -851,13 +857,13 @@ define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_44444444:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8f32_44444444:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -867,7 +873,7 @@ define <8 x float> @shuffle_v8f32_44444444(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_1188uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -876,7 +882,7 @@ define <8 x float> @shuffle_v8f32_1188uuuu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_uuuu3210:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: retq
@@ -886,7 +892,7 @@ define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_uuuu1188:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,0]
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: retq
@@ -896,7 +902,7 @@ define <8 x float> @shuffle_v8f32_uuuu1188(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_1111uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -905,7 +911,7 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
; ALL-LABEL: shuffle_v8f32_5555uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; ALL-NEXT: retq
@@ -915,13 +921,13 @@ define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00000000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -930,16 +936,16 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000010:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00000010:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i32> %shuffle
@@ -947,16 +953,16 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00000200:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,0]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00000200:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,2]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -964,16 +970,16 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00003000:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00003000:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,3,0]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -981,7 +987,7 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00040000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
@@ -989,9 +995,9 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00040000:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -999,16 +1005,16 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00500000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00500000:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -1016,16 +1022,16 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_06000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %shuffle
@@ -1033,14 +1039,14 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_70000000:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_70000000:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: movl $7, %eax
; AVX2OR512VL-NEXT: vmovd %eax, %xmm1
; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -1051,13 +1057,13 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_01014545:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_01014545:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i32> %shuffle
@@ -1065,16 +1071,16 @@ define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00112233:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00112233:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i32> %shuffle
@@ -1082,47 +1088,42 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00001111:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00001111:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_81a3c5e7(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_81a3c5e7:
-; AVX1: # BB#0:
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_81a3c5e7:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_81a3c5e7:
+; ALL: # %bb.0:
+; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08080808:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_08080808:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2OR512VL-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i32> %shuffle
@@ -1130,16 +1131,16 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08084c4c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4]
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x i32> %shuffle
@@ -1147,7 +1148,7 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_8823cc67:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
@@ -1156,7 +1157,7 @@ define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_9832dc76:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
@@ -1165,7 +1166,7 @@ define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_9810dc54:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
@@ -1173,50 +1174,33 @@ define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_08194c5d(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_08194c5d:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_08194c5d:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_08194c5d:
+; ALL: # %bb.0:
+; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_2a3b6e7f(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_2a3b6e7f:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_2a3b6e7f:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_2a3b6e7f:
+; ALL: # %bb.0:
+; ALL-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_08192a3b:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v8i32_08192a3b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_08192a3b:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1OR2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_08192a3b:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [0,8,2,9,4,10,6,11]
; AVX512VL-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
@@ -1227,16 +1211,16 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_08991abb:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0],xmm1[0,0]
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1]
-; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_08991abb:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -1245,7 +1229,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_08991abb:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm0 = [8,0,1,1,10,2,3,3]
; AVX512VL-NEXT: vpermi2d %ymm2, %ymm1, %ymm0
@@ -1256,15 +1240,15 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_091b2d3f:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_091b2d3f:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2OR512VL-NEXT: retq
@@ -1274,14 +1258,14 @@ define <8 x i32> @shuffle_v8i32_091b2d3f(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_09ab1def:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_09ab1def:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2OR512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
@@ -1291,197 +1275,142 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_00014445:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_00014445:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_00014445:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_00204464:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_00204464:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_00204464:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_03004744:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_03004744:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_03004744:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_10005444:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_10005444:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_10005444:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_22006644:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_22006644:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_22006644:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_33307774:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_33307774:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_33307774:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_32107654:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_32107654:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_32107654:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_00234467:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_00234467:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_00234467:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_00224466(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00224466:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00224466:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_10325476:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_10325476:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_10325476:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_11335577(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_11335577:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_11335577:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_10235467:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_10235467:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_10235467:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_10225466:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_10225466:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_10225466:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00015444:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,5,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00015444:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,1,5,4,4,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -1489,14 +1418,14 @@ define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00204644:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,6,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00204644:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,0,4,6,4,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -1504,14 +1433,14 @@ define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_03004474:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,4,7,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_03004474:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,3,0,0,4,4,7,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i32> %shuffle
@@ -1519,14 +1448,14 @@ define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_10004444:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,4,4,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_10004444:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,0,0,4,4,4,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -1534,14 +1463,14 @@ define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_22006446:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,4,4,6]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_22006446:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [2,2,0,0,6,4,4,6]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i32> %shuffle
@@ -1549,14 +1478,14 @@ define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_33307474:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,4,7,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_33307474:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,3,3,0,7,4,7,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i32> %shuffle
@@ -1564,14 +1493,14 @@ define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_32104567:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_32104567:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,4,5,6,7]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -1579,14 +1508,14 @@ define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00236744:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,6,7,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00236744:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,3,6,7,4,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -1594,14 +1523,14 @@ define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00226644:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,6,6,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00226644:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,2,6,6,4,4]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -1609,14 +1538,14 @@ define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_10324567:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_10324567:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,6,7]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -1624,14 +1553,14 @@ define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_11334567:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_11334567:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,4,5,6,7]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -1639,14 +1568,14 @@ define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_01235467:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_01235467:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,7]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -1654,14 +1583,14 @@ define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_01235466:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,6,6]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_01235466:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,5,4,6,6]
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i32> %shuffle
@@ -1669,14 +1598,14 @@ define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_002u6u44:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,u,6,u,4,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4>
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x i32> %shuffle
@@ -1684,14 +1613,14 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_00uu66uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,u,u,6,6,u,u]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u>
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -1699,14 +1628,14 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_103245uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,u,u]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u>
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -1714,14 +1643,14 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_1133uu67:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,u,u,6,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7>
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x i32> %shuffle
@@ -1729,14 +1658,14 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_0uu354uu:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,u,u,3,5,4,u,u]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u>
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -1744,14 +1673,14 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_uuu3uu66:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,u,3,u,u,6,6]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
-; AVX2OR512VL-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <u,u,u,3,u,u,6,6>
+; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x i32> %shuffle
@@ -1759,7 +1688,7 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_6caa87e5:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
@@ -1768,15 +1697,15 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_6caa87e5:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,3,2]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,0,3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_6caa87e5:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [14,4,2,2,0,15,6,13]
; AVX512VL-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0
@@ -1787,37 +1716,31 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_32103210:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_32103210:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_32103210:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_32103210:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_76547654:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_76547654:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
@@ -1825,95 +1748,89 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_76543210:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_76543210:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_3210ba98(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_3210ba98:
-; AVX1: # BB#0:
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_3210ba98:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_3210ba98:
+; ALL: # %bb.0:
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 11, i32 10, i32 9, i32 8>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_3210fedc(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_3210fedc:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_3210fedc:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_7654fedc(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_7654fedc:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_7654fedc:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_7654fedc:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: retq
+; AVX512VL-LABEL: shuffle_v8i32_7654fedc:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_fedc7654(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_fedc7654:
-; AVX1: # BB#0:
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX1-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8i32_fedc7654:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_fedc7654:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: retq
+; AVX512VL-LABEL: shuffle_v8i32_fedc7654:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 15, i32 14, i32 13, i32 12, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_ba987654:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_ba987654:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
@@ -1921,15 +1838,15 @@ define <8 x i32> @shuffle_v8i32_ba987654(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_ba983210:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_ba983210:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
@@ -1937,13 +1854,13 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_zuu8zuuc:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_zuu8zuuc:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 0, i32 undef, i32 undef, i32 8, i32 0, i32 undef, i32 undef, i32 12>
@@ -1952,14 +1869,14 @@ define <8 x i32> @shuffle_v8i32_zuu8zuuc(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_9ubzdefz:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_9ubzdefz:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,ymm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <8 x i32> <i32 9, i32 undef, i32 11, i32 0, i32 13, i32 14, i32 15, i32 0>
@@ -1967,39 +1884,28 @@ define <8 x i32> @shuffle_v8i32_9ubzdefz(<8 x i32> %a) {
}
define <8 x i32> @shuffle_v8i32_80u1b4uu(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_80u1b4uu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_80u1b4uu:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_80u1b4uu:
+; ALL: # %bb.0:
+; ALL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 0, i32 undef, i32 1, i32 12, i32 4, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_uuuu1111:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_uuuu1111:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2OR512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_uuuu1111:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 1, i32 1, i32 1>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_2222uuuu:
-; ALL: # BB#0:
-; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -2007,8 +1913,8 @@ define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_2A3Buuuu:
-; ALL: # BB#0:
-; ALL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL: # %bb.0:
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
@@ -2016,35 +1922,29 @@ define <8 x i32> @shuffle_v8i32_2A3Buuuu(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_44444444:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2-LABEL: shuffle_v8i32_44444444:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i32_44444444:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpbroadcastd %xmm0, %ymm0
-; AVX512VL-NEXT: retq
+; AVX2OR512VL-LABEL: shuffle_v8i32_44444444:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
+; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8i32_44444444_bc:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_44444444_bc:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
@@ -2055,17 +1955,11 @@ define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) {
}
define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_5555uuuu:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_5555uuuu:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_5555uuuu:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
@@ -2073,13 +1967,13 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
; PR32453
define <8 x i32> @shuffle_v8i32_uuuuuu7u(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: shuffle_v8i32_uuuuuu7u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,3,4,5,7,7]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,3,4,5,7,7]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef>
ret <8 x i32> %shuffle
@@ -2087,7 +1981,7 @@ define <8 x i32> @shuffle_v8i32_uuuuuu7u(<8 x i32> %a, <8 x i32> %b) nounwind {
define <8 x float> @splat_mem_v8f32_2(float* %p) {
; ALL-LABEL: splat_mem_v8f32_2:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss (%rdi), %ymm0
; ALL-NEXT: retq
%1 = load float, float* %p
@@ -2098,13 +1992,13 @@ define <8 x float> @splat_mem_v8f32_2(float* %p) {
define <8 x float> @splat_v8f32(<4 x float> %r) {
; AVX1-LABEL: splat_v8f32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: splat_v8f32:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vbroadcastss %xmm0, %ymm0
; AVX2OR512VL-NEXT: retq
%1 = shufflevector <4 x float> %r, <4 x float> undef, <8 x i32> zeroinitializer
@@ -2117,14 +2011,14 @@ define <8 x float> @splat_v8f32(<4 x float> %r) {
define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_z0U2zUz6:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_z0U2zUz6:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 8, i32 0, i32 undef, i32 2, i32 8, i32 undef, i32 8, i32 6>
@@ -2133,14 +2027,14 @@ define <8 x i32> @shuffle_v8i32_z0U2zUz6(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_1U3z5zUU:
-; AVX1: # BB#0:
-; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_1U3z5zUU:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 undef, i32 3, i32 8, i32 5, i32 8, i32 undef, i32 undef>
@@ -2149,13 +2043,13 @@ define <8 x i32> @shuffle_v8i32_1U3z5zUU(<8 x i32> %a) {
define <8 x i32> @shuffle_v8i32_B012F456(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_B012F456:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[1,2],ymm1[4,6],ymm0[5,6]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_B012F456:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm1[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 11, i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6>
@@ -2164,13 +2058,13 @@ define <8 x i32> @shuffle_v8i32_B012F456(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_1238567C(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_1238567C:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_1238567C:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3],ymm0[20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 8, i32 5, i32 6, i32 7, i32 12>
@@ -2179,13 +2073,13 @@ define <8 x i32> @shuffle_v8i32_1238567C(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_9AB0DEF4(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_9AB0DEF4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[3,0],ymm0[4,4],ymm1[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm0[2,0],ymm1[5,6],ymm0[6,4]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_9AB0DEF4:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3],ymm1[20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15, i32 4>
@@ -2194,13 +2088,13 @@ define <8 x i32> @shuffle_v8i32_9AB0DEF4(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_389A7CDE(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_389A7CDE:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[1,2],ymm0[4,6],ymm1[5,6]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_389A7CDE:
-; AVX2OR512VL: # BB#0:
+; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27]
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 8, i32 9, i32 10, i32 7, i32 12, i32 13, i32 14>
@@ -2208,36 +2102,26 @@ define <8 x i32> @shuffle_v8i32_389A7CDE(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_30127456:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_30127456:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_30127456:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_12305674:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_12305674:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_12305674:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 0, i32 5, i32 6, i32 7, i32 4>
ret <8 x i32> %shuffle
}
define <8x float> @concat_v2f32_1(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_1:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
@@ -2252,7 +2136,7 @@ entry:
define <8x float> @concat_v2f32_2(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_2:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
@@ -2265,7 +2149,7 @@ entry:
define <8x float> @concat_v2f32_3(<2 x float>* %tmp64, <2 x float>* %tmp65) {
; ALL-LABEL: concat_v2f32_3:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; ALL-NEXT: retq
@@ -2279,7 +2163,7 @@ entry:
define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v8i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: retq
%a = load i32, i32* %ptr
@@ -2290,13 +2174,13 @@ define <8 x i32> @insert_mem_and_zero_v8i32(i32* %ptr) {
define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: concat_v8i32_0123CDEF:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: concat_v8i32_0123CDEF:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2OR512VL-NEXT: retq
%alo = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%bhi = shufflevector <8 x i32> %b, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -2305,10 +2189,15 @@ define <8 x i32> @concat_v8i32_0123CDEF(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
-; ALL-LABEL: concat_v8i32_4567CDEF_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: concat_v8i32_4567CDEF_bc:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512VL-NEXT: retq
%a0hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%a1hi = shufflevector <8 x i32> %a0, <8 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
%bc0hi = bitcast <4 x i32> %a0hi to <2 x i64>
@@ -2320,7 +2209,7 @@ define <8 x i32> @concat_v8i32_4567CDEF_bc(<8 x i32> %a0, <8 x i32> %a1) {
define <8 x float> @concat_v8f32_4567CDEF_bc(<8 x float> %f0, <8 x float> %f1) {
; ALL-LABEL: concat_v8f32_4567CDEF_bc:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
%a0 = bitcast <8 x float> %f0 to <4 x i64>
@@ -2336,7 +2225,7 @@ define <8 x float> @concat_v8f32_4567CDEF_bc(<8 x float> %f0, <8 x float> %f1) {
define <8 x i32> @insert_dup_mem_v8i32(i32* %ptr) {
; ALL-LABEL: insert_dup_mem_v8i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss (%rdi), %ymm0
; ALL-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -2347,7 +2236,7 @@ define <8 x i32> @insert_dup_mem_v8i32(i32* %ptr) {
define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_12345678:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
@@ -2355,14 +2244,14 @@ define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_12345678:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_12345678:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7],ymm1[0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -2371,20 +2260,20 @@ define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_12345670(<8 x i32> %a) {
; AVX1-LABEL: shuffle_v8i32_12345670:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_12345670:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
-; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,3,4,5,6,7,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v8i32_12345670:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: valignd {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,0]
; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 174a487160c7..fc189189eed7 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -15,7 +15,7 @@ define <16 x float> @shuffle_v16f32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
@@ -25,9 +25,9 @@ define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08
define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
-; ALL: # BB#0:
-; ALL-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%tmp0 = bitcast <16 x i32> %a to <16 x float>
%tmp1 = bitcast <16 x i32> %b to <16 x float>
@@ -37,7 +37,7 @@ define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08
define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -46,8 +46,8 @@ define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d
define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d_zz:
-; ALL: # BB#0:
-; ALL-NEXT: vxorps %zmm1, %zmm1, %zmm1
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <16 x i32><i32 0, i32 16, i32 1, i32 16, i32 4, i32 16, i32 5, i32 16, i32 8, i32 16, i32 9, i32 16, i32 12, i32 16, i32 13, i32 16>
@@ -56,17 +56,27 @@ define <16 x float> @shuffle_v16f32_00_zz_01_zz_04_zz_05_zz_08_zz_09_zz_0c_zz_0d
define <16 x float> @shuffle_v16f32_vunpcklps_swap(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_vunpcklps_swap:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[12],zmm0[12],zmm1[13],zmm0[13]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 16, i32 0, i32 17, i32 1, i32 20, i32 4, i32 21, i32 5, i32 24, i32 8, i32 25, i32 9, i32 28, i32 12, i32 29, i32 13>
ret <16 x float> %shuffle
}
+; PR34382
+define <16 x float> @shuffle_v16f32_01_01_03_00_06_04_05_07_08_08_09_09_15_14_14_12(<16 x float> %a0) {
+; ALL-LABEL: shuffle_v16f32_01_01_03_00_06_04_05_07_08_08_09_09_15_14_14_12:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,1,3,0,6,4,5,7,8,8,9,9,15,14,14,12]
+; ALL-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 0, i32 6, i32 4, i32 5, i32 7, i32 8, i32 8, i32 9, i32 9, i32 15, i32 14, i32 14, i32 12>
+ ret <16 x float> %shuffle
+}
+
define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
-; ALL: # BB#0:
-; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; ALL: # %bb.0:
+; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
ret <16 x i32> %shuffle
@@ -74,9 +84,9 @@ define <16 x i32> @shuffle_v16i32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1
define <16 x i32> @shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1d:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; ALL-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; ALL-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %b, <16 x i32><i32 15, i32 16, i32 13, i32 17, i32 11, i32 20, i32 9, i32 21, i32 7, i32 24, i32 5, i32 25, i32 3, i32 28, i32 1, i32 29>
ret <16 x i32> %shuffle
@@ -84,7 +94,7 @@ define <16 x i32> @shuffle_v16i32_zz_10_zz_11_zz_14_zz_15_zz_18_zz_19_zz_1c_zz_1
define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -93,8 +103,8 @@ define <16 x float> @shuffle_v16f32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f
define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz_1f:
-; ALL: # BB#0:
-; ALL-NEXT: vxorps %zmm0, %zmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> zeroinitializer, <16 x float> %b, <16 x i32><i32 0, i32 18, i32 0, i32 19, i32 4, i32 22, i32 4, i32 23, i32 6, i32 26, i32 6, i32 27, i32 8, i32 30, i32 8, i32 31>
@@ -103,7 +113,7 @@ define <16 x float> @shuffle_v16f32_zz_12_zz_13_zz_16_zz_17_zz_1a_zz_1b_zz_1e_zz
define <16 x float> @shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -112,7 +122,7 @@ define <16 x float> @shuffle_v16f32_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14
define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32><i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -121,7 +131,7 @@ define <16 x float> @shuffle_v16f32_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15
define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12_13:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,3,3,4,5,6,6]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 12, i32 13>
@@ -130,7 +140,7 @@ define <16 x float> @shuffle_v16f32_00_01_00_01_06_07_06_07_08_09_10_11_12_13_12
define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4, i32 8, i32 8, i32 10, i32 8, i32 12, i32 12, i32 14, i32 12>
@@ -139,7 +149,7 @@ define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14
define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 12>
@@ -148,7 +158,7 @@ define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu
define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -157,9 +167,9 @@ define <16 x i32> @shuffle_v16i32_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL: # BB#0:
-; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; ALL-NEXT: vpbroadcastd %xmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <16 x i32> %shuffle
@@ -167,8 +177,8 @@ define <16 x i32> @shuffle_v16i32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_0
define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1f:
-; ALL: # BB#0:
-; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; ALL: # %bb.0:
+; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
ret <16 x i32> %shuffle
@@ -176,9 +186,9 @@ define <16 x i32> @shuffle_v16i32_02_12_03_13_06_16_07_17_0a_1a_0b_1b_0e_1e_0f_1
define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_zz:
-; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: vpunpckhdq {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; ALL: # %bb.0:
+; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer, <16 x i32><i32 2, i32 30, i32 3, i32 28, i32 6, i32 26, i32 7, i32 24, i32 10, i32 22, i32 11, i32 20, i32 14, i32 18, i32 15, i32 16>
ret <16 x i32> %shuffle
@@ -186,13 +196,13 @@ define <16 x i32> @shuffle_v16i32_02_zz_03_zz_06_zz_07_zz_0a_zz_0b_zz_0e_zz_0f_z
define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28(<16 x i32> %a, <16 x i32> %b) {
; AVX512F-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,2,3,16,5,6,7,20,9,10,11,24,13,14,15,28]
; AVX512F-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_28:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3],zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19],zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35],zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51]
; AVX512BW-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 16, i32 5, i32 6, i32 7, i32 20, i32 9, i32 10, i32 11, i32 24, i32 13, i32 14, i32 15, i32 28>
@@ -201,7 +211,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_2
define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
@@ -211,9 +221,9 @@ define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<
define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01:
-; ALL: # BB#0:
-; ALL-NEXT: vmovdqa32 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
-; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1>
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
ret <16 x i32> %c
@@ -221,7 +231,7 @@ define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16
define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; ALL-NEXT: retq
@@ -231,7 +241,7 @@ define <16 x i32> @shuffle_v16i32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; ALL-NEXT: retq
@@ -241,7 +251,7 @@ define <16 x float> @shuffle_v16f32_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08
define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x float> %a, <16 x float>* %b) {
; ALL-LABEL: shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2ps (%rdi), %zmm1, %zmm0
; ALL-NEXT: retq
@@ -252,7 +262,7 @@ define <16 x float> @shuffle_v16f32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_
define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18(<16 x i32> %a, <16 x i32>* %b) {
; ALL-LABEL: shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovdqa32 {{.*#+}} zmm1 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24]
; ALL-NEXT: vpermt2d (%rdi), %zmm1, %zmm0
; ALL-NEXT: retq
@@ -262,19 +272,10 @@ define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19
}
define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {
-; AVX512F-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: movw $8, %ax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: movw $8, %ax
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
+; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
+; ALL: # %bb.0:
+; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %c
}
@@ -282,10 +283,10 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
;FIXME: can do better with vpcompress
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; ALL: # BB#0:
-; ALL-NEXT: vextracti32x8 $1, %zmm0, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; ALL-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x i32> %res
@@ -294,14 +295,11 @@ define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
;FIXME: can do better with vpcompress
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
; ALL-LABEL: test_v16i32_0_1_2_12:
-; ALL: # BB#0:
-; ALL-NEXT: vpextrd $1, %xmm0, %eax
-; ALL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm1
-; ALL-NEXT: vpextrd $2, %xmm0, %eax
-; ALL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; ALL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
+; ALL-NEXT: vbroadcastss %xmm1, %xmm1
+; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
@@ -310,7 +308,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
; ALL-LABEL: shuffle_v16f32_extract_256:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovups 32(%rsi), %ymm0
; ALL-NEXT: retq
%ptr_a = bitcast float* %a to <16 x float>*
@@ -322,8 +320,8 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
;FIXME: can do better with vcompressp
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
-; ALL: # BB#0:
-; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm1
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]
@@ -336,11 +334,11 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
;FIXME: can do better with vcompressp
define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_3_6:
-; ALL: # BB#0:
-; ALL-NEXT: vextractf32x4 $1, %zmm0, %xmm1
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,3,3]
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%res = shufflevector <16 x float> %v, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 6>
@@ -349,8 +347,8 @@ define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12:
-; ALL: # BB#0:
-; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12]
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
ret <16 x i32> %c
@@ -358,8 +356,8 @@ define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32>
define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12:
-; ALL: # BB#0:
-; ALL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; ALL-NEXT: retq
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
ret <16 x i32> %c
@@ -367,7 +365,7 @@ define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i3
define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c(<16 x float> %a, <16 x float> %b) {
; ALL-LABEL: shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c_1c:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,1],zmm1[0,0],zmm0[4,5],zmm1[4,4],zmm0[8,9],zmm1[8,8],zmm0[12,13],zmm1[12,12]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
@@ -376,7 +374,7 @@ define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c
define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) {
; ALL-LABEL: insert_mem_and_zero_v16i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT: retq
%a = load i32, i32* %ptr
@@ -388,7 +386,7 @@ define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) {
define <16 x i32> @shuffle_v16i32_0zzzzzzzzzzzzzzz(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_0zzzzzzzzzzzzzzz:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT: retq
@@ -398,7 +396,7 @@ define <16 x i32> @shuffle_v16i32_0zzzzzzzzzzzzzzz(<16 x i32> %a) {
define <16 x float> @shuffle_v16f32_0zzzzzzzzzzzzzzz(<16 x float> %a) {
; ALL-LABEL: shuffle_v16f32_0zzzzzzzzzzzzzzz:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; ALL-NEXT: retq
@@ -408,7 +406,7 @@ define <16 x float> @shuffle_v16f32_0zzzzzzzzzzzzzzz(<16 x float> %a) {
define <16 x i32> @shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_zz:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> zeroinitializer, <16 x i32> %a, <16 x i32> <i32 16, i32 0, i32 17, i32 0, i32 18, i32 0, i32 19, i32 0, i32 20, i32 0, i32 21, i32 0, i32 22, i32 0, i32 23, i32 0>
@@ -417,7 +415,7 @@ define <16 x i32> @shuffle_v16i32_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_23_z
define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
@@ -426,7 +424,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_1
define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00(<16 x i32> %a) {
; ALL-LABEL: shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0>
@@ -435,7 +433,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_0
define <16 x i32> @shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_31:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,3],zmm1[0,3],zmm0[4,7],zmm1[4,7],zmm0[8,11],zmm1[8,11],zmm0[12,15],zmm1[12,15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 3, i32 16, i32 19, i32 4, i32 7, i32 20, i32 23, i32 8, i32 11, i32 24, i32 27, i32 12, i32 15, i32 28, i32 31>
@@ -444,7 +442,7 @@ define <16 x i32> @shuffle_v16i32_00_03_16_19_04_07_20_23_08_11_24_27_12_15_28_3
define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm1[0,0],zmm0[2,3],zmm1[4,4],zmm0[6,7],zmm1[8,8],zmm0[10,11],zmm1[12,12],zmm0[14,15]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 16, i32 16, i32 02, i32 03, i32 20, i32 20, i32 06, i32 07, i32 24, i32 24, i32 10, i32 11, i32 28, i32 28, i32 undef, i32 undef>
@@ -453,7 +451,7 @@ define <16 x i32> @shuffle_v16i32_16_16_02_03_20_20_06_07_24_24_10_11_28_28_uu_u
define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12(<16 x i32> %a, <16 x i32> %b) {
; ALL-LABEL: shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[1,0],zmm1[5,4],zmm0[5,4],zmm1[9,8],zmm0[9,8],zmm1[13,12],zmm0[13,12]
; ALL-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 17, i32 16, i32 01, i32 00, i32 21, i32 20, i32 05, i32 04, i32 25, i32 24, i32 09, i32 08, i32 29, i32 28, i32 13, i32 12>
@@ -462,7 +460,7 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: vbroadcastss %xmm0, %zmm0
; ALL-NEXT: retq
@@ -472,14 +470,14 @@ define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04
define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -492,14 +490,14 @@ define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15
define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -512,13 +510,13 @@ define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15
define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) {
; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
; AVX512BW-NEXT: retq
@@ -530,13 +528,13 @@ define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_1
define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
; AVX512BW-NEXT: retq
@@ -548,7 +546,7 @@ define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_1
define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind {
; ALL-LABEL: test_vshuff32x4_512:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
; ALL-NEXT: retq
%res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
@@ -557,7 +555,7 @@ define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) noun
define <16 x i32> @test_vshufi32x4_512(<16 x i32> %x, <16 x i32> %x1) nounwind {
; ALL-LABEL: test_vshufi32x4_512:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
; ALL-NEXT: retq
%res = shufflevector <16 x i32> %x, <16 x i32> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
@@ -566,7 +564,7 @@ define <16 x i32> @test_vshufi32x4_512(<16 x i32> %x, <16 x i32> %x1) nounwind {
define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1, <16 x float> %y, <16 x i1> %mask) nounwind {
; AVX512F-LABEL: test_vshuff32x4_512_mask:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
@@ -575,7 +573,7 @@ define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1,
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_vshuff32x4_512_mask:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
@@ -588,7 +586,7 @@ define <16 x float> @test_vshuff32x4_512_mask(<16 x float> %x, <16 x float> %x1,
define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x i32> %y, <16 x i1> %mask) nounwind {
; AVX512F-LABEL: test_vshufi32x4_512_mask:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxbd %xmm3, %zmm3
; AVX512F-NEXT: vpslld $31, %zmm3, %zmm3
; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1
@@ -597,7 +595,7 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: test_vshufi32x4_512_mask:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $7, %xmm3, %xmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
; AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
@@ -610,14 +608,14 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x
define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; AVX512F-NEXT: vmovaps %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
@@ -630,14 +628,14 @@ define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_
define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-NEXT: vmovaps %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovaps %zmm2, %zmm0
@@ -650,14 +648,14 @@ define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_
define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -670,14 +668,14 @@ define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21
define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -690,10 +688,10 @@ define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13
define <16 x i32> @mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03(<4 x i32> %a) {
; ALL-LABEL: mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; ALL-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x i32> %res
@@ -701,10 +699,10 @@ define <16 x i32> @mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03
define <16 x float> @mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03(<4 x float> %a) {
; ALL-LABEL: mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_03_00_01_02_03:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; ALL-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x float> %res
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index b8fc27ba5515..3e49957bf85e 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -6,13 +6,13 @@ target triple = "x86_64-unknown-unknown"
define <32 x i16> @shuffle_v32i16(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpbroadcastw %xmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -21,15 +21,15 @@ define <32 x i16> @shuffle_v32i16(<32 x i16> %a) {
define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08:
-; SKX: ## BB#0:
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; SKX: ## %bb.0:
+; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
; SKX-NEXT: vpbroadcastw %xmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
@@ -38,22 +38,22 @@ define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0
define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
-; KNL: ## BB#0:
-; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
-; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255>
-; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0
-; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
-; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
-; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19]
+; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255>
+; KNL-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
+; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31]
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u>
+; KNL-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0>
; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f:
-; SKX: ## BB#0:
-; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31>
; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1, i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 31>
@@ -62,31 +62,25 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
-; KNL: ## BB#0:
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1]
-; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
-; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7]
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5
-; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7]
-; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15]
-; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; KNL: ## %bb.0:
+; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7],ymm1[8,9,10,11],ymm2[12,13],ymm1[14],ymm2[15]
+; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,u,u]
+; KNL-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
+; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15]
+; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3
; KNL-NEXT: vpbroadcastw %xmm3, %ymm3
-; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
-; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
+; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17]
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
-; SKX: ## BB#0:
-; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
+; SKX: ## %bb.0:
+; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56]
; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24, i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 56>
@@ -95,13 +89,13 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u:
-; SKX: ## BB#0:
-; SKX-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; SKX: ## %bb.0:
+; SKX-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
@@ -109,13 +103,13 @@ define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i1
define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u:
-; SKX: ## BB#0:
-; SKX-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; SKX: ## %bb.0:
+; SKX-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <32 x i16> %c
@@ -123,13 +117,13 @@ define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x
define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpsrld $16, %ymm0, %ymm0
; KNL-NEXT: vpsrld $16, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpsrld $16, %zmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 34, i32 3, i32 34, i32 5, i32 34, i32 7, i32 34, i32 9, i32 34, i32 11, i32 34, i32 13, i32 34, i32 15, i32 34, i32 17, i32 34, i32 19, i32 34, i32 21, i32 34, i32 23, i32 34, i32 25, i32 34, i32 27, i32 34, i32 29, i32 34, i32 31, i32 34>
@@ -138,13 +132,13 @@ define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_2
define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpslld $16, %ymm0, %ymm0
; KNL-NEXT: vpslld $16, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpslld $16, %zmm0, %zmm0
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 34, i32 0, i32 34, i32 2, i32 34, i32 4, i32 34, i32 6, i32 34, i32 8, i32 34, i32 10, i32 34, i32 12, i32 34, i32 14, i32 34, i32 16, i32 34, i32 18, i32 34, i32 20, i32 34, i32 22, i32 34, i32 24, i32 34, i32 26, i32 34, i32 28, i32 34, i32 30>
@@ -153,13 +147,13 @@ define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z
define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
@@ -168,13 +162,13 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_1
define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
; SKX-NEXT: retq
%c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
@@ -183,7 +177,7 @@ define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18
define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12]
; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15]
@@ -191,7 +185,7 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
; SKX-NEXT: retq
@@ -201,15 +195,15 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF
; KNL-NEXT: vmovd %eax, %xmm1
; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl $1, %eax
; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
@@ -220,13 +214,13 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a
define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
; KNL-LABEL: insert_dup_mem_v32i16_i32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_i32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movl (%rdi), %eax
; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
@@ -239,7 +233,7 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) {
define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
; KNL-LABEL: insert_dup_mem_v32i16_sext_i16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movswl (%rdi), %eax
; KNL-NEXT: vmovd %eax, %xmm0
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
@@ -247,7 +241,7 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movswl (%rdi), %eax
; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
@@ -261,13 +255,13 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 {
; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movzwl 2(%rdi), %eax
; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
@@ -280,13 +274,13 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 {
define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: movzwl 2(%rdi), %eax
; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
@@ -299,7 +293,7 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -307,7 +301,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_z
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 0, i32 0, i32 33, i32 0, i32 0, i32 0, i32 34, i32 0, i32 0, i32 0, i32 35, i32 0, i32 0, i32 0, i32 36, i32 0, i32 0, i32 0, i32 37, i32 0, i32 0, i32 0, i32 38, i32 0, i32 0, i32 0, i32 39, i32 0, i32 0, i32 0>
@@ -316,7 +310,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_z
define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -324,7 +318,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_z
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
-; SKX: ## BB#0:
+; SKX: ## %bb.0:
; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 32, i32 0, i32 33, i32 0, i32 34, i32 0, i32 35, i32 0, i32 36, i32 0, i32 37, i32 0, i32 38, i32 0, i32 39, i32 0, i32 40, i32 0, i32 41, i32 0, i32 42, i32 0, i32 43, i32 0, i32 44, i32 0, i32 45, i32 0, i32 46, i32 0, i32 47, i32 0>
@@ -333,7 +327,7 @@ define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_z
define <8 x i16> @pr32967(<32 x i16> %v) {
; KNL-LABEL: pr32967:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2
; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
@@ -350,25 +344,11 @@ define <8 x i16> @pr32967(<32 x i16> %v) {
; KNL-NEXT: retq
;
; SKX-LABEL: pr32967:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrw $5, %xmm0, %eax
-; SKX-NEXT: vpextrw $1, %xmm0, %ecx
-; SKX-NEXT: vmovd %ecx, %xmm1
-; SKX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm2
-; SKX-NEXT: vpextrw $1, %xmm2, %eax
-; SKX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; SKX-NEXT: vpextrw $5, %xmm2, %eax
-; SKX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; SKX-NEXT: vpextrw $1, %xmm2, %eax
-; SKX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
-; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; SKX-NEXT: vpextrw $5, %xmm0, %eax
-; SKX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0
+; SKX: ## %bb.0:
+; SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u>
+; SKX-NEXT: vpermi2w %ymm2, %ymm0, %ymm1
+; SKX-NEXT: vmovdqa %xmm1, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 9dca3191e06b..cff6892caeeb 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1,55 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VBMI
-
-target triple = "x86_64-unknown-unknown"
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VBMI
define <64 x i8> @shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u(<64 x i8> %a) {
-; AVX512F-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
-; AVX512VBMI: # BB#0:
-; AVX512VBMI-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512VBMI-NEXT: retq
+; ALL-LABEL: shuffle_v64i8_02_03_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u:
+; ALL: # %bb.0:
+; ALL-NEXT: vpsrld $16, %xmm0, %xmm0
+; ALL-NEXT: retq
%b = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <64 x i8> %b
}
define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm0 = zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: vpslldq {{.*#+}} ymm1 = zero,ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_zz_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_zz_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_zz_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpslldq {{.*#+}} zmm0 = zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
@@ -58,24 +41,24 @@ define <64 x i8> @shuffle_v64i8_zz_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512F-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_18_19_20_21_22_23_24_25_26_27_28_29_30_31_zz_zz_34_35_36_37_38_39_40_41_42_43_44_45_46_47_zz_zz_50_51_52_53_54_55_56_57_58_59_60_61_62_63_zz_zz:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpsrldq {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zmm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zmm0[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zmm0[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64>
@@ -84,24 +67,24 @@ define <64 x i8> @shuffle_v64i8_02_03_04_05_06_07_08_09_10_11_12_13_14_15_zz_zz_
define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_95_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_111_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zmm1[31],zmm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30],zmm1[47],zmm0[32,33,34,35,36,37,38,39,40,41,42,43,44,45,46],zmm1[63],zmm0[48,49,50,51,52,53,54,55,56,57,58,59,60,61,62]
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
@@ -111,28 +94,28 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movl $255, %eax
; AVX512F-NEXT: vmovd %eax, %xmm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movl $255, %eax
; AVX512DQ-NEXT: vmovd %eax, %xmm1
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
@@ -141,24 +124,24 @@ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -167,34 +150,34 @@ define <64 x i8> @shuffle_v64i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48]
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_47_46_45_44_43_42_41_40_39_38_37_36_35_34_33_32_31_30_29_28_27_26_25_24_23_22_21_20_19_18_17_16_15_14_13_12_11_10_09_08_07_06_05_04_03_02_01_00:
-; AVX512VBMI: # BB#0:
-; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -203,24 +186,24 @@ define <64 x i8> @shuffle_v64i8_63_62_61_60_59_58_57_56_55_54_53_52_51_50_49_48_
define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_mem_v64i8_i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_mem_v64i8_i32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_i32:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -232,24 +215,24 @@ define <64 x i8> @insert_dup_mem_v64i8_i32(i32* %ptr) {
define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-LABEL: insert_dup_mem_v64i8_sext_i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_mem_v64i8_sext_i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_mem_v64i8_sext_i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_mem_v64i8_sext_i8:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastb (%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i8, i8* %ptr, align 1
@@ -262,24 +245,24 @@ define <64 x i8> @insert_dup_mem_v64i8_sext_i8(i8* %ptr) {
define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb 1(%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_i32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb 1(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_i32:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastb 1(%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -291,24 +274,24 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_i32(i32* %ptr) {
define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) {
; AVX512F-LABEL: insert_dup_elt3_mem_v64i8_i32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt3_mem_v64i8_i32:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb 3(%rdi), %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_elt3_mem_v64i8_i32:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb 3(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt3_mem_v64i8_i32:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpbroadcastb 3(%rdi), %zmm0
; AVX512VBMI-NEXT: retq
%tmp = load i32, i32* %ptr, align 4
@@ -320,7 +303,7 @@ define <64 x i8> @insert_dup_elt3_mem_v64i8_i32(i32* %ptr) {
define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movsbl (%rdi), %eax
; AVX512F-NEXT: shrl $8, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
@@ -329,14 +312,14 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movsbl (%rdi), %eax
; AVX512BW-NEXT: shrl $8, %eax
; AVX512BW-NEXT: vpbroadcastb %eax, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movsbl (%rdi), %eax
; AVX512DQ-NEXT: shrl $8, %eax
; AVX512DQ-NEXT: vmovd %eax, %xmm0
@@ -345,7 +328,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: insert_dup_elt1_mem_v64i8_sext_i8:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movsbl (%rdi), %eax
; AVX512VBMI-NEXT: shrl $8, %eax
; AVX512VBMI-NEXT: vpbroadcastb %eax, %zmm0
@@ -360,7 +343,7 @@ define <64 x i8> @insert_dup_elt1_mem_v64i8_sext_i8(i8* %ptr) {
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512F-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
@@ -368,12 +351,12 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX512DQ-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
@@ -381,7 +364,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_66_zz_zz_zz_zz_zz_zz_zz_67_zz_zz_zz_zz_zz_zz_zz_68_zz_zz_zz_zz_zz_zz_zz_69_zz_zz_zz_zz_zz_zz_zz_70_zz_zz_zz_zz_zz_zz_zz_71_zz_zz_zz_zz_zz_zz_zz:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -390,7 +373,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_zz_zz_zz_zz_65_zz_zz_zz_zz_zz_zz_zz_
define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -398,12 +381,12 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -411,7 +394,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_68_zz_zz_zz_69_zz_zz_zz_70_zz_zz_zz_71_zz_zz_zz_72_zz_zz_zz_73_zz_zz_zz_74_zz_zz_zz_75_zz_zz_zz_76_zz_zz_zz_77_zz_zz_zz_78_zz_zz_zz_79_zz_zz_zz:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 0, i32 0, i32 65, i32 0, i32 0, i32 0, i32 66, i32 0, i32 0, i32 0, i32 67, i32 0, i32 0, i32 0, i32 68, i32 0, i32 0, i32 0, i32 69, i32 0, i32 0, i32 0, i32 70, i32 0, i32 0, i32 0, i32 71, i32 0, i32 0, i32 0, i32 72, i32 0, i32 0, i32 0, i32 73, i32 0, i32 0, i32 0, i32 74, i32 0, i32 0, i32 0, i32 75, i32 0, i32 0, i32 0, i32 76, i32 0, i32 0, i32 0, i32 77, i32 0, i32 0, i32 0, i32 78, i32 0, i32 0, i32 0, i32 79, i32 0, i32 0, i32 0>
@@ -420,7 +403,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_zz_zz_65_zz_zz_zz_66_zz_zz_zz_67_zz_zz_zz_
define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -428,12 +411,12 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -441,7 +424,7 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_72_zz_73_zz_74_zz_75_zz_76_zz_77_zz_78_zz_79_zz_80_zz_81_zz_82_zz_83_zz_84_zz_85_zz_86_zz_87_zz_88_zz_89_zz_90_zz_91_zz_92_zz_93_zz_94_zz_95_zz:
-; AVX512VBMI: # BB#0:
+; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> zeroinitializer, <64 x i8> %a, <64 x i32> <i32 64, i32 0, i32 65, i32 0, i32 66, i32 0, i32 67, i32 0, i32 68, i32 0, i32 69, i32 0, i32 70, i32 0, i32 71, i32 0, i32 72, i32 0, i32 73, i32 0, i32 74, i32 0, i32 75, i32 0, i32 76, i32 0, i32 77, i32 0, i32 78, i32 0, i32 79, i32 0, i32 80, i32 0, i32 81, i32 0, i32 82, i32 0, i32 83, i32 0, i32 84, i32 0, i32 85, i32 0, i32 86, i32 0, i32 87, i32 0, i32 88, i32 0, i32 89, i32 0, i32 90, i32 0, i32 91, i32 0, i32 92, i32 0, i32 93, i32 0, i32 94, i32 0, i32 95, i32 0>
@@ -450,49 +433,49 @@ define <64 x i8> @shuffle_v64i8_64_zz_65_zz_66_zz_67_zz_68_zz_69_zz_70_zz_71_zz_
define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512F-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = <15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u,15,u,13,u,11,u,9,u,7,u,5,u,3,u,1,u>
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm2
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpand %ymm4, %ymm0, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_47_zz_45_zz_43_zz_41_zz_39_zz_37_zz_35_zz_33_zz_31_zz_29_zz_27_zz_25_zz_23_zz_21_zz_19_zz_17_zz_15_zz_13_zz_11_zz_9_zz_7_zz_5_zz_3_zz_1_zz:
-; AVX512VBMI: # BB#0:
-; AVX512VBMI-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,65,61,67,59,69,57,71,55,73,53,75,51,77,49,79,47,81,45,83,43,85,41,87,39,89,37,91,35,93,33,95,31,97,29,99,27,101,25,103,23,105,21,107,19,109,17,111,15,113,13,115,11,117,9,119,7,121,5,123,3,125,1,127]
; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 63, i32 64, i32 61, i32 64, i32 59, i32 64, i32 57, i32 64, i32 55, i32 64, i32 53, i32 64, i32 51, i32 64, i32 49, i32 64, i32 47, i32 64, i32 45, i32 64, i32 43, i32 64, i32 41, i32 64, i32 39, i32 64, i32 37, i32 64, i32 35, i32 64, i32 33, i32 64, i32 31, i32 64, i32 29, i32 64, i32 27, i32 64, i32 25, i32 64, i32 23, i32 64, i32 21, i32 64, i32 19, i32 64, i32 17, i32 64, i32 15, i32 64, i32 13, i32 64, i32 11, i32 64, i32 9, i32 64, i32 7, i32 64, i32 5, i32 64, i32 3, i32 64, i32 1, i32 64>
@@ -501,57 +484,108 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) {
; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512F-NEXT: vpshufb %ymm5, %ymm1, %ymm2
; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpshufb %ymm5, %ymm0, %ymm1
; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512BW-NEXT: vpbroadcastw {{.*}}(%rip), %ymm3
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm2
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126:
-; AVX512VBMI: # BB#0:
-; AVX512VBMI-NEXT: vmovdqu8 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [63,64,61,66,59,68,57,70,55,72,53,74,51,76,49,78,47,80,45,82,43,84,41,86,39,88,37,90,35,92,33,94,31,96,29,98,27,100,25,102,23,104,21,106,19,108,17,110,15,112,13,114,11,116,9,118,7,120,5,122,3,124,1,126]
; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> %b, <64 x i32> <i32 63, i32 64, i32 61, i32 66, i32 59, i32 68, i32 57, i32 70, i32 55, i32 72, i32 53, i32 74, i32 51, i32 76, i32 49, i32 78, i32 47, i32 80, i32 45, i32 82, i32 43, i32 84, i32 41, i32 86, i32 39, i32 88, i32 37, i32 90, i32 35, i32 92, i32 33, i32 94, i32 31, i32 96, i32 29, i32 98, i32 27, i32 100, i32 25, i32 102, i32 23, i32 104, i32 21, i32 106, i32 19, i32 108, i32 17, i32 110, i32 15, i32 112, i32 13, i32 114, i32 11, i32 116, i32 9, i32 118, i32 7, i32 120, i32 5, i32 122, i32 3, i32 124, i32 1, i32 126>
ret <64 x i8> %shuffle
}
+
+define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) {
+; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512F-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512BW-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512BW-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQ-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpsrlw $8, %ymm3, %ymm1
+; AVX512DQ-NEXT: vpsrlw $8, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; AVX512DQ-NEXT: retq
+;
+; AVX512VBMI-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63,65,67,69,71,73,75,77,79,81,83,85,87,89,91,93,95,97,99,101,103,105,107,109,111,113,115,117,119,121,123,125,127]
+; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: retq
+ %1 = lshr <32 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %2 = lshr <32 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = bitcast <32 x i16> %1 to <64 x i8>
+ %4 = bitcast <32 x i16> %2 to <64 x i8>
+ %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
+ ret <64 x i8> %5
+}
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index d0b7e4eb205c..5df15fbe0780 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -4,12 +4,12 @@
define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000000:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00000000:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -18,14 +18,14 @@ define <8 x double> @shuffle_v8f64_00000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22222222(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_22222222:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_22222222:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
@@ -34,13 +34,13 @@ define <8 x double> @shuffle_v8f64_22222222(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_44444444:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_44444444:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm0
; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -50,15 +50,15 @@ define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_44444444_bc(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8f64_44444444_bc:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_44444444_bc:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
%tmp0 = bitcast <8 x i64> %a to <8 x double>
%tmp1 = bitcast <8 x i64> %b to <8 x double>
@@ -68,14 +68,14 @@ define <8 x double> @shuffle_v8f64_44444444_bc(<8 x i64> %a, <8 x i64> %b) {
define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000010:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00000010:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
@@ -84,14 +84,14 @@ define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00000200:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00000200:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
@@ -100,14 +100,14 @@ define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00003000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00003000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
@@ -116,14 +116,14 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00040000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00040000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -132,14 +132,14 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00500000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00500000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -148,14 +148,14 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_06000000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_06000000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -164,17 +164,17 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_70000000:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movl $7, %eax
; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_70000000:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl $7, %eax
; AVX512F-32-NEXT: vmovd %eax, %xmm1
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -182,12 +182,12 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01014545:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01014545:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
@@ -196,14 +196,14 @@ define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00112233:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00112233:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -212,14 +212,14 @@ define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00001111:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00001111:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
@@ -229,12 +229,12 @@ define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_81a3c5e7:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_81a3c5e7:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
@@ -244,13 +244,13 @@ define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_08080808:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08080808:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -261,13 +261,13 @@ define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_08084c4c:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08084c4c:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,4,0,12,0,4,0,12,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -278,14 +278,14 @@ define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_8823cc67:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_8823cc67:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
@@ -297,14 +297,14 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_9832dc76:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_9832dc76:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
@@ -316,14 +316,14 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_9810dc54:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_9810dc54:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
@@ -335,13 +335,13 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_08194c5d:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08194c5d:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,4,0,12,0,5,0,13,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -352,13 +352,13 @@ define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_2a3b6e7f:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_2a3b6e7f:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [2,0,10,0,3,0,11,0,6,0,14,0,7,0,15,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -369,13 +369,13 @@ define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_08192a3b:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08192a3b:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,2,0,10,0,3,0,11,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -386,14 +386,14 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_08991abb:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_08991abb:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
@@ -405,13 +405,13 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_091b2d3f:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_091b2d3f:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [0,0,9,0,1,0,11,0,2,0,13,0,3,0,15,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -422,14 +422,14 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_09ab1def:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_09ab1def:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
@@ -441,12 +441,12 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00014445:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00014445:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
@@ -456,12 +456,12 @@ define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00204464:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00204464:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
@@ -471,12 +471,12 @@ define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_03004744:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_03004744:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
@@ -486,12 +486,12 @@ define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10005444:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10005444:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
@@ -501,12 +501,12 @@ define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_22006644:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_22006644:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
@@ -516,12 +516,12 @@ define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_33307774:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_33307774:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
@@ -531,12 +531,12 @@ define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_32107654:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_32107654:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
@@ -546,12 +546,12 @@ define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00234467:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00234467:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
@@ -561,12 +561,12 @@ define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00224466:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00224466:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -576,12 +576,12 @@ define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10325476:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10325476:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
@@ -591,12 +591,12 @@ define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_11335577:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_11335577:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -606,12 +606,12 @@ define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10235467:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10235467:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
@@ -621,12 +621,12 @@ define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10225466:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10225466:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
@@ -636,14 +636,14 @@ define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00015444:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00015444:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
@@ -653,14 +653,14 @@ define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00204644:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00204644:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
@@ -670,14 +670,14 @@ define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_03004474:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_03004474:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
@@ -687,14 +687,14 @@ define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10004444:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10004444:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
@@ -704,14 +704,14 @@ define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_22006446:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_22006446:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
@@ -721,14 +721,14 @@ define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_33307474:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_33307474:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
@@ -738,14 +738,14 @@ define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_32104567:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_32104567:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
@@ -755,14 +755,14 @@ define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00236744:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00236744:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
@@ -772,14 +772,14 @@ define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00226644:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00226644:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
@@ -789,12 +789,12 @@ define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_10324567:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_10324567:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
@@ -804,12 +804,12 @@ define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_11334567:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_11334567:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,5,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -819,12 +819,12 @@ define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_01235467:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01235467:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
@@ -834,12 +834,12 @@ define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_01235466:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01235466:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
@@ -849,14 +849,14 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_002u6u44:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4>
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_002u6u44:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0>
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0>
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
@@ -866,14 +866,14 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_00uu66uu:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u>
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u>
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u>
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
@@ -883,12 +883,12 @@ define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_103245uu:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_103245uu:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
@@ -898,12 +898,12 @@ define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_1133uu67:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_1133uu67:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[1,1,3,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
@@ -913,12 +913,12 @@ define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_0uu354uu:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_0uu354uu:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
@@ -928,12 +928,12 @@ define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_uuu3uu66:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_uuu3uu66:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
@@ -943,14 +943,14 @@ define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_c348cda0:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8]
; AVX512F-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovapd %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_c348cda0:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0]
; AVX512F-32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovapd %zmm2, %zmm0
@@ -962,13 +962,13 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_f511235a:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [15,5,1,1,2,3,5,10]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_f511235a:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [15,0,5,0,1,0,1,0,2,0,3,0,5,0,10,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -978,15 +978,15 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_1z2z5z6z(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_1z2z5z6z:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovapd {{.*#+}} zmm2 = [1,8,2,8,5,8,6,8]
; AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_1z2z5z6z:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: vmovapd {{.*#+}} zmm2 = [1,0,8,0,2,0,8,0,5,0,8,0,6,0,8,0]
; AVX512F-32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -997,12 +997,12 @@ define <8 x double> @shuffle_v8f64_1z2z5z6z(<8 x double> %a, <8 x double> %b) {
define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00000000:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00000000:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1011,15 +1011,15 @@ define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_44444444(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_44444444:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_44444444:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1027,15 +1027,15 @@ define <8 x i64> @shuffle_v8i64_44444444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_66666666:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_66666666:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512F-32-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1044,15 +1044,15 @@ define <8 x i64> @shuffle_v8i64_66666666(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00000010:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00000010:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i64> %shuffle
@@ -1061,15 +1061,15 @@ define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00000200:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,2,0,0]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00000200:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -1078,15 +1078,15 @@ define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00003000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,3,0,0,0]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00003000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -1095,15 +1095,15 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00040000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00040000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -1112,15 +1112,15 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00500000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00500000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -1129,15 +1129,15 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_06000000:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_06000000:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -1146,14 +1146,14 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_70000000:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movl $7, %eax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_70000000:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl $7, %eax
; AVX512F-32-NEXT: vmovd %eax, %xmm1
; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
@@ -1164,31 +1164,48 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01014545:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01014545:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
}
+define <8 x i64> @shuffle_v8i64_01014545_mem(<8 x i64>* %ptr, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: shuffle_v8i64_01014545_mem:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,1,0,1,4,5,4,5]
+; AVX512F-32-NEXT: retl
+
+ %a = load <8 x i64>, <8 x i64>* %ptr
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
+ ret <8 x i64> %shuffle
+}
+
define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00112233:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00112233:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,0,1,0,2,0,2,0,3,0,3,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i64> %shuffle
@@ -1197,15 +1214,15 @@ define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00001111:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00001111:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i64> %shuffle
@@ -1214,14 +1231,14 @@ define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movb $-86, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movb $-86, %al
; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
@@ -1233,13 +1250,13 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_08080808:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,0,8,0,8,0,8]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08080808:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,0,0,8,0,0,0,8,0]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -1250,13 +1267,13 @@ define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_08084c4c:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,0,8,4,12,4,12]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08084c4c:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,0,0,8,0,4,0,12,0,4,0,12,0]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -1267,14 +1284,14 @@ define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_8823cc67:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_8823cc67:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1286,14 +1303,14 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_9832dc76:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_9832dc76:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1305,14 +1322,14 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_9810dc54:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_9810dc54:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1324,13 +1341,13 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_08194c5d:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,4,12,5,13]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08194c5d:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,4,0,12,0,5,0,13,0]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -1341,13 +1358,13 @@ define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_2a3b6e7f:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,10,3,11,6,14,7,15]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_2a3b6e7f:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,0,10,0,3,0,11,0,6,0,14,0,7,0,15,0]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -1358,13 +1375,13 @@ define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_08192a3b:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,1,9,2,10,3,11]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08192a3b:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,8,0,1,0,9,0,2,0,10,0,3,0,11,0]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -1375,14 +1392,14 @@ define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_08991abb:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_08991abb:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1394,13 +1411,13 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_091b2d3f:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,1,11,2,13,3,15]
; AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_091b2d3f:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,1,0,11,0,2,0,13,0,3,0,15,0]
; AVX512F-32-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
@@ -1411,14 +1428,14 @@ define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_09ab1def:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_09ab1def:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1430,13 +1447,13 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00014445:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00014445:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -1445,13 +1462,13 @@ define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00204464:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00204464:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i64> %shuffle
@@ -1460,13 +1477,13 @@ define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_03004744:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_03004744:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,3,0,0,4,7,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1475,13 +1492,13 @@ define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10005444:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10005444:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1490,13 +1507,13 @@ define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_22006644:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_22006644:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,0,0,6,6,4,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1505,13 +1522,13 @@ define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_33307774:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_33307774:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,3,3,0,7,7,7,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -1520,13 +1537,13 @@ define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_32107654:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_32107654:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -1535,13 +1552,13 @@ define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00234467:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00234467:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,2,3,4,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1550,13 +1567,13 @@ define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00224466:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00224466:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1565,13 +1582,13 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10325476:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10325476:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -1580,13 +1597,13 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_11335577:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_11335577:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
@@ -1595,13 +1612,13 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10235467:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10235467:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,2,3,5,4,6,7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1610,13 +1627,13 @@ define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10225466:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10225466:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,0,2,2,5,4,6,6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1625,15 +1642,15 @@ define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00015444:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,1,5,4,4,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00015444:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,5,0,4,0,4,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1642,15 +1659,15 @@ define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00204644:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,0,4,6,4,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00204644:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,0,0,4,0,6,0,4,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1659,15 +1676,15 @@ define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_03004474:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,3,0,0,4,4,7,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_03004474:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,3,0,0,0,0,0,4,0,4,0,7,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -1676,15 +1693,15 @@ define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10004444:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,4,4,4,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10004444:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,0,0,0,0,4,0,4,0,4,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1693,15 +1710,15 @@ define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_22006446:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [2,2,0,0,6,4,4,6]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_22006446:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [2,0,2,0,0,0,0,0,6,0,4,0,4,0,6,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i64> %shuffle
@@ -1710,15 +1727,15 @@ define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_33307474:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,3,3,0,7,4,7,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_33307474:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,3,0,3,0,0,0,7,0,4,0,7,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -1727,15 +1744,15 @@ define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_32104567:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [3,2,1,0,4,5,6,7]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_32104567:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [3,0,2,0,1,0,0,0,4,0,5,0,6,0,7,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1744,15 +1761,15 @@ define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00236744:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,3,6,7,4,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00236744:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,3,0,6,0,7,0,4,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1761,15 +1778,15 @@ define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00226644:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,2,6,6,4,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00226644:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,2,0,6,0,6,0,4,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1778,15 +1795,15 @@ define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_10324567:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,6,7]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_10324567:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,6,0,7,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1795,15 +1812,15 @@ define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_11334567:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,4,5,6,7]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_11334567:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,4,0,5,0,6,0,7,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1812,15 +1829,15 @@ define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_01235467:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,7]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01235467:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,7,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1829,15 +1846,15 @@ define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_01235466:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,2,3,5,4,6,6]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01235466:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,1,0,2,0,3,0,5,0,4,0,6,0,6,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1846,15 +1863,15 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_002u6u44:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4>
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_002u6u44:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0>
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0>
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1863,15 +1880,15 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_00uu66uu:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u>
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00uu66uu:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u>
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1880,15 +1897,15 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_103245uu:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u>
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_103245uu:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u>
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1897,15 +1914,15 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_1133uu67:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7>
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_1133uu67:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0>
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0>
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1914,15 +1931,15 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_0uu354uu:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u>
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_0uu354uu:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u>
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u>
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1931,15 +1948,15 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_uuu3uu66:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,3,u,u,6,6>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <u,u,u,3,u,u,6,6>
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_uuu3uu66:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0>
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0>
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1948,14 +1965,14 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_6caa87e5:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0]
; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
@@ -1967,12 +1984,12 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
define <8 x double> @shuffle_v8f64_082a4c6e(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_082a4c6e:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_082a4c6e:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32><i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1982,14 +1999,14 @@ define <8 x double> @shuffle_v8f64_082a4c6e(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_0z2z4z6z(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_0z2z4z6z:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_0z2z4z6z:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <8 x i32><i32 0, i32 8, i32 2, i32 8, i32 4, i32 8, i32 6, i32 8>
@@ -1999,13 +2016,13 @@ define <8 x double> @shuffle_v8f64_0z2z4z6z(<8 x double> %a, <8 x double> %b) {
define <8 x i64> @shuffle_v8i64_082a4c6e(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_082a4c6e:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_082a4c6e:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
ret <8 x i64> %shuffle
@@ -2014,15 +2031,15 @@ define <8 x i64> @shuffle_v8i64_082a4c6e(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_z8zazcze(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_z8zazcze:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_z8zazcze:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxord %zmm0, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-32-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> zeroinitializer, <8 x i64> %b, <8 x i32><i32 7, i32 8, i32 5, i32 10, i32 3, i32 12, i32 1, i32 14>
ret <8 x i64> %shuffle
@@ -2031,12 +2048,12 @@ define <8 x i64> @shuffle_v8i64_z8zazcze(<8 x i64> %a, <8 x i64> %b) {
define <8 x double> @shuffle_v8f64_193b5d7f(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_193b5d7f:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_193b5d7f:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2046,14 +2063,14 @@ define <8 x double> @shuffle_v8f64_193b5d7f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_z9zbzdzf(<8 x double> %a, <8 x double> %b) {
;
; AVX512F-LABEL: shuffle_v8f64_z9zbzdzf:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_z9zbzdzf:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> zeroinitializer, <8 x double> %b, <8 x i32><i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
@@ -2063,13 +2080,13 @@ define <8 x double> @shuffle_v8f64_z9zbzdzf(<8 x double> %a, <8 x double> %b) {
define <8 x i64> @shuffle_v8i64_193b5d7f(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_193b5d7f:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_193b5d7f:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
ret <8 x i64> %shuffle
@@ -2078,15 +2095,15 @@ define <8 x i64> @shuffle_v8i64_193b5d7f(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_1z3z5z7z(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_1z3z5z7z:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_1z3z5z7z:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; AVX512F-32-NEXT: vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-32-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> zeroinitializer, <8 x i32><i32 1, i32 8, i32 3, i32 15, i32 5, i32 8, i32 7, i32 15>
ret <8 x i64> %shuffle
@@ -2094,12 +2111,12 @@ define <8 x i64> @shuffle_v8i64_1z3z5z7z(<8 x i64> %a, <8 x i64> %b) {
define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
; AVX512F-LABEL: test_vshuff64x2_512:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
@@ -2108,7 +2125,7 @@ define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) noun
define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
; AVX512F-LABEL: test_vshuff64x2_512_maskz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
@@ -2116,7 +2133,7 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_maskz:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-32-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
@@ -2129,7 +2146,7 @@ define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1
define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
; AVX512F-LABEL: test_vshufi64x2_512_mask:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
@@ -2137,7 +2154,7 @@ define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1>
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshufi64x2_512_mask:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
; AVX512F-32-NEXT: vpsllq $63, %zmm2, %zmm2
; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
@@ -2150,12 +2167,12 @@ define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1>
define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
; AVX512F-LABEL: test_vshuff64x2_512_mem:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
; AVX512F-32-NEXT: retl
@@ -2166,7 +2183,7 @@ define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr
define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
; AVX512F-LABEL: test_vshuff64x2_512_mem_mask:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -2174,7 +2191,7 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double>
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-32-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -2189,7 +2206,7 @@ define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double>
define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
; AVX512F-LABEL: test_vshuff64x2_512_mem_maskz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -2197,7 +2214,7 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double>
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
; AVX512F-32-NEXT: vpsllq $63, %zmm1, %zmm1
; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
@@ -2212,12 +2229,12 @@ define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double>
define <8 x double> @shuffle_v8f64_23014567(<8 x double> %a0, <8 x double> %a1) {
; AVX512F-LABEL: shuffle_v8f64_23014567:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_23014567:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,4,5,6,7]
; AVX512F-32-NEXT: retl
%1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7>
@@ -2226,12 +2243,12 @@ define <8 x double> @shuffle_v8f64_23014567(<8 x double> %a0, <8 x double> %a1)
define <8 x double> @shuffle_v8f64_2301uu67(<8 x double> %a0, <8 x double> %a1) {
; AVX512F-LABEL: shuffle_v8f64_2301uu67:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,0,1,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_2301uu67:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[2,3,0,1,0,1,6,7]
; AVX512F-32-NEXT: retl
%1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 undef, i32 undef, i32 6, i32 7>
@@ -2240,13 +2257,13 @@ define <8 x double> @shuffle_v8f64_2301uu67(<8 x double> %a0, <8 x double> %a1)
define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1) {
; AVX512F-LABEL: shuffle_v8f64_2301uuuu:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_2301uuuu:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x double> %1
@@ -2254,12 +2271,12 @@ define <8 x double> @shuffle_v8f64_2301uuuu(<8 x double> %a0, <8 x double> %a1)
define <8 x double> @shuffle_v8f64_uuu2301(<8 x double> %a0, <8 x double> %a1) {
; AVX512F-LABEL: shuffle_v8f64_uuu2301:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[2,3,0,1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_uuu2301:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[2,3,0,1]
; AVX512F-32-NEXT: retl
%1 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 3, i32 0, i32 1>
@@ -2268,13 +2285,13 @@ define <8 x double> @shuffle_v8f64_uuu2301(<8 x double> %a0, <8 x double> %a1) {
define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
; AVX512F-LABEL: shuffle_v8i64_0zzzzzzz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_0zzzzzzz:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512F-32-NEXT: retl
@@ -2284,13 +2301,13 @@ define <8 x i64> @shuffle_v8i64_0zzzzzzz(<8 x i64> %a) {
define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
; AVX512F-LABEL: shuffle_v8f64_0zzzzzzz:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_0zzzzzzz:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512F-32-NEXT: retl
@@ -2301,12 +2318,12 @@ define <8 x double> @shuffle_v8f64_0zzzzzzz(<8 x double> %a) {
define <8 x i64> @shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_12345678:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_12345678:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm1[0]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -2316,12 +2333,12 @@ define <8 x i64> @shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_12345670(<8 x i64> %a) {
;
; AVX512F-LABEL: shuffle_v8i64_12345670:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_12345670:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
@@ -2331,14 +2348,14 @@ define <8 x i64> @shuffle_v8i64_12345670(<8 x i64> %a) {
define <8 x i64> @mask_shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passthru, i8 %mask) {
;
; AVX512F-LABEL: mask_shuffle_v8i64_12345678:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignq {{.*#+}} zmm2 {%k1} = zmm0[1,2,3,4,5,6,7],zmm1[0]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: mask_shuffle_v8i64_12345678:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: valignq {{.*#+}} zmm2 {%k1} = zmm0[1,2,3,4,5,6,7],zmm1[0]
@@ -2353,14 +2370,14 @@ define <8 x i64> @mask_shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b, <8 x i
define <8 x i64> @mask_shuffle_v8i64_12345670(<8 x i64> %a, <8 x i64> %passthru, i8 %mask) {
;
; AVX512F-LABEL: mask_shuffle_v8i64_12345670:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignq {{.*#+}} zmm1 {%k1} = zmm0[1,2,3,4,5,6,7,0]
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: mask_shuffle_v8i64_12345670:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: valignq {{.*#+}} zmm1 {%k1} = zmm0[1,2,3,4,5,6,7,0]
@@ -2375,13 +2392,13 @@ define <8 x i64> @mask_shuffle_v8i64_12345670(<8 x i64> %a, <8 x i64> %passthru,
define <8 x i64> @maskz_shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
;
; AVX512F-LABEL: maskz_shuffle_v8i64_12345678:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,3,4,5,6,7],zmm1[0]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: maskz_shuffle_v8i64_12345678:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: valignq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,3,4,5,6,7],zmm1[0]
@@ -2395,13 +2412,13 @@ define <8 x i64> @maskz_shuffle_v8i64_12345678(<8 x i64> %a, <8 x i64> %b, i8 %m
define <8 x i64> @maskz_shuffle_v8i64_12345670(<8 x i64> %a, i8 %mask) {
;
; AVX512F-LABEL: maskz_shuffle_v8i64_12345670:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: valignq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,3,4,5,6,7,0]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: maskz_shuffle_v8i64_12345670:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; AVX512F-32-NEXT: kmovw %eax, %k1
; AVX512F-32-NEXT: valignq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,3,4,5,6,7,0]
@@ -2414,12 +2431,12 @@ define <8 x i64> @maskz_shuffle_v8i64_12345670(<8 x i64> %a, i8 %mask) {
define <8 x double> @shuffle_v8f64_012389AB(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_012389AB:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_012389AB:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -2428,12 +2445,12 @@ define <8 x double> @shuffle_v8f64_012389AB(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_89AB0123(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_89AB0123:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_89AB0123:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
@@ -2442,12 +2459,12 @@ define <8 x double> @shuffle_v8f64_89AB0123(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01230123(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01230123:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01230123:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -2456,13 +2473,13 @@ define <8 x double> @shuffle_v8f64_01230123(<8 x double> %a, <8 x double> %b) {
define <8 x i64> @shuffle_v8i64_012389AB(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_012389AB:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_012389AB:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
ret <8 x i64> %shuffle
@@ -2470,13 +2487,13 @@ define <8 x i64> @shuffle_v8i64_012389AB(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_89AB0123(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_89AB0123:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_89AB0123:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
ret <8 x i64> %shuffle
@@ -2484,13 +2501,13 @@ define <8 x i64> @shuffle_v8i64_89AB0123(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01230123:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01230123:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <8 x i64> %shuffle
@@ -2498,12 +2515,12 @@ define <8 x i64> @shuffle_v8i64_01230123(<8 x i64> %a, <8 x i64> %b) {
define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_89234567:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_89234567:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -2512,12 +2529,12 @@ define <8 x double> @shuffle_v8f64_89234567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01894567:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01894567:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
@@ -2526,12 +2543,12 @@ define <8 x double> @shuffle_v8f64_01894567(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01238967:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01238967:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
@@ -2540,12 +2557,12 @@ define <8 x double> @shuffle_v8f64_01238967(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01234589:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01234589:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
@@ -2554,13 +2571,13 @@ define <8 x double> @shuffle_v8f64_01234589(<8 x double> %a, <8 x double> %b) {
define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_89234567:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_89234567:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -2568,13 +2585,13 @@ define <8 x i64> @shuffle_v8i64_89234567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01894567:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01894567:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -2582,13 +2599,13 @@ define <8 x i64> @shuffle_v8i64_01894567(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01238967:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01238967:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -2596,13 +2613,13 @@ define <8 x i64> @shuffle_v8i64_01238967(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_01234589:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01234589:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vinserti32x4 $3, %xmm1, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
ret <8 x i64> %shuffle
@@ -2610,13 +2627,13 @@ define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) {
define <8 x double> @shuffle_v4f64_v8f64_22222222(<4 x double> %a) {
; AVX512F-LABEL: shuffle_v4f64_v8f64_22222222:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v4f64_v8f64_22222222:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-32-NEXT: vbroadcastsd %xmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -2626,17 +2643,17 @@ define <8 x double> @shuffle_v4f64_v8f64_22222222(<4 x double> %a) {
define <8 x i64> @shuffle_v2i64_v8i64_01010101(<2 x i64> %a) {
; AVX512F-LABEL: shuffle_v2i64_v8i64_01010101:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v2i64_v8i64_01010101:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; AVX512F-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <2 x i64> %a, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
ret <8 x i64> %shuffle
@@ -2644,15 +2661,15 @@ define <8 x i64> @shuffle_v2i64_v8i64_01010101(<2 x i64> %a) {
define <8 x double> @shuffle_v2f64_v8f64_01010101(<2 x double> %a) {
; AVX512F-LABEL: shuffle_v2f64_v8f64_01010101:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v2f64_v8f64_01010101:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; AVX512F-32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-32-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-32-NEXT: retl
@@ -2663,7 +2680,7 @@ define <8 x double> @shuffle_v2f64_v8f64_01010101(<2 x double> %a) {
;FIXME: compressp
define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
; AVX512F-LABEL: test_v8f64_2346:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,2]
@@ -2671,7 +2688,7 @@ define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8f64_2346:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,2]
@@ -2684,17 +2701,17 @@ define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
;FIXME: compressp
define <2 x double> @test_v8f64_34 (<8 x double> %v) {
; AVX512F-LABEL: test_v8f64_34:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8f64_34:
-; AVX512F-32: # BB#0:
+; AVX512F-32: # %bb.0:
; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; AVX512F-32-NEXT: vextractf32x4 $1, %zmm0, %xmm0
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
@@ -2705,19 +2722,19 @@ define <2 x double> @test_v8f64_34 (<8 x double> %v) {
; FIXME: vpcompress
define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
; AVX512F-LABEL: test_v8i64_1257:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,3]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,3]
+; AVX512F-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8i64_1257:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX512F-32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,2,3]
-; AVX512F-32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,3]
+; AVX512F-32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-32-NEXT: retl
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 7>
ret <4 x i64> %res
@@ -2725,23 +2742,18 @@ define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
; AVX512F-LABEL: test_v8i64_2_5:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8i64_2_5:
-; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vextracti32x4 $1, %zmm0, %xmm1
-; AVX512F-32-NEXT: vpextrd $1, %xmm1, %eax
-; AVX512F-32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512F-32-NEXT: vpextrd $2, %xmm0, %eax
-; AVX512F-32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpextrd $3, %xmm0, %eax
-; AVX512F-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512F-32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 5>
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index 706edd27a3f1..b066f123861a 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -7,32 +7,32 @@
;expand 128 -> 256 include <4 x float> <2 x double>
define <8 x float> @expand(<4 x float> %a) {
; SKX64-LABEL: expand:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX64-NEXT: movb $5, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand:
-; KNL64: # BB#0:
+; KNL64: # %bb.0:
; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX32-NEXT: movb $5, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
; KNL32-NEXT: retl
%res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
@@ -41,36 +41,36 @@ define <8 x float> @expand(<4 x float> %a) {
define <8 x float> @expand1(<4 x float> %a ) {
; SKX64-LABEL: expand1:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX64-NEXT: movb $-86, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand1:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
; KNL64-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand1:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX32-NEXT: movb $-86, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand1:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
; KNL32-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL32-NEXT: retl
%res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
@@ -80,34 +80,34 @@ define <8 x float> @expand1(<4 x float> %a ) {
;Expand 128 -> 256 test <2 x double> -> <4 x double>
define <4 x double> @expand2(<2 x double> %a) {
; SKX64-LABEL: expand2:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX64-NEXT: movb $9, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand2:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; KNL64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand2:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX32-NEXT: movb $9, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandpd %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand2:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; KNL32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
; KNL32-NEXT: retl
%res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
@@ -117,33 +117,33 @@ define <4 x double> @expand2(<2 x double> %a) {
;expand 128 -> 256 include case <4 x i32> <8 x i32>
define <8 x i32> @expand3(<4 x i32> %a ) {
; SKX64-LABEL: expand3:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX64-NEXT: movb $-127, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand3:
-; KNL64: # BB#0:
-; KNL64-NEXT: vpbroadcastq %xmm0, %ymm0
-; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL64: # %bb.0:
+; KNL64-NEXT: vbroadcastsd %xmm0, %ymm0
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand3:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX32-NEXT: movb $-127, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpexpandd %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand3:
-; KNL32: # BB#0:
-; KNL32-NEXT: vpbroadcastq %xmm0, %ymm0
-; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL32: # %bb.0:
+; KNL32-NEXT: vbroadcastsd %xmm0, %ymm0
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
; KNL32-NEXT: retl
%res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5>
ret <8 x i32> %res
@@ -152,35 +152,35 @@ define <8 x i32> @expand3(<4 x i32> %a ) {
;expand 128 -> 256 include case <2 x i64> <4 x i64>
define <4 x i64> @expand4(<2 x i64> %a ) {
; SKX64-LABEL: expand4:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX64-NEXT: movb $9, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand4:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; KNL64-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand4:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX32-NEXT: movb $9, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpexpandq %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand4:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; KNL32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; KNL32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; KNL32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
+; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; KNL32-NEXT: retl
%res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
ret <4 x i64> %res
@@ -189,30 +189,30 @@ define <4 x i64> @expand4(<2 x i64> %a ) {
;Negative test for 128-> 256
define <8 x float> @expand5(<4 x float> %a ) {
; SKX64-LABEL: expand5:
-; SKX64: # BB#0:
+; SKX64: # %bb.0:
; SKX64-NEXT: vbroadcastss %xmm0, %ymm0
-; SKX64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand5:
-; KNL64: # BB#0:
+; KNL64: # %bb.0:
; KNL64-NEXT: vbroadcastss %xmm0, %ymm0
-; KNL64-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand5:
-; SKX32: # BB#0:
+; SKX32: # %bb.0:
; SKX32-NEXT: vbroadcastss %xmm0, %ymm0
-; SKX32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand5:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vbroadcastss %xmm0, %ymm0
-; KNL32-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; KNL32-NEXT: retl
%res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
@@ -222,25 +222,25 @@ define <8 x float> @expand5(<4 x float> %a ) {
;expand 256 -> 512 include <8 x float> <16 x float>
define <8 x float> @expand6(<4 x float> %a ) {
; SKX64-LABEL: expand6:
-; SKX64: # BB#0:
+; SKX64: # %bb.0:
; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand6:
-; KNL64: # BB#0:
+; KNL64: # %bb.0:
; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand6:
-; SKX32: # BB#0:
+; SKX32: # %bb.0:
; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand6:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; KNL32-NEXT: retl
@@ -250,32 +250,32 @@ define <8 x float> @expand6(<4 x float> %a ) {
define <16 x float> @expand7(<8 x float> %a) {
; SKX64-LABEL: expand7:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX64-NEXT: movw $1285, %ax # imm = 0x505
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand7:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL64-NEXT: movw $1285, %ax # imm = 0x505
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand7:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX32-NEXT: movw $1285, %ax # imm = 0x505
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand7:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL32-NEXT: movw $1285, %ax # imm = 0x505
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
@@ -286,32 +286,32 @@ define <16 x float> @expand7(<8 x float> %a) {
define <16 x float> @expand8(<8 x float> %a ) {
; SKX64-LABEL: expand8:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand8:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand8:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand8:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vexpandps %zmm0, %zmm0 {%k1} {z}
@@ -323,32 +323,32 @@ define <16 x float> @expand8(<8 x float> %a ) {
;expand 256 -> 512 include <4 x double> <8 x double>
define <8 x double> @expand9(<4 x double> %a) {
; SKX64-LABEL: expand9:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX64-NEXT: movb $-127, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand9:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL64-NEXT: movb $-127, %al
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand9:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX32-NEXT: movb $-127, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand9:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL32-NEXT: movb $-127, %al
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vexpandpd %zmm0, %zmm0 {%k1} {z}
@@ -359,32 +359,32 @@ define <8 x double> @expand9(<4 x double> %a) {
define <16 x i32> @expand10(<8 x i32> %a ) {
; SKX64-LABEL: expand10:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand10:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand10:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand10:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vpexpandd %zmm0, %zmm0 {%k1} {z}
@@ -395,32 +395,32 @@ define <16 x i32> @expand10(<8 x i32> %a ) {
define <8 x i64> @expand11(<4 x i64> %a) {
; SKX64-LABEL: expand11:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX64-NEXT: movb $-127, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand11:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL64-NEXT: movb $-127, %al
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand11:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX32-NEXT: movb $-127, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand11:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL32-NEXT: movb $-127, %al
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
@@ -432,37 +432,37 @@ define <8 x i64> @expand11(<4 x i64> %a) {
;Negative test for 256-> 512
define <16 x float> @expand12(<8 x float> %a) {
; SKX64-LABEL: expand12:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX64-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
-; SKX64-NEXT: vxorps %zmm1, %zmm1, %zmm1
+; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; SKX64-NEXT: vmovaps %zmm1, %zmm0
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand12:
-; KNL64: # BB#0:
-; KNL64-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL64: # %bb.0:
+; KNL64-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL64-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
-; KNL64-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; KNL64-NEXT: vmovaps %zmm1, %zmm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand12:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; SKX32-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
-; SKX32-NEXT: vxorps %zmm1, %zmm1, %zmm1
+; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; SKX32-NEXT: vmovaps %zmm1, %zmm0
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand12:
-; KNL32: # BB#0:
-; KNL32-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL32: # %bb.0:
+; KNL32-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; KNL32-NEXT: vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
-; KNL32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; KNL32-NEXT: vmovaps %zmm1, %zmm0
; KNL32-NEXT: retl
@@ -472,26 +472,26 @@ define <16 x float> @expand12(<8 x float> %a) {
define <16 x float> @expand13(<8 x float> %a ) {
; SKX64-LABEL: expand13:
-; SKX64: # BB#0:
-; SKX64-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; SKX64-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX64: # %bb.0:
+; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand13:
-; KNL64: # BB#0:
-; KNL64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL64: # %bb.0:
+; KNL64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand13:
-; SKX32: # BB#0:
-; SKX32-NEXT: vxorps %ymm1, %ymm1, %ymm1
-; SKX32-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX32: # %bb.0:
+; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; SKX32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand13:
-; KNL32: # BB#0:
-; KNL32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; KNL32: # %bb.0:
+; KNL32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; KNL32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; KNL32-NEXT: retl
%res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -502,15 +502,15 @@ define <16 x float> @expand13(<8 x float> %a ) {
define <8 x float> @expand14(<4 x float> %a) {
; SKX64-LABEL: expand14:
-; SKX64: # BB#0:
-; SKX64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX64: # %bb.0:
+; SKX64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX64-NEXT: movb $20, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand14:
-; KNL64: # BB#0:
+; KNL64: # %bb.0:
; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
@@ -519,15 +519,15 @@ define <8 x float> @expand14(<4 x float> %a) {
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand14:
-; SKX32: # BB#0:
-; SKX32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX32: # %bb.0:
+; SKX32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; SKX32-NEXT: movb $20, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vexpandps %ymm0, %ymm0 {%k1} {z}
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand14:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0]
; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
@@ -542,7 +542,7 @@ define <8 x float> @expand14(<4 x float> %a) {
;Negative test.
define <8 x float> @expand15(<4 x float> %a) {
; SKX64-LABEL: expand15:
-; SKX64: # BB#0:
+; SKX64: # %bb.0:
; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
@@ -550,7 +550,7 @@ define <8 x float> @expand15(<4 x float> %a) {
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand15:
-; KNL64: # BB#0:
+; KNL64: # %bb.0:
; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
@@ -559,7 +559,7 @@ define <8 x float> @expand15(<4 x float> %a) {
; KNL64-NEXT: retq
;
; SKX32-LABEL: expand15:
-; SKX32: # BB#0:
+; SKX32: # %bb.0:
; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
@@ -567,7 +567,7 @@ define <8 x float> @expand15(<4 x float> %a) {
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand15:
-; KNL32: # BB#0:
+; KNL32: # %bb.0:
; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0]
; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
@@ -584,21 +584,21 @@ define <8 x float> @expand15(<4 x float> %a) {
define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; SKX64-LABEL: test_mm512_mask_blend_epi8:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movabsq $-6148914691236517206, %rax # imm = 0xAAAAAAAAAAAAAAAA
; SKX64-NEXT: kmovq %rax, %k1
; SKX64-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm512_mask_blend_epi8:
-; KNL64: # BB#0: # %entry
-; KNL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
+; KNL64: # %bb.0: # %entry
+; KNL64-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; KNL64-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
; KNL64-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_epi8:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX32-NEXT: kmovd %eax, %k0
; SKX32-NEXT: kunpckdq %k0, %k0, %k1
@@ -606,18 +606,15 @@ define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm512_mask_blend_epi8:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: pushl %ebp
-; KNL32-NEXT: .Lcfi0:
; KNL32-NEXT: .cfi_def_cfa_offset 8
-; KNL32-NEXT: .Lcfi1:
; KNL32-NEXT: .cfi_offset %ebp, -8
; KNL32-NEXT: movl %esp, %ebp
-; KNL32-NEXT: .Lcfi2:
; KNL32-NEXT: .cfi_def_cfa_register %ebp
; KNL32-NEXT: andl $-32, %esp
; KNL32-NEXT: subl $32, %esp
-; KNL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm3
+; KNL32-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; KNL32-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; KNL32-NEXT: vpblendvb %ymm3, 8(%ebp), %ymm1, %ymm1
; KNL32-NEXT: movl %ebp, %esp
@@ -630,34 +627,31 @@ entry:
define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
; SKX64-LABEL: test_mm512_mask_blend_epi16:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm512_mask_blend_epi16:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
; KNL64-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_epi16:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm512_mask_blend_epi16:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: pushl %ebp
-; KNL32-NEXT: .Lcfi3:
; KNL32-NEXT: .cfi_def_cfa_offset 8
-; KNL32-NEXT: .Lcfi4:
; KNL32-NEXT: .cfi_offset %ebp, -8
; KNL32-NEXT: movl %esp, %ebp
-; KNL32-NEXT: .Lcfi5:
; KNL32-NEXT: .cfi_def_cfa_register %ebp
; KNL32-NEXT: andl $-32, %esp
; KNL32-NEXT: subl $32, %esp
@@ -673,28 +667,28 @@ entry:
define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
; SKX64-LABEL: test_mm512_mask_blend_epi32:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm512_mask_blend_epi32:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_epi32:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm512_mask_blend_epi32:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
@@ -706,28 +700,28 @@ entry:
define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
; SKX64-LABEL: test_mm512_mask_blend_epi64:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movb $-86, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm512_mask_blend_epi64:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: movb $-86, %al
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_epi64:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movb $-86, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm512_mask_blend_epi64:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: movb $-86, %al
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
@@ -739,28 +733,28 @@ entry:
define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
; SKX64-LABEL: test_mm512_mask_blend_ps:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm512_mask_blend_ps:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_ps:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm512_mask_blend_ps:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
@@ -772,28 +766,28 @@ entry:
define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
; SKX64-LABEL: test_mm512_mask_blend_pd:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movb $-88, %al
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm512_mask_blend_pd:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: movb $-88, %al
; KNL64-NEXT: kmovw %eax, %k1
; KNL64-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm512_mask_blend_pd:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movb $-88, %al
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm512_mask_blend_pd:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: movb $-88, %al
; KNL32-NEXT: kmovw %eax, %k1
; KNL32-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
@@ -806,27 +800,27 @@ entry:
define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){
; SKX64-LABEL: test_mm256_mask_blend_epi8:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm256_mask_blend_epi8:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; KNL64-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm256_mask_blend_epi8:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm256_mask_blend_epi8:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; KNL32-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; KNL32-NEXT: retl
@@ -837,27 +831,27 @@ entry:
define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
; SKX64-LABEL: test_mm_mask_blend_epi8:
-; SKX64: # BB#0: # %entry
+; SKX64: # %bb.0: # %entry
; SKX64-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX64-NEXT: kmovd %eax, %k1
; SKX64-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; SKX64-NEXT: retq
;
; KNL64-LABEL: test_mm_mask_blend_epi8:
-; KNL64: # BB#0: # %entry
+; KNL64: # %bb.0: # %entry
; KNL64-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; KNL64-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; KNL64-NEXT: retq
;
; SKX32-LABEL: test_mm_mask_blend_epi8:
-; SKX32: # BB#0: # %entry
+; SKX32: # %bb.0: # %entry
; SKX32-NEXT: movw $-21846, %ax # imm = 0xAAAA
; SKX32-NEXT: kmovd %eax, %k1
; SKX32-NEXT: vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
; SKX32-NEXT: retl
;
; KNL32-LABEL: test_mm_mask_blend_epi8:
-; KNL32: # BB#0: # %entry
+; KNL32: # %bb.0: # %entry
; KNL32-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; KNL32-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; KNL32-NEXT: retl
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 1e22fded8c99..e230aa2bc7e8 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512
;
; Combine tests involving AVX target shuffles
@@ -24,11 +24,11 @@ declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>,
define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
%2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
@@ -37,12 +37,12 @@ define <4 x float> @combine_vpermilvar_4f32_identity(<4 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movddup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_movddup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 1, i32 0, i32 1>)
@@ -50,13 +50,13 @@ define <4 x float> @combine_vpermilvar_4f32_movddup(<4 x float> %a0) {
}
define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
; X32-LABEL: combine_vpermilvar_4f32_movddup_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_movddup_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float> *%a0
@@ -66,12 +66,12 @@ define <4 x float> @combine_vpermilvar_4f32_movddup_load(<4 x float> *%a0) {
define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movshdup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_movshdup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 undef, i32 1, i32 3, i32 3>)
@@ -80,12 +80,12 @@ define <4 x float> @combine_vpermilvar_4f32_movshdup(<4 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_movsldup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_movsldup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 undef>)
@@ -94,12 +94,12 @@ define <4 x float> @combine_vpermilvar_4f32_movsldup(<4 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_unpckh:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_unpckh:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 2, i32 2, i32 3, i32 3>)
@@ -108,12 +108,12 @@ define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_unpckl:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_unpckl:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 0, i32 0, i32 1, i32 1>)
@@ -122,11 +122,11 @@ define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) {
define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 undef>)
%2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1>)
@@ -135,12 +135,12 @@ define <8 x float> @combine_vpermilvar_8f32_identity(<8 x float> %a0) {
define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_10326u4u:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_10326u4u:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,6,u,4,u]
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 0, i32 1, i32 2, i32 undef>)
@@ -149,15 +149,35 @@ define <8 x float> @combine_vpermilvar_8f32_10326u4u(<8 x float> %a0) {
}
define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
-; X32-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X32: # BB#0:
-; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X32-NEXT: retl
+; X32-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; X32-AVX1: # %bb.0:
+; X32-AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X32-AVX1-NEXT: retl
;
-; X64-LABEL: combine_vpermilvar_vperm2f128_8f32:
-; X64: # BB#0:
-; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; X64-NEXT: retq
+; X32-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X32-AVX2-NEXT: retl
+;
+; X32-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; X32-AVX512: # %bb.0:
+; X32-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX1-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X64-AVX1-NEXT: retq
+;
+; X64-AVX2-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: combine_vpermilvar_vperm2f128_8f32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; X64-AVX512-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
%2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%3 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %2, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
@@ -166,12 +186,12 @@ define <8 x float> @combine_vpermilvar_vperm2f128_8f32(<8 x float> %a0) {
define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
@@ -182,14 +202,14 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0) {
; X32-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
-; X32: # BB#0:
-; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vperm2f128_vpermilvar_as_vpblendpd:
-; X64: # BB#0:
-; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; X64-NEXT: retq
%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
@@ -200,12 +220,12 @@ define <4 x double> @combine_vperm2f128_vpermilvar_as_vpblendpd(<4 x double> %a0
define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movddup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_movddup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>)
@@ -213,13 +233,13 @@ define <8 x float> @combine_vpermilvar_8f32_movddup(<8 x float> %a0) {
}
define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
; X32-LABEL: combine_vpermilvar_8f32_movddup_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_movddup_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; X64-NEXT: retq
%1 = load <8 x float>, <8 x float> *%a0
@@ -229,12 +249,12 @@ define <8 x float> @combine_vpermilvar_8f32_movddup_load(<8 x float> *%a0) {
define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movshdup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_movshdup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 5, i32 7, i32 7>)
@@ -243,12 +263,12 @@ define <8 x float> @combine_vpermilvar_8f32_movshdup(<8 x float> %a0) {
define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_movsldup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_movsldup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>)
@@ -257,11 +277,11 @@ define <8 x float> @combine_vpermilvar_8f32_movsldup(<8 x float> %a0) {
define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
; X32-LABEL: combine_vpermilvar_2f64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_2f64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 2, i64 0>)
%2 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %1, <2 x i64> <i64 2, i64 0>)
@@ -270,12 +290,12 @@ define <2 x double> @combine_vpermilvar_2f64_identity(<2 x double> %a0) {
define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
; X32-LABEL: combine_vpermilvar_2f64_movddup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_2f64_movddup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: retq
%1 = tail call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> <i64 0, i64 0>)
@@ -284,11 +304,11 @@ define <2 x double> @combine_vpermilvar_2f64_movddup(<2 x double> %a0) {
define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
; X32-LABEL: combine_vpermilvar_4f64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
%2 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %1, <4 x i64> <i64 2, i64 0, i64 2, i64 0>)
@@ -297,12 +317,12 @@ define <4 x double> @combine_vpermilvar_4f64_identity(<4 x double> %a0) {
define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
; X32-LABEL: combine_vpermilvar_4f64_movddup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f64_movddup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; X64-NEXT: retq
%1 = tail call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> <i64 0, i64 0, i64 4, i64 4>)
@@ -311,12 +331,12 @@ define <4 x double> @combine_vpermilvar_4f64_movddup(<4 x double> %a0) {
define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_4stage:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_4stage:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; X64-NEXT: retq
%1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
@@ -328,12 +348,12 @@ define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) {
define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
; X32-LABEL: combine_vpermilvar_8f32_4stage:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_8f32_4stage:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; X64-NEXT: retq
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
@@ -345,12 +365,12 @@ define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) {
define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
; X32-LABEL: combine_vpermilvar_4f32_as_insertps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_4f32_as_insertps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,xmm0[2],zero
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>)
@@ -360,12 +380,12 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) {
define <2 x double> @constant_fold_vpermilvar_pd() {
; X32-LABEL: constant_fold_vpermilvar_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [2.000000e+00,1.000000e+00]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> <double 1.0, double 2.0>, <2 x i64> <i64 2, i64 0>)
@@ -374,12 +394,12 @@ define <2 x double> @constant_fold_vpermilvar_pd() {
define <4 x double> @constant_fold_vpermilvar_pd_256() {
; X32-LABEL: constant_fold_vpermilvar_pd_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_pd_256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [2.000000e+00,1.000000e+00,3.000000e+00,4.000000e+00]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x i64> <i64 2, i64 0, i64 0, i64 2>)
@@ -388,12 +408,12 @@ define <4 x double> @constant_fold_vpermilvar_pd_256() {
define <4 x float> @constant_fold_vpermilvar_ps() {
; X32-LABEL: constant_fold_vpermilvar_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [4.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x i32> <i32 3, i32 0, i32 2, i32 1>)
@@ -402,12 +422,12 @@ define <4 x float> @constant_fold_vpermilvar_ps() {
define <8 x float> @constant_fold_vpermilvar_ps_256() {
; X32-LABEL: constant_fold_vpermilvar_ps_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermilvar_ps_256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1.000000e+00,1.000000e+00,3.000000e+00,2.000000e+00,5.000000e+00,6.000000e+00,6.000000e+00,6.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 1, i32 0, i32 1, i32 1, i32 1>)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 202acbcd3500..8c17978d2373 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -11,13 +11,13 @@ declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>)
define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_pslldq:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_pslldq:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
%2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -26,13 +26,13 @@ define <32 x i8> @combine_pshufb_pslldq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_psrldq:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_psrldq:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
%2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
@@ -41,12 +41,12 @@ define <32 x i8> @combine_pshufb_psrldq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
; X32-LABEL: combine_pshufb_vpermd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_vpermd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT: retq
%tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
@@ -57,12 +57,12 @@ define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) {
define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
; X32-LABEL: combine_pshufb_vpermps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_vpermps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18]
; X64-NEXT: retq
%tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4>)
@@ -73,14 +73,14 @@ define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_and_pshufb:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_and_pshufb:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -90,14 +90,14 @@ define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_and:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_and:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
; X64-NEXT: retq
%1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -107,13 +107,13 @@ define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vperm2i128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X32-NEXT: vpaddq {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permq_pshufb_as_vperm2i128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -127,15 +127,15 @@ define <4 x i64> @combine_permq_pshufb_as_vperm2i128(<4 x i64> %a0) {
define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
; X32-LABEL: combine_as_vpermd:
-; X32: # BB#0:
-; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
-; X32-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
+; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_as_vpermd:
-; X64: # BB#0:
-; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
-; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,4,5,6,7,0,7]
+; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
%2 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6>)
@@ -145,13 +145,13 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) {
define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
; X32-LABEL: combine_as_vpermps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X32-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_as_vpermps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7>
; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
@@ -163,15 +163,15 @@ define <8 x float> @combine_as_vpermps(<8 x float> %a0) {
define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
; X32-LABEL: combine_permq_pshufb_as_vpblendd:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: combine_permq_pshufb_as_vpblendd:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
-; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; X64-NEXT: retq
%1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
%2 = bitcast <4 x i64> %1 to <32 x i8>
@@ -181,12 +181,12 @@ define <32 x i8> @combine_permq_pshufb_as_vpblendd(<4 x i64> %a0) {
define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> zeroinitializer)
@@ -195,14 +195,14 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastb256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vpbroadcastb %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastb256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vpbroadcastb %xmm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -216,12 +216,12 @@ define <32 x i8> @combine_pshufb_as_vpbroadcastb256(<2 x i64> %a) {
define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1>)
@@ -230,14 +230,14 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastw128(<16 x i8> %a) {
define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastw256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vpbroadcastw %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastw256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vpbroadcastw %xmm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -251,13 +251,13 @@ define <32 x i8> @combine_pshufb_as_vpbroadcastw256(<2 x i64> %a) {
define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastd128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastd %xmm0, %xmm0
; X32-NEXT: vpaddb {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastd128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastd %xmm0, %xmm0
; X64-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: retq
@@ -268,15 +268,15 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastd128(<16 x i8> %a) {
define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastd256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vpbroadcastd %xmm0, %ymm0
; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permd_as_vpbroadcastd256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vpbroadcastd %xmm0, %ymm0
; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -288,12 +288,12 @@ define <8 x i32> @combine_permd_as_vpbroadcastd256(<4 x i32> %a) {
define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastq128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastq128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
@@ -302,15 +302,15 @@ define <16 x i8> @combine_pshufb_as_vpbroadcastq128(<16 x i8> %a) {
define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
; X32-LABEL: combine_permd_as_vpbroadcastq256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vpbroadcastq %xmm0, %ymm0
; X32-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permd_as_vpbroadcastq256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vpbroadcastq %xmm0, %ymm0
; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
@@ -322,12 +322,12 @@ define <8 x i32> @combine_permd_as_vpbroadcastq256(<4 x i32> %a) {
define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_pshufb_as_vpbroadcastss128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_vpbroadcastss128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%1 = bitcast <4 x float> %a to <16 x i8>
@@ -338,14 +338,14 @@ define <4 x float> @combine_pshufb_as_vpbroadcastss128(<4 x float> %a) {
define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastss256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permps_as_vpbroadcastss256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -355,14 +355,14 @@ define <8 x float> @combine_permps_as_vpbroadcastss256(<4 x float> %a) {
define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_permps_as_vpbroadcastsd256:
-; X32: # BB#0:
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32: # %bb.0:
+; X32-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permps_as_vpbroadcastsd256:
-; X64: # BB#0:
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64: # %bb.0:
+; X64-NEXT: # kill: def %xmm0 killed %xmm0 def %ymm0
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
@@ -374,12 +374,12 @@ define <4 x double> @combine_permps_as_vpbroadcastsd256(<2 x double> %a) {
define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %xmm0
; X64-NEXT: retq
%1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -389,12 +389,12 @@ define <16 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb128(<16 x i8> %a) {
define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastb %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastb256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastb %xmm0, %ymm0
; X64-NEXT: retq
%1 = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -404,12 +404,12 @@ define <32 x i8> @combine_vpbroadcast_pshufb_as_vpbroadcastb256(<32 x i8> %a) {
define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpbroadcast_pshufb_as_vpbroadcastss128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %xmm0
; X64-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
@@ -421,13 +421,13 @@ define <4 x float> @combine_vpbroadcast_pshufb_as_vpbroadcastss128(<4 x float> %
define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: vbroadcastss %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastss256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: vbroadcastss %xmm0, %ymm0
; X64-NEXT: retq
@@ -438,13 +438,13 @@ define <8 x float> @combine_vpbroadcast_permd_as_vpbroadcastss256(<4 x float> %a
define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double> %a) {
; X32-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: vbroadcastsd %xmm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpbroadcast_permd_as_vpbroadcastsd256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: vbroadcastsd %xmm0, %ymm0
; X64-NEXT: retq
@@ -457,13 +457,13 @@ define <4 x double> @combine_vpbroadcast_permd_as_vpbroadcastsd256(<2 x double>
define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
; X32-LABEL: combine_permd_as_permq:
-; X32: # BB#0:
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; X32: # %bb.0:
+; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X32-NEXT: retl
;
; X64-LABEL: combine_permd_as_permq:
-; X64: # BB#0:
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,1]
+; X64: # %bb.0:
+; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1]
; X64-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 2, i32 3>)
ret <8 x i32> %1
@@ -471,12 +471,12 @@ define <8 x i32> @combine_permd_as_permq(<8 x i32> %a) {
define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
; X32-LABEL: combine_permps_as_permpd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X32-NEXT: retl
;
; X64-LABEL: combine_permps_as_permpd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,0,1]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 0, i32 1, i32 2, i32 3>)
@@ -485,12 +485,12 @@ define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_zext:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; X64-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -501,16 +501,16 @@ define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_zext128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_zext128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X64-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -521,25 +521,25 @@ define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
; X32-AVX2-LABEL: combine_pshufb_as_vzmovl_64:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: combine_pshufb_as_vzmovl_64:
-; X32-AVX512: # BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; X32-AVX512-NEXT: retl
;
; X64-AVX2-LABEL: combine_pshufb_as_vzmovl_64:
-; X64-AVX2: # BB#0:
-; X64-AVX2-NEXT: vxorpd %ymm1, %ymm1, %ymm1
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: combine_pshufb_as_vzmovl_64:
-; X64-AVX512: # BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; X64-AVX512-NEXT: retq
@@ -551,25 +551,25 @@ define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
; X32-AVX2-LABEL: combine_pshufb_as_vzmovl_32:
-; X32-AVX2: # BB#0:
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2: # %bb.0:
+; X32-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: combine_pshufb_as_vzmovl_32:
-; X32-AVX512: # BB#0:
+; X32-AVX512: # %bb.0:
; X32-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX2-LABEL: combine_pshufb_as_vzmovl_32:
-; X64-AVX2: # BB#0:
-; X64-AVX2-NEXT: vxorps %ymm1, %ymm1, %ymm1
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: combine_pshufb_as_vzmovl_32:
-; X64-AVX512: # BB#0:
+; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-AVX512-NEXT: retq
@@ -581,12 +581,12 @@ define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslldq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pslldq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21]
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
@@ -595,12 +595,12 @@ define <32 x i8> @combine_pshufb_as_pslldq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrldq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrldq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
@@ -609,12 +609,12 @@ define <32 x i8> @combine_pshufb_as_psrldq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlw $8, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrlw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlw $8, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128, i8 17, i8 128, i8 19, i8 128, i8 21, i8 128, i8 23, i8 128, i8 25, i8 128, i8 27, i8 128, i8 29, i8 128, i8 31, i8 128>)
@@ -623,12 +623,12 @@ define <32 x i8> @combine_pshufb_as_psrlw(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslld:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpslld $24, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pslld:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpslld $24, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12, i8 128, i8 128, i8 128, i8 16, i8 128, i8 128, i8 128, i8 20, i8 128, i8 128, i8 128, i8 24, i8 128, i8 128, i8 128, i8 28>)
@@ -637,12 +637,12 @@ define <32 x i8> @combine_pshufb_as_pslld(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrlq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpsrlq $40, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrlq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpsrlq $40, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 21, i8 22, i8 23, i8 128, i8 128, i8 128, i8 128, i8 128, i8 29, i8 30, i8 31, i8 128, i8 128, i8 128, i8 128, i8 128>)
@@ -651,12 +651,12 @@ define <32 x i8> @combine_pshufb_as_psrlq(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshuflw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pshuflw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -665,12 +665,12 @@ define <32 x i8> @combine_pshufb_as_pshuflw(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pshufhw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pshufhw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14]
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
@@ -679,12 +679,12 @@ define <32 x i8> @combine_pshufb_as_pshufhw(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_not_as_pshufw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_not_as_pshufw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29]
; X64-NEXT: retq
%res0 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -694,11 +694,11 @@ define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
%2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
@@ -707,14 +707,14 @@ define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpacklo_zero:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_unpacklo_zero:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
@@ -723,14 +723,14 @@ define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_unpackhi_zero:
-; X32: # BB#0:
-; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32: # %bb.0:
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_unpackhi_zero:
-; X64: # BB#0:
-; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X64: # %bb.0:
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
@@ -739,12 +739,12 @@ define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
; X32-LABEL: combine_psrlw_pshufb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_psrlw_pshufb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -755,12 +755,12 @@ define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
; X32-LABEL: combine_pslld_pshufb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pslld_pshufb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%1 = shl <8 x i32> %a0, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
@@ -771,12 +771,12 @@ define <32 x i8> @combine_pslld_pshufb(<8 x i32> %a0) {
define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
; X32-LABEL: combine_psrlq_pshufb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X32-NEXT: retl
;
; X64-LABEL: combine_psrlq_pshufb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[7,6,5,4],zero,zero,zero,zero,ymm0[15,14,13,12],zero,zero,zero,zero,ymm0[23,22,21],zero,zero,zero,zero,ymm0[31,30,29,28],zero
; X64-NEXT: retq
%1 = lshr <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
@@ -787,12 +787,12 @@ define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
; X32-LABEL: combine_unpack_unpack_pshufb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X32-NEXT: retl
;
; X64-LABEL: combine_unpack_unpack_pshufb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
; X64-NEXT: retq
%1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -804,14 +804,88 @@ define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
ret <32 x i8> %6
}
+define <16 x i16> @shuffle_combine_packssdw_pshufb(<8 x i32> %a0) {
+; X32-LABEL: shuffle_combine_packssdw_pshufb:
+; X32: # %bb.0:
+; X32-NEXT: vpsrad $31, %ymm0, %ymm0
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuffle_combine_packssdw_pshufb:
+; X64: # %bb.0:
+; X64-NEXT: vpsrad $31, %ymm0, %ymm0
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,8,9,4,5,0,1,12,13,8,9,4,5,0,1,16,17,20,21,24,25,28,29,28,29,24,25,20,21,16,17]
+; X64-NEXT: retq
+ %1 = ashr <8 x i32> %a0, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
+ %2 = tail call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %1, <8 x i32> %1)
+ %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @shuffle_combine_packsswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
+; X32-LABEL: shuffle_combine_packsswb_pshufb:
+; X32: # %bb.0:
+; X32-NEXT: vpsraw $15, %ymm0, %ymm0
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuffle_combine_packsswb_pshufb:
+; X64: # %bb.0:
+; X64-NEXT: vpsraw $15, %ymm0, %ymm0
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0,30,28,26,24,22,20,18,16,30,28,26,24,22,20,18,16]
+; X64-NEXT: retq
+ %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %2 = ashr <16 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %3 = tail call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %1, <16 x i16> %2)
+ %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+ ret <32 x i8> %4
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1) {
+; X32-LABEL: shuffle_combine_packusdw_pshufb:
+; X32: # %bb.0:
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuffle_combine_packusdw_pshufb:
+; X64: # %bb.0:
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,10,11,6,7,2,3,14,15,10,11,6,7,2,3,18,19,22,23,26,27,30,31,30,31,26,27,22,23,18,19]
+; X64-NEXT: retq
+ %1 = lshr <8 x i32> %a0, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %2 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %1, <8 x i32> %1)
+ %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <32 x i8> @shuffle_combine_packuswb_pshufb(<16 x i16> %a0, <16 x i16> %a1) {
+; X32-LABEL: shuffle_combine_packuswb_pshufb:
+; X32: # %bb.0:
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
+; X32-NEXT: retl
+;
+; X64-LABEL: shuffle_combine_packuswb_pshufb:
+; X64: # %bb.0:
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1,31,29,27,25,23,21,19,17,31,29,27,25,23,21,19,17]
+; X64-NEXT: retq
+ %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = tail call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %1, <16 x i16> %2)
+ %4 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %3, <32 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+ ret <32 x i8> %4
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
; X32-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpbroadcastq %xmm0, %xmm0
; X64-NEXT: retq
@@ -823,12 +897,12 @@ define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
; X32-LABEL: combine_permd_insertion_as_broadcast_v4i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpbroadcastq %xmm0, %ymm0
; X64-NEXT: retq
@@ -840,12 +914,12 @@ define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
define <8 x i32> @constant_fold_permd() {
; X32-LABEL: constant_fold_permd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_permd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,3,2,8,2,6,1]
; X64-NEXT: retq
%1 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
@@ -854,12 +928,12 @@ define <8 x i32> @constant_fold_permd() {
define <8 x float> @constant_fold_permps() {
; X32-LABEL: constant_fold_permps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_permps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [5.000000e+00,7.000000e+00,3.000000e+00,2.000000e+00,8.000000e+00,2.000000e+00,6.000000e+00,1.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x i32> <i32 4, i32 6, i32 2, i32 1, i32 7, i32 1, i32 5, i32 0>)
@@ -868,12 +942,12 @@ define <8 x float> @constant_fold_permps() {
define <32 x i8> @constant_fold_pshufb_256() {
; X32-LABEL: constant_fold_pshufb_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_pshufb_256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250>
; X64-NEXT: retq
%1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <32 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6, i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
@@ -882,26 +956,68 @@ define <32 x i8> @constant_fold_pshufb_256() {
define <32 x i8> @PR27320(<8 x i32> %a0) {
; X32-LABEL: PR27320:
-; X32: # BB#0:
-; X32-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT: vextracti128 $1, %ymm0, %xmm2
-; X32-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7]
-; X32-NEXT: vpor %xmm1, %xmm2, %xmm1
-; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11]
-; X32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; X32-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X32-NEXT: retl
;
; X64-LABEL: PR27320:
-; X64: # BB#0:
-; X64-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[12,13,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm2
-; X64-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,0,1,2,3,3,4,5,6,6,7]
-; X64-NEXT: vpor %xmm1, %xmm2, %xmm1
-; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11]
-; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; X64-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11,28,29,29,30,31,16,16,17,18,19,19,20,21,22,22,23]
; X64-NEXT: retq
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 3, i32 4, i32 5, i32 undef>
%2 = bitcast <8 x i32> %1 to <32 x i8>
%3 = shufflevector <32 x i8> %2, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 16, i32 17, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 23, i32 23, i32 24, i32 25, i32 26, i32 26, i32 27>
ret <32 x i8> %3
}
+
+define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1, <8 x float> %inp2) {
+; X32-AVX2-LABEL: PR34577:
+; X32-AVX2: # %bb.0: # %entry
+; X32-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X32-AVX2-NEXT: vmovapd {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X32-AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; X32-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; X32-AVX2-NEXT: retl
+;
+; X32-AVX512-LABEL: PR34577:
+; X32-AVX512: # %bb.0: # %entry
+; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0>
+; X32-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; X32-AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X32-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X32-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; X32-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; X32-AVX512-NEXT: retl
+;
+; X64-AVX2-LABEL: PR34577:
+; X64-AVX2: # %bb.0: # %entry
+; X64-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; X64-AVX2-NEXT: vmovapd {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X64-AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; X64-AVX2-NEXT: retq
+;
+; X64-AVX512-LABEL: PR34577:
+; X64-AVX512: # %bb.0: # %entry
+; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <1,u,u,u,2,u,5,0>
+; X64-AVX512-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; X64-AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = <u,u,7,2,u,u,3,2>
+; X64-AVX512-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; X64-AVX512-NEXT: retq
+entry:
+ %shuf0 = shufflevector <8 x float> %inp0, <8 x float> %inp2, <8 x i32> <i32 1, i32 10, i32 11, i32 13, i32 2, i32 13, i32 5, i32 0>
+ %sel = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x float> %shuf0, <8 x float> zeroinitializer
+ %shuf1 = shufflevector <8 x float> zeroinitializer, <8 x float> %sel, <8 x i32> <i32 6, i32 11, i32 6, i32 15, i32 12, i32 11, i32 1, i32 3>
+ %shuf2 = shufflevector <8 x float> %inp1, <8 x float> %shuf1, <8 x i32> <i32 15, i32 10, i32 7, i32 2, i32 12, i32 undef, i32 3, i32 2>
+ ret <8 x float> %shuf2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index b68f609fc65d..474fdabfb467 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -27,11 +27,11 @@ declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16
define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_permvar_8f64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8f64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
%res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
@@ -39,7 +39,7 @@ define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double
}
define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8f64_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
@@ -50,7 +50,7 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8f64_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
@@ -65,11 +65,11 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_permvar_8i64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8i64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
@@ -77,7 +77,7 @@ define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
}
define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8i64_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
@@ -88,7 +88,7 @@ define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8i64_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
@@ -103,11 +103,11 @@ define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x
define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermt2var_8f64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x0, <8 x double> %x1, i8 -1)
%res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, <8 x double> %res0, i8 -1)
@@ -115,7 +115,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou
}
define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_vpermt2var_8f64_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
@@ -125,7 +125,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
@@ -139,12 +139,12 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermt2var_8f64_movddup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 undef, i64 undef>, <8 x double> %x0, <8 x double> %x1, i8 -1)
@@ -152,13 +152,13 @@ define <8 x double> @combine_vpermt2var_8f64_movddup(<8 x double> %x0, <8 x doub
}
define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8 x double> %x1) {
; X32-LABEL: combine_vpermt2var_8f64_movddup_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
%x0 = load <8 x double>, <8 x double> *%p0
@@ -167,14 +167,14 @@ define <8 x double> @combine_vpermt2var_8f64_movddup_load(<8 x double> *%p0, <8
}
define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_vpermt2var_8f64_movddup_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8f64_movddup_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT: retq
@@ -184,11 +184,11 @@ define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x
define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_vpermt2var_8i64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
@@ -196,7 +196,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
}
define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X32-LABEL: combine_vpermt2var_8i64_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
@@ -206,7 +206,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
@@ -220,11 +220,11 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x0, <16 x float> %x1, i16 -1)
%res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, <16 x float> %res0, i16 -1)
@@ -232,7 +232,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x f
}
define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
@@ -241,7 +241,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
@@ -255,13 +255,13 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; X64-NEXT: retq
@@ -270,7 +270,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x f
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %zmm2
; X32-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
@@ -279,7 +279,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps (%rdi), %zmm2
; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermi2ps %zmm0, %zmm2, %zmm1
@@ -291,14 +291,14 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
@@ -308,7 +308,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <1
}
define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %zmm2
@@ -318,7 +318,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovaps (%rdi), %zmm2
; X64-NEXT: vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
@@ -332,12 +332,12 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovshdup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 -1)
@@ -345,13 +345,13 @@ define <16 x float> @combine_vpermt2var_16f32_vmovshdup(<16 x float> %x0, <16 x
}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovshdup {{.*#+}} zmm0 = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
@@ -360,13 +360,13 @@ define <16 x float> @combine_vpermt2var_16f32_vmovshdup_load(<16 x float> *%p0,
}
define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT: retq
@@ -376,12 +376,12 @@ define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <
define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 -1)
@@ -389,13 +389,13 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup(<16 x float> %x0, <16 x
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
@@ -404,13 +404,13 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_load(<16 x float> *%p0,
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
@@ -419,14 +419,14 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <
}
define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT: retq
@@ -437,12 +437,12 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *
define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 -1)
@@ -450,13 +450,13 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0, <16 x float> %x1) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT: retq
%x0 = load <16 x float>, <16 x float> *%p0
@@ -465,13 +465,13 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps_load(<16 x float> *%p0,
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT: retq
@@ -480,14 +480,14 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <
}
define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %esi, %k1
; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; X64-NEXT: retq
@@ -498,11 +498,11 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *
define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermt2var_16i32_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i32_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
@@ -510,7 +510,7 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32>
}
define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16i32_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X32-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
@@ -519,7 +519,7 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
@@ -533,11 +533,11 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_32i16_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, <32 x i16> %res0, i32 -1)
@@ -545,20 +545,20 @@ define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16>
}
define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
; X32-LABEL: combine_vpermt2var_32i16_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
-; X32-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X32-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
-; X64-NEXT: vmovdqu16 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
; X64-NEXT: vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x0, <32 x i16> %x1, i32 %m)
@@ -568,11 +568,11 @@ define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x
define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
; X32-LABEL: combine_pshufb_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%select = bitcast <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1> to <64 x i8>
%mask = bitcast <16 x i32> <i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 undef, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051, i32 202182159, i32 134810123, i32 67438087, i32 66051> to <64 x i8>
@@ -582,9 +582,9 @@ define <64 x i8> @combine_pshufb_identity(<64 x i8> %x0) {
}
define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; X32-LABEL: combine_pshufb_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X32-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
; X32-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
@@ -593,9 +593,9 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
-; X64-NEXT: vmovdqu8 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: kmovq %rdi, %k1
; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
@@ -611,12 +611,12 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
; X32-LABEL: combine_permvar_as_vpbroadcastw512:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpbroadcastw %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_as_vpbroadcastw512:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpbroadcastw %xmm0, %zmm0
; X64-NEXT: retq
%1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> zeroinitializer, <32 x i16> undef, i32 -1)
@@ -625,12 +625,12 @@ define <32 x i16> @combine_permvar_as_vpbroadcastw512(<32 x i16> %x0) {
define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
; X32-LABEL: combine_permvar_as_vpbroadcastd512:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastss %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_as_vpbroadcastd512:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastss %xmm0, %zmm0
; X64-NEXT: retq
%1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> zeroinitializer, <16 x i32> undef, i16 -1)
@@ -639,12 +639,12 @@ define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
; X32-LABEL: combine_permvar_as_vpbroadcastq512:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd %xmm0, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_as_vpbroadcastq512:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vbroadcastsd %xmm0, %zmm0
; X64-NEXT: retq
%1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1)
@@ -653,20 +653,20 @@ define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_permvar_8i64_as_permq:
-; X32: # BB#0:
-; X32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; X32: # %bb.0:
+; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8i64_as_permq:
-; X64: # BB#0:
-; X64-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
+; X64: # %bb.0:
+; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT: retq
%1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x i64> %x1, i8 -1)
ret <8 x i64> %1
}
define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8i64_as_permq_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
@@ -674,7 +674,7 @@ define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8i64_as_permq_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -685,12 +685,12 @@ define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x
define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_permvar_8f64_as_permpd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8f64_as_permpd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT: retq
%1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 undef, i64 undef, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
@@ -698,7 +698,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd(<8 x double> %x0, <8 x doubl
}
define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
; X32-LABEL: combine_permvar_8f64_as_permpd_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X32-NEXT: kmovd %eax, %k1
; X32-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
@@ -706,7 +706,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_8f64_as_permpd_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
; X64-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
; X64-NEXT: vmovapd %zmm1, %zmm0
@@ -717,12 +717,12 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x
define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0) {
; X32-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermilvar_16f32_230146759A8BCFDE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,4,6,7,5,9,10,8,11,12,15,13,14]
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 3, i32 2, i32 1, i32 0, i32 2, i32 3, i32 0, i32 1, i32 1, i32 0, i32 3, i32 2>, <16 x float> undef, i16 -1)
@@ -732,12 +732,12 @@ define <16 x float> @combine_vpermilvar_16f32_230146759A8BCFDE(<16 x float> %x0)
define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_pslldq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pslldq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X64-NEXT: retq
%res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>, <64 x i8> undef, i64 -1)
@@ -745,13 +745,13 @@ define <64 x i8> @combine_pshufb_as_pslldq(<64 x i8> %a0) {
}
define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
; X32-LABEL: combine_pshufb_as_pslldq_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pslldq_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq %rdi, %k1
; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53]
; X64-NEXT: retq
@@ -761,12 +761,12 @@ define <64 x i8> @combine_pshufb_as_pslldq_mask(<64 x i8> %a0, i64 %m) {
define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
; X32-LABEL: combine_pshufb_as_psrldq:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrldq:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
%res0 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>, <64 x i8> undef, i64 -1)
@@ -774,13 +774,13 @@ define <64 x i8> @combine_pshufb_as_psrldq(<64 x i8> %a0) {
}
define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
; X32-LABEL: combine_pshufb_as_psrldq_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; X32-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_psrldq_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovq %rdi, %k1
; X64-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT: retq
@@ -790,12 +790,12 @@ define <64 x i8> @combine_pshufb_as_psrldq_mask(<64 x i8> %a0, i64 %m) {
define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
; X32-LABEL: combine_permvar_as_pshuflw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; X32-NEXT: retl
;
; X64-LABEL: combine_permvar_as_pshuflw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15,17,16,19,18,20,21,22,23,25,24,27,26,28,29,30,31]
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
@@ -804,12 +804,12 @@ define <32 x i16> @combine_permvar_as_pshuflw(<32 x i16> %a0) {
define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
; X32-LABEL: combine_pshufb_as_pshufhw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; X32-NEXT: retl
;
; X64-LABEL: combine_pshufb_as_pshufhw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,4,7,6,8,9,10,11,13,12,15,14,16,17,18,19,21,20,23,22,24,25,26,27,29,28,31,30]
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 5, i16 4, i16 7, i16 6, i16 8, i16 9, i16 10, i16 11, i16 13, i16 12, i16 15, i16 14, i16 16, i16 17, i16 18, i16 19, i16 21, i16 20, i16 23, i16 22, i16 24, i16 25, i16 26, i16 27, i16 29, i16 28, i16 31, i16 30>, <32 x i16> undef, i32 -1)
@@ -818,12 +818,12 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) {
; X32-LABEL: combine_vpermi2var_32i16_as_pshufb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i16_as_pshufb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13,18,19,16,17,22,23,20,21,26,27,24,25,30,31,28,29,34,35,32,33,38,39,36,37,42,43,40,41,46,47,44,45,50,51,48,49,54,55,52,53,58,59,56,57,62,63,60,61]
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a0, <32 x i16> <i16 1, i16 0, i16 3, i16 2, i16 4, i16 5, i16 6, i16 7, i16 9, i16 8, i16 11, i16 10, i16 12, i16 13, i16 14, i16 15, i16 17, i16 16, i16 19, i16 18, i16 20, i16 21, i16 22, i16 23, i16 25, i16 24, i16 27, i16 26, i16 28, i16 29, i16 30, i16 31>, <32 x i16> undef, i32 -1)
@@ -833,11 +833,11 @@ define <32 x i16> @combine_vpermi2var_32i16_as_pshufb(<32 x i16> %a0) {
define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_8f64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_8f64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x double> %x1, i8 -1)
%res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %res0, <8 x i64> <i64 7, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x double> %res0, i8 -1)
@@ -846,12 +846,12 @@ define <8 x double> @combine_vpermi2var_8f64_identity(<8 x double> %x0, <8 x dou
define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_8f64_as_shufpd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_shufpd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[1],zmm1[0],zmm0[2],zmm1[2],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
; X64-NEXT: retq
%1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 1, i64 8, i64 2, i64 10, i64 5, i64 13, i64 6, i64 15>, <8 x double> %x1, i8 -1)
@@ -860,11 +860,11 @@ define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x do
define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_vpermi2var_8i64_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_8i64_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> <i64 undef, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, <8 x i64> %x1, i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %res0, <8 x i64> <i64 undef, i64 14, i64 5, i64 12, i64 3, i64 10, i64 1, i64 8>, <8 x i64> %res0, i8 -1)
@@ -873,11 +873,11 @@ define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermi2var_16f32_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_16f32_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>, <16 x float> %x1, i16 -1)
%res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 11, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x float> %res0, i16 -1)
@@ -886,11 +886,11 @@ define <16 x float> @combine_vpermi2var_16f32_identity(<16 x float> %x0, <16 x f
define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermi2var_16i32_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_16i32_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 undef>, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %res0, <16 x i32> <i32 15, i32 30, i32 13, i32 28, i32 undef, i32 26, i32 9, i32 24, i32 7, i32 22, i32 5, i32 20, i32 3, i32 18, i32 1, i32 16>, <16 x i32> %res0, i16 -1)
@@ -899,12 +899,12 @@ define <16 x i32> @combine_vpermi2var_16i32_identity(<16 x i32> %x0, <16 x i32>
define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float> %a0, <16 x float> %a1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16f32_as_unpckhps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vunpckhps {{.*#+}} zmm0 = zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[14],zmm0[14],zmm1[15],zmm0[15]
; X64-NEXT: retq
%res0 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %a0, <16 x i32> <i32 18, i32 2, i32 19, i32 3, i32 22, i32 6, i32 23, i32 7, i32 26, i32 10, i32 27, i32 11, i32 30, i32 14, i32 31, i32 15>, <16 x float> %a1, i16 -1)
@@ -913,13 +913,13 @@ define <16 x float> @combine_vpermt2var_vpermi2var_16f32_as_unpckhps(<16 x float
define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x i32> %a1) {
; X32-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
-; X32: # BB#0:
-; X32-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X32: # %bb.0:
+; X32-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X32-NEXT: retl
;
; X64-LABEL: vpermt2var_vpermi2var_16i32_as_unpckldq:
-; X64: # BB#0:
-; X64-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64: # %bb.0:
+; X64-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
; X64-NEXT: retq
%res0 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %a0, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>, <16 x i32> %a1, i16 -1)
ret <16 x i32> %res0
@@ -927,11 +927,11 @@ define <16 x i32> @vpermt2var_vpermi2var_16i32_as_unpckldq(<16 x i32> %a0, <16 x
define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_32i16_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i16_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <32 x i16> %x1, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %res0, <32 x i16> <i16 63, i16 30, i16 61, i16 28, i16 59, i16 26, i16 57, i16 24, i16 55, i16 22, i16 53, i16 20, i16 51, i16 18, i16 49, i16 16, i16 47, i16 46, i16 13, i16 44, i16 11, i16 42, i16 9, i16 40, i16 7, i16 38, i16 5, i16 36, i16 3, i16 34, i16 1, i16 32>, <32 x i16> %res0, i32 -1)
@@ -940,14 +940,14 @@ define <32 x i16> @combine_vpermi2var_32i16_identity(<32 x i16> %x0, <32 x i16>
define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_8f64_as_vpermpd:
-; X32: # BB#0:
-; X32-NEXT: vmovapd {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
; X32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_8f64_as_vpermpd:
-; X64: # BB#0:
-; X64-NEXT: vmovapd {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%res0 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x double> %x1, i8 -1)
@@ -957,15 +957,15 @@ define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x d
define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
-; X32: # BB#0:
-; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
-; X64: # BB#0:
-; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
-; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%res0 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 3, i64 2, i64 1, i64 0, i64 7, i64 6, i64 5, i64 4>, <8 x i64> %x0, <8 x i64> %x1, i8 -1)
%res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> <i64 12, i64 5, i64 14, i64 7, i64 8, i64 1, i64 10, i64 3>, <8 x i64> %res0, <8 x i64> %res0, i8 -1)
@@ -974,13 +974,13 @@ define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1
define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x float> %x1) {
; X32-LABEL: combine_vpermi2var_16f32_as_vpermps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X32-NEXT: vpermps %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_16f32_as_vpermps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
@@ -991,15 +991,15 @@ define <16 x float> @combine_vpermi2var_16f32_as_vpermps(<16 x float> %x0, <16 x
define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermt2var_16i32_as_vpermd:
-; X32: # BB#0:
-; X32-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
-; X32-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; X32: # %bb.0:
+; X32-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; X32-NEXT: vpermps %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i32_as_vpermd:
-; X64: # BB#0:
-; X64-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
-; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; X64: # %bb.0:
+; X64-NEXT: vmovaps {{.*#+}} zmm1 = [7,7,5,5,3,3,1,1,15,15,13,13,11,11,9,9]
+; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%res0 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>, <16 x i32> %x0, <16 x i32> %x1, i16 -1)
%res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>, <16 x i32> %res0, <16 x i32> %res0, i16 -1)
@@ -1008,14 +1008,14 @@ define <16 x i32> @combine_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32>
define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_32i16_as_permw:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; X32-NEXT: vpermw %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i16_as_permw:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu16 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,16,14,17,13,18,12,19,11,20,10,21,9,22,8,23,7,24,6,25,5,26,4,27,3,28,2,29,1,30,0,31]
; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%res0 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0, i16 31, i16 30, i16 29, i16 28, i16 27, i16 26, i16 25, i16 24, i16 23, i16 22, i16 21, i16 20, i16 19, i16 18, i16 17, i16 16>, <32 x i16> %x1, i32 -1)
@@ -1025,14 +1025,14 @@ define <32 x i16> @combine_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16>
define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %x0, <8 x double> %x1) {
; X32-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovapd {{.*#+}} zmm2 = [4,0,14,0,3,0,12,0,7,0,8,0,0,0,15,0]
; X32-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; X32-NEXT: vmovapd %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_8f64_as_vperm2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovapd {{.*#+}} zmm2 = [4,14,3,12,7,8,0,15]
; X64-NEXT: vpermi2pd %zmm0, %zmm1, %zmm2
; X64-NEXT: vmovapd %zmm2, %zmm0
@@ -1044,13 +1044,13 @@ define <8 x double> @combine_vpermi2var_vpermt2var_8f64_as_vperm2(<8 x double> %
define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0, <16 x i32> %x1) {
; X32-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X32-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_vpermt2var_16i32_as_vpermd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X64-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
; X64-NEXT: retq
@@ -1061,15 +1061,15 @@ define <16 x i32> @combine_vpermi2var_vpermt2var_16i32_as_vpermd(<16 x i32> %x0,
define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0, <32 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X32-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
; X32-NEXT: vmovdqa64 %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_32i16_as_permw:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu16 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [17,39,19,38,21,37,23,36,25,35,27,34,29,33,31,32,1,47,3,46,5,45,7,44,9,43,11,42,13,41,15,40]
; X64-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
; X64-NEXT: vmovdqa64 %zmm2, %zmm0
; X64-NEXT: retq
@@ -1080,15 +1080,15 @@ define <32 x i16> @combine_vpermt2var_vpermi2var_32i16_as_permw(<32 x i16> %x0,
define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double> %x0) {
; X32-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
-; X32: # BB#0:
-; X32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32: # %bb.0:
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vmovapd {{.*#+}} zmm2 = [8,0,3,0,10,0,11,0,1,0,7,0,14,0,5,0]
; X32-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_8f64_as_vperm2_zero:
-; X64: # BB#0:
-; X64-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X64: # %bb.0:
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovapd {{.*#+}} zmm2 = [8,3,10,11,1,7,14,5]
; X64-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
; X64-NEXT: retq
@@ -1099,15 +1099,15 @@ define <8 x double> @combine_vpermi2var_vpermvar_8f64_as_vperm2_zero(<8 x double
define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x float> %x0) {
; X32-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
-; X32: # BB#0:
-; X32-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; X32-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_vpermvar_16f32_as_vperm2_zero:
-; X64: # BB#0:
-; X64-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vmovaps {{.*#+}} zmm2 = [0,13,1,12,4,9,22,12,4,25,26,9,5,29,30,8]
; X64-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
; X64-NEXT: retq
@@ -1118,12 +1118,12 @@ define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x floa
define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
; X32-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
; X64-NEXT: vpbroadcastq %xmm0, %zmm0
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 954dbe5edc63..f55aba61a1c0 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -8,11 +8,11 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16
define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_16i16_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i16_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 30, i16 13, i16 28, i16 11, i16 26, i16 9, i16 24, i16 7, i16 22, i16 5, i16 20, i16 3, i16 18, i16 1, i16 16>, <16 x i16> %res0, <16 x i16> %res0, i16 -1)
@@ -20,20 +20,20 @@ define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16>
}
define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x i16> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16i16_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
-; X32-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
-; X64-NEXT: vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x0, <16 x i16> %x1, i16 %m)
@@ -43,14 +43,14 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x
define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16> %x1) {
; X32-LABEL: combine_vpermi2var_16i16_as_permw:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
; X32-NEXT: vpermw %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_16i16_as_permw:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 0>, <16 x i16> %x1, i16 -1)
@@ -60,14 +60,14 @@ define <16 x i16> @combine_vpermi2var_16i16_as_permw(<16 x i16> %x0, <16 x i16>
define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0, <16 x i16> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X32-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_vperm2:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,31,2,2,4,29,6,27,8,25,10,23,12,21,14,19]
; X64-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> <i16 0, i16 31, i16 2, i16 29, i16 4, i16 27, i16 6, i16 25, i16 8, i16 23, i16 10, i16 21, i16 12, i16 19, i16 14, i16 17>, <16 x i16> %x1, i16 -1)
@@ -77,12 +77,12 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_vperm2(<16 x i16> %x0,
define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpckhwd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15]
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %a0, <16 x i16> <i16 20, i16 4, i16 21, i16 5, i16 22, i16 6, i16 23, i16 7, i16 28, i16 12, i16 29, i16 13, i16 30, i16 14, i16 31, i16 15>, <16 x i16> %a1, i16 -1)
@@ -91,12 +91,12 @@ define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpckhwd(<16 x i16> %a
define <16 x i16> @combine_vpermt2var_vpermi2var_16i16_as_unpcklwd(<16 x i16> %a0, <16 x i16> %a1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i16_as_unpcklwd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; X64-NEXT: retq
%res0 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> <i16 0, i16 16, i16 1, i16 17, i16 2, i16 18, i16 3, i16 19, i16 8, i16 24, i16 9, i16 25, i16 10, i16 26, i16 11, i16 27>, <16 x i16> %a0, <16 x i16> %a1, i16 -1)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index ad6b5ee05494..44add0416f2b 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -23,11 +23,11 @@ declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x
define <16 x i8> @combine_vpermt2var_16i8_identity(<16 x i8> %x0, <16 x i8> %x1) {
; X32-LABEL: combine_vpermt2var_16i8_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i8_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 30, i8 13, i8 28, i8 11, i8 26, i8 9, i8 24, i8 7, i8 22, i8 5, i8 20, i8 3, i8 18, i8 1, i8 16>, <16 x i8> %res0, <16 x i8> %res0, i16 -1)
@@ -35,20 +35,20 @@ define <16 x i8> @combine_vpermt2var_16i8_identity(<16 x i8> %x0, <16 x i8> %x1)
}
define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8> %x1, i16 %m) {
; X32-LABEL: combine_vpermt2var_16i8_identity_mask:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
-; X32-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
-; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
+; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x0, <16 x i8> %x1, i16 %m)
@@ -58,12 +58,12 @@ define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8>
define <16 x i8> @combine_vpermi2var_16i8_as_vpshufb(<16 x i8> %x0, <16 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_16i8_as_vpshufb:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_16i8_as_vpshufb:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,0,14,1,13,2,12,3,11,4,10,5,9,6,8,7]
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> %x1, i16 -1)
@@ -72,14 +72,14 @@ define <16 x i8> @combine_vpermi2var_16i8_as_vpshufb(<16 x i8> %x0, <16 x i8> %x
}
define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_32i8_as_vpermb:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermb %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i8_as_vpermb:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,0,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
@@ -88,14 +88,14 @@ define <32 x i8> @combine_vpermi2var_32i8_as_vpermb(<32 x i8> %x0, <32 x i8> %x1
}
define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_64i8_as_vpermb:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermb %zmm0, %zmm1, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_64i8_as_vpermb:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu8 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0
; X64-NEXT: retq
%res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
@@ -105,18 +105,18 @@ define <64 x i8> @combine_vpermi2var_64i8_as_vpermb(<64 x i8> %x0, <64 x i8> %x1
define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <16 x i8> %x1) {
; X32-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; X32-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
-; X32-NEXT: vmovdqu {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
+; X32-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; X32-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermt2var_vpermi2var_16i8_as_vperm2:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,31,2,29,4,27,6,25,8,23,10,21,12,19,14,17]
; X64-NEXT: vpermi2b %xmm1, %xmm0, %xmm2
-; X64-NEXT: vmovdqu {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
+; X64-NEXT: vmovdqa {{.*#+}} xmm0 = [0,17,2,18,4,19,6,21,8,23,10,25,12,27,14,29]
; X64-NEXT: vpermi2b %xmm2, %xmm2, %xmm0
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> <i8 0, i8 31, i8 2, i8 29, i8 4, i8 27, i8 6, i8 25, i8 8, i8 23, i8 10, i8 21, i8 12, i8 19, i8 14, i8 17>, <16 x i8> %x1, i16 -1)
@@ -125,14 +125,14 @@ define <16 x i8> @combine_vpermt2var_vpermi2var_16i8_as_vperm2(<16 x i8> %x0, <1
}
define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_32i8_as_vperm2:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_32i8_as_vperm2:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; X64-NEXT: retq
%res0 = shufflevector <32 x i8> %x0, <32 x i8> %x1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
@@ -141,14 +141,14 @@ define <32 x i8> @combine_vpermi2var_32i8_as_vperm2(<32 x i8> %x0, <32 x i8> %x1
}
define <64 x i8> @combine_vpermi2var_64i8_as_vperm2(<64 x i8> %x0, <64 x i8> %x1) {
; X32-LABEL: combine_vpermi2var_64i8_as_vperm2:
-; X32: # BB#0:
-; X32-NEXT: vmovdqu8 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X32: # %bb.0:
+; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X32-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermi2var_64i8_as_vperm2:
-; X64: # BB#0:
-; X64-NEXT: vmovdqu8 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
+; X64: # %bb.0:
+; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,80,1,70,2,54,3,49,4,36,5,23,6,18,7,5,0,90,1,100,2,110,3,120,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19,0,32,1,23,2,22,3,21,4,22,5,21,6,20,7,19]
; X64-NEXT: vpermt2b %zmm1, %zmm2, %zmm0
; X64-NEXT: retq
%res0 = shufflevector <64 x i8> %x0, <64 x i8> %x1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 29e2124a168c..27ccdefe4d50 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -10,12 +10,12 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movzx:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_movzx:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 undef, i8 undef, i8 -1, i8 -1, i8 -1, i8 -1>)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
index af69a5ac2283..5da94190ccaf 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse4a.ll
@@ -10,7 +10,7 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) {
; ALL-LABEL: combine_extrqi_pshufb_16i8:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1,2],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -20,7 +20,7 @@ define <16 x i8> @combine_extrqi_pshufb_16i8(<16 x i8> %a0) {
define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) {
; ALL-LABEL: combine_extrqi_pshufb_8i16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -32,18 +32,18 @@ define <8 x i16> @combine_extrqi_pshufb_8i16(<8 x i16> %a0) {
define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_insertqi_pshufb_16i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE42-LABEL: combine_insertqi_pshufb_16i8:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE42-NEXT: retq
;
; AVX-LABEL: combine_insertqi_pshufb_16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 17, i32 18, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -53,18 +53,18 @@ define <16 x i8> @combine_insertqi_pshufb_16i8(<16 x i8> %a0, <16 x i8> %a1) {
define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) {
; SSSE3-LABEL: combine_insertqi_pshufb_8i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: extrq {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE42-LABEL: combine_insertqi_pshufb_8i16:
-; SSE42: # BB#0:
+; SSE42: # %bb.0:
; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE42-NEXT: retq
;
; AVX-LABEL: combine_insertqi_pshufb_8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX-NEXT: retq
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 8, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -76,7 +76,7 @@ define <8 x i16> @combine_insertqi_pshufb_8i16(<8 x i16> %a0, <8 x i16> %a1) {
define <16 x i8> @combine_pshufb_insertqi_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: combine_pshufb_insertqi_pshufb:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0,1],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 02314857c6d7..dbb86624cc37 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -11,12 +11,12 @@ declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
@@ -27,12 +27,12 @@ define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_movq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
@@ -42,28 +42,28 @@ define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movsd:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_movsd:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_as_movsd:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_as_movsd:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_as_movsd:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX512F-NEXT: retq
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
@@ -75,27 +75,27 @@ define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1)
define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movss:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_movss:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_as_movss:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_as_movss:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_as_movss:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX512F-NEXT: retq
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
@@ -107,17 +107,17 @@ define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_as_zext:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_zext:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_zext:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 3, i8 -1, i8 -1, i8 -1>)
@@ -127,12 +127,12 @@ define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
define <2 x double> @combine_pshufb_as_vzmovl_64(<2 x double> %a0) {
; SSE-LABEL: combine_pshufb_as_vzmovl_64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
%1 = bitcast <2 x double> %a0 to <16 x i8>
@@ -143,32 +143,32 @@ define <2 x double> @combine_pshufb_as_vzmovl_64(<2 x double> %a0) {
define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
; SSSE3-LABEL: combine_pshufb_as_vzmovl_32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_vzmovl_32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_as_vzmovl_32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_as_vzmovl_32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_as_vzmovl_32:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX512F-NEXT: retq
@@ -180,12 +180,12 @@ define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movddup:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT: retq
%1 = bitcast <4 x float> %a0 to <16 x i8>
@@ -197,12 +197,12 @@ define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movshdup:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT: retq
%1 = bitcast <4 x float> %a0 to <16 x i8>
@@ -214,12 +214,12 @@ define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movsldup:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT: retq
%1 = bitcast <4 x float> %a0 to <16 x i8>
@@ -231,13 +231,13 @@ define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_palignr:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
%2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
@@ -246,12 +246,12 @@ define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pslldq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
@@ -261,12 +261,12 @@ define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_psrldq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
@@ -276,18 +276,18 @@ define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
; SSSE3-LABEL: combine_and_pshufb:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_and_pshufb:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_and_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
@@ -298,18 +298,18 @@ define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_and(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_and:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_and:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_and:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
@@ -320,12 +320,12 @@ define <16 x i8> @combine_pshufb_and(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_palignr:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_palignr:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
@@ -334,12 +334,12 @@ define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
@@ -348,12 +348,12 @@ define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
@@ -362,12 +362,12 @@ define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlw:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128>)
@@ -376,12 +376,12 @@ define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslld:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $24, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslld:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpslld $24, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12>)
@@ -390,12 +390,12 @@ define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlq:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $40, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlq:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $40, %xmm0, %xmm0
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128>)
@@ -404,12 +404,12 @@ define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -418,12 +418,12 @@ define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
@@ -432,12 +432,12 @@ define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_not_as_pshufw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
@@ -447,12 +447,12 @@ define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT: retq
%res0 = load <16 x i8>, <16 x i8> *%a0, align 16
@@ -462,12 +462,12 @@ define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(<16 x i8> *%a0) {
define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
@@ -476,12 +476,12 @@ define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
@@ -490,7 +490,7 @@ define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpacklo_undef:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
%2 = bitcast <16 x i8> %1 to <8 x i16>
@@ -500,7 +500,7 @@ define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
; ALL-LABEL: combine_pshufb_as_unpackhi_undef:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -509,16 +509,16 @@ define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpacklo_zero:
-; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unpacklo_zero:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 6, i8 7>)
ret <16 x i8> %1
@@ -526,13 +526,13 @@ define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpackhi_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unpackhi_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: retq
@@ -542,12 +542,12 @@ define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlw_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
%1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -558,12 +558,12 @@ define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pslld_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT: retq
%1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
@@ -574,12 +574,12 @@ define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlq_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT: retq
%1 = lshr <2 x i64> %a0, <i64 48, i64 48>
@@ -590,12 +590,12 @@ define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -605,13 +605,13 @@ define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT: retq
%1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -621,12 +621,12 @@ define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
; SSE-LABEL: shuffle_combine_unpack_insert:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_unpack_insert:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; AVX-NEXT: retq
%1 = extractelement <8 x i16> %a0, i32 2
@@ -640,14 +640,72 @@ define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
ret <8 x i16> %8
}
+define <16 x i8> @shuffle_combine_packssdw_pshufb(<4 x i32> %a0) {
+; SSE-LABEL: shuffle_combine_packssdw_pshufb:
+; SSE: # %bb.0:
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_combine_packssdw_pshufb:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
+; AVX-NEXT: retq
+ %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
+ %2 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %1)
+ %3 = bitcast <8 x i16> %2 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
+ ret <16 x i8> %4
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @shuffle_combine_packsswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: shuffle_combine_packsswb_pshufb:
+; SSE: # %bb.0:
+; SSE-NEXT: psraw $15, %xmm0
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_combine_packsswb_pshufb:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
+; AVX-NEXT: retq
+ %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %2 = ashr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+ %3 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+ ret <16 x i8> %4
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: shuffle_combine_packuswb_pshufb:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: shuffle_combine_packuswb_pshufb:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
+; AVX-NEXT: retq
+ %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %3 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
+ %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+ ret <16 x i8> %4
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; SSE-NEXT: retq
;
; AVX-LABEL: constant_fold_pshufb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9>
; AVX-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
@@ -657,7 +715,7 @@ define <16 x i8> @constant_fold_pshufb() {
; FIXME - unnecessary pshufb/broadcast being used - pshufb mask only needs lowest byte.
define <16 x i8> @constant_fold_pshufb_2() {
; SSE-LABEL: constant_fold_pshufb_2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $2, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pxor %xmm1, %xmm1
@@ -665,7 +723,7 @@ define <16 x i8> @constant_fold_pshufb_2() {
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_fold_pshufb_2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $2, %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -673,14 +731,14 @@ define <16 x i8> @constant_fold_pshufb_2() {
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_fold_pshufb_2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: movl $2, %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: constant_fold_pshufb_2:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movl $2, %eax
; AVX512F-NEXT: vmovd %eax, %xmm0
; AVX512F-NEXT: vpbroadcastb %xmm0, %xmm0
@@ -688,3 +746,73 @@ define <16 x i8> @constant_fold_pshufb_2() {
%1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
ret <16 x i8> %1
}
+
+define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
+; SSSE3-LABEL: mask_zzz3_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movd %xmm0, %eax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_zzz3_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mask_zzz3_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[14]
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ %3 = extractelement <4 x i32> %2, i32 3
+ %4 = and i32 %3, 4278190080
+ ret i32 %4
+}
+
+define i32 @mask_z1z3_v16i8(<16 x i8> %a0) {
+; SSSE3-LABEL: mask_z1z3_v16i8:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movd %xmm0, %eax
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: mask_z1z3_v16i8:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
+; SSE41-NEXT: pextrd $3, %xmm0, %eax
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: mask_z1z3_v16i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
+; AVX-NEXT: vpextrd $3, %xmm0, %eax
+; AVX-NEXT: retq
+ %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ %3 = extractelement <4 x i32> %2, i32 3
+ %4 = and i32 %3, 4278255360
+ ret i32 %4
+}
+
+define i32 @PR22415(double %a0) {
+; SSE-LABEL: PR22415:
+; SSE: # %bb.0:
+; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE-NEXT: movd %xmm0, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR22415:
+; AVX: # %bb.0:
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
+ %1 = bitcast double %a0 to <8 x i8>
+ %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>
+ %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ %4 = bitcast <3 x i8> %3 to i24
+ %5 = zext i24 %4 to i32
+ ret i32 %5
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index a9dff9164316..83001cf5fb99 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -14,12 +14,12 @@ declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: combine_vpermil2pd_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2pd_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
%res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> <i64 2, i64 0>, i8 0)
@@ -29,12 +29,12 @@ define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double>
define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: combine_vpermil2pd256_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2pd256_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm1, %ymm0
; X64-NEXT: retq
%res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> <i64 2, i64 0, i64 2, i64 0>, i8 0)
@@ -44,12 +44,12 @@ define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x doubl
define <4 x double> @combine_vpermil2pd256_0z73(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: combine_vpermil2pd256_0z73:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3],ymm0[3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2pd256_0z73:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0],zero,ymm1[3],ymm0[3]
; X64-NEXT: retq
%res0 = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 undef, i32 7, i32 3>
@@ -59,12 +59,12 @@ define <4 x double> @combine_vpermil2pd256_0z73(<4 x double> %a0, <4 x double> %
define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: combine_vpermil2ps_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
%res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>, i8 0)
@@ -74,14 +74,14 @@ define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1
define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: combine_vpermil2ps_1z74:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps_1z74:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -93,12 +93,12 @@ define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
define <4 x float> @combine_vpermil2ps_02zu(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: combine_vpermil2ps_02zu:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps_02zu:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X64-NEXT: retq
%res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>, i8 0)
@@ -107,12 +107,12 @@ define <4 x float> @combine_vpermil2ps_02zu(<4 x float> %a0, <4 x float> %a1) {
define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: combine_vpermil2ps256_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps256_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %ymm1, %ymm0
; X64-NEXT: retq
%res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2>, i8 0)
@@ -122,12 +122,12 @@ define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float>
define <8 x float> @combine_vpermil2ps256_08z945Az(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: combine_vpermil2ps256_08z945Az:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0],zero,ymm1[1],ymm0[4,5],ymm1[6],zero
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps256_08z945Az:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[0],zero,ymm1[1],ymm0[4,5],ymm1[6],zero
; X64-NEXT: retq
%res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 0, i32 1, i32 6, i32 7>, i8 0)
@@ -137,13 +137,13 @@ define <8 x float> @combine_vpermil2ps256_08z945Az(<8 x float> %a0, <8 x float>
define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: combine_vpermil2ps256_zero:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps256_zero:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm0, %ymm0, %ymm0
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11>, i8 2)
ret <8 x float> %res0
@@ -151,13 +151,13 @@ define <8 x float> @combine_vpermil2ps256_zero(<8 x float> %a0, <8 x float> %a1)
define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: combine_vpermil2ps_blend_with_zero:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps_blend_with_zero:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
@@ -167,12 +167,12 @@ define <4 x float> @combine_vpermil2ps_blend_with_zero(<4 x float> %a0, <4 x flo
define <2 x double> @combine_vpermil2pd_as_shufpd(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: combine_vpermil2pd_as_shufpd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2pd_as_shufpd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
; X64-NEXT: retq
%res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> <i64 2, i64 4>, i8 0)
@@ -181,12 +181,12 @@ define <2 x double> @combine_vpermil2pd_as_shufpd(<2 x double> %a0, <2 x double>
define <4 x double> @combine_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: combine_vpermil2pd256_as_shufpd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2pd256_as_shufpd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; X64-NEXT: retq
%res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 4, i64 2, i64 7>, i8 0)
@@ -195,12 +195,12 @@ define <4 x double> @combine_vpermil2pd256_as_shufpd(<4 x double> %a0, <4 x doub
define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_identity:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_identity:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
@@ -210,12 +210,12 @@ define <16 x i8> @combine_vpperm_identity(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_zero:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_zero:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vxorps %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
@@ -226,12 +226,12 @@ define <16 x i8> @combine_vpperm_zero(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_identity_bitcast:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpaddq {{\.LCPI.*}}, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_identity_bitcast:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
; X64-NEXT: retq
%mask = bitcast <2 x i64> <i64 1084818905618843912, i64 506097522914230528> to <16 x i8>
@@ -245,13 +245,13 @@ define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1)
define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_as_blend_with_zero:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_as_blend_with_zero:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
; X64-NEXT: retq
@@ -261,12 +261,12 @@ define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1
define <16 x i8> @combine_vpperm_as_unary_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_as_unary_unpckhbw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_as_unary_unpckhbw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
@@ -275,12 +275,12 @@ define <16 x i8> @combine_vpperm_as_unary_unpckhbw(<16 x i8> %a0, <16 x i8> %a1)
define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_as_unpckhbw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_as_unpckhbw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 8, i8 24, i8 9, i8 25, i8 10, i8 26, i8 11, i8 27, i8 12, i8 28, i8 13, i8 29, i8 14, i8 30, i8 15, i8 31>)
@@ -289,12 +289,12 @@ define <16 x i8> @combine_vpperm_as_unpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @combine_vpperm_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: combine_vpperm_as_unpcklbw:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_as_unpcklbw:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT: retq
%res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 16, i8 0, i8 17, i8 1, i8 18, i8 2, i8 19, i8 3, i8 20, i8 4, i8 21, i8 5, i8 22, i8 6, i8 23, i8 7>)
@@ -303,12 +303,12 @@ define <16 x i8> @combine_vpperm_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
; X32-LABEL: combine_vpperm_10zz32BA:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpperm_10zz32BA:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[2,3,0,1],zero,zero,zero,zero,xmm0[6,7,4,5],xmm1[6,7,4,5]
; X64-NEXT: retq
%res0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -321,7 +321,7 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
; FIXME: Duplicated load in i686
define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
; X32-LABEL: buildvector_v4f32_0404:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -330,7 +330,7 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
; X32-NEXT: retl
;
; X64-LABEL: buildvector_v4f32_0404:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
; X64-NEXT: vmovaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -344,7 +344,7 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr) {
; X32-LABEL: buildvector_v4f32_07z6:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2]
@@ -352,7 +352,7 @@ define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr)
; X32-NEXT: retl
;
; X64-LABEL: buildvector_v4f32_07z6:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[3],zero,xmm1[2]
; X64-NEXT: vmovaps %xmm0, (%rdi)
; X64-NEXT: retq
@@ -368,12 +368,12 @@ define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr)
define <2 x double> @constant_fold_vpermil2pd() {
; X32-LABEL: constant_fold_vpermil2pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-2.000000e+00,2.000000e+00]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> <double 1.0, double 2.0>, <2 x double> <double -2.0, double -1.0>, <2 x i64> <i64 4, i64 2>, i8 2)
@@ -382,12 +382,12 @@ define <2 x double> @constant_fold_vpermil2pd() {
define <4 x double> @constant_fold_vpermil2pd_256() {
; X32-LABEL: constant_fold_vpermil2pd_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2pd_256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-4.000000e+00,0.000000e+00,4.000000e+00,3.000000e+00]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> <double 1.0, double 2.0, double 3.0, double 4.0>, <4 x double> <double -4.0, double -3.0, double -2.0, double -1.0>, <4 x i64> <i64 4, i64 8, i64 2, i64 0>, i8 2)
@@ -396,12 +396,12 @@ define <4 x double> @constant_fold_vpermil2pd_256() {
define <4 x float> @constant_fold_vpermil2ps() {
; X32-LABEL: constant_fold_vpermil2ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-4.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> <float -4.0, float -3.0, float -2.0, float -1.0>, <4 x i32> <i32 4, i32 0, i32 2, i32 8>, i8 2)
@@ -410,12 +410,12 @@ define <4 x float> @constant_fold_vpermil2ps() {
define <8 x float> @constant_fold_vpermil2ps_256() {
; X32-LABEL: constant_fold_vpermil2ps_256:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpermil2ps_256:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [-8.000000e+00,1.000000e+00,3.000000e+00,0.000000e+00,5.000000e+00,0.000000e+00,5.000000e+00,7.000000e+00]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, <8 x float> <float -8.0, float -7.0, float -6.0, float -5.0, float -4.0, float -3.0, float -2.0, float -1.0>, <8 x i32> <i32 4, i32 0, i32 2, i32 8, i32 0, i32 8, i32 0, i32 2>, i8 2)
@@ -424,12 +424,12 @@ define <8 x float> @constant_fold_vpermil2ps_256() {
define <16 x i8> @constant_fold_vpperm() {
; X32-LABEL: constant_fold_vpperm:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X32-NEXT: retl
;
; X64-LABEL: constant_fold_vpperm:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; X64-NEXT: retq
%1 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> <i8 0, i8 -1, i8 -2, i8 -3, i8 -4, i8 -5, i8 -6, i8 -7, i8 -8, i8 -9, i8 -10, i8 -11, i8 -12, i8 -13, i8 -14, i8 -15>, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 31, i8 30, i8 29, i8 28, i8 27, i8 26, i8 25, i8 24, i8 23, i8 22, i8 21, i8 20, i8 19, i8 18, i8 17, i8 16>)
@@ -438,14 +438,14 @@ define <16 x i8> @constant_fold_vpperm() {
define <4 x float> @PR31296(i8* %in) {
; X32-LABEL: PR31296:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,mem[0]
; X32-NEXT: retl
;
; X64-LABEL: PR31296:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl (%rdi), %eax
; X64-NEXT: vmovq %rax, %xmm0
; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,mem[0]
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index e04c5321fa25..5e98ec47c0bd 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -18,7 +18,7 @@ declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd1:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: retq
entry:
%b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
@@ -28,7 +28,7 @@ entry:
define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd2:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: retq
entry:
%b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
@@ -41,7 +41,7 @@ entry:
define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
; ALL-LABEL: combine_pshufd3:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: retq
entry:
%b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
@@ -54,12 +54,12 @@ entry:
define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd4:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd4:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
@@ -73,12 +73,12 @@ entry:
define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd5:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufd5:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
@@ -92,17 +92,17 @@ entry:
define <4 x i32> @combine_pshufd6(<4 x i32> %a) {
; SSE-LABEL: combine_pshufd6:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pshufd6:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufd6:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
entry:
@@ -113,7 +113,7 @@ entry:
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw1:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: retq
entry:
%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
@@ -123,7 +123,7 @@ entry:
define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
; ALL-LABEL: combine_pshuflw2:
-; ALL: # BB#0: # %entry
+; ALL: # %bb.0: # %entry
; ALL-NEXT: retq
entry:
%b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
@@ -134,12 +134,12 @@ entry:
define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
; SSE-LABEL: combine_pshuflw3:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshuflw3:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; AVX-NEXT: retq
entry:
@@ -151,12 +151,12 @@ entry:
define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
; SSE-LABEL: combine_pshufhw1:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufhw1:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX-NEXT: retq
entry:
@@ -168,15 +168,15 @@ entry:
define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1:
-; AVX: # BB#0:
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
@@ -186,15 +186,15 @@ define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2:
-; AVX: # BB#0:
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
@@ -204,15 +204,15 @@ define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
@@ -222,15 +222,15 @@ define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4:
-; AVX: # BB#0:
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
@@ -240,15 +240,15 @@ define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5:
-; AVX: # BB#0:
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
@@ -258,15 +258,15 @@ define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6:
-; AVX: # BB#0:
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
@@ -280,7 +280,7 @@ define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32
define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test1b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
@@ -288,7 +288,7 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test1b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
@@ -296,21 +296,21 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test1b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test1b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test1b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
@@ -320,7 +320,7 @@ define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test2b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
@@ -328,7 +328,7 @@ define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test2b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
@@ -336,21 +336,21 @@ define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test2b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test2b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test2b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX2-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
@@ -360,36 +360,36 @@ define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test3b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test3b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
@@ -399,7 +399,7 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test4b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
@@ -407,7 +407,7 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test4b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
@@ -415,21 +415,21 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test4b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test4b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test4b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
@@ -439,7 +439,7 @@ define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test5b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
@@ -447,7 +447,7 @@ define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test5b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
@@ -455,21 +455,21 @@ define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test5b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test5b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test5b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
; AVX2-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
@@ -479,36 +479,36 @@ define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_bitwise_ops_test6b:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_bitwise_ops_test6b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX2-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
@@ -518,13 +518,13 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test1c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test1c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT: retq
@@ -536,13 +536,13 @@ define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test2c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test2c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3]
; AVX-NEXT: retq
@@ -554,27 +554,27 @@ define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test3c:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test3c:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test3c:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
@@ -586,14 +586,14 @@ define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test4c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test4c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT: retq
@@ -605,14 +605,14 @@ define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE-LABEL: combine_bitwise_ops_test5c:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: orps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test5c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3]
; AVX-NEXT: retq
@@ -624,7 +624,7 @@ define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; SSE2-LABEL: combine_bitwise_ops_test6c:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
@@ -632,7 +632,7 @@ define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_bitwise_ops_test6c:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
@@ -640,13 +640,13 @@ define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i3
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_bitwise_ops_test6c:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm0
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT: retq
@@ -658,13 +658,13 @@ define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i3
define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test1:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
@@ -673,13 +673,13 @@ define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test2:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
@@ -688,13 +688,13 @@ define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test3:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test3:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3>
@@ -703,17 +703,17 @@ define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test4:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test4:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1>
@@ -723,13 +723,13 @@ define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test5:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3>
@@ -738,13 +738,13 @@ define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test6:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4>
@@ -753,13 +753,13 @@ define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test7:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test7:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
@@ -768,13 +768,13 @@ define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test8:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
@@ -783,13 +783,13 @@ define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test9:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test9:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
@@ -798,13 +798,13 @@ define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test10:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test10:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
@@ -813,13 +813,13 @@ define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test11:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test11:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0>
@@ -828,17 +828,17 @@ define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test12:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test12:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test12:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4>
@@ -849,7 +849,7 @@ define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) {
; The following pair of shuffles is folded into vector %A.
define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
; ALL-LABEL: combine_nested_undef_test13:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4>
@@ -859,12 +859,12 @@ define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) {
; The following pair of shuffles is folded into vector %B.
define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
@@ -881,38 +881,38 @@ define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test15:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test15:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test15:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
@@ -921,35 +921,35 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
@@ -958,33 +958,33 @@ define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test17:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test17:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test17:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
@@ -993,13 +993,13 @@ define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test18:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
@@ -1008,33 +1008,33 @@ define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test19:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test19:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test19:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
@@ -1043,35 +1043,35 @@ define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test20:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test20:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test20:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test20:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test20:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
@@ -1080,31 +1080,31 @@ define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_nested_undef_test21:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test21:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test21:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -1119,13 +1119,13 @@ define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test22:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test22:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
@@ -1134,13 +1134,13 @@ define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test23:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test23:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
@@ -1149,13 +1149,13 @@ define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test24:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test24:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
@@ -1164,17 +1164,17 @@ define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test25:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test25:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test25:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
@@ -1184,13 +1184,13 @@ define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test26:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test26:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
@@ -1199,17 +1199,17 @@ define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test27:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_nested_undef_test27:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_nested_undef_test27:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
@@ -1219,13 +1219,13 @@ define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test28:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_nested_undef_test28:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
@@ -1234,12 +1234,12 @@ define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1249,24 +1249,24 @@ define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1276,13 +1276,13 @@ define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test3:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
@@ -1291,12 +1291,12 @@ define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
@@ -1306,24 +1306,24 @@ define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test5:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test5:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test5:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1333,12 +1333,12 @@ define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1348,30 +1348,30 @@ define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test7:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test7:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
@@ -1380,13 +1380,13 @@ define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test8:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test8:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
@@ -1395,14 +1395,14 @@ define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test9:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test9:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -1411,30 +1411,30 @@ define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test10:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test10:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test10:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test10:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test10:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
@@ -1443,7 +1443,7 @@ define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
; ALL-LABEL: combine_test11:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1452,24 +1452,24 @@ define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test12:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test12:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test12:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
@@ -1479,13 +1479,13 @@ define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test13:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test13:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -1494,12 +1494,12 @@ define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
@@ -1509,24 +1509,24 @@ define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test15:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test15:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test15:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
@@ -1536,7 +1536,7 @@ define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; ALL-LABEL: combine_test16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1545,30 +1545,30 @@ define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test17:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test17:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test17:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test17:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
@@ -1577,13 +1577,13 @@ define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
-; SSE: # BB#0:
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test18:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
@@ -1592,13 +1592,13 @@ define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test19:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
%2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
@@ -1607,30 +1607,30 @@ define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test20:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test20:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test20:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test20:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
%2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
@@ -1639,30 +1639,21 @@ define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
; SSE-LABEL: combine_test21:
-; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; SSE-NEXT: movdqa %xmm2, (%rdi)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: combine_test21:
-; AVX1: # BB#0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
+; SSE: # %bb.0:
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; SSE-NEXT: movaps %xmm2, (%rdi)
+; SSE-NEXT: retq
;
-; AVX2-LABEL: combine_test21:
-; AVX2: # BB#0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovdqa %xmm2, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test21:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: vmovaps %xmm2, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x i32> %1, <4 x i32>* %ptr, align 16
@@ -1671,13 +1662,13 @@ define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) {
define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; SSE-LABEL: combine_test22:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test22:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: retq
@@ -1691,12 +1682,12 @@ define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) {
; PR22359
define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
; SSE-LABEL: combine_test23:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movups %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test23:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovups %xmm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -1713,13 +1704,13 @@ define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) {
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test1b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1729,23 +1720,23 @@ define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1755,25 +1746,25 @@ define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test3b:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test3b:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test3b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: retq
@@ -1784,13 +1775,13 @@ define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test4b:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -1803,7 +1794,7 @@ define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test1c:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1812,7 +1803,7 @@ define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test1c:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -1821,21 +1812,21 @@ define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test1c:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test1c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test1c:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
@@ -1849,7 +1840,7 @@ define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) {
define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test2c:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -1860,7 +1851,7 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test2c:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -1871,14 +1862,14 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test2c:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test2c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -1892,7 +1883,7 @@ define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) {
define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test3c:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -1903,7 +1894,7 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test3c:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -1914,14 +1905,14 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test3c:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_test3c:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
@@ -1935,7 +1926,7 @@ define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) {
define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-LABEL: combine_test4c:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -1947,7 +1938,7 @@ define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_test4c:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
@@ -1959,21 +1950,21 @@ define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_test4c:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test4c:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test4c:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
@@ -2017,22 +2008,22 @@ define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) {
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_01:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_01:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_01:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
@@ -2042,26 +2033,26 @@ define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_02:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_02:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_02:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
@@ -2071,24 +2062,24 @@ define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_blend_123:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_blend_123:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_blend_123:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
@@ -2099,14 +2090,14 @@ define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_1:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
@@ -2115,14 +2106,14 @@ define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_2:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
@@ -2131,14 +2122,14 @@ define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
-; SSE: # BB#0:
-; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE: # %bb.0:
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_test_movhl_3:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX: # %bb.0:
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
@@ -2151,22 +2142,22 @@ define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
@@ -2176,13 +2167,13 @@ define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test2:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
@@ -2191,13 +2182,13 @@ define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test3:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
%2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
@@ -2206,12 +2197,12 @@ define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
@@ -2221,24 +2212,24 @@ define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test5:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test5:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
@@ -2252,7 +2243,7 @@ define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test6:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
@@ -2261,22 +2252,22 @@ define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test7:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test7:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
@@ -2286,22 +2277,22 @@ define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
@@ -2311,12 +2302,12 @@ define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test9:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
@@ -2326,7 +2317,7 @@ define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test10:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
%2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
@@ -2335,22 +2326,22 @@ define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test11:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test11:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test11:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
@@ -2360,13 +2351,13 @@ define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test12:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
%2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
@@ -2375,13 +2366,13 @@ define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test13:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
%2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
@@ -2390,12 +2381,12 @@ define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
@@ -2405,24 +2396,24 @@ define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test15:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test15:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test15:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
@@ -2442,7 +2433,7 @@ define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
%2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
@@ -2451,22 +2442,22 @@ define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test17:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test17:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test17:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
@@ -2476,22 +2467,22 @@ define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_undef_input_test18:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_undef_input_test18:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test18:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
@@ -2501,12 +2492,12 @@ define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_undef_input_test19:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
@@ -2516,7 +2507,7 @@ define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; ALL-LABEL: combine_undef_input_test20:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
%2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
@@ -2530,14 +2521,14 @@ define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -2546,10 +2537,10 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
%b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
%c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
@@ -2558,14 +2549,14 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -2574,7 +2565,7 @@ define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
@@ -2586,26 +2577,26 @@ define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_insertps1:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_insertps1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_insertps1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT: retq
@@ -2616,26 +2607,26 @@ define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_insertps2:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_insertps2:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_insertps2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT: retq
@@ -2646,24 +2637,24 @@ define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_insertps3:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_insertps3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_insertps3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: retq
@@ -2674,24 +2665,24 @@ define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_insertps4:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_insertps4:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_insertps4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT: retq
@@ -2702,13 +2693,13 @@ define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: retq
@@ -2724,26 +2715,26 @@ define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>*
; PR30371
define <4 x float> @combine_constant_insertion_v4f32(float %f) {
; SSE2-LABEL: combine_constant_insertion_v4f32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm1 = <u,4,5,3>
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_constant_insertion_v4f32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <u,4,5,3>
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_constant_insertion_v4f32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_constant_insertion_v4f32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT: retq
%a0 = insertelement <4 x float> undef, float %f, i32 0
@@ -2753,36 +2744,30 @@ define <4 x float> @combine_constant_insertion_v4f32(float %f) {
define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
; SSE2-LABEL: combine_constant_insertion_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movd %edi, %xmm1
; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: combine_constant_insertion_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movd %edi, %xmm1
; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,4,5,30>
+; SSE41-NEXT: pinsrd $0, %edi, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: combine_constant_insertion_v4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_constant_insertion_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_constant_insertion_v4i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,4,5,30>
+; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
+; AVX-NEXT: retq
%a0 = insertelement <4 x i32> undef, i32 %f, i32 0
%ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
ret <4 x i32> %ret
@@ -2790,7 +2775,7 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
@@ -2799,7 +2784,7 @@ define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-NEXT: retq
;
; AVX-LABEL: PR22377:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm1
@@ -2815,7 +2800,7 @@ entry:
define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
@@ -2824,7 +2809,7 @@ define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR22390:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT: movaps %xmm0, %xmm2
; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
@@ -2833,14 +2818,14 @@ define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR22390:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: addps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: PR22390:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
@@ -2854,7 +2839,7 @@ entry:
define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT: movapd %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
@@ -2863,7 +2848,7 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR22412:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT: movapd %xmm2, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
@@ -2872,7 +2857,7 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR22412:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT: movapd %xmm0, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
@@ -2882,14 +2867,14 @@ define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: PR22412:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR22412:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
@@ -2902,7 +2887,7 @@ entry:
define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
@@ -2910,7 +2895,7 @@ define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR30264:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
@@ -2918,14 +2903,14 @@ define <4 x float> @PR30264(<4 x float> %x) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR30264:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm1 = <u,u,4,1>
; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3]
; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: PR30264:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4,1>
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3]
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll
index 91d686460499..ee8ab50b5887 100644
--- a/test/CodeGen/X86/vector-shuffle-masked.ll
+++ b/test/CodeGen/X86/vector-shuffle-masked.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefix=CHECK
define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v4i32_1234:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} xmm2 {%k1} = xmm0[1,2,3],xmm1[0]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
@@ -17,8 +17,8 @@ define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32>
define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v4i32_1234:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3],xmm1[0]
; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -30,8 +30,8 @@ define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask)
define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v4i32_2345:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} xmm2 {%k1} = xmm0[2,3],xmm1[0,1]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
@@ -44,8 +44,8 @@ define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32>
define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v4i32_2345:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3],xmm1[0,1]
; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -57,8 +57,8 @@ define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask)
define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v2i64_12:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
; CHECK-NEXT: vmovdqa %xmm2, %xmm0
; CHECK-NEXT: retq
@@ -71,8 +71,8 @@ define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %p
define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v2i64_12:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
; CHECK-NEXT: retq
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
@@ -84,8 +84,8 @@ define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
define <4 x i64> @mask_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v4i64_1234:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignq {{.*#+}} ymm2 {%k1} = ymm0[1,2,3],ymm1[0]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -98,8 +98,8 @@ define <4 x i64> @mask_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, <4 x i64>
define <4 x i64> @maskz_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v4i64_1234:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3],ymm1[0]
; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -111,8 +111,8 @@ define <4 x i64> @maskz_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, i8 %mask)
define <4 x i64> @mask_shuffle_v4i64_1230(<4 x i64> %a, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v4i64_1230:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,3,0]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -125,8 +125,8 @@ define <4 x i64> @mask_shuffle_v4i64_1230(<4 x i64> %a, <4 x i64> %passthru, i8
define <4 x i64> @maskz_shuffle_v4i64_1230(<4 x i64> %a, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v4i64_1230:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,0]
; CHECK-NEXT: retq
%shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
@@ -138,8 +138,8 @@ define <4 x i64> @maskz_shuffle_v4i64_1230(<4 x i64> %a, i8 %mask) {
define <8 x i32> @mask_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v8i32_12345678:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} ymm2 {%k1} = ymm0[1,2,3,4,5,6,7],ymm1[0]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -151,8 +151,8 @@ define <8 x i32> @mask_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, <8 x i
define <8 x i32> @maskz_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v8i32_12345678:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,4,5,6,7],ymm1[0]
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -163,8 +163,8 @@ define <8 x i32> @maskz_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, i8 %m
define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v8i32_23456789:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} ymm2 {%k1} = ymm0[2,3,4,5,6,7],ymm1[0,1]
; CHECK-NEXT: vmovdqa %ymm2, %ymm0
; CHECK-NEXT: retq
@@ -176,8 +176,8 @@ define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i
define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v8i32_23456789:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,4,5,6,7],ymm1[0,1]
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
@@ -188,8 +188,8 @@ define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %m
define <8 x i32> @mask_shuffle_v8i32_12345670(<8 x i32> %a, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v8i32_12345670:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} ymm1 {%k1} = ymm0[1,2,3,4,5,6,7,0]
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -201,8 +201,8 @@ define <8 x i32> @mask_shuffle_v8i32_12345670(<8 x i32> %a, <8 x i32> %passthru,
define <8 x i32> @maskz_shuffle_v8i32_12345670(<8 x i32> %a, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v8i32_12345670:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,4,5,6,7,0]
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
@@ -213,9 +213,9 @@ define <8 x i32> @maskz_shuffle_v8i32_12345670(<8 x i32> %a, i8 %mask) {
define <8 x i32> @mask_shuffle_v8i32_23456701(<8 x i32> %a, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_shuffle_v8i32_23456701:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
@@ -226,9 +226,9 @@ define <8 x i32> @mask_shuffle_v8i32_23456701(<8 x i32> %a, <8 x i32> %passthru,
define <8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) {
; CHECK-LABEL: maskz_shuffle_v8i32_23456701:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
@@ -237,12 +237,239 @@ define <8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) {
ret <8 x i32> %res
}
+define <4 x i32> @mask_extract_v8i32_v4i32_0(<8 x i32> %a, <4 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i32_v4i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @mask_extract_v8i32_v4i32_0_z(<8 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i32_v4i32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @mask_extract_v8i32_v4i32_1(<8 x i32> %a, <4 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i32_v4i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti32x4 $1, %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> %passthru
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @mask_extract_v8i32_v4i32_1_z(<8 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i32_v4i32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x float> @mask_extract_v8f32_v4f32_0(<8 x float> %a, <4 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f32_v4f32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru
+ ret <4 x float> %res
+}
+
+define <4 x float> @mask_extract_v8f32_v4f32_0_z(<8 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f32_v4f32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x float> @mask_extract_v8f32_v4f32_1(<8 x float> %a, <4 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f32_v4f32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> %passthru
+ ret <4 x float> %res
+}
+
+define <4 x float> @mask_extract_v8f32_v4f32_1_z(<8 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f32_v4f32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <2 x i64> @mask_extract_v4i64_v2i64_0(<4 x i64> %a, <2 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4i64_v2i64_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mask_extract_v4i64_v2i64_0_z(<4 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4i64_v2i64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mask_extract_v4i64_v2i64_1(<4 x i64> %a, <2 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4i64_v2i64_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti64x2 $1, %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> %passthru
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mask_extract_v4i64_v2i64_1_z(<4 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4i64_v2i64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x double> @mask_extract_v4f64_v2f64_0(<4 x double> %a, <2 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4f64_v2f64_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru
+ ret <2 x double> %res
+}
+
+define <2 x double> @mask_extract_v4f64_v2f64_0_z(<4 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4f64_v2f64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
+define <2 x double> @mask_extract_v4f64_v2f64_1(<4 x double> %a, <2 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4f64_v2f64_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> %passthru
+ ret <2 x double> %res
+}
+
+define <2 x double> @mask_extract_v4f64_v2f64_1_z(<4 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v4f64_v2f64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16i32_v4i32_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextracti32x4 $0, %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -252,10 +479,24 @@ define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru
ret <4 x i32> %res
}
+define <4 x i32> @mask_extract_v16i32_v4i32_0_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16i32_v4i32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16i32_v4i32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -267,10 +508,24 @@ define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru
ret <4 x i32> %res
}
+define <4 x i32> @mask_extract_v16i32_v4i32_1_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16i32_v4i32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16i32_v4i32_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -284,8 +539,8 @@ define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru
define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16i32_v4i32_3:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -299,10 +554,9 @@ define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru
define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16f32_v4f32_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf32x4 $0, %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -312,10 +566,24 @@ define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %pa
ret <4 x float> %res
}
+define <4 x float> @mask_extract_v16f32_v4f32_0_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16f32_v4f32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16f32_v4f32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -327,10 +595,24 @@ define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %pa
ret <4 x float> %res
}
+define <4 x float> @mask_extract_v16f32_v4f32_1_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16f32_v4f32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16f32_v4f32_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -344,8 +626,8 @@ define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %pa
define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16f32_v4f32_3:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -359,10 +641,9 @@ define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %pa
define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16i32_v8i32_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextracti32x8 $0, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -370,10 +651,22 @@ define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru
ret <8 x i32> %res
}
+define <8 x i32> @mask_extract_v16i32_v8i32_0_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16i32_v8i32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16i32_v8i32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -383,12 +676,23 @@ define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru
ret <8 x i32> %res
}
+define <8 x i32> @mask_extract_v16i32_v8i32_1_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16i32_v8i32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16f32_v8f32_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf32x8 $0, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovaps %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -396,10 +700,22 @@ define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %pa
ret <8 x float> %res
}
+define <8 x float> @mask_extract_v16f32_v8f32_0_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16f32_v8f32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v16f32_v8f32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -409,12 +725,23 @@ define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %pa
ret <8 x float> %res
}
+define <8 x float> @mask_extract_v16f32_v8f32_1_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v16f32_v8f32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuffle, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8i64_v2i64_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextracti64x2 $0, %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -424,10 +751,24 @@ define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru,
ret <2 x i64> %res
}
+define <2 x i64> @mask_extract_v8i64_v2i64_0_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i64_v2i64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8i64_v2i64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -439,10 +780,24 @@ define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru,
ret <2 x i64> %res
}
+define <2 x i64> @mask_extract_v8i64_v2i64_1_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i64_v2i64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8i64_v2i64_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti64x2 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -456,8 +811,8 @@ define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru,
define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8i64_v2i64_3:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti64x2 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -471,10 +826,9 @@ define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru,
define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8f64_v2f64_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf64x2 $0, %zmm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -484,10 +838,24 @@ define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %p
ret <2 x double> %res
}
+define <2 x double> @mask_extract_v8f64_v2f64_0_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f64_v2f64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8f64_v2f64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -499,10 +867,24 @@ define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %p
ret <2 x double> %res
}
+define <2 x double> @mask_extract_v8f64_v2f64_1_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f64_v2f64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8f64_v2f64_2:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf64x2 $2, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -516,8 +898,8 @@ define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x double> %a, <2 x double> %p
define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8f64_v2f64_3:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf64x2 $3, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -531,10 +913,9 @@ define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %p
define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8i64_v4i64_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextracti64x4 $0, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -543,10 +924,23 @@ define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru,
ret <4 x i64> %res
}
+define <4 x i64> @mask_extract_v8i64_v4i64_0_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i64_v4i64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8i64_v4i64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -557,12 +951,24 @@ define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru,
ret <4 x i64> %res
}
+define <4 x i64> @mask_extract_v8i64_v4i64_1_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8i64_v4i64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8f64_v4f64_0:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vextractf64x4 $0, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%mask.cast = bitcast i8 %mask to <8 x i1>
@@ -571,10 +977,23 @@ define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %p
ret <4 x double> %res
}
+define <4 x double> @mask_extract_v8f64_v4f64_0_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f64_v4f64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_extract_v8f64_v4f64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -585,10 +1004,49 @@ define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %p
ret <4 x double> %res
}
-define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) {
-; CHECK-LABEL: mask_extract_v8i64_v8i32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+define <4 x double> @mask_extract_v8f64_v4f64_1_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_extract_v8f64_v4f64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x double> %shuffle, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <8 x i32> @mask_cast_extract_v8i64_v8i32_0(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v8i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x i64> %shuffle to <8 x i32>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle.cast, <8 x i32> %passthru
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @mask_cast_extract_v8i64_v8i32_0_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v8i32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x i64> %shuffle to <8 x i32>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle.cast, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @mask_cast_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v8i32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -599,10 +1057,49 @@ define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru,
ret <8 x i32> %res
}
-define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) {
-; CHECK-LABEL: mask_extract_v8f64_v8f32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+define <8 x i32> @mask_cast_extract_v8i64_v8i32_1_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v8i32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti32x8 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <4 x i64> %shuffle to <8 x i32>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuffle.cast, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <8 x float> @mask_cast_extract_v8f64_v8f32_0(<8 x double> %a, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v8f32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x double> %shuffle to <8 x float>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuffle.cast, <8 x float> %passthru
+ ret <8 x float> %res
+}
+
+define <8 x float> @mask_cast_extract_v8f64_v8f32_0_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v8f32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x double> %shuffle to <8 x float>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuffle.cast, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x float> @mask_cast_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v8f32_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -613,10 +1110,53 @@ define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %pas
ret <8 x float> %res
}
+define <8 x float> @mask_cast_extract_v8f64_v8f32_1_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v8f32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <4 x double> %shuffle to <8 x float>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuffle.cast, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <4 x i32> @mask_cast_extract_v8i64_v4i32_0(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle.cast, <4 x i32> %passthru
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @mask_cast_extract_v8i64_v4i32_0_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle.cast, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) {
; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -629,10 +1169,55 @@ define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passt
ret <4 x i32> %res
}
+define <4 x i32> @mask_cast_extract_v8i64_v4i32_1_z(<8 x i64> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuffle.cast, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <4 x float> @mask_cast_extract_v8f64_v4f32_0(<8 x double> %a, <4 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+ %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle.cast, <4 x float> %passthru
+ ret <4 x float> %res
+}
+
+define <4 x float> @mask_cast_extract_v8f64_v4f32_0_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+ %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle.cast, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float> %passthru, i8 %mask) {
; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -645,10 +1230,53 @@ define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float>
ret <4 x float> %res
}
+define <4 x float> @mask_cast_extract_v8f64_v4f32_1_z(<8 x double> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuffle.cast, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x i64> @mask_cast_extract_v16i32_v4i64_0(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <8 x i32> %shuffle to <4 x i64>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle.cast, <4 x i64> %passthru
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @mask_cast_extract_v16i32_v4i64_0_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <8 x i32> %shuffle to <4 x i64>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle.cast, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -660,10 +1288,52 @@ define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %pas
ret <4 x i64> %res
}
+define <4 x i64> @mask_cast_extract_v16i32_v4i64_1_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.cast = bitcast <8 x i32> %shuffle to <4 x i64>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i64> %shuffle.cast, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+
+define <4 x double> @mask_cast_extract_v16f32_v4f64_0(<16 x float> %a, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <8 x float> %shuffle to <4 x double>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x double> %shuffle.cast, <4 x double> %passthru
+ ret <4 x double> %res
+}
+
+define <4 x double> @mask_cast_extract_v16f32_v4f64_0_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <8 x float> %shuffle to <4 x double>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x double> %shuffle.cast, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
@@ -675,10 +1345,54 @@ define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x doub
ret <4 x double> %res
}
+define <4 x double> @mask_cast_extract_v16f32_v4f64_1_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.cast = bitcast <8 x float> %shuffle to <4 x double>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x double> %shuffle.cast, <4 x double> zeroinitializer
+ ret <4 x double> %res
+}
+
+define <2 x i64> @mask_cast_extract_v16i32_v2i64_0(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle.cast, <2 x i64> %passthru
+ ret <2 x i64> %res
+}
+
+define <2 x i64> @mask_cast_extract_v16i32_v2i64_0_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle.cast, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) {
; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -691,10 +1405,55 @@ define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %pas
ret <2 x i64> %res
}
+define <2 x i64> @mask_cast_extract_v16i32_v2i64_1_z(<16 x i32> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextracti64x2 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x i64> %shuffle.cast, <2 x i64> zeroinitializer
+ ret <2 x i64> %res
+}
+
+define <2 x double> @mask_cast_extract_v16f32_v2f64_0(<16 x float> %a, <2 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> %passthru
+ ret <2 x double> %res
+}
+
+define <2 x double> @mask_cast_extract_v16f32_v2f64_0_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_0_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x double> %passthru, i8 %mask) {
; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %edi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovapd %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
@@ -707,10 +1466,25 @@ define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x doub
ret <2 x double> %res
}
+define <2 x double> @mask_cast_extract_v16f32_v2f64_1_z(<16 x float> %a, i8 %mask) {
+; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %res = select <2 x i1> %mask.extract, <2 x double> %shuffle.cast, <2 x double> zeroinitializer
+ ret <2 x double> %res
+}
+
define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x double> %passthru, i8 %mask) {
; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_mask:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
; CHECK-NEXT: retq
%q = load double, double* %x, align 1
@@ -724,8 +1498,8 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x doubl
define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask) {
; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_maskz:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
; CHECK-NEXT: retq
%q = load double, double* %x, align 1
@@ -739,8 +1513,8 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask)
define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_2f64_8f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
@@ -753,8 +1527,8 @@ define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwin
define <8 x i32> @test_broadcast_2i64_8i32(<2 x i64> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_2i64_8i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
@@ -767,8 +1541,8 @@ define <8 x i32> @test_broadcast_2i64_8i32(<2 x i64> *%p, i8 %mask) nounwind {
define <16 x float> @test_broadcast_2f64_16f32(<2 x double> *%p, i16 %mask) nounwind {
; CHECK-LABEL: test_broadcast_2f64_16f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%1 = load <2 x double>, <2 x double> *%p
@@ -781,8 +1555,8 @@ define <16 x float> @test_broadcast_2f64_16f32(<2 x double> *%p, i16 %mask) noun
define <16 x i32> @test_broadcast_2i64_16i32(<2 x i64> *%p, i16 %mask) nounwind {
; CHECK-LABEL: test_broadcast_2i64_16i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%1 = load <2 x i64>, <2 x i64> *%p
@@ -795,8 +1569,8 @@ define <16 x i32> @test_broadcast_2i64_16i32(<2 x i64> *%p, i16 %mask) nounwind
define <16 x float> @test_broadcast_4f64_16f32(<4 x double> *%p, i16 %mask) nounwind {
; CHECK-LABEL: test_broadcast_4f64_16f32:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%1 = load <4 x double>, <4 x double> *%p
@@ -809,8 +1583,8 @@ define <16 x float> @test_broadcast_4f64_16f32(<4 x double> *%p, i16 %mask) noun
define <16 x i32> @test_broadcast_4i64_16i32(<4 x i64> *%p, i16 %mask) nounwind {
; CHECK-LABEL: test_broadcast_4i64_16i32:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; CHECK-NEXT: retq
%1 = load <4 x i64>, <4 x i64> *%p
@@ -823,8 +1597,8 @@ define <16 x i32> @test_broadcast_4i64_16i32(<4 x i64> *%p, i16 %mask) nounwind
define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_4f32_4f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float> *%p
@@ -838,8 +1612,8 @@ define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwin
define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_4i32_4i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
; CHECK-NEXT: retq
%1 = load <4 x i32>, <4 x i32> *%p
@@ -853,8 +1627,8 @@ define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind {
define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_4f32_8f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%1 = load <4 x float>, <4 x float> *%p
@@ -867,8 +1641,8 @@ define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwin
define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_4i32_8i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
; CHECK-NEXT: retq
%1 = load <4 x i32>, <4 x i32> *%p
@@ -881,8 +1655,8 @@ define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind {
define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_8f32_8f64:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%1 = load <8 x float>, <8 x float> *%p
@@ -895,8 +1669,8 @@ define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwin
define <8 x i64> @test_broadcast_8i32_8i64(<8 x i32> *%p, i8 %mask) nounwind {
; CHECK-LABEL: test_broadcast_8i32_8i64:
-; CHECK: # BB#0:
-; CHECK-NEXT: kmovw %esi, %k1
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
; CHECK-NEXT: retq
%1 = load <8 x i32>, <8 x i32> *%p
@@ -906,3 +1680,233 @@ define <8 x i64> @test_broadcast_8i32_8i64(<8 x i32> *%p, i8 %mask) nounwind {
%res = select <8 x i1> %mask.cast, <8 x i64> %3, <8 x i64> zeroinitializer
ret <8 x i64> %res
}
+
+define <4 x float> @test_broadcastf32x2_v4f32(<4 x float> %vec, <4 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: test_broadcastf32x2_v4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> %passthru
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_broadcastf32x2_v4f32_z(<4 x float> %vec, i8 %mask) {
+; CHECK-LABEL: test_broadcastf32x2_v4f32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+
+define <4 x i32> @test_broadcasti32x2_v4i32(<4 x i32> %vec, <4 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: test_broadcasti32x2_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm1 {%k1} = xmm0[0,1,0,1]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> %passthru
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @test_broadcasti32x2_v4i32_z(<4 x i32> %vec, i8 %mask) {
+; CHECK-LABEL: test_broadcasti32x2_v4i32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = select <4 x i1> %mask.extract, <4 x i32> %shuf, <4 x i32> zeroinitializer
+ ret <4 x i32> %res
+}
+
+define <8 x float> @test_broadcastf32x2_v8f32(<8 x float> %vec, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: test_broadcastf32x2_v8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> %passthru
+ ret <8 x float> %res
+}
+
+define <8 x float> @test_broadcastf32x2_v8f32_z(<8 x float> %vec, i8 %mask) {
+; CHECK-LABEL: test_broadcastf32x2_v8f32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x float> %shuf, <8 x float> zeroinitializer
+ ret <8 x float> %res
+}
+
+define <8 x i32> @test_broadcasti32x2_v8i32(<8 x i32> %vec, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: test_broadcasti32x2_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> %passthru
+ ret <8 x i32> %res
+}
+
+define <8 x i32> @test_broadcasti32x2_v8i32_z(<8 x i32> %vec, i8 %mask) {
+; CHECK-LABEL: test_broadcasti32x2_v8i32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res = select <8 x i1> %mask.cast, <8 x i32> %shuf, <8 x i32> zeroinitializer
+ ret <8 x i32> %res
+}
+
+define <16 x float> @test_broadcastf32x2_v16f32_z(<16 x float> %vec, i16 %mask) {
+; CHECK-LABEL: test_broadcastf32x2_v16f32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> zeroinitializer
+ ret <16 x float> %res
+}
+
+define <16 x i32> @test_broadcasti32x2_v16i32(<16 x i32> %vec, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: test_broadcasti32x2_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> %passthru
+ ret <16 x i32> %res
+}
+
+define <16 x float> @test_broadcastf32x2_v16f32(<16 x float> %vec, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: test_broadcastf32x2_v16f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x float> %shuf, <16 x float> %passthru
+ ret <16 x float> %res
+}
+
+define <16 x i32> @test_broadcasti32x2_v16i32_z(<16 x i32> %vec, i16 %mask) {
+; CHECK-LABEL: test_broadcasti32x2_v16i32_z:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i32> %shuf, <16 x i32> zeroinitializer
+ ret <16 x i32> %res
+}
+
+define <16 x i8> @mask_shuffle_v16i8_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: mask_shuffle_v16i8_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpalignr {{.*#+}} xmm2 {%k1} = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i8> %shuffle, <16 x i8> %passthru
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @maskz_shuffle_v16i8_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: maskz_shuffle_v16i8_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpalignr {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0]
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i8> %shuffle, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @mask_shuffle_v16i8_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: mask_shuffle_v16i8_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpalignr {{.*#+}} xmm2 {%k1} = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i8> %shuffle, <16 x i8> %passthru
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @maskz_shuffle_v16i8_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: maskz_shuffle_v16i8_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpalignr {{.*#+}} xmm0 {%k1} {z} = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3]
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i8> %shuffle, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @mask_shuffle_v16i8_8_9_10_11_12_13_14_15_16_17_18_19_20_21_22_23(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: mask_shuffle_v16i8_8_9_10_11_12_13_14_15_16_17_18_19_20_21_22_23:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpalignr {{.*#+}} xmm2 {%k1} = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i8> %shuffle, <16 x i8> %passthru
+ ret <16 x i8> %res
+}
+
+define <16 x i8> @maskz_shuffle_v16i8_8_9_10_11_12_13_14_15_16_17_18_19_20_21_22_23(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: maskz_shuffle_v16i8_8_9_10_11_12_13_14_15_16_17_18_19_20_21_22_23:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpalignr {{.*#+}} xmm0 {%k1} {z} = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: retq
+ %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res = select <16 x i1> %mask.cast, <16 x i8> %shuffle, <16 x i8> zeroinitializer
+ ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/vector-shuffle-mmx.ll b/test/CodeGen/X86/vector-shuffle-mmx.ll
index cfad89ec6fa4..c235e83a0d57 100644
--- a/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -6,7 +6,7 @@
define void @test0(<1 x i64>* %x) {
; X32-LABEL: test0:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -14,7 +14,7 @@ define void @test0(<1 x i64>* %x) {
; X32-NEXT: retl
;
; X64-LABEL: test0:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X64-NEXT: movq %xmm0, (%rdi)
@@ -30,14 +30,11 @@ entry:
define void @test1() {
; X32-LABEL: test1:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: pushl %edi
-; X32-NEXT: Lcfi0:
; X32-NEXT: .cfi_def_cfa_offset 8
; X32-NEXT: subl $16, %esp
-; X32-NEXT: Lcfi1:
; X32-NEXT: .cfi_def_cfa_offset 24
-; X32-NEXT: Lcfi2:
; X32-NEXT: .cfi_offset %edi, -8
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movlps %xmm0, (%esp)
@@ -52,7 +49,7 @@ define void @test1() {
; X32-NEXT: retl
;
; X64-LABEL: test1:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
@@ -78,7 +75,7 @@ entry:
define void @test2() nounwind {
; X32-LABEL: test2:
-; X32: ## BB#0: ## %entry
+; X32: ## %bb.0: ## %entry
; X32-NEXT: movl L_tmp_V2i$non_lazy_ptr, %eax
; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
@@ -86,7 +83,7 @@ define void @test2() nounwind {
; X32-NEXT: retl
;
; X64-LABEL: test2:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movq _tmp_V2i@{{.*}}(%rip), %rax
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
diff --git a/test/CodeGen/X86/vector-shuffle-sse1.ll b/test/CodeGen/X86/vector-shuffle-sse1.ll
index dfab9c9fec0c..d3597564afdb 100644
--- a/test/CodeGen/X86/vector-shuffle-sse1.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse1.ll
@@ -3,7 +3,7 @@
define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_0001:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -12,7 +12,7 @@ define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_0020:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -21,7 +21,7 @@ define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_0300:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -30,7 +30,7 @@ define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_1000:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -39,7 +39,7 @@ define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_2200:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -48,7 +48,7 @@ define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_3330:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -57,7 +57,7 @@ define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_3210:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
@@ -66,7 +66,7 @@ define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_0011:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
@@ -75,7 +75,7 @@ define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_2233:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3>
@@ -84,7 +84,7 @@ define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_0022:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -93,7 +93,7 @@ define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_1133:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -102,16 +102,34 @@ define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_0145:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
ret <4 x float> %shuffle
}
+define <4 x float> @shuffle_v4f32_0101(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_0101:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x float> %shuffle
+}
+
+define <4 x float> @shuffle_v4f32_2323(<4 x float> %a, <4 x float> %b) {
+; SSE1-LABEL: shuffle_v4f32_2323:
+; SSE1: # %bb.0:
+; SSE1-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE1-NEXT: retq
+ %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
+ ret <4 x float> %shuffle
+}
+
define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
; SSE1-LABEL: shuffle_v4f32_6723:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE1-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
@@ -120,7 +138,7 @@ define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) {
define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_4zzz:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
@@ -131,7 +149,7 @@ define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_z4zz:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
@@ -142,7 +160,7 @@ define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_zz4z:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
@@ -154,7 +172,7 @@ define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_zuu4:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE1-NEXT: movaps %xmm1, %xmm0
@@ -165,7 +183,7 @@ define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_zzz7:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -177,7 +195,7 @@ define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) {
define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
; SSE1-LABEL: shuffle_v4f32_z6zz:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[0,0]
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
@@ -188,7 +206,7 @@ define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) {
define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
; SSE1-LABEL: insert_reg_and_zero_v4f32:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: xorps %xmm1, %xmm1
; SSE1-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
@@ -200,7 +218,7 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
; SSE1-LABEL: insert_mem_and_zero_v4f32:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE1-NEXT: retq
%a = load float, float* %ptr
@@ -211,7 +229,7 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-LABEL: insert_mem_lo_v4f32:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movq (%rdi), %rax
; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; SSE1-NEXT: shrq $32, %rax
@@ -232,7 +250,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-LABEL: insert_mem_hi_v4f32:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movq (%rdi), %rax
; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; SSE1-NEXT: shrq $32, %rax
@@ -252,7 +270,7 @@ define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
; SSE1-LABEL: shuffle_mem_v4f32_3210:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movaps (%rdi), %xmm0
; SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE1-NEXT: retq
@@ -263,7 +281,7 @@ define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
; SSE1-LABEL: shuffle_mem_v4f32_0145:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE1-NEXT: retq
%b = load <4 x float>, <4 x float>* %pb, align 16
@@ -273,7 +291,7 @@ define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
define <4 x float> @shuffle_mem_v4f32_6723(<4 x float> %a, <4 x float>* %pb) {
; SSE1-LABEL: shuffle_mem_v4f32_6723:
-; SSE1: # BB#0:
+; SSE1: # %bb.0:
; SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE1-NEXT: retq
%b = load <4 x float>, <4 x float>* %pb, align 16
diff --git a/test/CodeGen/X86/vector-shuffle-sse41.ll b/test/CodeGen/X86/vector-shuffle-sse41.ll
index be9a4b950778..bcf706fc06f1 100644
--- a/test/CodeGen/X86/vector-shuffle-sse41.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse41.ll
@@ -4,12 +4,12 @@
define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
; SSE41-LABEL: blend_packusdw:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_packusdw:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
@@ -20,12 +20,12 @@ define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4
define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; SSE41-LABEL: blend_packuswb:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_packuswb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
%p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
@@ -36,14 +36,14 @@ define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8
define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
; SSE41-LABEL: blend_packusdw_packuswb:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE41-NEXT: retq
;
; AVX-LABEL: blend_packusdw_packuswb:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
diff --git a/test/CodeGen/X86/vector-shuffle-sse4a.ll b/test/CodeGen/X86/vector-shuffle-sse4a.ll
index e458bb6fa52f..501d91572605 100644
--- a/test/CodeGen/X86/vector-shuffle-sse4a.ll
+++ b/test/CodeGen/X86/vector-shuffle-sse4a.ll
@@ -10,7 +10,7 @@
; A length of zero is equivalent to a bit length of 64.
define <2 x i64> @extrqi_len0_idx0(<2 x i64> %a) {
; ALL-LABEL: extrqi_len0_idx0:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 0, i8 0)
ret <2 x i64> %1
@@ -18,7 +18,7 @@ define <2 x i64> @extrqi_len0_idx0(<2 x i64> %a) {
define <2 x i64> @extrqi_len8_idx16(<2 x i64> %a) {
; ALL-LABEL: extrqi_len8_idx16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 8, i8 16)
@@ -28,7 +28,7 @@ define <2 x i64> @extrqi_len8_idx16(<2 x i64> %a) {
; If the length + index exceeds the bottom 64 bits, the result is undefined.
define <2 x i64> @extrqi_len32_idx48(<2 x i64> %a) {
; ALL-LABEL: extrqi_len32_idx48:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a, i8 32, i8 48)
@@ -37,17 +37,17 @@ define <2 x i64> @extrqi_len32_idx48(<2 x i64> %a) {
define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
; AMD10H-LABEL: shuf_0zzzuuuuuuuuuuuu:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_0zzzuuuuuuuuuuuu:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzzuuuuuuuuuuuu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER2-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -56,7 +56,7 @@ define <16 x i8> @shuf_0zzzuuuuuuuuuuuu(<16 x i8> %a0) {
define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
; AMD10H-LABEL: shuf_0zzzzzzz1zzzzzzz:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: movdqa %xmm0, %xmm1
; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -64,12 +64,12 @@ define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_0zzzzzzz1zzzzzzz:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzzzzzz1zzzzzzz:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER2-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -78,7 +78,7 @@ define <16 x i8> @shuf_0zzzzzzz1zzzzzzz(<16 x i8> %a0) {
define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
; AMD10H-LABEL: shuf_2zzzzzzz3zzzzzzz:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: movdqa %xmm0, %xmm1
; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[3],zero,zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -86,12 +86,12 @@ define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_2zzzzzzz3zzzzzzz:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_2zzzzzzz3zzzzzzz:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrld $16, %xmm0, %xmm0
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; BTVER2-NEXT: retq
@@ -101,17 +101,17 @@ define <16 x i8> @shuf_2zzzzzzz3zzzzzzz(<16 x i8> %a0) {
define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
; AMD10H-LABEL: shuf_01zzuuuuuuuuuuuu:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_01zzuuuuuuuuuuuu:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_01zzuuuuuuuuuuuu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BTVER2-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -120,7 +120,7 @@ define <16 x i8> @shuf_01zzuuuuuuuuuuuu(<16 x i8> %a0) {
define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
; AMD10H-LABEL: shuf_01zzzzzz23zzzzzz:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: movdqa %xmm0, %xmm1
; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -128,12 +128,12 @@ define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_01zzzzzz23zzzzzz:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_01zzzzzz23zzzzzz:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BTVER2-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -142,7 +142,7 @@ define <16 x i8> @shuf_01zzzzzz23zzzzzz(<16 x i8> %a0) {
define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
; ALL-LABEL: shuf_1zzzuuuuuuuuuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 1, i32 16, i32 16, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -151,7 +151,7 @@ define <16 x i8> @shuf_1zzzuuuuuuuuuuuu(<16 x i8> %a0) {
define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
; ALL-LABEL: shuf_1zzzuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -160,7 +160,7 @@ define <8 x i16> @shuf_1zzzuuuu(<8 x i16> %a0) {
define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
; ALL-LABEL: shuf_12zzuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4,5],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 2, i32 8, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -169,17 +169,17 @@ define <8 x i16> @shuf_12zzuuuu(<8 x i16> %a0) {
define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
; AMD10H-LABEL: shuf_012zuuuu:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_012zuuuu:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_012zuuuu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
; BTVER2-NEXT: retq
@@ -189,7 +189,7 @@ define <8 x i16> @shuf_012zuuuu(<8 x i16> %a0) {
define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
; AMD10H-LABEL: shuf_0zzz1zzz:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: movdqa %xmm0, %xmm1
; AMD10H-NEXT: extrq {{.*#+}} xmm1 = xmm1[2,3],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AMD10H-NEXT: extrq {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
@@ -197,12 +197,12 @@ define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_0zzz1zzz:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[2,3],zero,zero,zero,zero,zero,zero
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0zzz1zzz:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; BTVER2-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8>
@@ -211,19 +211,19 @@ define <8 x i16> @shuf_0zzz1zzz(<8 x i16> %a0) {
define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
; AMD10H-LABEL: shuf_0z1z:
-; AMD10H: # BB#0:
-; AMD10H-NEXT: pxor %xmm1, %xmm1
-; AMD10H-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AMD10H: # %bb.0:
+; AMD10H-NEXT: xorps %xmm1, %xmm1
+; AMD10H-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuf_0z1z:
-; BTVER1: # BB#0:
-; BTVER1-NEXT: pxor %xmm1, %xmm1
-; BTVER1-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BTVER1: # %bb.0:
+; BTVER1-NEXT: xorps %xmm1, %xmm1
+; BTVER1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuf_0z1z:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; BTVER2-NEXT: retq
%s = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 4>
@@ -237,17 +237,17 @@ define <4 x i32> @shuf_0z1z(<4 x i32> %a0) {
; A length of zero is equivalent to a bit length of 64.
define <2 x i64> @insertqi_len0_idx0(<2 x i64> %a, <2 x i64> %b) {
; AMD10H-LABEL: insertqi_len0_idx0:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: movaps %xmm1, %xmm0
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: insertqi_len0_idx0:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: movaps %xmm1, %xmm0
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: insertqi_len0_idx0:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vmovaps %xmm1, %xmm0
; BTVER2-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 0, i8 0)
@@ -256,7 +256,7 @@ define <2 x i64> @insertqi_len0_idx0(<2 x i64> %a, <2 x i64> %b) {
define <2 x i64> @insertqi_len8_idx16(<2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: insertqi_len8_idx16:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 8, i8 16)
@@ -266,7 +266,7 @@ define <2 x i64> @insertqi_len8_idx16(<2 x i64> %a, <2 x i64> %b) {
; If the length + index exceeds the bottom 64 bits, the result is undefined.
define <2 x i64> @insertqi_len32_idx48(<2 x i64> %a, <2 x i64> %b) {
; ALL-LABEL: insertqi_len32_idx48:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a, <2 x i64> %b, i8 32, i8 48)
@@ -275,7 +275,7 @@ define <2 x i64> @insertqi_len32_idx48(<2 x i64> %a, <2 x i64> %b) {
define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: shuf_0_0_2_3_uuuu_uuuu_uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 0, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -284,7 +284,7 @@ define <16 x i8> @shuf_0_0_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: shuf_0_16_2_3_uuuu_uuuu_uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -293,7 +293,7 @@ define <16 x i8> @shuf_0_16_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
; ALL-LABEL: shuf_16_1_2_3_uuuu_uuuu_uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -302,7 +302,7 @@ define <16 x i8> @shuf_16_1_2_3_uuuu_uuuu_uuuu(<16 x i8> %a0, <16 x i8> %a1) {
define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0823uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1],xmm0[4,5,6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -311,7 +311,7 @@ define <8 x i16> @shuf_0823uuuu(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0183uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[0,1],xmm0[6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -320,7 +320,7 @@ define <8 x i16> @shuf_0183uuuu(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0128uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[0,1],xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -329,7 +329,7 @@ define <8 x i16> @shuf_0128uuuu(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_0893uuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -338,7 +338,7 @@ define <8 x i16> @shuf_0893uuuu(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_089Auuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3,4,5],xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -347,7 +347,7 @@ define <8 x i16> @shuf_089Auuuu(<8 x i16> %a0, <8 x i16> %a1) {
define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
; ALL-LABEL: shuf_089uuuuu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: insertq {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,1,2,3],xmm0[6,7,u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%s = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -361,7 +361,7 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) {
; Out of range.
define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AMD10H-NEXT: andpd {{.*}}(%rip), %xmm0
; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -370,14 +370,14 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: psrld $16, %xmm1
; BTVER1-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; BTVER1-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuffle_8_18_uuuuuuuuuuuuuu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpsrld $16, %xmm1, %xmm1
; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -388,19 +388,19 @@ define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> %b) {
define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
; AMD10H-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,5,5,4,4,5,5,4,4,5,5,6,6,7,7]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,5,5,4,4,5,5,4,4,5,5,6,6,7,7]
; BTVER2-NEXT: retq
%1 = shufflevector <16 x i8> %v, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 0, i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -409,18 +409,18 @@ define <16 x i8> @shuffle_uu_0_5_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8
define <16 x i8> @shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
; AMD10H-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AMD10H: # BB#0:
+; AMD10H: # %bb.0:
; AMD10H-NEXT: psrlq $16, %xmm0
; AMD10H-NEXT: pand {{.*}}(%rip), %xmm0
; AMD10H-NEXT: retq
;
; BTVER1-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; BTVER1: # BB#0:
+; BTVER1: # %bb.0:
; BTVER1-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u],zero,xmm0[4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; BTVER1-NEXT: retq
;
; BTVER2-LABEL: shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; BTVER2: # BB#0:
+; BTVER2: # %bb.0:
; BTVER2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,xmm0[4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; BTVER2-NEXT: retq
%1 = shufflevector <16 x i8> %v, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 16, i32 4, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -429,7 +429,7 @@ define <16 x i8> @shuffle_uu_16_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i
define <16 x i8> @shuffle_uu_uu_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<16 x i8> %v) {
; ALL-LABEL: shuffle_uu_uu_4_16_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: extrq {{.*#+}} xmm0 = xmm0[2,3,4],zero,zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; ALL-NEXT: retq
%1 = shufflevector <16 x i8> %v, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 undef, i32 4, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 4bcf18cc727e..9c92ca756ebd 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -1,15 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf2i1_1_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
+; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0
@@ -23,14 +36,29 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: movl $1, %eax
; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf2i1_1_2:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z}
+; AVX512VL-NEXT: movb $1, %al
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm2 {%k1} {z}
+; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpsllq $63, %xmm1, %xmm1
+; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vptestmq %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: movb $1, %al
@@ -48,12 +76,24 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf4i1_3_2_10:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} {z}
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
+; AVX512VL-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vptestmd %xmm0, %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %xmm0
@@ -67,20 +107,35 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
@@ -97,7 +152,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
@@ -111,8 +166,23 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpcmpeqd %zmm2, %zmm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2
+; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpslld $31, %zmm2, %zmm0
+; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT: vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT: vpmovm2d %k1, %zmm0
@@ -131,20 +201,29 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
+; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT: vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqu16 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT: vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT: vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %ymm0
@@ -155,23 +234,38 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0
+; VL_BW_DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
@@ -184,29 +278,43 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
+; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -217,25 +325,37 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
+; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -246,29 +366,43 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
+; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -279,29 +413,43 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; VL_BW_DQ-NEXT: vpxord %zmm2, %zmm2, %zmm2
+; VL_BW_DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -312,7 +460,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: movb $51, %al
; AVX512F-NEXT: kmovw %eax, %k2
@@ -323,22 +471,36 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: movb $51, %al
+; AVX512VL-NEXT: kmovw %eax, %k2
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512VL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
+; AVX512VL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
-; VL_BW_DQ-NEXT: movb $51, %al
-; VL_BW_DQ-NEXT: kmovd %eax, %k1
-; VL_BW_DQ-NEXT: vpmovm2q %k1, %zmm0
-; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm1
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; VL_BW_DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
+; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
+; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i8 %a to <8 x i1>
@@ -349,7 +511,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
@@ -360,12 +522,28 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %al killed %al killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm0
+; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
+; AVX512VL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpsllq $63, %zmm2, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %al killed %al killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0
; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
@@ -374,7 +552,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; VL_BW_DQ-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT: vpmovq2m %zmm2, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
@@ -385,25 +563,37 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0
; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
-; AVX512F-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT: # kill: def %ax killed %ax killed %eax
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: kmovw %edi, %k1
+; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0
+; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, %eax
+; AVX512VL-NEXT: # kill: def %ax killed %ax killed %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovd %k0, %eax
-; VL_BW_DQ-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
+; VL_BW_DQ-NEXT: # kill: def %ax killed %ax killed %eax
; VL_BW_DQ-NEXT: vzeroupper
; VL_BW_DQ-NEXT: retq
%b = bitcast i16 %a to <16 x i1>
@@ -414,14 +604,11 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-LABEL: shuf64i1_zero:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: .Lcfi0:
; AVX512F-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-NEXT: .Lcfi1:
; AVX512F-NEXT: .cfi_offset %rbp, -16
; AVX512F-NEXT: movq %rsp, %rbp
-; AVX512F-NEXT: .Lcfi2:
; AVX512F-NEXT: .cfi_def_cfa_register %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $96, %rsp
@@ -448,8 +635,40 @@ define i64 @shuf64i1_zero(i64 %a) {
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
+; AVX512VL-LABEL: shuf64i1_zero:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %rbp
+; AVX512VL-NEXT: .cfi_def_cfa_offset 16
+; AVX512VL-NEXT: .cfi_offset %rbp, -16
+; AVX512VL-NEXT: movq %rsp, %rbp
+; AVX512VL-NEXT: .cfi_def_cfa_register %rbp
+; AVX512VL-NEXT: andq $-32, %rsp
+; AVX512VL-NEXT: subq $96, %rsp
+; AVX512VL-NEXT: movl %edi, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpmovsxbd %xmm1, %zmm1
+; AVX512VL-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; AVX512VL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovw %k0, (%rsp)
+; AVX512VL-NEXT: movl (%rsp), %ecx
+; AVX512VL-NEXT: movq %rcx, %rax
+; AVX512VL-NEXT: shlq $32, %rax
+; AVX512VL-NEXT: orq %rcx, %rax
+; AVX512VL-NEXT: movq %rbp, %rsp
+; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
; VL_BW_DQ-LABEL: shuf64i1_zero:
-; VL_BW_DQ: # BB#0:
+; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovq %rdi, %k0
; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0
; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0
diff --git a/test/CodeGen/X86/vector-shuffle-v48.ll b/test/CodeGen/X86/vector-shuffle-v48.ll
index 06b7c2e64723..3042d117d33a 100644
--- a/test/CodeGen/X86/vector-shuffle-v48.ll
+++ b/test/CodeGen/X86/vector-shuffle-v48.ll
@@ -2,19 +2,18 @@
; RUN: llc -mtriple=x86_64-pc-linux -mattr=+avx2 < %s | FileCheck %s
define <32 x i8> @foo(<48 x i8>* %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
-; CHECK-NEXT: vmovdqu (%rdi), %ymm0
-; CHECK-NEXT: vmovdqu 32(%rdi), %xmm1
-; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
-; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,2,3,5,6]
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,3,4,6,7,9,10,12,13,15],zero,zero,zero,zero,zero
-; CHECK-NEXT: vpor %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,11,12,14,15,u,u,u,u,u,u,u,u,u,u]
-; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqu 32(%rdi), %xmm0
+; CHECK-NEXT: vmovdqu (%rdi), %ymm1
+; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,0,2,3,5,6]
+; CHECK-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,3,4,6,7,9,10,12,13,15,u,u,u,u,u,24,25,27,28,30,31,u,u,u,u,u,u,u,u,u,u]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u>
+; CHECK-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,1,2,4,5,7,8,10,11,13,14]
+; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0]
-; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = load <48 x i8>, <48 x i8>* %x0, align 1
%2 = shufflevector <48 x i8> %1, <48 x i8> undef, <32 x i32> <i32 0, i32 1, i32 3, i32 4, i32 6, i32 7, i32 9, i32 10, i32 12, i32 13, i32 15, i32 16, i32 18, i32 19, i32 21, i32 22, i32 24, i32 25, i32 27, i32 28, i32 30, i32 31, i32 33, i32 34, i32 36, i32 37, i32 39, i32 40, i32 42, i32 43, i32 45, i32 46>
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 5e9e78d0b1f0..0367737dda60 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -11,7 +11,7 @@
define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andl $1, %esi
; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -20,7 +20,7 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6
; SSE-NEXT: retq
;
; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: andl $1, %esi
; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -36,27 +36,27 @@ define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i6
define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
-; SSE: # BB#0:
-; SSE-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE: # %bb.0:
+; SSE-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE-NEXT: # kill: def %edi killed %edi def %rdi
; SSE-NEXT: andl $1, %edi
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $1, %esi
-; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $1, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: andl $1, %esi
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT: retq
%x0 = extractelement <2 x i64> %x, i32 %i0
%x1 = extractelement <2 x i64> %x, i32 %i1
@@ -67,11 +67,11 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
-; SSE2: # BB#0:
-; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE2-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE2-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE2-NEXT: # kill: def %edi killed %edi def %rdi
; SSE2-NEXT: andl $3, %edi
; SSE2-NEXT: andl $3, %esi
; SSE2-NEXT: andl $3, %edx
@@ -83,15 +83,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSSE3-NEXT: # kill: def %edx killed %edx def %rdx
+; SSSE3-NEXT: # kill: def %esi killed %esi def %rsi
+; SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
; SSSE3-NEXT: andl $3, %edi
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: andl $3, %edx
@@ -103,15 +103,15 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41: # %bb.0:
+; SSE41-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE41-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE41-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $3, %edi
; SSE41-NEXT: andl $3, %esi
; SSE41-NEXT: andl $3, %edx
@@ -124,11 +124,11 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $3, %edi
; AVX-NEXT: andl $3, %esi
; AVX-NEXT: andl $3, %edx
@@ -152,51 +152,51 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
-; SSE2: # BB#0:
-; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE2-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE2-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE2-NEXT: # kill: def %edi killed %edi def %rdi
; SSE2-NEXT: andl $3, %edi
; SSE2-NEXT: andl $3, %esi
; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSSE3-NEXT: # kill: def %edx killed %edx def %rdx
+; SSSE3-NEXT: # kill: def %esi killed %esi def %rsi
+; SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
; SSSE3-NEXT: andl $3, %edi
; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $3, %ecx
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41: # %bb.0:
+; SSE41-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE41-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE41-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $3, %edi
; SSE41-NEXT: andl $3, %esi
; SSE41-NEXT: andl $3, %edx
@@ -209,11 +209,11 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $3, %edi
; AVX-NEXT: andl $3, %esi
; AVX-NEXT: andl $3, %edx
@@ -237,13 +237,13 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
-; SSE2: # BB#0:
-; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSE2-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSE2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE2-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE2-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE2-NEXT: # kill: def %edi killed %edi def %rdi
; SSE2-NEXT: andl $7, %edi
; SSE2-NEXT: andl $7, %esi
; SSE2-NEXT: andl $7, %edx
@@ -255,39 +255,39 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE2-NEXT: andl $7, %r10d
; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $7, %eax
-; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax
-; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm1
+; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %ecx
+; SSE2-NEXT: movd %ecx, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSSE3-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSSE3-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSSE3-NEXT: # kill: def %edx killed %edx def %rdx
+; SSSE3-NEXT: # kill: def %esi killed %esi def %rsi
+; SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
; SSSE3-NEXT: andl $7, %edi
; SSSE3-NEXT: andl $7, %esi
; SSSE3-NEXT: andl $7, %edx
@@ -299,39 +299,39 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSSE3-NEXT: andl $7, %r10d
; SSSE3-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $7, %eax
-; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm1
+; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %ecx
+; SSSE3-NEXT: movd %ecx, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax
; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
-; SSE41: # BB#0:
-; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41: # %bb.0:
+; SSE41-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSE41-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSE41-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE41-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE41-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $7, %edi
; SSE41-NEXT: andl $7, %esi
; SSE41-NEXT: andl $7, %edx
@@ -355,13 +355,13 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $7, %edi
; AVX-NEXT: andl $7, %esi
; AVX-NEXT: andl $7, %edx
@@ -404,13 +404,13 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind {
; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; SSE2: # BB#0:
-; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSE2-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSE2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE2-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE2-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE2-NEXT: # kill: def %edi killed %edi def %rdi
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT: andl $15, %eax
@@ -489,13 +489,13 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSSE3-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSSE3-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSSE3-NEXT: # kill: def %edx killed %edx def %rdx
+; SSSE3-NEXT: # kill: def %esi killed %esi def %rsi
+; SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT: andl $15, %eax
@@ -574,13 +574,13 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; SSE41: # BB#0:
-; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41: # %bb.0:
+; SSE41-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSE41-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSE41-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE41-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE41-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $15, %edi
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
@@ -629,13 +629,13 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $15, %edi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
@@ -723,7 +723,7 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwind {
; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movl (%rdi), %eax
; SSE2-NEXT: movl 4(%rdi), %ecx
; SSE2-NEXT: andl $3, %eax
@@ -733,17 +733,17 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: movl 12(%rdi), %esi
; SSE2-NEXT: andl $3, %esi
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movl (%rdi), %eax
; SSSE3-NEXT: movl 4(%rdi), %ecx
; SSSE3-NEXT: andl $3, %eax
@@ -753,17 +753,17 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
; SSSE3-NEXT: andl $3, %edx
; SSSE3-NEXT: movl 12(%rdi), %esi
; SSSE3-NEXT: andl $3, %esi
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movl (%rdi), %eax
; SSE41-NEXT: movl 4(%rdi), %ecx
; SSE41-NEXT: andl $3, %eax
@@ -780,7 +780,7 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
; SSE41-NEXT: retq
;
; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl (%rdi), %eax
; AVX-NEXT: movl 4(%rdi), %ecx
; AVX-NEXT: andl $3, %eax
@@ -816,7 +816,7 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8* %i) nounwind {
; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl 15(%rdi), %edx
@@ -901,7 +901,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE2-NEXT: retq
;
; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: movzbl 15(%rdi), %edx
@@ -986,7 +986,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: andl $15, %eax
; SSE41-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -1041,7 +1041,7 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
; SSE41-NEXT: retq
;
; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movzbl (%rdi), %eax
; AVX-NEXT: andl $15, %eax
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -1167,10 +1167,10 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
-; SSE: # BB#0:
-; SSE-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE: # %bb.0:
+; SSE-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE-NEXT: # kill: def %edi killed %edi def %rdi
; SSE-NEXT: andl $3, %edi
; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: andl $3, %edx
@@ -1180,14 +1180,14 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float>
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $3, %edi
; AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; AVX-NEXT: andl $3, %edx
@@ -1197,7 +1197,7 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float>
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%x0 = extractelement <4 x float> %x, i32 %i0
%x1 = extractelement <4 x float> %x, i32 %i1
@@ -1212,13 +1212,13 @@ define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float>
define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
-; SSE2: # BB#0:
-; SSE2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSE2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSE2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSE2-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSE2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE2-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE2-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE2-NEXT: # kill: def %edi killed %edi def %rdi
; SSE2-NEXT: andl $7, %edi
; SSE2-NEXT: andl $7, %esi
; SSE2-NEXT: andl $7, %edx
@@ -1249,13 +1249,13 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSSE3-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSSE3-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSSE3-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSSE3-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSSE3-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSSE3-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSSE3-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSSE3-NEXT: # kill: def %edx killed %edx def %rdx
+; SSSE3-NEXT: # kill: def %esi killed %esi def %rsi
+; SSSE3-NEXT: # kill: def %edi killed %edi def %rdi
; SSSE3-NEXT: andl $7, %edi
; SSSE3-NEXT: andl $7, %esi
; SSSE3-NEXT: andl $7, %edx
@@ -1286,13 +1286,13 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
-; SSE41: # BB#0:
-; SSE41-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; SSE41-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; SSE41-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; SSE41-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; SSE41-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; SSE41-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SSE41: # %bb.0:
+; SSE41-NEXT: # kill: def %r9d killed %r9d def %r9
+; SSE41-NEXT: # kill: def %r8d killed %r8d def %r8
+; SSE41-NEXT: # kill: def %ecx killed %ecx def %rcx
+; SSE41-NEXT: # kill: def %edx killed %edx def %rdx
+; SSE41-NEXT: # kill: def %esi killed %esi def %rsi
+; SSE41-NEXT: # kill: def %edi killed %edi def %rdi
; SSE41-NEXT: andl $7, %edi
; SSE41-NEXT: andl $7, %esi
; SSE41-NEXT: andl $7, %edx
@@ -1311,13 +1311,13 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
; SSE41-NEXT: retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
-; AVX: # BB#0:
-; AVX-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX-NEXT: # kill: def %edi killed %edi def %rdi
; AVX-NEXT: andl $7, %edi
; AVX-NEXT: andl $7, %esi
; AVX-NEXT: andl $7, %edx
diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll
index 4ca878ef7970..91672d07b052 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -8,7 +8,7 @@
define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_xxxx_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
@@ -39,7 +39,7 @@ define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0,
define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v4f64_uxx0_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
@@ -67,7 +67,7 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: andl $1, %esi
; ALL-NEXT: andl $1, %edi
; ALL-NEXT: andl $1, %ecx
@@ -91,49 +91,27 @@ define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0,
}
define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
-; AVX1-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: andl $3, %edi
-; AVX1-NEXT: andl $3, %esi
-; AVX1-NEXT: andl $3, %edx
-; AVX1-NEXT: andl $3, %ecx
-; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: andl $3, %edi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: andl $3, %edx
-; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: retq
+; ALL-LABEL: var_shuffle_v4i64_v4i64_xxxx_i64:
+; ALL: # %bb.0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: andl $3, %edi
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: andl $3, %ecx
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
%x0 = extractelement <4 x i64> %x, i64 %i0
%x1 = extractelement <4 x i64> %x, i64 %i1
%x2 = extractelement <4 x i64> %x, i64 %i2
@@ -146,41 +124,22 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
}
define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
-; AVX1-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: andl $3, %edi
-; AVX1-NEXT: andl $3, %esi
-; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: andl $3, %edi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: retq
+; ALL-LABEL: var_shuffle_v4i64_v4i64_xx00_i64:
+; ALL: # %bb.0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: andl $3, %edi
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovaps %xmm0, %xmm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
%x0 = extractelement <4 x i64> %x, i64 %i0
%x1 = extractelement <4 x i64> %x, i64 %i1
%x2 = extractelement <4 x i64> %x, i64 %i2
@@ -193,37 +152,21 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
}
define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
-; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: andl $1, %edi
-; AVX1-NEXT: andl $1, %esi
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: andl $1, %edi
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
+; ALL: # %bb.0:
+; ALL-NEXT: andl $1, %edi
+; ALL-NEXT: andl $1, %esi
+; ALL-NEXT: andl $1, %edx
+; ALL-NEXT: andl $1, %ecx
+; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
%x0 = extractelement <2 x i64> %x, i64 %i0
%x1 = extractelement <2 x i64> %x, i64 %i1
%x2 = extractelement <2 x i64> %x, i64 %i2
@@ -237,17 +180,17 @@ define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i
define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; ALL: # BB#0:
+; ALL: # %bb.0:
; ALL-NEXT: pushq %rbp
; ALL-NEXT: movq %rsp, %rbp
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
-; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT: # kill: def %r9d killed %r9d def %r9
+; ALL-NEXT: # kill: def %r8d killed %r8d def %r8
+; ALL-NEXT: # kill: def %ecx killed %ecx def %rcx
+; ALL-NEXT: # kill: def %edx killed %edx def %rdx
+; ALL-NEXT: # kill: def %esi killed %esi def %rsi
+; ALL-NEXT: # kill: def %edi killed %edi def %rdi
; ALL-NEXT: andl $7, %edi
; ALL-NEXT: andl $7, %esi
; ALL-NEXT: andl $7, %edx
@@ -292,13 +235,13 @@ define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0
define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
; ALL-LABEL: var_shuffle_v8f32_v4f32_xxxxxxxx_i32:
-; ALL: # BB#0:
-; ALL-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; ALL-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; ALL-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; ALL-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; ALL-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; ALL-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL: # %bb.0:
+; ALL-NEXT: # kill: def %r9d killed %r9d def %r9
+; ALL-NEXT: # kill: def %r8d killed %r8d def %r8
+; ALL-NEXT: # kill: def %ecx killed %ecx def %rcx
+; ALL-NEXT: # kill: def %edx killed %edx def %rdx
+; ALL-NEXT: # kill: def %esi killed %esi def %rsi
+; ALL-NEXT: # kill: def %edi killed %edi def %rdi
; ALL-NEXT: andl $3, %edi
; ALL-NEXT: andl $3, %esi
; ALL-NEXT: andl $3, %edx
@@ -341,125 +284,125 @@ define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0
define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX1-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX1-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX1-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX1-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX1-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: andl $15, %edi
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
+; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $15, %r8d
+; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $15, %r9d
+; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
+; AVX1-NEXT: movl 16(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: movl 24(%rbp), %eax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT: movl 32(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: movl 40(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl 48(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl 56(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl 64(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl 72(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl 80(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX1-NEXT: movl 88(%rbp), %eax
-; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX1-NEXT: andl $15, %edi
-; AVX1-NEXT: movzwl (%rsp,%rdi,2), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: andl $15, %esi
-; AVX1-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $15, %edx
-; AVX1-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $15, %ecx
-; AVX1-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $15, %r8d
-; AVX1-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $15, %r9d
-; AVX1-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
-; AVX1-NEXT: movl 16(%rbp), %eax
-; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: movl 24(%rbp), %eax
+; AVX1-NEXT: movl 88(%rbp), %eax
; AVX1-NEXT: andl $15, %eax
; AVX1-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: movq %rsp, %rbp
; AVX2-NEXT: andq $-32, %rsp
; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX2-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX2-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX2-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX2-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: andl $15, %edi
; AVX2-NEXT: vmovaps %ymm0, (%rsp)
+; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $15, %r8d
+; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $15, %r9d
+; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm0, %xmm0
+; AVX2-NEXT: movl 16(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: movl 24(%rbp), %eax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT: movl 32(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: movl 40(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $1, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl 48(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl 56(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $3, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl 64(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl 72(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $5, (%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl 80(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX2-NEXT: movl 88(%rbp), %eax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
-; AVX2-NEXT: andl $15, %edi
-; AVX2-NEXT: movzwl (%rsp,%rdi,2), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: andl $15, %esi
-; AVX2-NEXT: vpinsrw $1, (%rsp,%rsi,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: vpinsrw $2, (%rsp,%rdx,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vpinsrw $3, (%rsp,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $15, %r8d
-; AVX2-NEXT: vpinsrw $4, (%rsp,%r8,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $15, %r9d
-; AVX2-NEXT: vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
-; AVX2-NEXT: movl 16(%rbp), %eax
-; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: movl 24(%rbp), %eax
+; AVX2-NEXT: movl 88(%rbp), %eax
; AVX2-NEXT: andl $15, %eax
; AVX2-NEXT: vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
@@ -500,115 +443,115 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, i32 %i11, i32 %i12, i32 %i13, i32 %i14, i32 %i15) nounwind {
; AVX1-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
-; AVX1: # BB#0:
-; AVX1-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX1-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX1-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX1-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX1-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX1-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX1-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX1-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX1-NEXT: andl $7, %edi
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $7, %ecx
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $7, %r8d
+; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVX1-NEXT: andl $7, %r9d
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX1-NEXT: andl $7, %edi
-; AVX1-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: andl $7, %esi
-; AVX1-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $7, %edx
-; AVX1-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $7, %ecx
-; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $7, %r8d
-; AVX1-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
-; AVX1-NEXT: andl $7, %r9d
-; AVX1-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX1-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX1-NEXT: andl $7, %eax
; AVX1-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX2-NEXT: # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX2-NEXT: # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX2-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def %r9d killed %r9d def %r9
+; AVX2-NEXT: # kill: def %r8d killed %r8d def %r8
+; AVX2-NEXT: # kill: def %ecx killed %ecx def %rcx
+; AVX2-NEXT: # kill: def %edx killed %edx def %rdx
+; AVX2-NEXT: # kill: def %esi killed %esi def %rsi
+; AVX2-NEXT: # kill: def %edi killed %edi def %rdi
+; AVX2-NEXT: andl $7, %edi
; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: andl $7, %esi
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $7, %edx
+; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $7, %ecx
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $7, %r8d
+; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
+; AVX2-NEXT: andl $7, %r9d
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: movzwl -24(%rsp,%rax,2), %eax
+; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX2-NEXT: andl $7, %edi
-; AVX2-NEXT: movzwl -24(%rsp,%rdi,2), %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: andl $7, %esi
-; AVX2-NEXT: vpinsrw $1, -24(%rsp,%rsi,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $7, %edx
-; AVX2-NEXT: vpinsrw $2, -24(%rsp,%rdx,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $7, %ecx
-; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $7, %r8d
-; AVX2-NEXT: vpinsrw $4, -24(%rsp,%r8,2), %xmm1, %xmm1
-; AVX2-NEXT: andl $7, %r9d
-; AVX2-NEXT: vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
; AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax
; AVX2-NEXT: andl $7, %eax
; AVX2-NEXT: vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%x0 = extractelement <8 x i16> %x, i32 %i0
%x1 = extractelement <8 x i16> %x, i32 %i1
@@ -650,57 +593,31 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
;
define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwind {
-; AVX1-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: movq %rsp, %rbp
-; AVX1-NEXT: andq $-32, %rsp
-; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: movq (%rdi), %rax
-; AVX1-NEXT: movq 8(%rdi), %rcx
-; AVX1-NEXT: andl $3, %eax
-; AVX1-NEXT: andl $3, %ecx
-; AVX1-NEXT: movq 16(%rdi), %rdx
-; AVX1-NEXT: andl $3, %edx
-; AVX1-NEXT: movq 24(%rdi), %rsi
-; AVX1-NEXT: andl $3, %esi
-; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: movq %rbp, %rsp
-; AVX1-NEXT: popq %rbp
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: movq %rsp, %rbp
-; AVX2-NEXT: andq $-32, %rsp
-; AVX2-NEXT: subq $64, %rsp
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: andl $3, %eax
-; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: movq 16(%rdi), %rdx
-; AVX2-NEXT: andl $3, %edx
-; AVX2-NEXT: movq 24(%rdi), %rsi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: vmovaps %ymm0, (%rsp)
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: movq %rbp, %rsp
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: retq
+; ALL-LABEL: mem_shuffle_v4i64_v4i64_xxxx_i64:
+; ALL: # %bb.0:
+; ALL-NEXT: pushq %rbp
+; ALL-NEXT: movq %rsp, %rbp
+; ALL-NEXT: andq $-32, %rsp
+; ALL-NEXT: subq $64, %rsp
+; ALL-NEXT: movq (%rdi), %rax
+; ALL-NEXT: movq 8(%rdi), %rcx
+; ALL-NEXT: andl $3, %eax
+; ALL-NEXT: andl $3, %ecx
+; ALL-NEXT: movq 16(%rdi), %rdx
+; ALL-NEXT: andl $3, %edx
+; ALL-NEXT: movq 24(%rdi), %rsi
+; ALL-NEXT: andl $3, %esi
+; ALL-NEXT: vmovaps %ymm0, (%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: movq %rbp, %rsp
+; ALL-NEXT: popq %rbp
+; ALL-NEXT: retq
%p0 = getelementptr inbounds i64, i64* %i, i32 0
%p1 = getelementptr inbounds i64, i64* %i, i32 1
%p2 = getelementptr inbounds i64, i64* %i, i32 2
@@ -721,45 +638,25 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
}
define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwind {
-; AVX1-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
-; AVX1: # BB#0:
-; AVX1-NEXT: movq (%rdi), %rax
-; AVX1-NEXT: movq 8(%rdi), %rcx
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: andl $1, %ecx
-; AVX1-NEXT: movq 16(%rdi), %rdx
-; AVX1-NEXT: andl $1, %edx
-; AVX1-NEXT: movq 24(%rdi), %rsi
-; AVX1-NEXT: andl $1, %esi
-; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: andl $1, %ecx
-; AVX2-NEXT: movq 16(%rdi), %rdx
-; AVX2-NEXT: andl $1, %edx
-; AVX2-NEXT: movq 24(%rdi), %rsi
-; AVX2-NEXT: andl $1, %esi
-; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
+; ALL-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
+; ALL: # %bb.0:
+; ALL-NEXT: movq (%rdi), %rax
+; ALL-NEXT: movq 8(%rdi), %rcx
+; ALL-NEXT: andl $1, %eax
+; ALL-NEXT: andl $1, %ecx
+; ALL-NEXT: movq 16(%rdi), %rdx
+; ALL-NEXT: andl $1, %edx
+; ALL-NEXT: movq 24(%rdi), %rsi
+; ALL-NEXT: andl $1, %esi
+; ALL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; ALL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: retq
%p0 = getelementptr inbounds i64, i64* %i, i32 0
%p1 = getelementptr inbounds i64, i64* %i, i32 1
%p2 = getelementptr inbounds i64, i64* %i, i32 2
diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll
index c5ac4466b5fa..1e6b3c1358b3 100644
--- a/test/CodeGen/X86/vector-sqrt.ll
+++ b/test/CodeGen/X86/vector-sqrt.ll
@@ -4,7 +4,7 @@
; Function Attrs: nounwind readonly uwtable
define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK-LABEL: sqrtd2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vsqrtsd 8(%rdi), %xmm1, %xmm1
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -26,7 +26,7 @@ declare double @sqrt(double) local_unnamed_addr #1
; Function Attrs: nounwind readonly uwtable
define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 {
; CHECK-LABEL: sqrtf4:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1
; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index d4fbb72bbe6d..7d26cec9928a 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -12,14 +12,14 @@
define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
@@ -29,19 +29,19 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v4i64_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, %a1
@@ -51,27 +51,26 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: paddq %xmm4, %xmm0
+; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -92,7 +91,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -102,12 +101,12 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -119,7 +118,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
@@ -130,7 +129,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -143,19 +142,19 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, %a1
@@ -165,7 +164,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
@@ -193,7 +192,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -230,7 +229,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
@@ -257,38 +256,16 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_add_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_add_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = add <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -296,7 +273,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm5, %xmm1
; SSE-NEXT: paddd %xmm6, %xmm2
@@ -312,7 +289,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -333,7 +310,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -349,7 +326,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -361,7 +338,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -371,7 +348,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -384,7 +361,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -395,7 +372,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -403,15 +380,15 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -424,7 +401,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm1
@@ -436,7 +413,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -448,7 +425,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -457,8 +434,8 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
@@ -477,13 +454,13 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
@@ -491,7 +468,7 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
@@ -499,8 +476,8 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -512,24 +489,23 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -545,7 +521,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -558,7 +534,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -570,7 +546,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
@@ -580,7 +556,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -591,7 +567,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
@@ -599,8 +575,8 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -612,7 +588,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -633,7 +609,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
@@ -659,7 +635,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -683,35 +659,15 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -719,7 +675,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -732,7 +688,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -748,7 +704,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -763,7 +719,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -775,7 +731,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
@@ -784,7 +740,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -795,7 +751,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -806,7 +762,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -814,15 +770,15 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -839,14 +795,14 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm3, %xmm1
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
@@ -856,19 +812,19 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, %a1
@@ -878,27 +834,26 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: psubq %xmm4, %xmm0
-; SSE-NEXT: psubq %xmm5, %xmm1
+; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: psubq %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: psubq %xmm4, %xmm0
+; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -919,7 +874,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -929,12 +884,12 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -946,7 +901,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
@@ -957,7 +912,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -970,19 +925,19 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, %a1
@@ -992,7 +947,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
@@ -1020,7 +975,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1057,7 +1012,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
@@ -1084,38 +1039,16 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsubq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpsubq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsubq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpsubq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpsubq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_sub_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -1123,7 +1056,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm4, %xmm0
; SSE-NEXT: psubd %xmm5, %xmm1
; SSE-NEXT: psubd %xmm6, %xmm2
@@ -1139,7 +1072,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1160,7 +1093,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -1176,7 +1109,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -1188,7 +1121,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -1198,7 +1131,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1211,7 +1144,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1222,7 +1155,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -1230,15 +1163,15 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -1255,7 +1188,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm2
; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
@@ -1265,7 +1198,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
@@ -1277,19 +1210,19 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
@@ -1299,7 +1232,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm4
; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
@@ -1322,7 +1255,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
@@ -1344,7 +1277,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -1354,12 +1287,12 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -1371,7 +1304,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
@@ -1382,7 +1315,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
@@ -1394,19 +1327,19 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -1416,7 +1349,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
@@ -1447,7 +1380,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm4
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
@@ -1483,7 +1416,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3
@@ -1510,38 +1443,16 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -1549,7 +1460,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
; SSE-NEXT: psubd {{.*}}(%rip), %xmm1
; SSE-NEXT: psubd {{.*}}(%rip), %xmm2
@@ -1565,7 +1476,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
@@ -1584,7 +1495,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpsubd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -1600,7 +1511,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -1612,7 +1523,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
; SSE-NEXT: psubw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -1622,7 +1533,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
@@ -1634,7 +1545,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1645,7 +1556,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -1653,15 +1564,15 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -1678,7 +1589,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm4
@@ -1703,7 +1614,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -1713,7 +1624,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -1723,9 +1634,9 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1733,9 +1644,9 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
@@ -1743,12 +1654,12 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512DQ-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
%1 = mul <4 x i64> %a0, %a1
@@ -1758,35 +1669,34 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: pmullw %xmm6, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: pmullw %xmm6, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
@@ -1811,7 +1721,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -1832,7 +1742,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1840,7 +1750,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -1848,7 +1758,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
@@ -1860,7 +1770,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1883,7 +1793,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -1896,19 +1806,19 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <8 x i32> %a0, %a1
@@ -1918,7 +1828,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: psrlq $32, %xmm9
@@ -2026,7 +1936,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm8
; AVX1-NEXT: vpmuludq %xmm4, %xmm8, %xmm8
; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm9
@@ -2119,7 +2029,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -2155,7 +2065,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
@@ -2168,7 +2078,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
@@ -2181,12 +2091,12 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -2197,7 +2107,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -2237,7 +2147,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -2258,7 +2168,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -2274,7 +2184,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -2286,7 +2196,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_mul_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -2296,7 +2206,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -2309,7 +2219,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -2320,7 +2230,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -2328,15 +2238,15 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -2349,7 +2259,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: pslld $16, %xmm2
@@ -2361,7 +2271,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -2373,7 +2283,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -2382,8 +2292,8 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
@@ -2402,7 +2312,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
@@ -2423,7 +2333,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -2431,7 +2341,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -2439,8 +2349,8 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -2452,24 +2362,23 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -2485,7 +2394,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -2498,7 +2407,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -2510,7 +2419,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
@@ -2520,7 +2429,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -2531,7 +2440,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
@@ -2539,8 +2448,8 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -2552,7 +2461,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm8
; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
@@ -2630,7 +2539,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: movl $1, %eax
; AVX1-NEXT: vmovq %rax, %xmm4
; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
@@ -2705,7 +2614,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
@@ -2732,38 +2641,16 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -2771,7 +2658,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE-NEXT: pmuludq %xmm4, %xmm0
@@ -2815,7 +2702,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
@@ -2834,7 +2721,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -2850,7 +2737,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -2862,7 +2749,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -2872,7 +2759,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
@@ -2884,7 +2771,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -2895,7 +2782,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -2903,15 +2790,15 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -2928,14 +2815,14 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_and_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: andps %xmm3, %xmm1
; SSE-NEXT: andps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -2943,19 +2830,19 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v4i64_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2: # %bb.0:
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v4i64_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <4 x i64> %a0, %a1
@@ -2965,27 +2852,26 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_and_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pand %xmm5, %xmm1
+; SSE: # %bb.0:
; SSE-NEXT: pand %xmm6, %xmm2
; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -3002,7 +2888,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -3012,12 +2898,12 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3029,7 +2915,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_and_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
@@ -3040,7 +2926,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -3051,19 +2937,19 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <8 x i32> %a0, %a1
@@ -3073,7 +2959,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
@@ -3101,7 +2987,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
@@ -3130,7 +3016,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
@@ -3157,38 +3043,16 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_and_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_and_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = and <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -3196,7 +3060,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm3, %xmm7
@@ -3212,7 +3076,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -3229,7 +3093,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -3245,8 +3109,8 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v16i32_v16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -3257,7 +3121,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
@@ -3267,7 +3131,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3278,7 +3142,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3289,7 +3153,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -3297,15 +3161,15 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -3322,13 +3186,13 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
@@ -3336,16 +3200,16 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3357,24 +3221,23 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: andpd {{.*}}(%rip), %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -3390,7 +3253,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -3403,7 +3266,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3415,7 +3278,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
@@ -3425,7 +3288,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -3436,7 +3299,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -3444,8 +3307,8 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3457,7 +3320,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -3478,7 +3341,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
@@ -3504,7 +3367,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -3528,35 +3391,15 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -3564,7 +3407,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -3577,7 +3420,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -3593,7 +3436,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -3608,7 +3451,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3620,7 +3463,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
@@ -3629,7 +3472,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -3640,7 +3483,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -3651,7 +3494,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -3659,15 +3502,15 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -3684,14 +3527,14 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm3, %xmm1
; SSE-NEXT: xorps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -3699,19 +3542,19 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v4i64_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <4 x i64> %a0, %a1
@@ -3721,27 +3564,26 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm1
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm6, %xmm2
; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: pxor %xmm4, %xmm0
+; SSE-NEXT: pxor %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -3758,7 +3600,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -3768,12 +3610,12 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -3785,7 +3627,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_xor_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
@@ -3796,7 +3638,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -3807,19 +3649,19 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <8 x i32> %a0, %a1
@@ -3829,7 +3671,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
@@ -3857,7 +3699,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
@@ -3886,7 +3728,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
@@ -3913,38 +3755,16 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxorq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpxorq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxorq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpxorq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxorq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpxorq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_xor_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = xor <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -3952,7 +3772,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm5, %xmm1
; SSE-NEXT: pxor %xmm6, %xmm2
@@ -3968,7 +3788,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -3985,7 +3805,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -4001,8 +3821,8 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v16i32_v16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -4013,7 +3833,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -4023,7 +3843,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4034,7 +3854,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4045,7 +3865,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -4053,15 +3873,15 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -4078,13 +3898,13 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
@@ -4092,16 +3912,16 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4113,24 +3933,23 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -4146,7 +3965,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -4159,7 +3978,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4171,7 +3990,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
@@ -4181,7 +4000,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -4192,7 +4011,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4200,8 +4019,8 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4213,7 +4032,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -4234,7 +4053,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
@@ -4260,7 +4079,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -4284,35 +4103,15 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -4320,7 +4119,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -4333,7 +4132,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -4349,7 +4148,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -4364,7 +4163,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4376,7 +4175,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
@@ -4385,7 +4184,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -4396,7 +4195,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -4407,7 +4206,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4415,15 +4214,15 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4440,14 +4239,14 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: orps %xmm3, %xmm1
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
@@ -4455,19 +4254,19 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v4i64_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2: # %bb.0:
+; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v4i64_v4i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, %a1
@@ -4477,27 +4276,26 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: por %xmm5, %xmm1
+; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -4514,7 +4312,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -4524,12 +4322,12 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4541,7 +4339,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
@@ -4552,7 +4350,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -4563,19 +4361,19 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i32_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, %a1
@@ -4585,7 +4383,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
@@ -4613,7 +4411,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
@@ -4642,7 +4440,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
@@ -4669,38 +4467,16 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_or_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_or_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -4708,7 +4484,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm6, %xmm2
@@ -4724,7 +4500,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -4741,7 +4517,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -4757,8 +4533,8 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i32_v16i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512: # %bb.0:
+; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -4769,7 +4545,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
@@ -4779,7 +4555,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4790,7 +4566,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4801,7 +4577,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
@@ -4809,15 +4585,15 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
-; AVX512BW: # BB#0:
+; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
@@ -4834,13 +4610,13 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
@@ -4848,16 +4624,16 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4869,24 +4645,23 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: orpd {{.*}}(%rip), %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE-NEXT: orpd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -4902,7 +4677,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -4915,7 +4690,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4927,7 +4702,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
@@ -4937,7 +4712,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -4948,7 +4723,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
@@ -4956,8 +4731,8 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -4969,7 +4744,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -4990,7 +4765,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
@@ -5016,7 +4791,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -5040,35 +4815,15 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -5076,7 +4831,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -5089,7 +4844,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -5105,7 +4860,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -5120,7 +4875,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
@@ -5132,7 +4887,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
@@ -5141,7 +4896,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -5152,7 +4907,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -5163,7 +4918,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
@@ -5171,15 +4926,15 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX512DQ: # BB#0:
+; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
@@ -5196,7 +4951,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
@@ -5227,7 +4982,7 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
@@ -5241,7 +4996,7 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwi
define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movdqa %xmm2, %xmm3
; SSE-NEXT: psrad $31, %xmm3
@@ -5282,7 +5037,7 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwin
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -5296,7 +5051,7 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwin
define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
@@ -5326,7 +5081,7 @@ define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nou
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 58f7407eeec4..d25117ca715c 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -11,14 +11,14 @@
define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -27,7 +27,131 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = trunc <8 x i64> %a to <8 x i32>
+ ret <8 x i32> %0
+}
+
+define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
+; SSE2-LABEL: trunc8i64_8i32_ashr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2]
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i64_8i32_ashr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSSE3-NEXT: psrad $31, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[0,2]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[0,2]
+; SSSE3-NEXT: movaps %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i64_8i32_ashr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm3
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
+; SSE41-NEXT: psrad $31, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7]
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,2]
+; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[0,2]
+; SSE41-NEXT: movaps %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i32_ashr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[0,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i32_ashr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc8i64_8i32_ashr:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: retq
+entry:
+ %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %1 = trunc <8 x i64> %0 to <8 x i32>
+ ret <8 x i32> %1
+}
+
+define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
+; SSE-LABEL: trunc8i64_8i32_lshr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrlq $32, %xmm3
+; SSE-NEXT: psrlq $32, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i64_8i32_lshr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i64_8i32_lshr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -35,50 +159,50 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: trunc8i64_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512-LABEL: trunc8i64_8i32_lshr:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
- %0 = trunc <8 x i64> %a to <8 x i32>
- ret <8 x i32> %0
+ %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+ %1 = trunc <8 x i64> %0 to <8 x i32>
+ ret <8 x i32> %1
}
define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i64_8i16:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSSE3-NEXT: movapd %xmm2, %xmm0
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
@@ -90,7 +214,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -105,7 +229,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -113,12 +237,12 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -129,7 +253,7 @@ entry:
define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -143,7 +267,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -160,7 +284,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -174,7 +298,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -186,7 +310,7 @@ entry:
define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
@@ -195,7 +319,7 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
@@ -203,7 +327,7 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
@@ -211,7 +335,7 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -221,37 +345,37 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -260,9 +384,148 @@ entry:
ret <8 x i16> %0
}
+define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
+; SSE-LABEL: trunc8i32_8i16_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i16_ashr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i16_ashr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_ashr:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_ashr:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_ashr:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %1 = trunc <8 x i32> %0 to <8 x i16>
+ ret <8 x i16> %1
+}
+
+define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
+; SSE2-LABEL: trunc8i32_8i16_lshr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc8i32_8i16_lshr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255]
+; SSSE3-NEXT: pshufb %xmm2, %xmm1
+; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc8i32_8i16_lshr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc8i32_8i16_lshr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc8i32_8i16_lshr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc8i32_8i16_lshr:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i32_8i16_lshr:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i32_8i16_lshr:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %1 = trunc <8 x i32> %0 to <8 x i16>
+ ret <8 x i16> %1
+}
+
define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -272,7 +535,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
@@ -281,7 +544,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
@@ -290,7 +553,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -301,7 +564,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -310,8 +573,8 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i8:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, (%rax)
@@ -319,14 +582,14 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
@@ -334,7 +597,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -346,7 +609,7 @@ entry:
define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
@@ -362,7 +625,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: pslld $16, %xmm0
@@ -378,7 +641,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
@@ -391,7 +654,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
@@ -407,7 +670,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -419,7 +682,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -429,9 +692,151 @@ entry:
ret void
}
+define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
+; SSE-LABEL: trunc16i32_16i16_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: movdqu %xmm2, (%rax)
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i16_ashr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i16_ashr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i16_ashr:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %1 = trunc <16 x i32> %0 to <16 x i16>
+ store <16 x i16> %1, <16 x i16>* undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i16_lshr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: psrld $16, %xmm2
+; SSE2-NEXT: psrld $16, %xmm3
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i16_lshr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: psrld $16, %xmm2
+; SSSE3-NEXT: psrld $16, %xmm3
+; SSSE3-NEXT: psrld $16, %xmm0
+; SSSE3-NEXT: psrld $16, %xmm1
+; SSSE3-NEXT: pslld $16, %xmm1
+; SSSE3-NEXT: psrad $16, %xmm1
+; SSSE3-NEXT: pslld $16, %xmm0
+; SSSE3-NEXT: psrad $16, %xmm0
+; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: pslld $16, %xmm3
+; SSSE3-NEXT: psrad $16, %xmm3
+; SSSE3-NEXT: pslld $16, %xmm2
+; SSSE3-NEXT: psrad $16, %xmm2
+; SSSE3-NEXT: packssdw %xmm3, %xmm2
+; SSSE3-NEXT: movdqu %xmm2, (%rax)
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i16_lshr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: psrld $16, %xmm3
+; SSE41-NEXT: psrld $16, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
+; SSE41-NEXT: movdqu %xmm2, (%rax)
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i16_lshr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovups %ymm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i16_lshr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i16_lshr:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %1 = trunc <16 x i32> %0 to <16 x i16>
+ store <16 x i16> %1, <16 x i16>* undef, align 4
+ ret void
+}
+
define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE-LABEL: trunc16i32_16i8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -444,7 +849,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
@@ -460,7 +865,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -475,7 +880,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -485,10 +890,140 @@ entry:
ret void
}
+define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
+; SSE-LABEL: trunc16i32_16i8_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: psrad $24, %xmm3
+; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i8_ashr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i8_ashr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
+; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i8_ashr:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+ %1 = trunc <16 x i32> %0 to <16 x i8>
+ store <16 x i8> %1, <16 x i8>* undef, align 4
+ ret void
+}
+
+define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
+; SSE2-LABEL: trunc16i32_16i8_lshr:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: psrld $24, %xmm1
+; SSE2-NEXT: psrld $24, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: psrld $24, %xmm2
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: movdqu %xmm0, (%rax)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: trunc16i32_16i8_lshr:
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: psrld $24, %xmm1
+; SSSE3-NEXT: psrld $24, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
+; SSSE3-NEXT: psrld $24, %xmm3
+; SSSE3-NEXT: psrld $24, %xmm2
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
+; SSSE3-NEXT: movdqu %xmm0, (%rax)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: trunc16i32_16i8_lshr:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: psrld $24, %xmm1
+; SSE41-NEXT: psrld $24, %xmm0
+; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: psrld $24, %xmm3
+; SSE41-NEXT: psrld $24, %xmm2
+; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packuswb %xmm2, %xmm0
+; SSE41-NEXT: movdqu %xmm0, (%rax)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: trunc16i32_16i8_lshr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
+; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i32_16i8_lshr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
+; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: trunc16i32_16i8_lshr:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, (%rax)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+entry:
+ %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+ %1 = trunc <16 x i32> %0 to <16 x i8>
+ store <16 x i8> %1, <16 x i8>* undef, align 4
+ ret void
+}
+
;PR25684
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE2-LABEL: trunc16i16_16i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -497,7 +1032,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i16_16i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
@@ -506,7 +1041,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i16_16i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
@@ -515,7 +1050,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -526,7 +1061,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -537,7 +1072,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
@@ -545,7 +1080,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
@@ -553,15 +1088,15 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -571,9 +1106,143 @@ entry:
ret void
}
+define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
+; SSE-LABEL: trunc16i16_16i8_ashr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psraw $8, %xmm1
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc16i16_16i8_ashr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i16_16i8_ashr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_ashr:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_ashr:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_ashr:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %1 = trunc <16 x i16> %0 to <16 x i8>
+ store <16 x i8> %1, <16 x i8>* undef, align 4
+ ret void
+}
+
+define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
+; SSE-LABEL: trunc16i16_16i8_lshr:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: trunc16i16_16i8_lshr:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: trunc16i16_16i8_lshr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqu %xmm0, (%rax)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: trunc16i16_16i8_lshr:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc16i16_16i8_lshr:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc16i16_16i8_lshr:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+entry:
+ %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %1 = trunc <16 x i16> %0 to <16 x i8>
+ store <16 x i8> %1, <16 x i8>* undef, align 4
+ ret void
+}
+
define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE2-LABEL: trunc32i16_32i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
@@ -586,7 +1255,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc32i16_32i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm4, %xmm0
@@ -599,7 +1268,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc32i16_32i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm4, %xmm1
; SSE41-NEXT: pshufb %xmm4, %xmm0
@@ -612,7 +1281,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc32i16_32i8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -628,7 +1297,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc32i16_32i8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
@@ -644,7 +1313,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc32i16_32i8:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -655,7 +1324,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
@@ -666,13 +1335,13 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
@@ -684,14 +1353,14 @@ entry:
define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -700,41 +1369,41 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc2x4i64_8i32:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
@@ -748,39 +1417,37 @@ entry:
define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSSE3-NEXT: movapd %xmm2, %xmm0
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -795,7 +1462,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -808,7 +1475,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc2x4i64_8i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -821,9 +1488,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -834,7 +1501,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
@@ -848,9 +1515,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -861,7 +1528,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i16:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
@@ -882,17 +1549,17 @@ entry:
define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2x2i64_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2x2i64_4i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512-NEXT: retq
entry:
@@ -904,37 +1571,37 @@ entry:
define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2i64_i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc2i64_i64:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2i64_i64:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2i64_i64:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2i64_i64:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT: retq
@@ -946,7 +1613,7 @@ entry:
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -957,7 +1624,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
@@ -965,7 +1632,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
@@ -973,7 +1640,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x4i32_8i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -981,7 +1648,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i32_8i16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -989,7 +1656,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -1000,7 +1667,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
@@ -1008,7 +1675,7 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -1027,7 +1694,7 @@ entry:
; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1035,43 +1702,43 @@ define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc4i32_i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc4i32_i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc4i32_i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc4i32_i64:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc4i32_i64:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc4i32_i64:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdw %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT: retq
@@ -1083,7 +1750,7 @@ entry:
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: trunc2x8i16_16i8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -1091,7 +1758,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x8i16_16i8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
@@ -1099,7 +1766,7 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
@@ -1107,44 +1774,20 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x8i16_16i8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512F-LABEL: trunc2x8i16_16i8:
-; AVX512F: # BB#0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x8i16_16i8:
-; AVX512VL: # BB#0: # %entry
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x8i16_16i8:
-; AVX512BW: # BB#0: # %entry
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x8i16_16i8:
-; AVX512BWVL: # BB#0: # %entry
-; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc2x8i16_16i8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: retq
entry:
%0 = trunc <8 x i16> %a to <8 x i8>
%1 = trunc <8 x i16> %b to <8 x i8>
@@ -1155,50 +1798,50 @@ entry:
; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i16_i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i16_i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc8i16_i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc8i16_i64:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i16_i64:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i16_i64:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %xmm0, -{{[0-9]+}}(%rsp)
; AVX512BWVL-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; AVX512BWVL-NEXT: retq
@@ -1210,32 +1853,32 @@ entry:
define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc16i64_16i8_const:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc16i64_16i8_const:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i64_16i8_const:
-; AVX512VL: # BB#0: # %entry
+; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i64_16i8_const:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i64_16i8_const:
-; AVX512BWVL: # BB#0: # %entry
+; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
@@ -1245,3 +1888,119 @@ entry:
ret <16 x i8> %1
}
+define void @PR34773(i16* %a0, i8* %a1) {
+; SSE-LABEL: PR34773:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqu (%rdi), %xmm0
+; SSE-NEXT: movdqu 16(%rdi), %xmm1
+; SSE-NEXT: movdqu 32(%rdi), %xmm2
+; SSE-NEXT: movdqu 48(%rdi), %xmm3
+; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: psrlw $8, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: movdqu %xmm0, (%rsi)
+; SSE-NEXT: movdqu %xmm2, 16(%rsi)
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR34773:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %ymm0
+; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR34773:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: PR34773:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: PR34773:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqu %xmm1, 16(%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: PR34773:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1
+; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: PR34773:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %1 = getelementptr i16, i16* %a0, i64 16
+ %2 = getelementptr i8, i8* %a1, i64 16
+ %3 = bitcast i16* %a0 to <16 x i16>*
+ %4 = bitcast i16* %1 to <16 x i16>*
+ %5 = bitcast i8* %a1 to <16 x i8>*
+ %6 = bitcast i8* %2 to <16 x i8>*
+ %7 = load <16 x i16>, <16 x i16>* %3, align 2
+ %8 = load <16 x i16>, <16 x i16>* %4, align 2
+ %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %11 = trunc <16 x i16> %9 to <16 x i8>
+ %12 = trunc <16 x i16> %10 to <16 x i8>
+ store <16 x i8> %11, <16 x i8>* %5, align 1
+ store <16 x i8> %12, <16 x i8>* %6, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/vector-truncate-combine.ll b/test/CodeGen/X86/vector-truncate-combine.ll
index 61808b802517..5b8498e775d6 100644
--- a/test/CodeGen/X86/vector-truncate-combine.ll
+++ b/test/CodeGen/X86/vector-truncate-combine.ll
@@ -11,11 +11,13 @@
; preservation of the extend/truncate operations mentioned above (2 extend and
; 3 truncate instructions).
;
-; NOTE: This operation is collapsed to a single truncate, so this test no longer covers
-; what it originally intended to.
+; NOTE: This operation could be collapsed into a single truncate. Once that is done,
+; this test will have to be adjusted.
-; CHECK: MOVLHPSrr
-; CHECK: PSHUFHWri
+; CHECK: PUNPCKLBWrr
+; CHECK: PUNPCKLWDrr
+; CHECK: PANDrm
+; CHECK: PACKUSWBrr
; CHECK: PACKUSWBrr
; CHECK: PACKUSWBrr
; CHECK: MOVPDI2DIrr
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index 5f00e55e225b..dfb0adefe1d8 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -8,13 +8,16 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41
define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubq %xmm0, %xmm2
@@ -39,7 +42,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm2, %xmm2
; SSE3-NEXT: psubq %xmm0, %xmm2
@@ -64,7 +67,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: psubq %xmm0, %xmm2
@@ -85,7 +88,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubq %xmm0, %xmm2
@@ -106,7 +109,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -124,19 +127,65 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv2i64:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv2i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubq %xmm0, %xmm1, %xmm2
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv2i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubq %xmm0, %xmm1, %xmm2
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: psubq %xmm0, %xmm2
@@ -160,7 +209,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind {
define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-LABEL: testv2i64u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubq %xmm0, %xmm2
@@ -185,7 +234,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv2i64u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm2, %xmm2
; SSE3-NEXT: psubq %xmm0, %xmm2
@@ -210,7 +259,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv2i64u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: psubq %xmm0, %xmm2
@@ -231,7 +280,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv2i64u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubq %xmm0, %xmm2
@@ -252,7 +301,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv2i64u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -270,7 +319,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv2i64u:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -288,7 +337,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv2i64u:
-; AVX512CDVL: # BB#0:
+; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -298,7 +347,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv2i64u:
-; AVX512CD: # BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -309,19 +358,65 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv2i64u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv2i64u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubq %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntq %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv2i64u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubq %xmm0, %xmm1, %xmm2
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv2i64u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubq %xmm0, %xmm1, %xmm2
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG-NEXT: vpaddq %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv2i64u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: psubq %xmm0, %xmm2
@@ -345,7 +440,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubd %xmm0, %xmm2
@@ -375,7 +470,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm2, %xmm2
; SSE3-NEXT: psubd %xmm0, %xmm2
@@ -405,7 +500,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: psubd %xmm0, %xmm2
@@ -431,7 +526,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubd %xmm0, %xmm2
@@ -456,7 +551,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -478,7 +573,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -500,7 +595,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i32:
-; AVX512CDVL: # BB#0:
+; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX512CDVL-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -522,7 +617,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i32:
-; AVX512CD: # BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX512CD-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -544,19 +639,73 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv4i32:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv4i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv4i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: psubd %xmm0, %xmm2
@@ -585,7 +734,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind {
define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-LABEL: testv4i32u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: psubd %xmm0, %xmm2
@@ -615,7 +764,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv4i32u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: pxor %xmm2, %xmm2
; SSE3-NEXT: psubd %xmm0, %xmm2
@@ -645,7 +794,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv4i32u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: psubd %xmm0, %xmm2
@@ -671,7 +820,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv4i32u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: psubd %xmm0, %xmm2
@@ -696,7 +845,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX1-LABEL: testv4i32u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -718,7 +867,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i32u:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -740,7 +889,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i32u:
-; AVX512CDVL: # BB#0:
+; AVX512CDVL: # %bb.0:
; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -750,7 +899,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i32u:
-; AVX512CD: # BB#0:
+; AVX512CD: # %bb.0:
; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -761,19 +910,73 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i32u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv4i32u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv4i32u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG_NOVLX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; BITALG_NOVLX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv4i32u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubd %xmm0, %xmm1, %xmm2
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; BITALG-NEXT: vpaddd %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm3
+; BITALG-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; BITALG-NEXT: vpsrlw $4, %xmm0, %xmm0
+; BITALG-NEXT: vpand %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; BITALG-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; BITALG-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; BITALG-NEXT: vpsadbw %xmm1, %xmm2, %xmm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; BITALG-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv4i32u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: pxor %xmm2, %xmm2
; X32-SSE-NEXT: psubd %xmm0, %xmm2
@@ -802,7 +1005,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
@@ -829,7 +1032,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubw %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
@@ -856,7 +1059,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubw %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
@@ -879,7 +1082,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
@@ -902,7 +1105,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -922,20 +1125,56 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv8i16:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vzeroupper
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv8i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv8i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntw %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubw %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
@@ -962,7 +1201,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind {
define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-LABEL: testv8i16u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubw %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
@@ -989,7 +1228,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv8i16u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubw %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
@@ -1016,7 +1255,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv8i16u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubw %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
@@ -1039,7 +1278,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv8i16u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
@@ -1062,7 +1301,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv8i16u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1082,20 +1321,56 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i16u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQ-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv8i16u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vzeroupper
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv8i16u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv8i16u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntw %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv8i16u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubw %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
@@ -1122,7 +1397,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind {
define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
@@ -1145,7 +1420,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
@@ -1168,7 +1443,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubb %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
@@ -1187,7 +1462,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubb %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
@@ -1206,7 +1481,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1223,7 +1498,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1235,8 +1510,43 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv16i8:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vzeroupper
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv16i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv16i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubb %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
@@ -1259,7 +1569,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind {
define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-LABEL: testv16i8u:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: psubb %xmm0, %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
@@ -1282,7 +1592,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE2-NEXT: retq
;
; SSE3-LABEL: testv16i8u:
-; SSE3: # BB#0:
+; SSE3: # %bb.0:
; SSE3-NEXT: pxor %xmm1, %xmm1
; SSE3-NEXT: psubb %xmm0, %xmm1
; SSE3-NEXT: pand %xmm0, %xmm1
@@ -1305,7 +1615,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE3-NEXT: retq
;
; SSSE3-LABEL: testv16i8u:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: psubb %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm0, %xmm1
@@ -1324,7 +1634,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: testv16i8u:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: psubb %xmm0, %xmm1
; SSE41-NEXT: pand %xmm0, %xmm1
@@ -1343,7 +1653,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; SSE41-NEXT: retq
;
; AVX-LABEL: testv16i8u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1360,7 +1670,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i8u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0
@@ -1372,8 +1682,43 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vzeroupper
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv16i8u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: vzeroupper
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv16i8u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; BITALG_NOVLX-NEXT: vzeroupper
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv16i8u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubb %xmm0, %xmm1, %xmm1
+; BITALG-NEXT: vpand %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; BITALG-NEXT: vpopcntb %xmm0, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: testv16i8u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: pxor %xmm1, %xmm1
; X32-SSE-NEXT: psubb %xmm0, %xmm1
; X32-SSE-NEXT: pand %xmm0, %xmm1
@@ -1396,25 +1741,43 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind {
define <2 x i64> @foldv2i64() nounwind {
; SSE-LABEL: foldv2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $8, %eax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: movl $8, %eax
; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv2i64:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: movl $8, %eax
+; AVX512VPOPCNTDQVL-NEXT: vmovq %rax, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv2i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: movl $8, %eax
+; BITALG_NOVLX-NEXT: vmovq %rax, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv2i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: movl $8, %eax
+; BITALG-NEXT: vmovq %rax, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv2i64:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl $8, %eax
; X32-SSE-NEXT: movd %eax, %xmm0
; X32-SSE-NEXT: retl
@@ -1424,25 +1787,43 @@ define <2 x i64> @foldv2i64() nounwind {
define <2 x i64> @foldv2i64u() nounwind {
; SSE-LABEL: foldv2i64u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $8, %eax
; SSE-NEXT: movq %rax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: foldv2i64u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl $8, %eax
; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv2i64u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: movl $8, %eax
; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv2i64u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: movl $8, %eax
+; AVX512VPOPCNTDQVL-NEXT: vmovq %rax, %xmm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv2i64u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: movl $8, %eax
+; BITALG_NOVLX-NEXT: vmovq %rax, %xmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv2i64u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: movl $8, %eax
+; BITALG-NEXT: vmovq %rax, %xmm0
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv2i64u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl $8, %eax
; X32-SSE-NEXT: movd %eax, %xmm0
; X32-SSE-NEXT: retl
@@ -1452,22 +1833,37 @@ define <2 x i64> @foldv2i64u() nounwind {
define <4 x i32> @foldv4i32() nounwind {
; SSE-LABEL: foldv4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv4i32:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv4i32:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv4i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv4i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv4i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 0)
@@ -1476,22 +1872,37 @@ define <4 x i32> @foldv4i32() nounwind {
define <4 x i32> @foldv4i32u() nounwind {
; SSE-LABEL: foldv4i32u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv4i32u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv4i32u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv4i32u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv4i32u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv4i32u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0]
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv4i32u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0]
; X32-SSE-NEXT: retl
%out = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> <i32 256, i32 -1, i32 0, i32 255>, i1 -1)
@@ -1500,22 +1911,37 @@ define <4 x i32> @foldv4i32u() nounwind {
define <8 x i16> @foldv8i16() nounwind {
; SSE-LABEL: foldv8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv8i16:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv8i16:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv8i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv8i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv8i16:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 0)
@@ -1524,22 +1950,37 @@ define <8 x i16> @foldv8i16() nounwind {
define <8 x i16> @foldv8i16u() nounwind {
; SSE-LABEL: foldv8i16u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv8i16u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv8i16u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv8i16u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv8i16u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv8i16u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv8i16u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3]
; X32-SSE-NEXT: retl
%out = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88>, i1 -1)
@@ -1548,22 +1989,37 @@ define <8 x i16> @foldv8i16u() nounwind {
define <16 x i8> @foldv16i8() nounwind {
; SSE-LABEL: foldv16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv16i8:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv16i8:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv16i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv16i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv16i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 0)
@@ -1572,22 +2028,37 @@ define <16 x i8> @foldv16i8() nounwind {
define <16 x i8> @foldv16i8u() nounwind {
; SSE-LABEL: foldv16i8u:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX-LABEL: foldv16i8u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: foldv16i8u:
-; AVX512VPOPCNTDQ: # BB#0:
+; AVX512VPOPCNTDQ: # %bb.0:
; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: foldv16i8u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: foldv16i8u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv16i8u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG-NEXT: retq
+;
; X32-SSE-LABEL: foldv16i8u:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5]
; X32-SSE-NEXT: retl
%out = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32>, i1 -1)
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 4a7d25c1376e..e75476024387 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -4,13 +4,16 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG_NOVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
;
; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2
define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
@@ -40,8 +43,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -58,8 +61,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -76,8 +79,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -94,19 +97,65 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv4i64:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv4i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv4i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv4i64:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
@@ -126,7 +175,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3
@@ -156,8 +205,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv4i64u:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -174,8 +223,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv4i64u:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntq %ymm0, %ymm0
@@ -184,8 +233,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv4i64u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
@@ -194,19 +243,65 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv4i64u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv4i64u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubq %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntq %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv4i64u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv4i64u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubq %ymm0, %ymm1, %ymm2
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv4i64u:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubq %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
@@ -226,7 +321,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
@@ -264,8 +359,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -286,8 +381,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CDVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -308,8 +403,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -330,19 +425,73 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv8i32:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv8i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv8i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv8i32:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -367,7 +516,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3
@@ -405,8 +554,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv8i32u:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -427,8 +576,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv8i32u:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vplzcntd %ymm0, %ymm0
@@ -437,8 +586,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv8i32u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
@@ -447,19 +596,73 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i32u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
-; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512VPOPCNTDQ-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv8i32u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubd %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv8i32u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG_NOVLX-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG_NOVLX-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG_NOVLX-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; BITALG_NOVLX-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; BITALG_NOVLX-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv8i32u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubd %ymm0, %ymm1, %ymm2
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; BITALG-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm3
+; BITALG-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; BITALG-NEXT: vpsrlw $4, %ymm0, %ymm0
+; BITALG-NEXT: vpand %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; BITALG-NEXT: vpaddb %ymm3, %ymm0, %ymm0
+; BITALG-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; BITALG-NEXT: vpsadbw %ymm1, %ymm2, %ymm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; BITALG-NEXT: vpsadbw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv8i32u:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubd %ymm0, %ymm1, %ymm2
; X32-AVX-NEXT: vpand %ymm2, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
@@ -484,7 +687,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
@@ -518,8 +721,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -538,8 +741,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i16:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -558,8 +761,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i16:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -578,8 +781,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i16:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -589,9 +792,42 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv16i16:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv16i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv16i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntw %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv16i16:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -614,7 +850,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2
@@ -648,8 +884,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv16i16u:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -668,8 +904,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv16i16u:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -688,8 +924,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv16i16u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -708,8 +944,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i16u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -719,9 +955,42 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv16i16u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512VPOPCNTDQVL-NEXT: vpopcntd %zmm0, %zmm0
+; AVX512VPOPCNTDQVL-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv16i16u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv16i16u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubw %ymm0, %ymm1, %ymm1
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntw %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv16i16u:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubw %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -744,7 +1013,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
@@ -772,8 +1041,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -789,8 +1058,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv32i8:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -806,8 +1075,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv32i8:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -823,8 +1092,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i8:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -839,9 +1108,47 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv32i8:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv32i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv32i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv32i8:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -861,7 +1168,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3
@@ -889,8 +1196,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: testv32i8u:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -906,8 +1213,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX2-NEXT: retq
;
; AVX512CDVL-LABEL: testv32i8u:
-; AVX512CDVL: # BB#0:
-; AVX512CDVL-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CDVL: # %bb.0:
+; AVX512CDVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CDVL-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CDVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -923,8 +1230,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CDVL-NEXT: retq
;
; AVX512CD-LABEL: testv32i8u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -940,8 +1247,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i8u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -956,9 +1263,47 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: retq
;
+; AVX512VPOPCNTDQVL-LABEL: testv32i8u:
+; AVX512VPOPCNTDQVL: # %bb.0:
+; AVX512VPOPCNTDQVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VPOPCNTDQVL-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm2
+; AVX512VPOPCNTDQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512VPOPCNTDQVL-NEXT: vpsrlw $4, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: vpaddb %ymm2, %ymm0, %ymm0
+; AVX512VPOPCNTDQVL-NEXT: retq
+;
+; BITALG_NOVLX-LABEL: testv32i8u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG_NOVLX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG_NOVLX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG_NOVLX-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: testv32i8u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubb %ymm0, %ymm1, %ymm1
+; BITALG-NEXT: vpand %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; BITALG-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; BITALG-NEXT: vpopcntb %ymm0, %ymm0
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: testv32i8u:
-; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; X32-AVX: # %bb.0:
+; X32-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vpsubb %ymm0, %ymm1, %ymm1
; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -978,12 +1323,22 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
define <4 x i64> @foldv4i64() nounwind {
; AVX-LABEL: foldv4i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv4i64:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv4i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv4i64:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
; X32-AVX-NEXT: retl
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
@@ -992,12 +1347,22 @@ define <4 x i64> @foldv4i64() nounwind {
define <4 x i64> @foldv4i64u() nounwind {
; AVX-LABEL: foldv4i64u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv4i64u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv4i64u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,64,0]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv4i64u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
; X32-AVX-NEXT: retl
%out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
@@ -1006,12 +1371,22 @@ define <4 x i64> @foldv4i64u() nounwind {
define <8 x i32> @foldv8i32() nounwind {
; AVX-LABEL: foldv8i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv8i32:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv8i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv8i32:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; X32-AVX-NEXT: retl
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 0)
@@ -1020,12 +1395,22 @@ define <8 x i32> @foldv8i32() nounwind {
define <8 x i32> @foldv8i32u() nounwind {
; AVX-LABEL: foldv8i32u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv8i32u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv8i32u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv8i32u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,32,0,16,0,3,3]
; X32-AVX-NEXT: retl
%out = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> <i32 256, i32 -1, i32 0, i32 255, i32 -65536, i32 7, i32 24, i32 88>, i1 -1)
@@ -1034,12 +1419,22 @@ define <8 x i32> @foldv8i32u() nounwind {
define <16 x i16> @foldv16i16() nounwind {
; AVX-LABEL: foldv16i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv16i16:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv16i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv16i16:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; X32-AVX-NEXT: retl
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 0)
@@ -1048,12 +1443,22 @@ define <16 x i16> @foldv16i16() nounwind {
define <16 x i16> @foldv16i16u() nounwind {
; AVX-LABEL: foldv16i16u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv16i16u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv16i16u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv16i16u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,16,0,16,0,3,3,1,1,0,1,2,3,4,5]
; X32-AVX-NEXT: retl
%out = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> <i16 256, i16 -1, i16 0, i16 255, i16 -65536, i16 7, i16 24, i16 88, i16 -2, i16 254, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32>, i1 -1)
@@ -1062,12 +1467,22 @@ define <16 x i16> @foldv16i16u() nounwind {
define <32 x i8> @foldv32i8() nounwind {
; AVX-LABEL: foldv32i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv32i8:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv32i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv32i8:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; X32-AVX-NEXT: retl
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 0)
@@ -1076,12 +1491,22 @@ define <32 x i8> @foldv32i8() nounwind {
define <32 x i8> @foldv32i8u() nounwind {
; AVX-LABEL: foldv32i8u:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; AVX-NEXT: retq
;
+; BITALG_NOVLX-LABEL: foldv32i8u:
+; BITALG_NOVLX: # %bb.0:
+; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; BITALG_NOVLX-NEXT: retq
+;
+; BITALG-LABEL: foldv32i8u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
+; BITALG-NEXT: retq
+;
; X32-AVX-LABEL: foldv32i8u:
-; X32-AVX: # BB#0:
+; X32-AVX: # %bb.0:
; X32-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5,6,7,8,8,7,6,5,4,3,2,1,0,0,0,0,0]
; X32-AVX-NEXT: retl
%out = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> <i8 256, i8 -1, i8 0, i8 255, i8 -65536, i8 7, i8 24, i8 88, i8 -2, i8 254, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128, i8 256, i8 -256, i8 -128, i8 -64, i8 -32, i8 -16, i8 -8, i8 -4, i8 -2, i8 -1, i8 3, i8 5, i8 7, i8 127>, i1 -1)
diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll
index 2fce8a601931..37c86f7f81a2 100644
--- a/test/CodeGen/X86/vector-tzcnt-512.ll
+++ b/test/CodeGen/X86/vector-tzcnt-512.ll
@@ -3,11 +3,12 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bitalg | FileCheck %s --check-prefix=ALL --check-prefix=BITALG
define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
@@ -21,7 +22,7 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512CD-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512CD-NEXT: vpsadbw %ymm3, %ymm1, %ymm1
; AVX512CD-NEXT: vpand %ymm2, %ymm0, %ymm5
; AVX512CD-NEXT: vpshufb %ymm5, %ymm4, %ymm5
@@ -34,15 +35,15 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512CDBW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -52,15 +53,15 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -70,22 +71,40 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv8i64:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
+; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0)
ret <8 x i64> %out
}
define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-LABEL: testv8i64u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0
@@ -94,8 +113,8 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv8i64u:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vplzcntq %zmm0, %zmm0
@@ -104,15 +123,15 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv8i64u:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubq %zmm0, %zmm1, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -122,24 +141,42 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv8i64u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv8i64u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubq %zmm0, %zmm1, %zmm2
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
+; BITALG-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1)
ret <8 x i64> %out
}
define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CD-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vextracti64x4 $1, %zmm0, %ymm1
@@ -151,7 +188,7 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512CD-NEXT: vpshufb %ymm1, %ymm4, %ymm1
; AVX512CD-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512CD-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512CD-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512CD-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
; AVX512CD-NEXT: vpsadbw %ymm3, %ymm5, %ymm5
; AVX512CD-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
@@ -172,15 +209,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
-; AVX512CDBW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512CDBW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -194,15 +231,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -216,49 +253,71 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv16i32:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
+; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0)
ret <16 x i32> %out
}
define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512CD-LABEL: testv16i32u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CD-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CD-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CD-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512CD-NEXT: vpsubd %zmm0, %zmm1, %zmm0
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv16i32u:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512CDBW-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vplzcntd %zmm0, %zmm0
; AVX512CDBW-NEXT: vpbroadcastd {{.*#+}} zmm1 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
; AVX512CDBW-NEXT: vpsubd %zmm0, %zmm1, %zmm0
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv16i32u:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubd %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
; AVX512BW-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm4, %zmm3
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
@@ -272,22 +331,44 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv16i32u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1
-; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv16i32u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubd %zmm0, %zmm1, %zmm2
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2
+; BITALG-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm3
+; BITALG-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; BITALG-NEXT: vpshufb %zmm3, %zmm4, %zmm3
+; BITALG-NEXT: vpsrlw $4, %zmm0, %zmm0
+; BITALG-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: vpshufb %zmm0, %zmm4, %zmm0
+; BITALG-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; BITALG-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; BITALG-NEXT: vpsadbw %zmm1, %zmm2, %zmm2
+; BITALG-NEXT: vpunpckldq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; BITALG-NEXT: vpsadbw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1)
ret <16 x i32> %out
}
define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -318,15 +399,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -338,15 +419,15 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -358,8 +439,8 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -374,14 +455,24 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv32i16:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0)
ret <32 x i16> %out
}
define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-LABEL: testv32i16u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -412,15 +503,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv32i16u:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -432,15 +523,15 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv32i16u:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubw %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -452,8 +543,8 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv32i16u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -468,14 +559,24 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm1, %zmm1
; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv32i16u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubw %zmm0, %zmm1, %zmm1
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntw %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1)
ret <32 x i16> %out
}
define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -500,15 +601,15 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -517,15 +618,15 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -534,8 +635,8 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv64i8:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -558,14 +659,24 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv64i8:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0)
ret <64 x i8> %out
}
define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-LABEL: testv64i8u:
-; AVX512CD: # BB#0:
-; AVX512CD-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512CD: # %bb.0:
+; AVX512CD-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512CD-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512CD-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512CD-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -590,15 +701,15 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CD-NEXT: retq
;
; AVX512CDBW-LABEL: testv64i8u:
-; AVX512CDBW: # BB#0:
-; AVX512CDBW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512CDBW: # %bb.0:
+; AVX512CDBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512CDBW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512CDBW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512CDBW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512CDBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512CDBW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512CDBW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512CDBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -607,15 +718,15 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512CDBW-NEXT: retq
;
; AVX512BW-LABEL: testv64i8u:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpsubb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
@@ -624,8 +735,8 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512BW-NEXT: retq
;
; AVX512VPOPCNTDQ-LABEL: testv64i8u:
-; AVX512VPOPCNTDQ: # BB#0:
-; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VPOPCNTDQ: # %bb.0:
+; AVX512VPOPCNTDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3
; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
@@ -648,6 +759,16 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind {
; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX512VPOPCNTDQ-NEXT: retq
+;
+; BITALG-LABEL: testv64i8u:
+; BITALG: # %bb.0:
+; BITALG-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; BITALG-NEXT: vpsubb %zmm0, %zmm1, %zmm1
+; BITALG-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; BITALG-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; BITALG-NEXT: vpopcntb %zmm0, %zmm0
+; BITALG-NEXT: retq
%out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1)
ret <64 x i8> %out
}
diff --git a/test/CodeGen/X86/vector-unsigned-cmp.ll b/test/CodeGen/X86/vector-unsigned-cmp.ll
index 3e4b9aedf2b8..f4fd54f8da98 100644
--- a/test/CodeGen/X86/vector-unsigned-cmp.ll
+++ b/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -10,7 +10,7 @@
define <2 x i1> @ugt_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-LABEL: ugt_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $1, %xmm0
; SSE-NEXT: psrlq $1, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
@@ -27,7 +27,7 @@ define <2 x i1> @ugt_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ugt_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -40,7 +40,7 @@ define <2 x i1> @ugt_v2i64(<2 x i64> %x, <2 x i64> %y) {
define <2 x i1> @ult_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-LABEL: ult_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $1, %xmm0
; SSE-NEXT: psrlq $1, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
@@ -57,7 +57,7 @@ define <2 x i1> @ult_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ult_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -70,7 +70,7 @@ define <2 x i1> @ult_v2i64(<2 x i64> %x, <2 x i64> %y) {
define <2 x i1> @uge_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-LABEL: uge_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $1, %xmm0
; SSE-NEXT: psrlq $1, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
@@ -89,7 +89,7 @@ define <2 x i1> @uge_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: uge_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
@@ -104,7 +104,7 @@ define <2 x i1> @uge_v2i64(<2 x i64> %x, <2 x i64> %y) {
define <2 x i1> @ule_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-LABEL: ule_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlq $1, %xmm0
; SSE-NEXT: psrlq $1, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
@@ -123,7 +123,7 @@ define <2 x i1> @ule_v2i64(<2 x i64> %x, <2 x i64> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ule_v2i64:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
@@ -138,14 +138,14 @@ define <2 x i1> @ule_v2i64(<2 x i64> %x, <2 x i64> %y) {
define <4 x i1> @ugt_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: ugt_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: pcmpgtd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ugt_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
@@ -158,7 +158,7 @@ define <4 x i1> @ugt_v4i32(<4 x i32> %x, <4 x i32> %y) {
define <4 x i1> @ult_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: ult_v4i32:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrld $1, %xmm0
; SSE-NEXT: psrld $1, %xmm1
; SSE-NEXT: pcmpgtd %xmm0, %xmm1
@@ -166,7 +166,7 @@ define <4 x i1> @ult_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ult_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
@@ -179,7 +179,7 @@ define <4 x i1> @ult_v4i32(<4 x i32> %x, <4 x i32> %y) {
define <4 x i1> @uge_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: uge_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
@@ -188,7 +188,7 @@ define <4 x i1> @uge_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: uge_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psrld $1, %xmm0
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pmaxud %xmm0, %xmm1
@@ -196,7 +196,7 @@ define <4 x i1> @uge_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE41-NEXT: retq
;
; AVX-LABEL: uge_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
@@ -210,7 +210,7 @@ define <4 x i1> @uge_v4i32(<4 x i32> %x, <4 x i32> %y) {
define <4 x i1> @ule_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: ule_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrld $1, %xmm0
; SSE2-NEXT: psrld $1, %xmm1
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
@@ -219,7 +219,7 @@ define <4 x i1> @ule_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ule_v4i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psrld $1, %xmm0
; SSE41-NEXT: psrld $1, %xmm1
; SSE41-NEXT: pminud %xmm0, %xmm1
@@ -227,7 +227,7 @@ define <4 x i1> @ule_v4i32(<4 x i32> %x, <4 x i32> %y) {
; SSE41-NEXT: retq
;
; AVX-LABEL: ule_v4i32:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm1
@@ -241,14 +241,14 @@ define <4 x i1> @ule_v4i32(<4 x i32> %x, <4 x i32> %y) {
define <8 x i1> @ugt_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE-LABEL: ugt_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: ugt_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
@@ -261,7 +261,7 @@ define <8 x i1> @ugt_v8i16(<8 x i16> %x, <8 x i16> %y) {
define <8 x i1> @ult_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE-LABEL: ult_v8i16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: psrlw $1, %xmm1
; SSE-NEXT: pcmpgtw %xmm0, %xmm1
@@ -269,7 +269,7 @@ define <8 x i1> @ult_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ult_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
@@ -282,7 +282,7 @@ define <8 x i1> @ult_v8i16(<8 x i16> %x, <8 x i16> %y) {
define <8 x i1> @uge_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: uge_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: psubusw %xmm0, %xmm1
@@ -291,7 +291,7 @@ define <8 x i1> @uge_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: uge_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pmaxuw %xmm0, %xmm1
@@ -299,7 +299,7 @@ define <8 x i1> @uge_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE41-NEXT: retq
;
; AVX-LABEL: uge_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
@@ -313,7 +313,7 @@ define <8 x i1> @uge_v8i16(<8 x i16> %x, <8 x i16> %y) {
define <8 x i1> @ule_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: ule_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: psubusw %xmm1, %xmm0
@@ -322,7 +322,7 @@ define <8 x i1> @ule_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: ule_v8i16:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pminuw %xmm0, %xmm1
@@ -330,7 +330,7 @@ define <8 x i1> @ule_v8i16(<8 x i16> %x, <8 x i16> %y) {
; SSE41-NEXT: retq
;
; AVX-LABEL: ule_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm1
@@ -344,7 +344,7 @@ define <8 x i1> @ule_v8i16(<8 x i16> %x, <8 x i16> %y) {
define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: ugt_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: pand %xmm2, %xmm0
@@ -354,7 +354,7 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ugt_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -370,7 +370,7 @@ define <16 x i1> @ugt_v16i8(<16 x i8> %x, <16 x i8> %y) {
define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: ult_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: pand %xmm2, %xmm0
@@ -381,7 +381,7 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ult_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -397,7 +397,7 @@ define <16 x i1> @ult_v16i8(<16 x i8> %x, <16 x i8> %y) {
define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: uge_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: pand %xmm2, %xmm0
@@ -408,7 +408,7 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: uge_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -425,7 +425,7 @@ define <16 x i1> @uge_v16i8(<16 x i8> %x, <16 x i8> %y) {
define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: ule_v16i8:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: psrlw $1, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; SSE-NEXT: pand %xmm2, %xmm0
@@ -436,7 +436,7 @@ define <16 x i1> @ule_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-NEXT: retq
;
; AVX-LABEL: ule_v16i8:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-variable-idx.ll b/test/CodeGen/X86/vector-variable-idx.ll
index 2a4d18c141a3..7d37b1a4cafe 100644
--- a/test/CodeGen/X86/vector-variable-idx.ll
+++ b/test/CodeGen/X86/vector-variable-idx.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep movss | count 2
+; RUN: llc < %s -mtriple=x86_64-- | grep movss | count 2
; PR2676
define float @foo(<4 x float> %p, i32 %t) {
diff --git a/test/CodeGen/X86/vector-variable-idx2.ll b/test/CodeGen/X86/vector-variable-idx2.ll
index df65257bac7e..77b91961f580 100644
--- a/test/CodeGen/X86/vector-variable-idx2.ll
+++ b/test/CodeGen/X86/vector-variable-idx2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.1
+; RUN: llc < %s -mattr=+sse4.1
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin11.0.0"
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index fe3523de3575..94eadd8c1aaf 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -9,24 +9,24 @@
define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_16i8_to_8i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT: retq
entry:
@@ -38,7 +38,7 @@ entry:
; PR17654
define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSE2-LABEL: zext_16i8_to_16i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -46,7 +46,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_16i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
@@ -54,7 +54,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_16i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -62,7 +62,7 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_16i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -70,12 +70,12 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_16i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_16i8_to_16i16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: retq
entry:
@@ -85,7 +85,7 @@ entry:
define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; SSE2-LABEL: zext_32i8_to_32i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -97,7 +97,7 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_32i8_to_32i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
@@ -109,7 +109,7 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_32i8_to_32i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -121,7 +121,7 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_32i8_to_32i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -135,7 +135,7 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_32i8_to_32i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -143,7 +143,7 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: zext_32i8_to_32i16:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
@@ -151,7 +151,7 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: zext_32i8_to_32i16:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: retq
entry:
@@ -161,26 +161,26 @@ entry:
define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_16i8_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -191,7 +191,7 @@ entry:
define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -201,7 +201,7 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -211,7 +211,7 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -219,7 +219,7 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -227,12 +227,12 @@ define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_16i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -243,7 +243,7 @@ entry:
define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_16i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm1
@@ -258,7 +258,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_16i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: movdqa %xmm3, %xmm1
@@ -273,7 +273,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_16i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
@@ -285,7 +285,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_16i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
@@ -299,7 +299,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_16i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
@@ -307,7 +307,7 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_16i8_to_16i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -317,7 +317,7 @@ entry:
define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -325,17 +325,17 @@ define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_16i8_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -346,7 +346,7 @@ entry:
define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -357,14 +357,14 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
@@ -372,7 +372,7 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
@@ -380,12 +380,12 @@ define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_16i8_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -396,7 +396,7 @@ entry:
define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i8_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
@@ -413,7 +413,7 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i8_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
; SSSE3-NEXT: pshufb %xmm4, %xmm0
@@ -426,7 +426,7 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i8_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $16, %xmm1
@@ -439,7 +439,7 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i8_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
@@ -453,7 +453,7 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i8_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
@@ -461,7 +461,7 @@ define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_16i8_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -472,24 +472,24 @@ entry:
define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_8i16_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: retq
entry:
@@ -500,7 +500,7 @@ entry:
define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -508,7 +508,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -516,7 +516,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -524,7 +524,7 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -532,12 +532,12 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i16_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: retq
entry:
@@ -547,7 +547,7 @@ entry:
define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_16i16_to_16i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm4
@@ -559,7 +559,7 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_16i16_to_16i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm4, %xmm4
@@ -571,7 +571,7 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_16i16_to_16i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -583,7 +583,7 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_16i16_to_16i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
@@ -597,7 +597,7 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_16i16_to_16i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -605,7 +605,7 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_16i16_to_16i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT: retq
entry:
@@ -615,26 +615,26 @@ entry:
define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_8i16_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -645,7 +645,7 @@ entry:
define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -655,7 +655,7 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -665,7 +665,7 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -673,7 +673,7 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
@@ -681,12 +681,12 @@ define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i16_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -697,7 +697,7 @@ entry:
define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i16_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm1
@@ -712,7 +712,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i16_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: movdqa %xmm3, %xmm1
@@ -727,7 +727,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i16_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
@@ -739,7 +739,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i16_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
@@ -753,7 +753,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i16_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
@@ -761,7 +761,7 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i16_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -771,24 +771,24 @@ entry:
define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_4i32_to_2i64:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_4i32_to_2i64:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_4i32_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: zext_4i32_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT: retq
entry:
@@ -799,23 +799,23 @@ entry:
define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_4i32_to_4i64:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_4i32_to_4i64:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movaps %xmm0, %xmm1
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_4i32_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
@@ -823,7 +823,7 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_4i32_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -831,12 +831,12 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_4i32_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_4i32_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: retq
entry:
@@ -846,31 +846,31 @@ entry:
define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: zext_8i32_to_8i64:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: xorps %xmm4, %xmm4
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i32_to_8i64:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movaps %xmm1, %xmm3
+; SSSE3-NEXT: movaps %xmm0, %xmm1
+; SSSE3-NEXT: xorps %xmm4, %xmm4
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSSE3-NEXT: movaps %xmm3, %xmm2
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSSE3-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i32_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -882,7 +882,7 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i32_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
@@ -896,7 +896,7 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i32_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -904,7 +904,7 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i32_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT: retq
entry:
@@ -914,7 +914,7 @@ entry:
define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) {
; SSE2-LABEL: load_zext_2i8_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movzwl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
@@ -924,19 +924,19 @@ define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_2i8_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movzwl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_2i8_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: load_zext_2i8_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -947,7 +947,7 @@ entry:
define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) {
; SSE2-LABEL: load_zext_4i8_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -955,7 +955,7 @@ define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i8_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -963,12 +963,12 @@ define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i8_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: load_zext_4i8_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -979,7 +979,7 @@ entry:
define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_zext_4i8_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -990,7 +990,7 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i8_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
@@ -998,25 +998,25 @@ define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i8_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_4i8_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_4i8_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_4i8_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -1027,26 +1027,26 @@ entry:
define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: load_zext_8i8_to_8i16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX-NEXT: retq
entry:
@@ -1057,7 +1057,7 @@ entry:
define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1067,7 +1067,7 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1077,25 +1077,25 @@ define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_8i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_8i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_8i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -1106,7 +1106,7 @@ entry:
define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1116,7 +1116,7 @@ define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_16i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1126,7 +1126,7 @@ define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_16i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa (%rdi), %xmm1
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
@@ -1134,7 +1134,7 @@ define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_16i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -1143,12 +1143,12 @@ define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_16i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_16i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -1160,7 +1160,7 @@ entry:
define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-LABEL: load_zext_8i8_to_8i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3]
@@ -1177,7 +1177,7 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i8_to_8i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,128,128,128,128,128,128,128,1,128,128,128,128,128,128,128]
; SSSE3-NEXT: movdqa %xmm1, %xmm0
@@ -1191,7 +1191,7 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i8_to_8i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
@@ -1199,7 +1199,7 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_8i8_to_8i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -1209,13 +1209,13 @@ define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_8i8_to_8i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_8i8_to_8i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -1226,7 +1226,7 @@ entry:
define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: load_zext_16i8_to_16i16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -1235,7 +1235,7 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_16i8_to_16i16:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
@@ -1244,25 +1244,25 @@ define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_16i8_to_16i16:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_16i8_to_16i16:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_16i8_to_16i16:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_16i8_to_16i16:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: retq
entry:
@@ -1273,7 +1273,7 @@ entry:
define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) {
; SSE2-LABEL: load_zext_2i16_to_2i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1281,7 +1281,7 @@ define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_2i16_to_2i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1289,12 +1289,12 @@ define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_2i16_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: load_zext_2i16_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -1305,26 +1305,26 @@ entry:
define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) {
; SSE2-LABEL: load_zext_4i16_to_4i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i16_to_4i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i16_to_4i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: load_zext_4i16_to_4i32:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: retq
entry:
@@ -1335,7 +1335,7 @@ entry:
define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_zext_4i16_to_4i64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -1345,7 +1345,7 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i16_to_4i64:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
@@ -1355,25 +1355,25 @@ define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i16_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_4i16_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_4i16_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_4i16_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX512-NEXT: retq
entry:
@@ -1384,7 +1384,7 @@ entry:
define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-LABEL: load_zext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
@@ -1393,7 +1393,7 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
@@ -1402,25 +1402,25 @@ define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_8i16_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_8i16_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_8i16_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_8i16_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX512-NEXT: retq
entry:
@@ -1431,26 +1431,26 @@ entry:
define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) {
; SSE2-LABEL: load_zext_2i32_to_2i64:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_2i32_to_2i64:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_2i32_to_2i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: load_zext_2i32_to_2i64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX-NEXT: retq
entry:
@@ -1461,43 +1461,43 @@ entry:
define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
; SSE2-LABEL: load_zext_4i32_to_4i64:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movaps (%rdi), %xmm1
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_zext_4i32_to_4i64:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movaps (%rdi), %xmm1
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_zext_4i32_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_zext_4i32_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_zext_4i32_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_zext_4i32_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX512-NEXT: retq
entry:
@@ -1508,7 +1508,7 @@ entry:
define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE2-LABEL: zext_8i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -1518,7 +1518,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_8i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
@@ -1528,7 +1528,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_8i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1537,7 +1537,7 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_8i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1546,13 +1546,13 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_8i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_8i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: retq
@@ -1563,7 +1563,7 @@ entry:
define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -1571,7 +1571,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -1579,7 +1579,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
@@ -1587,7 +1587,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i16_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -1595,12 +1595,12 @@ define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i16_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: retq
entry:
@@ -1611,23 +1611,23 @@ entry:
define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_4i32_to_4i64:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3: # %bb.0: # %entry
+; SSSE3-NEXT: movaps %xmm0, %xmm1
+; SSSE3-NEXT: xorps %xmm2, %xmm2
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_4i32_to_4i64:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
@@ -1635,7 +1635,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_4i32_to_4i64:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
@@ -1643,12 +1643,12 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_4i32_to_4i64:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: retq
entry:
@@ -1659,7 +1659,7 @@ entry:
define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-LABEL: shuf_zext_8i8_to_8i32:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pand {{.*}}(%rip), %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
@@ -1671,7 +1671,7 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pand {{.*}}(%rip), %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
@@ -1681,7 +1681,7 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i8_to_8i32:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -1690,7 +1690,7 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i8_to_8i32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
@@ -1699,13 +1699,13 @@ define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i8_to_8i32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i8_to_8i32:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX512-NEXT: retq
@@ -1717,7 +1717,7 @@ entry:
define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -1725,18 +1725,18 @@ define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrlq $48, %xmm0
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
@@ -1748,7 +1748,7 @@ entry:
define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -1761,14 +1761,14 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
@@ -1778,7 +1778,7 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -1787,13 +1787,13 @@ define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: retq
@@ -1805,7 +1805,7 @@ entry:
define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1813,18 +1813,18 @@ define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
@@ -1836,7 +1836,7 @@ entry:
define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -1846,7 +1846,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -1856,7 +1856,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1865,7 +1865,7 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
@@ -1874,13 +1874,13 @@ define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT: retq
@@ -1892,27 +1892,27 @@ entry:
define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE41-NEXT: retq
;
; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: retq
@@ -1924,7 +1924,7 @@ entry:
define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -1933,7 +1933,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: pxor %xmm2, %xmm2
@@ -1942,7 +1942,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; SSE41-NEXT: pxor %xmm2, %xmm2
@@ -1951,7 +1951,7 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
@@ -1960,13 +1960,13 @@ define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: retq
@@ -1978,7 +1978,7 @@ entry:
define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -1986,7 +1986,7 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
@@ -1994,7 +1994,7 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
@@ -2003,7 +2003,7 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -2013,13 +2013,13 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: retq
@@ -2031,15 +2031,15 @@ entry:
define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2:
-; SSE: # BB#0: # %entry
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: retq
;
; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: retq
entry:
%B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 3, i32 4>
@@ -2049,7 +2049,7 @@ entry:
define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp {
; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
; SSE2-NEXT: pand %xmm1, %xmm0
@@ -2057,7 +2057,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
; SSE2-NEXT: retq
;
; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; SSSE3: # BB#0: # %entry
+; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
; SSSE3-NEXT: pand %xmm1, %xmm0
@@ -2065,7 +2065,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; SSE41: # BB#0: # %entry
+; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
@@ -2073,7 +2073,7 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -2081,13 +2081,13 @@ define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: retq
;
; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: retq
@@ -2099,7 +2099,7 @@ entry:
define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSE2-LABEL: zext_32i8_to_32i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -2131,7 +2131,7 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: zext_32i8_to_32i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pxor %xmm2, %xmm2
; SSSE3-NEXT: movdqa %xmm0, %xmm3
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
@@ -2163,7 +2163,7 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSSE3-NEXT: retq
;
; SSE41-LABEL: zext_32i8_to_32i32:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
@@ -2190,7 +2190,7 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; SSE41-NEXT: retq
;
; AVX1-LABEL: zext_32i8_to_32i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
@@ -2214,7 +2214,7 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: zext_32i8_to_32i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
@@ -2230,7 +2230,7 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
; AVX2-NEXT: retq
;
; AVX512-LABEL: zext_32i8_to_32i32:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
@@ -2239,3 +2239,40 @@ define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
%res = zext <32 x i8>%x to <32 x i32>
ret <32 x i32> %res
}
+
+define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) {
+; SSE2-LABEL: zext_2i8_to_2i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movzwl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT: paddq %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: zext_2i8_to_2i32:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movzwl (%rdi), %eax
+; SSSE3-NEXT: movd %eax, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSSE3-NEXT: paddq %xmm0, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: zext_2i8_to_2i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT: paddq %xmm0, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: zext_2i8_to_2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %x = load <2 x i8>, <2 x i8>* %addr, align 1
+ %y = zext <2 x i8> %x to <2 x i32>
+ %z = add <2 x i32>%y, %y
+ ret <2 x i32>%z
+}
diff --git a/test/CodeGen/X86/vector-zmov.ll b/test/CodeGen/X86/vector-zmov.ll
index 106177ecda85..6f2f78263b28 100644
--- a/test/CodeGen/X86/vector-zmov.ll
+++ b/test/CodeGen/X86/vector-zmov.ll
@@ -7,12 +7,12 @@
define <4 x i32> @load_zmov_4i32_to_0zzz(<4 x i32> *%ptr) {
; SSE-LABEL: load_zmov_4i32_to_0zzz:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: load_zmov_4i32_to_0zzz:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: retq
entry:
@@ -23,12 +23,12 @@ entry:
define <2 x i64> @load_zmov_2i64_to_0z(<2 x i64> *%ptr) {
; SSE-LABEL: load_zmov_2i64_to_0z:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: load_zmov_2i64_to_0z:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vector.ll b/test/CodeGen/X86/vector.ll
index 39e7f0e0b066..a731643d7d4e 100644
--- a/test/CodeGen/X86/vector.ll
+++ b/test/CodeGen/X86/vector.ll
@@ -1,6 +1,6 @@
; Test that vectors are scalarized/lowered correctly.
-; RUN: llc < %s -march=x86 -mcpu=i386
-; RUN: llc < %s -march=x86 -mcpu=yonah
+; RUN: llc < %s -mtriple=i686-- -mcpu=i386
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah
%d8 = type <8 x double>
%f1 = type <1 x float>
diff --git a/test/CodeGen/X86/verifier-phi-fail0.mir b/test/CodeGen/X86/verifier-phi-fail0.mir
new file mode 100644
index 000000000000..c17b0daa75ba
--- /dev/null
+++ b/test/CodeGen/X86/verifier-phi-fail0.mir
@@ -0,0 +1,30 @@
+# RUN: not llc -o - %s -mtriple=x86_64-- -verify-machineinstrs -run-pass=none 2>&1 | FileCheck %s
+---
+# CHECK: Bad machine code: PHI operand is not live-out from predecessor
+# CHECK: - function: func0
+# CHECK: - basic block: %bb.3
+# CHECK: - instruction: %0:gr32 = PHI
+# CHECK: - operand 1: %1
+#
+# CHECK: Bad machine code: PHI operand is not live-out from predecessor
+# CHECK: - function: func0
+# CHECK: - basic block: %bb.3
+# CHECK: - instruction: %0:gr32 = PHI
+# CHECK: - operand 3: %0
+name: func0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ JE_1 %bb.1, implicit undef %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+ %0:gr32 = IMPLICIT_DEF
+ JMP_1 %bb.3
+
+ bb.2:
+ %1:gr32 = IMPLICIT_DEF
+
+ bb.3:
+ %0:gr32 = PHI %1, %bb.1, %0, %bb.2
+...
diff --git a/test/CodeGen/X86/verifier-phi.mir b/test/CodeGen/X86/verifier-phi.mir
new file mode 100644
index 000000000000..78060dc0e736
--- /dev/null
+++ b/test/CodeGen/X86/verifier-phi.mir
@@ -0,0 +1,34 @@
+# RUN: llc -o - %s -mtriple=x86_64-- -verify-machineinstrs -run-pass=none | FileCheck %s
+# This should cleanly pass the machine verifier
+---
+# CHECK-LABEL: name: func0
+# CHECK: %0:gr32 = PHI undef %1:gr32, %bb.0, undef %1:gr32, %bb.1
+name: func0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ JE_1 %bb.1, implicit undef %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ %0 : gr32 = PHI undef %1 : gr32, %bb.0, undef %1 : gr32, %bb.1
+...
+---
+# CHECK-LABEL: name: func1
+# CHECK: %2:gr32 = PHI %0, %bb.0, %1, %bb.1
+name: func1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0 : gr32 = IMPLICIT_DEF
+ JE_1 %bb.1, implicit undef %eflags
+ JMP_1 %bb.2
+
+ bb.1:
+ %1 : gr32 = IMPLICIT_DEF
+
+ bb.2:
+ %2 : gr32 = PHI %0, %bb.0, %1, %bb.1
+...
diff --git a/test/CodeGen/X86/vfcmp.ll b/test/CodeGen/X86/vfcmp.ll
index f5f5293622b2..6d1b84d8b126 100644
--- a/test/CodeGen/X86/vfcmp.ll
+++ b/test/CodeGen/X86/vfcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
; PR2620
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index f5ec8e540b0b..fa8bbaa6554a 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -3,12 +3,12 @@
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512vl,+avx512bw --show-mc-encoding | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
define <4 x i32> @test_abs_gt_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_gt_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -16,14 +16,24 @@ define <4 x i32> @test_abs_gt_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_gt_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_abs_gt_v4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vpabsd %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_abs_gt_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpabsd %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_abs_gt_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpabsd %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_abs_gt_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
%abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -32,7 +42,7 @@ define <4 x i32> @test_abs_gt_v4i32(<4 x i32> %a) nounwind {
define <4 x i32> @test_abs_ge_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_ge_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -40,14 +50,24 @@ define <4 x i32> @test_abs_ge_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_ge_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_abs_ge_v4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vpabsd %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_abs_ge_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpabsd %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_abs_ge_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpabsd %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_abs_ge_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sge <4 x i32> %a, zeroinitializer
%abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -56,7 +76,7 @@ define <4 x i32> @test_abs_ge_v4i32(<4 x i32> %a) nounwind {
define <8 x i16> @test_abs_gt_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_abs_gt_v8i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: paddw %xmm1, %xmm0
@@ -64,14 +84,29 @@ define <8 x i16> @test_abs_gt_v8i16(<8 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_gt_v8i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsw %xmm0, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_abs_gt_v8i16:
-; AVX: # BB#0:
-; AVX-NEXT: vpabsw %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_abs_gt_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpabsw %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_abs_gt_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpabsw %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_abs_gt_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpabsw %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
+; AVX512F-NEXT: retq # encoding: [0xc3]
+;
+; AVX512BW-LABEL: test_abs_gt_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpabsw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
+; AVX512BW-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <8 x i16> zeroinitializer, %a
%b = icmp sgt <8 x i16> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
@@ -80,7 +115,7 @@ define <8 x i16> @test_abs_gt_v8i16(<8 x i16> %a) nounwind {
define <16 x i8> @test_abs_lt_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_abs_lt_v16i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
; SSE2-NEXT: paddb %xmm1, %xmm0
@@ -88,14 +123,29 @@ define <16 x i8> @test_abs_lt_v16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_lt_v16i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsb %xmm0, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_abs_lt_v16i8:
-; AVX: # BB#0:
-; AVX-NEXT: vpabsb %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_abs_lt_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpabsb %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_abs_lt_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpabsb %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_abs_lt_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpabsb %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
+; AVX512F-NEXT: retq # encoding: [0xc3]
+;
+; AVX512BW-LABEL: test_abs_lt_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpabsb %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
+; AVX512BW-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <16 x i8> zeroinitializer, %a
%b = icmp slt <16 x i8> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
@@ -104,7 +154,7 @@ define <16 x i8> @test_abs_lt_v16i8(<16 x i8> %a) nounwind {
define <4 x i32> @test_abs_le_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_le_v4i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -112,14 +162,24 @@ define <4 x i32> @test_abs_le_v4i32(<4 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_le_v4i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: retq
;
-; AVX-LABEL: test_abs_le_v4i32:
-; AVX: # BB#0:
-; AVX-NEXT: vpabsd %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_abs_le_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpabsd %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_abs_le_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpabsd %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_abs_le_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sle <4 x i32> %a, zeroinitializer
%abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
@@ -128,7 +188,7 @@ define <4 x i32> @test_abs_le_v4i32(<4 x i32> %a) nounwind {
define <8 x i32> @test_abs_gt_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_gt_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
@@ -140,13 +200,13 @@ define <8 x i32> @test_abs_gt_v8i32(<8 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_gt_v8i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: pabsd %xmm1, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_gt_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsd %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsd %xmm0, %xmm0
@@ -154,14 +214,14 @@ define <8 x i32> @test_abs_gt_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_gt_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsd %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_gt_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsd %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
@@ -170,7 +230,7 @@ define <8 x i32> @test_abs_gt_v8i32(<8 x i32> %a) nounwind {
define <8 x i32> @test_abs_ge_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_ge_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
@@ -182,13 +242,13 @@ define <8 x i32> @test_abs_ge_v8i32(<8 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_ge_v8i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: pabsd %xmm1, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_ge_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsd %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsd %xmm0, %xmm0
@@ -196,14 +256,14 @@ define <8 x i32> @test_abs_ge_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_ge_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsd %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_ge_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsd %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sge <8 x i32> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
@@ -212,7 +272,7 @@ define <8 x i32> @test_abs_ge_v8i32(<8 x i32> %a) nounwind {
define <16 x i16> @test_abs_gt_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_abs_gt_v16i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: paddw %xmm2, %xmm0
@@ -224,13 +284,13 @@ define <16 x i16> @test_abs_gt_v16i16(<16 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_gt_v16i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsw %xmm0, %xmm0
; SSSE3-NEXT: pabsw %xmm1, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_gt_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsw %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsw %xmm0, %xmm0
@@ -238,14 +298,19 @@ define <16 x i16> @test_abs_gt_v16i16(<16 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_gt_v16i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsw %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_abs_gt_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsw %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_abs_gt_v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
+; AVX512F-NEXT: retq # encoding: [0xc3]
+;
+; AVX512BW-LABEL: test_abs_gt_v16i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpabsw %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
+; AVX512BW-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <16 x i16> zeroinitializer, %a
%b = icmp sgt <16 x i16> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
@@ -254,7 +319,7 @@ define <16 x i16> @test_abs_gt_v16i16(<16 x i16> %a) nounwind {
define <32 x i8> @test_abs_lt_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_abs_lt_v32i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
@@ -266,13 +331,13 @@ define <32 x i8> @test_abs_lt_v32i8(<32 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_lt_v32i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsb %xmm0, %xmm0
; SSSE3-NEXT: pabsb %xmm1, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_lt_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsb %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsb %xmm0, %xmm0
@@ -280,14 +345,19 @@ define <32 x i8> @test_abs_lt_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_lt_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsb %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_abs_lt_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsb %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_abs_lt_v32i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
+; AVX512F-NEXT: retq # encoding: [0xc3]
+;
+; AVX512BW-LABEL: test_abs_lt_v32i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpabsb %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
+; AVX512BW-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <32 x i8> zeroinitializer, %a
%b = icmp slt <32 x i8> %a, zeroinitializer
%abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
@@ -296,7 +366,7 @@ define <32 x i8> @test_abs_lt_v32i8(<32 x i8> %a) nounwind {
define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_le_v8i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrad $31, %xmm2
; SSE2-NEXT: paddd %xmm2, %xmm0
@@ -308,13 +378,13 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_le_v8i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: pabsd %xmm1, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_le_v8i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsd %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsd %xmm0, %xmm0
@@ -322,14 +392,14 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsd %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_le_v8i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsd %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sle <8 x i32> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
@@ -338,7 +408,7 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind {
define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
; SSE2-LABEL: test_abs_le_16i32:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm0
@@ -358,7 +428,7 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_le_16i32:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsd %xmm0, %xmm0
; SSSE3-NEXT: pabsd %xmm1, %xmm1
; SSSE3-NEXT: pabsd %xmm2, %xmm2
@@ -366,7 +436,7 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_le_16i32:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsd %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsd %xmm0, %xmm0
@@ -378,15 +448,15 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_16i32:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsd %ymm0, %ymm0
; AVX2-NEXT: vpabsd %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_le_16i32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsd %zmm0, %zmm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsd %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <16 x i32> zeroinitializer, %a
%b = icmp sle <16 x i32> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
@@ -395,7 +465,7 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: test_abs_ge_v2i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
@@ -404,7 +474,7 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_abs_ge_v2i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -412,7 +482,7 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_ge_v2i64:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
@@ -420,9 +490,9 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_ge_v2i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsq %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsq %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x1f,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <2 x i64> zeroinitializer, %a
%b = icmp sge <2 x i64> %a, zeroinitializer
%abs = select <2 x i1> %b, <2 x i64> %a, <2 x i64> %tmp1neg
@@ -431,7 +501,7 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
; SSE-LABEL: test_abs_gt_v4i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrad $31, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
@@ -445,7 +515,7 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_abs_gt_v4i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
@@ -458,17 +528,17 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_gt_v4i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_gt_v4i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsq %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsq %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x1f,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <4 x i64> zeroinitializer, %a
%b = icmp sgt <4 x i64> %a, <i64 -1, i64 -1, i64 -1, i64 -1>
%abs = select <4 x i1> %b, <4 x i64> %a, <4 x i64> %tmp1neg
@@ -477,7 +547,7 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; SSE-LABEL: test_abs_le_v8i64:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrad $31, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
@@ -501,7 +571,7 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_abs_le_v8i64:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
@@ -522,8 +592,8 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
@@ -533,9 +603,9 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_le_v8i64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsq %zmm0, %zmm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsq %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0]
+; AVX512-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <8 x i64> zeroinitializer, %a
%b = icmp sle <8 x i64> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
@@ -544,7 +614,7 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
; SSE-LABEL: test_abs_le_v8i64_fold:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqu 32(%rdi), %xmm2
@@ -572,7 +642,7 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
; SSE-NEXT: retq
;
; AVX1-LABEL: test_abs_le_v8i64_fold:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %ymm0
; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
@@ -595,10 +665,10 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_le_v8i64_fold:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
@@ -608,9 +678,9 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
; AVX2-NEXT: retq
;
; AVX512-LABEL: test_abs_le_v8i64_fold:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpabsq (%rdi), %zmm0
-; AVX512-NEXT: retq
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpabsq (%rdi), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x1f,0x07]
+; AVX512-NEXT: retq # encoding: [0xc3]
%a = load <8 x i64>, <8 x i64>* %a.ptr, align 8
%tmp1neg = sub <8 x i64> zeroinitializer, %a
%b = icmp sle <8 x i64> %a, zeroinitializer
@@ -620,7 +690,7 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind {
; SSE2-LABEL: test_abs_lt_v64i8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtb %xmm0, %xmm5
@@ -640,7 +710,7 @@ define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_lt_v64i8:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsb %xmm0, %xmm0
; SSSE3-NEXT: pabsb %xmm1, %xmm1
; SSSE3-NEXT: pabsb %xmm2, %xmm2
@@ -648,7 +718,7 @@ define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_lt_v64i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsb %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsb %xmm0, %xmm0
@@ -660,21 +730,21 @@ define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_lt_v64i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsb %ymm0, %ymm0
; AVX2-NEXT: vpabsb %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_abs_lt_v64i8:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpabsb %ymm0, %ymm0
-; AVX512F-NEXT: vpabsb %ymm1, %ymm1
-; AVX512F-NEXT: retq
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpabsb %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc0]
+; AVX512F-NEXT: vpabsb %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1c,0xc9]
+; AVX512F-NEXT: retq # encoding: [0xc3]
;
; AVX512BW-LABEL: test_abs_lt_v64i8:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpabsb %zmm0, %zmm0
-; AVX512BW-NEXT: retq
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpabsb %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x1c,0xc0]
+; AVX512BW-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <64 x i8> zeroinitializer, %a
%b = icmp slt <64 x i8> %a, zeroinitializer
%abs = select <64 x i1> %b, <64 x i8> %tmp1neg, <64 x i8> %a
@@ -683,7 +753,7 @@ define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind {
define <32 x i16> @test_abs_gt_v32i16(<32 x i16> %a) nounwind {
; SSE2-LABEL: test_abs_gt_v32i16:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: psraw $15, %xmm4
; SSE2-NEXT: paddw %xmm4, %xmm0
@@ -703,7 +773,7 @@ define <32 x i16> @test_abs_gt_v32i16(<32 x i16> %a) nounwind {
; SSE2-NEXT: retq
;
; SSSE3-LABEL: test_abs_gt_v32i16:
-; SSSE3: # BB#0:
+; SSSE3: # %bb.0:
; SSSE3-NEXT: pabsw %xmm0, %xmm0
; SSSE3-NEXT: pabsw %xmm1, %xmm1
; SSSE3-NEXT: pabsw %xmm2, %xmm2
@@ -711,7 +781,7 @@ define <32 x i16> @test_abs_gt_v32i16(<32 x i16> %a) nounwind {
; SSSE3-NEXT: retq
;
; AVX1-LABEL: test_abs_gt_v32i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpabsw %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpabsw %xmm0, %xmm0
@@ -723,21 +793,21 @@ define <32 x i16> @test_abs_gt_v32i16(<32 x i16> %a) nounwind {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_abs_gt_v32i16:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpabsw %ymm0, %ymm0
; AVX2-NEXT: vpabsw %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_abs_gt_v32i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vpabsw %ymm0, %ymm0
-; AVX512F-NEXT: vpabsw %ymm1, %ymm1
-; AVX512F-NEXT: retq
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpabsw %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
+; AVX512F-NEXT: vpabsw %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x7d,0x1d,0xc9]
+; AVX512F-NEXT: retq # encoding: [0xc3]
;
; AVX512BW-LABEL: test_abs_gt_v32i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpabsw %zmm0, %zmm0
-; AVX512BW-NEXT: retq
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpabsw %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x1d,0xc0]
+; AVX512BW-NEXT: retq # encoding: [0xc3]
%tmp1neg = sub <32 x i16> zeroinitializer, %a
%b = icmp sgt <32 x i16> %a, zeroinitializer
%abs = select <32 x i1> %b, <32 x i16> %a, <32 x i16> %tmp1neg
diff --git a/test/CodeGen/X86/vmovq.ll b/test/CodeGen/X86/vmovq.ll
index 5c1ff7d06ee0..2b4ae6795733 100644
--- a/test/CodeGen/X86/vmovq.ll
+++ b/test/CodeGen/X86/vmovq.ll
@@ -4,7 +4,7 @@
define <2 x i64> @PR25554(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-LABEL: PR25554:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movl $1, %eax
; SSE-NEXT: movq %rax, %xmm1
; SSE-NEXT: por %xmm1, %xmm0
@@ -13,7 +13,7 @@ define <2 x i64> @PR25554(<2 x i64> %v0, <2 x i64> %v1) {
; SSE-NEXT: retq
;
; AVX-LABEL: PR25554:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: movl $1, %eax
; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/volatile.ll b/test/CodeGen/X86/volatile.ll
index 8d521b46f7c9..29a57976a4ab 100644
--- a/test/CodeGen/X86/volatile.ll
+++ b/test/CodeGen/X86/volatile.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse2 | grep movsd | count 5
-; RUN: llc < %s -march=x86 -mattr=sse2 -O0 | grep -v esp | grep movsd | count 5
+; RUN: llc < %s -mtriple=i686-- -mattr=sse2 | grep movsd | count 5
+; RUN: llc < %s -mtriple=i686-- -mattr=sse2 -O0 | grep -v esp | grep movsd | count 5
@x = external global double
diff --git a/test/CodeGen/X86/vortex-bug.ll b/test/CodeGen/X86/vortex-bug.ll
index b1f42681a508..a4114425437f 100644
--- a/test/CodeGen/X86/vortex-bug.ll
+++ b/test/CodeGen/X86/vortex-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
%struct.blktkntype = type { i32, i32 }
%struct.fieldstruc = type { [128 x i8], %struct.blktkntype*, i32, i32 }
diff --git a/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll b/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll
new file mode 100644
index 000000000000..004dcfb69a14
--- /dev/null
+++ b/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bitalg,+avx512vl | FileCheck %s
+
+declare i16 @llvm.x86.avx512.mask.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
+define i16 @test_vpshufbitqmb_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+; CHECK-LABEL: test_vpshufbitqmb_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufbitqmb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: ## kill: def %ax killed %ax killed %eax
+; CHECK-NEXT: retq
+ %res = call i16 @llvm.x86.avx512.mask.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
+ ret i16 %res
+}
+
+declare i32 @llvm.x86.avx512.mask.vpshufbitqmb.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+define i32 @test_vpshufbitqmb_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+; CHECK-LABEL: test_vpshufbitqmb_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vpshufbitqmb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT: kmovd %k0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = call i32 @llvm.x86.avx512.mask.vpshufbitqmb.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
+ ret i32 %res
+}
+
+declare i64 @llvm.x86.avx512.mask.vpshufbitqmb.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+define i64 @test_vpshufbitqmb_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+; CHECK-LABEL: test_vpshufbitqmb_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovq %rdi, %k1
+; CHECK-NEXT: vpshufbitqmb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT: kmovq %k0, %rax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %res = call i64 @llvm.x86.avx512.mask.vpshufbitqmb.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll
index 3ceff7ddd6eb..f31f6f6597cd 100644
--- a/test/CodeGen/X86/vselect-2.ll
+++ b/test/CodeGen/X86/vselect-2.ll
@@ -6,24 +6,24 @@
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test1:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
%select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
ret <4 x i32> %select
@@ -31,23 +31,23 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test2:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
%select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
ret <4 x i32> %select
@@ -55,18 +55,18 @@ define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE2-LABEL: test3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
@@ -75,17 +75,17 @@ define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE2-LABEL: test4:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
diff --git a/test/CodeGen/X86/vselect-avx.ll b/test/CodeGen/X86/vselect-avx.ll
index 5825a56b6f99..9c2ae113c149 100644
--- a/test/CodeGen/X86/vselect-avx.ll
+++ b/test/CodeGen/X86/vselect-avx.ll
@@ -17,7 +17,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @test(<4 x i16>* %a, <4 x i16>* %b) {
; AVX-LABEL: test:
-; AVX: ## BB#0: ## %body
+; AVX: ## %bb.0: ## %body
; AVX-NEXT: movq {{.*}}(%rip), %rax
; AVX-NEXT: movq %rax, (%rdi)
; AVX-NEXT: movq {{.*}}(%rip), %rax
@@ -39,7 +39,7 @@ body:
define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
; AVX1-LABEL: test2:
-; AVX1: ## BB#0: ## %bb
+; AVX1: ## %bb.0: ## %bb
; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
@@ -54,7 +54,7 @@ define void @test2(double** %call1559, i64 %indvars.iv4198, <4 x i1> %tmp1895) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test2:
-; AVX2: ## BB#0: ## %bb
+; AVX2: ## %bb.0: ## %bb
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX2-NEXT: movq (%rdi,%rsi,8), %rax
@@ -84,7 +84,7 @@ bb:
define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
; AVX1-LABEL: test3:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
@@ -99,15 +99,14 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: test3:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
@@ -123,10 +122,9 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
; AVX2-NEXT: retq
%tmp6 = srem <4 x i32> %induction30, <i32 3, i32 3, i32 3, i32 3>
@@ -144,28 +142,29 @@ define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17,
define <32 x i8> @PR22706(<32 x i1> %x) {
; AVX1-LABEL: PR22706:
-; AVX1: ## BB#0:
+; AVX1: ## %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm1
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR22706:
-; AVX2: ## BB#0:
+; AVX2: ## %bb.0:
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2]
-; AVX2-NEXT: vpblendvb %ymm0, {{.*}}(%rip), %ymm1, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
%tmp = select <32 x i1> %x, <32 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <32 x i8> <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
ret <32 x i8> %tmp
diff --git a/test/CodeGen/X86/vselect-constants.ll b/test/CodeGen/X86/vselect-constants.ll
new file mode 100644
index 000000000000..d19318441903
--- /dev/null
+++ b/test/CodeGen/X86/vselect-constants.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+; First, check the generic pattern for any 2 vector constants. Then, check special cases where
+; the constants are all off-by-one. Finally, check the extra special cases where the constants
+; include 0 or -1.
+; Each minimal select test is repeated with a more typical pattern that includes a compare to
+; generate the condition value.
+
+; TODO: If we don't have blendv, this can definitely be improved. There's also a selection of
+; chips where it makes sense to transform the general case blendv to 2 bit-ops. That should be
+; a uarch-specific transform. At some point (Ryzen?), the implementation should catch up to the
+; architecture, so blendv is as fast as a single bit-op.
+
+define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_C1_or_C2_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pandn {{.*}}(%rip), %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_C1_or_C2_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
+; AVX-NEXT: vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_C1_or_C2_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_C1_or_C2_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: pandn {{.*}}(%rip), %xmm1
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_C1_or_C2_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
+; AVX-NEXT: vblendvps %xmm0, {{.*}}(%rip), %xmm1, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @sel_Cplus1_or_C_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_Cplus1_or_C_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_Cplus1_or_C_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_Cplus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_Cplus1_or_C_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
+; SSE-NEXT: psubd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_Cplus1_or_C_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,0,4294967294,4294967295]
+; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_Cminus1_or_C_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_Cminus1_or_C_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_Cminus1_or_C_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_Cminus1_or_C_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_minus1_or_0_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_minus1_or_0_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_minus1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_minus1_or_0_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_minus1_or_0_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @sel_0_or_minus1_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_0_or_minus1_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_0_or_minus1_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_0_or_minus1_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_0_or_minus1_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_0_or_minus1_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @sel_1_or_0_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_1_or_0_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_1_or_0_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_1_or_0_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_1_or_0_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_1_or_0_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @sel_0_or_1_vec(<4 x i1> %cond) {
+; SSE-LABEL: sel_0_or_1_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: andnps {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sel_0_or_1_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vandnps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+}
+
+define <4 x i32> @cmp_sel_0_or_1_vec(<4 x i32> %x, <4 x i32> %y) {
+; SSE-LABEL: cmp_sel_0_or_1_vec:
+; SSE: # %bb.0:
+; SSE-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE-NEXT: pandn {{.*}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: cmp_sel_0_or_1_vec:
+; AVX: # %bb.0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = icmp eq <4 x i32> %x, %y
+ %add = select <4 x i1> %cond, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %add
+}
+
diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll
index 5524eaf397c9..11edc6a6be58 100644
--- a/test/CodeGen/X86/vselect-minmax.ll
+++ b/test/CodeGen/X86/vselect-minmax.ll
@@ -8,7 +8,7 @@
define <16 x i8> @test1(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test1:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -17,12 +17,12 @@ define <16 x i8> @test1(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test1:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -33,7 +33,7 @@ entry:
define <16 x i8> @test2(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test2:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
@@ -45,12 +45,12 @@ define <16 x i8> @test2(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test2:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test2:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -61,7 +61,7 @@ entry:
define <16 x i8> @test3(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test3:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -71,12 +71,12 @@ define <16 x i8> @test3(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test3:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -87,7 +87,7 @@ entry:
define <16 x i8> @test4(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test4:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
@@ -99,12 +99,12 @@ define <16 x i8> @test4(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test4:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -115,12 +115,12 @@ entry:
define <16 x i8> @test5(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test5:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -131,12 +131,12 @@ entry:
define <16 x i8> @test6(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test6:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test6:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -147,12 +147,12 @@ entry:
define <16 x i8> @test7(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test7:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test7:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -163,12 +163,12 @@ entry:
define <16 x i8> @test8(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test8:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test8:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -179,12 +179,12 @@ entry:
define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test9:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -195,12 +195,12 @@ entry:
define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test10:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test10:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -211,12 +211,12 @@ entry:
define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test11:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test11:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -227,12 +227,12 @@ entry:
define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test12:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -243,7 +243,7 @@ entry:
define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test13:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -255,12 +255,12 @@ define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test13:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test13:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -271,7 +271,7 @@ entry:
define <8 x i16> @test14(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test14:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psubusw %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -282,12 +282,12 @@ define <8 x i16> @test14(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test14:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test14:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -298,7 +298,7 @@ entry:
define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test15:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -310,12 +310,12 @@ define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test15:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test15:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -326,7 +326,7 @@ entry:
define <8 x i16> @test16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test16:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: psubusw %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -337,12 +337,12 @@ define <8 x i16> @test16(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test16:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test16:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -353,7 +353,7 @@ entry:
define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test17:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -362,12 +362,12 @@ define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test17:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test17:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -378,7 +378,7 @@ entry:
define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test18:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
@@ -390,12 +390,12 @@ define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test18:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test18:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -406,7 +406,7 @@ entry:
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test19:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
@@ -416,12 +416,12 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test19:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test19:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -432,7 +432,7 @@ entry:
define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test20:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
@@ -444,12 +444,12 @@ define <4 x i32> @test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test20:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test20:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -460,7 +460,7 @@ entry:
define <4 x i32> @test21(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test21:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -472,12 +472,12 @@ define <4 x i32> @test21(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test21:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test21:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -488,7 +488,7 @@ entry:
define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test22:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
@@ -503,12 +503,12 @@ define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test22:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test22:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -519,7 +519,7 @@ entry:
define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test23:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -531,12 +531,12 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test23:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test23:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -547,7 +547,7 @@ entry:
define <4 x i32> @test24(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test24:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
@@ -562,12 +562,12 @@ define <4 x i32> @test24(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test24:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test24:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -578,7 +578,7 @@ entry:
define <32 x i8> @test25(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test25:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtb %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -592,13 +592,13 @@ define <32 x i8> @test25(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test25:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm2, %xmm0
; SSE4-NEXT: pminsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test25:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
@@ -607,12 +607,12 @@ define <32 x i8> @test25(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test25:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test25:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -623,7 +623,7 @@ entry:
define <32 x i8> @test26(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test26:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pcmpgtb %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
@@ -643,13 +643,13 @@ define <32 x i8> @test26(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test26:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm2, %xmm0
; SSE4-NEXT: pminsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test26:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
@@ -658,12 +658,12 @@ define <32 x i8> @test26(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test26:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test26:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -674,7 +674,7 @@ entry:
define <32 x i8> @test27(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test27:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -690,13 +690,13 @@ define <32 x i8> @test27(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test27:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm2, %xmm0
; SSE4-NEXT: pmaxsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test27:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
@@ -705,12 +705,12 @@ define <32 x i8> @test27(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test27:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test27:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -721,7 +721,7 @@ entry:
define <32 x i8> @test28(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test28:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pcmpgtb %xmm1, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
@@ -741,13 +741,13 @@ define <32 x i8> @test28(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test28:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm2, %xmm0
; SSE4-NEXT: pmaxsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test28:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
@@ -756,12 +756,12 @@ define <32 x i8> @test28(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test28:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test28:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -772,13 +772,13 @@ entry:
define <32 x i8> @test29(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test29:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: pminub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test29:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
@@ -787,12 +787,12 @@ define <32 x i8> @test29(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test29:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test29:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -803,13 +803,13 @@ entry:
define <32 x i8> @test30(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test30:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: pminub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test30:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
@@ -818,12 +818,12 @@ define <32 x i8> @test30(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test30:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test30:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -834,13 +834,13 @@ entry:
define <32 x i8> @test31(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test31:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm2, %xmm0
; SSE-NEXT: pmaxub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test31:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
@@ -849,12 +849,12 @@ define <32 x i8> @test31(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test31:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test31:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -865,13 +865,13 @@ entry:
define <32 x i8> @test32(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test32:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm2, %xmm0
; SSE-NEXT: pmaxub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test32:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
@@ -880,12 +880,12 @@ define <32 x i8> @test32(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test32:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test32:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -896,13 +896,13 @@ entry:
define <16 x i16> @test33(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test33:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test33:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
@@ -911,12 +911,12 @@ define <16 x i16> @test33(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test33:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test33:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -927,13 +927,13 @@ entry:
define <16 x i16> @test34(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test34:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test34:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
@@ -942,12 +942,12 @@ define <16 x i16> @test34(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test34:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test34:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -958,13 +958,13 @@ entry:
define <16 x i16> @test35(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test35:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test35:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
@@ -973,12 +973,12 @@ define <16 x i16> @test35(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test35:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test35:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -989,13 +989,13 @@ entry:
define <16 x i16> @test36(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test36:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test36:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
@@ -1004,12 +1004,12 @@ define <16 x i16> @test36(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test36:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test36:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1020,7 +1020,7 @@ entry:
define <16 x i16> @test37(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test37:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -1040,13 +1040,13 @@ define <16 x i16> @test37(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test37:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm2, %xmm0
; SSE4-NEXT: pminuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test37:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
@@ -1055,12 +1055,12 @@ define <16 x i16> @test37(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test37:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test37:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1071,7 +1071,7 @@ entry:
define <16 x i16> @test38(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test38:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubusw %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm6
@@ -1090,13 +1090,13 @@ define <16 x i16> @test38(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test38:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm2, %xmm0
; SSE4-NEXT: pminuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test38:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
@@ -1105,12 +1105,12 @@ define <16 x i16> @test38(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test38:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test38:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1121,7 +1121,7 @@ entry:
define <16 x i16> @test39(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test39:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm6
@@ -1142,13 +1142,13 @@ define <16 x i16> @test39(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test39:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm2, %xmm0
; SSE4-NEXT: pmaxuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test39:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
@@ -1157,12 +1157,12 @@ define <16 x i16> @test39(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test39:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test39:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1173,7 +1173,7 @@ entry:
define <16 x i16> @test40(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test40:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psubusw %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
@@ -1190,13 +1190,13 @@ define <16 x i16> @test40(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test40:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm2, %xmm0
; SSE4-NEXT: pmaxuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test40:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
@@ -1205,12 +1205,12 @@ define <16 x i16> @test40(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test40:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test40:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1221,7 +1221,7 @@ entry:
define <8 x i32> @test41(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test41:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -1235,13 +1235,13 @@ define <8 x i32> @test41(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test41:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm2, %xmm0
; SSE4-NEXT: pminsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test41:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
@@ -1250,12 +1250,12 @@ define <8 x i32> @test41(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test41:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test41:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1266,7 +1266,7 @@ entry:
define <8 x i32> @test42(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test42:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
@@ -1286,13 +1286,13 @@ define <8 x i32> @test42(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test42:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm2, %xmm0
; SSE4-NEXT: pminsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test42:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
@@ -1301,12 +1301,12 @@ define <8 x i32> @test42(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test42:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test42:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1317,7 +1317,7 @@ entry:
define <8 x i32> @test43(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test43:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -1333,13 +1333,13 @@ define <8 x i32> @test43(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test43:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test43:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
@@ -1348,12 +1348,12 @@ define <8 x i32> @test43(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test43:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test43:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1364,7 +1364,7 @@ entry:
define <8 x i32> @test44(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test44:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
@@ -1384,13 +1384,13 @@ define <8 x i32> @test44(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test44:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test44:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
@@ -1399,12 +1399,12 @@ define <8 x i32> @test44(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test44:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test44:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1415,7 +1415,7 @@ entry:
define <8 x i32> @test45(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test45:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -1435,13 +1435,13 @@ define <8 x i32> @test45(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test45:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm2, %xmm0
; SSE4-NEXT: pminud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test45:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
@@ -1450,12 +1450,12 @@ define <8 x i32> @test45(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test45:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test45:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1466,7 +1466,7 @@ entry:
define <8 x i32> @test46(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test46:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
@@ -1492,13 +1492,13 @@ define <8 x i32> @test46(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test46:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm2, %xmm0
; SSE4-NEXT: pminud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test46:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
@@ -1507,12 +1507,12 @@ define <8 x i32> @test46(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test46:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test46:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1523,7 +1523,7 @@ entry:
define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test47:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm5, %xmm6
@@ -1544,13 +1544,13 @@ define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test47:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm2, %xmm0
; SSE4-NEXT: pmaxud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test47:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
@@ -1559,12 +1559,12 @@ define <8 x i32> @test47(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test47:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test47:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1575,7 +1575,7 @@ entry:
define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test48:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
@@ -1601,13 +1601,13 @@ define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test48:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm2, %xmm0
; SSE4-NEXT: pmaxud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test48:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
@@ -1616,12 +1616,12 @@ define <8 x i32> @test48(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test48:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test48:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -1632,7 +1632,7 @@ entry:
define <16 x i8> @test49(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test49:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1642,12 +1642,12 @@ define <16 x i8> @test49(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test49:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test49:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1658,7 +1658,7 @@ entry:
define <16 x i8> @test50(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test50:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
@@ -1670,12 +1670,12 @@ define <16 x i8> @test50(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test50:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test50:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1686,7 +1686,7 @@ entry:
define <16 x i8> @test51(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test51:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1696,12 +1696,12 @@ define <16 x i8> @test51(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test51:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test51:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1712,7 +1712,7 @@ entry:
define <16 x i8> @test52(<16 x i8> %a, <16 x i8> %b) {
; SSE2-LABEL: test52:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
@@ -1724,12 +1724,12 @@ define <16 x i8> @test52(<16 x i8> %a, <16 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test52:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test52:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1740,12 +1740,12 @@ entry:
define <16 x i8> @test53(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test53:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test53:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1756,12 +1756,12 @@ entry:
define <16 x i8> @test54(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test54:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test54:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1772,12 +1772,12 @@ entry:
define <16 x i8> @test55(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test55:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test55:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1788,12 +1788,12 @@ entry:
define <16 x i8> @test56(<16 x i8> %a, <16 x i8> %b) {
; SSE-LABEL: test56:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test56:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1804,12 +1804,12 @@ entry:
define <8 x i16> @test57(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test57:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test57:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1820,12 +1820,12 @@ entry:
define <8 x i16> @test58(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test58:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test58:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1836,12 +1836,12 @@ entry:
define <8 x i16> @test59(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test59:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test59:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1852,12 +1852,12 @@ entry:
define <8 x i16> @test60(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test60:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test60:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1868,7 +1868,7 @@ entry:
define <8 x i16> @test61(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test61:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -1881,12 +1881,12 @@ define <8 x i16> @test61(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test61:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test61:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1897,7 +1897,7 @@ entry:
define <8 x i16> @test62(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test62:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psubusw %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -1909,12 +1909,12 @@ define <8 x i16> @test62(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test62:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test62:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1925,7 +1925,7 @@ entry:
define <8 x i16> @test63(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test63:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -1938,12 +1938,12 @@ define <8 x i16> @test63(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test63:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test63:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1954,7 +1954,7 @@ entry:
define <8 x i16> @test64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test64:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psubusw %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -1966,12 +1966,12 @@ define <8 x i16> @test64(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test64:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test64:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -1982,7 +1982,7 @@ entry:
define <4 x i32> @test65(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test65:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -1992,12 +1992,12 @@ define <4 x i32> @test65(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test65:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test65:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2008,7 +2008,7 @@ entry:
define <4 x i32> @test66(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test66:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
@@ -2020,12 +2020,12 @@ define <4 x i32> @test66(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test66:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test66:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2036,7 +2036,7 @@ entry:
define <4 x i32> @test67(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test67:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm1
@@ -2046,12 +2046,12 @@ define <4 x i32> @test67(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test67:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test67:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2062,7 +2062,7 @@ entry:
define <4 x i32> @test68(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test68:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
@@ -2074,12 +2074,12 @@ define <4 x i32> @test68(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test68:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test68:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2090,7 +2090,7 @@ entry:
define <4 x i32> @test69(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test69:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -2103,12 +2103,12 @@ define <4 x i32> @test69(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test69:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test69:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2119,7 +2119,7 @@ entry:
define <4 x i32> @test70(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test70:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
@@ -2134,12 +2134,12 @@ define <4 x i32> @test70(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test70:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test70:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2150,7 +2150,7 @@ entry:
define <4 x i32> @test71(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test71:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -2163,12 +2163,12 @@ define <4 x i32> @test71(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test71:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test71:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2179,7 +2179,7 @@ entry:
define <4 x i32> @test72(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test72:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm2
@@ -2194,12 +2194,12 @@ define <4 x i32> @test72(<4 x i32> %a, <4 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test72:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: test72:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -2210,7 +2210,7 @@ entry:
define <32 x i8> @test73(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test73:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtb %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -2226,13 +2226,13 @@ define <32 x i8> @test73(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test73:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm2, %xmm0
; SSE4-NEXT: pmaxsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test73:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
@@ -2241,12 +2241,12 @@ define <32 x i8> @test73(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test73:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test73:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2257,7 +2257,7 @@ entry:
define <32 x i8> @test74(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test74:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pcmpgtb %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
@@ -2277,13 +2277,13 @@ define <32 x i8> @test74(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test74:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm2, %xmm0
; SSE4-NEXT: pmaxsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test74:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
@@ -2292,12 +2292,12 @@ define <32 x i8> @test74(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test74:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test74:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2308,7 +2308,7 @@ entry:
define <32 x i8> @test75(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test75:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtb %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -2324,13 +2324,13 @@ define <32 x i8> @test75(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test75:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm2, %xmm0
; SSE4-NEXT: pminsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test75:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
@@ -2339,12 +2339,12 @@ define <32 x i8> @test75(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test75:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test75:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2355,7 +2355,7 @@ entry:
define <32 x i8> @test76(<32 x i8> %a, <32 x i8> %b) {
; SSE2-LABEL: test76:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pcmpgtb %xmm1, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
@@ -2375,13 +2375,13 @@ define <32 x i8> @test76(<32 x i8> %a, <32 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test76:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm2, %xmm0
; SSE4-NEXT: pminsb %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test76:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
@@ -2390,12 +2390,12 @@ define <32 x i8> @test76(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test76:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test76:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2406,13 +2406,13 @@ entry:
define <32 x i8> @test77(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test77:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm2, %xmm0
; SSE-NEXT: pmaxub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test77:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
@@ -2421,12 +2421,12 @@ define <32 x i8> @test77(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test77:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test77:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2437,13 +2437,13 @@ entry:
define <32 x i8> @test78(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test78:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm2, %xmm0
; SSE-NEXT: pmaxub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test78:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
@@ -2452,12 +2452,12 @@ define <32 x i8> @test78(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test78:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test78:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2468,13 +2468,13 @@ entry:
define <32 x i8> @test79(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test79:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: pminub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test79:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
@@ -2483,12 +2483,12 @@ define <32 x i8> @test79(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test79:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test79:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2499,13 +2499,13 @@ entry:
define <32 x i8> @test80(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: test80:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm2, %xmm0
; SSE-NEXT: pminub %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test80:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
@@ -2514,12 +2514,12 @@ define <32 x i8> @test80(<32 x i8> %a, <32 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test80:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test80:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2530,13 +2530,13 @@ entry:
define <16 x i16> @test81(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test81:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test81:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
@@ -2545,12 +2545,12 @@ define <16 x i16> @test81(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test81:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test81:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2561,13 +2561,13 @@ entry:
define <16 x i16> @test82(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test82:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm2, %xmm0
; SSE-NEXT: pmaxsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test82:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
@@ -2576,12 +2576,12 @@ define <16 x i16> @test82(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test82:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test82:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2592,13 +2592,13 @@ entry:
define <16 x i16> @test83(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test83:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test83:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
@@ -2607,12 +2607,12 @@ define <16 x i16> @test83(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test83:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test83:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2623,13 +2623,13 @@ entry:
define <16 x i16> @test84(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: test84:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm2, %xmm0
; SSE-NEXT: pminsw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: test84:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
@@ -2638,12 +2638,12 @@ define <16 x i16> @test84(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test84:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test84:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2654,7 +2654,7 @@ entry:
define <16 x i16> @test85(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test85:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
@@ -2676,13 +2676,13 @@ define <16 x i16> @test85(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test85:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm2, %xmm0
; SSE4-NEXT: pmaxuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test85:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
@@ -2691,12 +2691,12 @@ define <16 x i16> @test85(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test85:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test85:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2707,7 +2707,7 @@ entry:
define <16 x i16> @test86(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test86:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psubusw %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm6
@@ -2726,13 +2726,13 @@ define <16 x i16> @test86(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test86:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm2, %xmm0
; SSE4-NEXT: pmaxuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test86:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
@@ -2741,12 +2741,12 @@ define <16 x i16> @test86(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test86:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test86:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2757,7 +2757,7 @@ entry:
define <16 x i16> @test87(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test87:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
@@ -2779,13 +2779,13 @@ define <16 x i16> @test87(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test87:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm2, %xmm0
; SSE4-NEXT: pminuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test87:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
@@ -2794,12 +2794,12 @@ define <16 x i16> @test87(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test87:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test87:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2810,7 +2810,7 @@ entry:
define <16 x i16> @test88(<16 x i16> %a, <16 x i16> %b) {
; SSE2-LABEL: test88:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psubusw %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm6
@@ -2829,13 +2829,13 @@ define <16 x i16> @test88(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test88:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm2, %xmm0
; SSE4-NEXT: pminuw %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test88:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
@@ -2844,12 +2844,12 @@ define <16 x i16> @test88(<16 x i16> %a, <16 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test88:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test88:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2860,7 +2860,7 @@ entry:
define <8 x i32> @test89(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test89:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
@@ -2876,13 +2876,13 @@ define <8 x i32> @test89(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test89:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test89:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
@@ -2891,12 +2891,12 @@ define <8 x i32> @test89(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test89:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test89:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2907,7 +2907,7 @@ entry:
define <8 x i32> @test90(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test90:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
@@ -2927,13 +2927,13 @@ define <8 x i32> @test90(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test90:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm2, %xmm0
; SSE4-NEXT: pmaxsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test90:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
@@ -2942,12 +2942,12 @@ define <8 x i32> @test90(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test90:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test90:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -2958,7 +2958,7 @@ entry:
define <8 x i32> @test91(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test91:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
@@ -2974,13 +2974,13 @@ define <8 x i32> @test91(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test91:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm2, %xmm0
; SSE4-NEXT: pminsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test91:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
@@ -2989,12 +2989,12 @@ define <8 x i32> @test91(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test91:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test91:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -3005,7 +3005,7 @@ entry:
define <8 x i32> @test92(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test92:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
@@ -3025,13 +3025,13 @@ define <8 x i32> @test92(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test92:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm2, %xmm0
; SSE4-NEXT: pminsd %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test92:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
@@ -3040,12 +3040,12 @@ define <8 x i32> @test92(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test92:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test92:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -3056,7 +3056,7 @@ entry:
define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test93:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
@@ -3078,13 +3078,13 @@ define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test93:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm2, %xmm0
; SSE4-NEXT: pmaxud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test93:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
@@ -3093,12 +3093,12 @@ define <8 x i32> @test93(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test93:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test93:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -3109,7 +3109,7 @@ entry:
define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test94:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
@@ -3135,13 +3135,13 @@ define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test94:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm2, %xmm0
; SSE4-NEXT: pmaxud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test94:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
@@ -3150,12 +3150,12 @@ define <8 x i32> @test94(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test94:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test94:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -3166,7 +3166,7 @@ entry:
define <8 x i32> @test95(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test95:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pxor %xmm4, %xmm6
@@ -3188,13 +3188,13 @@ define <8 x i32> @test95(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test95:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm2, %xmm0
; SSE4-NEXT: pminud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test95:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
@@ -3203,12 +3203,12 @@ define <8 x i32> @test95(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test95:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test95:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -3219,7 +3219,7 @@ entry:
define <8 x i32> @test96(<8 x i32> %a, <8 x i32> %b) {
; SSE2-LABEL: test96:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
@@ -3245,13 +3245,13 @@ define <8 x i32> @test96(<8 x i32> %a, <8 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test96:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm2, %xmm0
; SSE4-NEXT: pminud %xmm3, %xmm1
; SSE4-NEXT: retq
;
; AVX1-LABEL: test96:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
@@ -3260,12 +3260,12 @@ define <8 x i32> @test96(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test96:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test96:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
entry:
@@ -3278,7 +3278,7 @@ entry:
define <64 x i8> @test97(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test97:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pcmpgtb %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm6, %xmm9
@@ -3302,7 +3302,7 @@ define <64 x i8> @test97(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test97:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm4, %xmm0
; SSE4-NEXT: pminsb %xmm5, %xmm1
; SSE4-NEXT: pminsb %xmm6, %xmm2
@@ -3310,7 +3310,7 @@ define <64 x i8> @test97(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test97:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
@@ -3324,13 +3324,13 @@ define <64 x i8> @test97(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test97:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test97:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3341,7 +3341,7 @@ entry:
define <64 x i8> @test98(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test98:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm8, %xmm12
@@ -3377,7 +3377,7 @@ define <64 x i8> @test98(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test98:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm4, %xmm0
; SSE4-NEXT: pminsb %xmm5, %xmm1
; SSE4-NEXT: pminsb %xmm6, %xmm2
@@ -3385,7 +3385,7 @@ define <64 x i8> @test98(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test98:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
@@ -3399,13 +3399,13 @@ define <64 x i8> @test98(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test98:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test98:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3416,7 +3416,7 @@ entry:
define <64 x i8> @test99(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test99:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pcmpgtb %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm9
@@ -3443,7 +3443,7 @@ define <64 x i8> @test99(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test99:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm4, %xmm0
; SSE4-NEXT: pmaxsb %xmm5, %xmm1
; SSE4-NEXT: pmaxsb %xmm6, %xmm2
@@ -3451,7 +3451,7 @@ define <64 x i8> @test99(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test99:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
@@ -3465,13 +3465,13 @@ define <64 x i8> @test99(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test99:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test99:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3482,7 +3482,7 @@ entry:
define <64 x i8> @test100(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test100:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
@@ -3518,7 +3518,7 @@ define <64 x i8> @test100(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test100:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm4, %xmm0
; SSE4-NEXT: pmaxsb %xmm5, %xmm1
; SSE4-NEXT: pmaxsb %xmm6, %xmm2
@@ -3526,7 +3526,7 @@ define <64 x i8> @test100(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test100:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
@@ -3540,13 +3540,13 @@ define <64 x i8> @test100(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test100:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test100:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3557,7 +3557,7 @@ entry:
define <64 x i8> @test101(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm4, %xmm0
; SSE-NEXT: pminub %xmm5, %xmm1
; SSE-NEXT: pminub %xmm6, %xmm2
@@ -3565,7 +3565,7 @@ define <64 x i8> @test101(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
@@ -3579,13 +3579,13 @@ define <64 x i8> @test101(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test101:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3596,7 +3596,7 @@ entry:
define <64 x i8> @test102(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test102:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm4, %xmm0
; SSE-NEXT: pminub %xmm5, %xmm1
; SSE-NEXT: pminub %xmm6, %xmm2
@@ -3604,7 +3604,7 @@ define <64 x i8> @test102(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test102:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
@@ -3618,13 +3618,13 @@ define <64 x i8> @test102(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test102:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test102:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3635,7 +3635,7 @@ entry:
define <64 x i8> @test103(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test103:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm4, %xmm0
; SSE-NEXT: pmaxub %xmm5, %xmm1
; SSE-NEXT: pmaxub %xmm6, %xmm2
@@ -3643,7 +3643,7 @@ define <64 x i8> @test103(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test103:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
@@ -3657,13 +3657,13 @@ define <64 x i8> @test103(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test103:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test103:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3674,7 +3674,7 @@ entry:
define <64 x i8> @test104(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test104:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm4, %xmm0
; SSE-NEXT: pmaxub %xmm5, %xmm1
; SSE-NEXT: pmaxub %xmm6, %xmm2
@@ -3682,7 +3682,7 @@ define <64 x i8> @test104(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test104:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
@@ -3696,13 +3696,13 @@ define <64 x i8> @test104(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test104:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test104:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3713,7 +3713,7 @@ entry:
define <32 x i16> @test105(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test105:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm4, %xmm0
; SSE-NEXT: pminsw %xmm5, %xmm1
; SSE-NEXT: pminsw %xmm6, %xmm2
@@ -3721,7 +3721,7 @@ define <32 x i16> @test105(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test105:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
@@ -3735,13 +3735,13 @@ define <32 x i16> @test105(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test105:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test105:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3752,7 +3752,7 @@ entry:
define <32 x i16> @test106(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test106:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm4, %xmm0
; SSE-NEXT: pminsw %xmm5, %xmm1
; SSE-NEXT: pminsw %xmm6, %xmm2
@@ -3760,7 +3760,7 @@ define <32 x i16> @test106(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test106:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
@@ -3774,13 +3774,13 @@ define <32 x i16> @test106(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test106:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test106:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3791,7 +3791,7 @@ entry:
define <32 x i16> @test107(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test107:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm4, %xmm0
; SSE-NEXT: pmaxsw %xmm5, %xmm1
; SSE-NEXT: pmaxsw %xmm6, %xmm2
@@ -3799,7 +3799,7 @@ define <32 x i16> @test107(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test107:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
@@ -3813,13 +3813,13 @@ define <32 x i16> @test107(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test107:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test107:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3830,7 +3830,7 @@ entry:
define <32 x i16> @test108(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test108:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm4, %xmm0
; SSE-NEXT: pmaxsw %xmm5, %xmm1
; SSE-NEXT: pmaxsw %xmm6, %xmm2
@@ -3838,7 +3838,7 @@ define <32 x i16> @test108(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test108:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
@@ -3852,13 +3852,13 @@ define <32 x i16> @test108(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test108:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test108:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3869,7 +3869,7 @@ entry:
define <32 x i16> @test109(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test109:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm3, %xmm9
; SSE2-NEXT: pxor %xmm10, %xmm9
@@ -3905,7 +3905,7 @@ define <32 x i16> @test109(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test109:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm4, %xmm0
; SSE4-NEXT: pminuw %xmm5, %xmm1
; SSE4-NEXT: pminuw %xmm6, %xmm2
@@ -3913,7 +3913,7 @@ define <32 x i16> @test109(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test109:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
@@ -3927,13 +3927,13 @@ define <32 x i16> @test109(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test109:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test109:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -3944,7 +3944,7 @@ entry:
define <32 x i16> @test110(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test110:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm10
@@ -3974,7 +3974,7 @@ define <32 x i16> @test110(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test110:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm4, %xmm0
; SSE4-NEXT: pminuw %xmm5, %xmm1
; SSE4-NEXT: pminuw %xmm6, %xmm2
@@ -3982,7 +3982,7 @@ define <32 x i16> @test110(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test110:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
@@ -3996,13 +3996,13 @@ define <32 x i16> @test110(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test110:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test110:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -4013,7 +4013,7 @@ entry:
define <32 x i16> @test111(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test111:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm7, %xmm9
; SSE2-NEXT: pxor %xmm11, %xmm9
@@ -4052,7 +4052,7 @@ define <32 x i16> @test111(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test111:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm4, %xmm0
; SSE4-NEXT: pmaxuw %xmm5, %xmm1
; SSE4-NEXT: pmaxuw %xmm6, %xmm2
@@ -4060,7 +4060,7 @@ define <32 x i16> @test111(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test111:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
@@ -4074,13 +4074,13 @@ define <32 x i16> @test111(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test111:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test111:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -4091,7 +4091,7 @@ entry:
define <32 x i16> @test112(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test112:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: psubusw %xmm3, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm9
@@ -4120,7 +4120,7 @@ define <32 x i16> @test112(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test112:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm4, %xmm0
; SSE4-NEXT: pmaxuw %xmm5, %xmm1
; SSE4-NEXT: pmaxuw %xmm6, %xmm2
@@ -4128,7 +4128,7 @@ define <32 x i16> @test112(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test112:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
@@ -4142,13 +4142,13 @@ define <32 x i16> @test112(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test112:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test112:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -4159,7 +4159,7 @@ entry:
define <16 x i32> @test113(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test113:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pcmpgtd %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm6, %xmm9
@@ -4183,7 +4183,7 @@ define <16 x i32> @test113(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test113:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm4, %xmm0
; SSE4-NEXT: pminsd %xmm5, %xmm1
; SSE4-NEXT: pminsd %xmm6, %xmm2
@@ -4191,7 +4191,7 @@ define <16 x i32> @test113(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test113:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsd %xmm4, %xmm5, %xmm4
@@ -4205,13 +4205,13 @@ define <16 x i32> @test113(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test113:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test113:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4222,7 +4222,7 @@ entry:
define <16 x i32> @test114(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test114:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm8, %xmm12
@@ -4258,7 +4258,7 @@ define <16 x i32> @test114(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test114:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm4, %xmm0
; SSE4-NEXT: pminsd %xmm5, %xmm1
; SSE4-NEXT: pminsd %xmm6, %xmm2
@@ -4266,7 +4266,7 @@ define <16 x i32> @test114(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test114:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsd %xmm4, %xmm5, %xmm4
@@ -4280,13 +4280,13 @@ define <16 x i32> @test114(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test114:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test114:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4297,7 +4297,7 @@ entry:
define <16 x i32> @test115(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test115:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm9
@@ -4324,7 +4324,7 @@ define <16 x i32> @test115(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test115:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm4, %xmm0
; SSE4-NEXT: pmaxsd %xmm5, %xmm1
; SSE4-NEXT: pmaxsd %xmm6, %xmm2
@@ -4332,7 +4332,7 @@ define <16 x i32> @test115(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test115:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsd %xmm4, %xmm5, %xmm4
@@ -4346,13 +4346,13 @@ define <16 x i32> @test115(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test115:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test115:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4363,7 +4363,7 @@ entry:
define <16 x i32> @test116(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test116:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
@@ -4399,7 +4399,7 @@ define <16 x i32> @test116(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test116:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm4, %xmm0
; SSE4-NEXT: pmaxsd %xmm5, %xmm1
; SSE4-NEXT: pmaxsd %xmm6, %xmm2
@@ -4407,7 +4407,7 @@ define <16 x i32> @test116(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test116:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsd %xmm4, %xmm5, %xmm4
@@ -4421,13 +4421,13 @@ define <16 x i32> @test116(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test116:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test116:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4438,7 +4438,7 @@ entry:
define <16 x i32> @test117(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test117:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm9
; SSE2-NEXT: pxor %xmm10, %xmm9
@@ -4474,7 +4474,7 @@ define <16 x i32> @test117(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test117:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm4, %xmm0
; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm6, %xmm2
@@ -4482,7 +4482,7 @@ define <16 x i32> @test117(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test117:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
@@ -4496,13 +4496,13 @@ define <16 x i32> @test117(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test117:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test117:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4513,7 +4513,7 @@ entry:
define <16 x i32> @test118(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test118:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm7, %xmm0
@@ -4561,7 +4561,7 @@ define <16 x i32> @test118(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test118:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm4, %xmm0
; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm6, %xmm2
@@ -4569,7 +4569,7 @@ define <16 x i32> @test118(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test118:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
@@ -4583,13 +4583,13 @@ define <16 x i32> @test118(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test118:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test118:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4600,7 +4600,7 @@ entry:
define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test119:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm7, %xmm9
; SSE2-NEXT: pxor %xmm11, %xmm9
@@ -4639,7 +4639,7 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test119:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm4, %xmm0
; SSE4-NEXT: pmaxud %xmm5, %xmm1
; SSE4-NEXT: pmaxud %xmm6, %xmm2
@@ -4647,7 +4647,7 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test119:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm4
@@ -4661,13 +4661,13 @@ define <16 x i32> @test119(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test119:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test119:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4678,7 +4678,7 @@ entry:
define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test120:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm0
@@ -4726,7 +4726,7 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test120:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm4, %xmm0
; SSE4-NEXT: pmaxud %xmm5, %xmm1
; SSE4-NEXT: pmaxud %xmm6, %xmm2
@@ -4734,7 +4734,7 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test120:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm4
@@ -4748,13 +4748,13 @@ define <16 x i32> @test120(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test120:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test120:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4765,7 +4765,7 @@ entry:
define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test121:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm8
@@ -4829,7 +4829,7 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test121:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm7, %xmm9
; SSE4-NEXT: pcmpgtq %xmm3, %xmm9
@@ -4853,7 +4853,7 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test121:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -4869,7 +4869,7 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test121:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
@@ -4877,7 +4877,7 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test121:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -4888,7 +4888,7 @@ entry:
define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test122:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -4969,7 +4969,7 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test122:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm3, %xmm9
; SSE4-NEXT: pcmpgtq %xmm7, %xmm9
@@ -4997,7 +4997,7 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test122:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -5018,7 +5018,7 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test122:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
@@ -5029,7 +5029,7 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test122:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5040,7 +5040,7 @@ entry:
define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test123:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm8
@@ -5104,7 +5104,7 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test123:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm3, %xmm9
; SSE4-NEXT: pcmpgtq %xmm7, %xmm9
@@ -5127,7 +5127,7 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test123:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -5143,7 +5143,7 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test123:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
@@ -5151,7 +5151,7 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test123:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5162,7 +5162,7 @@ entry:
define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test124:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm11
; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -5244,7 +5244,7 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test124:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm7, %xmm9
; SSE4-NEXT: pcmpgtq %xmm3, %xmm9
@@ -5273,7 +5273,7 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test124:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -5294,7 +5294,7 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test124:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
@@ -5305,7 +5305,7 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test124:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5316,7 +5316,7 @@ entry:
define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test125:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm8
@@ -5380,7 +5380,7 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test125:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm10
@@ -5416,7 +5416,7 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test125:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -5441,7 +5441,7 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test125:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6
@@ -5454,7 +5454,7 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test125:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5465,7 +5465,7 @@ entry:
define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test126:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -5546,7 +5546,7 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test126:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm9
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm7, %xmm10
@@ -5587,7 +5587,7 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test126:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -5617,7 +5617,7 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test126:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
@@ -5633,7 +5633,7 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test126:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5644,7 +5644,7 @@ entry:
define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test127:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pxor %xmm9, %xmm8
@@ -5708,7 +5708,7 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test127:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm7, %xmm10
@@ -5744,7 +5744,7 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test127:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -5769,7 +5769,7 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test127:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
@@ -5782,7 +5782,7 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test127:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5793,7 +5793,7 @@ entry:
define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test128:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm11
; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -5875,7 +5875,7 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test128:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm9
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm10
@@ -5916,7 +5916,7 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test128:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -5946,7 +5946,7 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test128:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6
@@ -5962,7 +5962,7 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test128:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -5973,7 +5973,7 @@ entry:
define <64 x i8> @test129(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test129:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm7, %xmm3
; SSE2-NEXT: pcmpgtb %xmm8, %xmm3
@@ -6001,7 +6001,7 @@ define <64 x i8> @test129(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test129:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm4, %xmm0
; SSE4-NEXT: pmaxsb %xmm5, %xmm1
; SSE4-NEXT: pmaxsb %xmm6, %xmm2
@@ -6009,7 +6009,7 @@ define <64 x i8> @test129(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test129:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
@@ -6023,13 +6023,13 @@ define <64 x i8> @test129(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test129:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test129:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6040,7 +6040,7 @@ entry:
define <64 x i8> @test130(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test130:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: pcmpgtb %xmm7, %xmm12
@@ -6076,7 +6076,7 @@ define <64 x i8> @test130(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test130:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsb %xmm4, %xmm0
; SSE4-NEXT: pmaxsb %xmm5, %xmm1
; SSE4-NEXT: pmaxsb %xmm6, %xmm2
@@ -6084,7 +6084,7 @@ define <64 x i8> @test130(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test130:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
@@ -6098,13 +6098,13 @@ define <64 x i8> @test130(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test130:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test130:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6115,7 +6115,7 @@ entry:
define <64 x i8> @test131(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test131:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pcmpgtb %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm9
@@ -6142,7 +6142,7 @@ define <64 x i8> @test131(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test131:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm4, %xmm0
; SSE4-NEXT: pminsb %xmm5, %xmm1
; SSE4-NEXT: pminsb %xmm6, %xmm2
@@ -6150,7 +6150,7 @@ define <64 x i8> @test131(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test131:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
@@ -6164,13 +6164,13 @@ define <64 x i8> @test131(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test131:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test131:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6181,7 +6181,7 @@ entry:
define <64 x i8> @test132(<64 x i8> %a, <64 x i8> %b) {
; SSE2-LABEL: test132:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: movdqa %xmm7, %xmm12
@@ -6217,7 +6217,7 @@ define <64 x i8> @test132(<64 x i8> %a, <64 x i8> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test132:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsb %xmm4, %xmm0
; SSE4-NEXT: pminsb %xmm5, %xmm1
; SSE4-NEXT: pminsb %xmm6, %xmm2
@@ -6225,7 +6225,7 @@ define <64 x i8> @test132(<64 x i8> %a, <64 x i8> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test132:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
@@ -6239,13 +6239,13 @@ define <64 x i8> @test132(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test132:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test132:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6256,7 +6256,7 @@ entry:
define <64 x i8> @test133(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test133:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm4, %xmm0
; SSE-NEXT: pmaxub %xmm5, %xmm1
; SSE-NEXT: pmaxub %xmm6, %xmm2
@@ -6264,7 +6264,7 @@ define <64 x i8> @test133(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test133:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
@@ -6278,13 +6278,13 @@ define <64 x i8> @test133(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test133:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test133:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6295,7 +6295,7 @@ entry:
define <64 x i8> @test134(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test134:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxub %xmm4, %xmm0
; SSE-NEXT: pmaxub %xmm5, %xmm1
; SSE-NEXT: pmaxub %xmm6, %xmm2
@@ -6303,7 +6303,7 @@ define <64 x i8> @test134(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test134:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
@@ -6317,13 +6317,13 @@ define <64 x i8> @test134(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test134:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test134:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6334,7 +6334,7 @@ entry:
define <64 x i8> @test135(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test135:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm4, %xmm0
; SSE-NEXT: pminub %xmm5, %xmm1
; SSE-NEXT: pminub %xmm6, %xmm2
@@ -6342,7 +6342,7 @@ define <64 x i8> @test135(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test135:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
@@ -6356,13 +6356,13 @@ define <64 x i8> @test135(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test135:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test135:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6373,7 +6373,7 @@ entry:
define <64 x i8> @test136(<64 x i8> %a, <64 x i8> %b) {
; SSE-LABEL: test136:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminub %xmm4, %xmm0
; SSE-NEXT: pminub %xmm5, %xmm1
; SSE-NEXT: pminub %xmm6, %xmm2
@@ -6381,7 +6381,7 @@ define <64 x i8> @test136(<64 x i8> %a, <64 x i8> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test136:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
@@ -6395,13 +6395,13 @@ define <64 x i8> @test136(<64 x i8> %a, <64 x i8> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test136:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test136:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminub %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6412,7 +6412,7 @@ entry:
define <32 x i16> @test137(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test137:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm4, %xmm0
; SSE-NEXT: pmaxsw %xmm5, %xmm1
; SSE-NEXT: pmaxsw %xmm6, %xmm2
@@ -6420,7 +6420,7 @@ define <32 x i16> @test137(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test137:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
@@ -6434,13 +6434,13 @@ define <32 x i16> @test137(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test137:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test137:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6451,7 +6451,7 @@ entry:
define <32 x i16> @test138(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test138:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pmaxsw %xmm4, %xmm0
; SSE-NEXT: pmaxsw %xmm5, %xmm1
; SSE-NEXT: pmaxsw %xmm6, %xmm2
@@ -6459,7 +6459,7 @@ define <32 x i16> @test138(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test138:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
@@ -6473,13 +6473,13 @@ define <32 x i16> @test138(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test138:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test138:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6490,7 +6490,7 @@ entry:
define <32 x i16> @test139(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test139:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm4, %xmm0
; SSE-NEXT: pminsw %xmm5, %xmm1
; SSE-NEXT: pminsw %xmm6, %xmm2
@@ -6498,7 +6498,7 @@ define <32 x i16> @test139(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test139:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
@@ -6512,13 +6512,13 @@ define <32 x i16> @test139(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test139:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test139:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6529,7 +6529,7 @@ entry:
define <32 x i16> @test140(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: test140:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pminsw %xmm4, %xmm0
; SSE-NEXT: pminsw %xmm5, %xmm1
; SSE-NEXT: pminsw %xmm6, %xmm2
@@ -6537,7 +6537,7 @@ define <32 x i16> @test140(<32 x i16> %a, <32 x i16> %b) {
; SSE-NEXT: retq
;
; AVX1-LABEL: test140:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
@@ -6551,13 +6551,13 @@ define <32 x i16> @test140(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test140:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test140:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6568,7 +6568,7 @@ entry:
define <32 x i16> @test141(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test141:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm3, %xmm9
@@ -6608,7 +6608,7 @@ define <32 x i16> @test141(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test141:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm4, %xmm0
; SSE4-NEXT: pmaxuw %xmm5, %xmm1
; SSE4-NEXT: pmaxuw %xmm6, %xmm2
@@ -6616,7 +6616,7 @@ define <32 x i16> @test141(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test141:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
@@ -6630,13 +6630,13 @@ define <32 x i16> @test141(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test141:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test141:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6647,7 +6647,7 @@ entry:
define <32 x i16> @test142(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test142:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm10
@@ -6677,7 +6677,7 @@ define <32 x i16> @test142(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test142:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxuw %xmm4, %xmm0
; SSE4-NEXT: pmaxuw %xmm5, %xmm1
; SSE4-NEXT: pmaxuw %xmm6, %xmm2
@@ -6685,7 +6685,7 @@ define <32 x i16> @test142(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test142:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
@@ -6699,13 +6699,13 @@ define <32 x i16> @test142(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test142:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test142:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6716,7 +6716,7 @@ entry:
define <32 x i16> @test143(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test143:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: movdqa %xmm7, %xmm9
@@ -6756,7 +6756,7 @@ define <32 x i16> @test143(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test143:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm4, %xmm0
; SSE4-NEXT: pminuw %xmm5, %xmm1
; SSE4-NEXT: pminuw %xmm6, %xmm2
@@ -6764,7 +6764,7 @@ define <32 x i16> @test143(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test143:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
@@ -6778,13 +6778,13 @@ define <32 x i16> @test143(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test143:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test143:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6795,7 +6795,7 @@ entry:
define <32 x i16> @test144(<32 x i16> %a, <32 x i16> %b) {
; SSE2-LABEL: test144:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm10
@@ -6828,7 +6828,7 @@ define <32 x i16> @test144(<32 x i16> %a, <32 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test144:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminuw %xmm4, %xmm0
; SSE4-NEXT: pminuw %xmm5, %xmm1
; SSE4-NEXT: pminuw %xmm6, %xmm2
@@ -6836,7 +6836,7 @@ define <32 x i16> @test144(<32 x i16> %a, <32 x i16> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test144:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
@@ -6850,13 +6850,13 @@ define <32 x i16> @test144(<32 x i16> %a, <32 x i16> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test144:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test144:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
@@ -6867,7 +6867,7 @@ entry:
define <16 x i32> @test145(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test145:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: movdqa %xmm7, %xmm3
; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
@@ -6895,7 +6895,7 @@ define <16 x i32> @test145(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test145:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm4, %xmm0
; SSE4-NEXT: pmaxsd %xmm5, %xmm1
; SSE4-NEXT: pmaxsd %xmm6, %xmm2
@@ -6903,7 +6903,7 @@ define <16 x i32> @test145(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test145:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsd %xmm4, %xmm5, %xmm4
@@ -6917,13 +6917,13 @@ define <16 x i32> @test145(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test145:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test145:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -6934,7 +6934,7 @@ entry:
define <16 x i32> @test146(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test146:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm12
; SSE2-NEXT: pcmpgtd %xmm7, %xmm12
@@ -6970,7 +6970,7 @@ define <16 x i32> @test146(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test146:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxsd %xmm4, %xmm0
; SSE4-NEXT: pmaxsd %xmm5, %xmm1
; SSE4-NEXT: pmaxsd %xmm6, %xmm2
@@ -6978,7 +6978,7 @@ define <16 x i32> @test146(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test146:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxsd %xmm4, %xmm5, %xmm4
@@ -6992,13 +6992,13 @@ define <16 x i32> @test146(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test146:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test146:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7009,7 +7009,7 @@ entry:
define <16 x i32> @test147(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test147:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pcmpgtd %xmm7, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm9
@@ -7036,7 +7036,7 @@ define <16 x i32> @test147(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test147:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm4, %xmm0
; SSE4-NEXT: pminsd %xmm5, %xmm1
; SSE4-NEXT: pminsd %xmm6, %xmm2
@@ -7044,7 +7044,7 @@ define <16 x i32> @test147(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test147:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsd %xmm4, %xmm5, %xmm4
@@ -7058,13 +7058,13 @@ define <16 x i32> @test147(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test147:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test147:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7075,7 +7075,7 @@ entry:
define <16 x i32> @test148(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test148:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: movdqa %xmm7, %xmm12
@@ -7111,7 +7111,7 @@ define <16 x i32> @test148(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test148:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminsd %xmm4, %xmm0
; SSE4-NEXT: pminsd %xmm5, %xmm1
; SSE4-NEXT: pminsd %xmm6, %xmm2
@@ -7119,7 +7119,7 @@ define <16 x i32> @test148(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test148:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminsd %xmm4, %xmm5, %xmm4
@@ -7133,13 +7133,13 @@ define <16 x i32> @test148(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test148:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test148:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7150,7 +7150,7 @@ entry:
define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test149:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm9
@@ -7190,7 +7190,7 @@ define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test149:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm4, %xmm0
; SSE4-NEXT: pmaxud %xmm5, %xmm1
; SSE4-NEXT: pmaxud %xmm6, %xmm2
@@ -7198,7 +7198,7 @@ define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test149:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm4
@@ -7212,13 +7212,13 @@ define <16 x i32> @test149(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test149:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test149:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7229,7 +7229,7 @@ entry:
define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test150:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm7, %xmm0
@@ -7277,7 +7277,7 @@ define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test150:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pmaxud %xmm4, %xmm0
; SSE4-NEXT: pmaxud %xmm5, %xmm1
; SSE4-NEXT: pmaxud %xmm6, %xmm2
@@ -7285,7 +7285,7 @@ define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test150:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm4
@@ -7299,13 +7299,13 @@ define <16 x i32> @test150(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test150:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test150:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7316,7 +7316,7 @@ entry:
define <16 x i32> @test151(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test151:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm7, %xmm9
@@ -7356,7 +7356,7 @@ define <16 x i32> @test151(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test151:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm4, %xmm0
; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm6, %xmm2
@@ -7364,7 +7364,7 @@ define <16 x i32> @test151(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test151:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
@@ -7378,13 +7378,13 @@ define <16 x i32> @test151(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test151:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test151:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7395,7 +7395,7 @@ entry:
define <16 x i32> @test152(<16 x i32> %a, <16 x i32> %b) {
; SSE2-LABEL: test152:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: movdqa {{.*#+}} xmm14 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm0
@@ -7443,7 +7443,7 @@ define <16 x i32> @test152(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test152:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: pminud %xmm4, %xmm0
; SSE4-NEXT: pminud %xmm5, %xmm1
; SSE4-NEXT: pminud %xmm6, %xmm2
@@ -7451,7 +7451,7 @@ define <16 x i32> @test152(<16 x i32> %a, <16 x i32> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test152:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4
@@ -7465,13 +7465,13 @@ define <16 x i32> @test152(<16 x i32> %a, <16 x i32> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test152:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test152:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminud %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7484,7 +7484,7 @@ entry:
define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test153:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pxor %xmm11, %xmm8
@@ -7552,7 +7552,7 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test153:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm7, %xmm9
; SSE4-NEXT: pcmpgtq %xmm3, %xmm9
@@ -7573,7 +7573,7 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test153:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -7589,7 +7589,7 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test153:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
@@ -7597,7 +7597,7 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test153:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7608,7 +7608,7 @@ entry:
define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test154:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -7689,7 +7689,7 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test154:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm3, %xmm9
; SSE4-NEXT: pcmpgtq %xmm7, %xmm9
@@ -7714,7 +7714,7 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test154:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -7735,7 +7735,7 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test154:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
@@ -7746,7 +7746,7 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test154:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7757,7 +7757,7 @@ entry:
define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test155:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pxor %xmm11, %xmm8
@@ -7825,7 +7825,7 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test155:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm3, %xmm9
; SSE4-NEXT: pcmpgtq %xmm7, %xmm9
@@ -7845,7 +7845,7 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test155:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -7861,7 +7861,7 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test155:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
@@ -7869,7 +7869,7 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test155:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -7880,7 +7880,7 @@ entry:
define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test156:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm11
; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -7962,7 +7962,7 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test156:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa %xmm7, %xmm9
; SSE4-NEXT: pcmpgtq %xmm3, %xmm9
@@ -7988,7 +7988,7 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test156:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
@@ -8009,7 +8009,7 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test156:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4
; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
@@ -8020,7 +8020,7 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test156:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -8031,7 +8031,7 @@ entry:
define <8 x i64> @test157(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test157:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm8
; SSE2-NEXT: pxor %xmm11, %xmm8
@@ -8099,7 +8099,7 @@ define <8 x i64> @test157(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test157:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm10
@@ -8132,7 +8132,7 @@ define <8 x i64> @test157(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test157:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -8157,7 +8157,7 @@ define <8 x i64> @test157(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test157:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6
@@ -8170,7 +8170,7 @@ define <8 x i64> @test157(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test157:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -8181,7 +8181,7 @@ entry:
define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test158:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -8262,7 +8262,7 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test158:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm7, %xmm10
@@ -8300,7 +8300,7 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test158:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -8330,7 +8330,7 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test158:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
@@ -8346,7 +8346,7 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test158:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -8357,7 +8357,7 @@ entry:
define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test159:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm7, %xmm8
; SSE2-NEXT: pxor %xmm11, %xmm8
@@ -8425,7 +8425,7 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test159:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm7, %xmm10
@@ -8458,7 +8458,7 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test159:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -8483,7 +8483,7 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test159:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6
@@ -8496,7 +8496,7 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test159:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -8507,7 +8507,7 @@ entry:
define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; SSE2-LABEL: test160:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm7, %xmm11
; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm3, %xmm7
@@ -8589,7 +8589,7 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test160:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm8
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm10
@@ -8627,7 +8627,7 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test160:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm4
@@ -8657,7 +8657,7 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test160:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm4
; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6
@@ -8673,7 +8673,7 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test160:
-; AVX512F: # BB#0: # %entry
+; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
entry:
@@ -8684,7 +8684,7 @@ entry:
define <4 x i64> @test161(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test161:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -8718,7 +8718,7 @@ define <4 x i64> @test161(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test161:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm3, %xmm5
; SSE4-NEXT: pcmpgtq %xmm1, %xmm5
@@ -8732,7 +8732,7 @@ define <4 x i64> @test161(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test161:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -8742,13 +8742,13 @@ define <4 x i64> @test161(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test161:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test161:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -8759,7 +8759,7 @@ entry:
define <4 x i64> @test162(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test162:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -8799,7 +8799,7 @@ define <4 x i64> @test162(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test162:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm5
; SSE4-NEXT: pcmpgtq %xmm3, %xmm5
@@ -8815,7 +8815,7 @@ define <4 x i64> @test162(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test162:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -8828,7 +8828,7 @@ define <4 x i64> @test162(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test162:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
@@ -8836,7 +8836,7 @@ define <4 x i64> @test162(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test162:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -8847,7 +8847,7 @@ entry:
define <4 x i64> @test163(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test163:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -8881,7 +8881,7 @@ define <4 x i64> @test163(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test163:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm5
; SSE4-NEXT: pcmpgtq %xmm3, %xmm5
@@ -8894,7 +8894,7 @@ define <4 x i64> @test163(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test163:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -8904,13 +8904,13 @@ define <4 x i64> @test163(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test163:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test163:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -8921,7 +8921,7 @@ entry:
define <4 x i64> @test164(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test164:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -8961,7 +8961,7 @@ define <4 x i64> @test164(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test164:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm3, %xmm5
; SSE4-NEXT: pcmpgtq %xmm1, %xmm5
@@ -8978,7 +8978,7 @@ define <4 x i64> @test164(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test164:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -8991,7 +8991,7 @@ define <4 x i64> @test164(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test164:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
@@ -8999,7 +8999,7 @@ define <4 x i64> @test164(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test164:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9010,7 +9010,7 @@ entry:
define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test165:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -9044,7 +9044,7 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test165:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm6
@@ -9064,7 +9064,7 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test165:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9079,7 +9079,7 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test165:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
@@ -9088,7 +9088,7 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test165:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9099,7 +9099,7 @@ entry:
define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test166:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -9139,7 +9139,7 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test166:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm6
@@ -9162,7 +9162,7 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test166:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9180,7 +9180,7 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test166:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
@@ -9191,7 +9191,7 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test166:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9202,7 +9202,7 @@ entry:
define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test167:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: pxor %xmm4, %xmm5
@@ -9236,7 +9236,7 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test167:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm6
@@ -9256,7 +9256,7 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test167:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9271,7 +9271,7 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test167:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
@@ -9280,7 +9280,7 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test167:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9291,7 +9291,7 @@ entry:
define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test168:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -9331,7 +9331,7 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test168:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm6
@@ -9354,7 +9354,7 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test168:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9372,7 +9372,7 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test168:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
@@ -9383,7 +9383,7 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test168:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9394,7 +9394,7 @@ entry:
define <4 x i64> @test169(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test169:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
@@ -9430,7 +9430,7 @@ define <4 x i64> @test169(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test169:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm3, %xmm5
; SSE4-NEXT: pcmpgtq %xmm1, %xmm5
@@ -9443,7 +9443,7 @@ define <4 x i64> @test169(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test169:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -9453,13 +9453,13 @@ define <4 x i64> @test169(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test169:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test169:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9470,7 +9470,7 @@ entry:
define <4 x i64> @test170(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test170:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -9510,7 +9510,7 @@ define <4 x i64> @test170(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test170:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm5
; SSE4-NEXT: pcmpgtq %xmm3, %xmm5
@@ -9525,7 +9525,7 @@ define <4 x i64> @test170(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test170:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -9538,7 +9538,7 @@ define <4 x i64> @test170(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test170:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
@@ -9546,7 +9546,7 @@ define <4 x i64> @test170(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test170:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9557,7 +9557,7 @@ entry:
define <4 x i64> @test171(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test171:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
@@ -9593,7 +9593,7 @@ define <4 x i64> @test171(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test171:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm1, %xmm5
; SSE4-NEXT: pcmpgtq %xmm3, %xmm5
@@ -9605,7 +9605,7 @@ define <4 x i64> @test171(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test171:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -9615,13 +9615,13 @@ define <4 x i64> @test171(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test171:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test171:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9632,7 +9632,7 @@ entry:
define <4 x i64> @test172(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test172:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -9672,7 +9672,7 @@ define <4 x i64> @test172(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test172:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa %xmm3, %xmm5
; SSE4-NEXT: pcmpgtq %xmm1, %xmm5
@@ -9688,7 +9688,7 @@ define <4 x i64> @test172(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test172:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
@@ -9701,7 +9701,7 @@ define <4 x i64> @test172(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test172:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
@@ -9709,7 +9709,7 @@ define <4 x i64> @test172(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test172:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9720,7 +9720,7 @@ entry:
define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test173:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
@@ -9756,7 +9756,7 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test173:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm6
@@ -9775,7 +9775,7 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test173:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9790,7 +9790,7 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test173:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
@@ -9799,7 +9799,7 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test173:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9810,7 +9810,7 @@ entry:
define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test174:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -9850,7 +9850,7 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test174:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm6
@@ -9872,7 +9872,7 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test174:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9890,7 +9890,7 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test174:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
@@ -9901,7 +9901,7 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test174:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -9912,7 +9912,7 @@ entry:
define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test175:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm4
@@ -9948,7 +9948,7 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test175:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm3, %xmm6
@@ -9967,7 +9967,7 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test175:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -9982,7 +9982,7 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test175:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
@@ -9991,7 +9991,7 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test175:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -10002,7 +10002,7 @@ entry:
define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: test176:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm7, %xmm4
@@ -10042,7 +10042,7 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test176:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm4
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm6
@@ -10064,7 +10064,7 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test176:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10082,7 +10082,7 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test176:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm2
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
@@ -10093,7 +10093,7 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test176:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
entry:
@@ -10104,7 +10104,7 @@ entry:
define <2 x i64> @test177(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test177:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10123,7 +10123,7 @@ define <2 x i64> @test177(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test177:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
@@ -10132,19 +10132,19 @@ define <2 x i64> @test177(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test177:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test177:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test177:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10155,7 +10155,7 @@ entry:
define <2 x i64> @test178(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test178:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10177,7 +10177,7 @@ define <2 x i64> @test178(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test178:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: pcmpeqd %xmm3, %xmm3
@@ -10187,7 +10187,7 @@ define <2 x i64> @test178(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test178:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10195,7 +10195,7 @@ define <2 x i64> @test178(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test178:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10203,7 +10203,7 @@ define <2 x i64> @test178(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test178:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10214,7 +10214,7 @@ entry:
define <2 x i64> @test179(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test179:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10233,7 +10233,7 @@ define <2 x i64> @test179(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test179:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm1
@@ -10241,19 +10241,19 @@ define <2 x i64> @test179(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test179:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test179:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test179:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10264,7 +10264,7 @@ entry:
define <2 x i64> @test180(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test180:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10286,7 +10286,7 @@ define <2 x i64> @test180(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test180:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa %xmm1, %xmm3
; SSE4-NEXT: pcmpgtq %xmm2, %xmm3
@@ -10297,7 +10297,7 @@ define <2 x i64> @test180(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test180:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10305,7 +10305,7 @@ define <2 x i64> @test180(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test180:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10313,7 +10313,7 @@ define <2 x i64> @test180(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test180:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10324,7 +10324,7 @@ entry:
define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test181:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10343,7 +10343,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test181:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm2, %xmm3
@@ -10355,7 +10355,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test181:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -10364,7 +10364,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test181:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -10373,7 +10373,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test181:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10384,7 +10384,7 @@ entry:
define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test182:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10406,7 +10406,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test182:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm0
@@ -10420,7 +10420,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test182:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10431,7 +10431,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test182:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10442,7 +10442,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test182:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10453,7 +10453,7 @@ entry:
define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test183:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10472,7 +10472,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test183:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm3
@@ -10484,7 +10484,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test183:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10493,7 +10493,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test183:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10502,7 +10502,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test183:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10513,7 +10513,7 @@ entry:
define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test184:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10535,7 +10535,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test184:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: pxor %xmm3, %xmm0
@@ -10548,7 +10548,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test184:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -10559,7 +10559,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test184:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -10570,7 +10570,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test184:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10581,7 +10581,7 @@ entry:
define <2 x i64> @test185(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test185:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10601,7 +10601,7 @@ define <2 x i64> @test185(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test185:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
@@ -10610,19 +10610,19 @@ define <2 x i64> @test185(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test185:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test185:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test185:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10633,7 +10633,7 @@ entry:
define <2 x i64> @test186(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test186:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10655,7 +10655,7 @@ define <2 x i64> @test186(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test186:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: pcmpeqd %xmm3, %xmm3
@@ -10665,7 +10665,7 @@ define <2 x i64> @test186(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test186:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10673,7 +10673,7 @@ define <2 x i64> @test186(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test186:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10681,7 +10681,7 @@ define <2 x i64> @test186(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test186:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10692,7 +10692,7 @@ entry:
define <2 x i64> @test187(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test187:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10712,7 +10712,7 @@ define <2 x i64> @test187(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test187:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
@@ -10720,19 +10720,19 @@ define <2 x i64> @test187(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test187:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test187:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test187:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10743,7 +10743,7 @@ entry:
define <2 x i64> @test188(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test188:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10765,7 +10765,7 @@ define <2 x i64> @test188(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test188:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa %xmm1, %xmm3
; SSE4-NEXT: pcmpgtq %xmm2, %xmm3
@@ -10776,7 +10776,7 @@ define <2 x i64> @test188(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test188:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10784,7 +10784,7 @@ define <2 x i64> @test188(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test188:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
@@ -10792,7 +10792,7 @@ define <2 x i64> @test188(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test188:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminsq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10803,7 +10803,7 @@ entry:
define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test189:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10823,7 +10823,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test189:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm2, %xmm3
@@ -10835,7 +10835,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test189:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -10844,7 +10844,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test189:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -10853,7 +10853,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test189:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10864,7 +10864,7 @@ entry:
define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test190:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10886,7 +10886,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test190:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm0
@@ -10900,7 +10900,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test190:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10911,7 +10911,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test190:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10922,7 +10922,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test190:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10933,7 +10933,7 @@ entry:
define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test191:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -10953,7 +10953,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test191:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: movdqa %xmm1, %xmm3
@@ -10965,7 +10965,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test191:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10974,7 +10974,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test191:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
@@ -10983,7 +10983,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test191:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
@@ -10994,7 +10994,7 @@ entry:
define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test192:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm2, %xmm3
@@ -11016,7 +11016,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
; SSE2-NEXT: retq
;
; SSE4-LABEL: test192:
-; SSE4: # BB#0: # %entry
+; SSE4: # %bb.0: # %entry
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; SSE4-NEXT: pxor %xmm3, %xmm0
@@ -11029,7 +11029,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
; SSE4-NEXT: retq
;
; AVX1-LABEL: test192:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -11040,7 +11040,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
; AVX1-NEXT: retq
;
; AVX2-LABEL: test192:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
@@ -11051,7 +11051,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
; AVX2-NEXT: retq
;
; AVX512BW-LABEL: test192:
-; AVX512BW: # BB#0: # %entry
+; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpminuq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/vselect-packss.ll b/test/CodeGen/X86/vselect-packss.ll
new file mode 100644
index 000000000000..2cd22fc45416
--- /dev/null
+++ b/test/CodeGen/X86/vselect-packss.ll
@@ -0,0 +1,418 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512NOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW --check-prefix=AVX512BWNOVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW --check-prefix=AVX512BWVL
+
+;
+; General cases - packing of vector comparison to legal vector result types
+;
+
+define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE2-LABEL: vselect_packss_v16i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE2-NEXT: packsswb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: vselect_packss_v16i16:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE42-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE42-NEXT: packsswb %xmm1, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss_v16i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss_v16i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512NOBW-LABEL: vselect_packss_v16i16:
+; AVX512NOBW: # %bb.0:
+; AVX512NOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512NOBW-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512NOBW-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512NOBW-NEXT: vzeroupper
+; AVX512NOBW-NEXT: retq
+;
+; AVX512BWNOVL-LABEL: vselect_packss_v16i16:
+; AVX512BWNOVL: # %bb.0:
+; AVX512BWNOVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512BWNOVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512BWNOVL-NEXT: vzeroupper
+; AVX512BWNOVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: vselect_packss_v16i16:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; AVX512BWVL-NEXT: vpmovm2b %k0, %xmm0
+; AVX512BWVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %1 = icmp eq <16 x i16> %a0, %a1
+ %2 = sext <16 x i1> %1 to <16 x i8>
+ %3 = and <16 x i8> %2, %a2
+ %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %5 = and <16 x i8> %4, %a3
+ %6 = or <16 x i8> %3, %5
+ ret <16 x i8> %6
+}
+
+define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE2-LABEL: vselect_packss_v16i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: vselect_packss_v16i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT: pcmpeqd %xmm7, %xmm3
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm2
+; SSE42-NEXT: packssdw %xmm3, %xmm2
+; SSE42-NEXT: pcmpeqd %xmm5, %xmm1
+; SSE42-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE42-NEXT: packssdw %xmm1, %xmm0
+; SSE42-NEXT: packsswb %xmm2, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT: movdqa %xmm8, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss_v16i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss_v16i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm1
+; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512NOBW-LABEL: vselect_packss_v16i32:
+; AVX512NOBW: # %bb.0:
+; AVX512NOBW-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
+; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512NOBW-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512NOBW-NEXT: vzeroupper
+; AVX512NOBW-NEXT: retq
+;
+; AVX512BWNOVL-LABEL: vselect_packss_v16i32:
+; AVX512BWNOVL: # %bb.0:
+; AVX512BWNOVL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BWNOVL-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512BWNOVL-NEXT: vzeroupper
+; AVX512BWNOVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: vselect_packss_v16i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BWVL-NEXT: vpmovm2b %k0, %xmm0
+; AVX512BWVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %1 = icmp eq <16 x i32> %a0, %a1
+ %2 = sext <16 x i1> %1 to <16 x i8>
+ %3 = and <16 x i8> %2, %a2
+ %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %5 = and <16 x i8> %4, %a3
+ %6 = or <16 x i8> %3, %5
+ ret <16 x i8> %6
+}
+
+define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE2-LABEL: vselect_packss_v16i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: packssdw %xmm8, %xmm7
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: packssdw %xmm6, %xmm5
+; SSE2-NEXT: packssdw %xmm7, %xmm5
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
+; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: packssdw %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: packssdw %xmm2, %xmm1
+; SSE2-NEXT: packssdw %xmm3, %xmm1
+; SSE2-NEXT: packsswb %xmm5, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: vselect_packss_v16i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT: packssdw %xmm7, %xmm6
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT: packssdw %xmm5, %xmm4
+; SSE42-NEXT: packssdw %xmm6, %xmm4
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT: packssdw %xmm3, %xmm2
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: packssdw %xmm1, %xmm0
+; SSE42-NEXT: packssdw %xmm2, %xmm0
+; SSE42-NEXT: packsswb %xmm4, %xmm0
+; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT: pand %xmm0, %xmm1
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: por %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss_v16i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT: vpcmpeqq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX1-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss_v16i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT: vpcmpeqq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT: vpcmpeqq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpacksswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512NOBW-LABEL: vselect_packss_v16i64:
+; AVX512NOBW: # %bb.0:
+; AVX512NOBW-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512NOBW-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; AVX512NOBW-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512NOBW-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
+; AVX512NOBW-NEXT: vzeroupper
+; AVX512NOBW-NEXT: retq
+;
+; AVX512BWNOVL-LABEL: vselect_packss_v16i64:
+; AVX512BWNOVL: # %bb.0:
+; AVX512BWNOVL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512BWNOVL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; AVX512BWNOVL-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512BWNOVL-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
+; AVX512BWNOVL-NEXT: vzeroupper
+; AVX512BWNOVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: vselect_packss_v16i64:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
+; AVX512BWVL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
+; AVX512BWVL-NEXT: kunpckbw %k0, %k1, %k0
+; AVX512BWVL-NEXT: vpmovm2b %k0, %xmm0
+; AVX512BWVL-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %1 = icmp eq <16 x i64> %a0, %a1
+ %2 = sext <16 x i1> %1 to <16 x i8>
+ %3 = and <16 x i8> %2, %a2
+ %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %5 = and <16 x i8> %4, %a3
+ %6 = or <16 x i8> %3, %5
+ ret <16 x i8> %6
+}
+
+;
+; PACKSS case
+;
+
+define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE2-LABEL: vselect_packss:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE2-NEXT: packsswb %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: vselect_packss:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pcmpeqw %xmm3, %xmm1
+; SSE42-NEXT: pcmpeqw %xmm2, %xmm0
+; SSE42-NEXT: packsswb %xmm1, %xmm0
+; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: vselect_packss:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: vselect_packss:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512NOBW-LABEL: vselect_packss:
+; AVX512NOBW: # %bb.0:
+; AVX512NOBW-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512NOBW-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512NOBW-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512NOBW-NEXT: vzeroupper
+; AVX512NOBW-NEXT: retq
+;
+; AVX512BWNOVL-LABEL: vselect_packss:
+; AVX512BWNOVL: # %bb.0:
+; AVX512BWNOVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; AVX512BWNOVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWNOVL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512BWNOVL-NEXT: vzeroupper
+; AVX512BWNOVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: vselect_packss:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
+; AVX512BWVL-NEXT: vpmovm2w %k0, %ymm0
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %1 = icmp eq <16 x i16> %a0, %a1
+ %2 = sext <16 x i1> %1 to <16 x i16>
+ %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %4 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %5 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %3, <8 x i16> %4)
+ %6 = and <16 x i8> %5, %a2
+ %7 = xor <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %8 = and <16 x i8> %7, %a3
+ %9 = or <16 x i8> %6, %8
+ ret <16 x i8> %9
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
diff --git a/test/CodeGen/X86/vselect-pcmp.ll b/test/CodeGen/X86/vselect-pcmp.ll
index 7807991b455d..e36d74194aa5 100644
--- a/test/CodeGen/X86/vselect-pcmp.ll
+++ b/test/CodeGen/X86/vselect-pcmp.ll
@@ -13,12 +13,12 @@
define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) {
; AVX12-LABEL: signbit_sel_v16i8:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX12-NEXT: retq
;
; AVX512-LABEL: signbit_sel_v16i8:
-; AVX512: # BB#0:
+; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -32,7 +32,7 @@ define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask)
define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) {
; AVX-LABEL: signbit_sel_v8i16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -44,12 +44,12 @@ define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask)
define <4 x i32> @signbit_sel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; AVX12F-LABEL: signbit_sel_v4i32:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v4i32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtd %xmm2, %xmm3, %k1
; AVX512VL-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
@@ -61,12 +61,12 @@ define <4 x i32> @signbit_sel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask)
define <2 x i64> @signbit_sel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) {
; AVX12F-LABEL: signbit_sel_v2i64:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v2i64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1
; AVX512VL-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
@@ -78,12 +78,12 @@ define <2 x i64> @signbit_sel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask)
define <4 x float> @signbit_sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i32> %mask) {
; AVX12F-LABEL: signbit_sel_v4f32:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v4f32:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtd %xmm2, %xmm3, %k1
; AVX512VL-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
@@ -95,12 +95,12 @@ define <4 x float> @signbit_sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i32>
define <2 x double> @signbit_sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i64> %mask) {
; AVX12F-LABEL: signbit_sel_v2f64:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v2f64:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtq %xmm2, %xmm3, %k1
; AVX512VL-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
@@ -114,7 +114,7 @@ define <2 x double> @signbit_sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i6
define <32 x i8> @signbit_sel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %mask) {
; AVX1-LABEL: signbit_sel_v32i8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm3
@@ -126,13 +126,13 @@ define <32 x i8> @signbit_sel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %mask)
; AVX1-NEXT: retq
;
; AVX2-LABEL: signbit_sel_v32i8:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: signbit_sel_v32i8:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
@@ -145,7 +145,7 @@ define <32 x i8> @signbit_sel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %mask)
define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %mask) {
; AVX1-LABEL: signbit_sel_v16i16:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpcmpgtw %xmm3, %xmm4, %xmm3
@@ -157,15 +157,15 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
; AVX1-NEXT: retq
;
; AVX2-LABEL: signbit_sel_v16i16:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: signbit_sel_v16i16:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
@@ -176,24 +176,24 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %
define <8 x i32> @signbit_sel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) {
; AVX12-LABEL: signbit_sel_v8i32:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX12-NEXT: retq
;
; AVX512F-LABEL: signbit_sel_v8i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
-; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512F-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def %ymm2 killed %ymm2 def %zmm2
+; AVX512F-NEXT: # kill: def %ymm1 killed %ymm1 def %zmm1
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm3, %k1
; AVX512F-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v8i32:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtd %ymm2, %ymm3, %k1
; AVX512VL-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
@@ -204,13 +204,13 @@ define <8 x i32> @signbit_sel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask)
define <4 x i64> @signbit_sel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) {
; AVX12F-LABEL: signbit_sel_v4i64:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v4i64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1
; AVX512VL-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
@@ -221,13 +221,13 @@ define <4 x i64> @signbit_sel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask)
define <4 x double> @signbit_sel_v4f64(<4 x double> %x, <4 x double> %y, <4 x i64> %mask) {
; AVX12F-LABEL: signbit_sel_v4f64:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v4f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpxor %ymm3, %ymm3, %ymm3
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtq %ymm2, %ymm3, %k1
; AVX512VL-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
@@ -240,7 +240,7 @@ define <4 x double> @signbit_sel_v4f64(<4 x double> %x, <4 x double> %y, <4 x i6
define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> %y, <4 x i32> %mask) {
; AVX1-LABEL: signbit_sel_v4f64_small_mask:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
@@ -249,19 +249,19 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double>
; AVX1-NEXT: retq
;
; AVX2-LABEL: signbit_sel_v4f64_small_mask:
-; AVX2: # BB#0:
+; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: signbit_sel_v4f64_small_mask:
-; AVX512F: # BB#0:
+; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovsxdq %xmm2, %ymm2
; AVX512F-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v4f64_small_mask:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpgtd %xmm2, %xmm3, %k1
; AVX512VL-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
@@ -275,14 +275,14 @@ define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double>
define <8 x double> @signbit_sel_v8f64(<8 x double> %x, <8 x double> %y, <8 x i64> %mask) {
; AVX12-LABEL: signbit_sel_v8f64:
-; AVX12: # BB#0:
+; AVX12: # %bb.0:
; AVX12-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
; AVX12-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
; AVX12-NEXT: retq
;
; AVX512-LABEL: signbit_sel_v8f64:
-; AVX512: # BB#0:
-; AVX512-NEXT: vpxord %zmm3, %zmm3, %zmm3
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512-NEXT: vpcmpgtq %zmm2, %zmm3, %k1
; AVX512-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
@@ -297,14 +297,14 @@ define <8 x double> @signbit_sel_v8f64(<8 x double> %x, <8 x double> %y, <8 x i6
define <4 x float> @signbit_sel_v4f32_fcmp(<4 x float> %x, <4 x float> %y, <4 x float> %mask) #0 {
; AVX12F-LABEL: signbit_sel_v4f32_fcmp:
-; AVX12F: # BB#0:
+; AVX12F: # %bb.0:
; AVX12F-NEXT: vxorps %xmm2, %xmm2, %xmm2
; AVX12F-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; AVX12F-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX12F-NEXT: retq
;
; AVX512VL-LABEL: signbit_sel_v4f32_fcmp:
-; AVX512VL: # BB#0:
+; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vcmpltps %xmm2, %xmm0, %k1
; AVX512VL-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
diff --git a/test/CodeGen/X86/vselect-zero.ll b/test/CodeGen/X86/vselect-zero.ll
new file mode 100644
index 000000000000..8eb137a61ff7
--- /dev/null
+++ b/test/CodeGen/X86/vselect-zero.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+
+; PR28925
+
+define <4 x i32> @test1(<4 x i1> %cond, <4 x i32> %x) {
+; SSE-LABEL: test1:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test1:
+; AVX: # %bb.0:
+; AVX-NEXT: vpslld $31, %xmm0, %xmm0
+; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
+; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %r = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %x
+ ret <4 x i32> %r
+}
+
+define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %x) {
+; SSE-LABEL: test2:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpneqps %xmm1, %xmm0
+; SSE-NEXT: andps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: test2:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpneqps %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = fcmp oeq <4 x float> %a, %b
+ %r = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %x
+ ret <4 x i32> %r
+}
+
+define float @fsel(float %a, float %b, float %x) {
+; SSE-LABEL: fsel:
+; SSE: # %bb.0:
+; SSE-NEXT: cmpeqss %xmm1, %xmm0
+; SSE-NEXT: andnps %xmm2, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: fsel:
+; AVX: # %bb.0:
+; AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vandnps %xmm2, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %cond = fcmp oeq float %a, %b
+ %sel = select i1 %cond, float 0.0, float %x
+ ret float %sel
+}
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
index 9d08822f50fc..985f6a861b93 100644
--- a/test/CodeGen/X86/vselect.ll
+++ b/test/CodeGen/X86/vselect.ll
@@ -9,18 +9,18 @@
define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: test1:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test1:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; SSE41-NEXT: retq
;
; AVX-LABEL: test1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
@@ -29,18 +29,18 @@ define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: test2:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test2:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: test2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
@@ -49,17 +49,17 @@ define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: test3:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test3:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: test3:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
@@ -68,12 +68,12 @@ define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test4:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
@@ -82,11 +82,11 @@ define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test5:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: test5:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
@@ -94,11 +94,11 @@ define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test6:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: test6:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
ret <8 x i16> %1
@@ -106,24 +106,24 @@ define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test7:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test7:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test7:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test7:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
@@ -131,23 +131,23 @@ define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test8:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test8:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test8:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test8:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
@@ -155,12 +155,12 @@ define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test9:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test9:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
@@ -169,11 +169,11 @@ define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test10:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: test10:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
@@ -181,7 +181,7 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: test11:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535]
; SSE2-NEXT: andps %xmm2, %xmm0
; SSE2-NEXT: andnps %xmm1, %xmm2
@@ -189,12 +189,12 @@ define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
; SSE2-NEXT: retq
;
; SSE41-LABEL: test11:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: test11:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
@@ -203,12 +203,12 @@ define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test12:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test12:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
@@ -217,12 +217,12 @@ define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test13:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test13:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
@@ -232,11 +232,11 @@ define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test14:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: test14:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
ret <4 x float> %1
@@ -244,11 +244,11 @@ define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test15:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: retq
;
; AVX-LABEL: test15:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
ret <8 x i16> %1
@@ -257,12 +257,12 @@ define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test16:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test16:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
@@ -271,12 +271,12 @@ define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
; SSE-LABEL: test17:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test17:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
%1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
@@ -285,17 +285,17 @@ define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: test18:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test18:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: test18:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
@@ -304,23 +304,23 @@ define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test19:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test19:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test19:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test19:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT: retq
%1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %1
@@ -328,17 +328,17 @@ define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: test20:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test20:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: test20:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
@@ -347,23 +347,23 @@ define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test21:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test21:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test21:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test21:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: retq
%1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %1
@@ -371,18 +371,18 @@ define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: test22:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test22:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: test22:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
@@ -391,24 +391,24 @@ define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: test23:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test23:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test23:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test23:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX2-NEXT: retq
%1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
ret <4 x i32> %1
@@ -416,18 +416,18 @@ define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: test24:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test24:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; SSE41-NEXT: retq
;
; AVX-LABEL: test24:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
@@ -436,24 +436,24 @@ define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
; SSE2-LABEL: test25:
-; SSE2: # BB#0:
+; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: test25:
-; SSE41: # BB#0:
+; SSE41: # %bb.0:
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
; AVX1-LABEL: test25:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: test25:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2: # %bb.0:
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: retq
%1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
ret <2 x i64> %1
@@ -461,16 +461,16 @@ define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x float> %a1, <2 x float> %b1) {
; SSE-LABEL: select_of_shuffles_0:
-; SSE: # BB#0:
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE: # %bb.0:
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: subps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: select_of_shuffles_0:
-; AVX: # BB#0:
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX: # %bb.0:
+; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%1 = shufflevector <2 x float> %a0, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -486,7 +486,7 @@ define <4 x float> @select_of_shuffles_0(<2 x float> %a0, <2 x float> %b0, <2 x
; PR20677
define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
; SSE-LABEL: select_illegal:
-; SSE: # BB#0:
+; SSE: # %bb.0:
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
@@ -503,7 +503,7 @@ define <16 x double> @select_illegal(<16 x double> %a, <16 x double> %b) {
; SSE-NEXT: retq
;
; AVX-LABEL: select_illegal:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovaps %ymm6, %ymm2
; AVX-NEXT: vmovaps %ymm7, %ymm3
; AVX-NEXT: retq
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index a31adc337906..a2e1e7a641c5 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -7,14 +7,14 @@
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind {
; X32-LABEL: shift1a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psllq $32, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift1a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllq $32, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -26,7 +26,7 @@ entry:
define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
; X32-LABEL: shift1b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -35,7 +35,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift1b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rsi, %xmm1
; X64-NEXT: psllq %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -51,14 +51,14 @@ entry:
define void @shift2a(<4 x i32> %val, <4 x i32>* %dst) nounwind {
; X32-LABEL: shift2a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pslld $5, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift2a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pslld $5, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -70,7 +70,7 @@ entry:
define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-LABEL: shift2b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: pslld %xmm1, %xmm0
@@ -78,7 +78,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift2b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -95,14 +95,14 @@ entry:
define void @shift3a(<8 x i16> %val, <8 x i16>* %dst) nounwind {
; X32-LABEL: shift3a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psllw $5, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift3a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllw $5, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -115,7 +115,7 @@ entry:
; Make sure the shift amount is properly zero extended.
define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-LABEL: shift3b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movd %ecx, %xmm1
@@ -124,7 +124,7 @@ define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift3b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movzwl %si, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: psllw %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index a381637b40a9..6b01a8acdf4e 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -7,14 +7,14 @@
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind {
; X32-LABEL: shift1a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psrlq $32, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift1a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrlq $32, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -26,7 +26,7 @@ entry:
define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
; X32-LABEL: shift1b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
@@ -35,7 +35,7 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift1b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movq %rsi, %xmm1
; X64-NEXT: psrlq %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -50,14 +50,14 @@ entry:
define void @shift2a(<4 x i32> %val, <4 x i32>* %dst) nounwind {
; X32-LABEL: shift2a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psrld $17, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift2a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrld $17, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -69,7 +69,7 @@ entry:
define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-LABEL: shift2b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: psrld %xmm1, %xmm0
@@ -77,7 +77,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift2b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: psrld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -95,14 +95,14 @@ entry:
define void @shift3a(<8 x i16> %val, <8 x i16>* %dst) nounwind {
; X32-LABEL: shift3a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psrlw $5, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift3a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrlw $5, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -115,7 +115,7 @@ entry:
; properly zero extend the shift amount
define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-LABEL: shift3b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movd %ecx, %xmm1
@@ -124,7 +124,7 @@ define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift3b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movzwl %si, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: psrlw %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vshift-3.ll b/test/CodeGen/X86/vshift-3.ll
index c59dacec6e37..57261ab8a556 100644
--- a/test/CodeGen/X86/vshift-3.ll
+++ b/test/CodeGen/X86/vshift-3.ll
@@ -9,7 +9,7 @@
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind {
; X32-LABEL: shift1a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X32-NEXT: psrad $31, %xmm0
@@ -19,7 +19,7 @@ define void @shift1a(<2 x i64> %val, <2 x i64>* %dst) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift1a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X64-NEXT: psrad $31, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -34,14 +34,14 @@ entry:
define void @shift2a(<4 x i32> %val, <4 x i32>* %dst) nounwind {
; X32-LABEL: shift2a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psrad $5, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift2a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psrad $5, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -53,7 +53,7 @@ entry:
define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-LABEL: shift2b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: psrad %xmm1, %xmm0
@@ -61,7 +61,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift2b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: psrad %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -78,14 +78,14 @@ entry:
define void @shift3a(<8 x i16> %val, <8 x i16>* %dst) nounwind {
; X32-LABEL: shift3a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psraw $5, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift3a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psraw $5, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -97,7 +97,7 @@ entry:
define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-LABEL: shift3b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movd %ecx, %xmm1
@@ -106,7 +106,7 @@ define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift3b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movzwl %si, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: psraw %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index 5d486e794051..a49d6f384497 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -7,14 +7,14 @@
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
; X32-LABEL: shift1a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: psllq %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift1a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllq %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
; X64-NEXT: retq
@@ -28,24 +28,24 @@ entry:
; shift1b can't use a packed shift but can shift lanes separately and shuffle back together
define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
; X32-LABEL: shift1b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; X32-NEXT: movdqa %xmm0, %xmm3
-; X32-NEXT: psllq %xmm2, %xmm3
+; X32-NEXT: movdqa %xmm0, %xmm2
+; X32-NEXT: psllq %xmm1, %xmm2
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: psllq %xmm1, %xmm0
-; X32-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
-; X32-NEXT: movapd %xmm3, (%eax)
+; X32-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X32-NEXT: movapd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: shift1b:
-; X64: # BB#0: # %entry
-; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; X64-NEXT: movdqa %xmm0, %xmm3
-; X64-NEXT: psllq %xmm2, %xmm3
+; X64: # %bb.0: # %entry
+; X64-NEXT: movdqa %xmm0, %xmm2
+; X64-NEXT: psllq %xmm1, %xmm2
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: psllq %xmm1, %xmm0
-; X64-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
-; X64-NEXT: movapd %xmm3, (%rdi)
+; X64-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; X64-NEXT: movapd %xmm0, (%rdi)
; X64-NEXT: retq
entry:
%shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -56,7 +56,7 @@ entry:
define void @shift2a(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-LABEL: shift2a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: xorps %xmm2, %xmm2
@@ -66,7 +66,7 @@ define void @shift2a(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift2a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
@@ -82,7 +82,7 @@ entry:
define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-LABEL: shift2b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: xorps %xmm2, %xmm2
@@ -92,7 +92,7 @@ define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift2b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
@@ -108,7 +108,7 @@ entry:
define void @shift2c(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-LABEL: shift2c:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X32-NEXT: xorps %xmm2, %xmm2
@@ -118,7 +118,7 @@ define void @shift2c(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift2c:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; X64-NEXT: xorps %xmm2, %xmm2
; X64-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
@@ -134,7 +134,7 @@ entry:
define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
; X32-LABEL: shift3a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: pextrw $6, %xmm1, %ecx
; X32-NEXT: movd %ecx, %xmm1
@@ -143,7 +143,7 @@ define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift3a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pextrw $6, %xmm1, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: psllw %xmm1, %xmm0
@@ -158,7 +158,7 @@ entry:
define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-LABEL: shift3b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movd %ecx, %xmm1
@@ -167,7 +167,7 @@ define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift3b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movzwl %si, %eax
; X64-NEXT: movd %eax, %xmm1
; X64-NEXT: psllw %xmm1, %xmm0
diff --git a/test/CodeGen/X86/vshift-5.ll b/test/CodeGen/X86/vshift-5.ll
index 38b391b6439c..0fe0f8a5e22e 100644
--- a/test/CodeGen/X86/vshift-5.ll
+++ b/test/CodeGen/X86/vshift-5.ll
@@ -6,7 +6,7 @@
define void @shift5a(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
; X32-LABEL: shift5a:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -15,7 +15,7 @@ define void @shift5a(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift5a:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -32,7 +32,7 @@ entry:
define void @shift5b(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
; X32-LABEL: shift5b:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -41,7 +41,7 @@ define void @shift5b(<4 x i32> %val, <4 x i32>* %dst, i32* %pamt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift5b:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: psrad %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -58,7 +58,7 @@ entry:
define void @shift5c(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-LABEL: shift5c:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: pslld %xmm1, %xmm0
@@ -66,7 +66,7 @@ define void @shift5c(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift5c:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: pslld %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
@@ -82,7 +82,7 @@ entry:
define void @shift5d(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-LABEL: shift5d:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: psrad %xmm1, %xmm0
@@ -90,7 +90,7 @@ define void @shift5d(<4 x i32> %val, <4 x i32>* %dst, i32 %amt) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shift5d:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movd %esi, %xmm1
; X64-NEXT: psrad %xmm1, %xmm0
; X64-NEXT: movdqa %xmm0, (%rdi)
diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll
index 36d428cb9cf4..5cfa38ab833e 100644
--- a/test/CodeGen/X86/vshift-6.ll
+++ b/test/CodeGen/X86/vshift-6.ll
@@ -26,7 +26,7 @@
define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
; X32-LABEL: do_not_crash:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movb %al, (%ecx)
@@ -63,7 +63,7 @@ define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
; X32-NEXT: retl
;
; X64-LABEL: do_not_crash:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movb %r9b, (%rdi)
; X64-NEXT: movd %r9d, %xmm0
; X64-NEXT: psllq $56, %xmm0
diff --git a/test/CodeGen/X86/vshift_split.ll b/test/CodeGen/X86/vshift_split.ll
index 359d36d8af69..6872dc0ab989 100644
--- a/test/CodeGen/X86/vshift_split.ll
+++ b/test/CodeGen/X86/vshift_split.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse2
; Example that requires splitting and expanding a vector shift.
define <2 x i64> @update(<2 x i64> %val) nounwind readnone {
diff --git a/test/CodeGen/X86/vshift_split2.ll b/test/CodeGen/X86/vshift_split2.ll
index 0f8c2b896e2b..e07bfefcaeea 100644
--- a/test/CodeGen/X86/vshift_split2.ll
+++ b/test/CodeGen/X86/vshift_split2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah
+; RUN: llc < %s -mtriple=i686-- -mcpu=yonah
; Legalization example that requires splitting a large vector into smaller pieces.
diff --git a/test/CodeGen/X86/vsplit-and.ll b/test/CodeGen/X86/vsplit-and.ll
index f844904c8690..26bbcdbe5d91 100644
--- a/test/CodeGen/X86/vsplit-and.ll
+++ b/test/CodeGen/X86/vsplit-and.ll
@@ -3,7 +3,7 @@
define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind readonly {
; CHECK-LABEL: t0:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm2, %xmm2
; CHECK-NEXT: pcmpeqq %xmm2, %xmm0
; CHECK-NEXT: pcmpeqq %xmm2, %xmm1
@@ -22,32 +22,28 @@ define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind read
define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
; CHECK-LABEL: t2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movq %r9, %xmm1
; CHECK-NEXT: movq %r8, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-NEXT: movq %rdx, %xmm2
-; CHECK-NEXT: movq %rsi, %xmm1
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; CHECK-NEXT: movq %rcx, %xmm2
+; CHECK-NEXT: movq %rdx, %xmm1
+; CHECK-NEXT: movq %rsi, %xmm2
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; CHECK-NEXT: movq %rcx, %xmm1
; CHECK-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; CHECK-NEXT: pxor %xmm4, %xmm4
-; CHECK-NEXT: pcmpeqq %xmm4, %xmm2
-; CHECK-NEXT: pcmpeqd %xmm5, %xmm5
-; CHECK-NEXT: pxor %xmm5, %xmm2
; CHECK-NEXT: pcmpeqq %xmm4, %xmm1
+; CHECK-NEXT: pcmpeqd %xmm5, %xmm5
; CHECK-NEXT: pxor %xmm5, %xmm1
-; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; CHECK-NEXT: pslld $31, %xmm1
-; CHECK-NEXT: psrad $31, %xmm1
+; CHECK-NEXT: pcmpeqq %xmm4, %xmm2
+; CHECK-NEXT: pxor %xmm5, %xmm2
+; CHECK-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; CHECK-NEXT: pcmpeqq %xmm4, %xmm3
; CHECK-NEXT: pxor %xmm5, %xmm3
; CHECK-NEXT: pcmpeqq %xmm4, %xmm0
; CHECK-NEXT: pxor %xmm5, %xmm0
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; CHECK-NEXT: pslld $31, %xmm0
-; CHECK-NEXT: psrad $31, %xmm0
-; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm2, %xmm0
; CHECK-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT: psllq $63, %xmm1
; CHECK-NEXT: psrad $31, %xmm1
diff --git a/test/CodeGen/X86/vzero-excess.ll b/test/CodeGen/X86/vzero-excess.ll
index 9ddafec65182..62525ec580f7 100644
--- a/test/CodeGen/X86/vzero-excess.ll
+++ b/test/CodeGen/X86/vzero-excess.ll
@@ -6,7 +6,7 @@
define <4 x float> @zeroupper_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
; CHECK-LABEL: zeroupper_v4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $48, %rsp
; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
@@ -33,7 +33,7 @@ define <4 x float> @zeroupper_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind {
; CHECK-LABEL: zeroupper_v8f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; CHECK-NEXT: vzeroupper
@@ -48,7 +48,7 @@ define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind {
define <4 x float> @zeroall_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
; CHECK-LABEL: zeroall_v4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $48, %rsp
; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
@@ -75,7 +75,7 @@ define <4 x float> @zeroall_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
define <8 x float> @zeroall_v8f32(<8 x float> %x) nounwind {
; CHECK-LABEL: zeroall_v8f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: subq $56, %rsp
; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; CHECK-NEXT: vzeroall
diff --git a/test/CodeGen/X86/weak-undef.ll b/test/CodeGen/X86/weak-undef.ll
new file mode 100644
index 000000000000..863fea43b162
--- /dev/null
+++ b/test/CodeGen/X86/weak-undef.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -relocation-model=pic -mtriple=x86_64-pc-linux | FileCheck %s
+; RUN: llc < %s -relocation-model=pic -mtriple=i386-pc-linux | FileCheck --check-prefix=I386 %s
+
+@foo1 = extern_weak hidden global i32, align 4
+define i32* @bar1() {
+ ret i32* @foo1
+}
+; CHECK: bar1:
+; CHECK: movq foo1@GOTPCREL(%rip), %rax
+; I386: bar1:
+; I386: movl foo1@GOT(%eax), %eax
+
+@foo2 = external hidden global i32, align 4
+define i32* @bar2() {
+ ret i32* @foo2
+}
+; CHECK: bar2:
+; CHECK: leaq foo2(%rip), %rax
+; I386: bar2:
+; I386: leal foo2@GOTOFF(%eax), %eax
+
+declare extern_weak hidden void @foo3()
+define void @bar3() {
+ call void @foo3()
+ ret void
+}
+; CHECK: bar3:
+; CHECK: callq foo3
+; I386: bar3:
+; I386: calll foo3
+
+declare external hidden void @foo4()
+define void @bar4() {
+ call void @foo4()
+ ret void
+}
+; CHECK: bar4:
+; CHECK: callq foo4
+; I386: bar4:
+; I386: calll foo4
+
+declare extern_weak hidden i32 @foo5()
+define i32()* @bar5() {
+ ret i32()* @foo5
+}
+; CHECK: bar5:
+; CHECK: movq foo5@GOTPCREL(%rip), %rax
+; I386: bar5:
+; I386: movl foo5@GOT(%eax), %eax
+
+declare external hidden i32 @foo6()
+define i32()* @bar6() {
+ ret i32()* @foo6
+}
+; CHECK: bar6:
+; CHECK: leaq foo6(%rip), %rax
+; I386: bar6:
+; I386: leal foo6@GOTOFF(%eax), %eax
diff --git a/test/CodeGen/X86/weak.ll b/test/CodeGen/X86/weak.ll
index 8590e8d0001e..087e1d14b50e 100644
--- a/test/CodeGen/X86/weak.ll
+++ b/test/CodeGen/X86/weak.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -mtriple=i686--
@a = extern_weak global i32 ; <i32*> [#uses=1]
@b = global i32* @a ; <i32**> [#uses=0]
diff --git a/test/CodeGen/X86/wide-fma-contraction.ll b/test/CodeGen/X86/wide-fma-contraction.ll
index 99e03c891c00..3ee09dd8f80e 100644
--- a/test/CodeGen/X86/wide-fma-contraction.ll
+++ b/test/CodeGen/X86/wide-fma-contraction.ll
@@ -1,19 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=x86 -mcpu=bdver2 -mattr=-fma -mtriple=x86_64-apple-darwin < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 -mtriple=x86_64-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
+; RUN: llc -mcpu=bdver2 -mattr=-fma -mtriple=i686-apple-darwin < %s | FileCheck %s
+; RUN: llc -mcpu=bdver2 -mattr=-fma,-fma4 -mtriple=i686-apple-darwin < %s | FileCheck %s --check-prefix=CHECK-NOFMA
; CHECK-LABEL: fmafunc
; CHECK-NOFMA-LABEL: fmafunc
define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
; CHECK-LABEL: fmafunc:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: Lcfi1:
; CHECK-NEXT: .cfi_offset %ebp, -8
; CHECK-NEXT: movl %esp, %ebp
-; CHECK-NEXT: Lcfi2:
; CHECK-NEXT: .cfi_def_cfa_register %ebp
; CHECK-NEXT: andl $-32, %esp
; CHECK-NEXT: subl $32, %esp
@@ -24,14 +21,11 @@ define <16 x float> @fmafunc(<16 x float> %a, <16 x float> %b, <16 x float> %c)
; CHECK-NEXT: retl
;
; CHECK-NOFMA-LABEL: fmafunc:
-; CHECK-NOFMA: ## BB#0:
+; CHECK-NOFMA: ## %bb.0:
; CHECK-NOFMA-NEXT: pushl %ebp
-; CHECK-NOFMA-NEXT: Lcfi0:
; CHECK-NOFMA-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NOFMA-NEXT: Lcfi1:
; CHECK-NOFMA-NEXT: .cfi_offset %ebp, -8
; CHECK-NOFMA-NEXT: movl %esp, %ebp
-; CHECK-NOFMA-NEXT: Lcfi2:
; CHECK-NOFMA-NEXT: .cfi_def_cfa_register %ebp
; CHECK-NOFMA-NEXT: andl $-32, %esp
; CHECK-NOFMA-NEXT: subl $32, %esp
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index 182d7cc73c9a..e7956c65345e 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -3,14 +3,14 @@
define i32 @branch_eq(i64 %a, i64 %b) {
; CHECK-LABEL: branch_eq:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: orl %ecx, %eax
; CHECK-NEXT: jne .LBB0_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB0_2: # %bb2
@@ -27,13 +27,13 @@ bb2:
define i32 @branch_slt(i64 %a, i64 %b) {
; CHECK-LABEL: branch_slt:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: jge .LBB1_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB1_2: # %bb2
@@ -50,13 +50,13 @@ bb2:
define i32 @branch_ule(i64 %a, i64 %b) {
; CHECK-LABEL: branch_ule:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: jb .LBB2_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB2_2: # %bb2
@@ -73,7 +73,7 @@ bb2:
define i32 @set_gt(i64 %a, i64 %b) {
; CHECK-LABEL: set_gt:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
@@ -89,11 +89,9 @@ entry:
define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-LABEL: test_wide:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 8
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_offset %esi, -8
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -104,7 +102,7 @@ define i32 @test_wide(i128 %a, i128 %b) {
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: jge .LBB4_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: popl %esi
; CHECK-NEXT: retl
@@ -125,11 +123,11 @@ bb2:
; sure the code can handle that.
define i32 @test_carry_false(i64 %a, i64 %b) {
; CHECK-LABEL: test_carry_false:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: jge .LBB5_2
-; CHECK-NEXT: # BB#1: # %bb1
+; CHECK-NEXT: # %bb.1: # %bb1
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retl
; CHECK-NEXT: .LBB5_2: # %bb2
diff --git a/test/CodeGen/X86/wide-integer-fold.ll b/test/CodeGen/X86/wide-integer-fold.ll
index b3b4d24ab3ac..d4da64e7b068 100644
--- a/test/CodeGen/X86/wide-integer-fold.ll
+++ b/test/CodeGen/X86/wide-integer-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
; CHECK: movq $-65535, %rax
; DAGCombiner should fold this to a simple constant.
diff --git a/test/CodeGen/X86/widen_arith-1.ll b/test/CodeGen/X86/widen_arith-1.ll
index a1e9b53638c6..d6607e8b98fc 100644
--- a/test/CodeGen/X86/widen_arith-1.ll
+++ b/test/CodeGen/X86/widen_arith-1.ll
@@ -3,7 +3,7 @@
define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
@@ -26,7 +26,7 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl (%esp), %eax
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#3: # %afterfor
+; CHECK-NEXT: # %bb.3: # %afterfor
; CHECK-NEXT: addl $12, %esp
; CHECK-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll
index 5731b63f3bc1..aa2573f9b2c0 100644
--- a/test/CodeGen/X86/widen_arith-2.ll
+++ b/test/CodeGen/X86/widen_arith-2.ll
@@ -5,12 +5,11 @@
define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; CHECK-LABEL: update:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: subl $12, %esp
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %forbody
@@ -22,18 +21,18 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp)
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; CHECK-NEXT: pmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; CHECK-NEXT: psubw %xmm0, %xmm3
-; CHECK-NEXT: pand %xmm1, %xmm3
-; CHECK-NEXT: pshufb %xmm2, %xmm3
-; CHECK-NEXT: movq %xmm3, (%edx,%eax,8)
+; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT: psubw %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm1, %xmm2
+; CHECK-NEXT: packsswb %xmm0, %xmm2
+; CHECK-NEXT: movq %xmm2, (%edx,%eax,8)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: .LBB0_1: # %forcond
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movl (%esp), %eax
; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#3: # %afterfor
+; CHECK-NEXT: # %bb.3: # %afterfor
; CHECK-NEXT: addl $12, %esp
; CHECK-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/widen_arith-3.ll b/test/CodeGen/X86/widen_arith-3.ll
index e363a82a2b96..aa656de2342d 100644
--- a/test/CodeGen/X86/widen_arith-3.ll
+++ b/test/CodeGen/X86/widen_arith-3.ll
@@ -8,7 +8,7 @@
define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
@@ -16,9 +16,9 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %forbody
@@ -39,7 +39,7 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cmpl 16(%ebp), %eax
; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#3: # %afterfor
+; CHECK-NEXT: # %bb.3: # %afterfor
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/widen_arith-4.ll b/test/CodeGen/X86/widen_arith-4.ll
index 987c32009e3e..e3e2b1d1fb5c 100644
--- a/test/CodeGen/X86/widen_arith-4.ll
+++ b/test/CodeGen/X86/widen_arith-4.ll
@@ -5,7 +5,7 @@
define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
@@ -34,7 +34,7 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#3: # %afterfor
+; CHECK-NEXT: # %bb.3: # %afterfor
; CHECK-NEXT: retq
entry:
%dst.addr = alloca <5 x i16>*
diff --git a/test/CodeGen/X86/widen_arith-5.ll b/test/CodeGen/X86/widen_arith-5.ll
index 005c2a41be4a..b76895503ba2 100644
--- a/test/CodeGen/X86/widen_arith-5.ll
+++ b/test/CodeGen/X86/widen_arith-5.ll
@@ -5,7 +5,7 @@
define void @update(<3 x i32>* %dst, <3 x i32>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
@@ -33,7 +33,7 @@ define void @update(<3 x i32>* %dst, <3 x i32>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: cmpl -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#3: # %afterfor
+; CHECK-NEXT: # %bb.3: # %afterfor
; CHECK-NEXT: retq
entry:
%dst.addr = alloca <3 x i32>*
diff --git a/test/CodeGen/X86/widen_arith-6.ll b/test/CodeGen/X86/widen_arith-6.ll
index 0421915154e4..73b8f4ea276b 100644
--- a/test/CodeGen/X86/widen_arith-6.ll
+++ b/test/CodeGen/X86/widen_arith-6.ll
@@ -5,7 +5,7 @@
define void @update(<3 x float>* %dst, <3 x float>* %src, i32 %n) nounwind {
; CHECK-LABEL: update:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %ebp
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-16, %esp
@@ -35,7 +35,7 @@ define void @update(<3 x float>* %dst, <3 x float>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: cmpl 16(%ebp), %eax
; CHECK-NEXT: jl .LBB0_2
-; CHECK-NEXT: # BB#3: # %afterfor
+; CHECK-NEXT: # %bb.3: # %afterfor
; CHECK-NEXT: movl %ebp, %esp
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/widen_bitops-0.ll b/test/CodeGen/X86/widen_bitops-0.ll
index 132a2fd928f2..f939396452ea 100644
--- a/test/CodeGen/X86/widen_bitops-0.ll
+++ b/test/CodeGen/X86/widen_bitops-0.ll
@@ -8,13 +8,13 @@
define i24 @and_i24_as_v3i8(i24 %a, i24 %b) nounwind {
; X32-SSE-LABEL: and_i24_as_v3i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_i24_as_v3i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -27,13 +27,13 @@ define i24 @and_i24_as_v3i8(i24 %a, i24 %b) nounwind {
define i24 @xor_i24_as_v3i8(i24 %a, i24 %b) nounwind {
; X32-SSE-LABEL: xor_i24_as_v3i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_i24_as_v3i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -46,13 +46,13 @@ define i24 @xor_i24_as_v3i8(i24 %a, i24 %b) nounwind {
define i24 @or_i24_as_v3i8(i24 %a, i24 %b) nounwind {
; X32-SSE-LABEL: or_i24_as_v3i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_i24_as_v3i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -69,13 +69,13 @@ define i24 @or_i24_as_v3i8(i24 %a, i24 %b) nounwind {
define i24 @and_i24_as_v8i3(i24 %a, i24 %b) nounwind {
; X32-SSE-LABEL: and_i24_as_v8i3:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_i24_as_v8i3:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -88,13 +88,13 @@ define i24 @and_i24_as_v8i3(i24 %a, i24 %b) nounwind {
define i24 @xor_i24_as_v8i3(i24 %a, i24 %b) nounwind {
; X32-SSE-LABEL: xor_i24_as_v8i3:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_i24_as_v8i3:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -107,13 +107,13 @@ define i24 @xor_i24_as_v8i3(i24 %a, i24 %b) nounwind {
define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
; X32-SSE-LABEL: or_i24_as_v8i3:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_i24_as_v8i3:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -130,7 +130,7 @@ define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-LABEL: and_v3i8_as_i24:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
@@ -141,13 +141,13 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
-; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: # kill: def %al killed %al killed %eax
+; X32-SSE-NEXT: # kill: def %dl killed %dl killed %edx
+; X32-SSE-NEXT: # kill: def %cl killed %cl killed %ecx
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_v3i8_as_i24:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
@@ -158,9 +158,9 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
-; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: # kill: def %al killed %al killed %eax
+; X64-SSE-NEXT: # kill: def %dl killed %dl killed %edx
+; X64-SSE-NEXT: # kill: def %cl killed %cl killed %ecx
; X64-SSE-NEXT: retq
%1 = bitcast <3 x i8> %a to i24
%2 = bitcast <3 x i8> %b to i24
@@ -171,7 +171,7 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-LABEL: xor_v3i8_as_i24:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
@@ -182,13 +182,13 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
-; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: # kill: def %al killed %al killed %eax
+; X32-SSE-NEXT: # kill: def %dl killed %dl killed %edx
+; X32-SSE-NEXT: # kill: def %cl killed %cl killed %ecx
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_v3i8_as_i24:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
@@ -199,9 +199,9 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
-; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: # kill: def %al killed %al killed %eax
+; X64-SSE-NEXT: # kill: def %dl killed %dl killed %edx
+; X64-SSE-NEXT: # kill: def %cl killed %cl killed %ecx
; X64-SSE-NEXT: retq
%1 = bitcast <3 x i8> %a to i24
%2 = bitcast <3 x i8> %b to i24
@@ -212,7 +212,7 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-LABEL: or_v3i8_as_i24:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0
; X32-SSE-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0
@@ -223,13 +223,13 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X32-SSE-NEXT: pextrb $0, %xmm1, %eax
; X32-SSE-NEXT: pextrb $4, %xmm1, %edx
; X32-SSE-NEXT: pextrb $8, %xmm1, %ecx
-; X32-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; X32-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; X32-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X32-SSE-NEXT: # kill: def %al killed %al killed %eax
+; X32-SSE-NEXT: # kill: def %dl killed %dl killed %edx
+; X32-SSE-NEXT: # kill: def %cl killed %cl killed %ecx
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_v3i8_as_i24:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movd %ecx, %xmm0
; X64-SSE-NEXT: pinsrd $1, %r8d, %xmm0
; X64-SSE-NEXT: pinsrd $2, %r9d, %xmm0
@@ -240,9 +240,9 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
; X64-SSE-NEXT: pextrb $0, %xmm1, %eax
; X64-SSE-NEXT: pextrb $4, %xmm1, %edx
; X64-SSE-NEXT: pextrb $8, %xmm1, %ecx
-; X64-SSE-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill>
-; X64-SSE-NEXT: # kill: %DL<def> %DL<kill> %EDX<kill>
-; X64-SSE-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-SSE-NEXT: # kill: def %al killed %al killed %eax
+; X64-SSE-NEXT: # kill: def %dl killed %dl killed %edx
+; X64-SSE-NEXT: # kill: def %cl killed %cl killed %ecx
; X64-SSE-NEXT: retq
%1 = bitcast <3 x i8> %a to i24
%2 = bitcast <3 x i8> %b to i24
@@ -257,12 +257,12 @@ define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
define <8 x i3> @and_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
; X32-SSE-LABEL: and_v8i3_as_i24:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: andps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_v8i3_as_i24:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <8 x i3> %a to i24
@@ -274,12 +274,12 @@ define <8 x i3> @and_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
define <8 x i3> @xor_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
; X32-SSE-LABEL: xor_v8i3_as_i24:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_v8i3_as_i24:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <8 x i3> %a to i24
@@ -291,12 +291,12 @@ define <8 x i3> @xor_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
define <8 x i3> @or_v8i3_as_i24(<8 x i3> %a, <8 x i3> %b) nounwind {
; X32-SSE-LABEL: or_v8i3_as_i24:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: orps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_v8i3_as_i24:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <8 x i3> %a to i24
diff --git a/test/CodeGen/X86/widen_bitops-1.ll b/test/CodeGen/X86/widen_bitops-1.ll
index f2a6b22c2af4..fa41b1643ffb 100644
--- a/test/CodeGen/X86/widen_bitops-1.ll
+++ b/test/CodeGen/X86/widen_bitops-1.ll
@@ -8,13 +8,13 @@
define i32 @and_i32_as_v4i8(i32 %a, i32 %b) nounwind {
; X32-SSE-LABEL: and_i32_as_v4i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_i32_as_v4i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -27,13 +27,13 @@ define i32 @and_i32_as_v4i8(i32 %a, i32 %b) nounwind {
define i32 @xor_i32_as_v4i8(i32 %a, i32 %b) nounwind {
; X32-SSE-LABEL: xor_i32_as_v4i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_i32_as_v4i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -46,13 +46,13 @@ define i32 @xor_i32_as_v4i8(i32 %a, i32 %b) nounwind {
define i32 @or_i32_as_v4i8(i32 %a, i32 %b) nounwind {
; X32-SSE-LABEL: or_i32_as_v4i8:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_i32_as_v4i8:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -69,13 +69,13 @@ define i32 @or_i32_as_v4i8(i32 %a, i32 %b) nounwind {
define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind {
; X32-SSE-LABEL: and_i32_as_v8i4:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: andl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_i32_as_v8i4:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -88,13 +88,13 @@ define i32 @and_i32_as_v8i4(i32 %a, i32 %b) nounwind {
define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind {
; X32-SSE-LABEL: xor_i32_as_v8i4:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: xorl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_i32_as_v8i4:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -107,13 +107,13 @@ define i32 @xor_i32_as_v8i4(i32 %a, i32 %b) nounwind {
define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind {
; X32-SSE-LABEL: or_i32_as_v8i4:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: orl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_i32_as_v8i4:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orl %esi, %edi
; X64-SSE-NEXT: movl %edi, %eax
; X64-SSE-NEXT: retq
@@ -130,12 +130,12 @@ define i32 @or_i32_as_v8i4(i32 %a, i32 %b) nounwind {
define <4 x i8> @and_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
; X32-SSE-LABEL: and_v4i8_as_i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: andps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_v4i8_as_i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <4 x i8> %a to i32
@@ -147,12 +147,12 @@ define <4 x i8> @and_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
define <4 x i8> @xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
; X32-SSE-LABEL: xor_v4i8_as_i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_v4i8_as_i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <4 x i8> %a to i32
@@ -164,12 +164,12 @@ define <4 x i8> @xor_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
define <4 x i8> @or_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
; X32-SSE-LABEL: or_v4i8_as_i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: orps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_v4i8_as_i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <4 x i8> %a to i32
@@ -185,12 +185,12 @@ define <4 x i8> @or_v4i8_as_i32(<4 x i8> %a, <4 x i8> %b) nounwind {
define <8 x i4> @and_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
; X32-SSE-LABEL: and_v8i4_as_i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: andps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: and_v8i4_as_i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: andps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <8 x i4> %a to i32
@@ -202,12 +202,12 @@ define <8 x i4> @and_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
define <8 x i4> @xor_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
; X32-SSE-LABEL: xor_v8i4_as_i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: xorps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: xor_v8i4_as_i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: xorps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <8 x i4> %a to i32
@@ -219,12 +219,12 @@ define <8 x i4> @xor_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
define <8 x i4> @or_v8i4_as_i32(<8 x i4> %a, <8 x i4> %b) nounwind {
; X32-SSE-LABEL: or_v8i4_as_i32:
-; X32-SSE: # BB#0:
+; X32-SSE: # %bb.0:
; X32-SSE-NEXT: orps %xmm1, %xmm0
; X32-SSE-NEXT: retl
;
; X64-SSE-LABEL: or_v8i4_as_i32:
-; X64-SSE: # BB#0:
+; X64-SSE: # %bb.0:
; X64-SSE-NEXT: orps %xmm1, %xmm0
; X64-SSE-NEXT: retq
%1 = bitcast <8 x i4> %a to i32
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 65c8db155e32..41da54cf110d 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -8,7 +8,7 @@
define void @convert(<2 x i32>* %dst, <4 x i16>* %src) nounwind {
; CHECK-LABEL: convert:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
@@ -33,7 +33,7 @@ define void @convert(<2 x i32>* %dst, <4 x i16>* %src) nounwind {
; CHECK-NEXT: retl
;
; ATOM-LABEL: convert:
-; ATOM: # BB#0: # %entry
+; ATOM: # %bb.0: # %entry
; ATOM-NEXT: pushl %eax
; ATOM-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1]
; ATOM-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index 8caa962e4ec8..03d4700c064a 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -4,7 +4,7 @@
define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind {
; CHECK-LABEL: convert:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: movl $0, (%esp)
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
diff --git a/test/CodeGen/X86/widen_cast-3.ll b/test/CodeGen/X86/widen_cast-3.ll
index a50e199cd10d..18a04c48a590 100644
--- a/test/CodeGen/X86/widen_cast-3.ll
+++ b/test/CodeGen/X86/widen_cast-3.ll
@@ -6,7 +6,7 @@
define void @convert(<12 x i8>* %dst.addr, <3 x i32> %src) nounwind {
; X86-LABEL: convert:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: pcmpeqd %xmm1, %xmm1
; X86-NEXT: psubd %xmm1, %xmm0
@@ -16,7 +16,7 @@ define void @convert(<12 x i8>* %dst.addr, <3 x i32> %src) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: convert:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: psubd %xmm1, %xmm0
; X64-NEXT: pextrd $2, %xmm0, 8(%rdi)
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index cc6fb27a6293..c3fa2f5454e2 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -6,7 +6,7 @@
define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; NARROW-LABEL: update:
-; NARROW: # BB#0: # %entry
+; NARROW: # %bb.0: # %entry
; NARROW-NEXT: subl $12, %esp
; NARROW-NEXT: movl $0, (%esp)
; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
@@ -26,7 +26,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; NARROW-NEXT: psubw %xmm0, %xmm2
; NARROW-NEXT: psllw $8, %xmm2
; NARROW-NEXT: psraw $8, %xmm2
-; NARROW-NEXT: psraw $2, %xmm2
+; NARROW-NEXT: psrlw $2, %xmm2
; NARROW-NEXT: pshufb %xmm1, %xmm2
; NARROW-NEXT: movq %xmm2, (%edx,%eax,8)
; NARROW-NEXT: incl (%esp)
@@ -35,12 +35,12 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; NARROW-NEXT: movl (%esp), %eax
; NARROW-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; NARROW-NEXT: jl .LBB0_2
-; NARROW-NEXT: # BB#3: # %afterfor
+; NARROW-NEXT: # %bb.3: # %afterfor
; NARROW-NEXT: addl $12, %esp
; NARROW-NEXT: retl
;
; WIDE-LABEL: update:
-; WIDE: # BB#0: # %entry
+; WIDE: # %bb.0: # %entry
; WIDE-NEXT: subl $12, %esp
; WIDE-NEXT: movl $0, (%esp)
; WIDE-NEXT: pcmpeqd %xmm0, %xmm0
@@ -72,7 +72,7 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
; WIDE-NEXT: movl (%esp), %eax
; WIDE-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; WIDE-NEXT: jl .LBB0_2
-; WIDE-NEXT: # BB#3: # %afterfor
+; WIDE-NEXT: # %bb.3: # %afterfor
; WIDE-NEXT: addl $12, %esp
; WIDE-NEXT: retl
entry:
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index 986fa4743c6c..b0363d023026 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -6,7 +6,7 @@
define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
; X86-LABEL: convert:
-; X86: ## BB#0: ## %entry
+; X86: ## %bb.0: ## %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; X86-NEXT: pxor LCPI0_0, %xmm0
@@ -15,7 +15,7 @@ define void @convert(<2 x i32>* %dst.addr, i64 %src) nounwind {
; X86-NEXT: retl
;
; X64-LABEL: convert:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: movq %rsi, %xmm0
; X64-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-NEXT: pxor {{.*}}(%rip), %xmm0
diff --git a/test/CodeGen/X86/widen_cast-6.ll b/test/CodeGen/X86/widen_cast-6.ll
index 347d5245f760..c75c3597eae1 100644
--- a/test/CodeGen/X86/widen_cast-6.ll
+++ b/test/CodeGen/X86/widen_cast-6.ll
@@ -6,14 +6,14 @@
define i32 @return_v2hi() nounwind {
; X86-LABEL: return_v2hi:
-; X86: ## BB#0: ## %entry
+; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %eax
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: popl %ecx
; X86-NEXT: retl
;
; X64-LABEL: return_v2hi:
-; X64: ## BB#0: ## %entry
+; X64: ## %bb.0: ## %entry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/widen_compare-1.ll b/test/CodeGen/X86/widen_compare-1.ll
index 8ea0db53a391..9c0fb0e7461e 100644
--- a/test/CodeGen/X86/widen_compare-1.ll
+++ b/test/CodeGen/X86/widen_compare-1.ll
@@ -6,13 +6,13 @@
define <2 x i16> @compare_v2i64_to_v2i16(<2 x i16>* %src) nounwind {
; X86-LABEL: compare_v2i64_to_v2i16:
-; X86: # BB#0:
-; X86-NEXT: movaps {{.*#+}} xmm0 = [65535,0,65535,0]
+; X86: # %bb.0:
+; X86-NEXT: pcmpeqd %xmm0, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: compare_v2i64_to_v2i16:
-; X64: # BB#0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = [65535,65535]
+; X64: # %bb.0:
+; X64-NEXT: pcmpeqd %xmm0, %xmm0
; X64-NEXT: retq
%val = load <2 x i16>, <2 x i16>* %src, align 4
%cmp = icmp uge <2 x i16> %val, %val
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index c548fc2c77e4..7e0f999bc10d 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -6,7 +6,7 @@
define void @convert_v2i64_to_v2i32(<2 x i32>* %dst.addr, <2 x i64> %src) nounwind {
; X86-LABEL: convert_v2i64_to_v2i32:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: paddd {{\.LCPI.*}}, %xmm0
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -14,7 +14,7 @@ define void @convert_v2i64_to_v2i32(<2 x i32>* %dst.addr, <2 x i64> %src) nounwi
; X86-NEXT: retl
;
; X64-LABEL: convert_v2i64_to_v2i32:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: paddd {{.*}}(%rip), %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: movq %xmm0, (%rdi)
@@ -30,7 +30,7 @@ entry:
define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) nounwind {
; X86-LABEL: convert_v3i32_to_v3i8:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -44,7 +44,7 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
; X86-NEXT: retl
;
; X64-LABEL: convert_v3i32_to_v3i8:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: psubd %xmm1, %xmm0
@@ -64,7 +64,7 @@ entry:
define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) nounwind {
; X86-LABEL: convert_v5i16_to_v5i8:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -82,7 +82,7 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
; X86-NEXT: retl
;
; X64-LABEL: convert_v5i16_to_v5i8:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: pcmpeqd %xmm1, %xmm1
; X64-NEXT: psubw %xmm1, %xmm0
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index 015b0faa9827..3a39cbfba2e1 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -6,7 +6,7 @@
define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind {
; X86-LABEL: convert_v2i16_v2i32:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: psllq $48, %xmm0
; X86-NEXT: psrad $16, %xmm0
@@ -15,7 +15,7 @@ define void @convert_v2i16_v2i32(<2 x i32>* %dst.addr, <2 x i16> %src) nounwind
; X86-NEXT: retl
;
; X64-LABEL: convert_v2i16_v2i32:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllq $48, %xmm0
; X64-NEXT: psrad $16, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index 3b20f3515716..186e43e213b6 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -8,7 +8,7 @@
define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE2: # BB#0: # %entry
+; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: psllq $48, %xmm0
; X86-SSE2-NEXT: psrad $16, %xmm0
@@ -20,7 +20,7 @@ define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) noun
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE42: # BB#0: # %entry
+; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: psllq $48, %xmm0
; X86-SSE42-NEXT: psrad $16, %xmm0
@@ -31,7 +31,7 @@ define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) noun
; X86-SSE42-NEXT: retl
;
; X64-LABEL: convert_v2i16_to_v2f32:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: psllq $48, %xmm0
; X64-NEXT: psrad $16, %xmm0
; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
@@ -48,7 +48,7 @@ entry:
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
-; X86-SSE2: # BB#0: # %entry
+; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: pushl %esi
@@ -84,7 +84,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
-; X86-SSE42: # BB#0: # %entry
+; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: pushl %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -103,7 +103,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
-; X64-SSE2: # BB#0: # %entry
+; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -126,7 +126,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
-; X64-SSE42: # BB#0: # %entry
+; X64-SSE42: # %bb.0: # %entry
; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
; X64-SSE42-NEXT: movzwl (%rsi), %ecx
; X64-SSE42-NEXT: movq %rcx, %xmm0
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index 6dc938893d38..4fa3bd522111 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -8,7 +8,7 @@
define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwind {
; X86-SSE2-LABEL: convert_v7i16_v7f32:
-; X86-SSE2: # BB#0: # %entry
+; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: pxor %xmm1, %xmm1
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
@@ -26,7 +26,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v7i16_v7f32:
-; X86-SSE42: # BB#0: # %entry
+; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: pxor %xmm1, %xmm1
; X86-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
@@ -40,7 +40,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v7i16_v7f32:
-; X64-SSE2: # BB#0: # %entry
+; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: pxor %xmm1, %xmm1
; X64-SSE2-NEXT: movdqa %xmm0, %xmm2
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -54,7 +54,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: convert_v7i16_v7f32:
-; X64-SSE42: # BB#0: # %entry
+; X64-SSE42: # %bb.0: # %entry
; X64-SSE42-NEXT: pxor %xmm1, %xmm1
; X64-SSE42-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -74,7 +74,7 @@ entry:
define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr) nounwind {
; X86-SSE2-LABEL: convert_v3i8_to_v3f32:
-; X86-SSE2: # BB#0: # %entry
+; X86-SSE2: # %bb.0: # %entry
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: movl %esp, %ebp
; X86-SSE2-NEXT: pushl %esi
@@ -110,7 +110,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: convert_v3i8_to_v3f32:
-; X86-SSE42: # BB#0: # %entry
+; X86-SSE42: # %bb.0: # %entry
; X86-SSE42-NEXT: pushl %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -128,7 +128,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE42-NEXT: retl
;
; X64-SSE2-LABEL: convert_v3i8_to_v3f32:
-; X64-SSE2: # BB#0: # %entry
+; X64-SSE2: # %bb.0: # %entry
; X64-SSE2-NEXT: movzwl (%rsi), %eax
; X64-SSE2-NEXT: movq %rax, %xmm0
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -151,7 +151,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: convert_v3i8_to_v3f32:
-; X64-SSE42: # BB#0: # %entry
+; X64-SSE42: # %bb.0: # %entry
; X64-SSE42-NEXT: movzbl 2(%rsi), %eax
; X64-SSE42-NEXT: movzwl (%rsi), %ecx
; X64-SSE42-NEXT: movq %rcx, %xmm0
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index 9945e26c5504..acd8c78fa2d5 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -4,7 +4,7 @@
define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
; X86-LABEL: zext_v4i8_to_v4i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: pxor %xmm1, %xmm1
@@ -13,7 +13,7 @@ define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
; X86-NEXT: retl
;
; X64-LABEL: zext_v4i8_to_v4i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
diff --git a/test/CodeGen/X86/widen_extract-1.ll b/test/CodeGen/X86/widen_extract-1.ll
index 3737ea967738..024187f1f842 100644
--- a/test/CodeGen/X86/widen_extract-1.ll
+++ b/test/CodeGen/X86/widen_extract-1.ll
@@ -6,16 +6,16 @@
define void @convert(<2 x double>* %dst.addr, <3 x double> %src) {
; X32-LABEL: convert:
-; X32: # BB#0: # %entry
-; X32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
+; X32: # %bb.0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: movaps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: convert:
-; X64: # BB#0: # %entry
-; X64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X64-NEXT: movapd %xmm0, (%rdi)
+; X64: # %bb.0: # %entry
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: movaps %xmm0, (%rdi)
; X64-NEXT: retq
entry:
%val = shufflevector <3 x double> %src, <3 x double> undef, <2 x i32> <i32 0, i32 1>
diff --git a/test/CodeGen/X86/widen_load-0.ll b/test/CodeGen/X86/widen_load-0.ll
index f998cf770486..01e813a78ad7 100644
--- a/test/CodeGen/X86/widen_load-0.ll
+++ b/test/CodeGen/X86/widen_load-0.ll
@@ -8,7 +8,7 @@
define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwind {
; X86-LABEL: short2_int_swap:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -20,7 +20,7 @@ define void @short2_int_swap(<2 x i16>* nocapture %b, i32* nocapture %c) nounwin
; X86-NEXT: retl
;
; X64-LABEL: short2_int_swap:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movl (%rsi), %eax
; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: movl %eax, (%rdi)
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 810e409c175c..2e4acb57ee49 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
-; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
+; RUN: llc -stack-symbol-ordering=0 %s -o - -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc -stack-symbol-ordering=0 %s -o - -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 9fc0805b899c..7d773c779ede 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -8,7 +8,7 @@
%i32vec3 = type <3 x i32>
define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-LABEL: add3i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -20,7 +20,7 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: paddd (%rdx), %xmm0
; X64-NEXT: pextrd $2, %xmm0, 8(%rdi)
@@ -36,7 +36,7 @@ define void @add3i32(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-LABEL: add3i32_2:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -53,7 +53,7 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
; X86-NEXT: retl $4
;
; X64-LABEL: add3i32_2:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: pinsrd $2, 8(%rsi), %xmm0
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
@@ -73,7 +73,7 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) {
%i32vec7 = type <7 x i32>
define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; X86-LABEL: add7i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -88,7 +88,7 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
; X86-NEXT: retl $4
;
; X64-LABEL: add7i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: movdqa 16(%rsi), %xmm1
; X64-NEXT: paddd (%rdx), %xmm0
@@ -108,7 +108,7 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) {
%i32vec12 = type <12 x i32>
define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; X86-LABEL: add12i32:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -124,7 +124,7 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; X86-NEXT: retl $4
;
; X64-LABEL: add12i32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: movdqa 16(%rsi), %xmm1
; X64-NEXT: movdqa 32(%rsi), %xmm2
@@ -147,7 +147,7 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
; X86-LABEL: add3i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
@@ -170,7 +170,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
; X86-NEXT: retl $4
;
; X64-LABEL: add3i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-NEXT: paddd %xmm0, %xmm1
@@ -189,7 +189,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
%i16vec4 = type <4 x i16>
define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp) nounwind {
; X86-LABEL: add4i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -200,7 +200,7 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
; X86-NEXT: retl $4
;
; X64-LABEL: add4i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; X64-NEXT: paddw %xmm0, %xmm1
@@ -217,7 +217,7 @@ define void @add4i16(%i16vec4* nocapture sret %ret, %i16vec4* %ap, %i16vec4* %bp
%i16vec12 = type <12 x i16>
define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* %bp) nounwind {
; X86-LABEL: add12i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -231,7 +231,7 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
; X86-NEXT: retl $4
;
; X64-LABEL: add12i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: movdqa 16(%rsi), %xmm1
; X64-NEXT: paddw (%rdx), %xmm0
@@ -250,7 +250,7 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12*
%i16vec18 = type <18 x i16>
define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* %bp) nounwind {
; X86-LABEL: add18i16:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -266,7 +266,7 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
; X86-NEXT: retl $4
;
; X64-LABEL: add18i16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: movdqa 16(%rsi), %xmm1
; X64-NEXT: movdqa 32(%rsi), %xmm2
@@ -289,7 +289,7 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
; X86-LABEL: add3i8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: subl $12, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -304,7 +304,7 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
; X86-NEXT: retl $4
;
; X64-LABEL: add3i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-NEXT: paddd %xmm0, %xmm1
@@ -323,7 +323,7 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
%i8vec31 = type <31 x i8>
define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp) nounwind {
; X86-LABEL: add31i8:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -340,7 +340,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
; X86-NEXT: retl $4
;
; X64-LABEL: add31i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa (%rsi), %xmm0
; X64-NEXT: movdqa 16(%rsi), %xmm1
; X64-NEXT: paddb (%rdx), %xmm0
@@ -363,7 +363,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
%i8vec3pack = type { <3 x i8>, i8 }
define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
; X86-LABEL: rot:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -385,7 +385,7 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
; X86-NEXT: retl $4
;
; X64-LABEL: rot:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa {{.*#+}} xmm0 = [40606,158]
; X64-NEXT: pextrw $0, %xmm0, (%rsi)
; X64-NEXT: movb $-98, 2(%rsi)
diff --git a/test/CodeGen/X86/widen_load-3.ll b/test/CodeGen/X86/widen_load-3.ll
new file mode 100644
index 000000000000..ce358d914795
--- /dev/null
+++ b/test/CodeGen/X86/widen_load-3.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+
+; PR27708
+
+define <7 x i64> @load7_aligned(<7 x i64>* %x) {
+; X86-SSE-LABEL: load7_aligned:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movaps (%ecx), %xmm0
+; X86-SSE-NEXT: movaps 16(%ecx), %xmm1
+; X86-SSE-NEXT: movaps 32(%ecx), %xmm2
+; X86-SSE-NEXT: movl 48(%ecx), %edx
+; X86-SSE-NEXT: movl 52(%ecx), %ecx
+; X86-SSE-NEXT: movl %ecx, 52(%eax)
+; X86-SSE-NEXT: movl %edx, 48(%eax)
+; X86-SSE-NEXT: movaps %xmm2, 32(%eax)
+; X86-SSE-NEXT: movaps %xmm1, 16(%eax)
+; X86-SSE-NEXT: movaps %xmm0, (%eax)
+; X86-SSE-NEXT: retl $4
+;
+; X86-AVX-LABEL: load7_aligned:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vmovaps (%ecx), %ymm0
+; X86-AVX-NEXT: vmovaps 32(%ecx), %ymm1
+; X86-AVX-NEXT: vmovaps %ymm0, (%eax)
+; X86-AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; X86-AVX-NEXT: vextractps $1, %xmm0, 52(%eax)
+; X86-AVX-NEXT: vmovss %xmm0, 48(%eax)
+; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax)
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl $4
+;
+; X64-SSE-LABEL: load7_aligned:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movaps (%rsi), %xmm0
+; X64-SSE-NEXT: movaps 16(%rsi), %xmm1
+; X64-SSE-NEXT: movaps 32(%rsi), %xmm2
+; X64-SSE-NEXT: movq 48(%rsi), %rax
+; X64-SSE-NEXT: movq %rax, 48(%rdi)
+; X64-SSE-NEXT: movaps %xmm2, 32(%rdi)
+; X64-SSE-NEXT: movaps %xmm1, 16(%rdi)
+; X64-SSE-NEXT: movaps %xmm0, (%rdi)
+; X64-SSE-NEXT: movq %rdi, %rax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: load7_aligned:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovaps (%rsi), %ymm0
+; X64-AVX-NEXT: vmovaps 32(%rsi), %ymm1
+; X64-AVX-NEXT: vmovaps %ymm0, (%rdi)
+; X64-AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
+; X64-AVX-NEXT: vmovlps %xmm0, 48(%rdi)
+; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi)
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+ %x1 = load <7 x i64>, <7 x i64>* %x
+ ret <7 x i64> %x1
+}
+
+define <7 x i64> @load7_unaligned(<7 x i64>* %x) {
+; X86-SSE-LABEL: load7_unaligned:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT: movups (%ecx), %xmm0
+; X86-SSE-NEXT: movups 16(%ecx), %xmm1
+; X86-SSE-NEXT: movups 32(%ecx), %xmm2
+; X86-SSE-NEXT: movl 48(%ecx), %edx
+; X86-SSE-NEXT: movl 52(%ecx), %ecx
+; X86-SSE-NEXT: movl %ecx, 52(%eax)
+; X86-SSE-NEXT: movl %edx, 48(%eax)
+; X86-SSE-NEXT: movaps %xmm2, 32(%eax)
+; X86-SSE-NEXT: movaps %xmm1, 16(%eax)
+; X86-SSE-NEXT: movaps %xmm0, (%eax)
+; X86-SSE-NEXT: retl $4
+;
+; X86-AVX-LABEL: load7_unaligned:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-AVX-NEXT: vmovups 32(%ecx), %xmm1
+; X86-AVX-NEXT: movl 48(%ecx), %edx
+; X86-AVX-NEXT: movl 52(%ecx), %ecx
+; X86-AVX-NEXT: movl %ecx, 52(%eax)
+; X86-AVX-NEXT: movl %edx, 48(%eax)
+; X86-AVX-NEXT: vmovaps %xmm1, 32(%eax)
+; X86-AVX-NEXT: vmovaps %ymm0, (%eax)
+; X86-AVX-NEXT: vzeroupper
+; X86-AVX-NEXT: retl $4
+;
+; X64-SSE-LABEL: load7_unaligned:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: movups (%rsi), %xmm0
+; X64-SSE-NEXT: movups 16(%rsi), %xmm1
+; X64-SSE-NEXT: movups 32(%rsi), %xmm2
+; X64-SSE-NEXT: movq 48(%rsi), %rax
+; X64-SSE-NEXT: movq %rax, 48(%rdi)
+; X64-SSE-NEXT: movaps %xmm2, 32(%rdi)
+; X64-SSE-NEXT: movaps %xmm1, 16(%rdi)
+; X64-SSE-NEXT: movaps %xmm0, (%rdi)
+; X64-SSE-NEXT: movq %rdi, %rax
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: load7_unaligned:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovups (%rsi), %ymm0
+; X64-AVX-NEXT: vmovups 32(%rsi), %xmm1
+; X64-AVX-NEXT: movq 48(%rsi), %rax
+; X64-AVX-NEXT: movq %rax, 48(%rdi)
+; X64-AVX-NEXT: vmovaps %xmm1, 32(%rdi)
+; X64-AVX-NEXT: vmovaps %ymm0, (%rdi)
+; X64-AVX-NEXT: movq %rdi, %rax
+; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: retq
+ %x1 = load <7 x i64>, <7 x i64>* %x, align 1
+ ret <7 x i64> %x1
+}
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index aeb4e2130062..c0387b3878a6 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -5,7 +5,7 @@
; widening shuffle v3float and then an add
define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
; X86-LABEL: shuf:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addps %xmm1, %xmm0
; X86-NEXT: extractps $2, %xmm0, 8(%eax)
@@ -14,7 +14,7 @@ define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) n
; X86-NEXT: retl
;
; X64-LABEL: shuf:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: extractps $2, %xmm0, 8(%rdi)
; X64-NEXT: movlps %xmm0, (%rdi)
@@ -30,7 +30,7 @@ entry:
; widening shuffle v3float with a different mask and then an add
define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
; X86-LABEL: shuf2:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X86-NEXT: addps %xmm1, %xmm0
@@ -40,7 +40,7 @@ define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2)
; X86-NEXT: retl
;
; X64-LABEL: shuf2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-NEXT: addps %xmm1, %xmm0
; X64-NEXT: extractps $2, %xmm0, 8(%rdi)
@@ -58,14 +58,14 @@ entry:
; opA with opB, the DAG will produce new operations with opA.
define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
; X86-LABEL: shuf3:
-; X86: # BB#0: # %entry
+; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X86-NEXT: movaps %xmm1, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shuf3:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
; X64-NEXT: movaps %xmm1, (%rdi)
; X64-NEXT: retq
@@ -88,7 +88,7 @@ entry:
; PR10421: make sure we correctly handle extreme widening with CONCAT_VECTORS
define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; X86-LABEL: shuf4:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X86-NEXT: pshufb %xmm2, %xmm1
; X86-NEXT: pshufb %xmm2, %xmm0
@@ -96,7 +96,7 @@ define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; X86-NEXT: retl
;
; X64-LABEL: shuf4:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; X64-NEXT: pshufb %xmm2, %xmm1
; X64-NEXT: pshufb %xmm2, %xmm0
@@ -109,14 +109,14 @@ define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone {
; PR11389: another CONCAT_VECTORS case
define void @shuf5(<8 x i8>* %p) nounwind {
; X86-LABEL: shuf5:
-; X86: # BB#0:
+; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: movsd %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: shuf5:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: movq {{.*}}(%rip), %rax
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: retq
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 42c4c23c6349..96e97c70dbf4 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -9,18 +9,18 @@
define <4 x float> @load_splat_4f32_4f32_0101(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; SSE2-LABEL: load_splat_4f32_4f32_0101:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_splat_4f32_4f32_0101:
-; SSE42: # BB#0: # %entry
+; SSE42: # %bb.0: # %entry
; SSE42-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE42-NEXT: retq
;
; AVX-LABEL: load_splat_4f32_4f32_0101:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT: retq
entry:
@@ -31,31 +31,31 @@ entry:
define <8 x float> @load_splat_8f32_4f32_01010101(<4 x float>* %ptr) nounwind uwtable readnone ssp {
; SSE2-LABEL: load_splat_8f32_4f32_01010101:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_splat_8f32_4f32_01010101:
-; SSE42: # BB#0: # %entry
+; SSE42: # %bb.0: # %entry
; SSE42-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE42-NEXT: movapd %xmm0, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: load_splat_8f32_4f32_01010101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_8f32_4f32_01010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_8f32_4f32_01010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -66,20 +66,20 @@ entry:
define <8 x float> @load_splat_8f32_8f32_01010101(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; SSE2-LABEL: load_splat_8f32_8f32_01010101:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_splat_8f32_8f32_01010101:
-; SSE42: # BB#0: # %entry
+; SSE42: # %bb.0: # %entry
; SSE42-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE42-NEXT: movapd %xmm0, %xmm1
; SSE42-NEXT: retq
;
; AVX-LABEL: load_splat_8f32_8f32_01010101:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX-NEXT: retq
entry:
@@ -90,22 +90,22 @@ entry:
define <4 x i32> @load_splat_4i32_4i32_0101(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_4i32_4i32_0101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_4i32_4i32_0101:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_4i32_4i32_0101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_4i32_4i32_0101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -116,26 +116,15 @@ entry:
define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_8i32_4i32_01010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_splat_8i32_4i32_01010101:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_8i32_4i32_01010101:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_8i32_4i32_01010101:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_8i32_4i32_01010101:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
entry:
%ld = load <4 x i32>, <4 x i32>* %ptr
%ret = shufflevector <4 x i32> %ld, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -144,24 +133,24 @@ entry:
define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_8i32_8i32_01010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_8i32_8i32_01010101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_8i32_8i32_01010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_8i32_8i32_01010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -172,22 +161,22 @@ entry:
define <8 x i16> @load_splat_8i16_8i16_01010101(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_8i16_8i16_01010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_8i16_8i16_01010101:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_8i16_8i16_01010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_8i16_8i16_01010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastss (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -198,22 +187,22 @@ entry:
define <8 x i16> @load_splat_8i16_8i16_01230123(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_8i16_8i16_01230123:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_8i16_8i16_01230123:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_8i16_8i16_01230123:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_8i16_8i16_01230123:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -224,24 +213,24 @@ entry:
define <16 x i16> @load_splat_16i16_8i16_0101010101010101(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i16_8i16_0101010101010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i16_8i16_0101010101010101:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -252,26 +241,15 @@ entry:
define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i16_8i16_0123012301230123:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_16i16_8i16_0123012301230123:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
entry:
%ld = load <8 x i16>, <8 x i16>* %ptr
%ret = shufflevector <8 x i16> %ld, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -280,24 +258,24 @@ entry:
define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i16_16i16_0101010101010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i16_16i16_0101010101010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -308,13 +286,13 @@ entry:
define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i16_16i16_0123012301230123:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_splat_16i16_16i16_0123012301230123:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX-NEXT: retq
entry:
@@ -325,24 +303,24 @@ entry:
define <16 x i8> @load_splat_16i8_16i8_0101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i8_16i8_0101010101010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i8_16i8_0101010101010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpbroadcastw (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -353,22 +331,22 @@ entry:
define <16 x i8> @load_splat_16i8_16i8_0123012301230123(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i8_16i8_0123012301230123:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i8_16i8_0123012301230123:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastss (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -379,22 +357,22 @@ entry:
define <16 x i8> @load_splat_16i8_16i8_0123456701234567(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_16i8_16i8_0123456701234567:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_16i8_16i8_0123456701234567:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_16i8_16i8_0123456701234567:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_16i8_16i8_0123456701234567:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpbroadcastq (%rdi), %xmm0
; AVX512-NEXT: retq
entry:
@@ -405,26 +383,26 @@ entry:
define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -435,24 +413,24 @@ entry:
define <32 x i8> @load_splat_32i8_16i8_01230123012301230123012301230123(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vbroadcastss (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vbroadcastss (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -463,26 +441,15 @@ entry:
define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: retq
entry:
%ld = load <16 x i8>, <16 x i8>* %ptr
%ret = shufflevector <16 x i8> %ld, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -491,26 +458,26 @@ entry:
define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
-; AVX1: # BB#0: # %entry
+; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
-; AVX2: # BB#0: # %entry
+; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101:
-; AVX512: # BB#0: # %entry
+; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpbroadcastw (%rdi), %ymm0
; AVX512-NEXT: retq
entry:
@@ -521,27 +488,15 @@ entry:
define <32 x i8> @load_splat_32i8_32i8_01230123012301230123012301230123(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
-; AVX1-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vbroadcastss (%rdi), %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vmovaps (%rdi), %ymm0
-; AVX2-NEXT: vbroadcastss %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovaps (%rdi), %ymm0
-; AVX512-NEXT: vbroadcastss %xmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_32i8_32i8_01230123012301230123012301230123:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: retq
entry:
%ld = load <32 x i8>, <32 x i8>* %ptr
%ret = shufflevector <32 x i8> %ld, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -550,13 +505,13 @@ entry:
define <32 x i8> @load_splat_32i8_32i8_01234567012345670123456701234567(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: load_splat_32i8_32i8_01234567012345670123456701234567:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
; AVX-NEXT: retq
entry:
@@ -567,13 +522,13 @@ entry:
define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtable readnone ssp {
; SSE-LABEL: load_splat_4f32_8f32_0000:
-; SSE: # BB#0: # %entry
+; SSE: # %bb.0: # %entry
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: load_splat_4f32_8f32_0000:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vbroadcastss (%rdi), %xmm0
; AVX-NEXT: retq
entry:
@@ -584,20 +539,20 @@ entry:
define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind uwtable readnone ssp {
; SSE2-LABEL: load_splat_8f32_16f32_89898989:
-; SSE2: # BB#0: # %entry
+; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movaps 32(%rdi), %xmm0
; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: load_splat_8f32_16f32_89898989:
-; SSE42: # BB#0: # %entry
+; SSE42: # %bb.0: # %entry
; SSE42-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
; SSE42-NEXT: movapd %xmm0, %xmm1
; SSE42-NEXT: retq
;
; AVX-LABEL: load_splat_8f32_16f32_89898989:
-; AVX: # BB#0: # %entry
+; AVX: # %bb.0: # %entry
; AVX-NEXT: vbroadcastsd 32(%rdi), %ymm0
; AVX-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/win32-pic-jumptable.ll b/test/CodeGen/X86/win32-pic-jumptable.ll
index 5ca9008aff20..1b29a38a8ccd 100644
--- a/test/CodeGen/X86/win32-pic-jumptable.ll
+++ b/test/CodeGen/X86/win32-pic-jumptable.ll
@@ -1,11 +1,9 @@
; RUN: llc < %s -relocation-model=pic | FileCheck %s
; CHECK: calll L0$pb
-; CHECK-NEXT: Lcfi{{[0-9]+}}:
; CHECK-NEXT: .cfi_adjust_cfa_offset 4
; CHECK-NEXT: L0$pb:
; CHECK-NEXT: popl %eax
-; CHECK-NEXT: Lcfi{{[0-9]+}}:
; CHECK-NEXT: .cfi_adjust_cfa_offset -4
; CHECK-NEXT: addl LJTI0_0(,%ecx,4), %eax
; CHECK-NEXT: jmpl *%eax
diff --git a/test/CodeGen/X86/win32-preemption.ll b/test/CodeGen/X86/win32-preemption.ll
new file mode 100644
index 000000000000..accabac102a1
--- /dev/null
+++ b/test/CodeGen/X86/win32-preemption.ll
@@ -0,0 +1,236 @@
+; RUN: llc -mtriple x86_64-pc-win32 \
+; RUN: -relocation-model=static < %s | FileCheck --check-prefix=COFF_S %s
+; RUN: llc -mtriple x86_64-pc-win32 \
+; RUN: -relocation-model=pic < %s | FileCheck --check-prefix=COFF %s
+; RUN: llc -mtriple x86_64-pc-win32 \
+; RUN: -relocation-model=dynamic-no-pic < %s | FileCheck --check-prefix=COFF %s
+
+
+; 32 bits
+
+; RUN: llc -mtriple i386-pc-win32 \
+; RUN: -relocation-model=static < %s | FileCheck --check-prefix=COFF32 %s
+; RUN: llc -mtriple i386-pc-win32 \
+; RUN: -relocation-model=pic < %s | FileCheck --check-prefix=COFF32 %s
+; RUN: llc -mtriple i386-pc-win32 \
+; RUN: -relocation-model=dynamic-no-pic < %s | \
+; RUN: FileCheck --check-prefix=COFF32 %s
+
+; globals
+
+@strong_default_global = global i32 42
+define i32* @get_strong_default_global() {
+ ret i32* @strong_default_global
+}
+; COFF: leaq strong_default_global(%rip), %rax
+; COFF_S: movl $strong_default_global, %eax
+; COFF32: movl $_strong_default_global, %eax
+
+@weak_default_global = weak global i32 42
+define i32* @get_weak_default_global() {
+ ret i32* @weak_default_global
+}
+; COFF: leaq weak_default_global(%rip), %rax
+; COFF_S: movl $weak_default_global, %eax
+; COFF32: movl $_weak_default_global, %eax
+
+@external_default_global = external global i32
+define i32* @get_external_default_global() {
+ ret i32* @external_default_global
+}
+; COFF: leaq external_default_global(%rip), %rax
+; COFF_S: movl $external_default_global, %eax
+; COFF32: movl $_external_default_global, %eax
+
+
+@strong_local_global = dso_local global i32 42
+define i32* @get_strong_local_global() {
+ ret i32* @strong_local_global
+}
+; COFF: leaq strong_local_global(%rip), %rax
+; COFF_S: movl $strong_local_global, %eax
+; COFF32: movl $_strong_local_global, %eax
+
+@weak_local_global = weak dso_local global i32 42
+define i32* @get_weak_local_global() {
+ ret i32* @weak_local_global
+}
+; COFF: leaq weak_local_global(%rip), %rax
+; COFF_S: movl $weak_local_global, %eax
+; COFF32: movl $_weak_local_global, %eax
+
+@external_local_global = external dso_local global i32
+define i32* @get_external_local_global() {
+ ret i32* @external_local_global
+}
+; COFF: leaq external_local_global(%rip), %rax
+; COFF_S: movl $external_local_global, %eax
+; COFF32: movl $_external_local_global, %eax
+
+
+@strong_preemptable_global = dso_preemptable global i32 42
+define i32* @get_strong_preemptable_global() {
+ ret i32* @strong_preemptable_global
+}
+; COFF: leaq strong_preemptable_global(%rip), %rax
+; COFF_S: movl $strong_preemptable_global, %eax
+; COFF32: movl $_strong_preemptable_global, %eax
+
+@weak_preemptable_global = weak dso_preemptable global i32 42
+define i32* @get_weak_preemptable_global() {
+ ret i32* @weak_preemptable_global
+}
+; COFF: leaq weak_preemptable_global(%rip), %rax
+; COFF_S: movl $weak_preemptable_global, %eax
+; COFF32: movl $_weak_preemptable_global, %eax
+
+@external_preemptable_global = external dso_preemptable global i32
+define i32* @get_external_preemptable_global() {
+ ret i32* @external_preemptable_global
+}
+; COFF: leaq external_preemptable_global(%rip), %rax
+; COFF_S: movl $external_preemptable_global, %eax
+; COFF32: movl $_external_preemptable_global, %eax
+
+
+; aliases
+@aliasee = global i32 42
+
+@strong_default_alias = alias i32, i32* @aliasee
+define i32* @get_strong_default_alias() {
+ ret i32* @strong_default_alias
+}
+; COFF: leaq strong_default_alias(%rip), %rax
+; COFF_S: movl $strong_default_alias, %eax
+; COFF32: movl $_strong_default_alias, %eax
+
+@weak_default_alias = weak alias i32, i32* @aliasee
+define i32* @get_weak_default_alias() {
+ ret i32* @weak_default_alias
+}
+; COFF: leaq weak_default_alias(%rip), %rax
+; COFF_S: movl $weak_default_alias, %eax
+; COFF32: movl $_weak_default_alias, %eax
+
+
+@strong_local_alias = dso_local alias i32, i32* @aliasee
+define i32* @get_strong_local_alias() {
+ ret i32* @strong_local_alias
+}
+; COFF: leaq strong_local_alias(%rip), %rax
+; COFF_S: movl $strong_local_alias, %eax
+; COFF32: movl $_strong_local_alias, %eax
+
+@weak_local_alias = weak dso_local alias i32, i32* @aliasee
+define i32* @get_weak_local_alias() {
+ ret i32* @weak_local_alias
+}
+; COFF: leaq weak_local_alias(%rip), %rax
+; COFF_S: movl $weak_local_alias, %eax
+; COFF32: movl $_weak_local_alias, %eax
+
+
+@strong_preemptable_alias = dso_preemptable alias i32, i32* @aliasee
+define i32* @get_strong_preemptable_alias() {
+ ret i32* @strong_preemptable_alias
+}
+; COFF: leaq strong_preemptable_alias(%rip), %rax
+; COFF_S: movl $strong_preemptable_alias, %eax
+; COFF32: movl $_strong_preemptable_alias, %eax
+
+@weak_preemptable_alias = weak dso_preemptable alias i32, i32* @aliasee
+define i32* @get_weak_preemptable_alias() {
+ ret i32* @weak_preemptable_alias
+}
+; COFF: leaq weak_preemptable_alias(%rip), %rax
+; COFF_S: movl $weak_preemptable_alias, %eax
+; COFF32: movl $_weak_preemptable_alias, %eax
+
+
+; functions
+
+define void @strong_default_function() {
+ ret void
+}
+define void()* @get_strong_default_function() {
+ ret void()* @strong_default_function
+}
+; COFF: leaq strong_default_function(%rip), %rax
+; COFF_S: movl $strong_default_function, %eax
+; COFF32: movl $_strong_default_function, %eax
+
+define weak void @weak_default_function() {
+ ret void
+}
+define void()* @get_weak_default_function() {
+ ret void()* @weak_default_function
+}
+; COFF: leaq weak_default_function(%rip), %rax
+; COFF_S: movl $weak_default_function, %eax
+; COFF32: movl $_weak_default_function, %eax
+
+declare void @external_default_function()
+define void()* @get_external_default_function() {
+ ret void()* @external_default_function
+}
+; COFF: leaq external_default_function(%rip), %rax
+; COFF_S: movl $external_default_function, %eax
+; COFF32: movl $_external_default_function, %eax
+
+
+define dso_local void @strong_local_function() {
+ ret void
+}
+define void()* @get_strong_local_function() {
+ ret void()* @strong_local_function
+}
+; COFF: leaq strong_local_function(%rip), %rax
+; COFF_S: movl $strong_local_function, %eax
+; COFF32: movl $_strong_local_function, %eax
+
+define weak dso_local void @weak_local_function() {
+ ret void
+}
+define void()* @get_weak_local_function() {
+ ret void()* @weak_local_function
+}
+; COFF: leaq weak_local_function(%rip), %rax
+; COFF_S: movl $weak_local_function, %eax
+; COFF32: movl $_weak_local_function, %eax
+
+declare dso_local void @external_local_function()
+define void()* @get_external_local_function() {
+ ret void()* @external_local_function
+}
+; COFF: leaq external_local_function(%rip), %rax
+; COFF_S: movl $external_local_function, %eax
+; COFF32: movl $_external_local_function, %eax
+
+
+define dso_preemptable void @strong_preemptable_function() {
+ ret void
+}
+define void()* @get_strong_preemptable_function() {
+ ret void()* @strong_preemptable_function
+}
+; COFF: leaq strong_preemptable_function(%rip), %rax
+; COFF_S: movl $strong_preemptable_function, %eax
+; COFF32: movl $_strong_preemptable_function, %eax
+
+define weak dso_preemptable void @weak_preemptable_function() {
+ ret void
+}
+define void()* @get_weak_preemptable_function() {
+ ret void()* @weak_preemptable_function
+}
+; COFF: leaq weak_preemptable_function(%rip), %rax
+; COFF_S: movl $weak_preemptable_function, %eax
+; COFF32: movl $_weak_preemptable_function, %eax
+
+declare dso_preemptable void @external_preemptable_function()
+define void()* @get_external_preemptable_function() {
+ ret void()* @external_preemptable_function
+}
+; COFF: leaq external_preemptable_function(%rip), %rax
+; COFF_S: movl $external_preemptable_function, %eax
+; COFF32: movl $_external_preemptable_function, %eax
diff --git a/test/CodeGen/X86/win64_sibcall.ll b/test/CodeGen/X86/win64_sibcall.ll
index 42dd4d31ca9f..5a65d34a4266 100644
--- a/test/CodeGen/X86/win64_sibcall.ll
+++ b/test/CodeGen/X86/win64_sibcall.ll
@@ -6,9 +6,9 @@
define void @C1(%Object addrspace(1)* %param0) gc "coreclr" {
entry:
-; WIN_X64: # BB#0:
+; WIN_X64: # %bb.0:
; WIN_X64: pushq %rax
-; LINUX: # BB#0: # %entry
+; LINUX: # %bb.0: # %entry
; LINUX: movq $0, -8(%rsp)
%this = alloca %Object addrspace(1)*
diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll
index c7550a467a35..41fdccd9364e 100644
--- a/test/CodeGen/X86/win_chkstk.ll
+++ b/test/CodeGen/X86/win_chkstk.ll
@@ -30,19 +30,19 @@ entry:
; allocation.
define i32 @main128() nounwind {
entry:
-; WIN_X32: # BB#0:
+; WIN_X32: # %bb.0:
; WIN_X32-NOT: calll __chkstk
; WIN_X32: ret
-; WIN_X64: # BB#0:
+; WIN_X64: # %bb.0:
; WIN_X64-NOT: callq __chkstk
; WIN_X64: ret
-; MINGW_X64: # BB#0:
+; MINGW_X64: # %bb.0:
; MINGW_X64-NOT: callq ___chkstk_ms
; MINGW_X64: ret
-; LINUX: # BB#0:
+; LINUX: # %bb.0:
; LINUX-NOT: call __chkstk
; LINUX: ret
%array128 = alloca [128 x i8], align 16 ; <[128 x i8]*> [#uses=0]
diff --git a/test/CodeGen/X86/win_coreclr_chkstk.ll b/test/CodeGen/X86/win_coreclr_chkstk.ll
index b4b8010ec564..8934535d6f52 100644
--- a/test/CodeGen/X86/win_coreclr_chkstk.ll
+++ b/test/CodeGen/X86/win_coreclr_chkstk.ll
@@ -8,7 +8,7 @@
define i32 @main4k() nounwind {
entry:
; WIN_X64-LABEL:main4k:
-; WIN_X64: # BB#0:
+; WIN_X64: # %bb.0:
; WIN_X64: movl $4096, %eax
; WIN_X64: movq %rcx, 8(%rsp)
; WIN_X64: movq %rdx, 16(%rsp)
@@ -19,7 +19,7 @@ entry:
; WIN_X64: movq %gs:16, %rcx
; WIN_X64: cmpq %rcx, %rdx
; WIN_X64: jae .LBB0_3
-; WIN_X64:# BB#1:
+; WIN_X64:# %bb.1:
; WIN_X64: andq $-4096, %rdx
; WIN_X64:.LBB0_2:
; WIN_X64: leaq -4096(%rcx), %rcx
diff --git a/test/CodeGen/X86/x32-cet-intrinsics.ll b/test/CodeGen/X86/x32-cet-intrinsics.ll
new file mode 100644
index 000000000000..b6f6c05e4f8b
--- /dev/null
+++ b/test/CodeGen/X86/x32-cet-intrinsics.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+shstk -mattr=+ibt | FileCheck %s
+
+define void @test_incsspd(i32 %a) local_unnamed_addr {
+; CHECK-LABEL: test_incsspd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: incsspd %eax
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.incsspd(i32 %a)
+ ret void
+}
+
+declare void @llvm.x86.incsspd(i32)
+
+define i32 @test_rdsspd(i32 %a) {
+; CHECK-LABEL: test_rdsspd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: rdsspd %eax
+; CHECK-NEXT: retl
+entry:
+ %0 = call i32 @llvm.x86.rdsspd(i32 %a)
+ ret i32 %0
+}
+
+declare i32 @llvm.x86.rdsspd(i32)
+
+define void @test_saveprevssp() {
+; CHECK-LABEL: test_saveprevssp:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: saveprevssp
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.saveprevssp()
+ ret void
+}
+
+declare void @llvm.x86.saveprevssp()
+
+define void @test_rstorssp(i8* %__p) {
+; CHECK-LABEL: test_rstorssp:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: rstorssp (%eax)
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.rstorssp(i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.rstorssp(i8*)
+
+define void @test_wrssd(i32 %a, i8* %__p) {
+; CHECK-LABEL: test_wrssd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: wrssd %eax, (%ecx)
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.wrssd(i32 %a, i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.wrssd(i32, i8*)
+
+define void @test_wrussd(i32 %a, i8* %__p) {
+; CHECK-LABEL: test_wrussd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: wrussd %eax, (%ecx)
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.wrussd(i32 %a, i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.wrussd(i32, i8*)
+
+define void @test_setssbsy() {
+; CHECK-LABEL: test_setssbsy:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: setssbsy
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.setssbsy()
+ ret void
+}
+
+declare void @llvm.x86.setssbsy()
+
+define void @test_clrssbsy(i8* %__p) {
+; CHECK-LABEL: test_clrssbsy:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT: clrssbsy (%eax)
+; CHECK-NEXT: retl
+entry:
+ tail call void @llvm.x86.clrssbsy(i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.clrssbsy(i8* %__p)
diff --git a/test/CodeGen/X86/x32-lea-1.ll b/test/CodeGen/X86/x32-lea-1.ll
index afe3581a85bc..0b687579fb81 100644
--- a/test/CodeGen/X86/x32-lea-1.ll
+++ b/test/CodeGen/X86/x32-lea-1.ll
@@ -3,7 +3,7 @@
define void @foo(i32** %p) {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: leal -{{[0-9]+}}(%rsp), %eax
; CHECK-NEXT: addl $16, %eax
; CHECK-NEXT: movl %eax, (%edi)
diff --git a/test/CodeGen/X86/x64-cet-intrinsics.ll b/test/CodeGen/X86/x64-cet-intrinsics.ll
new file mode 100644
index 000000000000..de95e1916bc8
--- /dev/null
+++ b/test/CodeGen/X86/x64-cet-intrinsics.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+shstk -mattr=+ibt | FileCheck %s
+
+define void @test_incsspd(i32 %a) local_unnamed_addr {
+; CHECK-LABEL: test_incsspd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: incsspd %edi
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.incsspd(i32 %a)
+ ret void
+}
+
+declare void @llvm.x86.incsspd(i32)
+
+define void @test_incsspq(i32 %a) local_unnamed_addr {
+; CHECK-LABEL: test_incsspq:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movslq %edi, %rax
+; CHECK-NEXT: incsspq %rax
+; CHECK-NEXT: retq
+entry:
+ %conv.i = sext i32 %a to i64
+ tail call void @llvm.x86.incsspq(i64 %conv.i)
+ ret void
+}
+
+declare void @llvm.x86.incsspq(i64)
+
+define i32 @test_rdsspd(i32 %a) {
+; CHECK-LABEL: test_rdsspd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: rdsspd %edi
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: retq
+entry:
+ %0 = call i32 @llvm.x86.rdsspd(i32 %a)
+ ret i32 %0
+}
+
+declare i32 @llvm.x86.rdsspd(i32)
+
+define i64 @test_rdsspq(i64 %a) {
+; CHECK-LABEL: test_rdsspq:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: rdsspq %rdi
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = call i64 @llvm.x86.rdsspq(i64 %a)
+ ret i64 %0
+}
+
+declare i64 @llvm.x86.rdsspq(i64)
+
+define void @test_saveprevssp() {
+; CHECK-LABEL: test_saveprevssp:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: saveprevssp
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.saveprevssp()
+ ret void
+}
+
+declare void @llvm.x86.saveprevssp()
+
+define void @test_rstorssp(i8* %__p) {
+; CHECK-LABEL: test_rstorssp:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: rstorssp (%rdi)
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.rstorssp(i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.rstorssp(i8*)
+
+define void @test_wrssd(i32 %a, i8* %__p) {
+; CHECK-LABEL: test_wrssd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: wrssd %edi, (%rsi)
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.wrssd(i32 %a, i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.wrssd(i32, i8*)
+
+define void @test_wrssq(i64 %a, i8* %__p) {
+; CHECK-LABEL: test_wrssq:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: wrssq %rdi, (%rsi)
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.wrssq(i64 %a, i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.wrssq(i64, i8*)
+
+define void @test_wrussd(i32 %a, i8* %__p) {
+; CHECK-LABEL: test_wrussd:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: wrussd %edi, (%rsi)
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.wrussd(i32 %a, i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.wrussd(i32, i8*)
+
+define void @test_wrussq(i64 %a, i8* %__p) {
+; CHECK-LABEL: test_wrussq:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: wrussq %rdi, (%rsi)
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.wrussq(i64 %a, i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.wrussq(i64, i8*)
+
+define void @test_setssbsy() {
+; CHECK-LABEL: test_setssbsy:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: setssbsy
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.setssbsy()
+ ret void
+}
+
+declare void @llvm.x86.setssbsy()
+
+define void @test_clrssbsy(i8* %__p) {
+; CHECK-LABEL: test_clrssbsy:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: clrssbsy (%rdi)
+; CHECK-NEXT: retq
+entry:
+ tail call void @llvm.x86.clrssbsy(i8* %__p)
+ ret void
+}
+
+declare void @llvm.x86.clrssbsy(i8* %__p)
diff --git a/test/CodeGen/X86/x86-64-disp.ll b/test/CodeGen/X86/x86-64-disp.ll
index 790634c1e4a5..fd63ff1be714 100644
--- a/test/CodeGen/X86/x86-64-disp.ll
+++ b/test/CodeGen/X86/x86-64-disp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mov | count 2
+; RUN: llc < %s | grep mov | count 2
; Fold an offset into an address even if it's not a 32-bit
; signed integer.
diff --git a/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
index ba559aa2ff0e..2fd98727421e 100644
--- a/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
+++ b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s
; clang -Oz -c test1.cpp -emit-llvm -S -o
; Verify that we generate shld instruction when we are optimizing for size,
diff --git a/test/CodeGen/X86/x86-64-double-shifts-var.ll b/test/CodeGen/X86/x86-64-double-shifts-var.ll
index c025ee874b2d..b78ab0825a58 100644
--- a/test/CodeGen/X86/x86-64-double-shifts-var.ll
+++ b/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=bdver3 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=bdver4 | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=znver1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-tbird | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-xp | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-mp | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=k8 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon-fx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=k8-sse3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=opteron-sse3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=athlon64-sse3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=amdfam10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s
; Verify that for the X86_64 processors that are known to have poor latency
; double precision shift instructions we do not generate 'shld' or 'shrd'
diff --git a/test/CodeGen/X86/x86-64-intrcc-nosse.ll b/test/CodeGen/X86/x86-64-intrcc-nosse.ll
index ab84088c3444..7b39ab64db8a 100644
--- a/test/CodeGen/X86/x86-64-intrcc-nosse.ll
+++ b/test/CodeGen/X86/x86-64-intrcc-nosse.ll
@@ -8,7 +8,7 @@
; Clobbered SSE must not be saved when the target doesn't support SSE
define x86_intrcc void @test_isr_sse_clobbers(%struct.interrupt_frame* %frame, i64 %ecode) {
; CHECK-LABEL: test_isr_sse_clobbers:
- ; CHECK: # BB#0:
+ ; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: cld
; CHECK-NEXT: #APP
diff --git a/test/CodeGen/X86/x86-64-ret0.ll b/test/CodeGen/X86/x86-64-ret0.ll
index c74f6d803b1c..3e840ab2edf6 100644
--- a/test/CodeGen/X86/x86-64-ret0.ll
+++ b/test/CodeGen/X86/x86-64-ret0.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep mov | count 1
+; RUN: llc < %s -mtriple=x86_64-- | grep mov | count 1
define i32 @f() nounwind {
tail call void @t( i32 1 ) nounwind
diff --git a/test/CodeGen/X86/x86-cmov-converter.ll b/test/CodeGen/X86/x86-cmov-converter.ll
index 39877c14429f..5fec1380e14b 100644
--- a/test/CodeGen/X86/x86-cmov-converter.ll
+++ b/test/CodeGen/X86/x86-cmov-converter.ll
@@ -9,7 +9,7 @@
;; Thus, it is worth transforming.
;;
;; 2. CmovNotInCriticalPath:
-;; similar test like in (1), just that CMOV is not in the hot path.
+;; Similar to the test in (1), just that CMOV is not in the hot path.
;; Thus, it is not worth transforming.
;;
;; 3. MaxIndex:
@@ -32,10 +32,10 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
;; for (int i = 0; i < n; i++) {
-;; int t = c[i];
+;; int t = c[i] + 1;
;; if (c[i] * a > b)
;; t = 10;
-;; c[i] = t;
+;; c[i] = (c[i] + 1) * t;
;; }
;;}
;;
@@ -87,6 +87,16 @@
;; }
;; return Curr->Val;
;;}
+;;
+;;
+;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
+;; for (int i = 0; i < n; i++) {
+;; int t = c[i];
+;; if (c[i] * a > b)
+;; t = 10;
+;; c[i] = t;
+;; }
+;;}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%struct.Node = type { i32, %struct.Node*, %struct.Node* }
@@ -111,10 +121,12 @@ for.body: ; preds = %for.body.preheader,
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
+ %add = add nsw i32 %0, 1
%mul = mul nsw i32 %0, %a
%cmp3 = icmp sgt i32 %mul, %b
- %. = select i1 %cmp3, i32 10, i32 %0
- store i32 %., i32* %arrayidx, align 4
+ %. = select i1 %cmp3, i32 10, i32 %add
+ %mul7 = mul nsw i32 %., %add
+ store i32 %mul7, i32* %arrayidx, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -284,9 +296,9 @@ while.end: ; preds = %while.body, %entry
; CHECK-LABEL: Transform
; CHECK-NOT: cmov
; CHECK: divl [[a:%[0-9a-z]*]]
-; CHECK: cmpl [[a]], %eax
; CHECK: movl $11, [[s1:%[0-9a-z]*]]
; CHECK: movl [[a]], [[s2:%[0-9a-z]*]]
+; CHECK: cmpl [[a]], %edx
; CHECK: ja [[SinkBB:.*]]
; CHECK: [[FalseBB:.*]]:
; CHECK: movl $22, [[s1]]
@@ -318,4 +330,163 @@ while.end: ; preds = %while.body, %entry
ret void
}
+; Test that we will always convert a cmov with a memory operand into a branch,
+; even outside of a loop.
+define i32 @test_cmov_memoperand(i32 %a, i32 %b, i32 %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %load = load i32, i32* %y
+ %z = select i1 %cond, i32 %x, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %
+ ret i32 %z
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand.
+define i32 @test_cmov_memoperand_in_group(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %y = load i32, i32* %y.ptr
+ %z1 = select i1 %cond, i32 %x, i32 %a
+ %z2 = select i1 %cond, i32 %x, i32 %y
+ %z3 = select i1 %cond, i32 %x, i32 %b
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}} %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+ %s1 = add i32 %z1, %z2
+ %s2 = add i32 %s1, %z3
+ ret i32 %s2
+}
+
+; Same as before but with operands reversed in the select with a load.
+define i32 @test_cmov_memoperand_in_group2(i32 %a, i32 %b, i32 %x, i32* %y.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group2:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %y = load i32, i32* %y.ptr
+ %z2 = select i1 %cond, i32 %a, i32 %x
+ %z1 = select i1 %cond, i32 %y, i32 %x
+ %z3 = select i1 %cond, i32 %b, i32 %x
+; CHECK-NOT: cmov
+; CHECK: jbe [[FALSE_BB:.*]]
+; CHECK-DAG: movl %{{.*}}, %[[R1:.*]]
+; CHECK-DAG: movl (%r{{..}}), %[[R2:.*]]
+; CHECK-DAG: movl %{{.*}} %[[R3:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: addl
+; CHECK-DAG: %[[R1]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK-DAG: addl
+; CHECK-DAG: %[[R2]]
+; CHECK-DAG: ,
+; CHECK-DAG: %[[R3]]
+; CHECK: movl %[[R3]], %eax
+; CHECK: retq
+ %s1 = add i32 %z1, %z2
+ %s2 = add i32 %s1, %z3
+ ret i32 %s2
+}
+
+; Test that we don't convert a group of cmovs with conflicting directions of
+; loads.
+define i32 @test_cmov_memoperand_conflicting_dir(i32 %a, i32 %b, i32 %x, i32* %y1.ptr, i32* %y2.ptr) #0 {
+; CHECK-LABEL: test_cmov_memoperand_conflicting_dir:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %y1 = load i32, i32* %y1.ptr
+ %y2 = load i32, i32* %y2.ptr
+ %z1 = select i1 %cond, i32 %x, i32 %y1
+ %z2 = select i1 %cond, i32 %y2, i32 %x
+; CHECK: cmoval
+; CHECK: cmoval
+ %s1 = add i32 %z1, %z2
+ ret i32 %s1
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand and where that memory operand's registers come from a prior cmov in
+; the group.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr(i32 %a, i32 %b, i32* %x, i32* %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %p = select i1 %cond, i32* %x, i32* %y
+ %load = load i32, i32* %p
+ %z = select i1 %cond, i32 %a, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %eax
+; CHECK: retq
+ ret i32 %z
+}
+
+; Test that we can convert a group of two cmovs with memory operands where one
+; uses the result of the other as part of the address.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr2(i32 %a, i32 %b, i32* %x, i32** %y) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr2:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %load1 = load i32*, i32** %y
+ %p = select i1 %cond, i32* %x, i32* %load1
+ %load2 = load i32, i32* %p
+ %z = select i1 %cond, i32 %a, i32 %load2
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movq (%r{{..}}), %[[R1:.*]]
+; CHECK: movl (%[[R1]]), %[[R2:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R2]], %eax
+; CHECK: retq
+ ret i32 %z
+}
+
+; Test that we can convert a group of cmovs where only one has a memory
+; operand and where that memory operand's registers come from a prior cmov and
+; where that cmov gets *its* input from a prior cmov in the group.
+define i32 @test_cmov_memoperand_in_group_reuse_for_addr3(i32 %a, i32 %b, i32* %x, i32* %y, i32* %z) #0 {
+; CHECK-LABEL: test_cmov_memoperand_in_group_reuse_for_addr3:
+entry:
+ %cond = icmp ugt i32 %a, %b
+; CHECK: cmpl
+ %p = select i1 %cond, i32* %x, i32* %y
+ %p2 = select i1 %cond, i32* %z, i32* %p
+ %load = load i32, i32* %p2
+ %r = select i1 %cond, i32 %a, i32 %load
+; CHECK-NOT: cmov
+; CHECK: ja [[FALSE_BB:.*]]
+; CHECK: movl (%r{{..}}), %[[R:.*]]
+; CHECK: [[FALSE_BB]]:
+; CHECK: movl %[[R]], %eax
+; CHECK: retq
+ ret i32 %r
+}
+
attributes #0 = {"target-cpu"="x86-64"}
diff --git a/test/CodeGen/X86/x86-fold-pshufb.ll b/test/CodeGen/X86/x86-fold-pshufb.ll
index 84af4f5d4b86..a07593390d09 100644
--- a/test/CodeGen/X86/x86-fold-pshufb.ll
+++ b/test/CodeGen/X86/x86-fold-pshufb.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -relocation-model=pic -march=x86-64 -mtriple=x86_64-unknown-unknown -mattr=+ssse3 < %s | FileCheck %s
-; RUN: llc -march=x86-64 -mtriple=x86_64-unknown-unknown -mattr=+ssse3 < %s | FileCheck %s
+; RUN: llc -relocation-model=pic -mtriple=x86_64-unknown-unknown -mattr=+ssse3 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+ssse3 < %s | FileCheck %s
; Verify that the backend correctly folds the shuffle in function 'fold_pshufb'
; into a simple load from constant pool.
define <2 x i64> @fold_pshufb() {
; CHECK-LABEL: fold_pshufb:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0]
; CHECK-NEXT: retq
entry:
@@ -23,7 +23,7 @@ entry:
define <2 x i64> @pr24562() {
; CHECK-LABEL: pr24562:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 5f85975fdb5c..792bbbed52e1 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX3
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX1-LABEL: load_factorf64_4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rdi), %ymm0
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
@@ -22,7 +22,7 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factorf64_4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vmovupd 64(%rdi), %ymm2
@@ -50,7 +50,7 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX1-LABEL: load_factorf64_2:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rdi), %ymm0
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
@@ -65,7 +65,7 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factorf64_2:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vmovupd 64(%rdi), %ymm2
@@ -87,7 +87,7 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX1-LABEL: load_factorf64_1:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rdi), %ymm0
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
@@ -97,7 +97,7 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factorf64_1:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
; AVX-NEXT: vmovupd 32(%rdi), %ymm1
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
@@ -114,11 +114,11 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-LABEL: load_factori64_4:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovupd (%rdi), %ymm0
-; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
-; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
-; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovups (%rdi), %ymm0
+; AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; AVX1-NEXT: vmovups 64(%rdi), %ymm2
+; AVX1-NEXT: vmovups 96(%rdi), %ymm3
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
@@ -141,7 +141,7 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factori64_4:
-; AVX: # BB#0:
+; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %ymm0
; AVX-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX-NEXT: vmovdqu 64(%rdi), %ymm2
@@ -171,7 +171,7 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) {
; AVX1-LABEL: store_factorf64_4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
@@ -180,29 +180,46 @@ define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x doubl
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-NEXT: vmovupd %ymm0, 96(%rdi)
-; AVX1-NEXT: vmovupd %ymm3, 64(%rdi)
-; AVX1-NEXT: vmovupd %ymm4, 32(%rdi)
-; AVX1-NEXT: vmovupd %ymm2, (%rdi)
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm4, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX-LABEL: store_factorf64_4:
-; AVX: # BB#0:
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmovupd %ymm0, 96(%rdi)
-; AVX-NEXT: vmovupd %ymm3, 64(%rdi)
-; AVX-NEXT: vmovupd %ymm4, 32(%rdi)
-; AVX-NEXT: vmovupd %ymm2, (%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: store_factorf64_4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovups %ymm3, 64(%rdi)
+; AVX2-NEXT: vmovups %ymm4, 32(%rdi)
+; AVX2-NEXT: vmovups %ymm2, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_factorf64_4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
+; AVX512-NEXT: vmovups %zmm1, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x double> %v2, <4 x double> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x double> %s0, <8 x double> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -212,7 +229,7 @@ define void @store_factorf64_4(<16 x double>* %ptr, <4 x double> %v0, <4 x doubl
define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) {
; AVX1-LABEL: store_factori64_4:
-; AVX1: # BB#0:
+; AVX1: # %bb.0:
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
@@ -221,29 +238,46 @@ define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX1-NEXT: vmovupd %ymm0, 96(%rdi)
-; AVX1-NEXT: vmovupd %ymm3, 64(%rdi)
-; AVX1-NEXT: vmovupd %ymm4, 32(%rdi)
-; AVX1-NEXT: vmovupd %ymm2, (%rdi)
+; AVX1-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm3, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm4, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm2, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX-LABEL: store_factori64_4:
-; AVX: # BB#0:
-; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5
-; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
-; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
-; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmovdqu %ymm0, 96(%rdi)
-; AVX-NEXT: vmovdqu %ymm3, 64(%rdi)
-; AVX-NEXT: vmovdqu %ymm4, 32(%rdi)
-; AVX-NEXT: vmovdqu %ymm2, (%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: store_factori64_4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX2-NEXT: vmovups %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovups %ymm3, 64(%rdi)
+; AVX2-NEXT: vmovups %ymm4, 32(%rdi)
+; AVX2-NEXT: vmovups %ymm2, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: store_factori64_4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX512-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX512-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; AVX512-NEXT: vinsertf64x4 $1, %ymm4, %zmm2, %zmm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-NEXT: vmovups %zmm0, 64(%rdi)
+; AVX512-NEXT: vmovups %zmm1, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -254,111 +288,83 @@ define void @store_factori64_4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <
define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32 x i8> %x3, <32 x i8> %x4, <128 x i8>* %p) {
; AVX1-LABEL: interleaved_store_vf32_i8_stride4:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm5
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0,65535,0]
-; AVX1-NEXT: vandnps %ymm5, %ymm4, %ymm5
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vandps %ymm4, %ymm6, %ymm6
-; AVX1-NEXT: vorps %ymm5, %ymm6, %ymm8
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT: vandnps %ymm6, %ymm4, %ymm6
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5
-; AVX1-NEXT: vandps %ymm4, %ymm5, %ymm5
-; AVX1-NEXT: vorps %ymm6, %ymm5, %ymm9
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
-; AVX1-NEXT: vandnps %ymm5, %ymm4, %ymm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vandps %ymm4, %ymm6, %ymm6
-; AVX1-NEXT: vorps %ymm5, %ymm6, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm5[2,3],ymm0[2,3]
; AVX1-NEXT: vmovaps %ymm0, 96(%rdi)
-; AVX1-NEXT: vmovaps %ymm5, 64(%rdi)
-; AVX1-NEXT: vmovaps %ymm9, 32(%rdi)
-; AVX1-NEXT: vmovaps %ymm8, (%rdi)
+; AVX1-NEXT: vmovaps %ymm1, 64(%rdi)
+; AVX1-NEXT: vmovaps %ymm2, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm4, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
-; AVX-LABEL: interleaved_store_vf32_i8_stride4:
-; AVX: # BB#0:
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; AVX-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
-; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
-; AVX-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
-; AVX-NEXT: vinserti128 $1, %xmm4, %ymm7, %ymm4
-; AVX-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7],ymm4[8],ymm6[9],ymm4[10],ymm6[11],ymm4[12],ymm6[13],ymm4[14],ymm6[15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX-NEXT: vmovdqa %ymm0, 96(%rdi)
-; AVX-NEXT: vmovdqa %ymm4, 64(%rdi)
-; AVX-NEXT: vmovdqa %ymm5, 32(%rdi)
-; AVX-NEXT: vmovdqa %ymm8, (%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: interleaved_store_vf32_i8_stride4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovdqa %ymm1, 64(%rdi)
+; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf32_i8_stride4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, 64(%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm2, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%v1 = shufflevector <32 x i8> %x1, <32 x i8> %x2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%v2 = shufflevector <32 x i8> %x3, <32 x i8> %x4, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%interleaved.vec = shufflevector <64 x i8> %v1, <64 x i8> %v2, <128 x i32> <i32 0, i32 32, i32 64, i32 96, i32 1, i32 33, i32 65, i32 97, i32 2, i32 34, i32 66, i32 98, i32 3, i32 35, i32 67, i32 99, i32 4, i32 36, i32 68, i32 100, i32 5, i32 37, i32 69, i32 101, i32 6, i32 38, i32 70, i32 102, i32 7, i32 39, i32 71, i32 103, i32 8, i32 40, i32 72, i32 104, i32 9, i32 41, i32 73, i32 105, i32 10, i32 42, i32 74, i32 106, i32 11, i32 43, i32 75, i32 107, i32 12, i32 44, i32 76, i32 108, i32 13, i32 45, i32 77, i32 109, i32 14, i32 46, i32 78, i32 110, i32 15, i32 47, i32 79, i32 111, i32 16, i32 48, i32 80, i32 112, i32 17, i32 49, i32 81, i32 113, i32 18, i32 50, i32 82, i32 114, i32 19, i32 51, i32 83, i32 115, i32 20, i32 52, i32 84, i32 116, i32 21, i32 53, i32 85, i32 117, i32 22, i32 54, i32 86, i32 118, i32 23, i32 55, i32 87, i32 119, i32 24, i32 56, i32 88, i32 120, i32 25, i32 57, i32 89, i32 121, i32 26, i32 58, i32 90, i32 122, i32 27, i32 59, i32 91, i32 123, i32 28, i32 60, i32 92, i32 124, i32 29, i32 61, i32 93, i32 125, i32 30, i32 62, i32 94, i32 126, i32 31, i32 63, i32 95, i32 127>
@@ -366,3 +372,1525 @@ define void @interleaved_store_vf32_i8_stride4(<32 x i8> %x1, <32 x i8> %x2, <32
ret void
}
+define void @interleaved_store_vf16_i8_stride4(<16 x i8> %x1, <16 x i8> %x2, <16 x i8> %x3, <16 x i8> %x4, <64 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf16_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm1, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_store_vf16_i8_stride4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm1, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf16_i8_stride4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+%v1 = shufflevector <16 x i8> %x1, <16 x i8> %x2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+%v2 = shufflevector <16 x i8> %x3, <16 x i8> %x4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+%interleaved.vec = shufflevector <32 x i8> %v1, <32 x i8> %v2, <64 x i32> <i32 0,i32 16,i32 32,i32 48,i32 1,i32 17,i32 33,i32 49,i32 2,i32 18,i32 34,i32 50,i32 3,i32 19,i32 35,i32 51,i32 4,i32 20,i32 36,i32 52,i32 5,i32 21,i32 37,i32 53,i32 6,i32 22,i32 38,i32 54,i32 7,i32 23,i32 39,i32 55,i32 8,i32 24,i32 40,i32 56,i32 9,i32 25,i32 41,i32 57,i32 10,i32 26,i32 42,i32 58,i32 11,i32 27,i32 43,i32 59,i32 12,i32 28,i32 44,i32 60,i32 13,i32 29,i32 45,i32 61,i32 14,i32 30,i32 46,i32 62,i32 15,i32 31,i32 47,i32 63>
+store <64 x i8> %interleaved.vec, <64 x i8>* %p
+ret void
+}
+
+define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
+; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_load_vf8_i8_stride4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm3
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
+; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX-NEXT: vpaddw %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
+ %v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+ %v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+ %v3 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+ %v4 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+
+ %add1 = add <8 x i8> %v1, %v2
+ %add2 = add <8 x i8> %v4, %v3
+ %add3 = mul <8 x i8> %add1, %add2
+ ret <8 x i8> %add3
+}
+
+define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
+; AVX1-LABEL: interleaved_load_vf16_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm2
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_load_vf16_i8_stride4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm6
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm6
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm4, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vpcmpeqb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vpsllw $7, %xmm3, %xmm1
+; AVX512-NEXT: vpmovb2m %zmm1, %k0
+; AVX512-NEXT: vpsllw $7, %xmm0, %xmm0
+; AVX512-NEXT: vpmovb2m %zmm0, %k1
+; AVX512-NEXT: kxnorw %k1, %k0, %k0
+; AVX512-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-NEXT: # kill: def %xmm0 killed %xmm0 killed %zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %wide.vec = load <64 x i8>, <64 x i8>* %ptr
+ %v1 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+ %v2 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+ %v3 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+ %v4 = shufflevector <64 x i8> %wide.vec, <64 x i8> undef, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+
+ %cmp1 = icmp eq <16 x i8> %v1, %v2
+ %cmp2 = icmp eq <16 x i8> %v3, %v4
+ %res = icmp eq <16 x i1> %cmp1, %cmp2
+
+ ret <16 x i1> %res
+}
+
+define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
+; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm10
+; AVX1-NEXT: vmovdqa 32(%rdi), %ymm13
+; AVX1-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX1-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
+; AVX1-NEXT: vpshufb %xmm6, %xmm11, %xmm5
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm7
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm12
+; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm4
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm8
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm14
+; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm5
+; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm6
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm6
+; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm4
+; AVX1-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm5, %xmm12, %xmm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm4
+; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm4
+; AVX1-NEXT: vpshufb %xmm5, %xmm10, %xmm5
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm7
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm5
+; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm4
+; AVX1-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT: vpshufb %xmm1, %xmm14, %xmm3
+; AVX1-NEXT: vpshufb %xmm1, %xmm13, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX1-NEXT: vpcmpeqb %xmm9, %xmm8, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [72340172838076673,72340172838076673]
+; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm3
+; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_load_vf32_i8_stride4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm3
+; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm10
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],xmm4[2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm12
+; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm3
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm13
+; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm6
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6
+; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm8
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm2
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
+; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm4
+; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm10, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpand %ymm1, %ymm8, %ymm2
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm9
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm3
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm5
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0,1],xmm4[2,3]
+; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm5
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm11
+; AVX512-NEXT: vpshufb %xmm6, %xmm11, %xmm3
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm12
+; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm13
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm4
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm5
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpcmpeqb %ymm2, %ymm8, %ymm8
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm5
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpsllw $7, %ymm8, %ymm1
+; AVX512-NEXT: vpmovb2m %zmm1, %k0
+; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
+; AVX512-NEXT: vpmovb2m %zmm0, %k1
+; AVX512-NEXT: kxnord %k1, %k0, %k0
+; AVX512-NEXT: vpmovm2b %k0, %zmm0
+; AVX512-NEXT: # kill: def %ymm0 killed %ymm0 killed %zmm0
+; AVX512-NEXT: retq
+ %wide.vec = load <128 x i8>, <128 x i8>* %ptr
+ %v1 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
+
+ %v2 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
+
+ %v3 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
+
+ %v4 = shufflevector <128 x i8> %wide.vec, <128 x i8> undef, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
+
+ %cmp1 = icmp eq <32 x i8> %v1, %v2
+ %cmp2 = icmp eq <32 x i8> %v3, %v4
+ %res = icmp eq <32 x i1> %cmp1, %cmp2
+
+ ret <32 x i1> %res
+}
+
+define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i8> %x3, <8 x i8> %x4, <32 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf8_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_store_vf8_i8_stride4:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: vmovdqa %ymm0, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+%v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
+store <32 x i8> %interleaved.vec, <32 x i8>* %p
+ret void
+}
+
+define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm5
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm6
+; AVX1-NEXT: vpor %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpaddb %xmm9, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_load_vf32_i8_stride3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
+; AVX-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %wide.vec = load <96 x i8>, <96 x i8>* %ptr
+ %v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
+ %v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
+ %v3 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47,i32 50,i32 53,i32 56,i32 59,i32 62,i32 65,i32 68,i32 71,i32 74,i32 77,i32 80,i32 83,i32 86,i32 89,i32 92,i32 95>
+ %add1 = add <32 x i8> %v1, %v2
+ %add2 = add <32 x i8> %v3, %add1
+ ret <32 x i8> %add2
+}
+
+define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_load_vf16_i8_stride3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
+; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
+; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+ %wide.vec = load <48 x i8>, <48 x i8>* %ptr
+ %v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45>
+ %v2 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46>
+ %v3 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23,i32 26,i32 29,i32 32,i32 35,i32 38,i32 41,i32 44,i32 47>
+ %add1 = add <16 x i8> %v1, %v2
+ %add2 = add <16 x i8> %v3, %add1
+ ret <16 x i8> %add2
+}
+
+define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf8_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa (%rdi), %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_load_vf8_i8_stride3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %ymm0
+; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %wide.vec = load <24 x i8>, <24 x i8>* %ptr
+ %v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
+ %v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
+ %v3 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 2,i32 5,i32 8,i32 11,i32 14,i32 17,i32 20,i32 23>
+ %add1 = add <8 x i8> %v1, %v2
+ %add2 = add <8 x i8> %v3, %add1
+ ret <8 x i8> %add2
+}
+
+define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf8_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
+; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, 16(%rdi)
+; AVX1-NEXT: vmovdqu %xmm2, (%rdi)
+; AVX1-NEXT: retq
+;
+; AVX-LABEL: interleaved_store_vf8_i8_stride3:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovq %xmm0, 16(%rdi)
+; AVX-NEXT: vmovdqu %xmm2, (%rdi)
+; AVX-NEXT: retq
+%1 = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+%2 = shufflevector <8 x i8> %c, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> %2, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+store <24 x i8> %interleaved.vec, <24 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX1-NEXT: vmovdqu %xmm1, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX2-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT: vmovdqu %xmm1, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf16_i8_stride3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [128,0,128,128,1,128,128,2,128,128,3,128,128,4,128,128]
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10]
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+%1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+%2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+store <48 x i8> %interleaved.vec, <48 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <96 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf32_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm1, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_store_vf32_i8_stride3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm3, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf32_i8_stride3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1
+; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+%1 = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+%2 = shufflevector <32 x i8> %c, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> %2, <96 x i32> <i32 0, i32 32, i32 64, i32 1, i32 33, i32 65, i32 2, i32 34, i32 66, i32 3, i32 35, i32 67, i32 4, i32 36, i32 68, i32 5, i32 37, i32 69, i32 6, i32 38, i32 70, i32 7, i32 39, i32 71, i32 8, i32 40, i32 72, i32 9, i32 41, i32 73, i32 10, i32 42, i32 74, i32 11, i32 43, i32 75, i32 12, i32 44, i32 76, i32 13, i32 45, i32 77, i32 14, i32 46, i32 78, i32 15, i32 47, i32 79, i32 16, i32 48, i32 80, i32 17, i32 49, i32 81, i32 18, i32 50, i32 82, i32 19, i32 51, i32 83, i32 20, i32 52, i32 84, i32 21, i32 53, i32 85, i32 22, i32 54, i32 86, i32 23, i32 55, i32 87, i32 24, i32 56, i32 88, i32 25, i32 57, i32 89, i32 26, i32 58, i32 90, i32 27, i32 59, i32 91, i32 28, i32 60, i32 92, i32 29, i32 61, i32 93, i32 30, i32 62, i32 94, i32 31, i32 63, i32 95>
+store <96 x i8> %interleaved.vec, <96 x i8>* %p, align 1
+ret void
+}
+
+define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <192 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf64_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
+; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm14, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm15, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm7
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4
+; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX1-NEXT: vmovups %ymm3, 160(%rdi)
+; AVX1-NEXT: vmovups %ymm4, 128(%rdi)
+; AVX1-NEXT: vmovups %ymm6, 96(%rdi)
+; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
+; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm6
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm2
+; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi)
+; AVX2-NEXT: vmovdqu %ymm4, 128(%rdi)
+; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi)
+; AVX2-NEXT: vmovdqu %ymm5, 32(%rdi)
+; AVX2-NEXT: vmovdqu %ymm2, 96(%rdi)
+; AVX2-NEXT: vmovdqu %ymm6, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf64_i8_stride3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21,38,39,40,41,42,43,44,45,46,47,32,33,34,35,36,37,54,55,56,57,58,59,60,61,62,63,48,49,50,51,52,53]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,42,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm2[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm2[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm2[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm2[48,49,50,51,52]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm2[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm2[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm2[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm2[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm2 = zmm3[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4],zmm3[21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17,18,19,20],zmm3[37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33,34,35,36],zmm3[53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49,50,51,52]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1,2,3,4],zmm0[21,22,23,24,25,26,27,28,29,30,31],zmm1[16,17,18,19,20],zmm0[37,38,39,40,41,42,43,44,45,46,47],zmm1[32,33,34,35,36],zmm0[53,54,55,56,57,58,59,60,61,62,63],zmm1[48,49,50,51,52]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm3[0,1,2,3,4],zmm1[21,22,23,24,25,26,27,28,29,30,31],zmm3[16,17,18,19,20],zmm1[37,38,39,40,41,42,43,44,45,46,47],zmm3[32,33,34,35,36],zmm1[53,54,55,56,57,58,59,60,61,62,63],zmm3[48,49,50,51,52]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5]
+; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
+; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm7
+; AVX512-NEXT: vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
+; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqu32 %zmm0, 128(%rdi)
+; AVX512-NEXT: vmovdqu32 %zmm3, 64(%rdi)
+; AVX512-NEXT: vmovdqu32 %zmm1, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+%1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+%2 = shufflevector <64 x i8> %c, <64 x i8> undef, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+%3 = shufflevector <128 x i8> %1, <128 x i8> %2, <192 x i32> <i32 0, i32 64, i32 128, i32 1, i32 65, i32 129, i32 2, i32 66, i32 130, i32 3, i32 67, i32 131, i32 4, i32 68, i32 132, i32 5, i32 69, i32 133, i32 6, i32 70, i32 134, i32 7, i32 71, i32 135, i32 8, i32 72, i32 136, i32 9, i32 73, i32 137, i32 10, i32 74, i32 138, i32 11, i32 75, i32 139, i32 12, i32 76, i32 140, i32 13, i32 77, i32 141, i32 14, i32 78, i32 142, i32 15, i32 79, i32 143, i32 16, i32 80, i32 144, i32 17, i32 81, i32 145, i32 18, i32 82, i32 146, i32 19, i32 83, i32 147, i32 20, i32 84, i32 148, i32 21, i32 85, i32 149, i32 22, i32 86, i32 150, i32 23, i32 87, i32 151, i32 24, i32 88, i32 152, i32 25, i32 89, i32 153, i32 26, i32 90, i32 154, i32 27, i32 91, i32 155, i32 28, i32 92, i32 156, i32 29, i32 93, i32 157, i32 30, i32 94, i32 158, i32 31, i32 95, i32 159, i32 32, i32 96, i32 160, i32 33, i32 97, i32 161, i32 34, i32 98, i32 162, i32 35, i32 99, i32 163, i32 36, i32 100, i32 164, i32 37, i32 101, i32 165, i32 38, i32 102, i32 166, i32 39, i32 103, i32 167, i32 40, i32 104, i32 168, i32 41, i32 105, i32 169, i32 42, i32 106, i32 170, i32 43, i32 107, i32 171, i32 44, i32 108, i32 172, i32 45, i32 109, i32 173, i32 46, i32 110, i32 174, i32 47, i32 111, i32 175, i32 48, i32 112, i32 176, i32 49, i32 113, i32 177, i32 50, i32 114, i32 178, i32 51, i32 115, i32 179, i32 52, i32 116, i32 180, i32 53, i32 117, i32 181, i32 54, i32 118, i32 182, i32 55, i32 119, i32 183, i32 56, i32 120, i32 184, i32 57, i32 121, i32 185, i32 58, i32 122, i32 186, i32 59, i32 123, i32 187, i32 60, i32 124, i32 188, i32 61, i32 125, i32 189, i32 62, i32 126, i32 190, i32 63, i32 127, i32 191>
+store <192 x i8> %3, <192 x i8>* %p, align 1
+ret void
+}
+
+define <64 x i8> @interleaved_load_vf64_i8_stride3(<192 x i8>* %ptr){
+; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm11
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm10
+; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8
+; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
+; AVX1-NEXT: vmovdqu 64(%rdi), %xmm12
+; AVX1-NEXT: vmovdqu 80(%rdi), %xmm9
+; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6
+; AVX1-NEXT: vmovdqu 112(%rdi), %xmm14
+; AVX1-NEXT: vmovdqu 128(%rdi), %xmm13
+; AVX1-NEXT: vmovdqu 144(%rdi), %xmm5
+; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1
+; AVX1-NEXT: vmovdqu 176(%rdi), %xmm15
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm4, %xmm11, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm4, %xmm10, %xmm11
+; AVX1-NEXT: vpshufb %xmm4, %xmm12, %xmm12
+; AVX1-NEXT: vpshufb %xmm4, %xmm14, %xmm14
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm4, %xmm13, %xmm0
+; AVX1-NEXT: vpshufb %xmm4, %xmm15, %xmm7
+; AVX1-NEXT: vpshufb %xmm4, %xmm8, %xmm13
+; AVX1-NEXT: vpshufb %xmm4, %xmm9, %xmm4
+; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm4[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm13[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm7[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm14[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm14
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm12
+; AVX1-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX1-NEXT: vandnps %ymm12, %ymm13, %ymm12
+; AVX1-NEXT: vandps %ymm13, %ymm14, %ymm14
+; AVX1-NEXT: vorps %ymm12, %ymm14, %ymm12
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm14
+; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vandnps %ymm14, %ymm13, %ymm14
+; AVX1-NEXT: vandps %ymm13, %ymm7, %ymm7
+; AVX1-NEXT: vorps %ymm14, %ymm7, %ymm13
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,128,128,11,12,13,14,15,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm4
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm10[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm4
+; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm4
+; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm5
+; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm5
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm0
+; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm0
+; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0
+; AVX1-NEXT: vpaddb -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm11, %xmm12, %xmm3
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vpaddb %xmm6, %xmm13, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqu 32(%rdi), %xmm2
+; AVX2-NEXT: vmovdqu 96(%rdi), %xmm3
+; AVX2-NEXT: vmovdqu 112(%rdi), %xmm4
+; AVX2-NEXT: vmovdqu 128(%rdi), %xmm5
+; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
+; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm5
+; AVX2-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpaddb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm3, %ymm1
+; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_load_vf64_i8_stride3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqu 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqu 96(%rdi), %xmm3
+; AVX512-NEXT: vmovdqu 112(%rdi), %xmm4
+; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5
+; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
+; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
+; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
+; AVX512-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vpshufb %zmm3, %zmm2, %zmm2
+; AVX512-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
+; AVX512-NEXT: kmovq %rax, %k1
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm5
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm6
+; AVX512-NEXT: vpalignr {{.*#+}} zmm0 {%k1} = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
+; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2
+; AVX512-NEXT: vpblendvb %ymm4, %ymm2, %ymm6, %ymm2
+; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpaddb %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: retq
+%wide.vec = load <192 x i8>, <192 x i8>* %ptr, align 1
+%v1 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
+%v2 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
+%v3 = shufflevector <192 x i8> %wide.vec, <192 x i8> undef, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
+%add1 = add <64 x i8> %v1, %v2
+%add2 = add <64 x i8> %v3, %add1
+ret <64 x i8> %add2
+}
+
+define void @interleaved_store_vf64_i8_stride4(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c,<64 x i8> %d, <256 x i8>* %p) {
+; AVX1-LABEL: interleaved_store_vf64_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: subq $24, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 32
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX1-NEXT: vmovdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm12
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; AVX1-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
+; AVX1-NEXT: vmovdqa %xmm8, %xmm1
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm0
+; AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm11
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
+; AVX1-NEXT: vmovdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm9, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm9, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm15, %ymm8, %ymm6
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm11[2,3],ymm6[2,3]
+; AVX1-NEXT: vinsertf128 $1, -{{[0-9]+}}(%rsp), %ymm4, %ymm6 # 16-byte Folded Reload
+; AVX1-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm3 # 32-byte Reload
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm0
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm3[2,3],ymm6[2,3]
+; AVX1-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm3
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm7
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[2,3]
+; AVX1-NEXT: vmovaps %ymm2, 32(%rdi)
+; AVX1-NEXT: vmovaps %ymm3, 224(%rdi)
+; AVX1-NEXT: vmovaps %ymm6, 192(%rdi)
+; AVX1-NEXT: vmovaps %ymm7, 160(%rdi)
+; AVX1-NEXT: vmovaps %ymm0, 128(%rdi)
+; AVX1-NEXT: vmovaps %ymm1, 96(%rdi)
+; AVX1-NEXT: vmovaps %ymm9, 64(%rdi)
+; AVX1-NEXT: vmovaps %ymm8, (%rdi)
+; AVX1-NEXT: addq $24, %rsp
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_store_vf64_i8_stride4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[4],ymm6[4],ymm4[5],ymm6[5],ymm4[6],ymm6[6],ymm4[7],ymm6[7],ymm4[16],ymm6[16],ymm4[17],ymm6[17],ymm4[18],ymm6[18],ymm4[19],ymm6[19],ymm4[20],ymm6[20],ymm4[21],ymm6[21],ymm4[22],ymm6[22],ymm4[23],ymm6[23]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11],ymm4[12],ymm6[12],ymm4[13],ymm6[13],ymm4[14],ymm6[14],ymm4[15],ymm6[15],ymm4[24],ymm6[24],ymm4[25],ymm6[25],ymm4[26],ymm6[26],ymm4[27],ymm6[27],ymm4[28],ymm6[28],ymm4[29],ymm6[29],ymm4[30],ymm6[30],ymm4[31],ymm6[31]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm3[0],ymm9[1],ymm3[1],ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[8],ymm3[8],ymm9[9],ymm3[9],ymm9[10],ymm3[10],ymm9[11],ymm3[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm3[4],ymm9[5],ymm3[5],ymm9[6],ymm3[6],ymm9[7],ymm3[7],ymm9[12],ymm3[12],ymm9[13],ymm3[13],ymm9[14],ymm3[14],ymm9[15],ymm3[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
+; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm7, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm9, %ymm5
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm2[2,3]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],ymm0[2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm6, %ymm7
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm8, %ymm9
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm6[2,3],ymm3[2,3]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm8[2,3],ymm1[2,3]
+; AVX2-NEXT: vmovdqa %ymm1, 224(%rdi)
+; AVX2-NEXT: vmovdqa %ymm3, 192(%rdi)
+; AVX2-NEXT: vmovdqa %ymm0, 96(%rdi)
+; AVX2-NEXT: vmovdqa %ymm2, 64(%rdi)
+; AVX2-NEXT: vmovdqa %ymm9, 160(%rdi)
+; AVX2-NEXT: vmovdqa %ymm7, 128(%rdi)
+; AVX2-NEXT: vmovdqa %ymm5, 32(%rdi)
+; AVX2-NEXT: vmovdqa %ymm4, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_store_vf64_i8_stride4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm4 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm2[0],zmm3[0],zmm2[1],zmm3[1],zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[4],zmm3[4],zmm2[5],zmm3[5],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[16],zmm3[16],zmm2[17],zmm3[17],zmm2[18],zmm3[18],zmm2[19],zmm3[19],zmm2[20],zmm3[20],zmm2[21],zmm3[21],zmm2[22],zmm3[22],zmm2[23],zmm3[23],zmm2[32],zmm3[32],zmm2[33],zmm3[33],zmm2[34],zmm3[34],zmm2[35],zmm3[35],zmm2[36],zmm3[36],zmm2[37],zmm3[37],zmm2[38],zmm3[38],zmm2[39],zmm3[39],zmm2[48],zmm3[48],zmm2[49],zmm3[49],zmm2[50],zmm3[50],zmm2[51],zmm3[51],zmm2[52],zmm3[52],zmm2[53],zmm3[53],zmm2[54],zmm3[54],zmm2[55],zmm3[55]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm2[8],zmm3[8],zmm2[9],zmm3[9],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[12],zmm3[12],zmm2[13],zmm3[13],zmm2[14],zmm3[14],zmm2[15],zmm3[15],zmm2[24],zmm3[24],zmm2[25],zmm3[25],zmm2[26],zmm3[26],zmm2[27],zmm3[27],zmm2[28],zmm3[28],zmm2[29],zmm3[29],zmm2[30],zmm3[30],zmm2[31],zmm3[31],zmm2[40],zmm3[40],zmm2[41],zmm3[41],zmm2[42],zmm3[42],zmm2[43],zmm3[43],zmm2[44],zmm3[44],zmm2[45],zmm3[45],zmm2[46],zmm3[46],zmm2[47],zmm3[47],zmm2[56],zmm3[56],zmm2[57],zmm3[57],zmm2[58],zmm3[58],zmm2[59],zmm3[59],zmm2[60],zmm3[60],zmm2[61],zmm3[61],zmm2[62],zmm3[62],zmm2[63],zmm3[63]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm3 = zmm4[0],zmm1[0],zmm4[1],zmm1[1],zmm4[2],zmm1[2],zmm4[3],zmm1[3],zmm4[8],zmm1[8],zmm4[9],zmm1[9],zmm4[10],zmm1[10],zmm4[11],zmm1[11],zmm4[16],zmm1[16],zmm4[17],zmm1[17],zmm4[18],zmm1[18],zmm4[19],zmm1[19],zmm4[24],zmm1[24],zmm4[25],zmm1[25],zmm4[26],zmm1[26],zmm4[27],zmm1[27]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm1 = zmm4[4],zmm1[4],zmm4[5],zmm1[5],zmm4[6],zmm1[6],zmm4[7],zmm1[7],zmm4[12],zmm1[12],zmm4[13],zmm1[13],zmm4[14],zmm1[14],zmm4[15],zmm1[15],zmm4[20],zmm1[20],zmm4[21],zmm1[21],zmm4[22],zmm1[22],zmm4[23],zmm1[23],zmm4[28],zmm1[28],zmm4[29],zmm1[29],zmm4[30],zmm1[30],zmm4[31],zmm1[31]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} zmm4 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm5
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm1[2,3]
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm4[2,3],ymm0[2,3]
+; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm8
+; AVX512-NEXT: vextracti64x4 $1, %zmm4, %ymm4
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm9
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[2,3]
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm4[2,3],ymm0[2,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm4
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqa32 %zmm0, 192(%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm3, 64(%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm4, 128(%rdi)
+; AVX512-NEXT: vmovdqa32 %zmm2, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+%1 = shufflevector <64 x i8> %a, <64 x i8> %b, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+%2 = shufflevector <64 x i8> %c, <64 x i8> %d, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
+%interleaved = shufflevector <128 x i8> %1, <128 x i8> %2, <256 x i32> <i32 0, i32 64, i32 128, i32 192, i32 1, i32 65, i32 129, i32 193, i32 2, i32 66, i32 130, i32 194, i32 3, i32 67, i32 131, i32 195, i32 4, i32 68, i32 132, i32 196, i32 5, i32 69, i32 133, i32 197, i32 6, i32 70, i32 134, i32 198, i32 7, i32 71, i32 135, i32 199, i32 8, i32 72, i32 136, i32 200, i32 9, i32 73, i32 137, i32 201, i32 10, i32 74, i32 138, i32 202, i32 11, i32 75, i32 139, i32 203, i32 12, i32 76, i32 140, i32 204, i32 13, i32 77, i32 141, i32 205, i32 14, i32 78, i32 142, i32 206, i32 15, i32 79, i32 143, i32 207, i32 16, i32 80, i32 144, i32 208, i32 17, i32 81, i32 145, i32 209, i32 18, i32 82, i32 146, i32 210, i32 19, i32 83, i32 147, i32 211, i32 20, i32 84, i32 148, i32 212, i32 21, i32 85, i32 149, i32 213, i32 22, i32 86, i32 150, i32 214, i32 23, i32 87, i32 151, i32 215, i32 24, i32 88, i32 152, i32 216, i32 25, i32 89, i32 153, i32 217, i32 26, i32 90, i32 154, i32 218, i32 27, i32 91, i32 155, i32 219, i32 28, i32 92, i32 156, i32 220, i32 29, i32 93, i32 157, i32 221, i32 30, i32 94, i32 158, i32 222, i32 31, i32 95, i32 159, i32 223, i32 32, i32 96, i32 160, i32 224, i32 33, i32 97, i32 161, i32 225, i32 34, i32 98, i32 162, i32 226, i32 35, i32 99, i32 163, i32 227, i32 36, i32 100, i32 164, i32 228, i32 37, i32 101, i32 165, i32 229, i32 38, i32 102, i32 166, i32 230, i32 39, i32 103, i32 167, i32 231, i32 40, i32 104, i32 168, i32 232, i32 41, i32 105, i32 169, i32 233, i32 42, i32 106, i32 170, i32 234, i32 43, i32 107, i32 171, i32 235, i32 44, i32 108, i32 172, i32 236, i32 45, i32 109, i32 173, i32 237, i32 46, i32 110, i32 174, i32 238, i32 47, i32 111, i32 175, i32 239, i32 48, i32 112, i32 176, i32 240, i32 49, i32 113, i32 177, i32 241, i32 50, i32 114, i32 178, i32 242, i32 51, i32 115, i32 179, i32 243, i32 52, i32 116, i32 180, i32 244, i32 53, i32 117, i32 181, i32 245, i32 54, i32 118, i32 182, i32 246, i32 55, i32 119, i32 183, i32 247, i32 56, i32 120, i32 184, i32 248, i32 57, i32 121, i32 185, i32 249, i32 58, i32 122, i32 186, i32 250, i32 59, i32 123, i32 187, i32 251, i32 60, i32 124, i32 188, i32 252, i32 61, i32 125, i32 189, i32 253, i32 62, i32 126, i32 190, i32 254, i32 63, i32 127, i32 191, i32 255>
+store <256 x i8> %interleaved, <256 x i8>* %p
+ret void
+}
diff --git a/test/CodeGen/X86/x86-interleaved-check.ll b/test/CodeGen/X86/x86-interleaved-check.ll
new file mode 100644
index 000000000000..0a77b868506e
--- /dev/null
+++ b/test/CodeGen/X86/x86-interleaved-check.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX
+
+define void @validate() {
+; AVX-LABEL: validate:
+; AVX: # %bb.0: # %entry
+entry:
+ %0 = bitcast i8 addrspace(1)* undef to <96 x i8> addrspace(1)*
+ %wide.vec = load <96 x i8>, <96 x i8> addrspace(1)* %0, align 1
+ %strided.vec = shufflevector <96 x i8> %wide.vec, <96 x i8> undef, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
+ unreachable
+}
+
diff --git a/test/CodeGen/X86/x86-interrupt_cc.ll b/test/CodeGen/X86/x86-interrupt_cc.ll
index b91b8fbfb76d..3251d7314689 100644
--- a/test/CodeGen/X86/x86-interrupt_cc.ll
+++ b/test/CodeGen/X86/x86-interrupt_cc.ll
@@ -1,7 +1,7 @@
; RUN: llc -verify-machineinstrs -mtriple=x86_64-apple-macosx -show-mc-encoding -mattr=+avx512f < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK64
; RUN: llc -verify-machineinstrs -mtriple=i386-apple-macosx -show-mc-encoding -mattr=+avx512f < %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK32
-; Make sure we spill the high numbered ZMM registers and K registers with the right encoding.
+; Make sure we spill the high numbered zmm registers and K registers with the right encoding.
; CHECK-LABEL: foo
; CHECK: kmovq %k7, {{.+}}
; CHECK64: encoding: [0xc4,0xe1,0xf8,0x91,0xbc,0x24,0x68,0x08,0x00,0x00]
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 3052a0f615eb..9c4cb671f4cd 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -3,20 +3,17 @@
;; In functions with 'no_caller_saved_registers' attribute, all registers should
;; be preserved except for registers used for passing/returning arguments.
-;; In the following function registers %RDI, %RSI and %XMM0 are used to store
-;; arguments %a0, %a1 and %b0 accordingally. The value is returned in %RAX.
+;; In the following function registers %rdi, %rsi and %xmm0 are used to store
+;; arguments %a0, %a1 and %b0 accordingly. The value is returned in %rax.
;; The above registers should not be preserved, however other registers
-;; (that are modified by the function) should be preserved (%RDX and %XMM1).
+;; (that are modified by the function) should be preserved (%rdx and %xmm1).
define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
; CHECK-LABEL: bar:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rdx
-; CHECK-NEXT: .Lcfi0:
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT: .Lcfi1:
; CHECK-NEXT: .cfi_offset %rdx, -16
-; CHECK-NEXT: .Lcfi2:
; CHECK-NEXT: .cfi_offset %xmm1, -32
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
@@ -30,7 +27,7 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
;; doesn't need to preserve registers except for the arguments passed
-;; to "bar" (%ESI, %EDI and %XMM0).
+;; to "bar" (%esi, %edi and %xmm0).
define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
; CHECK-LABEL: foo
; CHECK: movaps %xmm0, %xmm1
diff --git a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
index 26dd9d46641a..0c41c3ec6c11 100644
--- a/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
+++ b/test/CodeGen/X86/x86-setcc-int-to-fp-combine.ll
@@ -7,7 +7,7 @@ define <4 x float> @foo(<4 x float> %val, <4 x float> %test) nounwind {
; CHECK-NEXT: .long 1065353216 ## 0x3f800000
; CHECK-NEXT: .long 1065353216 ## 0x3f800000
; CHECK-LABEL: foo:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
@@ -27,7 +27,7 @@ define void @foo1(<4 x float> %val, <4 x float> %test, <4 x double>* %p) nounwin
; CHECK-NEXT: .long 1 ## 0x1
; CHECK-NEXT: .long 1 ## 0x1
; CHECK-LABEL: foo1:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -51,7 +51,7 @@ define void @foo2(<4 x float>* noalias %result) nounwind {
; CHECK-NEXT: .long 1086324736 ## float 6
; CHECK-NEXT: .long 1088421888 ## float 7
; CHECK-LABEL: foo2:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [4.000000e+00,5.000000e+00,6.000000e+00,7.000000e+00]
; CHECK-NEXT: movaps %xmm0, (%rdi)
; CHECK-NEXT: retq
@@ -69,7 +69,7 @@ define <4 x float> @foo3(<4 x float> %val, <4 x float> %test) nounwind {
; CHECK-NEXT: .long 1065353216 ## 0x3f800000
; CHECK-NEXT: .long 0 ## 0x0
; CHECK-LABEL: foo3:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: cmpeqps %xmm1, %xmm0
; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
@@ -88,7 +88,7 @@ define void @foo4(<4 x float>* noalias %result) nounwind {
; CHECK-NEXT: .long 1124073472 ## float 128
; CHECK-NEXT: .long 1132396544 ## float 255
; CHECK-LABEL: foo4:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.000000e+00,1.270000e+02,1.280000e+02,2.550000e+02]
; CHECK-NEXT: movaps %xmm0, (%rdi)
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index 8a51863bd1bc..f6191866edda 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -6,7 +6,7 @@
define <4 x i32> @shl4(<4 x i32> %A) nounwind {
; X32-LABEL: shl4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: pslld $2, %xmm1
; X32-NEXT: paddd %xmm0, %xmm0
@@ -14,7 +14,7 @@ define <4 x i32> @shl4(<4 x i32> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shl4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: pslld $2, %xmm1
; X64-NEXT: paddd %xmm0, %xmm0
@@ -29,7 +29,7 @@ entry:
define <4 x i32> @shr4(<4 x i32> %A) nounwind {
; X32-LABEL: shr4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psrld $2, %xmm1
; X32-NEXT: psrld $1, %xmm0
@@ -37,7 +37,7 @@ define <4 x i32> @shr4(<4 x i32> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shr4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrld $2, %xmm1
; X64-NEXT: psrld $1, %xmm0
@@ -52,7 +52,7 @@ entry:
define <4 x i32> @sra4(<4 x i32> %A) nounwind {
; X32-LABEL: sra4:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psrad $2, %xmm1
; X32-NEXT: psrad $1, %xmm0
@@ -60,7 +60,7 @@ define <4 x i32> @sra4(<4 x i32> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sra4:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrad $2, %xmm1
; X64-NEXT: psrad $1, %xmm0
@@ -75,7 +75,7 @@ entry:
define <2 x i64> @shl2(<2 x i64> %A) nounwind {
; X32-LABEL: shl2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psllq $2, %xmm1
; X32-NEXT: psllq $9, %xmm0
@@ -83,7 +83,7 @@ define <2 x i64> @shl2(<2 x i64> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shl2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psllq $2, %xmm1
; X64-NEXT: psllq $9, %xmm0
@@ -98,7 +98,7 @@ entry:
define <2 x i64> @shr2(<2 x i64> %A) nounwind {
; X32-LABEL: shr2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psrlq $8, %xmm1
; X32-NEXT: psrlq $1, %xmm0
@@ -106,7 +106,7 @@ define <2 x i64> @shr2(<2 x i64> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shr2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlq $8, %xmm1
; X64-NEXT: psrlq $1, %xmm0
@@ -122,7 +122,7 @@ entry:
define <8 x i16> @shl8(<8 x i16> %A) nounwind {
; X32-LABEL: shl8:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psllw $2, %xmm1
; X32-NEXT: paddw %xmm0, %xmm0
@@ -130,7 +130,7 @@ define <8 x i16> @shl8(<8 x i16> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shl8:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psllw $2, %xmm1
; X64-NEXT: paddw %xmm0, %xmm0
@@ -145,7 +145,7 @@ entry:
define <8 x i16> @shr8(<8 x i16> %A) nounwind {
; X32-LABEL: shr8:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psrlw $2, %xmm1
; X32-NEXT: psrlw $1, %xmm0
@@ -153,7 +153,7 @@ define <8 x i16> @shr8(<8 x i16> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shr8:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlw $2, %xmm1
; X64-NEXT: psrlw $1, %xmm0
@@ -168,7 +168,7 @@ entry:
define <8 x i16> @sra8(<8 x i16> %A) nounwind {
; X32-LABEL: sra8:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psraw $2, %xmm1
; X32-NEXT: psraw $1, %xmm0
@@ -176,7 +176,7 @@ define <8 x i16> @sra8(<8 x i16> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sra8:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psraw $2, %xmm1
; X64-NEXT: psraw $1, %xmm0
@@ -194,7 +194,7 @@ entry:
define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
; X32-LABEL: sll8_nosplat:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa {{.*#+}} xmm1 = [2,4,8,64,4,4,4,4]
; X32-NEXT: pmullw %xmm0, %xmm1
; X32-NEXT: pmullw {{\.LCPI.*}}, %xmm0
@@ -202,7 +202,7 @@ define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sll8_nosplat:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa {{.*#+}} xmm1 = [2,4,8,64,4,4,4,4]
; X64-NEXT: pmullw %xmm0, %xmm1
; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
@@ -218,7 +218,7 @@ entry:
define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
; X32-LABEL: shr2_nosplat:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm2
; X32-NEXT: psrlq $8, %xmm2
; X32-NEXT: movdqa %xmm0, %xmm1
@@ -230,11 +230,11 @@ define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shr2_nosplat:
-; X64: # BB#0: # %entry
-; X64-NEXT: movdqa %xmm0, %xmm1
-; X64-NEXT: psrlq $1, %xmm1
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm2
; X64-NEXT: psrlq $8, %xmm2
+; X64-NEXT: movdqa %xmm0, %xmm1
+; X64-NEXT: psrlq $1, %xmm1
; X64-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; X64-NEXT: xorpd %xmm0, %xmm1
@@ -252,7 +252,7 @@ entry:
define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
; X32-LABEL: shl2_other:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psllq $2, %xmm1
; X32-NEXT: psllq $9, %xmm0
@@ -260,7 +260,7 @@ define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shl2_other:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psllq $2, %xmm1
; X64-NEXT: psllq $9, %xmm0
@@ -275,7 +275,7 @@ entry:
define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
; X32-LABEL: shr2_other:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-NEXT: movdqa %xmm0, %xmm1
; X32-NEXT: psrlq $8, %xmm1
@@ -284,7 +284,7 @@ define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: shr2_other:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: movdqa %xmm0, %xmm1
; X64-NEXT: psrlq $8, %xmm1
@@ -300,13 +300,13 @@ entry:
define <16 x i8> @shl9(<16 x i8> %A) nounwind {
; X32-LABEL: shl9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psllw $3, %xmm0
; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: shl9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psllw $3, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -316,13 +316,13 @@ define <16 x i8> @shl9(<16 x i8> %A) nounwind {
define <16 x i8> @shr9(<16 x i8> %A) nounwind {
; X32-LABEL: shr9:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlw $3, %xmm0
; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: shr9:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlw $3, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: retq
@@ -332,14 +332,14 @@ define <16 x i8> @shr9(<16 x i8> %A) nounwind {
define <16 x i8> @sra_v16i8_7(<16 x i8> %A) nounwind {
; X32-LABEL: sra_v16i8_7:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: pxor %xmm1, %xmm1
; X32-NEXT: pcmpgtb %xmm0, %xmm1
; X32-NEXT: movdqa %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: sra_v16i8_7:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: pxor %xmm1, %xmm1
; X64-NEXT: pcmpgtb %xmm0, %xmm1
; X64-NEXT: movdqa %xmm1, %xmm0
@@ -350,7 +350,7 @@ define <16 x i8> @sra_v16i8_7(<16 x i8> %A) nounwind {
define <16 x i8> @sra_v16i8(<16 x i8> %A) nounwind {
; X32-LABEL: sra_v16i8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: psrlw $3, %xmm0
; X32-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
@@ -359,7 +359,7 @@ define <16 x i8> @sra_v16i8(<16 x i8> %A) nounwind {
; X32-NEXT: retl
;
; X64-LABEL: sra_v16i8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: psrlw $3, %xmm0
; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 519f0d0924e3..9d856c6442bb 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -989,16 +989,16 @@ attributes #4 = { "no-frame-pointer-elim"="true" }
; looking for the nearest common post-dominator of an "unreachable" block.
; CHECK-LABEL: infiniteLoopNoSuccessor:
-; CHECK: ## BB#0:
+; CHECK: ## %bb.0:
; Make sure the prologue happens in the entry block.
; CHECK-NEXT: pushq %rbp
; ...
; Make sure we don't shrink-wrap.
-; CHECK: ## BB#1
+; CHECK: ## %bb.1
; CHECK-NOT: pushq %rbp
; ...
; Make sure the epilogue happens in the exit block.
-; CHECK: ## BB#5
+; CHECK: ## %bb.5
; CHECK: popq %rbp
; CHECK-NEXT: retq
define void @infiniteLoopNoSuccessor() #5 {
diff --git a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
index d3a12862a9e4..e7b3a5b49903 100644
--- a/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
+++ b/test/CodeGen/X86/x86-upgrade-avx-vbroadcast.ll
@@ -9,7 +9,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define <4 x float> @test_mm_broadcast_ss(float* readonly %__a){
; CHECK-LABEL: test_mm_broadcast_ss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
; CHECK-NEXT: retq
entry:
@@ -21,7 +21,7 @@ declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*)
define <4 x double> @test_mm256_broadcast_sd(double* readonly %__a) {
; CHECK-LABEL: test_mm256_broadcast_sd:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vbroadcastsd (%rdi), %ymm0
; CHECK-NEXT: retq
entry:
@@ -33,7 +33,7 @@ declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*)
define <8 x float> @test_mm256_broadcast_ss(float* readonly %__a) {
; CHECK-LABEL: test_mm256_broadcast_ss:
-; CHECK: ## BB#0: ## %entry
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vbroadcastss (%rdi), %ymm0
; CHECK-NEXT: retq
entry:
diff --git a/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll b/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
index 8e081b9e4100..543d4f405adf 100644
--- a/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/x86-upgrade-avx2-vbroadcast.ll
@@ -7,8 +7,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define <4 x i64> @broadcast128(<2 x i64> %src) {
; CHECK-LABEL: broadcast128:
-; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; CHECK: ## %bb.0:
+; CHECK-NEXT: ## kill: def %xmm0 killed %xmm0 def %ymm0
; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/x87-schedule.ll b/test/CodeGen/X86/x87-schedule.ll
new file mode 100644
index 000000000000..37b0f3fe98d1
--- /dev/null
+++ b/test/CodeGen/X86/x87-schedule.ll
@@ -0,0 +1,5848 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=i686 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
+
+define void @test_f2xm1() optsize {
+; GENERIC-LABEL: test_f2xm1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: f2xm1
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_f2xm1:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: f2xm1 # sched: [99:49.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_f2xm1:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: f2xm1 # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_f2xm1:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: f2xm1 # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_f2xm1:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: f2xm1 # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_f2xm1:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: f2xm1 # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_f2xm1:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: f2xm1 # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_f2xm1:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: f2xm1 # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_f2xm1:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: f2xm1 # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_f2xm1:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: f2xm1 # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "f2xm1", ""() nounwind
+ ret void
+}
+
+define void @test_fabs() optsize {
+; GENERIC-LABEL: test_fabs:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fabs
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fabs:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fabs # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fabs:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fabs # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fabs:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fabs # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fabs:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fabs # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fabs:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fabs # sched: [1:0.33]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fabs:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fabs # sched: [1:0.33]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fabs:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fabs # sched: [1:0.33]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fabs:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fabs # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fabs:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fabs # sched: [2:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fabs", ""() nounwind
+ ret void
+}
+
+define void @test_fadd(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fadd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fadd %st(0), %st(1)
+; GENERIC-NEXT: fadd %st(2)
+; GENERIC-NEXT: fadds (%ecx)
+; GENERIC-NEXT: faddl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fadd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fadd %st(0), %st(1)
+; ATOM-NEXT: fadd %st(2)
+; ATOM-NEXT: fadds (%ecx)
+; ATOM-NEXT: faddl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fadd:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; SLM-NEXT: fadd %st(2) # sched: [3:1.00]
+; SLM-NEXT: fadds (%ecx) # sched: [6:1.00]
+; SLM-NEXT: faddl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fadd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; SANDY-NEXT: fadd %st(2) # sched: [3:1.00]
+; SANDY-NEXT: fadds (%ecx) # sched: [10:1.00]
+; SANDY-NEXT: faddl (%eax) # sched: [10:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fadd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; HASWELL-NEXT: fadd %st(2) # sched: [3:1.00]
+; HASWELL-NEXT: fadds (%ecx) # sched: [10:1.00]
+; HASWELL-NEXT: faddl (%eax) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fadd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; BROADWELL-NEXT: fadd %st(2) # sched: [3:1.00]
+; BROADWELL-NEXT: fadds (%ecx) # sched: [9:1.00]
+; BROADWELL-NEXT: faddl (%eax) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fadd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; SKYLAKE-NEXT: fadd %st(2) # sched: [3:1.00]
+; SKYLAKE-NEXT: fadds (%ecx) # sched: [10:1.00]
+; SKYLAKE-NEXT: faddl (%eax) # sched: [10:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fadd:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; SKX-NEXT: fadd %st(2) # sched: [3:1.00]
+; SKX-NEXT: fadds (%ecx) # sched: [10:1.00]
+; SKX-NEXT: faddl (%eax) # sched: [10:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fadd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fadd %st(2) # sched: [3:1.00]
+; BTVER2-NEXT: fadds (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: faddl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fadd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fadd %st(0), %st(1) # sched: [3:1.00]
+; ZNVER1-NEXT: fadd %st(2) # sched: [3:1.00]
+; ZNVER1-NEXT: fadds (%ecx) # sched: [10:1.00]
+; ZNVER1-NEXT: faddl (%eax) # sched: [10:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fadd %st(0), %st(1) \0A\09 fadd %st(2), %st(0) \0A\09 fadds $0 \0A\09 faddl $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
+define void @test_faddp_fiadd(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_faddp_fiadd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: faddp %st(1)
+; GENERIC-NEXT: faddp %st(2)
+; GENERIC-NEXT: fiadds (%ecx)
+; GENERIC-NEXT: fiaddl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_faddp_fiadd:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: faddp %st(1)
+; ATOM-NEXT: faddp %st(2)
+; ATOM-NEXT: fiadds (%ecx)
+; ATOM-NEXT: fiaddl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_faddp_fiadd:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: faddp %st(1) # sched: [3:1.00]
+; SLM-NEXT: faddp %st(2) # sched: [3:1.00]
+; SLM-NEXT: fiadds (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fiaddl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_faddp_fiadd:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: faddp %st(1) # sched: [3:1.00]
+; SANDY-NEXT: faddp %st(2) # sched: [3:1.00]
+; SANDY-NEXT: fiadds (%ecx) # sched: [13:2.00]
+; SANDY-NEXT: fiaddl (%eax) # sched: [13:2.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_faddp_fiadd:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: faddp %st(1) # sched: [3:1.00]
+; HASWELL-NEXT: faddp %st(2) # sched: [3:1.00]
+; HASWELL-NEXT: fiadds (%ecx) # sched: [13:2.00]
+; HASWELL-NEXT: fiaddl (%eax) # sched: [13:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_faddp_fiadd:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: faddp %st(1) # sched: [3:1.00]
+; BROADWELL-NEXT: faddp %st(2) # sched: [3:1.00]
+; BROADWELL-NEXT: fiadds (%ecx) # sched: [12:2.00]
+; BROADWELL-NEXT: fiaddl (%eax) # sched: [12:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_faddp_fiadd:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: faddp %st(1) # sched: [3:1.00]
+; SKYLAKE-NEXT: faddp %st(2) # sched: [3:1.00]
+; SKYLAKE-NEXT: fiadds (%ecx) # sched: [13:2.00]
+; SKYLAKE-NEXT: fiaddl (%eax) # sched: [13:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_faddp_fiadd:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: faddp %st(1) # sched: [3:1.00]
+; SKX-NEXT: faddp %st(2) # sched: [3:1.00]
+; SKX-NEXT: fiadds (%ecx) # sched: [13:2.00]
+; SKX-NEXT: fiaddl (%eax) # sched: [13:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_faddp_fiadd:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: faddp %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: faddp %st(2) # sched: [3:1.00]
+; BTVER2-NEXT: fiadds (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fiaddl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_faddp_fiadd:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: faddp %st(1) # sched: [3:1.00]
+; ZNVER1-NEXT: faddp %st(2) # sched: [3:1.00]
+; ZNVER1-NEXT: fiadds (%ecx) # sched: [10:1.00]
+; ZNVER1-NEXT: fiaddl (%eax) # sched: [10:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "faddp \0A\09 faddp %st(2), %st(0) \0A\09 fiadds $0 \0A\09 fiaddl $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
+define void @test_fbld_fbstp(i8* %a0) optsize {
+; GENERIC-LABEL: test_fbld_fbstp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fbld (%eax)
+; GENERIC-NEXT: fbstp (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fbld_fbstp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fbld (%eax)
+; ATOM-NEXT: fbstp (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fbld_fbstp:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fbld (%eax) # sched: [100:1.00]
+; SLM-NEXT: fbstp (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fbld_fbstp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fbld (%eax) # sched: [100:0.33]
+; SANDY-NEXT: fbstp (%eax) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fbld_fbstp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fbld (%eax) # sched: [47:?]
+; HASWELL-NEXT: fbstp (%eax) # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fbld_fbstp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fbld (%eax) # sched: [100:0.25]
+; BROADWELL-NEXT: fbstp (%eax) # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fbld_fbstp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fbld (%eax) # sched: [100:0.25]
+; SKYLAKE-NEXT: fbstp (%eax) # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fbld_fbstp:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fbld (%eax) # sched: [100:0.25]
+; SKX-NEXT: fbstp (%eax) # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fbld_fbstp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fbld (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: fbstp (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fbld_fbstp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fbld (%eax) # sched: [100:?]
+; ZNVER1-NEXT: fbstp (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fbld $0 \0A\09 fbstp $0", "*m"(i8 *%a0) nounwind
+ ret void
+}
+
+define void @test_fchs() optsize {
+; GENERIC-LABEL: test_fchs:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fchs
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fchs:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fchs # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fchs:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fchs # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fchs:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fchs # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fchs:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fchs # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fchs:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fchs # sched: [1:0.33]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fchs:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fchs # sched: [1:0.33]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fchs:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fchs # sched: [1:0.33]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fchs:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fchs # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fchs:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fchs # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fchs", ""() nounwind
+ ret void
+}
+
+define void @test_fclex() optsize {
+; GENERIC-LABEL: test_fclex:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: fnclex
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fclex:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: fnclex # sched: [25:12.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fclex:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: fnclex # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fclex:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: fnclex # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fclex:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: fnclex # sched: [1:1.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fclex:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: fnclex # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fclex:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: fnclex # sched: [4:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fclex:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: fnclex # sched: [4:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fclex:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: fnclex # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fclex:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: fnclex # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fclex", ""() nounwind
+ ret void
+}
+
+define void @test_fnclex() optsize {
+; GENERIC-LABEL: test_fnclex:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fnclex
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fnclex:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fnclex # sched: [25:12.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fnclex:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fnclex # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fnclex:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fnclex # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fnclex:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fnclex # sched: [1:1.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fnclex:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fnclex # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fnclex:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fnclex # sched: [4:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fnclex:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fnclex # sched: [4:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fnclex:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fnclex # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fnclex:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fnclex # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fnclex", ""() nounwind
+ ret void
+}
+
+define void @test_fcmov() optsize {
+; GENERIC-LABEL: test_fcmov:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fcmovb %st(1), %st(0)
+; GENERIC-NEXT: fcmovbe %st(1), %st(0)
+; GENERIC-NEXT: fcmove %st(1), %st(0)
+; GENERIC-NEXT: fcmovnb %st(1), %st(0)
+; GENERIC-NEXT: fcmovnbe %st(1), %st(0)
+; GENERIC-NEXT: fcmovne %st(1), %st(0)
+; GENERIC-NEXT: fcmovnu %st(1), %st(0)
+; GENERIC-NEXT: fcmovu %st(1), %st(0)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fcmov:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fcmovb %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmovbe %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmove %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmovnb %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmovnbe %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmovne %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmovnu %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: fcmovu %st(1), %st(0) # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fcmov:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fcmov:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fcmovb %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmovbe %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmove %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmovnb %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmovne %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmovnu %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: fcmovu %st(1), %st(0) # sched: [3:2.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fcmov:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fcmov:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fcmov:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fcmov:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fcmov:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fcmovb %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmovbe %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmove %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmovnb %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmovnbe %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmovne %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmovnu %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: fcmovu %st(1), %st(0) # sched: [3:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fcmov:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fcmovb %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmovbe %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmove %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmovnb %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmovnbe %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmovne %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmovnu %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: fcmovu %st(1), %st(0) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fcmovb %st(1), %st(0) \0A\09 fcmovbe %st(1), %st(0) \0A\09 fcmove %st(1), %st(0) \0A\09 fcmovnb %st(1), %st(0) \0A\09 fcmovnbe %st(1), %st(0) \0A\09 fcmovne %st(1), %st(0) \0A\09 fcmovnu %st(1), %st(0) \0A\09 fcmovu %st(1), %st(0)", ""() nounwind
+ ret void
+}
+
+define void @test_fcom(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fcom:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fcom %st(1)
+; GENERIC-NEXT: fcom %st(3)
+; GENERIC-NEXT: fcoms (%ecx)
+; GENERIC-NEXT: fcoml (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fcom:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fcom %st(1)
+; ATOM-NEXT: fcom %st(3)
+; ATOM-NEXT: fcoms (%ecx)
+; ATOM-NEXT: fcoml (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fcom:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fcom %st(1) # sched: [3:1.00]
+; SLM-NEXT: fcom %st(3) # sched: [3:1.00]
+; SLM-NEXT: fcoms (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fcoml (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fcom:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fcom %st(1) # sched: [1:1.00]
+; SANDY-NEXT: fcom %st(3) # sched: [1:1.00]
+; SANDY-NEXT: fcoms (%ecx) # sched: [8:1.00]
+; SANDY-NEXT: fcoml (%eax) # sched: [8:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fcom:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fcom %st(1) # sched: [1:1.00]
+; HASWELL-NEXT: fcom %st(3) # sched: [1:1.00]
+; HASWELL-NEXT: fcoms (%ecx) # sched: [8:1.00]
+; HASWELL-NEXT: fcoml (%eax) # sched: [8:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fcom:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fcom %st(1) # sched: [1:1.00]
+; BROADWELL-NEXT: fcom %st(3) # sched: [1:1.00]
+; BROADWELL-NEXT: fcoms (%ecx) # sched: [7:1.00]
+; BROADWELL-NEXT: fcoml (%eax) # sched: [7:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fcom:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fcom %st(1) # sched: [1:1.00]
+; SKYLAKE-NEXT: fcom %st(3) # sched: [1:1.00]
+; SKYLAKE-NEXT: fcoms (%ecx) # sched: [8:1.00]
+; SKYLAKE-NEXT: fcoml (%eax) # sched: [8:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fcom:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fcom %st(1) # sched: [1:1.00]
+; SKX-NEXT: fcom %st(3) # sched: [1:1.00]
+; SKX-NEXT: fcoms (%ecx) # sched: [8:1.00]
+; SKX-NEXT: fcoml (%eax) # sched: [8:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fcom:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fcom %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fcom %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: fcoms (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fcoml (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fcom:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fcom %st(1) # sched: [1:1.00]
+; ZNVER1-NEXT: fcom %st(3) # sched: [1:1.00]
+; ZNVER1-NEXT: fcoms (%ecx) # sched: [8:1.00]
+; ZNVER1-NEXT: fcoml (%eax) # sched: [8:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fcom \0A\09 fcom %st(3) \0A\09 fcoms $0 \0A\09 fcoml $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
+define void @test_fcomp_fcompp(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fcomp_fcompp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fcomp %st(1)
+; GENERIC-NEXT: fcomp %st(3)
+; GENERIC-NEXT: fcomps (%ecx)
+; GENERIC-NEXT: fcompl (%eax)
+; GENERIC-NEXT: fcompp
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fcomp_fcompp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fcomp %st(1)
+; ATOM-NEXT: fcomp %st(3)
+; ATOM-NEXT: fcomps (%ecx)
+; ATOM-NEXT: fcompl (%eax)
+; ATOM-NEXT: fcompp # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fcomp_fcompp:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fcomp %st(1) # sched: [3:1.00]
+; SLM-NEXT: fcomp %st(3) # sched: [3:1.00]
+; SLM-NEXT: fcomps (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fcompl (%eax) # sched: [6:1.00]
+; SLM-NEXT: fcompp # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fcomp_fcompp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fcomp %st(1) # sched: [1:1.00]
+; SANDY-NEXT: fcomp %st(3) # sched: [1:1.00]
+; SANDY-NEXT: fcomps (%ecx) # sched: [8:1.00]
+; SANDY-NEXT: fcompl (%eax) # sched: [8:1.00]
+; SANDY-NEXT: fcompp # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fcomp_fcompp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fcomp %st(1) # sched: [1:1.00]
+; HASWELL-NEXT: fcomp %st(3) # sched: [1:1.00]
+; HASWELL-NEXT: fcomps (%ecx) # sched: [8:1.00]
+; HASWELL-NEXT: fcompl (%eax) # sched: [8:1.00]
+; HASWELL-NEXT: fcompp # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fcomp_fcompp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fcomp %st(1) # sched: [1:1.00]
+; BROADWELL-NEXT: fcomp %st(3) # sched: [1:1.00]
+; BROADWELL-NEXT: fcomps (%ecx) # sched: [7:1.00]
+; BROADWELL-NEXT: fcompl (%eax) # sched: [7:1.00]
+; BROADWELL-NEXT: fcompp # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fcomp_fcompp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fcomp %st(1) # sched: [1:1.00]
+; SKYLAKE-NEXT: fcomp %st(3) # sched: [1:1.00]
+; SKYLAKE-NEXT: fcomps (%ecx) # sched: [8:1.00]
+; SKYLAKE-NEXT: fcompl (%eax) # sched: [8:1.00]
+; SKYLAKE-NEXT: fcompp # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fcomp_fcompp:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fcomp %st(1) # sched: [1:1.00]
+; SKX-NEXT: fcomp %st(3) # sched: [1:1.00]
+; SKX-NEXT: fcomps (%ecx) # sched: [8:1.00]
+; SKX-NEXT: fcompl (%eax) # sched: [8:1.00]
+; SKX-NEXT: fcompp # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fcomp_fcompp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fcomp %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fcomp %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: fcomps (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fcompl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: fcompp # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fcomp_fcompp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fcomp %st(1) # sched: [1:1.00]
+; ZNVER1-NEXT: fcomp %st(3) # sched: [1:1.00]
+; ZNVER1-NEXT: fcomps (%ecx) # sched: [8:1.00]
+; ZNVER1-NEXT: fcompl (%eax) # sched: [8:1.00]
+; ZNVER1-NEXT: fcompp # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fcomp \0A\09 fcomp %st(3) \0A\09 fcomps $0 \0A\09 fcompl $1 \0A\09 fcompp", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
+define void @test_fcomi_fcomip() optsize {
+; GENERIC-LABEL: test_fcomi_fcomip:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fcomi %st(3)
+; GENERIC-NEXT: fcompi %st(3)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fcomi_fcomip:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fcomi %st(3) # sched: [9:4.50]
+; ATOM-NEXT: fcompi %st(3) # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fcomi_fcomip:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fcomi %st(3) # sched: [3:1.00]
+; SLM-NEXT: fcompi %st(3) # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fcomi_fcomip:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fcomi %st(3) # sched: [3:1.00]
+; SANDY-NEXT: fcompi %st(3) # sched: [3:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fcomi_fcomip:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fcomi %st(3) # sched: [1:0.50]
+; HASWELL-NEXT: fcompi %st(3) # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fcomi_fcomip:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fcomi %st(3) # sched: [3:1.00]
+; BROADWELL-NEXT: fcompi %st(3) # sched: [3:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fcomi_fcomip:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fcomi %st(3) # sched: [3:1.00]
+; SKYLAKE-NEXT: fcompi %st(3) # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fcomi_fcomip:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fcomi %st(3) # sched: [3:1.00]
+; SKX-NEXT: fcompi %st(3) # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fcomi_fcomip:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fcomi %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: fcompi %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fcomi_fcomip:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fcomi %st(3) # sched: [9:0.50]
+; ZNVER1-NEXT: fcompi %st(3) # sched: [9:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fcomi %st(3) \0A\09 fcomip %st(3)", ""() nounwind
+ ret void
+}
+
+define void @test_fcos() optsize {
+; GENERIC-LABEL: test_fcos:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fcos
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fcos:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fcos # sched: [174:87.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fcos:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fcos # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fcos:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fcos # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fcos:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fcos # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fcos:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fcos # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fcos:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fcos # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fcos:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fcos # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fcos:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fcos # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fcos:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fcos # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fcos", ""() nounwind
+ ret void
+}
+
+define void @test_fdecstp() optsize {
+; GENERIC-LABEL: test_fdecstp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fdecstp
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fdecstp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fdecstp # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fdecstp:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fdecstp # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fdecstp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fdecstp # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fdecstp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fdecstp # sched: [2:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fdecstp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fdecstp # sched: [2:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fdecstp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fdecstp # sched: [2:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fdecstp:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fdecstp # sched: [2:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fdecstp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fdecstp # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fdecstp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fdecstp # sched: [11:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fdecstp", ""() nounwind
+ ret void
+}
+
+define void @test_fdiv(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fdiv:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fdiv %st(0), %st(1)
+; GENERIC-NEXT: fdiv %st(2)
+; GENERIC-NEXT: fdivs (%ecx)
+; GENERIC-NEXT: fdivl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fdiv:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fdiv %st(0), %st(1)
+; ATOM-NEXT: fdiv %st(2)
+; ATOM-NEXT: fdivs (%ecx)
+; ATOM-NEXT: fdivl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fdiv:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fdiv %st(0), %st(1) # sched: [34:34.00]
+; SLM-NEXT: fdiv %st(2) # sched: [34:34.00]
+; SLM-NEXT: fdivs (%ecx) # sched: [37:34.00]
+; SLM-NEXT: fdivl (%eax) # sched: [37:34.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fdiv:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fdiv %st(0), %st(1) # sched: [24:1.00]
+; SANDY-NEXT: fdiv %st(2) # sched: [24:1.00]
+; SANDY-NEXT: fdivs (%ecx) # sched: [31:1.00]
+; SANDY-NEXT: fdivl (%eax) # sched: [31:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fdiv:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fdiv %st(0), %st(1) # sched: [24:1.00]
+; HASWELL-NEXT: fdiv %st(2) # sched: [20:1.00]
+; HASWELL-NEXT: fdivs (%ecx) # sched: [31:1.00]
+; HASWELL-NEXT: fdivl (%eax) # sched: [31:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fdiv:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fdiv %st(0), %st(1) # sched: [15:1.00]
+; BROADWELL-NEXT: fdiv %st(2) # sched: [20:1.00]
+; BROADWELL-NEXT: fdivs (%ecx) # sched: [21:1.00]
+; BROADWELL-NEXT: fdivl (%eax) # sched: [21:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fdiv:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fdiv %st(0), %st(1) # sched: [15:1.00]
+; SKYLAKE-NEXT: fdiv %st(2) # sched: [20:1.00]
+; SKYLAKE-NEXT: fdivs (%ecx) # sched: [22:1.00]
+; SKYLAKE-NEXT: fdivl (%eax) # sched: [22:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fdiv:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fdiv %st(0), %st(1) # sched: [15:1.00]
+; SKX-NEXT: fdiv %st(2) # sched: [20:1.00]
+; SKX-NEXT: fdivs (%ecx) # sched: [22:1.00]
+; SKX-NEXT: fdivl (%eax) # sched: [22:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fdiv:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fdiv %st(0), %st(1) # sched: [19:19.00]
+; BTVER2-NEXT: fdiv %st(2) # sched: [19:19.00]
+; BTVER2-NEXT: fdivs (%ecx) # sched: [24:19.00]
+; BTVER2-NEXT: fdivl (%eax) # sched: [24:19.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fdiv:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fdiv %st(0), %st(1) # sched: [15:1.00]
+; ZNVER1-NEXT: fdiv %st(2) # sched: [15:1.00]
+; ZNVER1-NEXT: fdivs (%ecx) # sched: [22:1.00]
+; ZNVER1-NEXT: fdivl (%eax) # sched: [22:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fdiv %st(0), %st(1) \0A\09 fdiv %st(2), %st(0) \0A\09 fdivs $0 \0A\09 fdivl $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
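+; fdivp register forms and fidiv with 16-bit and 32-bit integer memory operands.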
+define void @test_fdivp_fidiv(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_fdivp_fidiv:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fdivp %st(1)
+; GENERIC-NEXT: fdivp %st(2)
+; GENERIC-NEXT: fidivs (%ecx)
+; GENERIC-NEXT: fidivl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fdivp_fidiv:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fdivp %st(1)
+; ATOM-NEXT: fdivp %st(2)
+; ATOM-NEXT: fidivs (%ecx)
+; ATOM-NEXT: fidivl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fdivp_fidiv:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fdivp %st(1) # sched: [34:34.00]
+; SLM-NEXT: fdivp %st(2) # sched: [34:34.00]
+; SLM-NEXT: fidivs (%ecx) # sched: [37:34.00]
+; SLM-NEXT: fidivl (%eax) # sched: [37:34.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fdivp_fidiv:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fdivp %st(1) # sched: [24:1.00]
+; SANDY-NEXT: fdivp %st(2) # sched: [24:1.00]
+; SANDY-NEXT: fidivs (%ecx) # sched: [34:1.00]
+; SANDY-NEXT: fidivl (%eax) # sched: [34:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fdivp_fidiv:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fdivp %st(1) # sched: [24:1.00]
+; HASWELL-NEXT: fdivp %st(2) # sched: [24:1.00]
+; HASWELL-NEXT: fidivs (%ecx) # sched: [34:1.00]
+; HASWELL-NEXT: fidivl (%eax) # sched: [34:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fdivp_fidiv:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fdivp %st(1) # sched: [15:1.00]
+; BROADWELL-NEXT: fdivp %st(2) # sched: [15:1.00]
+; BROADWELL-NEXT: fidivs (%ecx) # sched: [24:1.00]
+; BROADWELL-NEXT: fidivl (%eax) # sched: [24:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fdivp_fidiv:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fdivp %st(1) # sched: [15:1.00]
+; SKYLAKE-NEXT: fdivp %st(2) # sched: [15:1.00]
+; SKYLAKE-NEXT: fidivs (%ecx) # sched: [25:1.00]
+; SKYLAKE-NEXT: fidivl (%eax) # sched: [25:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fdivp_fidiv:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fdivp %st(1) # sched: [15:1.00]
+; SKX-NEXT: fdivp %st(2) # sched: [15:1.00]
+; SKX-NEXT: fidivs (%ecx) # sched: [25:1.00]
+; SKX-NEXT: fidivl (%eax) # sched: [25:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fdivp_fidiv:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fdivp %st(1) # sched: [19:19.00]
+; BTVER2-NEXT: fdivp %st(2) # sched: [19:19.00]
+; BTVER2-NEXT: fidivs (%ecx) # sched: [24:19.00]
+; BTVER2-NEXT: fidivl (%eax) # sched: [24:19.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fdivp_fidiv:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fdivp %st(1) # sched: [15:1.00]
+; ZNVER1-NEXT: fdivp %st(2) # sched: [15:1.00]
+; ZNVER1-NEXT: fidivs (%ecx) # sched: [22:1.00]
+; ZNVER1-NEXT: fidivl (%eax) # sched: [22:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fdivp \0A\09 fdivp %st(2), %st(0) \0A\09 fidivs $0 \0A\09 fidivl $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
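+; Reverse divide: fdivr on ST(i) registers plus 32-bit and 64-bit floating-point memory operands.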
+define void @test_fdivr(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fdivr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fdivr %st(0), %st(1)
+; GENERIC-NEXT: fdivr %st(2)
+; GENERIC-NEXT: fdivrs (%ecx)
+; GENERIC-NEXT: fdivrl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fdivr:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fdivr %st(0), %st(1)
+; ATOM-NEXT: fdivr %st(2)
+; ATOM-NEXT: fdivrs (%ecx)
+; ATOM-NEXT: fdivrl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fdivr:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fdivr %st(0), %st(1) # sched: [34:34.00]
+; SLM-NEXT: fdivr %st(2) # sched: [34:34.00]
+; SLM-NEXT: fdivrs (%ecx) # sched: [37:34.00]
+; SLM-NEXT: fdivrl (%eax) # sched: [37:34.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fdivr:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fdivr %st(0), %st(1) # sched: [24:1.00]
+; SANDY-NEXT: fdivr %st(2) # sched: [24:1.00]
+; SANDY-NEXT: fdivrs (%ecx) # sched: [31:1.00]
+; SANDY-NEXT: fdivrl (%eax) # sched: [31:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fdivr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fdivr %st(0), %st(1) # sched: [20:1.00]
+; HASWELL-NEXT: fdivr %st(2) # sched: [24:1.00]
+; HASWELL-NEXT: fdivrs (%ecx) # sched: [27:1.00]
+; HASWELL-NEXT: fdivrl (%eax) # sched: [27:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fdivr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fdivr %st(0), %st(1) # sched: [20:1.00]
+; BROADWELL-NEXT: fdivr %st(2) # sched: [15:1.00]
+; BROADWELL-NEXT: fdivrs (%ecx) # sched: [26:1.00]
+; BROADWELL-NEXT: fdivrl (%eax) # sched: [26:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fdivr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fdivr %st(0), %st(1) # sched: [20:1.00]
+; SKYLAKE-NEXT: fdivr %st(2) # sched: [15:1.00]
+; SKYLAKE-NEXT: fdivrs (%ecx) # sched: [27:1.00]
+; SKYLAKE-NEXT: fdivrl (%eax) # sched: [27:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fdivr:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fdivr %st(0), %st(1) # sched: [20:1.00]
+; SKX-NEXT: fdivr %st(2) # sched: [15:1.00]
+; SKX-NEXT: fdivrs (%ecx) # sched: [27:1.00]
+; SKX-NEXT: fdivrl (%eax) # sched: [27:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fdivr:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fdivr %st(0), %st(1) # sched: [19:19.00]
+; BTVER2-NEXT: fdivr %st(2) # sched: [19:19.00]
+; BTVER2-NEXT: fdivrs (%ecx) # sched: [24:19.00]
+; BTVER2-NEXT: fdivrl (%eax) # sched: [24:19.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fdivr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fdivr %st(0), %st(1) # sched: [15:1.00]
+; ZNVER1-NEXT: fdivr %st(2) # sched: [15:1.00]
+; ZNVER1-NEXT: fdivrs (%ecx) # sched: [22:1.00]
+; ZNVER1-NEXT: fdivrl (%eax) # sched: [22:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fdivr %st(0), %st(1) \0A\09 fdivr %st(2), %st(0) \0A\09 fdivrs $0 \0A\09 fdivrl $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
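+; fdivrp register forms and fidivr with 16-bit and 32-bit integer memory operands.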
+define void @test_fdivrp_fidivr(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_fdivrp_fidivr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fdivrp %st(1)
+; GENERIC-NEXT: fdivrp %st(2)
+; GENERIC-NEXT: fidivrs (%ecx)
+; GENERIC-NEXT: fidivrl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fdivrp_fidivr:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fdivrp %st(1)
+; ATOM-NEXT: fdivrp %st(2)
+; ATOM-NEXT: fidivrs (%ecx)
+; ATOM-NEXT: fidivrl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fdivrp_fidivr:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fdivrp %st(1) # sched: [34:34.00]
+; SLM-NEXT: fdivrp %st(2) # sched: [34:34.00]
+; SLM-NEXT: fidivrs (%ecx) # sched: [37:34.00]
+; SLM-NEXT: fidivrl (%eax) # sched: [37:34.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fdivrp_fidivr:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fdivrp %st(1) # sched: [24:1.00]
+; SANDY-NEXT: fdivrp %st(2) # sched: [24:1.00]
+; SANDY-NEXT: fidivrs (%ecx) # sched: [34:1.00]
+; SANDY-NEXT: fidivrl (%eax) # sched: [34:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fdivrp_fidivr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fdivrp %st(1) # sched: [20:1.00]
+; HASWELL-NEXT: fdivrp %st(2) # sched: [20:1.00]
+; HASWELL-NEXT: fidivrs (%ecx) # sched: [30:1.00]
+; HASWELL-NEXT: fidivrl (%eax) # sched: [30:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fdivrp_fidivr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fdivrp %st(1) # sched: [20:1.00]
+; BROADWELL-NEXT: fdivrp %st(2) # sched: [20:1.00]
+; BROADWELL-NEXT: fidivrs (%ecx) # sched: [29:1.00]
+; BROADWELL-NEXT: fidivrl (%eax) # sched: [29:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fdivrp_fidivr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fdivrp %st(1) # sched: [20:1.00]
+; SKYLAKE-NEXT: fdivrp %st(2) # sched: [20:1.00]
+; SKYLAKE-NEXT: fidivrs (%ecx) # sched: [30:1.00]
+; SKYLAKE-NEXT: fidivrl (%eax) # sched: [30:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fdivrp_fidivr:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fdivrp %st(1) # sched: [20:1.00]
+; SKX-NEXT: fdivrp %st(2) # sched: [20:1.00]
+; SKX-NEXT: fidivrs (%ecx) # sched: [30:1.00]
+; SKX-NEXT: fidivrl (%eax) # sched: [30:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fdivrp_fidivr:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fdivrp %st(1) # sched: [19:19.00]
+; BTVER2-NEXT: fdivrp %st(2) # sched: [19:19.00]
+; BTVER2-NEXT: fidivrs (%ecx) # sched: [24:19.00]
+; BTVER2-NEXT: fidivrl (%eax) # sched: [24:19.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fdivrp_fidivr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fdivrp %st(1) # sched: [15:1.00]
+; ZNVER1-NEXT: fdivrp %st(2) # sched: [15:1.00]
+; ZNVER1-NEXT: fidivrs (%ecx) # sched: [22:1.00]
+; ZNVER1-NEXT: fidivrl (%eax) # sched: [22:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fdivrp \0A\09 fdivrp %st(2), %st(0) \0A\09 fidivrs $0 \0A\09 fidivrl $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
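+; ffree on a stack register.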
+define void @test_ffree() optsize {
+; GENERIC-LABEL: test_ffree:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: ffree %st(0)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_ffree:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: ffree %st(0) # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_ffree:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: ffree %st(0) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ffree:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: ffree %st(0) # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_ffree:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: ffree %st(0) # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ffree:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: ffree %st(0) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_ffree:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: ffree %st(0) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_ffree:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: ffree %st(0) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_ffree:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: ffree %st(0) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ffree:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: ffree %st(0) # sched: [11:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "ffree %st(0)", ""() nounwind
+ ret void
+}
+
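+; Integer compares: ficom/ficomp with 16-bit and 32-bit memory operands.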
+define void @test_ficom(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_ficom:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: ficoms (%ecx)
+; GENERIC-NEXT: ficoml (%eax)
+; GENERIC-NEXT: ficomps (%ecx)
+; GENERIC-NEXT: ficompl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_ficom:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: ficoms (%ecx)
+; ATOM-NEXT: ficoml (%eax)
+; ATOM-NEXT: ficomps (%ecx)
+; ATOM-NEXT: ficompl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_ficom:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: ficoms (%ecx) # sched: [6:1.00]
+; SLM-NEXT: ficoml (%eax) # sched: [6:1.00]
+; SLM-NEXT: ficomps (%ecx) # sched: [6:1.00]
+; SLM-NEXT: ficompl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ficom:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: ficoms (%ecx) # sched: [11:2.00]
+; SANDY-NEXT: ficoml (%eax) # sched: [11:2.00]
+; SANDY-NEXT: ficomps (%ecx) # sched: [11:2.00]
+; SANDY-NEXT: ficompl (%eax) # sched: [11:2.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_ficom:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: ficoms (%ecx) # sched: [11:2.00]
+; HASWELL-NEXT: ficoml (%eax) # sched: [11:2.00]
+; HASWELL-NEXT: ficomps (%ecx) # sched: [11:2.00]
+; HASWELL-NEXT: ficompl (%eax) # sched: [11:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ficom:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: ficoms (%ecx) # sched: [10:2.00]
+; BROADWELL-NEXT: ficoml (%eax) # sched: [10:2.00]
+; BROADWELL-NEXT: ficomps (%ecx) # sched: [10:2.00]
+; BROADWELL-NEXT: ficompl (%eax) # sched: [10:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_ficom:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: ficoms (%ecx) # sched: [11:2.00]
+; SKYLAKE-NEXT: ficoml (%eax) # sched: [11:2.00]
+; SKYLAKE-NEXT: ficomps (%ecx) # sched: [11:2.00]
+; SKYLAKE-NEXT: ficompl (%eax) # sched: [11:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_ficom:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: ficoms (%ecx) # sched: [11:2.00]
+; SKX-NEXT: ficoml (%eax) # sched: [11:2.00]
+; SKX-NEXT: ficomps (%ecx) # sched: [11:2.00]
+; SKX-NEXT: ficompl (%eax) # sched: [11:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_ficom:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: ficoms (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: ficoml (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: ficomps (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: ficompl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ficom:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: ficoms (%ecx) # sched: [12:1.50]
+; ZNVER1-NEXT: ficoml (%eax) # sched: [12:1.50]
+; ZNVER1-NEXT: ficomps (%ecx) # sched: [12:1.50]
+; ZNVER1-NEXT: ficompl (%eax) # sched: [12:1.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "ficoms $0 \0A\09 ficoml $1 \0A\09 ficomps $0 \0A\09 ficompl $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
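+; Integer loads: fild with 16-bit, 32-bit and 64-bit memory operands.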
+define void @test_fild(i16 *%a0, i32 *%a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_fild:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: filds (%edx)
+; GENERIC-NEXT: fildl (%ecx)
+; GENERIC-NEXT: fildll (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fild:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: filds (%edx) # sched: [5:5.00]
+; ATOM-NEXT: fildl (%ecx) # sched: [5:5.00]
+; ATOM-NEXT: fildll (%eax) # sched: [5:5.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fild:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: filds (%edx) # sched: [3:1.00]
+; SLM-NEXT: fildl (%ecx) # sched: [3:1.00]
+; SLM-NEXT: fildll (%eax) # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fild:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: filds (%edx) # sched: [10:1.00]
+; SANDY-NEXT: fildl (%ecx) # sched: [10:1.00]
+; SANDY-NEXT: fildll (%eax) # sched: [10:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fild:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: filds (%edx) # sched: [10:1.00]
+; HASWELL-NEXT: fildl (%ecx) # sched: [10:1.00]
+; HASWELL-NEXT: fildll (%eax) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fild:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: filds (%edx) # sched: [9:1.00]
+; BROADWELL-NEXT: fildl (%ecx) # sched: [9:1.00]
+; BROADWELL-NEXT: fildll (%eax) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fild:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: filds (%edx) # sched: [10:1.00]
+; SKYLAKE-NEXT: fildl (%ecx) # sched: [10:1.00]
+; SKYLAKE-NEXT: fildll (%eax) # sched: [10:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fild:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: filds (%edx) # sched: [10:1.00]
+; SKX-NEXT: fildl (%ecx) # sched: [10:1.00]
+; SKX-NEXT: fildll (%eax) # sched: [10:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fild:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: filds (%edx) # sched: [5:1.00]
+; BTVER2-NEXT: fildl (%ecx) # sched: [5:1.00]
+; BTVER2-NEXT: fildll (%eax) # sched: [5:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fild:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: filds (%edx) # sched: [11:1.00]
+; ZNVER1-NEXT: fildl (%ecx) # sched: [11:1.00]
+; ZNVER1-NEXT: fildll (%eax) # sched: [11:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "filds $0 \0A\09 fildl $1 \0A\09 fildll $2", "*m,*m,*m"(i16 *%a0, i32 *%a1, i64 *%a2) nounwind
+ ret void
+}
+
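+; fincstp (increment the top-of-stack pointer).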
+define void @test_fincstp() optsize {
+; GENERIC-LABEL: test_fincstp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fincstp
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fincstp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fincstp # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fincstp:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fincstp # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fincstp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fincstp # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fincstp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fincstp # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fincstp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fincstp # sched: [1:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fincstp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fincstp # sched: [1:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fincstp:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fincstp # sched: [1:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fincstp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fincstp # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fincstp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fincstp # sched: [11:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fincstp", ""() nounwind
+ ret void
+}
+
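+; finit, which assembles to wait followed by fninit.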
+define void @test_finit() optsize {
+; GENERIC-LABEL: test_finit:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: fninit
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_finit:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: fninit # sched: [63:31.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_finit:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: fninit # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_finit:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: fninit # sched: [5:1.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_finit:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: fninit # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_finit:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: fninit # sched: [75:6.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_finit:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: fninit # sched: [75:6.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_finit:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: fninit # sched: [75:6.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_finit:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: fninit # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_finit:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: fninit # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "finit", ""() nounwind
+ ret void
+}
+
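+; fninit (initialize the FPU without checking for pending exceptions).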
+define void @test_fninit() optsize {
+; GENERIC-LABEL: test_fninit:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fninit
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fninit:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fninit # sched: [63:31.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fninit:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fninit # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fninit:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fninit # sched: [5:1.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fninit:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fninit # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fninit:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fninit # sched: [75:6.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fninit:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fninit # sched: [75:6.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fninit:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fninit # sched: [75:6.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fninit:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fninit # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fninit:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fninit # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fninit", ""() nounwind
+ ret void
+}
+
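+; Integer stores: fist, fistp and fisttp to 16-bit, 32-bit and 64-bit memory.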
+define void @test_fist_fistp_fisttp(i16* %a0, i32* %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_fist_fistp_fisttp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fists (%edx)
+; GENERIC-NEXT: fistl (%ecx)
+; GENERIC-NEXT: fistps (%edx)
+; GENERIC-NEXT: fistpl (%ecx)
+; GENERIC-NEXT: fistpll (%eax)
+; GENERIC-NEXT: fisttps (%edx)
+; GENERIC-NEXT: fisttpl (%ecx)
+; GENERIC-NEXT: fisttpll (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fist_fistp_fisttp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fists (%edx) # sched: [6:3.00]
+; ATOM-NEXT: fistl (%ecx) # sched: [6:3.00]
+; ATOM-NEXT: fistps (%edx) # sched: [6:3.00]
+; ATOM-NEXT: fistpl (%ecx) # sched: [6:3.00]
+; ATOM-NEXT: fistpll (%eax) # sched: [6:3.00]
+; ATOM-NEXT: fisttps (%edx) # sched: [2:1.00]
+; ATOM-NEXT: fisttpl (%ecx) # sched: [2:1.00]
+; ATOM-NEXT: fisttpll (%eax) # sched: [2:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fist_fistp_fisttp:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fists (%edx) # sched: [1:1.00]
+; SLM-NEXT: fistl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: fistps (%edx) # sched: [1:1.00]
+; SLM-NEXT: fistpl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: fistpll (%eax) # sched: [1:1.00]
+; SLM-NEXT: fisttps (%edx) # sched: [1:1.00]
+; SLM-NEXT: fisttpl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: fisttpll (%eax) # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fist_fistp_fisttp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fists (%edx) # sched: [9:1.00]
+; SANDY-NEXT: fistl (%ecx) # sched: [9:1.00]
+; SANDY-NEXT: fistps (%edx) # sched: [9:1.00]
+; SANDY-NEXT: fistpl (%ecx) # sched: [9:1.00]
+; SANDY-NEXT: fistpll (%eax) # sched: [9:1.00]
+; SANDY-NEXT: fisttps (%edx) # sched: [5:1.00]
+; SANDY-NEXT: fisttpl (%ecx) # sched: [5:1.00]
+; SANDY-NEXT: fisttpll (%eax) # sched: [5:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fist_fistp_fisttp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fists (%edx) # sched: [4:1.00]
+; HASWELL-NEXT: fistl (%ecx) # sched: [4:1.00]
+; HASWELL-NEXT: fistps (%edx) # sched: [4:1.00]
+; HASWELL-NEXT: fistpl (%ecx) # sched: [4:1.00]
+; HASWELL-NEXT: fistpll (%eax) # sched: [4:1.00]
+; HASWELL-NEXT: fisttps (%edx) # sched: [4:1.00]
+; HASWELL-NEXT: fisttpl (%ecx) # sched: [4:1.00]
+; HASWELL-NEXT: fisttpll (%eax) # sched: [4:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fist_fistp_fisttp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fists (%edx) # sched: [4:1.00]
+; BROADWELL-NEXT: fistl (%ecx) # sched: [4:1.00]
+; BROADWELL-NEXT: fistps (%edx) # sched: [4:1.00]
+; BROADWELL-NEXT: fistpl (%ecx) # sched: [4:1.00]
+; BROADWELL-NEXT: fistpll (%eax) # sched: [4:1.00]
+; BROADWELL-NEXT: fisttps (%edx) # sched: [4:1.00]
+; BROADWELL-NEXT: fisttpl (%ecx) # sched: [4:1.00]
+; BROADWELL-NEXT: fisttpll (%eax) # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fist_fistp_fisttp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fists (%edx) # sched: [4:1.00]
+; SKYLAKE-NEXT: fistl (%ecx) # sched: [4:1.00]
+; SKYLAKE-NEXT: fistps (%edx) # sched: [4:1.00]
+; SKYLAKE-NEXT: fistpl (%ecx) # sched: [4:1.00]
+; SKYLAKE-NEXT: fistpll (%eax) # sched: [4:1.00]
+; SKYLAKE-NEXT: fisttps (%edx) # sched: [4:1.00]
+; SKYLAKE-NEXT: fisttpl (%ecx) # sched: [4:1.00]
+; SKYLAKE-NEXT: fisttpll (%eax) # sched: [4:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fist_fistp_fisttp:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fists (%edx) # sched: [4:1.00]
+; SKX-NEXT: fistl (%ecx) # sched: [4:1.00]
+; SKX-NEXT: fistps (%edx) # sched: [4:1.00]
+; SKX-NEXT: fistpl (%ecx) # sched: [4:1.00]
+; SKX-NEXT: fistpll (%eax) # sched: [4:1.00]
+; SKX-NEXT: fisttps (%edx) # sched: [4:1.00]
+; SKX-NEXT: fisttpl (%ecx) # sched: [4:1.00]
+; SKX-NEXT: fisttpll (%eax) # sched: [4:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fist_fistp_fisttp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fists (%edx) # sched: [1:1.00]
+; BTVER2-NEXT: fistl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: fistps (%edx) # sched: [1:1.00]
+; BTVER2-NEXT: fistpl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: fistpll (%eax) # sched: [1:1.00]
+; BTVER2-NEXT: fisttps (%edx) # sched: [1:1.00]
+; BTVER2-NEXT: fisttpl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: fisttpll (%eax) # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fist_fistp_fisttp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fists (%edx) # sched: [12:0.50]
+; ZNVER1-NEXT: fistl (%ecx) # sched: [12:0.50]
+; ZNVER1-NEXT: fistps (%edx) # sched: [12:0.50]
+; ZNVER1-NEXT: fistpl (%ecx) # sched: [12:0.50]
+; ZNVER1-NEXT: fistpll (%eax) # sched: [12:0.50]
+; ZNVER1-NEXT: fisttps (%edx) # sched: [12:0.50]
+; ZNVER1-NEXT: fisttpl (%ecx) # sched: [12:0.50]
+; ZNVER1-NEXT: fisttpll (%eax) # sched: [12:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fists $0 \0A\09 fistl $1 \0A\09 fistps $0 \0A\09 fistpl $1 \0A\09 fistpll $2 \0A\09 fisttps $0 \0A\09 fisttpl $1 \0A\09 fisttpll $2", "*m,*m,*m"(i16* %a0, i32* %a1, i64 *%a2) nounwind
+ ret void
+}
+
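+; Loads: fld from ST(0) and from 32-bit, 64-bit and 80-bit memory.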
+define void @test_fld(i16* %a0, i32* %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_fld:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fld %st(0)
+; GENERIC-NEXT: flds (%edx)
+; GENERIC-NEXT: fldl (%ecx)
+; GENERIC-NEXT: fldt (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fld:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fld %st(0) # sched: [1:1.00]
+; ATOM-NEXT: flds (%edx) # sched: [1:1.00]
+; ATOM-NEXT: fldl (%ecx) # sched: [1:1.00]
+; ATOM-NEXT: fldt (%eax) # sched: [4:2.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fld:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fld %st(0) # sched: [1:0.50]
+; SLM-NEXT: flds (%edx) # sched: [3:1.00]
+; SLM-NEXT: fldl (%ecx) # sched: [3:1.00]
+; SLM-NEXT: fldt (%eax) # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fld:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fld %st(0) # sched: [1:1.00]
+; SANDY-NEXT: flds (%edx) # sched: [9:1.00]
+; SANDY-NEXT: fldl (%ecx) # sched: [9:1.00]
+; SANDY-NEXT: fldt (%eax) # sched: [9:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fld:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fld %st(0) # sched: [1:0.50]
+; HASWELL-NEXT: flds (%edx) # sched: [7:0.50]
+; HASWELL-NEXT: fldl (%ecx) # sched: [7:0.50]
+; HASWELL-NEXT: fldt (%eax) # sched: [7:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fld:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fld %st(0) # sched: [1:0.25]
+; BROADWELL-NEXT: flds (%edx) # sched: [6:0.50]
+; BROADWELL-NEXT: fldl (%ecx) # sched: [6:0.50]
+; BROADWELL-NEXT: fldt (%eax) # sched: [6:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fld:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fld %st(0) # sched: [1:0.25]
+; SKYLAKE-NEXT: flds (%edx) # sched: [7:0.50]
+; SKYLAKE-NEXT: fldl (%ecx) # sched: [7:0.50]
+; SKYLAKE-NEXT: fldt (%eax) # sched: [7:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fld:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fld %st(0) # sched: [1:0.25]
+; SKX-NEXT: flds (%edx) # sched: [7:0.50]
+; SKX-NEXT: fldl (%ecx) # sched: [7:0.50]
+; SKX-NEXT: fldt (%eax) # sched: [7:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fld:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fld %st(0) # sched: [1:0.50]
+; BTVER2-NEXT: flds (%edx) # sched: [5:1.00]
+; BTVER2-NEXT: fldl (%ecx) # sched: [5:1.00]
+; BTVER2-NEXT: fldt (%eax) # sched: [5:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fld:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fld %st(0) # sched: [1:0.50]
+; ZNVER1-NEXT: flds (%edx) # sched: [8:0.50]
+; ZNVER1-NEXT: fldl (%ecx) # sched: [8:0.50]
+; ZNVER1-NEXT: fldt (%eax) # sched: [1:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fld %st(0) \0A\09 flds $0 \0A\09 fldl $1 \0A\09 fldt $2", "*m,*m,*m"(i16* %a0, i32* %a1, i64 *%a2) nounwind
+ ret void
+}
+
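+; Control-word and environment loads: fldcw and fldenv.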
+define void @test_fldcw_fldenv(i8* %a0) optsize {
+; GENERIC-LABEL: test_fldcw_fldenv:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fldcw (%eax)
+; GENERIC-NEXT: fldenv (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fldcw_fldenv:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fldcw (%eax) # sched: [5:2.50]
+; ATOM-NEXT: fldenv (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fldcw_fldenv:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fldcw (%eax) # sched: [3:1.00]
+; SLM-NEXT: fldenv (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fldcw_fldenv:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fldcw (%eax) # sched: [8:2.00]
+; SANDY-NEXT: fldenv (%eax) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fldcw_fldenv:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fldcw (%eax) # sched: [7:1.00]
+; HASWELL-NEXT: fldenv (%eax) # sched: [61:14.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fldcw_fldenv:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fldcw (%eax) # sched: [7:1.00]
+; BROADWELL-NEXT: fldenv (%eax) # sched: [60:14.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fldcw_fldenv:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fldcw (%eax) # sched: [7:1.00]
+; SKYLAKE-NEXT: fldenv (%eax) # sched: [62:14.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fldcw_fldenv:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fldcw (%eax) # sched: [7:1.00]
+; SKX-NEXT: fldenv (%eax) # sched: [62:14.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fldcw_fldenv:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fldcw (%eax) # sched: [5:1.00]
+; BTVER2-NEXT: fldenv (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fldcw_fldenv:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fldcw (%eax) # sched: [100:?]
+; ZNVER1-NEXT: fldenv (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fldcw $0 \0A\09 fldenv $0", "*m"(i8* %a0) nounwind
+ ret void
+}
+
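+; Load-constant instructions: fld1, fldl2e, fldl2t, fldln2, fldpi and fldz.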
+define void @test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz() optsize {
+; GENERIC-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fld1
+; GENERIC-NEXT: fldl2e
+; GENERIC-NEXT: fldl2t
+; GENERIC-NEXT: fldln2
+; GENERIC-NEXT: fldpi
+; GENERIC-NEXT: fldz
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fld1 # sched: [6:3.00]
+; ATOM-NEXT: fldl2e # sched: [10:5.00]
+; ATOM-NEXT: fldl2t # sched: [10:5.00]
+; ATOM-NEXT: fldln2 # sched: [10:5.00]
+; ATOM-NEXT: fldpi # sched: [10:5.00]
+; ATOM-NEXT: fldz # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fld1 # sched: [1:?]
+; SLM-NEXT: fldl2e # sched: [100:1.00]
+; SLM-NEXT: fldl2t # sched: [100:1.00]
+; SLM-NEXT: fldln2 # sched: [100:1.00]
+; SLM-NEXT: fldpi # sched: [100:1.00]
+; SLM-NEXT: fldz # sched: [1:?]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fld1 # sched: [1:?]
+; SANDY-NEXT: fldl2e # sched: [100:0.33]
+; SANDY-NEXT: fldl2t # sched: [100:0.33]
+; SANDY-NEXT: fldln2 # sched: [100:0.33]
+; SANDY-NEXT: fldpi # sched: [100:0.33]
+; SANDY-NEXT: fldz # sched: [1:?]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fld1 # sched: [1:?]
+; HASWELL-NEXT: fldl2e # sched: [100:0.25]
+; HASWELL-NEXT: fldl2t # sched: [100:0.25]
+; HASWELL-NEXT: fldln2 # sched: [100:0.25]
+; HASWELL-NEXT: fldpi # sched: [1:0.50]
+; HASWELL-NEXT: fldz # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fld1 # sched: [1:?]
+; BROADWELL-NEXT: fldl2e # sched: [100:0.25]
+; BROADWELL-NEXT: fldl2t # sched: [100:0.25]
+; BROADWELL-NEXT: fldln2 # sched: [100:0.25]
+; BROADWELL-NEXT: fldpi # sched: [100:0.25]
+; BROADWELL-NEXT: fldz # sched: [1:?]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fld1 # sched: [1:?]
+; SKYLAKE-NEXT: fldl2e # sched: [100:0.25]
+; SKYLAKE-NEXT: fldl2t # sched: [100:0.25]
+; SKYLAKE-NEXT: fldln2 # sched: [100:0.25]
+; SKYLAKE-NEXT: fldpi # sched: [100:0.25]
+; SKYLAKE-NEXT: fldz # sched: [1:?]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fld1 # sched: [1:?]
+; SKX-NEXT: fldl2e # sched: [100:0.25]
+; SKX-NEXT: fldl2t # sched: [100:0.25]
+; SKX-NEXT: fldln2 # sched: [100:0.25]
+; SKX-NEXT: fldpi # sched: [100:0.25]
+; SKX-NEXT: fldz # sched: [1:?]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fld1 # sched: [1:?]
+; BTVER2-NEXT: fldl2e # sched: [100:0.17]
+; BTVER2-NEXT: fldl2t # sched: [100:0.17]
+; BTVER2-NEXT: fldln2 # sched: [100:0.17]
+; BTVER2-NEXT: fldpi # sched: [100:0.17]
+; BTVER2-NEXT: fldz # sched: [1:?]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fld1_fldl2e_fldl2t_fldlg2_fldln2_fldpi_fldz:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fld1 # sched: [11:1.00]
+; ZNVER1-NEXT: fldl2e # sched: [100:?]
+; ZNVER1-NEXT: fldl2t # sched: [100:?]
+; ZNVER1-NEXT: fldln2 # sched: [100:?]
+; ZNVER1-NEXT: fldpi # sched: [11:1.00]
+; ZNVER1-NEXT: fldz # sched: [8:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fld1 \0A\09 fldl2e \0A\09 fldl2t \0A\09 fldln2 \0A\09 fldpi \0A\09 fldz", ""() nounwind
+ ret void
+}
+
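+; Multiplies: fmul on ST(i) registers plus 32-bit and 64-bit floating-point memory operands.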
+define void @test_fmul(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fmul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fmul %st(0), %st(1)
+; GENERIC-NEXT: fmul %st(2)
+; GENERIC-NEXT: fmuls (%ecx)
+; GENERIC-NEXT: fmull (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fmul:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fmul %st(0), %st(1)
+; ATOM-NEXT: fmul %st(2)
+; ATOM-NEXT: fmuls (%ecx)
+; ATOM-NEXT: fmull (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fmul:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fmul %st(0), %st(1) # sched: [5:2.00]
+; SLM-NEXT: fmul %st(2) # sched: [5:2.00]
+; SLM-NEXT: fmuls (%ecx) # sched: [8:2.00]
+; SLM-NEXT: fmull (%eax) # sched: [8:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fmul:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fmul %st(0), %st(1) # sched: [5:1.00]
+; SANDY-NEXT: fmul %st(2) # sched: [5:1.00]
+; SANDY-NEXT: fmuls (%ecx) # sched: [12:1.00]
+; SANDY-NEXT: fmull (%eax) # sched: [12:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fmul:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fmul %st(0), %st(1) # sched: [5:1.00]
+; HASWELL-NEXT: fmul %st(2) # sched: [5:1.00]
+; HASWELL-NEXT: fmuls (%ecx) # sched: [12:1.00]
+; HASWELL-NEXT: fmull (%eax) # sched: [12:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fmul:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fmul %st(0), %st(1) # sched: [5:1.00]
+; BROADWELL-NEXT: fmul %st(2) # sched: [5:1.00]
+; BROADWELL-NEXT: fmuls (%ecx) # sched: [11:1.00]
+; BROADWELL-NEXT: fmull (%eax) # sched: [11:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fmul:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fmul %st(0), %st(1) # sched: [4:1.00]
+; SKYLAKE-NEXT: fmul %st(2) # sched: [4:1.00]
+; SKYLAKE-NEXT: fmuls (%ecx) # sched: [11:1.00]
+; SKYLAKE-NEXT: fmull (%eax) # sched: [11:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fmul:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fmul %st(0), %st(1) # sched: [4:1.00]
+; SKX-NEXT: fmul %st(2) # sched: [4:1.00]
+; SKX-NEXT: fmuls (%ecx) # sched: [11:1.00]
+; SKX-NEXT: fmull (%eax) # sched: [11:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fmul:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fmul %st(0), %st(1) # sched: [2:1.00]
+; BTVER2-NEXT: fmul %st(2) # sched: [2:1.00]
+; BTVER2-NEXT: fmuls (%ecx) # sched: [7:1.00]
+; BTVER2-NEXT: fmull (%eax) # sched: [7:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fmul:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fmul %st(0), %st(1) # sched: [5:1.00]
+; ZNVER1-NEXT: fmul %st(2) # sched: [5:1.00]
+; ZNVER1-NEXT: fmuls (%ecx) # sched: [12:1.00]
+; ZNVER1-NEXT: fmull (%eax) # sched: [12:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fmul %st(0), %st(1) \0A\09 fmul %st(2), %st(0) \0A\09 fmuls $0 \0A\09 fmull $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
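+; fmulp register forms and fimul with 16-bit and 32-bit integer memory operands.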
+define void @test_fmulp_fimul(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_fmulp_fimul:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fmulp %st(1)
+; GENERIC-NEXT: fmulp %st(2)
+; GENERIC-NEXT: fimuls (%ecx)
+; GENERIC-NEXT: fimull (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fmulp_fimul:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fmulp %st(1)
+; ATOM-NEXT: fmulp %st(2)
+; ATOM-NEXT: fimuls (%ecx)
+; ATOM-NEXT: fimull (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fmulp_fimul:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fmulp %st(1) # sched: [5:2.00]
+; SLM-NEXT: fmulp %st(2) # sched: [5:2.00]
+; SLM-NEXT: fimuls (%ecx) # sched: [8:2.00]
+; SLM-NEXT: fimull (%eax) # sched: [8:2.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fmulp_fimul:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fmulp %st(1) # sched: [5:1.00]
+; SANDY-NEXT: fmulp %st(2) # sched: [5:1.00]
+; SANDY-NEXT: fimuls (%ecx) # sched: [15:1.00]
+; SANDY-NEXT: fimull (%eax) # sched: [15:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fmulp_fimul:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fmulp %st(1) # sched: [5:1.00]
+; HASWELL-NEXT: fmulp %st(2) # sched: [5:1.00]
+; HASWELL-NEXT: fimuls (%ecx) # sched: [15:1.00]
+; HASWELL-NEXT: fimull (%eax) # sched: [15:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fmulp_fimul:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fmulp %st(1) # sched: [5:1.00]
+; BROADWELL-NEXT: fmulp %st(2) # sched: [5:1.00]
+; BROADWELL-NEXT: fimuls (%ecx) # sched: [14:1.00]
+; BROADWELL-NEXT: fimull (%eax) # sched: [14:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fmulp_fimul:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fmulp %st(1) # sched: [4:1.00]
+; SKYLAKE-NEXT: fmulp %st(2) # sched: [4:1.00]
+; SKYLAKE-NEXT: fimuls (%ecx) # sched: [14:1.00]
+; SKYLAKE-NEXT: fimull (%eax) # sched: [14:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fmulp_fimul:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fmulp %st(1) # sched: [4:1.00]
+; SKX-NEXT: fmulp %st(2) # sched: [4:1.00]
+; SKX-NEXT: fimuls (%ecx) # sched: [14:1.00]
+; SKX-NEXT: fimull (%eax) # sched: [14:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fmulp_fimul:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fmulp %st(1) # sched: [2:1.00]
+; BTVER2-NEXT: fmulp %st(2) # sched: [2:1.00]
+; BTVER2-NEXT: fimuls (%ecx) # sched: [7:1.00]
+; BTVER2-NEXT: fimull (%eax) # sched: [7:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fmulp_fimul:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fmulp %st(1) # sched: [5:1.00]
+; ZNVER1-NEXT: fmulp %st(2) # sched: [5:1.00]
+; ZNVER1-NEXT: fimuls (%ecx) # sched: [12:1.00]
+; ZNVER1-NEXT: fimull (%eax) # sched: [12:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fmulp \0A\09 fmulp %st(2), %st(0) \0A\09 fimuls $0 \0A\09 fimull $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
+define void @test_fnop() optsize {
+; GENERIC-LABEL: test_fnop:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fnop
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fnop:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fnop # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fnop:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fnop # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fnop:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fnop # sched: [1:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fnop:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fnop # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fnop:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fnop # sched: [1:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fnop:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fnop # sched: [1:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fnop:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fnop # sched: [1:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fnop:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fnop # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fnop:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fnop # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fnop", ""() nounwind
+ ret void
+}
+
+define void @test_fpatan() optsize {
+; GENERIC-LABEL: test_fpatan:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fpatan
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fpatan:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fpatan # sched: [183:91.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fpatan:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fpatan # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fpatan:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fpatan # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fpatan:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fpatan # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fpatan:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fpatan # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fpatan:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fpatan # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fpatan:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fpatan # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fpatan:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fpatan # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fpatan:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fpatan # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fpatan", ""() nounwind
+ ret void
+}
+
+define void @test_fprem_fprem1() optsize {
+; GENERIC-LABEL: test_fprem_fprem1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fprem
+; GENERIC-NEXT: fprem1
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fprem_fprem1:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fprem # sched: [55:27.50]
+; ATOM-NEXT: fprem1 # sched: [71:35.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fprem_fprem1:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fprem # sched: [100:1.00]
+; SLM-NEXT: fprem1 # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fprem_fprem1:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fprem # sched: [100:0.33]
+; SANDY-NEXT: fprem1 # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fprem_fprem1:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fprem # sched: [19:?]
+; HASWELL-NEXT: fprem1 # sched: [19:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fprem_fprem1:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fprem # sched: [100:0.25]
+; BROADWELL-NEXT: fprem1 # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fprem_fprem1:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fprem # sched: [100:0.25]
+; SKYLAKE-NEXT: fprem1 # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fprem_fprem1:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fprem # sched: [100:0.25]
+; SKX-NEXT: fprem1 # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fprem_fprem1:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fprem # sched: [100:0.17]
+; BTVER2-NEXT: fprem1 # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fprem_fprem1:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fprem # sched: [100:?]
+; ZNVER1-NEXT: fprem1 # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fprem \0A\09 fprem1", ""() nounwind
+ ret void
+}
+
+define void @test_fptan() optsize {
+; GENERIC-LABEL: test_fptan:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fptan
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fptan:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fptan # sched: [168:84.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fptan:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fptan # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fptan:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fptan # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fptan:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fptan # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fptan:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fptan # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fptan:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fptan # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fptan:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fptan # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fptan:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fptan # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fptan:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fptan # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fptan", ""() nounwind
+ ret void
+}
+
+define void @test_frndint() optsize {
+; GENERIC-LABEL: test_frndint:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: frndint
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_frndint:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: frndint # sched: [46:23.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_frndint:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: frndint # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_frndint:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: frndint # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_frndint:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: frndint # sched: [11:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_frndint:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: frndint # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_frndint:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: frndint # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_frndint:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: frndint # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_frndint:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: frndint # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_frndint:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: frndint # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "frndint", ""() nounwind
+ ret void
+}
+
+define void @test_frstor(i8* %a0) optsize {
+; GENERIC-LABEL: test_frstor:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: frstor (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_frstor:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: frstor (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_frstor:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: frstor (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_frstor:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: frstor (%eax) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_frstor:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: frstor (%eax) # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_frstor:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: frstor (%eax) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_frstor:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: frstor (%eax) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_frstor:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: frstor (%eax) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_frstor:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: frstor (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_frstor:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: frstor (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "frstor $0", "*m"(i8* %a0) nounwind
+ ret void
+}
+
+define void @test_fsave(i8* %a0) optsize {
+; GENERIC-LABEL: test_fsave:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: fnsave (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsave:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: fnsave (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsave:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: fnsave (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsave:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: fnsave (%eax) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsave:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: fnsave (%eax) # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsave:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: fnsave (%eax) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsave:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: fnsave (%eax) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsave:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: fnsave (%eax) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsave:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: fnsave (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsave:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: fnsave (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsave $0", "*m"(i8* %a0) nounwind
+ ret void
+}
+
+define void @test_fnsave(i8* %a0) optsize {
+; GENERIC-LABEL: test_fnsave:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fnsave (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fnsave:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fnsave (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fnsave:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fnsave (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fnsave:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fnsave (%eax) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fnsave:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fnsave (%eax) # sched: [1:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fnsave:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fnsave (%eax) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fnsave:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fnsave (%eax) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fnsave:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fnsave (%eax) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fnsave:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fnsave (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fnsave:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fnsave (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fnsave $0", "*m"(i8* %a0) nounwind
+ ret void
+}
+
+define void @test_fscale() optsize {
+; GENERIC-LABEL: test_fscale:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fscale
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fscale:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fscale # sched: [77:38.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fscale:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fscale # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fscale:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fscale # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fscale:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fscale # sched: [75:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fscale:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fscale # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fscale:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fscale # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fscale:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fscale # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fscale:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fscale # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fscale:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fscale # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fscale", ""() nounwind
+ ret void
+}
+
+define void @test_fsin() optsize {
+; GENERIC-LABEL: test_fsin:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsin
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsin:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsin # sched: [174:87.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsin:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fsin # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsin:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsin # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsin:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsin # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsin:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsin # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsin:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsin # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsin:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fsin # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsin:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsin # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsin:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsin # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsin", ""() nounwind
+ ret void
+}
+
+define void @test_fsincos() optsize {
+; GENERIC-LABEL: test_fsincos:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsincos
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsincos:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsincos # sched: [174:87.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsincos:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fsincos # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsincos:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsincos # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsincos:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsincos # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsincos:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsincos # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsincos:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsincos # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsincos:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fsincos # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsincos:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsincos # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsincos:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsincos # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsincos", ""() nounwind
+ ret void
+}
+
+define void @test_fsqrt() optsize {
+; GENERIC-LABEL: test_fsqrt:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsqrt
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsqrt:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsqrt # sched: [71:35.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsqrt:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fsqrt # sched: [15:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsqrt:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsqrt # sched: [14:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsqrt:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsqrt # sched: [15:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsqrt:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsqrt # sched: [15:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsqrt:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsqrt # sched: [15:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsqrt:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fsqrt # sched: [15:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsqrt:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsqrt # sched: [21:21.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsqrt:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsqrt # sched: [20:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsqrt", ""() nounwind
+ ret void
+}
+
+define void @test_fst_fstp(i16* %a0, i32* %a1, i64 *%a2) optsize {
+; GENERIC-LABEL: test_fst_fstp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fst %st(0)
+; GENERIC-NEXT: fsts (%edx)
+; GENERIC-NEXT: fstl (%ecx)
+; GENERIC-NEXT: fstp %st(0)
+; GENERIC-NEXT: fstpl (%edx)
+; GENERIC-NEXT: fstpl (%ecx)
+; GENERIC-NEXT: fstpt (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fst_fstp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fst %st(0) # sched: [2:1.00]
+; ATOM-NEXT: fsts (%edx) # sched: [2:1.00]
+; ATOM-NEXT: fstl (%ecx) # sched: [2:1.00]
+; ATOM-NEXT: fstp %st(0) # sched: [2:1.00]
+; ATOM-NEXT: fstpl (%edx) # sched: [2:1.00]
+; ATOM-NEXT: fstpl (%ecx) # sched: [2:1.00]
+; ATOM-NEXT: fstpt (%eax) # sched: [5:2.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fst_fstp:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fst %st(0) # sched: [1:0.50]
+; SLM-NEXT: fsts (%edx) # sched: [1:1.00]
+; SLM-NEXT: fstl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: fstp %st(0) # sched: [1:0.50]
+; SLM-NEXT: fstpl (%edx) # sched: [1:1.00]
+; SLM-NEXT: fstpl (%ecx) # sched: [1:1.00]
+; SLM-NEXT: fstpt (%eax) # sched: [1:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fst_fstp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fst %st(0) # sched: [1:1.00]
+; SANDY-NEXT: fsts (%edx) # sched: [6:1.00]
+; SANDY-NEXT: fstl (%ecx) # sched: [6:1.00]
+; SANDY-NEXT: fstp %st(0) # sched: [1:1.00]
+; SANDY-NEXT: fstpl (%edx) # sched: [6:1.00]
+; SANDY-NEXT: fstpl (%ecx) # sched: [6:1.00]
+; SANDY-NEXT: fstpt (%eax) # sched: [6:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fst_fstp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fst %st(0) # sched: [1:0.50]
+; HASWELL-NEXT: fsts (%edx) # sched: [1:1.00]
+; HASWELL-NEXT: fstl (%ecx) # sched: [1:1.00]
+; HASWELL-NEXT: fstp %st(0) # sched: [1:0.50]
+; HASWELL-NEXT: fstpl (%edx) # sched: [1:1.00]
+; HASWELL-NEXT: fstpl (%ecx) # sched: [1:1.00]
+; HASWELL-NEXT: fstpt (%eax) # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fst_fstp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fst %st(0) # sched: [1:0.25]
+; BROADWELL-NEXT: fsts (%edx) # sched: [1:1.00]
+; BROADWELL-NEXT: fstl (%ecx) # sched: [1:1.00]
+; BROADWELL-NEXT: fstp %st(0) # sched: [1:0.25]
+; BROADWELL-NEXT: fstpl (%edx) # sched: [1:1.00]
+; BROADWELL-NEXT: fstpl (%ecx) # sched: [1:1.00]
+; BROADWELL-NEXT: fstpt (%eax) # sched: [1:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fst_fstp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fst %st(0) # sched: [1:0.25]
+; SKYLAKE-NEXT: fsts (%edx) # sched: [1:1.00]
+; SKYLAKE-NEXT: fstl (%ecx) # sched: [1:1.00]
+; SKYLAKE-NEXT: fstp %st(0) # sched: [1:0.25]
+; SKYLAKE-NEXT: fstpl (%edx) # sched: [1:1.00]
+; SKYLAKE-NEXT: fstpl (%ecx) # sched: [1:1.00]
+; SKYLAKE-NEXT: fstpt (%eax) # sched: [1:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fst_fstp:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fst %st(0) # sched: [1:0.25]
+; SKX-NEXT: fsts (%edx) # sched: [1:1.00]
+; SKX-NEXT: fstl (%ecx) # sched: [1:1.00]
+; SKX-NEXT: fstp %st(0) # sched: [1:0.25]
+; SKX-NEXT: fstpl (%edx) # sched: [1:1.00]
+; SKX-NEXT: fstpl (%ecx) # sched: [1:1.00]
+; SKX-NEXT: fstpt (%eax) # sched: [1:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fst_fstp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fst %st(0) # sched: [1:0.50]
+; BTVER2-NEXT: fsts (%edx) # sched: [1:1.00]
+; BTVER2-NEXT: fstl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: fstp %st(0) # sched: [1:0.50]
+; BTVER2-NEXT: fstpl (%edx) # sched: [1:1.00]
+; BTVER2-NEXT: fstpl (%ecx) # sched: [1:1.00]
+; BTVER2-NEXT: fstpt (%eax) # sched: [1:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fst_fstp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %edx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fst %st(0) # sched: [5:0.50]
+; ZNVER1-NEXT: fsts (%edx) # sched: [1:0.50]
+; ZNVER1-NEXT: fstl (%ecx) # sched: [1:0.50]
+; ZNVER1-NEXT: fstp %st(0) # sched: [5:0.50]
+; ZNVER1-NEXT: fstpl (%edx) # sched: [1:0.50]
+; ZNVER1-NEXT: fstpl (%ecx) # sched: [1:0.50]
+; ZNVER1-NEXT: fstpt (%eax) # sched: [5:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fst %st(0) \0A\09 fsts $0 \0A\09 fstl $1 \0A\09 fstp %st(0) \0A\09 fstpl $0 \0A\09 fstpl $1 \0A\09 fstpt $2", "*m,*m,*m"(i16* %a0, i32* %a1, i64 *%a2) nounwind
+ ret void
+}
+
+define void @test_fstcw_fstenv_fstsw(i8* %a0) optsize {
+; GENERIC-LABEL: test_fstcw_fstenv_fstsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: fnstcw (%eax)
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: fnstenv (%eax)
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: fnstsw (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fstcw_fstenv_fstsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: fnstcw (%eax) # sched: [8:4.00]
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: fnstenv (%eax)
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: fnstsw (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fstcw_fstenv_fstsw:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: fnstcw (%eax) # sched: [1:0.50]
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: fnstenv (%eax) # sched: [100:1.00]
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: fnstsw (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fstcw_fstenv_fstsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: fnstcw (%eax) # sched: [7:1.00]
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: fnstenv (%eax) # sched: [100:0.33]
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: fnstsw (%eax) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fstcw_fstenv_fstsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: fnstenv (%eax) # sched: [115:19.50]
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: fnstsw (%eax) # sched: [4:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fstcw_fstenv_fstsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: fnstenv (%eax) # sched: [115:19.50]
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: fnstsw (%eax) # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fstcw_fstenv_fstsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: fnstenv (%eax) # sched: [106:19.50]
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: fnstsw (%eax) # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fstcw_fstenv_fstsw:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: fnstenv (%eax) # sched: [106:19.50]
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: fnstsw (%eax) # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fstcw_fstenv_fstsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: fnstcw (%eax) # sched: [1:0.50]
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: fnstenv (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: fnstsw (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fstcw_fstenv_fstsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: fnstcw (%eax) # sched: [100:?]
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: fnstenv (%eax) # sched: [100:?]
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: fnstsw (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fstcw $0 \0A\09 fstenv $0 \0A\09 fstsw $0", "*m"(i8* %a0) nounwind
+ ret void
+}
+
+define void @test_fnstcw_fnstenv_fnstsw(i8* %a0) optsize {
+; GENERIC-LABEL: test_fnstcw_fnstenv_fnstsw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fnstcw (%eax)
+; GENERIC-NEXT: fnstenv (%eax)
+; GENERIC-NEXT: fnstsw (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fnstcw_fnstenv_fnstsw:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fnstcw (%eax) # sched: [8:4.00]
+; ATOM-NEXT: fnstenv (%eax)
+; ATOM-NEXT: fnstsw (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fnstcw_fnstenv_fnstsw:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fnstcw (%eax) # sched: [1:0.50]
+; SLM-NEXT: fnstenv (%eax) # sched: [100:1.00]
+; SLM-NEXT: fnstsw (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fnstcw_fnstenv_fnstsw:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fnstcw (%eax) # sched: [7:1.00]
+; SANDY-NEXT: fnstenv (%eax) # sched: [100:0.33]
+; SANDY-NEXT: fnstsw (%eax) # sched: [7:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fnstcw_fnstenv_fnstsw:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; HASWELL-NEXT: fnstenv (%eax) # sched: [115:19.50]
+; HASWELL-NEXT: fnstsw (%eax) # sched: [4:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fnstcw_fnstenv_fnstsw:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; BROADWELL-NEXT: fnstenv (%eax) # sched: [115:19.50]
+; BROADWELL-NEXT: fnstsw (%eax) # sched: [4:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fnstcw_fnstenv_fnstsw:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; SKYLAKE-NEXT: fnstenv (%eax) # sched: [106:19.50]
+; SKYLAKE-NEXT: fnstsw (%eax) # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fnstcw_fnstenv_fnstsw:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fnstcw (%eax) # sched: [2:1.00]
+; SKX-NEXT: fnstenv (%eax) # sched: [106:19.50]
+; SKX-NEXT: fnstsw (%eax) # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fnstcw_fnstenv_fnstsw:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fnstcw (%eax) # sched: [1:0.50]
+; BTVER2-NEXT: fnstenv (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: fnstsw (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fnstcw_fnstenv_fnstsw:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fnstcw (%eax) # sched: [100:?]
+; ZNVER1-NEXT: fnstenv (%eax) # sched: [100:?]
+; ZNVER1-NEXT: fnstsw (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fnstcw $0 \0A\09 fnstenv $0 \0A\09 fnstsw $0", "*m"(i8* %a0) nounwind
+ ret void
+}
+
+define void @test_fsub(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fsub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsub %st(0), %st(1)
+; GENERIC-NEXT: fsub %st(2)
+; GENERIC-NEXT: fsubs (%ecx)
+; GENERIC-NEXT: fsubl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsub:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsub %st(0), %st(1)
+; ATOM-NEXT: fsub %st(2)
+; ATOM-NEXT: fsubs (%ecx)
+; ATOM-NEXT: fsubl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsub:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; SLM-NEXT: fsub %st(2) # sched: [3:1.00]
+; SLM-NEXT: fsubs (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fsubl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsub:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; SANDY-NEXT: fsub %st(2) # sched: [3:1.00]
+; SANDY-NEXT: fsubs (%ecx) # sched: [10:1.00]
+; SANDY-NEXT: fsubl (%eax) # sched: [10:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsub:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; HASWELL-NEXT: fsub %st(2) # sched: [3:1.00]
+; HASWELL-NEXT: fsubs (%ecx) # sched: [10:1.00]
+; HASWELL-NEXT: fsubl (%eax) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; BROADWELL-NEXT: fsub %st(2) # sched: [3:1.00]
+; BROADWELL-NEXT: fsubs (%ecx) # sched: [9:1.00]
+; BROADWELL-NEXT: fsubl (%eax) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; SKYLAKE-NEXT: fsub %st(2) # sched: [3:1.00]
+; SKYLAKE-NEXT: fsubs (%ecx) # sched: [10:1.00]
+; SKYLAKE-NEXT: fsubl (%eax) # sched: [10:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsub:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; SKX-NEXT: fsub %st(2) # sched: [3:1.00]
+; SKX-NEXT: fsubs (%ecx) # sched: [10:1.00]
+; SKX-NEXT: fsubl (%eax) # sched: [10:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsub:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fsub %st(2) # sched: [3:1.00]
+; BTVER2-NEXT: fsubs (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fsubl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsub:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsub %st(0), %st(1) # sched: [3:1.00]
+; ZNVER1-NEXT: fsub %st(2) # sched: [3:1.00]
+; ZNVER1-NEXT: fsubs (%ecx) # sched: [10:1.00]
+; ZNVER1-NEXT: fsubl (%eax) # sched: [10:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsub %st(0), %st(1) \0A\09 fsub %st(2), %st(0) \0A\09 fsubs $0 \0A\09 fsubl $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
+define void @test_fsubp_fisub(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_fsubp_fisub:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsubp %st(1)
+; GENERIC-NEXT: fsubp %st(2)
+; GENERIC-NEXT: fisubs (%ecx)
+; GENERIC-NEXT: fisubl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsubp_fisub:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsubp %st(1)
+; ATOM-NEXT: fsubp %st(2)
+; ATOM-NEXT: fisubs (%ecx)
+; ATOM-NEXT: fisubl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsubp_fisub:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fsubp %st(1) # sched: [3:1.00]
+; SLM-NEXT: fsubp %st(2) # sched: [3:1.00]
+; SLM-NEXT: fisubs (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fisubl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsubp_fisub:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsubp %st(1) # sched: [3:1.00]
+; SANDY-NEXT: fsubp %st(2) # sched: [3:1.00]
+; SANDY-NEXT: fisubs (%ecx) # sched: [13:2.00]
+; SANDY-NEXT: fisubl (%eax) # sched: [13:2.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsubp_fisub:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsubp %st(1) # sched: [3:1.00]
+; HASWELL-NEXT: fsubp %st(2) # sched: [3:1.00]
+; HASWELL-NEXT: fisubs (%ecx) # sched: [13:2.00]
+; HASWELL-NEXT: fisubl (%eax) # sched: [13:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsubp_fisub:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsubp %st(1) # sched: [3:1.00]
+; BROADWELL-NEXT: fsubp %st(2) # sched: [3:1.00]
+; BROADWELL-NEXT: fisubs (%ecx) # sched: [12:2.00]
+; BROADWELL-NEXT: fisubl (%eax) # sched: [12:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsubp_fisub:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsubp %st(1) # sched: [3:1.00]
+; SKYLAKE-NEXT: fsubp %st(2) # sched: [3:1.00]
+; SKYLAKE-NEXT: fisubs (%ecx) # sched: [13:2.00]
+; SKYLAKE-NEXT: fisubl (%eax) # sched: [13:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsubp_fisub:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fsubp %st(1) # sched: [3:1.00]
+; SKX-NEXT: fsubp %st(2) # sched: [3:1.00]
+; SKX-NEXT: fisubs (%ecx) # sched: [13:2.00]
+; SKX-NEXT: fisubl (%eax) # sched: [13:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsubp_fisub:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsubp %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fsubp %st(2) # sched: [3:1.00]
+; BTVER2-NEXT: fisubs (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fisubl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsubp_fisub:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsubp %st(1) # sched: [3:1.00]
+; ZNVER1-NEXT: fsubp %st(2) # sched: [3:1.00]
+; ZNVER1-NEXT: fisubs (%ecx) # sched: [10:1.00]
+; ZNVER1-NEXT: fisubl (%eax) # sched: [10:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsubp \0A\09 fsubp %st(2), %st(0) \0A\09 fisubs $0 \0A\09 fisubl $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
+define void @test_fsubr(float *%a0, double *%a1) optsize {
+; GENERIC-LABEL: test_fsubr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsubr %st(0), %st(1)
+; GENERIC-NEXT: fsubr %st(2)
+; GENERIC-NEXT: fsubrs (%ecx)
+; GENERIC-NEXT: fsubrl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsubr:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsubr %st(0), %st(1)
+; ATOM-NEXT: fsubr %st(2)
+; ATOM-NEXT: fsubrs (%ecx)
+; ATOM-NEXT: fsubrl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsubr:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; SLM-NEXT: fsubr %st(2) # sched: [3:1.00]
+; SLM-NEXT: fsubrs (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fsubrl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsubr:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; SANDY-NEXT: fsubr %st(2) # sched: [3:1.00]
+; SANDY-NEXT: fsubrs (%ecx) # sched: [10:1.00]
+; SANDY-NEXT: fsubrl (%eax) # sched: [10:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsubr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; HASWELL-NEXT: fsubr %st(2) # sched: [3:1.00]
+; HASWELL-NEXT: fsubrs (%ecx) # sched: [10:1.00]
+; HASWELL-NEXT: fsubrl (%eax) # sched: [10:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsubr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; BROADWELL-NEXT: fsubr %st(2) # sched: [3:1.00]
+; BROADWELL-NEXT: fsubrs (%ecx) # sched: [9:1.00]
+; BROADWELL-NEXT: fsubrl (%eax) # sched: [9:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsubr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; SKYLAKE-NEXT: fsubr %st(2) # sched: [3:1.00]
+; SKYLAKE-NEXT: fsubrs (%ecx) # sched: [10:1.00]
+; SKYLAKE-NEXT: fsubrl (%eax) # sched: [10:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsubr:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; SKX-NEXT: fsubr %st(2) # sched: [3:1.00]
+; SKX-NEXT: fsubrs (%ecx) # sched: [10:1.00]
+; SKX-NEXT: fsubrl (%eax) # sched: [10:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsubr:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fsubr %st(2) # sched: [3:1.00]
+; BTVER2-NEXT: fsubrs (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fsubrl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsubr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsubr %st(0), %st(1) # sched: [3:1.00]
+; ZNVER1-NEXT: fsubr %st(2) # sched: [3:1.00]
+; ZNVER1-NEXT: fsubrs (%ecx) # sched: [10:1.00]
+; ZNVER1-NEXT: fsubrl (%eax) # sched: [10:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsubr %st(0), %st(1) \0A\09 fsubr %st(2), %st(0) \0A\09 fsubrs $0 \0A\09 fsubrl $1", "*m,*m"(float *%a0, double *%a1) nounwind
+ ret void
+}
+
+define void @test_fsubrp_fisubr(i16 *%a0, i32 *%a1) optsize {
+; GENERIC-LABEL: test_fsubrp_fisubr:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fsubrp %st(1)
+; GENERIC-NEXT: fsubrp %st(2)
+; GENERIC-NEXT: fisubrs (%ecx)
+; GENERIC-NEXT: fisubrl (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fsubrp_fisubr:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fsubrp %st(1)
+; ATOM-NEXT: fsubrp %st(2)
+; ATOM-NEXT: fisubrs (%ecx)
+; ATOM-NEXT: fisubrl (%eax)
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fsubrp_fisubr:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; SLM-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; SLM-NEXT: fisubrs (%ecx) # sched: [6:1.00]
+; SLM-NEXT: fisubrl (%eax) # sched: [6:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fsubrp_fisubr:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; SANDY-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; SANDY-NEXT: fisubrs (%ecx) # sched: [13:2.00]
+; SANDY-NEXT: fisubrl (%eax) # sched: [13:2.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fsubrp_fisubr:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; HASWELL-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; HASWELL-NEXT: fisubrs (%ecx) # sched: [13:2.00]
+; HASWELL-NEXT: fisubrl (%eax) # sched: [13:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fsubrp_fisubr:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; BROADWELL-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; BROADWELL-NEXT: fisubrs (%ecx) # sched: [12:2.00]
+; BROADWELL-NEXT: fisubrl (%eax) # sched: [12:2.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fsubrp_fisubr:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; SKYLAKE-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; SKYLAKE-NEXT: fisubrs (%ecx) # sched: [13:2.00]
+; SKYLAKE-NEXT: fisubrl (%eax) # sched: [13:2.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fsubrp_fisubr:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; SKX-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; SKX-NEXT: fisubrs (%ecx) # sched: [13:2.00]
+; SKX-NEXT: fisubrl (%eax) # sched: [13:2.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fsubrp_fisubr:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; BTVER2-NEXT: fisubrs (%ecx) # sched: [8:1.00]
+; BTVER2-NEXT: fisubrl (%eax) # sched: [8:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fsubrp_fisubr:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %ecx # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fsubrp %st(1) # sched: [3:1.00]
+; ZNVER1-NEXT: fsubrp %st(2) # sched: [3:1.00]
+; ZNVER1-NEXT: fisubrs (%ecx) # sched: [10:1.00]
+; ZNVER1-NEXT: fisubrl (%eax) # sched: [10:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fsubrp \0A\09 fsubrp %st(2), %st(0) \0A\09 fisubrs $0 \0A\09 fisubrl $1", "*m,*m"(i16 *%a0, i32 *%a1) nounwind
+ ret void
+}
+
+define void @test_ftst() optsize {
+; GENERIC-LABEL: test_ftst:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: ftst
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_ftst:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: ftst # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_ftst:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: ftst # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_ftst:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: ftst # sched: [3:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_ftst:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: ftst # sched: [1:1.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_ftst:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: ftst # sched: [3:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_ftst:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: ftst # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_ftst:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: ftst # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_ftst:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: ftst # sched: [3:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_ftst:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: ftst # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "ftst", ""() nounwind
+ ret void
+}
+
+define void @test_fucom_fucomp_fucompp() optsize {
+; GENERIC-LABEL: test_fucom_fucomp_fucompp:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fucom %st(1)
+; GENERIC-NEXT: fucom %st(3)
+; GENERIC-NEXT: fucomp %st(1)
+; GENERIC-NEXT: fucomp %st(3)
+; GENERIC-NEXT: fucompp
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fucom_fucomp_fucompp:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fucom %st(1) # sched: [1:1.00]
+; ATOM-NEXT: fucom %st(3) # sched: [1:1.00]
+; ATOM-NEXT: fucomp %st(1) # sched: [1:1.00]
+; ATOM-NEXT: fucomp %st(3) # sched: [1:1.00]
+; ATOM-NEXT: fucompp # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fucom_fucomp_fucompp:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fucom %st(1) # sched: [3:1.00]
+; SLM-NEXT: fucom %st(3) # sched: [3:1.00]
+; SLM-NEXT: fucomp %st(1) # sched: [3:1.00]
+; SLM-NEXT: fucomp %st(3) # sched: [3:1.00]
+; SLM-NEXT: fucompp # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fucom_fucomp_fucompp:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fucom %st(1) # sched: [1:1.00]
+; SANDY-NEXT: fucom %st(3) # sched: [1:1.00]
+; SANDY-NEXT: fucomp %st(1) # sched: [1:1.00]
+; SANDY-NEXT: fucomp %st(3) # sched: [1:1.00]
+; SANDY-NEXT: fucompp # sched: [3:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fucom_fucomp_fucompp:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fucom %st(1) # sched: [1:1.00]
+; HASWELL-NEXT: fucom %st(3) # sched: [1:1.00]
+; HASWELL-NEXT: fucomp %st(1) # sched: [1:1.00]
+; HASWELL-NEXT: fucomp %st(3) # sched: [1:1.00]
+; HASWELL-NEXT: fucompp # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fucom_fucomp_fucompp:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fucom %st(1) # sched: [1:1.00]
+; BROADWELL-NEXT: fucom %st(3) # sched: [1:1.00]
+; BROADWELL-NEXT: fucomp %st(1) # sched: [1:1.00]
+; BROADWELL-NEXT: fucomp %st(3) # sched: [1:1.00]
+; BROADWELL-NEXT: fucompp # sched: [3:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fucom_fucomp_fucompp:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fucom %st(1) # sched: [1:1.00]
+; SKYLAKE-NEXT: fucom %st(3) # sched: [1:1.00]
+; SKYLAKE-NEXT: fucomp %st(1) # sched: [1:1.00]
+; SKYLAKE-NEXT: fucomp %st(3) # sched: [1:1.00]
+; SKYLAKE-NEXT: fucompp # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fucom_fucomp_fucompp:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fucom %st(1) # sched: [1:1.00]
+; SKX-NEXT: fucom %st(3) # sched: [1:1.00]
+; SKX-NEXT: fucomp %st(1) # sched: [1:1.00]
+; SKX-NEXT: fucomp %st(3) # sched: [1:1.00]
+; SKX-NEXT: fucompp # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fucom_fucomp_fucompp:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fucom %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fucom %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: fucomp %st(1) # sched: [3:1.00]
+; BTVER2-NEXT: fucomp %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: fucompp # sched: [3:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fucom_fucomp_fucompp:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fucom %st(1) # sched: [1:1.00]
+; ZNVER1-NEXT: fucom %st(3) # sched: [1:1.00]
+; ZNVER1-NEXT: fucomp %st(1) # sched: [1:1.00]
+; ZNVER1-NEXT: fucomp %st(3) # sched: [1:1.00]
+; ZNVER1-NEXT: fucompp # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fucom \0A\09 fucom %st(3) \0A\09 fucomp \0A\09 fucomp %st(3) \0A\09 fucompp", ""() nounwind
+ ret void
+}
+
+define void @test_fucomi_fucomip() optsize {
+; GENERIC-LABEL: test_fucomi_fucomip:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fucomi %st(3)
+; GENERIC-NEXT: fucompi %st(3)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fucomi_fucomip:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fucomi %st(3) # sched: [9:4.50]
+; ATOM-NEXT: fucompi %st(3) # sched: [9:4.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fucomi_fucomip:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fucomi %st(3) # sched: [3:1.00]
+; SLM-NEXT: fucompi %st(3) # sched: [3:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fucomi_fucomip:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fucomi %st(3) # sched: [3:1.00]
+; SANDY-NEXT: fucompi %st(3) # sched: [3:1.00]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fucomi_fucomip:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fucomi %st(3) # sched: [1:0.50]
+; HASWELL-NEXT: fucompi %st(3) # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fucomi_fucomip:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fucomi %st(3) # sched: [3:1.00]
+; BROADWELL-NEXT: fucompi %st(3) # sched: [3:1.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fucomi_fucomip:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fucomi %st(3) # sched: [3:1.00]
+; SKYLAKE-NEXT: fucompi %st(3) # sched: [3:1.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fucomi_fucomip:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fucomi %st(3) # sched: [3:1.00]
+; SKX-NEXT: fucompi %st(3) # sched: [3:1.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fucomi_fucomip:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fucomi %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: fucompi %st(3) # sched: [3:1.00]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fucomi_fucomip:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fucomi %st(3) # sched: [9:0.50]
+; ZNVER1-NEXT: fucompi %st(3) # sched: [9:0.50]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fucomi %st(3) \0A\09 fucomip %st(3)", ""() nounwind
+ ret void
+}
+
+define void @test_fwait() optsize {
+; GENERIC-LABEL: test_fwait:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: wait
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fwait:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: wait # sched: [1:0.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fwait:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: wait # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fwait:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: wait # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fwait:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: wait # sched: [1:0.50]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fwait:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: wait # sched: [2:0.50]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fwait:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: wait # sched: [2:0.50]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fwait:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: wait # sched: [2:0.50]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fwait:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: wait # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fwait:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: wait # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fwait", ""() nounwind
+ ret void
+}
+
+define void @test_fxam() optsize {
+; GENERIC-LABEL: test_fxam:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fxam
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fxam:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fxam # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fxam:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fxam # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fxam:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fxam # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fxam:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fxam # sched: [1:2.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fxam:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fxam # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fxam:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fxam # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fxam:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fxam # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fxam:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fxam # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fxam:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fxam # sched: [1:1.00]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fxam", ""() nounwind
+ ret void
+}
+
+define void @test_fxch() optsize {
+; GENERIC-LABEL: test_fxch:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fxch %st(1)
+; GENERIC-NEXT: fxch %st(3)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fxch:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fxch %st(1) # sched: [1:1.00]
+; ATOM-NEXT: fxch %st(3) # sched: [1:1.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fxch:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fxch %st(1) # sched: [1:0.50]
+; SLM-NEXT: fxch %st(3) # sched: [1:0.50]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fxch:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fxch %st(1) # sched: [1:0.33]
+; SANDY-NEXT: fxch %st(3) # sched: [1:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fxch:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fxch %st(1) # sched: [17:4.00]
+; HASWELL-NEXT: fxch %st(3) # sched: [17:4.00]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fxch:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fxch %st(1) # sched: [14:4.00]
+; BROADWELL-NEXT: fxch %st(3) # sched: [14:4.00]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fxch:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fxch %st(1) # sched: [17:4.00]
+; SKYLAKE-NEXT: fxch %st(3) # sched: [17:4.00]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fxch:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fxch %st(1) # sched: [17:4.00]
+; SKX-NEXT: fxch %st(3) # sched: [17:4.00]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fxch:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fxch %st(1) # sched: [1:0.50]
+; BTVER2-NEXT: fxch %st(3) # sched: [1:0.50]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fxch:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fxch %st(1) # sched: [1:0.25]
+; ZNVER1-NEXT: fxch %st(3) # sched: [1:0.25]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fxch \0A\09 fxch %st(3)", ""() nounwind
+ ret void
+}
+
+define void @test_fxrstor_fxsave(i8* %a0) optsize {
+; GENERIC-LABEL: test_fxrstor_fxsave:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: movl {{[0-9]+}}(%esp), %eax
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fxrstor (%eax)
+; GENERIC-NEXT: fxsave (%eax)
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fxrstor_fxsave:
+; ATOM: # %bb.0:
+; ATOM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [1:1.00]
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fxrstor (%eax) # sched: [141:70.50]
+; ATOM-NEXT: fxsave (%eax) # sched: [140:70.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fxrstor_fxsave:
+; SLM: # %bb.0:
+; SLM-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [3:1.00]
+; SLM-NEXT: #APP
+; SLM-NEXT: fxrstor (%eax) # sched: [100:1.00]
+; SLM-NEXT: fxsave (%eax) # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fxrstor_fxsave:
+; SANDY: # %bb.0:
+; SANDY-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fxrstor (%eax) # sched: [5:2.00]
+; SANDY-NEXT: fxsave (%eax) # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fxrstor_fxsave:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fxrstor (%eax) # sched: [64:16.50]
+; HASWELL-NEXT: fxsave (%eax) # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fxrstor_fxsave:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fxrstor (%eax) # sched: [63:16.50]
+; BROADWELL-NEXT: fxsave (%eax) # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fxrstor_fxsave:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fxrstor (%eax) # sched: [63:16.50]
+; SKYLAKE-NEXT: fxsave (%eax) # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fxrstor_fxsave:
+; SKX: # %bb.0:
+; SKX-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:0.50]
+; SKX-NEXT: #APP
+; SKX-NEXT: fxrstor (%eax) # sched: [63:16.50]
+; SKX-NEXT: fxsave (%eax) # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fxrstor_fxsave:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [5:1.00]
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fxrstor (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: fxsave (%eax) # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fxrstor_fxsave:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: movl {{[0-9]+}}(%esp), %eax # sched: [8:0.50]
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fxrstor (%eax) # sched: [100:?]
+; ZNVER1-NEXT: fxsave (%eax) # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fxrstor $0 \0A\09 fxsave $0", "*m"(i8 *%a0) nounwind
+ ret void
+}
+
+define void @test_fxtract() optsize {
+; GENERIC-LABEL: test_fxtract:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fxtract
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fxtract:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fxtract # sched: [25:12.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fxtract:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fxtract # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fxtract:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fxtract # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fxtract:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fxtract # sched: [15:?]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fxtract:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fxtract # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fxtract:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fxtract # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fxtract:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fxtract # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fxtract:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fxtract # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fxtract:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fxtract # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fxtract", ""() nounwind
+ ret void
+}
+
+define void @test_fyl2x() optsize {
+; GENERIC-LABEL: test_fyl2x:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fyl2x
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fyl2x:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fyl2x # sched: [146:73.00]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fyl2x:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fyl2x # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fyl2x:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fyl2x # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fyl2x:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fyl2x # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fyl2x:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fyl2x # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fyl2x:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fyl2x # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fyl2x:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fyl2x # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fyl2x:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fyl2x # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fyl2x:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fyl2x # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fyl2x", ""() nounwind
+ ret void
+}
+
+define void @test_fyl2xp1() optsize {
+; GENERIC-LABEL: test_fyl2xp1:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: fyl2xp1
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retl
+;
+; ATOM-LABEL: test_fyl2xp1:
+; ATOM: # %bb.0:
+; ATOM-NEXT: #APP
+; ATOM-NEXT: fyl2xp1 # sched: [147:73.50]
+; ATOM-NEXT: #NO_APP
+; ATOM-NEXT: retl # sched: [79:39.50]
+;
+; SLM-LABEL: test_fyl2xp1:
+; SLM: # %bb.0:
+; SLM-NEXT: #APP
+; SLM-NEXT: fyl2xp1 # sched: [100:1.00]
+; SLM-NEXT: #NO_APP
+; SLM-NEXT: retl # sched: [4:1.00]
+;
+; SANDY-LABEL: test_fyl2xp1:
+; SANDY: # %bb.0:
+; SANDY-NEXT: #APP
+; SANDY-NEXT: fyl2xp1 # sched: [100:0.33]
+; SANDY-NEXT: #NO_APP
+; SANDY-NEXT: retl # sched: [5:1.00]
+;
+; HASWELL-LABEL: test_fyl2xp1:
+; HASWELL: # %bb.0:
+; HASWELL-NEXT: #APP
+; HASWELL-NEXT: fyl2xp1 # sched: [100:0.25]
+; HASWELL-NEXT: #NO_APP
+; HASWELL-NEXT: retl # sched: [7:1.00]
+;
+; BROADWELL-LABEL: test_fyl2xp1:
+; BROADWELL: # %bb.0:
+; BROADWELL-NEXT: #APP
+; BROADWELL-NEXT: fyl2xp1 # sched: [100:0.25]
+; BROADWELL-NEXT: #NO_APP
+; BROADWELL-NEXT: retl # sched: [6:0.50]
+;
+; SKYLAKE-LABEL: test_fyl2xp1:
+; SKYLAKE: # %bb.0:
+; SKYLAKE-NEXT: #APP
+; SKYLAKE-NEXT: fyl2xp1 # sched: [100:0.25]
+; SKYLAKE-NEXT: #NO_APP
+; SKYLAKE-NEXT: retl # sched: [6:0.50]
+;
+; SKX-LABEL: test_fyl2xp1:
+; SKX: # %bb.0:
+; SKX-NEXT: #APP
+; SKX-NEXT: fyl2xp1 # sched: [100:0.25]
+; SKX-NEXT: #NO_APP
+; SKX-NEXT: retl # sched: [6:0.50]
+;
+; BTVER2-LABEL: test_fyl2xp1:
+; BTVER2: # %bb.0:
+; BTVER2-NEXT: #APP
+; BTVER2-NEXT: fyl2xp1 # sched: [100:0.17]
+; BTVER2-NEXT: #NO_APP
+; BTVER2-NEXT: retl # sched: [4:1.00]
+;
+; ZNVER1-LABEL: test_fyl2xp1:
+; ZNVER1: # %bb.0:
+; ZNVER1-NEXT: #APP
+; ZNVER1-NEXT: fyl2xp1 # sched: [100:?]
+; ZNVER1-NEXT: #NO_APP
+; ZNVER1-NEXT: retl # sched: [1:0.50]
+ tail call void asm sideeffect "fyl2xp1", ""() nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll
index 9bc654861b69..f70f18571046 100644
--- a/test/CodeGen/X86/x87.ll
+++ b/test/CodeGen/X86/x87.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87
-; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s -check-prefix=X87
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse | FileCheck %s -check-prefix=X87
+; RUN: llc < %s -mtriple=i686-- -mattr=-x87 | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -mtriple=i686-- -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87
define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
; X87-LABEL: test:
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index 25fd21d80c60..7d4cd2202483 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -9,14 +9,14 @@
; SADDO reg, reg
define zeroext i1 @saddoi8(i8 signext %v1, i8 signext %v2, i8* %res) {
; SDAG-LABEL: saddoi8:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addb %sil, %dil
; SDAG-NEXT: seto %al
; SDAG-NEXT: movb %dil, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi8:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addb %sil, %dil
; FAST-NEXT: seto %al
; FAST-NEXT: movb %dil, (%rdx)
@@ -25,7 +25,7 @@ define zeroext i1 @saddoi8(i8 signext %v1, i8 signext %v2, i8* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addb %sil, %dil
; KNL-NEXT: seto %al
; KNL-NEXT: movb %dil, (%rdx)
@@ -39,14 +39,14 @@ define zeroext i1 @saddoi8(i8 signext %v1, i8 signext %v2, i8* %res) {
define zeroext i1 @saddoi16(i16 %v1, i16 %v2, i16* %res) {
; SDAG-LABEL: saddoi16:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addw %si, %di
; SDAG-NEXT: seto %al
; SDAG-NEXT: movw %di, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi16:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addw %si, %di
; FAST-NEXT: seto %al
; FAST-NEXT: movw %di, (%rdx)
@@ -55,7 +55,7 @@ define zeroext i1 @saddoi16(i16 %v1, i16 %v2, i16* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addw %si, %di
; KNL-NEXT: seto %al
; KNL-NEXT: movw %di, (%rdx)
@@ -69,14 +69,14 @@ define zeroext i1 @saddoi16(i16 %v1, i16 %v2, i16* %res) {
define zeroext i1 @saddoi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-LABEL: saddoi32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addl %esi, %edi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movl %edi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addl %esi, %edi
; FAST-NEXT: seto %al
; FAST-NEXT: movl %edi, (%rdx)
@@ -85,7 +85,7 @@ define zeroext i1 @saddoi32(i32 %v1, i32 %v2, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addl %esi, %edi
; KNL-NEXT: seto %al
; KNL-NEXT: movl %edi, (%rdx)
@@ -99,14 +99,14 @@ define zeroext i1 @saddoi32(i32 %v1, i32 %v2, i32* %res) {
define zeroext i1 @saddoi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-LABEL: saddoi64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq %rsi, %rdi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movq %rdi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq %rsi, %rdi
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rdx)
@@ -115,7 +115,7 @@ define zeroext i1 @saddoi64(i64 %v1, i64 %v2, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq %rsi, %rdi
; KNL-NEXT: seto %al
; KNL-NEXT: movq %rdi, (%rdx)
@@ -130,14 +130,14 @@ define zeroext i1 @saddoi64(i64 %v1, i64 %v2, i64* %res) {
; SADDO reg, 1 | INC
define zeroext i1 @saddoinci8(i8 %v1, i8* %res) {
; SDAG-LABEL: saddoinci8:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: incb %dil
; SDAG-NEXT: seto %al
; SDAG-NEXT: movb %dil, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoinci8:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: incb %dil
; FAST-NEXT: seto %al
; FAST-NEXT: movb %dil, (%rsi)
@@ -146,7 +146,7 @@ define zeroext i1 @saddoinci8(i8 %v1, i8* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoinci8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: incb %dil
; KNL-NEXT: seto %al
; KNL-NEXT: movb %dil, (%rsi)
@@ -160,14 +160,14 @@ define zeroext i1 @saddoinci8(i8 %v1, i8* %res) {
define zeroext i1 @saddoinci16(i16 %v1, i16* %res) {
; SDAG-LABEL: saddoinci16:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: incw %di
; SDAG-NEXT: seto %al
; SDAG-NEXT: movw %di, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoinci16:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: incw %di
; FAST-NEXT: seto %al
; FAST-NEXT: movw %di, (%rsi)
@@ -176,7 +176,7 @@ define zeroext i1 @saddoinci16(i16 %v1, i16* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoinci16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: incw %di
; KNL-NEXT: seto %al
; KNL-NEXT: movw %di, (%rsi)
@@ -190,14 +190,14 @@ define zeroext i1 @saddoinci16(i16 %v1, i16* %res) {
define zeroext i1 @saddoinci32(i32 %v1, i32* %res) {
; SDAG-LABEL: saddoinci32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: incl %edi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movl %edi, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoinci32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: incl %edi
; FAST-NEXT: seto %al
; FAST-NEXT: movl %edi, (%rsi)
@@ -206,7 +206,7 @@ define zeroext i1 @saddoinci32(i32 %v1, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoinci32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: incl %edi
; KNL-NEXT: seto %al
; KNL-NEXT: movl %edi, (%rsi)
@@ -220,14 +220,14 @@ define zeroext i1 @saddoinci32(i32 %v1, i32* %res) {
define zeroext i1 @saddoinci64(i64 %v1, i64* %res) {
; SDAG-LABEL: saddoinci64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: incq %rdi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movq %rdi, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoinci64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: incq %rdi
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rsi)
@@ -236,7 +236,7 @@ define zeroext i1 @saddoinci64(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoinci64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: incq %rdi
; KNL-NEXT: seto %al
; KNL-NEXT: movq %rdi, (%rsi)
@@ -252,7 +252,7 @@ define zeroext i1 @saddoinci64(i64 %v1, i64* %res) {
; FIXME: DAG doesn't optimize immediates on the LHS.
define zeroext i1 @saddoi64imm1(i64 %v1, i64* %res) {
; SDAG-LABEL: saddoi64imm1:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl $2, %ecx
; SDAG-NEXT: addq %rdi, %rcx
; SDAG-NEXT: seto %al
@@ -260,7 +260,7 @@ define zeroext i1 @saddoi64imm1(i64 %v1, i64* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi64imm1:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq $2, %rdi
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rsi)
@@ -269,7 +269,7 @@ define zeroext i1 @saddoi64imm1(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi64imm1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl $2, %ecx
; KNL-NEXT: addq %rdi, %rcx
; KNL-NEXT: seto %al
@@ -285,14 +285,14 @@ define zeroext i1 @saddoi64imm1(i64 %v1, i64* %res) {
; Check boundary conditions for large immediates.
define zeroext i1 @saddoi64imm2(i64 %v1, i64* %res) {
; SDAG-LABEL: saddoi64imm2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq $-2147483648, %rdi ## imm = 0x80000000
; SDAG-NEXT: seto %al
; SDAG-NEXT: movq %rdi, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi64imm2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq $-2147483648, %rdi ## imm = 0x80000000
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rsi)
@@ -301,7 +301,7 @@ define zeroext i1 @saddoi64imm2(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi64imm2:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq $-2147483648, %rdi ## imm = 0x80000000
; KNL-NEXT: seto %al
; KNL-NEXT: movq %rdi, (%rsi)
@@ -315,7 +315,7 @@ define zeroext i1 @saddoi64imm2(i64 %v1, i64* %res) {
define zeroext i1 @saddoi64imm3(i64 %v1, i64* %res) {
; SDAG-LABEL: saddoi64imm3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movabsq $-21474836489, %rcx ## imm = 0xFFFFFFFAFFFFFFF7
; SDAG-NEXT: addq %rdi, %rcx
; SDAG-NEXT: seto %al
@@ -323,7 +323,7 @@ define zeroext i1 @saddoi64imm3(i64 %v1, i64* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi64imm3:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movabsq $-21474836489, %rax ## imm = 0xFFFFFFFAFFFFFFF7
; FAST-NEXT: addq %rdi, %rax
; FAST-NEXT: seto %cl
@@ -333,7 +333,7 @@ define zeroext i1 @saddoi64imm3(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi64imm3:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movabsq $-21474836489, %rcx ## imm = 0xFFFFFFFAFFFFFFF7
; KNL-NEXT: addq %rdi, %rcx
; KNL-NEXT: seto %al
@@ -348,14 +348,14 @@ define zeroext i1 @saddoi64imm3(i64 %v1, i64* %res) {
define zeroext i1 @saddoi64imm4(i64 %v1, i64* %res) {
; SDAG-LABEL: saddoi64imm4:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq $2147483647, %rdi ## imm = 0x7FFFFFFF
; SDAG-NEXT: seto %al
; SDAG-NEXT: movq %rdi, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi64imm4:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq $2147483647, %rdi ## imm = 0x7FFFFFFF
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rsi)
@@ -364,7 +364,7 @@ define zeroext i1 @saddoi64imm4(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi64imm4:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq $2147483647, %rdi ## imm = 0x7FFFFFFF
; KNL-NEXT: seto %al
; KNL-NEXT: movq %rdi, (%rsi)
@@ -378,7 +378,7 @@ define zeroext i1 @saddoi64imm4(i64 %v1, i64* %res) {
define zeroext i1 @saddoi64imm5(i64 %v1, i64* %res) {
; SDAG-LABEL: saddoi64imm5:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl $2147483648, %ecx ## imm = 0x80000000
; SDAG-NEXT: addq %rdi, %rcx
; SDAG-NEXT: seto %al
@@ -386,7 +386,7 @@ define zeroext i1 @saddoi64imm5(i64 %v1, i64* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoi64imm5:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl $2147483648, %eax ## imm = 0x80000000
; FAST-NEXT: addq %rdi, %rax
; FAST-NEXT: seto %cl
@@ -396,7 +396,7 @@ define zeroext i1 @saddoi64imm5(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoi64imm5:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl $2147483648, %ecx ## imm = 0x80000000
; KNL-NEXT: addq %rdi, %rcx
; KNL-NEXT: seto %al
@@ -412,14 +412,14 @@ define zeroext i1 @saddoi64imm5(i64 %v1, i64* %res) {
; UADDO
define zeroext i1 @uaddoi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-LABEL: uaddoi32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addl %esi, %edi
; SDAG-NEXT: setb %al
; SDAG-NEXT: movl %edi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoi32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addl %esi, %edi
; FAST-NEXT: setb %al
; FAST-NEXT: movl %edi, (%rdx)
@@ -428,7 +428,7 @@ define zeroext i1 @uaddoi32(i32 %v1, i32 %v2, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoi32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: movl %edi, (%rdx)
@@ -442,14 +442,14 @@ define zeroext i1 @uaddoi32(i32 %v1, i32 %v2, i32* %res) {
define zeroext i1 @uaddoi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-LABEL: uaddoi64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq %rsi, %rdi
; SDAG-NEXT: setb %al
; SDAG-NEXT: movq %rdi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoi64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq %rsi, %rdi
; FAST-NEXT: setb %al
; FAST-NEXT: movq %rdi, (%rdx)
@@ -458,7 +458,7 @@ define zeroext i1 @uaddoi64(i64 %v1, i64 %v2, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoi64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq %rsi, %rdi
; KNL-NEXT: setb %al
; KNL-NEXT: movq %rdi, (%rdx)
@@ -473,14 +473,14 @@ define zeroext i1 @uaddoi64(i64 %v1, i64 %v2, i64* %res) {
; UADDO reg, 1 | NOT INC
define zeroext i1 @uaddoinci8(i8 %v1, i8* %res) {
; SDAG-LABEL: uaddoinci8:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addb $1, %dil
; SDAG-NEXT: setb %al
; SDAG-NEXT: movb %dil, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoinci8:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addb $1, %dil
; FAST-NEXT: setb %al
; FAST-NEXT: movb %dil, (%rsi)
@@ -489,7 +489,7 @@ define zeroext i1 @uaddoinci8(i8 %v1, i8* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoinci8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addb $1, %dil
; KNL-NEXT: setb %al
; KNL-NEXT: movb %dil, (%rsi)
@@ -503,14 +503,14 @@ define zeroext i1 @uaddoinci8(i8 %v1, i8* %res) {
define zeroext i1 @uaddoinci16(i16 %v1, i16* %res) {
; SDAG-LABEL: uaddoinci16:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addw $1, %di
; SDAG-NEXT: setb %al
; SDAG-NEXT: movw %di, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoinci16:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addw $1, %di
; FAST-NEXT: setb %al
; FAST-NEXT: movw %di, (%rsi)
@@ -519,7 +519,7 @@ define zeroext i1 @uaddoinci16(i16 %v1, i16* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoinci16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addw $1, %di
; KNL-NEXT: setb %al
; KNL-NEXT: movw %di, (%rsi)
@@ -533,14 +533,14 @@ define zeroext i1 @uaddoinci16(i16 %v1, i16* %res) {
define zeroext i1 @uaddoinci32(i32 %v1, i32* %res) {
; SDAG-LABEL: uaddoinci32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addl $1, %edi
; SDAG-NEXT: setb %al
; SDAG-NEXT: movl %edi, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoinci32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addl $1, %edi
; FAST-NEXT: setb %al
; FAST-NEXT: movl %edi, (%rsi)
@@ -549,7 +549,7 @@ define zeroext i1 @uaddoinci32(i32 %v1, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoinci32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addl $1, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: movl %edi, (%rsi)
@@ -563,14 +563,14 @@ define zeroext i1 @uaddoinci32(i32 %v1, i32* %res) {
define zeroext i1 @uaddoinci64(i64 %v1, i64* %res) {
; SDAG-LABEL: uaddoinci64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq $1, %rdi
; SDAG-NEXT: setb %al
; SDAG-NEXT: movq %rdi, (%rsi)
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoinci64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq $1, %rdi
; FAST-NEXT: setb %al
; FAST-NEXT: movq %rdi, (%rsi)
@@ -579,7 +579,7 @@ define zeroext i1 @uaddoinci64(i64 %v1, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoinci64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq $1, %rdi
; KNL-NEXT: setb %al
; KNL-NEXT: movq %rdi, (%rsi)
@@ -594,14 +594,14 @@ define zeroext i1 @uaddoinci64(i64 %v1, i64* %res) {
; SSUBO
define zeroext i1 @ssuboi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-LABEL: ssuboi32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: subl %esi, %edi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movl %edi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: ssuboi32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: subl %esi, %edi
; FAST-NEXT: seto %al
; FAST-NEXT: movl %edi, (%rdx)
@@ -610,7 +610,7 @@ define zeroext i1 @ssuboi32(i32 %v1, i32 %v2, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: ssuboi32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: subl %esi, %edi
; KNL-NEXT: seto %al
; KNL-NEXT: movl %edi, (%rdx)
@@ -624,14 +624,14 @@ define zeroext i1 @ssuboi32(i32 %v1, i32 %v2, i32* %res) {
define zeroext i1 @ssuboi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-LABEL: ssuboi64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: subq %rsi, %rdi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movq %rdi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: ssuboi64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: subq %rsi, %rdi
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rdx)
@@ -640,7 +640,7 @@ define zeroext i1 @ssuboi64(i64 %v1, i64 %v2, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: ssuboi64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: subq %rsi, %rdi
; KNL-NEXT: seto %al
; KNL-NEXT: movq %rdi, (%rdx)
@@ -655,14 +655,14 @@ define zeroext i1 @ssuboi64(i64 %v1, i64 %v2, i64* %res) {
; USUBO
define zeroext i1 @usuboi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-LABEL: usuboi32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: subl %esi, %edi
; SDAG-NEXT: setb %al
; SDAG-NEXT: movl %edi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: usuboi32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: subl %esi, %edi
; FAST-NEXT: setb %al
; FAST-NEXT: movl %edi, (%rdx)
@@ -671,7 +671,7 @@ define zeroext i1 @usuboi32(i32 %v1, i32 %v2, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: usuboi32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: subl %esi, %edi
; KNL-NEXT: setb %al
; KNL-NEXT: movl %edi, (%rdx)
@@ -685,14 +685,14 @@ define zeroext i1 @usuboi32(i32 %v1, i32 %v2, i32* %res) {
define zeroext i1 @usuboi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-LABEL: usuboi64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: subq %rsi, %rdi
; SDAG-NEXT: setb %al
; SDAG-NEXT: movq %rdi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: usuboi64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: subq %rsi, %rdi
; FAST-NEXT: setb %al
; FAST-NEXT: movq %rdi, (%rdx)
@@ -701,7 +701,7 @@ define zeroext i1 @usuboi64(i64 %v1, i64 %v2, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: usuboi64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: subq %rsi, %rdi
; KNL-NEXT: setb %al
; KNL-NEXT: movq %rdi, (%rdx)
@@ -718,7 +718,7 @@ define zeroext i1 @usuboi64(i64 %v1, i64 %v2, i64* %res) {
;
define i32 @saddoselecti32(i32 %v1, i32 %v2) {
; SDAG-LABEL: saddoselecti32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: addl %esi, %eax
; SDAG-NEXT: cmovol %edi, %esi
@@ -726,7 +726,7 @@ define i32 @saddoselecti32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoselecti32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: addl %esi, %eax
; FAST-NEXT: cmovol %edi, %esi
@@ -734,7 +734,7 @@ define i32 @saddoselecti32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoselecti32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: addl %esi, %eax
; KNL-NEXT: cmovol %edi, %esi
@@ -748,7 +748,7 @@ define i32 @saddoselecti32(i32 %v1, i32 %v2) {
define i64 @saddoselecti64(i64 %v1, i64 %v2) {
; SDAG-LABEL: saddoselecti64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: addq %rsi, %rax
; SDAG-NEXT: cmovoq %rdi, %rsi
@@ -756,7 +756,7 @@ define i64 @saddoselecti64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddoselecti64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: addq %rsi, %rax
; FAST-NEXT: cmovoq %rdi, %rsi
@@ -764,7 +764,7 @@ define i64 @saddoselecti64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddoselecti64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: addq %rsi, %rax
; KNL-NEXT: cmovoq %rdi, %rsi
@@ -778,7 +778,7 @@ define i64 @saddoselecti64(i64 %v1, i64 %v2) {
define i32 @uaddoselecti32(i32 %v1, i32 %v2) {
; SDAG-LABEL: uaddoselecti32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: addl %esi, %eax
; SDAG-NEXT: cmovbl %edi, %esi
@@ -786,7 +786,7 @@ define i32 @uaddoselecti32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoselecti32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: addl %esi, %eax
; FAST-NEXT: cmovbl %edi, %esi
@@ -794,7 +794,7 @@ define i32 @uaddoselecti32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoselecti32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: addl %esi, %eax
; KNL-NEXT: cmovbl %edi, %esi
@@ -808,7 +808,7 @@ define i32 @uaddoselecti32(i32 %v1, i32 %v2) {
define i64 @uaddoselecti64(i64 %v1, i64 %v2) {
; SDAG-LABEL: uaddoselecti64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: addq %rsi, %rax
; SDAG-NEXT: cmovbq %rdi, %rsi
@@ -816,7 +816,7 @@ define i64 @uaddoselecti64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoselecti64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: addq %rsi, %rax
; FAST-NEXT: cmovbq %rdi, %rsi
@@ -824,7 +824,7 @@ define i64 @uaddoselecti64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoselecti64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: addq %rsi, %rax
; KNL-NEXT: cmovbq %rdi, %rsi
@@ -838,21 +838,21 @@ define i64 @uaddoselecti64(i64 %v1, i64 %v2) {
define i32 @ssuboselecti32(i32 %v1, i32 %v2) {
; SDAG-LABEL: ssuboselecti32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: cmovol %edi, %esi
; SDAG-NEXT: movl %esi, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: ssuboselecti32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: cmovol %edi, %esi
; FAST-NEXT: movl %esi, %eax
; FAST-NEXT: retq
;
; KNL-LABEL: ssuboselecti32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: cmovol %edi, %esi
; KNL-NEXT: movl %esi, %eax
@@ -865,21 +865,21 @@ define i32 @ssuboselecti32(i32 %v1, i32 %v2) {
define i64 @ssuboselecti64(i64 %v1, i64 %v2) {
; SDAG-LABEL: ssuboselecti64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpq %rsi, %rdi
; SDAG-NEXT: cmovoq %rdi, %rsi
; SDAG-NEXT: movq %rsi, %rax
; SDAG-NEXT: retq
;
; FAST-LABEL: ssuboselecti64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpq %rsi, %rdi
; FAST-NEXT: cmovoq %rdi, %rsi
; FAST-NEXT: movq %rsi, %rax
; FAST-NEXT: retq
;
; KNL-LABEL: ssuboselecti64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpq %rsi, %rdi
; KNL-NEXT: cmovoq %rdi, %rsi
; KNL-NEXT: movq %rsi, %rax
@@ -892,21 +892,21 @@ define i64 @ssuboselecti64(i64 %v1, i64 %v2) {
define i32 @usuboselecti32(i32 %v1, i32 %v2) {
; SDAG-LABEL: usuboselecti32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: cmovbl %edi, %esi
; SDAG-NEXT: movl %esi, %eax
; SDAG-NEXT: retq
;
; FAST-LABEL: usuboselecti32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: cmovbl %edi, %esi
; FAST-NEXT: movl %esi, %eax
; FAST-NEXT: retq
;
; KNL-LABEL: usuboselecti32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: cmovbl %edi, %esi
; KNL-NEXT: movl %esi, %eax
@@ -919,21 +919,21 @@ define i32 @usuboselecti32(i32 %v1, i32 %v2) {
define i64 @usuboselecti64(i64 %v1, i64 %v2) {
; SDAG-LABEL: usuboselecti64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpq %rsi, %rdi
; SDAG-NEXT: cmovbq %rdi, %rsi
; SDAG-NEXT: movq %rsi, %rax
; SDAG-NEXT: retq
;
; FAST-LABEL: usuboselecti64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpq %rsi, %rdi
; FAST-NEXT: cmovbq %rdi, %rsi
; FAST-NEXT: movq %rsi, %rax
; FAST-NEXT: retq
;
; KNL-LABEL: usuboselecti64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpq %rsi, %rdi
; KNL-NEXT: cmovbq %rdi, %rsi
; KNL-NEXT: movq %rsi, %rax
@@ -949,10 +949,10 @@ define i64 @usuboselecti64(i64 %v1, i64 %v2) {
;
define zeroext i1 @saddobri32(i32 %v1, i32 %v2) {
; SDAG-LABEL: saddobri32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addl %esi, %edi
; SDAG-NEXT: jo LBB31_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB31_1: ## %overflow
@@ -960,10 +960,10 @@ define zeroext i1 @saddobri32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddobri32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addl %esi, %edi
; FAST-NEXT: jo LBB31_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -975,10 +975,10 @@ define zeroext i1 @saddobri32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddobri32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addl %esi, %edi
; KNL-NEXT: jo LBB31_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB31_1: ## %overflow
@@ -998,10 +998,10 @@ continue:
define zeroext i1 @saddobri64(i64 %v1, i64 %v2) {
; SDAG-LABEL: saddobri64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq %rsi, %rdi
; SDAG-NEXT: jo LBB32_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB32_1: ## %overflow
@@ -1009,10 +1009,10 @@ define zeroext i1 @saddobri64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: saddobri64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq %rsi, %rdi
; FAST-NEXT: jo LBB32_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1024,10 +1024,10 @@ define zeroext i1 @saddobri64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: saddobri64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq %rsi, %rdi
; KNL-NEXT: jo LBB32_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB32_1: ## %overflow
@@ -1047,10 +1047,10 @@ continue:
define zeroext i1 @uaddobri32(i32 %v1, i32 %v2) {
; SDAG-LABEL: uaddobri32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addl %esi, %edi
; SDAG-NEXT: jb LBB33_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB33_1: ## %overflow
@@ -1058,10 +1058,10 @@ define zeroext i1 @uaddobri32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddobri32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addl %esi, %edi
; FAST-NEXT: jb LBB33_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1073,10 +1073,10 @@ define zeroext i1 @uaddobri32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddobri32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addl %esi, %edi
; KNL-NEXT: jb LBB33_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB33_1: ## %overflow
@@ -1096,10 +1096,10 @@ continue:
define zeroext i1 @uaddobri64(i64 %v1, i64 %v2) {
; SDAG-LABEL: uaddobri64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: addq %rsi, %rdi
; SDAG-NEXT: jb LBB34_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB34_1: ## %overflow
@@ -1107,10 +1107,10 @@ define zeroext i1 @uaddobri64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddobri64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: addq %rsi, %rdi
; FAST-NEXT: jb LBB34_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1122,10 +1122,10 @@ define zeroext i1 @uaddobri64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddobri64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: addq %rsi, %rdi
; KNL-NEXT: jb LBB34_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB34_1: ## %overflow
@@ -1145,10 +1145,10 @@ continue:
define zeroext i1 @ssubobri32(i32 %v1, i32 %v2) {
; SDAG-LABEL: ssubobri32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: jo LBB35_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB35_1: ## %overflow
@@ -1156,10 +1156,10 @@ define zeroext i1 @ssubobri32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: ssubobri32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: jo LBB35_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1171,10 +1171,10 @@ define zeroext i1 @ssubobri32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: ssubobri32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jo LBB35_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB35_1: ## %overflow
@@ -1194,10 +1194,10 @@ continue:
define zeroext i1 @ssubobri64(i64 %v1, i64 %v2) {
; SDAG-LABEL: ssubobri64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpq %rsi, %rdi
; SDAG-NEXT: jo LBB36_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB36_1: ## %overflow
@@ -1205,10 +1205,10 @@ define zeroext i1 @ssubobri64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: ssubobri64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpq %rsi, %rdi
; FAST-NEXT: jo LBB36_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1220,10 +1220,10 @@ define zeroext i1 @ssubobri64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: ssubobri64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpq %rsi, %rdi
; KNL-NEXT: jo LBB36_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB36_1: ## %overflow
@@ -1243,10 +1243,10 @@ continue:
define zeroext i1 @usubobri32(i32 %v1, i32 %v2) {
; SDAG-LABEL: usubobri32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpl %esi, %edi
; SDAG-NEXT: jb LBB37_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB37_1: ## %overflow
@@ -1254,10 +1254,10 @@ define zeroext i1 @usubobri32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: usubobri32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpl %esi, %edi
; FAST-NEXT: jb LBB37_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1269,10 +1269,10 @@ define zeroext i1 @usubobri32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: usubobri32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpl %esi, %edi
; KNL-NEXT: jb LBB37_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB37_1: ## %overflow
@@ -1292,10 +1292,10 @@ continue:
define zeroext i1 @usubobri64(i64 %v1, i64 %v2) {
; SDAG-LABEL: usubobri64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: cmpq %rsi, %rdi
; SDAG-NEXT: jb LBB38_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB38_1: ## %overflow
@@ -1303,10 +1303,10 @@ define zeroext i1 @usubobri64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: usubobri64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: cmpq %rsi, %rdi
; FAST-NEXT: jb LBB38_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -1318,10 +1318,10 @@ define zeroext i1 @usubobri64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: usubobri64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: cmpq %rsi, %rdi
; KNL-NEXT: jb LBB38_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB38_1: ## %overflow
@@ -1341,7 +1341,7 @@ continue:
define {i64, i1} @uaddoovf(i64 %a, i64 %b) {
; SDAG-LABEL: uaddoovf:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movzbl %dil, %ecx
; SDAG-NEXT: movzbl %sil, %eax
; SDAG-NEXT: addq %rcx, %rax
@@ -1349,7 +1349,7 @@ define {i64, i1} @uaddoovf(i64 %a, i64 %b) {
; SDAG-NEXT: retq
;
; FAST-LABEL: uaddoovf:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movzbl %dil, %ecx
; FAST-NEXT: movzbl %sil, %eax
; FAST-NEXT: addq %rcx, %rax
@@ -1357,7 +1357,7 @@ define {i64, i1} @uaddoovf(i64 %a, i64 %b) {
; FAST-NEXT: retq
;
; KNL-LABEL: uaddoovf:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movzbl %dil, %ecx
; KNL-NEXT: movzbl %sil, %eax
; KNL-NEXT: addq %rcx, %rax
@@ -1371,21 +1371,21 @@ define {i64, i1} @uaddoovf(i64 %a, i64 %b) {
define {i64, i1} @usuboovf(i64 %a, i64 %b) {
; SDAG-LABEL: usuboovf:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: notq %rsi
; SDAG-NEXT: xorl %edx, %edx
; SDAG-NEXT: movq %rsi, %rax
; SDAG-NEXT: retq
;
; FAST-LABEL: usuboovf:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: notq %rsi
; FAST-NEXT: xorl %edx, %edx
; FAST-NEXT: movq %rsi, %rax
; FAST-NEXT: retq
;
; KNL-LABEL: usuboovf:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: notq %rsi
; KNL-NEXT: xorl %edx, %edx
; KNL-NEXT: movq %rsi, %rax
diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll
index 939fa0404223..b60204131754 100644
--- a/test/CodeGen/X86/xchg-nofold.ll
+++ b/test/CodeGen/X86/xchg-nofold.ll
@@ -8,13 +8,13 @@
; CHECK-LABEL: _Z3fooRSt6atomicIbEb
define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind {
; CHECK-LABEL: _Z3fooRSt6atomicIbEb:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: shrq $3, %rax
; CHECK-NEXT: movb 2147450880(%rax), %al
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: je .LBB0_3
-; CHECK-NEXT: # BB#1:
+; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: movl %edi, %ecx
; CHECK-NEXT: andl $7, %ecx
; CHECK-NEXT: cmpb %al, %cl
diff --git a/test/CodeGen/X86/xmm-r64.ll b/test/CodeGen/X86/xmm-r64.ll
index 2a6b5c71aa4f..9fe5376a9d4f 100644
--- a/test/CodeGen/X86/xmm-r64.ll
+++ b/test/CodeGen/X86/xmm-r64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -mtriple=x86_64--
define <4 x i32> @test() {
%tmp1039 = call <4 x i32> @llvm.x86.sse2.psll.d( <4 x i32> zeroinitializer, <4 x i32> zeroinitializer ) ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index 03f284d87a66..3788d9c2d396 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -5,7 +5,7 @@
define {i64, i1} @t1() nounwind {
; SDAG-LABEL: t1:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl $8, %ecx
; SDAG-NEXT: movl $9, %eax
; SDAG-NEXT: mulq %rcx
@@ -13,7 +13,7 @@ define {i64, i1} @t1() nounwind {
; SDAG-NEXT: retq
;
; FAST-LABEL: t1:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl $8, %ecx
; FAST-NEXT: movl $9, %eax
; FAST-NEXT: mulq %rcx
@@ -21,7 +21,7 @@ define {i64, i1} @t1() nounwind {
; FAST-NEXT: retq
;
; KNL-LABEL: t1:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl $8, %ecx
; KNL-NEXT: movl $9, %eax
; KNL-NEXT: mulq %rcx
@@ -33,7 +33,7 @@ define {i64, i1} @t1() nounwind {
define {i64, i1} @t2() nounwind {
; SDAG-LABEL: t2:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: xorl %ecx, %ecx
; SDAG-NEXT: movl $9, %eax
; SDAG-NEXT: mulq %rcx
@@ -41,7 +41,7 @@ define {i64, i1} @t2() nounwind {
; SDAG-NEXT: retq
;
; FAST-LABEL: t2:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: xorl %ecx, %ecx
; FAST-NEXT: movl $9, %eax
; FAST-NEXT: mulq %rcx
@@ -49,7 +49,7 @@ define {i64, i1} @t2() nounwind {
; FAST-NEXT: retq
;
; KNL-LABEL: t2:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: xorl %ecx, %ecx
; KNL-NEXT: movl $9, %eax
; KNL-NEXT: mulq %rcx
@@ -61,7 +61,7 @@ define {i64, i1} @t2() nounwind {
define {i64, i1} @t3() nounwind {
; SDAG-LABEL: t3:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq $-1, %rcx
; SDAG-NEXT: movl $9, %eax
; SDAG-NEXT: mulq %rcx
@@ -69,7 +69,7 @@ define {i64, i1} @t3() nounwind {
; SDAG-NEXT: retq
;
; FAST-LABEL: t3:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq $-1, %rcx
; FAST-NEXT: movl $9, %eax
; FAST-NEXT: mulq %rcx
@@ -77,7 +77,7 @@ define {i64, i1} @t3() nounwind {
; FAST-NEXT: retq
;
; KNL-LABEL: t3:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq $-1, %rcx
; KNL-NEXT: movl $9, %eax
; KNL-NEXT: mulq %rcx
@@ -90,7 +90,7 @@ define {i64, i1} @t3() nounwind {
; SMULO
define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
; SDAG-LABEL: smuloi8:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: imulb %sil
; SDAG-NEXT: seto %cl
@@ -99,7 +99,7 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: smuloi8:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: imulb %sil
; FAST-NEXT: seto %cl
@@ -109,7 +109,7 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: smuloi8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: imulb %sil
; KNL-NEXT: seto %cl
@@ -125,14 +125,14 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
define zeroext i1 @smuloi16(i16 %v1, i16 %v2, i16* %res) {
; SDAG-LABEL: smuloi16:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: imulw %si, %di
; SDAG-NEXT: seto %al
; SDAG-NEXT: movw %di, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: smuloi16:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: imulw %si, %di
; FAST-NEXT: seto %al
; FAST-NEXT: movw %di, (%rdx)
@@ -141,7 +141,7 @@ define zeroext i1 @smuloi16(i16 %v1, i16 %v2, i16* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: smuloi16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: imulw %si, %di
; KNL-NEXT: seto %al
; KNL-NEXT: movw %di, (%rdx)
@@ -155,14 +155,14 @@ define zeroext i1 @smuloi16(i16 %v1, i16 %v2, i16* %res) {
define zeroext i1 @smuloi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-LABEL: smuloi32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: imull %esi, %edi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movl %edi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: smuloi32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: imull %esi, %edi
; FAST-NEXT: seto %al
; FAST-NEXT: movl %edi, (%rdx)
@@ -171,7 +171,7 @@ define zeroext i1 @smuloi32(i32 %v1, i32 %v2, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: smuloi32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: imull %esi, %edi
; KNL-NEXT: seto %al
; KNL-NEXT: movl %edi, (%rdx)
@@ -185,14 +185,14 @@ define zeroext i1 @smuloi32(i32 %v1, i32 %v2, i32* %res) {
define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-LABEL: smuloi64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: imulq %rsi, %rdi
; SDAG-NEXT: seto %al
; SDAG-NEXT: movq %rdi, (%rdx)
; SDAG-NEXT: retq
;
; FAST-LABEL: smuloi64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: imulq %rsi, %rdi
; FAST-NEXT: seto %al
; FAST-NEXT: movq %rdi, (%rdx)
@@ -201,7 +201,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: smuloi64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: imulq %rsi, %rdi
; KNL-NEXT: seto %al
; KNL-NEXT: movq %rdi, (%rdx)
@@ -216,7 +216,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
; UMULO
define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
; SDAG-LABEL: umuloi8:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: mulb %sil
; SDAG-NEXT: seto %cl
@@ -225,7 +225,7 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umuloi8:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: mulb %sil
; FAST-NEXT: seto %cl
@@ -235,7 +235,7 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: umuloi8:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: mulb %sil
; KNL-NEXT: seto %cl
@@ -251,7 +251,7 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
; SDAG-LABEL: umuloi16:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: mulw %si
@@ -261,7 +261,7 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umuloi16:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdx, %rcx
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: mulw %si
@@ -272,7 +272,7 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: umuloi16:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdx, %rcx
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: mulw %si
@@ -289,7 +289,7 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-LABEL: umuloi32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: mull %esi
@@ -299,7 +299,7 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umuloi32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdx, %rcx
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: mull %esi
@@ -310,7 +310,7 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: umuloi32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdx, %rcx
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: mull %esi
@@ -327,7 +327,7 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-LABEL: umuloi64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdx, %rcx
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: mulq %rsi
@@ -337,7 +337,7 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umuloi64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdx, %rcx
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: mulq %rsi
@@ -348,7 +348,7 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
; FAST-NEXT: retq
;
; KNL-LABEL: umuloi64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdx, %rcx
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: mulq %rsi
@@ -368,7 +368,7 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
;
define i32 @smuloselecti32(i32 %v1, i32 %v2) {
; SDAG-LABEL: smuloselecti32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: imull %esi, %eax
; SDAG-NEXT: cmovol %edi, %esi
@@ -376,7 +376,7 @@ define i32 @smuloselecti32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: smuloselecti32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: imull %esi, %eax
; FAST-NEXT: cmovol %edi, %esi
@@ -384,7 +384,7 @@ define i32 @smuloselecti32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: smuloselecti32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: imull %esi, %eax
; KNL-NEXT: cmovol %edi, %esi
@@ -398,7 +398,7 @@ define i32 @smuloselecti32(i32 %v1, i32 %v2) {
define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; SDAG-LABEL: smuloselecti64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: imulq %rsi, %rax
; SDAG-NEXT: cmovoq %rdi, %rsi
@@ -406,7 +406,7 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: smuloselecti64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: imulq %rsi, %rax
; FAST-NEXT: cmovoq %rdi, %rsi
@@ -414,7 +414,7 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: smuloselecti64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: imulq %rsi, %rax
; KNL-NEXT: cmovoq %rdi, %rsi
@@ -428,7 +428,7 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
define i32 @umuloselecti32(i32 %v1, i32 %v2) {
; SDAG-LABEL: umuloselecti32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: mull %esi
; SDAG-NEXT: cmovol %edi, %esi
@@ -436,7 +436,7 @@ define i32 @umuloselecti32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umuloselecti32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: mull %esi
; FAST-NEXT: cmovol %edi, %esi
@@ -444,7 +444,7 @@ define i32 @umuloselecti32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: umuloselecti32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: mull %esi
; KNL-NEXT: cmovol %edi, %esi
@@ -458,7 +458,7 @@ define i32 @umuloselecti32(i32 %v1, i32 %v2) {
define i64 @umuloselecti64(i64 %v1, i64 %v2) {
; SDAG-LABEL: umuloselecti64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: mulq %rsi
; SDAG-NEXT: cmovoq %rdi, %rsi
@@ -466,7 +466,7 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umuloselecti64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: mulq %rsi
; FAST-NEXT: cmovoq %rdi, %rsi
@@ -474,7 +474,7 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: umuloselecti64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: mulq %rsi
; KNL-NEXT: cmovoq %rdi, %rsi
@@ -491,10 +491,10 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
;
define zeroext i1 @smulobri32(i32 %v1, i32 %v2) {
; SDAG-LABEL: smulobri32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: imull %esi, %edi
; SDAG-NEXT: jo LBB15_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB15_1: ## %overflow
@@ -502,10 +502,10 @@ define zeroext i1 @smulobri32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: smulobri32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: imull %esi, %edi
; FAST-NEXT: jo LBB15_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -517,10 +517,10 @@ define zeroext i1 @smulobri32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: smulobri32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: imull %esi, %edi
; KNL-NEXT: jo LBB15_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB15_1: ## %overflow
@@ -540,10 +540,10 @@ continue:
define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; SDAG-LABEL: smulobri64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: imulq %rsi, %rdi
; SDAG-NEXT: jo LBB16_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB16_1: ## %overflow
@@ -551,10 +551,10 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: smulobri64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: imulq %rsi, %rdi
; FAST-NEXT: jo LBB16_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -566,10 +566,10 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: smulobri64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: imulq %rsi, %rdi
; KNL-NEXT: jo LBB16_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB16_1: ## %overflow
@@ -589,11 +589,11 @@ continue:
define zeroext i1 @umulobri32(i32 %v1, i32 %v2) {
; SDAG-LABEL: umulobri32:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl %edi, %eax
; SDAG-NEXT: mull %esi
; SDAG-NEXT: jo LBB17_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB17_1: ## %overflow
@@ -601,11 +601,11 @@ define zeroext i1 @umulobri32(i32 %v1, i32 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umulobri32:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl %edi, %eax
; FAST-NEXT: mull %esi
; FAST-NEXT: jo LBB17_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -617,11 +617,11 @@ define zeroext i1 @umulobri32(i32 %v1, i32 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: umulobri32:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl %edi, %eax
; KNL-NEXT: mull %esi
; KNL-NEXT: jo LBB17_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB17_1: ## %overflow
@@ -641,11 +641,11 @@ continue:
define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
; SDAG-LABEL: umulobri64:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: mulq %rsi
; SDAG-NEXT: jo LBB18_1
-; SDAG-NEXT: ## BB#2: ## %continue
+; SDAG-NEXT: ## %bb.2: ## %continue
; SDAG-NEXT: movb $1, %al
; SDAG-NEXT: retq
; SDAG-NEXT: LBB18_1: ## %overflow
@@ -653,11 +653,11 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: umulobri64:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: mulq %rsi
; FAST-NEXT: jo LBB18_1
-; FAST-NEXT: ## BB#2: ## %continue
+; FAST-NEXT: ## %bb.2: ## %continue
; FAST-NEXT: movb $1, %al
; FAST-NEXT: andb $1, %al
; FAST-NEXT: movzbl %al, %eax
@@ -669,11 +669,11 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
; FAST-NEXT: retq
;
; KNL-LABEL: umulobri64:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: mulq %rsi
; KNL-NEXT: jo LBB18_1
-; KNL-NEXT: ## BB#2: ## %continue
+; KNL-NEXT: ## %bb.2: ## %continue
; KNL-NEXT: movb $1, %al
; KNL-NEXT: retq
; KNL-NEXT: LBB18_1: ## %overflow
@@ -693,7 +693,7 @@ continue:
define i1 @bug27873(i64 %c1, i1 %c2) {
; SDAG-LABEL: bug27873:
-; SDAG: ## BB#0:
+; SDAG: ## %bb.0:
; SDAG-NEXT: movl $160, %ecx
; SDAG-NEXT: movq %rdi, %rax
; SDAG-NEXT: mulq %rcx
@@ -702,7 +702,7 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
; SDAG-NEXT: retq
;
; FAST-LABEL: bug27873:
-; FAST: ## BB#0:
+; FAST: ## %bb.0:
; FAST-NEXT: movl $160, %ecx
; FAST-NEXT: movq %rdi, %rax
; FAST-NEXT: mulq %rcx
@@ -711,7 +711,7 @@ define i1 @bug27873(i64 %c1, i1 %c2) {
; FAST-NEXT: retq
;
; KNL-LABEL: bug27873:
-; KNL: ## BB#0:
+; KNL: ## %bb.0:
; KNL-NEXT: movl $160, %ecx
; KNL-NEXT: movq %rdi, %rax
; KNL-NEXT: mulq %rcx
diff --git a/test/CodeGen/X86/xop-ifma.ll b/test/CodeGen/X86/xop-ifma.ll
index 83291095b876..594058f6c534 100644
--- a/test/CodeGen/X86/xop-ifma.ll
+++ b/test/CodeGen/X86/xop-ifma.ll
@@ -4,7 +4,7 @@
define <8 x i16> @test_mul_v8i16_add_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; XOP-LABEL: test_mul_v8i16_add_v8i16:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = mul <8 x i16> %a0, %a1
@@ -14,7 +14,7 @@ define <8 x i16> @test_mul_v8i16_add_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i1
define <16 x i16> @test_mul_v16i16_add_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; XOP-AVX1-LABEL: test_mul_v16i16_add_v16i16:
-; XOP-AVX1: # BB#0:
+; XOP-AVX1: # %bb.0:
; XOP-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; XOP-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOP-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
@@ -24,7 +24,7 @@ define <16 x i16> @test_mul_v16i16_add_v16i16(<16 x i16> %a0, <16 x i16> %a1, <1
; XOP-AVX1-NEXT: retq
;
; XOP-AVX2-LABEL: test_mul_v16i16_add_v16i16:
-; XOP-AVX2: # BB#0:
+; XOP-AVX2: # %bb.0:
; XOP-AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; XOP-AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0
; XOP-AVX2-NEXT: retq
@@ -35,7 +35,7 @@ define <16 x i16> @test_mul_v16i16_add_v16i16(<16 x i16> %a0, <16 x i16> %a1, <1
define <4 x i32> @test_mul_v4i32_add_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; XOP-LABEL: test_mul_v4i32_add_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = mul <4 x i32> %a0, %a1
@@ -45,7 +45,7 @@ define <4 x i32> @test_mul_v4i32_add_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i3
define <8 x i32> @test_mul_v8i32_add_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; XOP-AVX1-LABEL: test_mul_v8i32_add_v8i32:
-; XOP-AVX1: # BB#0:
+; XOP-AVX1: # %bb.0:
; XOP-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; XOP-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; XOP-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
@@ -55,7 +55,7 @@ define <8 x i32> @test_mul_v8i32_add_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
; XOP-AVX1-NEXT: retq
;
; XOP-AVX2-LABEL: test_mul_v8i32_add_v8i32:
-; XOP-AVX2: # BB#0:
+; XOP-AVX2: # %bb.0:
; XOP-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; XOP-AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; XOP-AVX2-NEXT: retq
@@ -66,7 +66,7 @@ define <8 x i32> @test_mul_v8i32_add_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i64> %a2) {
; XOP-AVX1-LABEL: test_mulx_v4i32_add_v4i64:
-; XOP-AVX1: # BB#0:
+; XOP-AVX1: # %bb.0:
; XOP-AVX1-NEXT: vpmovsxdq %xmm0, %xmm3
; XOP-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; XOP-AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -80,7 +80,7 @@ define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i
; XOP-AVX1-NEXT: retq
;
; XOP-AVX2-LABEL: test_mulx_v4i32_add_v4i64:
-; XOP-AVX2: # BB#0:
+; XOP-AVX2: # %bb.0:
; XOP-AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
; XOP-AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; XOP-AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
@@ -95,7 +95,7 @@ define <4 x i64> @test_mulx_v4i32_add_v4i64(<4 x i32> %a0, <4 x i32> %a1, <4 x i
define <2 x i64> @test_pmuldq_lo_v4i32_add_v2i64(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
; XOP-LABEL: test_pmuldq_lo_v4i32_add_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
@@ -105,7 +105,7 @@ define <2 x i64> @test_pmuldq_lo_v4i32_add_v2i64(<4 x i32> %a0, <4 x i32> %a1, <
define <2 x i64> @test_pmuldq_hi_v4i32_add_v2i64(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
; XOP-LABEL: test_pmuldq_hi_v4i32_add_v2i64:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
@@ -117,7 +117,7 @@ define <2 x i64> @test_pmuldq_hi_v4i32_add_v2i64(<4 x i32> %a0, <4 x i32> %a1, <
define <4 x i32> @test_pmaddwd_v8i16_add_v4i32(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
; XOP-LABEL: test_pmaddwd_v8i16_add_v4i32:
-; XOP: # BB#0:
+; XOP: # %bb.0:
; XOP-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
%1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index 5f56e2d80d73..911ab945c5d0 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -6,12 +6,12 @@
define <2 x i64> @test_mm_maccs_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maccs_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maccs_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -25,12 +25,12 @@ declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounw
define <2 x i64> @test_mm_macc_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_macc_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_macc_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -44,12 +44,12 @@ declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwi
define <2 x i64> @test_mm_maccsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maccsd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maccsd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -63,12 +63,12 @@ declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounw
define <2 x i64> @test_mm_maccd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maccd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maccd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -82,12 +82,12 @@ declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwi
define <2 x i64> @test_mm_maccs_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maccs_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maccs_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -101,12 +101,12 @@ declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounw
define <2 x i64> @test_mm_macc_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_macc_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_macc_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -120,12 +120,12 @@ declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwi
define <2 x i64> @test_mm_maccslo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maccslo_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maccslo_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -137,12 +137,12 @@ declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) noun
define <2 x i64> @test_mm_macclo_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_macclo_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_macclo_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -154,12 +154,12 @@ declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounw
define <2 x i64> @test_mm_maccshi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maccshi_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maccshi_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -171,12 +171,12 @@ declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) noun
define <2 x i64> @test_mm_macchi_epi32(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_macchi_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_macchi_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -188,12 +188,12 @@ declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounw
define <2 x i64> @test_mm_maddsd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maddsd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maddsd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -207,12 +207,12 @@ declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) noun
define <2 x i64> @test_mm_maddd_epi16(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X32-LABEL: test_mm_maddd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_maddd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -226,12 +226,12 @@ declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounw
define <2 x i64> @test_mm_haddw_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddw_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddbw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddw_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddbw %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -243,12 +243,12 @@ declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_haddd_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddd_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddbd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddd_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddbd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -260,12 +260,12 @@ declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_haddq_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddq_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddbq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddq_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddbq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -276,12 +276,12 @@ declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_haddd_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddwd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddwd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -293,12 +293,12 @@ declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_haddq_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddq_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddwq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddq_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddwq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -309,12 +309,12 @@ declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_haddq_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddq_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphadddq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddq_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphadddq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -325,12 +325,12 @@ declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_haddw_epu8(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddw_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddubw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddw_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddubw %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -342,12 +342,12 @@ declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_haddd_epu8(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddd_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddubd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddd_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddubd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -359,12 +359,12 @@ declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_haddq_epu8(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddq_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddubq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddq_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddubq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -375,12 +375,12 @@ declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_haddd_epu16(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddd_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphadduwd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddd_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphadduwd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -393,12 +393,12 @@ declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_haddq_epu16(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddq_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphadduwq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddq_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphadduwq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -409,12 +409,12 @@ declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_haddq_epu32(<2 x i64> %a0) {
; X32-LABEL: test_mm_haddq_epu32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphaddudq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_haddq_epu32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphaddudq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -425,12 +425,12 @@ declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_hsubw_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_hsubw_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphsubbw %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsubw_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphsubbw %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -442,12 +442,12 @@ declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_mm_hsubd_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_hsubd_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphsubwd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsubd_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphsubwd %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -459,12 +459,12 @@ declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_mm_hsubq_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_hsubq_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vphsubdq %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_hsubq_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vphsubdq %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -475,7 +475,7 @@ declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
define <2 x i64> @test_mm_cmov_si128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_cmov_si128:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; X32-NEXT: vpxor %xmm3, %xmm2, %xmm3
; X32-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -484,7 +484,7 @@ define <2 x i64> @test_mm_cmov_si128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2
; X32-NEXT: retl
;
; X64-LABEL: test_mm_cmov_si128:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; X64-NEXT: vpxor %xmm3, %xmm2, %xmm3
; X64-NEXT: vpand %xmm2, %xmm0, %xmm0
@@ -498,8 +498,8 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_cmov_si256:
-; X32: # BB#0:
-; X32-NEXT: vxorps %ymm3, %ymm3, %ymm3
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmptrueps %ymm3, %ymm3, %ymm3
; X32-NEXT: vxorps %ymm3, %ymm2, %ymm3
; X32-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -508,8 +508,8 @@ define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_cmov_si256:
-; X64: # BB#0:
-; X64-NEXT: vxorps %ymm3, %ymm3, %ymm3
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmptrueps %ymm3, %ymm3, %ymm3
; X64-NEXT: vxorps %ymm3, %ymm2, %ymm3
; X64-NEXT: vandps %ymm2, %ymm0, %ymm0
@@ -523,12 +523,12 @@ declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) noun
define <2 x i64> @test_mm_perm_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_perm_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_perm_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -542,12 +542,12 @@ declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
define <2 x i64> @test_mm_rot_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_rot_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotb %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rot_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotb %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -560,12 +560,12 @@ declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_rot_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_rot_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotw %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rot_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotw %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -578,12 +578,12 @@ declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_rot_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_rot_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rot_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -596,12 +596,12 @@ declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_rot_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_rot_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_rot_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1)
@@ -611,12 +611,12 @@ declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_mm_roti_epi8(<2 x i64> %a0) {
; X32-LABEL: test_mm_roti_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotb $1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_roti_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotb $1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -628,12 +628,12 @@ declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
define <2 x i64> @test_mm_roti_epi16(<2 x i64> %a0) {
; X32-LABEL: test_mm_roti_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotw $50, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_roti_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotw $50, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -645,12 +645,12 @@ declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
define <2 x i64> @test_mm_roti_epi32(<2 x i64> %a0) {
; X32-LABEL: test_mm_roti_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotd $226, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_roti_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotd $226, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -662,12 +662,12 @@ declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
define <2 x i64> @test_mm_roti_epi64(<2 x i64> %a0) {
; X32-LABEL: test_mm_roti_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vprotq $100, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_roti_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vprotq $100, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 100)
@@ -677,12 +677,12 @@ declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
define <2 x i64> @test_mm_shl_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_shl_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shl_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -695,12 +695,12 @@ declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_shl_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_shl_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shl_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -713,12 +713,12 @@ declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_shl_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_shl_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshld %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shl_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshld %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -731,12 +731,12 @@ declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_shl_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_shl_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_shl_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1)
@@ -746,12 +746,12 @@ declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_mm_sha_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sha_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshab %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sha_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshab %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -764,12 +764,12 @@ declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
define <2 x i64> @test_mm_sha_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sha_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sha_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -782,12 +782,12 @@ declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_sha_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sha_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshad %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sha_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshad %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -800,12 +800,12 @@ declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_mm_sha_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_sha_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_sha_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1)
@@ -815,12 +815,12 @@ declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
define <2 x i64> @test_mm_com_epu8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epu8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epu8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -833,12 +833,12 @@ declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readn
define <2 x i64> @test_mm_com_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epu16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epu16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -851,12 +851,12 @@ declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readn
define <2 x i64> @test_mm_com_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epu32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epu32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -869,12 +869,12 @@ declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readn
define <2 x i64> @test_mm_com_epu64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epu64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epu64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
@@ -884,12 +884,12 @@ declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readn
define <2 x i64> @test_mm_com_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epi8:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epi8:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -902,12 +902,12 @@ declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readno
define <2 x i64> @test_mm_com_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epi16:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epi16:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -920,12 +920,12 @@ declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readno
define <2 x i64> @test_mm_com_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epi32:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epi32:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <4 x i32>
@@ -938,12 +938,12 @@ declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readno
define <2 x i64> @test_mm_com_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; X32-LABEL: test_mm_com_epi64:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_com_epi64:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0)
@@ -953,12 +953,12 @@ declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readno
define <2 x double> @test_mm_permute2_pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_permute2_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_permute2_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2pd $0, %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0)
@@ -968,12 +968,12 @@ declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i
define <4 x double> @test_mm256_permute2_pd(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_permute2_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute2_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2pd $0, %ymm2, %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0)
@@ -983,12 +983,12 @@ declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_mm_permute2_ps(<4 x float> %a0, <4 x float> %a1, <2 x i64> %a2) {
; X32-LABEL: test_mm_permute2_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_permute2_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps $0, %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
%arg2 = bitcast <2 x i64> %a2 to <4 x i32>
@@ -999,12 +999,12 @@ declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>
define <8 x float> @test_mm256_permute2_ps(<8 x float> %a0, <8 x float> %a1, <4 x i64> %a2) {
; X32-LABEL: test_mm256_permute2_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_permute2_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps $0, %ymm2, %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
%arg2 = bitcast <4 x i64> %a2 to <8 x i32>
@@ -1015,12 +1015,12 @@ declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x
define <4 x float> @test_mm_frcz_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_frcz_ss:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vfrczss %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_frcz_ss:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vfrczss %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0)
@@ -1030,12 +1030,12 @@ declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
define <2 x double> @test_mm_frcz_sd(<2 x double> %a0) {
; X32-LABEL: test_mm_frcz_sd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vfrczsd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_frcz_sd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vfrczsd %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0)
@@ -1045,12 +1045,12 @@ declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
define <4 x float> @test_mm_frcz_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_frcz_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vfrczps %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_frcz_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vfrczps %xmm0, %xmm0
; X64-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0)
@@ -1060,12 +1060,12 @@ declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
define <2 x double> @test_mm_frcz_pd(<2 x double> %a0) {
; X32-LABEL: test_mm_frcz_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vfrczpd %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_frcz_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vfrczpd %xmm0, %xmm0
; X64-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0)
@@ -1075,12 +1075,12 @@ declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
define <8 x float> @test_mm256_frcz_ps(<8 x float> %a0) {
; X32-LABEL: test_mm256_frcz_ps:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vfrczps %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_frcz_ps:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vfrczps %ymm0, %ymm0
; X64-NEXT: retq
%res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0)
@@ -1090,12 +1090,12 @@ declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
define <4 x double> @test_mm256_frcz_pd(<4 x double> %a0) {
; X32-LABEL: test_mm256_frcz_pd:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vfrczpd %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm256_frcz_pd:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vfrczpd %ymm0, %ymm0
; X64-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0)
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
index 2369beffb6b0..c5493368ab11 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@@ -3,7 +3,7 @@
define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1]
@@ -11,7 +11,7 @@ define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double>
}
define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a1
@@ -20,7 +20,7 @@ define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x doubl
}
define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a2
@@ -31,7 +31,7 @@ declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x d
define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ;
@@ -39,7 +39,7 @@ define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x doub
}
define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a1
@@ -48,7 +48,7 @@ define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x d
}
define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a2
@@ -59,7 +59,7 @@ declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ;
@@ -69,7 +69,7 @@ declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x floa
define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ;
@@ -79,7 +79,7 @@ declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x
define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomeqb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomeqb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -87,7 +87,7 @@ define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) {
}
define <16 x i8> @test_int_x86_xop_vpcomeqb_mem(<16 x i8> %a0, <16 x i8>* %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomeqb_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomeqb (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a1
@@ -98,7 +98,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomeqb(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomeqw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomeqw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -108,7 +108,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomeqw(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomeqd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomeqd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -118,7 +118,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomeqd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomeqq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomeqq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -128,7 +128,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomeqq(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomequb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomequb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomequb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -138,7 +138,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomequb(<16 x i8>, <16 x i8>) nounwind readnon
define <4 x i32> @test_int_x86_xop_vpcomequd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomequd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomequd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -148,7 +148,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomequd(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_int_x86_xop_vpcomequq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomequq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomequq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -158,7 +158,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomequq(<2 x i64>, <2 x i64>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomequw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomequw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomequw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -168,7 +168,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomequw(<8 x i16>, <8 x i16>) nounwind readnon
define <16 x i8> @test_int_x86_xop_vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalseb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalseb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -178,7 +178,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomfalseb(<16 x i8>, <16 x i8>) nounwind readn
define <4 x i32> @test_int_x86_xop_vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalsed:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalsed %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -188,7 +188,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomfalsed(<4 x i32>, <4 x i32>) nounwind readn
define <2 x i64> @test_int_x86_xop_vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalseq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalseq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -198,7 +198,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomfalseq(<2 x i64>, <2 x i64>) nounwind readn
define <16 x i8> @test_int_x86_xop_vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalseub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalseub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -208,7 +208,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomfalseub(<16 x i8>, <16 x i8>) nounwind read
define <4 x i32> @test_int_x86_xop_vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalseud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalseud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -218,7 +218,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomfalseud(<4 x i32>, <4 x i32>) nounwind read
define <2 x i64> @test_int_x86_xop_vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalseuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalseuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -228,7 +228,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomfalseuq(<2 x i64>, <2 x i64>) nounwind read
define <8 x i16> @test_int_x86_xop_vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalseuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalseuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -238,7 +238,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomfalseuw(<8 x i16>, <8 x i16>) nounwind read
define <8 x i16> @test_int_x86_xop_vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomfalsew:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomfalsew %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -248,7 +248,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomfalsew(<8 x i16>, <8 x i16>) nounwind readn
define <16 x i8> @test_int_x86_xop_vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgeb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgeb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -258,7 +258,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomgeb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomged(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomged:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomged %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -268,7 +268,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomged(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgeq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgeq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -278,7 +278,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomgeq(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgeub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgeub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -288,7 +288,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomgeub(<16 x i8>, <16 x i8>) nounwind readnon
define <4 x i32> @test_int_x86_xop_vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgeud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgeud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -298,7 +298,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomgeud(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_int_x86_xop_vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgeuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgeuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -308,7 +308,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomgeuq(<2 x i64>, <2 x i64>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgeuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgeuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -318,7 +318,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomgeuw(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomgew(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgew:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgew %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -328,7 +328,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomgew(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -338,7 +338,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomgtb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -348,7 +348,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomgtd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -358,7 +358,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomgtq(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -368,7 +368,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomgtub(<16 x i8>, <16 x i8>) nounwind readnon
define <4 x i32> @test_int_x86_xop_vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -378,7 +378,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomgtud(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_int_x86_xop_vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -388,7 +388,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomgtuq(<2 x i64>, <2 x i64>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -398,7 +398,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomgtuw(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomgtw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomgtw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -408,7 +408,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomgtw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomleb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomleb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomleb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -418,7 +418,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomleb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomled(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomled:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomled %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -428,7 +428,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomled(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomleq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomleq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomleq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -438,7 +438,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomleq(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomleub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomleub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomleub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -448,7 +448,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomleub(<16 x i8>, <16 x i8>) nounwind readnon
define <4 x i32> @test_int_x86_xop_vpcomleud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomleud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomleud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -458,7 +458,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomleud(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_int_x86_xop_vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomleuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomleuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -468,7 +468,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomleuq(<2 x i64>, <2 x i64>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomleuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomleuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -478,7 +478,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomleuw(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomlew(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomlew:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomlew %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -488,7 +488,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomlew(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomltb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -498,7 +498,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomltb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomltd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -508,7 +508,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomltd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomltq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -518,7 +518,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomltq(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomltub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -528,7 +528,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomltub(<16 x i8>, <16 x i8>) nounwind readnon
define <4 x i32> @test_int_x86_xop_vpcomltud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -538,7 +538,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomltud(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_int_x86_xop_vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -548,7 +548,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomltuq(<2 x i64>, <2 x i64>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -558,7 +558,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomltuw(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomltw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomltw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -568,7 +568,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomltw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomneb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomneb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomneqb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -578,7 +578,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomneb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpcomned(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomned:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomneqd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -588,7 +588,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomned(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpcomneq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomneq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomneqq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -598,7 +598,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomneq(<2 x i64>, <2 x i64>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomneub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomneub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomnequb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -608,7 +608,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomneub(<16 x i8>, <16 x i8>) nounwind readnon
define <4 x i32> @test_int_x86_xop_vpcomneud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomneud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomnequd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -618,7 +618,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomneud(<4 x i32>, <4 x i32>) nounwind readnon
define <2 x i64> @test_int_x86_xop_vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomneuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomnequq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -628,7 +628,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomneuq(<2 x i64>, <2 x i64>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomneuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomnequw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -638,7 +638,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomneuw(<8 x i16>, <8 x i16>) nounwind readnon
define <8 x i16> @test_int_x86_xop_vpcomnew(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomnew:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomneqw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -648,7 +648,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomnew(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrueb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrueb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -658,7 +658,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomtrueb(<16 x i8>, <16 x i8>) nounwind readno
define <4 x i32> @test_int_x86_xop_vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrued:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrued %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -668,7 +668,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomtrued(<4 x i32>, <4 x i32>) nounwind readno
define <2 x i64> @test_int_x86_xop_vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrueq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrueq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -678,7 +678,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomtrueq(<2 x i64>, <2 x i64>) nounwind readno
define <16 x i8> @test_int_x86_xop_vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrueub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrueub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -688,7 +688,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomtrueub(<16 x i8>, <16 x i8>) nounwind readn
define <4 x i32> @test_int_x86_xop_vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrueud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrueud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -698,7 +698,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomtrueud(<4 x i32>, <4 x i32>) nounwind readn
define <2 x i64> @test_int_x86_xop_vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrueuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrueuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -708,7 +708,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomtrueuq(<2 x i64>, <2 x i64>) nounwind readn
define <8 x i16> @test_int_x86_xop_vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtrueuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtrueuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -718,7 +718,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomtrueuw(<8 x i16>, <8 x i16>) nounwind readn
define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomtruew:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomtruew %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -728,7 +728,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readno
define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
@@ -738,7 +738,7 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
@@ -746,7 +746,7 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a1
@@ -755,7 +755,7 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1,
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a2
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index 76286a26ffa9..d4c5420f20d0 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -3,7 +3,7 @@
define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 1) ; [#uses=1]
@@ -11,7 +11,7 @@ define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double>
}
define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a1
@@ -20,7 +20,7 @@ define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x doubl
}
define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <2 x i64>, <2 x i64>* %a2
@@ -31,7 +31,7 @@ declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i
define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 2) ;
@@ -39,7 +39,7 @@ define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x doub
}
define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a1
@@ -48,7 +48,7 @@ define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x d
}
define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x i64>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a2
@@ -59,7 +59,7 @@ declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4
define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 3) ;
@@ -69,7 +69,7 @@ declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>
define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 4) ;
@@ -79,7 +79,7 @@ declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x
define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = xor <2 x i64> %a2, <i64 -1, i64 -1>
@@ -91,7 +91,7 @@ define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64
define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -102,7 +102,7 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a1
@@ -114,7 +114,7 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1,
}
define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <4 x i64>, <4 x i64>* %a2
@@ -127,7 +127,7 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <
define <4 x i32> @test_int_x86_xop_vphaddbd(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddbd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddbd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %a0) ;
@@ -137,7 +137,7 @@ declare <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddbq(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddbq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddbq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %a0) ;
@@ -147,7 +147,7 @@ declare <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphaddbw(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddbw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddbw %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %a0) ;
@@ -157,7 +157,7 @@ declare <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphadddq(<4 x i32> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphadddq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphadddq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %a0) ;
@@ -167,7 +167,7 @@ declare <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphaddubd(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddubd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddubd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %a0) ;
@@ -177,7 +177,7 @@ declare <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddubq(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddubq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddubq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %a0) ;
@@ -187,7 +187,7 @@ declare <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphaddubw(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddubw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddubw %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %a0) ;
@@ -197,7 +197,7 @@ declare <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddudq(<4 x i32> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddudq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddudq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %a0) ;
@@ -207,7 +207,7 @@ declare <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphadduwd(<8 x i16> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphadduwd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphadduwd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %a0) ;
@@ -217,7 +217,7 @@ declare <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphadduwq(<8 x i16> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphadduwq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphadduwq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %a0) ;
@@ -227,7 +227,7 @@ declare <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphaddwd(<8 x i16> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddwd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddwd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %a0) ;
@@ -237,7 +237,7 @@ declare <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphaddwq(<8 x i16> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphaddwq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphaddwq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %a0) ;
@@ -247,7 +247,7 @@ declare <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vphsubbw(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphsubbw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphsubbw %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %a0) ;
@@ -257,7 +257,7 @@ declare <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vphsubdq(<4 x i32> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphsubdq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphsubdq %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %a0) ;
@@ -265,7 +265,7 @@ define <2 x i64> @test_int_x86_xop_vphsubdq(<4 x i32> %a0) {
}
define <2 x i64> @test_int_x86_xop_vphsubdq_mem(<4 x i32>* %a0) {
; CHECK-LABEL: test_int_x86_xop_vphsubdq_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphsubdq (%rdi), %xmm0
; CHECK-NEXT: retq
%vec = load <4 x i32>, <4 x i32>* %a0
@@ -276,7 +276,7 @@ declare <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vphsubwd(<8 x i16> %a0) {
; CHECK-LABEL: test_int_x86_xop_vphsubwd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphsubwd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %a0) ;
@@ -284,7 +284,7 @@ define <4 x i32> @test_int_x86_xop_vphsubwd(<8 x i16> %a0) {
}
define <4 x i32> @test_int_x86_xop_vphsubwd_mem(<8 x i16>* %a0) {
; CHECK-LABEL: test_int_x86_xop_vphsubwd_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vphsubwd (%rdi), %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a0
@@ -295,7 +295,7 @@ declare <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacsdd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) ;
@@ -305,7 +305,7 @@ declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwi
define <2 x i64> @test_int_x86_xop_vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacsdqh:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
@@ -315,7 +315,7 @@ declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounw
define <2 x i64> @test_int_x86_xop_vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacsdql:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
@@ -325,7 +325,7 @@ declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounw
define <4 x i32> @test_int_x86_xop_vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacssdd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) ;
@@ -335,7 +335,7 @@ declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounw
define <2 x i64> @test_int_x86_xop_vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacssdqh:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
@@ -345,7 +345,7 @@ declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) noun
define <2 x i64> @test_int_x86_xop_vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacssdql:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %a0, <4 x i32> %a1, <2 x i64> %a2) ;
@@ -355,7 +355,7 @@ declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) noun
define <4 x i32> @test_int_x86_xop_vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacsswd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
@@ -365,7 +365,7 @@ declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounw
define <8 x i16> @test_int_x86_xop_vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacssww:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) ;
@@ -375,7 +375,7 @@ declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounw
define <4 x i32> @test_int_x86_xop_vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacswd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
@@ -385,7 +385,7 @@ declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwi
define <8 x i16> @test_int_x86_xop_vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmacsww:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2) ;
@@ -395,7 +395,7 @@ declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwi
define <4 x i32> @test_int_x86_xop_vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmadcsswd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
@@ -405,7 +405,7 @@ declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) noun
define <4 x i32> @test_int_x86_xop_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmadcswd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) ;
@@ -413,7 +413,7 @@ define <4 x i32> @test_int_x86_xop_vpmadcswd(<8 x i16> %a0, <8 x i16> %a1, <4 x
}
define <4 x i32> @test_int_x86_xop_vpmadcswd_mem(<8 x i16> %a0, <8 x i16>* %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpmadcswd_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a1
@@ -424,7 +424,7 @@ declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounw
define <16 x i8> @test_int_x86_xop_vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpperm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ;
@@ -432,7 +432,7 @@ define <16 x i8> @test_int_x86_xop_vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8
}
define <16 x i8> @test_int_x86_xop_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2) {
; CHECK-LABEL: test_int_x86_xop_vpperm_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a2
@@ -441,7 +441,7 @@ define <16 x i8> @test_int_x86_xop_vpperm_rm(<16 x i8> %a0, <16 x i8> %a1, <16 x
}
define <16 x i8> @test_int_x86_xop_vpperm_mr(<16 x i8> %a0, <16 x i8>* %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_int_x86_xop_vpperm_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpperm %xmm1, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %a1
@@ -452,7 +452,7 @@ declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
define <16 x i8> @test_int_x86_xop_vprotb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vprotb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -462,7 +462,7 @@ declare <16 x i8> @llvm.x86.xop.vprotb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vprotd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vprotd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -472,7 +472,7 @@ declare <4 x i32> @llvm.x86.xop.vprotd(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vprotq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vprotq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -482,7 +482,7 @@ declare <2 x i64> @llvm.x86.xop.vprotq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vprotw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vprotw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -492,7 +492,7 @@ declare <8 x i16> @llvm.x86.xop.vprotw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vprotbi(<16 x i8> %a0) {
; CHECK-LABEL: test_int_x86_xop_vprotbi:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotb $1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %a0, i8 1) ;
@@ -502,7 +502,7 @@ declare <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8>, i8) nounwind readnone
define <4 x i32> @test_int_x86_xop_vprotdi(<4 x i32> %a0) {
; CHECK-LABEL: test_int_x86_xop_vprotdi:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotd $254, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %a0, i8 -2) ;
@@ -512,7 +512,7 @@ declare <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32>, i8) nounwind readnone
define <2 x i64> @test_int_x86_xop_vprotqi(<2 x i64> %a0) {
; CHECK-LABEL: test_int_x86_xop_vprotqi:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotq $3, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %a0, i8 3) ;
@@ -522,7 +522,7 @@ declare <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64>, i8) nounwind readnone
define <8 x i16> @test_int_x86_xop_vprotwi(<8 x i16> %a0) {
; CHECK-LABEL: test_int_x86_xop_vprotwi:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vprotw $252, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %a0, i8 -4) ;
@@ -532,7 +532,7 @@ declare <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16>, i8) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshab:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshab %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -542,7 +542,7 @@ declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpshad(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshad:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshad %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -552,7 +552,7 @@ declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpshaq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshaq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshaq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -562,7 +562,7 @@ declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpshaw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshaw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshaw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -572,7 +572,7 @@ declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpshlb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshlb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshlb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %a0, <16 x i8> %a1) ;
@@ -582,7 +582,7 @@ declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
define <4 x i32> @test_int_x86_xop_vpshld(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshld:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshld %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %a1) ;
@@ -592,7 +592,7 @@ declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone
define <2 x i64> @test_int_x86_xop_vpshlq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshlq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshlq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %a0, <2 x i64> %a1) ;
@@ -602,7 +602,7 @@ declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone
define <8 x i16> @test_int_x86_xop_vpshlw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshlw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshlw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %a0, <8 x i16> %a1) ;
@@ -610,7 +610,7 @@ define <8 x i16> @test_int_x86_xop_vpshlw(<8 x i16> %a0, <8 x i16> %a1) {
}
define <8 x i16> @test_int_x86_xop_vpshlw_rm(<8 x i16> %a0, <8 x i16>* %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshlw_rm:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshlw (%rdi), %xmm0, %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a1
@@ -619,7 +619,7 @@ define <8 x i16> @test_int_x86_xop_vpshlw_rm(<8 x i16> %a0, <8 x i16>* %a1) {
}
define <8 x i16> @test_int_x86_xop_vpshlw_mr(<8 x i16>* %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpshlw_mr:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpshlw %xmm0, (%rdi), %xmm0
; CHECK-NEXT: retq
%vec = load <8 x i16>, <8 x i16>* %a0
@@ -630,7 +630,7 @@ declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_ss:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczss %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %a0) ;
@@ -638,7 +638,7 @@ define <4 x float> @test_int_x86_xop_vfrcz_ss(<4 x float> %a0) {
}
define <4 x float> @test_int_x86_xop_vfrcz_ss_mem(float* %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_ss_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczss (%rdi), %xmm0
; CHECK-NEXT: retq
%elem = load float, float* %a0
@@ -650,7 +650,7 @@ declare <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_sd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczsd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %a0) ;
@@ -658,7 +658,7 @@ define <2 x double> @test_int_x86_xop_vfrcz_sd(<2 x double> %a0) {
}
define <2 x double> @test_int_x86_xop_vfrcz_sd_mem(double* %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_sd_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczsd (%rdi), %xmm0
; CHECK-NEXT: retq
%elem = load double, double* %a0
@@ -670,7 +670,7 @@ declare <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double>) nounwind readnone
define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_pd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczpd %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %a0) ;
@@ -678,7 +678,7 @@ define <2 x double> @test_int_x86_xop_vfrcz_pd(<2 x double> %a0) {
}
define <2 x double> @test_int_x86_xop_vfrcz_pd_mem(<2 x double>* %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczpd (%rdi), %xmm0
; CHECK-NEXT: retq
%vec = load <2 x double>, <2 x double>* %a0
@@ -689,7 +689,7 @@ declare <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double>) nounwind readnone
define <4 x double> @test_int_x86_xop_vfrcz_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczpd %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %a0) ;
@@ -697,7 +697,7 @@ define <4 x double> @test_int_x86_xop_vfrcz_pd_256(<4 x double> %a0) {
}
define <4 x double> @test_int_x86_xop_vfrcz_pd_256_mem(<4 x double>* %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_pd_256_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczpd (%rdi), %ymm0
; CHECK-NEXT: retq
%vec = load <4 x double>, <4 x double>* %a0
@@ -708,7 +708,7 @@ declare <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double>) nounwind readnone
define <4 x float> @test_int_x86_xop_vfrcz_ps(<4 x float> %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_ps:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczps %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %a0) ;
@@ -716,7 +716,7 @@ define <4 x float> @test_int_x86_xop_vfrcz_ps(<4 x float> %a0) {
}
define <4 x float> @test_int_x86_xop_vfrcz_ps_mem(<4 x float>* %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczps (%rdi), %xmm0
; CHECK-NEXT: retq
%vec = load <4 x float>, <4 x float>* %a0
@@ -727,7 +727,7 @@ declare <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float>) nounwind readnone
define <8 x float> @test_int_x86_xop_vfrcz_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_256:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczps %ymm0, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %a0) ;
@@ -735,7 +735,7 @@ define <8 x float> @test_int_x86_xop_vfrcz_ps_256(<8 x float> %a0) {
}
define <8 x float> @test_int_x86_xop_vfrcz_ps_256_mem(<8 x float>* %a0) {
; CHECK-LABEL: test_int_x86_xop_vfrcz_ps_256_mem:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vfrczps (%rdi), %ymm0
; CHECK-NEXT: retq
%vec = load <8 x float>, <8 x float>* %a0
@@ -746,7 +746,7 @@ declare <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float>) nounwind readnone
define <16 x i8> @test_int_x86_xop_vpcomb(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomb:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
@@ -756,7 +756,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readno
define <8 x i16> @test_int_x86_xop_vpcomw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
@@ -766,7 +766,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readno
define <4 x i32> @test_int_x86_xop_vpcomd(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomd:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
@@ -776,7 +776,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readno
define <2 x i64> @test_int_x86_xop_vpcomq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
@@ -786,7 +786,7 @@ declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readno
define <16 x i8> @test_int_x86_xop_vpcomub(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomub:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %a0, <16 x i8> %a1, i8 0) ;
@@ -796,7 +796,7 @@ declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readn
define <8 x i16> @test_int_x86_xop_vpcomuw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomuw:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %a0, <8 x i16> %a1, i8 0) ;
@@ -806,7 +806,7 @@ declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readn
define <4 x i32> @test_int_x86_xop_vpcomud(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomud:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltud %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %a0, <4 x i32> %a1, i8 0) ;
@@ -816,7 +816,7 @@ declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readn
define <2 x i64> @test_int_x86_xop_vpcomuq(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_int_x86_xop_vpcomuq:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %a0, <2 x i64> %a1, i8 0) ;
diff --git a/test/CodeGen/X86/xop-mask-comments.ll b/test/CodeGen/X86/xop-mask-comments.ll
index 4ba47380f89a..c8aa85c425a7 100644
--- a/test/CodeGen/X86/xop-mask-comments.ll
+++ b/test/CodeGen/X86/xop-mask-comments.ll
@@ -8,12 +8,12 @@
define <16 x i8> @vpperm_shuffle_unary(<16 x i8> %a0) {
; X32-LABEL: vpperm_shuffle_unary:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: retl
;
; X64-LABEL: vpperm_shuffle_unary:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
@@ -22,12 +22,12 @@ define <16 x i8> @vpperm_shuffle_unary(<16 x i8> %a0) {
define <16 x i8> @vpperm_shuffle_unary_undef(<16 x i8> %a0) {
; X32-LABEL: vpperm_shuffle_unary_undef:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X32-NEXT: retl
;
; X64-LABEL: vpperm_shuffle_unary_undef:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
; X64-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> undef, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
@@ -36,12 +36,12 @@ define <16 x i8> @vpperm_shuffle_unary_undef(<16 x i8> %a0) {
define <16 x i8> @vpperm_shuffle_unary_zero(<16 x i8> %a0) {
; X32-LABEL: vpperm_shuffle_unary_zero:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3],zero,xmm0[1],zero
; X32-NEXT: retl
;
; X64-LABEL: vpperm_shuffle_unary_zero:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3],zero,xmm0[1],zero
; X64-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 130, i8 17, i8 128>)
@@ -50,12 +50,12 @@ define <16 x i8> @vpperm_shuffle_unary_zero(<16 x i8> %a0) {
define <16 x i8> @vpperm_shuffle_binary(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: vpperm_shuffle_binary:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],xmm1[3],xmm0[2],xmm1[1],xmm0[0]
; X32-NEXT: retl
;
; X64-LABEL: vpperm_shuffle_binary:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],xmm1[3],xmm0[2],xmm1[1],xmm0[0]
; X64-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 19, i8 2, i8 17, i8 0>)
@@ -64,12 +64,12 @@ define <16 x i8> @vpperm_shuffle_binary(<16 x i8> %a0, <16 x i8> %a1) {
define <16 x i8> @vpperm_shuffle_binary_zero(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: vpperm_shuffle_binary_zero:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],zero,zero,zero,zero
; X32-NEXT: retl
;
; X64-LABEL: vpperm_shuffle_binary_zero:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*#+}} xmm0 = xmm1[15],xmm0[14],xmm1[13],xmm0[12],xmm1[11],xmm0[10],xmm1[9],xmm0[8],xmm1[7],xmm0[6],xmm1[5],xmm0[4],zero,zero,zero,zero
; X64-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 147, i8 130, i8 145, i8 128>)
@@ -79,12 +79,12 @@ define <16 x i8> @vpperm_shuffle_binary_zero(<16 x i8> %a0, <16 x i8> %a1) {
; we can't decode vpperm's other permute ops
define <16 x i8> @vpperm_shuffle_general(<16 x i8> %a0, <16 x i8> %a1) {
; X32-LABEL: vpperm_shuffle_general:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpperm {{\.LCPI.*}}, %xmm0, %xmm0, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: vpperm_shuffle_general:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpperm {{.*}}(%rip), %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
%1 = tail call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> <i8 31, i8 14, i8 29, i8 12, i8 27, i8 10, i8 25, i8 8, i8 23, i8 6, i8 21, i8 4, i8 179, i8 162, i8 177, i8 160>)
@@ -99,15 +99,15 @@ define <16 x i8> @vpperm_shuffle_general(<16 x i8> %a0, <16 x i8> %a1) {
; be a quicker (and smaller) alternative.
define <2 x double> @vpermil2pd_21(<2 x double> %a0, <2 x double> %a1) {
; X32-LABEL: vpermil2pd_21:
-; X32: # BB#0:
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X32: # %bb.0:
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X32-NEXT: retl
;
; X64-LABEL: vpermil2pd_21:
-; X64: # BB#0:
-; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; X64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64: # %bb.0:
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT: retq
%1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> <i64 10, i64 1>, i8 2)
ret <2 x double> %1
@@ -115,12 +115,12 @@ define <2 x double> @vpermil2pd_21(<2 x double> %a0, <2 x double> %a1) {
define <4 x double> @vpermil2pd256_0062(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: vpermil2pd256_0062:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0,0],ymm1[2],ymm0[2]
; X32-NEXT: retl
;
; X64-LABEL: vpermil2pd256_0062:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = ymm0[0,0],ymm1[2],ymm0[2]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 0, i64 4, i64 0>, i8 0)
@@ -129,12 +129,12 @@ define <4 x double> @vpermil2pd256_0062(<4 x double> %a0, <4 x double> %a1) {
define <4 x double> @vpermil2pd256_zz73(<4 x double> %a0, <4 x double> %a1) {
; X32-LABEL: vpermil2pd256_zz73:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2pd {{.*#+}} ymm0 = zero,zero,ymm1[3],ymm0[3]
; X32-NEXT: retl
;
; X64-LABEL: vpermil2pd256_zz73:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2pd {{.*#+}} ymm0 = zero,zero,ymm1[3],ymm0[3]
; X64-NEXT: retq
%1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> <i64 0, i64 0, i64 14, i64 10>, i8 3)
@@ -143,12 +143,12 @@ define <4 x double> @vpermil2pd256_zz73(<4 x double> %a0, <4 x double> %a1) {
define <4 x float> @vpermil2ps_0561(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: vpermil2ps_0561:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[1]
; X32-NEXT: retl
;
; X64-LABEL: vpermil2ps_0561:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[1]
; X64-NEXT: retq
%1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 1>, i8 0)
@@ -157,12 +157,12 @@ define <4 x float> @vpermil2ps_0561(<4 x float> %a0, <4 x float> %a1) {
define <8 x float> @vpermil2ps256_098144FE(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: vpermil2ps256_098144FE:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[1,0],ymm0[1,4,4],ymm1[7,6]
; X32-NEXT: retl
;
; X64-LABEL: vpermil2ps256_098144FE:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],ymm1[1,0],ymm0[1,4,4],ymm1[7,6]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 5, i32 4, i32 1, i32 0, i32 0, i32 7, i32 6>, i8 0)
@@ -171,12 +171,12 @@ define <8 x float> @vpermil2ps256_098144FE(<8 x float> %a0, <8 x float> %a1) {
define <8 x float> @vpermil2ps256_0zz8BzzA(<8 x float> %a0, <8 x float> %a1) {
; X32-LABEL: vpermil2ps256_0zz8BzzA:
-; X32: # BB#0:
+; X32: # %bb.0:
; X32-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],zero,zero,ymm1[0,7],zero,zero,ymm1[6]
; X32-NEXT: retl
;
; X64-LABEL: vpermil2ps256_0zz8BzzA:
-; X64: # BB#0:
+; X64: # %bb.0:
; X64-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[0],zero,zero,ymm1[0,7],zero,zero,ymm1[6]
; X64-NEXT: retq
%1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 8, i32 4, i32 7, i32 8, i32 8, i32 6>, i8 2)
diff --git a/test/CodeGen/X86/xop-pcmov.ll b/test/CodeGen/X86/xop-pcmov.ll
index 77aefe993b29..4e8abc0d4b6c 100644
--- a/test/CodeGen/X86/xop-pcmov.ll
+++ b/test/CodeGen/X86/xop-pcmov.ll
@@ -4,7 +4,7 @@
define <4 x double> @pcmov_4f64(<4 x double> %a, <4 x double> %b, <4 x double> %m) {
; CHECK-LABEL: pcmov_4f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = bitcast <4 x double> %m to <4 x i64>
@@ -20,7 +20,7 @@ define <4 x double> @pcmov_4f64(<4 x double> %a, <4 x double> %b, <4 x double> %
define <2 x double> @pcmov_2f64(<2 x double> %a, <2 x double> %b, <2 x double> %m) {
; CHECK-LABEL: pcmov_2f64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = bitcast <2 x double> %m to <2 x i64>
@@ -36,7 +36,7 @@ define <2 x double> @pcmov_2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
define <8 x float> @pcmov_8f32(<8 x float> %a, <8 x float> %b, <8 x float> %m) {
; CHECK-LABEL: pcmov_8f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = bitcast <8 x float> %m to <8 x i32>
@@ -52,7 +52,7 @@ define <8 x float> @pcmov_8f32(<8 x float> %a, <8 x float> %b, <8 x float> %m) {
define <4 x float> @pcmov_4f32(<4 x float> %a, <4 x float> %b, <4 x float> %m) {
; CHECK-LABEL: pcmov_4f32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = bitcast <4 x float> %m to <4 x i32>
@@ -68,7 +68,7 @@ define <4 x float> @pcmov_4f32(<4 x float> %a, <4 x float> %b, <4 x float> %m) {
define <4 x i64> @pcmov_4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %m) {
; CHECK-LABEL: pcmov_4i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = and <4 x i64> %a, %m
@@ -80,7 +80,7 @@ define <4 x i64> @pcmov_4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %m) {
define <2 x i64> @pcmov_2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %m) {
; CHECK-LABEL: pcmov_2i64:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = and <2 x i64> %a, %m
@@ -92,7 +92,7 @@ define <2 x i64> @pcmov_2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %m) {
define <8 x i32> @pcmov_8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %m) {
; CHECK-LABEL: pcmov_8i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = and <8 x i32> %a, %m
@@ -104,7 +104,7 @@ define <8 x i32> @pcmov_8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %m) {
define <4 x i32> @pcmov_4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %m) {
; CHECK-LABEL: pcmov_4i32:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = and <4 x i32> %a, %m
@@ -116,7 +116,7 @@ define <4 x i32> @pcmov_4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %m) {
define <16 x i16> @pcmov_16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %m) {
; CHECK-LABEL: pcmov_16i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = and <16 x i16> %a, %m
@@ -128,7 +128,7 @@ define <16 x i16> @pcmov_16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %m) {
define <8 x i16> @pcmov_8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %m) {
; CHECK-LABEL: pcmov_8i16:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = and <8 x i16> %a, %m
@@ -140,7 +140,7 @@ define <8 x i16> @pcmov_8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %m) {
define <32 x i8> @pcmov_32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %m) {
; CHECK-LABEL: pcmov_32i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%1 = and <32 x i8> %a, %m
@@ -152,7 +152,7 @@ define <32 x i8> @pcmov_32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %m) {
define <16 x i8> @pcmov_16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %m) {
; CHECK-LABEL: pcmov_16i8:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-NEXT: retq
%1 = and <16 x i8> %a, %m
diff --git a/test/CodeGen/X86/xop-schedule.ll b/test/CodeGen/X86/xop-schedule.ll
new file mode 100644
index 000000000000..cd2239b24051
--- /dev/null
+++ b/test/CodeGen/X86/xop-schedule.ll
@@ -0,0 +1,998 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver1 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver3 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver4 | FileCheck %s --check-prefix=BDVER --check-prefix=BDVER4
+
+define void @test_vfrczpd(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfrczpd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfrczpd %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vfrczpd %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vfrczpd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vfrczpd (%rsi), %ymm1 # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfrczpd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfrczpd %xmm0, %xmm0
+; BDVER-NEXT: vfrczpd %ymm1, %ymm1
+; BDVER-NEXT: vfrczpd (%rdi), %xmm0
+; BDVER-NEXT: vfrczpd (%rsi), %ymm1
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ call void asm sideeffect "vfrczpd $0, $0 \0a\09 vfrczpd $1, $1 \0a\09 vfrczpd $2, $0 \0a\09 vfrczpd $3, $1", "x,x,*m,*m"(<2 x double> %a0, <4 x double> %a1, <2 x double> *%a2, <4 x double> *%a3)
+ ret void
+}
+
+define void @test_vfrczps(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vfrczps:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfrczps %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vfrczps %ymm1, %ymm1 # sched: [3:1.00]
+; GENERIC-NEXT: vfrczps (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vfrczps (%rsi), %ymm1 # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfrczps:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfrczps %xmm0, %xmm0
+; BDVER-NEXT: vfrczps %ymm1, %ymm1
+; BDVER-NEXT: vfrczps (%rdi), %xmm0
+; BDVER-NEXT: vfrczps (%rsi), %ymm1
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ call void asm sideeffect "vfrczps $0, $0 \0a\09 vfrczps $1, $1 \0a\09 vfrczps $2, $0 \0a\09 vfrczps $3, $1", "x,x,*m,*m"(<4 x float> %a0, <4 x double> %a1, <4 x float> *%a2, <4 x double> *%a3)
+ ret void
+}
+
+define void @test_vfrczsd(<2 x double> %a0, <2 x double> *%a1) {
+; GENERIC-LABEL: test_vfrczsd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfrczsd %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vfrczsd (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfrczsd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfrczsd %xmm0, %xmm0
+; BDVER-NEXT: vfrczsd (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vfrczsd $0, $0 \0a\09 vfrczsd $1, $0", "x,*m"(<2 x double> %a0, <2 x double> *%a1)
+ ret void
+}
+
+define void @test_vfrczss(<4 x float> %a0, <4 x double> *%a1) {
+; GENERIC-LABEL: test_vfrczss:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vfrczss %xmm0, %xmm0 # sched: [3:1.00]
+; GENERIC-NEXT: vfrczss (%rdi), %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vfrczss:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vfrczss %xmm0, %xmm0
+; BDVER-NEXT: vfrczss (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vfrczss $0, $0 \0a\09 vfrczss $1, $0", "x,*m"(<4 x float> %a0, <4 x double> *%a1)
+ ret void
+}
+
+define void @test_vpcmov_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpcmov_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpcmov (%rdi), %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpcmov %xmm2, (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpcmov_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcmov (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcmov %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpcmov $2, $1, $0, $0 \0a\09 vpcmov $3, $1, $0, $0 \0a\09 vpcmov $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i64> *%a3) {
+; GENERIC-LABEL: test_vpcmov_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpcmov %ymm2, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpcmov_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vpcmov %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpcmov $2, $1, $0, $0 \0a\09 vpcmov $3, $1, $0, $0 \0a\09 vpcmov $2, $3, $0, $0", "x,x,x,*m"(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2, <4 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpcom(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_vpcom:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpcomb $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomd $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomq $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomw $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomb $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomd $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomq $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomw $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpcom:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpcomb $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomd $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomq $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomw $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomb $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpcomd $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpcomq $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpcomw $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpcomb $3, $1, $0, $0 \0a\09 vpcomd $3, $1, $0, $0 \0a\09 vpcomq $3, $1, $0, $0 \0a\09 vpcomw $3, $1, $0, $0 \0a\09 vpcomb $3, $2, $0, $0 \0a\09 vpcomd $3, $2, $0, $0 \0a\09 vpcomq $3, $2, $0, $0 \0a\09 vpcomw $3, $2, $0, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 3)
+ ret void
+}
+
+define void @test_vpcomu(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_vpcomu:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpcomub $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomud $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomuq $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomuw $3, %xmm1, %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomub $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomud $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomuq $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: vpcomuw $3, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpcomu:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpcomub $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomud $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomuq $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomuw $3, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpcomub $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpcomud $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpcomuq $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpcomuw $3, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpcomub $3, $1, $0, $0 \0a\09 vpcomud $3, $1, $0, $0 \0a\09 vpcomuq $3, $1, $0, $0 \0a\09 vpcomuw $3, $1, $0, $0 \0a\09 vpcomub $3, $2, $0, $0 \0a\09 vpcomud $3, $2, $0, $0 \0a\09 vpcomuq $3, $2, $0, $0 \0a\09 vpcomuw $3, $2, $0, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 3)
+ ret void
+}
+
+define void @test_vpermil2pd_128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3) {
+; GENERIC-LABEL: test_vpermil2pd_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpermil2pd_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpermil2pd $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpermil2pd $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpermil2pd $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpermil2pd $4, $2, $1, $0, $0 \0a\09 vpermil2pd $4, $2, $3, $0, $0 \0a\09 vpermil2pd $4, $3, $1, $0, $0", "x,x,x,*m,i"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> *%a3, i8 3)
+ ret void
+}
+
+define void @test_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3) {
+; GENERIC-LABEL: test_vpermil2pd_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpermil2pd_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpermil2pd $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vpermil2pd $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: vpermil2pd $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpermil2pd $4, $2, $1, $0, $0 \0a\09 vpermil2pd $4, $2, $3, $0, $0 \0a\09 vpermil2pd $4, $3, $1, $0, $0", "x,x,x,*m,i"(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, <4 x double> *%a3, i8 3)
+ ret void
+}
+
+define void @test_vpermil2ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3) {
+; GENERIC-LABEL: test_vpermil2ps_128:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpermil2ps_128:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpermil2ps $3, %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpermil2ps $3, (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpermil2ps $4, $2, $1, $0, $0 \0a\09 vpermil2ps $4, $2, $3, $0, $0 \0a\09 vpermil2ps $4, $3, $1, $0, $0", "x,x,x,*m,i"(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> *%a3, i8 3)
+ ret void
+}
+
+define void @test_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3) {
+; GENERIC-LABEL: test_vpermil2ps_256:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: vzeroupper # sched: [100:0.33]
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpermil2ps_256:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpermil2ps $3, %ymm2, %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: vpermil2ps $3, %ymm2, (%rdi), %ymm0, %ymm0
+; BDVER-NEXT: vpermil2ps $3, (%rdi), %ymm1, %ymm0, %ymm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: vzeroupper
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpermil2ps $4, $2, $1, $0, $0 \0a\09 vpermil2ps $4, $2, $3, $0, $0 \0a\09 vpermil2ps $4, $3, $1, $0, $0", "x,x,x,*m,i"(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, <8 x float> *%a3, i8 3)
+ ret void
+}
+
+define void @test_vphaddbd(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddbd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddbd %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddbd (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddbd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddbd %xmm0, %xmm0
+; BDVER-NEXT: vphaddbd (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddbd $0, $0 \0a\09 vphaddbd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddbq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddbq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddbq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddbq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddbq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddbq %xmm0, %xmm0
+; BDVER-NEXT: vphaddbq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddbq $0, $0 \0a\09 vphaddbq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddbw(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddbw %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddbw (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddbw:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddbw %xmm0, %xmm0
+; BDVER-NEXT: vphaddbw (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddbw $0, $0 \0a\09 vphaddbw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphadddq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphadddq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphadddq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphadddq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphadddq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphadddq %xmm0, %xmm0
+; BDVER-NEXT: vphadddq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphadddq $0, $0 \0a\09 vphadddq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddubd(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddubd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddubd %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddubd (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddubd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddubd %xmm0, %xmm0
+; BDVER-NEXT: vphaddubd (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddubd $0, $0 \0a\09 vphaddubd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddubq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddubq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddubq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddubq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddubq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddubq %xmm0, %xmm0
+; BDVER-NEXT: vphaddubq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddubq $0, $0 \0a\09 vphaddubq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddubw(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddubw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddubw %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddubw (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddubw:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddubw %xmm0, %xmm0
+; BDVER-NEXT: vphaddubw (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddubw $0, $0 \0a\09 vphaddubw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddudq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddudq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddudq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddudq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddudq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddudq %xmm0, %xmm0
+; BDVER-NEXT: vphaddudq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddudq $0, $0 \0a\09 vphaddudq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphadduwd(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphadduwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphadduwd %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphadduwd (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphadduwd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphadduwd %xmm0, %xmm0
+; BDVER-NEXT: vphadduwd (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphadduwd $0, $0 \0a\09 vphadduwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphadduwq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphadduwq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphadduwq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphadduwq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphadduwq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphadduwq %xmm0, %xmm0
+; BDVER-NEXT: vphadduwq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphadduwq $0, $0 \0a\09 vphadduwq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddwd(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddwd %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddwd (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddwd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddwd %xmm0, %xmm0
+; BDVER-NEXT: vphaddwd (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddwd $0, $0 \0a\09 vphaddwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphaddwq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphaddwq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphaddwq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphaddwq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphaddwq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphaddwq %xmm0, %xmm0
+; BDVER-NEXT: vphaddwq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphaddwq $0, $0 \0a\09 vphaddwq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphsubbw(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphsubbw:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphsubbw %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphsubbw (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphsubbw:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphsubbw %xmm0, %xmm0
+; BDVER-NEXT: vphsubbw (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphsubbw $0, $0 \0a\09 vphsubbw $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphsubdq(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphsubdq:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphsubdq %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphsubdq (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphsubdq:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphsubdq %xmm0, %xmm0
+; BDVER-NEXT: vphsubdq (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphsubdq $0, $0 \0a\09 vphsubdq $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vphsubwd(<2 x i64> %a0, <2 x i64> *%a1) {
+; GENERIC-LABEL: test_vphsubwd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vphsubwd %xmm0, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT: vphsubwd (%rdi), %xmm0 # sched: [5:0.50]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vphsubwd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vphsubwd %xmm0, %xmm0
+; BDVER-NEXT: vphsubwd (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vphsubwd $0, $0 \0a\09 vphsubwd $1, $0", "x,*m"(<2 x i64> %a0, <2 x i64> *%a1)
+ ret void
+}
+
+define void @test_vpmacsdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacsdd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacsdd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacsdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacsdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacsdd $2, $1, $0, $0 \0a\09 vpmacsdd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacsdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacsdqh:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacsdqh:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacsdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacsdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacsdqh $2, $1, $0, $0 \0a\09 vpmacsdqh $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacsdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacsdql:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacsdql:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacsdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacsdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacsdql $2, $1, $0, $0 \0a\09 vpmacsdql $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacssdd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacssdd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacssdd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacssdd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacssdd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacssdd $2, $1, $0, $0 \0a\09 vpmacssdd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacssdqh(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacssdqh:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacssdqh:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacssdqh %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacssdqh %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacssdqh $2, $1, $0, $0 \0a\09 vpmacssdqh $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacssdql(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacssdql:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacssdql:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacssdql %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacssdql %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacssdql $2, $1, $0, $0 \0a\09 vpmacssdql $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacsswd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacsswd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacsswd $2, $1, $0, $0 \0a\09 vpmacsswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacssww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacssww:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacssww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacssww:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacssww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacssww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacssww $2, $1, $0, $0 \0a\09 vpmacssww $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacswd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacswd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacswd $2, $1, $0, $0 \0a\09 vpmacswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmacsww(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmacsww:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmacsww %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmacsww:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmacsww %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmacsww %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmacsww $2, $1, $0, $0 \0a\09 vpmacsww $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmadcsswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmadcsswd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmadcsswd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmadcsswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmadcsswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmadcsswd $2, $1, $0, $0 \0a\09 vpmadcsswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpmadcswd(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpmadcswd:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpmadcswd:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpmadcswd %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpmadcswd %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpmadcswd $2, $1, $0, $0 \0a\09 vpmadcswd $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vpperm(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3) {
+; GENERIC-LABEL: test_vpperm:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpperm %xmm2, (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpperm:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpperm %xmm2, %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpperm (%rdi), %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpperm %xmm2, (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpperm $2, $1, $0, $0 \0A\09 vpperm $3, $1, $0, $0 \0A\09 vpperm $2, $3, $0, $0", "x,x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> *%a3)
+ ret void
+}
+
+define void @test_vprot(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_vprot:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vprotb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotd (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotb %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotd %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotq %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotw %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotb $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotd $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotq $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vprotb $7, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotd $7, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotq $7, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vprotw $7, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vprot:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vprotb %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vprotd %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vprotq %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vprotw %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vprotb (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vprotd (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vprotq (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vprotw (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vprotb %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vprotd %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vprotq %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vprotw %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vprotb $7, %xmm0, %xmm0
+; BDVER-NEXT: vprotd $7, %xmm0, %xmm0
+; BDVER-NEXT: vprotq $7, %xmm0, %xmm0
+; BDVER-NEXT: vprotw $7, %xmm0, %xmm0
+; BDVER-NEXT: vprotb $7, (%rdi), %xmm0
+; BDVER-NEXT: vprotd $7, (%rdi), %xmm0
+; BDVER-NEXT: vprotq $7, (%rdi), %xmm0
+; BDVER-NEXT: vprotw $7, (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vprotb $1, $0, $0 \0A\09 vprotd $1, $0, $0 \0A\09 vprotq $1, $0, $0 \0A\09 vprotw $1, $0, $0 \0A\09 vprotb $2, $0, $0 \0A\09 vprotd $2, $0, $0 \0A\09 vprotq $2, $0, $0 \0A\09 vprotw $2, $0, $0 \0A\09 vprotb $0, $2, $0 \0A\09 vprotd $0, $2, $0 \0A\09 vprotq $0, $2, $0 \0A\09 vprotw $0, $2, $0 \0A\09 vprotb $3, $0, $0 \0A\09 vprotd $3, $0, $0 \0A\09 vprotq $3, $0, $0 \0A\09 vprotw $3, $0, $0 \0A\09 vprotb $3, $2, $0 \0A\09 vprotd $3, $2, $0 \0A\09 vprotq $3, $2, $0 \0A\09 vprotw $3, $2, $0", "x,x,*m,i"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2, i8 7)
+ ret void
+}
+
+define void @test_vpsha(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_vpsha:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpshab %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshad %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshaq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshaw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshab (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshad (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshaq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshaw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshab %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshad %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshaq %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshaw %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpsha:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpshab %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshad %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshaq %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshaw %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshab (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshad (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshaq (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshaw (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshab %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vpshad %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vpshaq %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vpshaw %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpshab $1, $0, $0 \0A\09 vpshad $1, $0, $0 \0A\09 vpshaq $1, $0, $0 \0A\09 vpshaw $1, $0, $0 \0A\09 vpshab $2, $0, $0 \0A\09 vpshad $2, $0, $0 \0A\09 vpshaq $2, $0, $0 \0A\09 vpshaw $2, $0, $0 \0A\09 vpshab $0, $2, $0 \0A\09 vpshad $0, $2, $0 \0A\09 vpshaq $0, $2, $0 \0A\09 vpshaw $0, $2, $0", "x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
+ ret void
+}
+
+define void @test_vpshl(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
+; GENERIC-LABEL: test_vpshl:
+; GENERIC: # %bb.0:
+; GENERIC-NEXT: #APP
+; GENERIC-NEXT: vpshlb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshld %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshlq %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshlw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT: vpshlb (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshld (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshlb %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshld %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshlq %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: vpshlw %xmm0, (%rdi), %xmm0 # sched: [5:1.00]
+; GENERIC-NEXT: #NO_APP
+; GENERIC-NEXT: retq # sched: [1:1.00]
+;
+; BDVER-LABEL: test_vpshl:
+; BDVER: # %bb.0:
+; BDVER-NEXT: #APP
+; BDVER-NEXT: vpshlb %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshld %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshlq %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshlw %xmm1, %xmm0, %xmm0
+; BDVER-NEXT: vpshlb (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshld (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshlq (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshlw (%rdi), %xmm0, %xmm0
+; BDVER-NEXT: vpshlb %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vpshld %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vpshlq %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: vpshlw %xmm0, (%rdi), %xmm0
+; BDVER-NEXT: #NO_APP
+; BDVER-NEXT: retq
+ call void asm sideeffect "vpshlb $1, $0, $0 \0A\09 vpshld $1, $0, $0 \0A\09 vpshlq $1, $0, $0 \0A\09 vpshlw $1, $0, $0 \0A\09 vpshlb $2, $0, $0 \0A\09 vpshld $2, $0, $0 \0A\09 vpshlq $2, $0, $0 \0A\09 vpshlw $2, $0, $0 \0A\09 vpshlb $0, $2, $0 \0A\09 vpshld $0, $2, $0 \0A\09 vpshlq $0, $2, $0 \0A\09 vpshlw $0, $2, $0", "x,x,*m"(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2)
+ ret void
+}
diff --git a/test/CodeGen/X86/xor-combine-debugloc.ll b/test/CodeGen/X86/xor-combine-debugloc.ll
index 21777c1c572f..4491d1434e25 100644
--- a/test/CodeGen/X86/xor-combine-debugloc.ll
+++ b/test/CodeGen/X86/xor-combine-debugloc.ll
@@ -4,11 +4,11 @@
; that implicitly defines %eflags has the same debug location as the icmp
; instruction, and the branch instructions have the same debug location as the
; br instruction.
-;
+;
; CHECK: [[DLOC1:![0-9]+]] = !DILocation(line: 5, column: 9, scope: !{{[0-9]+}})
; CHECK: [[DLOC2:![0-9]+]] = !DILocation(line: 5, column: 7, scope: !{{[0-9]+}})
-; CHECK-DAG: [[VREG1:%[^ ]+]] = COPY %esi
-; CHECK-DAG: [[VREG2:%[^ ]+]] = COPY %edi
+; CHECK-DAG: [[VREG1:%[^ ]+]]:gr32 = COPY %esi
+; CHECK-DAG: [[VREG2:%[^ ]+]]:gr32 = COPY %edi
; CHECK: SUB32rr [[VREG2]], [[VREG1]], implicit-def %eflags, debug-location [[DLOC1]]
; CHECK-NEXT: JE_1{{.*}} implicit %eflags, debug-location [[DLOC2]]
; CHECK-NEXT: JMP_1{{.*}} debug-location [[DLOC2]]
@@ -36,8 +36,8 @@ return: ; preds = %if.else, %if.then
ret i32 %retval.0, !dbg !21
}
-declare i32 @bar(...)
-declare i32 @baz(...)
+declare i32 @bar(...)
+declare i32 @baz(...)
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
diff --git a/test/CodeGen/X86/xor-icmp.ll b/test/CodeGen/X86/xor-icmp.ll
index cd58dd1e7604..6cdc3186cd4c 100644
--- a/test/CodeGen/X86/xor-icmp.ll
+++ b/test/CodeGen/X86/xor-icmp.ll
@@ -5,27 +5,25 @@
define i32 @t(i32 %a, i32 %b) nounwind ssp {
; X32-LABEL: t:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: movb {{[0-9]+}}(%esp), %al
; X32-NEXT: xorb {{[0-9]+}}(%esp), %al
; X32-NEXT: testb $64, %al
; X32-NEXT: je .LBB0_1
-; X32-NEXT: # BB#2: # %bb1
+; X32-NEXT: # %bb.2: # %bb1
; X32-NEXT: jmp bar # TAILCALL
; X32-NEXT: .LBB0_1: # %bb
; X32-NEXT: jmp foo # TAILCALL
;
; X64-LABEL: t:
-; X64: # BB#0: # %entry
-; X64-NEXT: movl %edi, %eax
-; X64-NEXT: xorl %esi, %eax
-; X64-NEXT: testb $64, %ah
-; X64-NEXT: je .LBB0_1
-; X64-NEXT: # BB#2: # %bb1
+; X64: # %bb.0: # %entry
+; X64-NEXT: xorl %esi, %edi
; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl $14, %edi
+; X64-NEXT: jae .LBB0_1
+; X64-NEXT: # %bb.2: # %bb1
; X64-NEXT: jmp bar # TAILCALL
; X64-NEXT: .LBB0_1: # %bb
-; X64-NEXT: xorl %eax, %eax
; X64-NEXT: jmp foo # TAILCALL
entry:
%0 = and i32 %a, 16384
@@ -50,27 +48,27 @@ declare i32 @bar(...)
define i32 @t2(i32 %x, i32 %y) nounwind ssp {
; X32-LABEL: t2:
-; X32: # BB#0: # %entry
+; X32: # %bb.0: # %entry
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: sete %al
; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
; X32-NEXT: sete %cl
; X32-NEXT: cmpb %al, %cl
; X32-NEXT: je .LBB1_1
-; X32-NEXT: # BB#2: # %bb
+; X32-NEXT: # %bb.2: # %bb
; X32-NEXT: jmp foo # TAILCALL
; X32-NEXT: .LBB1_1: # %return
; X32-NEXT: retl
;
; X64-LABEL: t2:
-; X64: # BB#0: # %entry
+; X64: # %bb.0: # %entry
; X64-NEXT: testl %edi, %edi
; X64-NEXT: sete %al
; X64-NEXT: testl %esi, %esi
; X64-NEXT: sete %cl
; X64-NEXT: cmpb %al, %cl
; X64-NEXT: je .LBB1_1
-; X64-NEXT: # BB#2: # %bb
+; X64-NEXT: # %bb.2: # %bb
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: jmp foo # TAILCALL
; X64-NEXT: .LBB1_1: # %return
diff --git a/test/CodeGen/X86/xor-select-i1-combine.ll b/test/CodeGen/X86/xor-select-i1-combine.ll
index c9383282a0cc..8ba7f7d931d4 100644
--- a/test/CodeGen/X86/xor-select-i1-combine.ll
+++ b/test/CodeGen/X86/xor-select-i1-combine.ll
@@ -6,7 +6,7 @@
define i32 @main(i8 %small) {
; CHECK-LABEL: main:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: movl $m, %eax
; CHECK-NEXT: movl $n, %ecx
@@ -24,7 +24,7 @@ entry:
define i32 @main2(i8 %small) {
; CHECK-LABEL: main2:
-; CHECK: # BB#0: # %entry
+; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movl $m, %eax
; CHECK-NEXT: movl $n, %ecx
; CHECK-NEXT: testb $1, %dil
diff --git a/test/CodeGen/X86/xray-attribute-instrumentation.ll b/test/CodeGen/X86/xray-attribute-instrumentation.ll
index 00b45f8d95b7..7fa2c0e11805 100644
--- a/test/CodeGen/X86/xray-attribute-instrumentation.ll
+++ b/test/CodeGen/X86/xray-attribute-instrumentation.ll
@@ -1,27 +1,25 @@
; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu \
+; RUN: -relocation-model=pic < %s | FileCheck %s
; RUN: llc -filetype=asm -o - -mtriple=x86_64-darwin-unknown < %s | FileCheck %s
define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_0:
-; CHECK-NEXT: .ascii "\353\t"
+; CHECK: .ascii "\353\t"
; CHECK-NEXT: nopw 512(%rax,%rax)
-; CHECK-LABEL: Ltmp0:
ret i32 0
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_1:
-; CHECK-NEXT: retq
+; CHECK: retq
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
}
-; CHECK: .p2align 4, 0x90
-; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_0
-; CHECK-NEXT: .section {{.*}}xray_instr_map
+; CHECK-LABEL: xray_instr_map
; CHECK-LABEL: Lxray_sleds_start0:
; CHECK: .quad {{.*}}xray_sled_0
; CHECK: .quad {{.*}}xray_sled_1
; CHECK-LABEL: Lxray_sleds_end0:
-; CHECK: .section {{.*}}xray_fn_idx
-; CHECK-LABEL: Lxray_fn_idx_synth_0:
+; CHECK-LABEL: xray_fn_idx
; CHECK: .quad {{.*}}xray_sleds_start0
; CHECK-NEXT: .quad {{.*}}xray_sleds_end0
@@ -31,9 +29,8 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
define i32 @bar(i32 %i) nounwind noinline uwtable "function-instrument"="xray-always" {
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_2:
-; CHECK-NEXT: .ascii "\353\t"
+; CHECK: .ascii "\353\t"
; CHECK-NEXT: nopw 512(%rax,%rax)
-; CHECK-LABEL: Ltmp1:
Test:
%cond = icmp eq i32 %i, 0
br i1 %cond, label %IsEqual, label %NotEqual
@@ -41,24 +38,21 @@ IsEqual:
ret i32 0
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_3:
-; CHECK-NEXT: retq
+; CHECK: retq
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
NotEqual:
ret i32 1
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_4:
-; CHECK-NEXT: retq
+; CHECK: retq
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
}
-; CHECK: .p2align 4, 0x90
-; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_1
-; CHECK-NEXT: .section {{.*}}xray_instr_map
+; CHECK-LABEL: xray_instr_map
; CHECK-LABEL: Lxray_sleds_start1:
; CHECK: .quad {{.*}}xray_sled_2
; CHECK: .quad {{.*}}xray_sled_3
; CHECK: .quad {{.*}}xray_sled_4
; CHECK-LABEL: Lxray_sleds_end1:
-; CHECK: .section {{.*}}xray_fn_idx
-; CHECK-LABEL: Lxray_fn_idx_synth_1:
+; CHECK-LABEL: xray_fn_idx
; CHECK: .quad {{.*}}xray_sleds_start1
; CHECK-NEXT: .quad {{.*}}xray_sleds_end1
diff --git a/test/CodeGen/X86/xray-custom-log.ll b/test/CodeGen/X86/xray-custom-log.ll
index 69fd0f3e9f77..7abf7e29c62c 100644
--- a/test/CodeGen/X86/xray-custom-log.ll
+++ b/test/CodeGen/X86/xray-custom-log.ll
@@ -1,4 +1,6 @@
; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu \
+; RUN: -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC
define i32 @fn() nounwind noinline uwtable "function-instrument"="xray-always" {
%eventptr = alloca i8
@@ -7,16 +9,27 @@ define i32 @fn() nounwind noinline uwtable "function-instrument"="xray-always" {
%val = load i32, i32* %eventsize
call void @llvm.xray.customevent(i8* %eventptr, i32 %val)
; CHECK-LABEL: Lxray_event_sled_0:
- ; CHECK-NEXT: .ascii "\353\024
- ; CHECK-NEXT: pushq %rax
+ ; CHECK: .byte 0xeb, 0x0f
+ ; CHECK-NEXT: pushq %rdi
; CHECK-NEXT: movq {{.*}}, %rdi
+ ; CHECK-NEXT: pushq %rsi
; CHECK-NEXT: movq {{.*}}, %rsi
- ; CHECK-NEXT: movabsq $__xray_CustomEvent, %rax
- ; CHECK-NEXT: callq *%rax
- ; CHECK-NEXT: popq %rax
+ ; CHECK-NEXT: callq __xray_CustomEvent
+ ; CHECK-NEXT: popq %rsi
+ ; CHECK-NEXT: popq %rdi
+
+ ; PIC-LABEL: Lxray_event_sled_0:
+ ; PIC: .byte 0xeb, 0x0f
+ ; PIC-NEXT: pushq %rdi
+ ; PIC-NEXT: movq {{.*}}, %rdi
+ ; PIC-NEXT: pushq %rsi
+ ; PIC-NEXT: movq {{.*}}, %rsi
+ ; PIC-NEXT: callq __xray_CustomEvent@PLT
+ ; PIC-NEXT: popq %rsi
+ ; PIC-NEXT: popq %rdi
ret i32 0
}
-; CHECK: .section {{.*}}xray_instr_map
+; CHECK-LABEL: xray_instr_map
; CHECK-LABEL: Lxray_sleds_start0:
; CHECK: .quad {{.*}}xray_event_sled_0
diff --git a/test/CodeGen/X86/xray-log-args.ll b/test/CodeGen/X86/xray-log-args.ll
index 6fe450ac8ad3..09d2b10f3d71 100644
--- a/test/CodeGen/X86/xray-log-args.ll
+++ b/test/CodeGen/X86/xray-log-args.ll
@@ -7,29 +7,33 @@ define i32 @callee(i32 %arg) nounwind noinline uwtable "function-instrument"="xr
ret i32 %arg
}
; CHECK-LABEL: Lxray_sleds_start0:
-; CHECK: .quad {{\.?}}Lxray_sled_0
-; CHECK: .quad {{_?}}callee
-; CHECK: .byte 3
-; CHECK: .byte 1
-; CHECK: .{{(zero|space)}} 14
-; CHECK: .quad {{\.?}}Lxray_sled_1
-; CHECK: .quad {{_?}}callee
-; CHECK: .byte 1
-; CHECK: .byte 1
-; CHECK: .{{(zero|space)}} 14
+; CHECK: .quad {{\.?}}Lxray_sled_0
+; CHECK: .quad {{_?}}callee
+; CHECK: .byte 0x03
+; CHECK: .byte 0x01
+; CHECK: .byte 0x00
+; CHECK: .{{(zero|space)}} 13
+; CHECK: .quad {{\.?}}Lxray_sled_1
+; CHECK: .quad {{_?}}callee
+; CHECK: .byte 0x01
+; CHECK: .byte 0x01
+; CHECK: .byte 0x00
+; CHECK: .{{(zero|space)}} 13
define i32 @caller(i32 %arg) nounwind noinline uwtable "function-instrument"="xray-always" "xray-log-args"="1" {
%retval = tail call i32 @callee(i32 %arg)
ret i32 %retval
}
; CHECK-LABEL: Lxray_sleds_start1:
-; CHECK: .quad {{\.?}}Lxray_sled_2
-; CHECK: .quad {{_?}}caller
-; CHECK: .byte 3
-; CHECK: .byte 1
-; CHECK: .{{(zero|space)}} 14
-; CHECK: .quad {{\.?}}Lxray_sled_3
-; CHECK: .quad {{_?}}caller
-; CHECK: .byte 2
-; CHECK: .byte 1
-; CHECK: .{{(zero|space)}} 14
+; CHECK: .quad {{\.?}}Lxray_sled_2
+; CHECK: .quad {{_?}}caller
+; CHECK: .byte 0x03
+; CHECK: .byte 0x01
+; CHECK: .byte 0x00
+; CHECK: .{{(zero|space)}} 13
+; CHECK: .quad {{\.?}}Lxray_sled_3
+; CHECK: .quad {{_?}}caller
+; CHECK: .byte 0x02
+; CHECK: .byte 0x01
+; CHECK: .byte 0x00
+; CHECK: .{{(zero|space)}} 13
diff --git a/test/CodeGen/X86/xray-loop-detection.ll b/test/CodeGen/X86/xray-loop-detection.ll
index 3cd6b4aa6f8c..12904d76770b 100644
--- a/test/CodeGen/X86/xray-loop-detection.ll
+++ b/test/CodeGen/X86/xray-loop-detection.ll
@@ -19,5 +19,4 @@ Exit:
; CHECK-LABEL: xray_sled_0:
; CHECK-NEXT: .ascii "\353\t"
; CHECK-NEXT: nopw 512(%rax,%rax)
-; CHECK-LABEL: Ltmp0:
diff --git a/test/CodeGen/X86/xray-section-group.ll b/test/CodeGen/X86/xray-section-group.ll
index 7dab8c2ebf77..fc06fd4a0dd3 100644
--- a/test/CodeGen/X86/xray-section-group.ll
+++ b/test/CodeGen/X86/xray-section-group.ll
@@ -5,14 +5,14 @@
define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
; CHECK: .section .text.foo,"ax",@progbits
ret i32 0
-; CHECK: .section xray_instr_map,"a",@progbits
+; CHECK: .section xray_instr_map,"awo",@progbits,foo,unique,1
}
$bar = comdat any
define i32 @bar() nounwind noinline uwtable "function-instrument"="xray-always" comdat($bar) {
; CHECK: .section .text.bar,"axG",@progbits,bar,comdat
ret i32 1
-; CHECK: .section xray_instr_map,"aG",@progbits,bar,comdat
+; CHECK: .section xray_instr_map,"aGwo",@progbits,bar,comdat,bar,unique,2
}
; CHECK-OBJ: section xray_instr_map:
diff --git a/test/CodeGen/X86/xray-tail-call-sled.ll b/test/CodeGen/X86/xray-tail-call-sled.ll
index c4a973e529f4..59ab8ea5995b 100644
--- a/test/CodeGen/X86/xray-tail-call-sled.ll
+++ b/test/CodeGen/X86/xray-tail-call-sled.ll
@@ -4,49 +4,41 @@
define i32 @callee() nounwind noinline uwtable "function-instrument"="xray-always" {
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_0:
-; CHECK-NEXT: .ascii "\353\t"
+; CHECK: .ascii "\353\t"
; CHECK-NEXT: nopw 512(%rax,%rax)
-; CHECK-LABEL: Ltmp0:
ret i32 0
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_1:
-; CHECK-NEXT: retq
+; CHECK: retq
; CHECK-NEXT: nopw %cs:512(%rax,%rax)
}
-; CHECK: .p2align 4, 0x90
-; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_0{{.*}}
-; CHECK-NEXT: .section {{.*}}xray_instr_map
+; CHECK-LABEL: xray_instr_map
; CHECK-LABEL: Lxray_sleds_start0:
; CHECK: .quad {{.*}}xray_sled_0
; CHECK: .quad {{.*}}xray_sled_1
; CHECK-LABEL: Lxray_sleds_end0:
-; CHECK-NEXT: .section {{.*}}xray_fn_idx
-; CHECK-LABEL: Lxray_fn_idx_synth_0:
+; CHECK-LABEL: xray_fn_idx
; CHECK: .quad {{.*}}xray_sleds_start0
; CHECK-NEXT: .quad {{.*}}xray_sleds_end0
define i32 @caller() nounwind noinline uwtable "function-instrument"="xray-always" {
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_2:
-; CHECK-NEXT: .ascii "\353\t"
+; CHECK: .ascii "\353\t"
; CHECK-NEXT: nopw 512(%rax,%rax)
-; CHECK-LABEL: Ltmp1:
; CHECK: .p2align 1, 0x90
; CHECK-LABEL: Lxray_sled_3:
; CHECK-NEXT: .ascii "\353\t"
; CHECK-NEXT: nopw 512(%rax,%rax)
-; CHECK-LABEL: Ltmp2:
%retval = tail call i32 @callee()
; CHECK: jmp {{.*}}callee {{.*}}# TAILCALL
ret i32 %retval
}
-; CHECK: .p2align 4, 0x90
-; CHECK-NEXT: .quad {{.*}}xray_fn_idx_synth_1{{.*}}
+; CHECK-LABEL: xray_instr_map
; CHECK-LABEL: Lxray_sleds_start1:
; CHECK: .quad {{.*}}xray_sled_2
; CHECK: .quad {{.*}}xray_sled_3
; CHECK-LABEL: Lxray_sleds_end1:
-; CHECK: .section {{.*}}xray_fn_idx
-; CHECK-LABEL: Lxray_fn_idx_synth_1:
+; CHECK-LABEL: xray_fn_idx
; CHECK: .quad {{.*}}xray_sleds_start1
; CHECK: .quad {{.*}}xray_sleds_end1
diff --git a/test/CodeGen/X86/xtest.ll b/test/CodeGen/X86/xtest.ll
index e85565edcd55..005e41fd6505 100644
--- a/test/CodeGen/X86/xtest.ll
+++ b/test/CodeGen/X86/xtest.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+rtm | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+rtm | FileCheck %s
declare i32 @llvm.x86.xtest() nounwind
diff --git a/test/CodeGen/X86/zero-remat.ll b/test/CodeGen/X86/zero-remat.ll
index e3c3c5e31901..04d7a2c2c558 100644
--- a/test/CodeGen/X86/zero-remat.ll
+++ b/test/CodeGen/X86/zero-remat.ll
@@ -1,7 +1,7 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK-64
-; RUN: llc < %s -march=x86-64 -o /dev/null -stats -info-output-file - | grep asm-printer | grep 12
-; RUN: llc < %s -march=x86 | FileCheck %s --check-prefix=CHECK-32
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=x86_64-- -o /dev/null -stats -info-output-file - | grep asm-printer | grep 12
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=CHECK-32
declare void @bar(double %x)
declare void @barf(float %x)
diff --git a/test/CodeGen/X86/zext-inreg-0.ll b/test/CodeGen/X86/zext-inreg-0.ll
index 688b88db526a..72fd6885dcde 100644
--- a/test/CodeGen/X86/zext-inreg-0.ll
+++ b/test/CodeGen/X86/zext-inreg-0.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck -check-prefix=X86 %s
-; RUN: llc < %s -march=x86-64 | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=i686-- | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck -check-prefix=X64 %s
; X86-NOT: and
diff --git a/test/CodeGen/X86/zext-inreg-1.ll b/test/CodeGen/X86/zext-inreg-1.ll
index 17fe374e01ec..580269d1435b 100644
--- a/test/CodeGen/X86/zext-inreg-1.ll
+++ b/test/CodeGen/X86/zext-inreg-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | not grep and
+; RUN: llc < %s -mtriple=i686-- | not grep and
; These tests differ from the ones in zext-inreg-0.ll in that
; on x86-64 they do require and instructions.
diff --git a/test/CodeGen/X86/zext-shl.ll b/test/CodeGen/X86/zext-shl.ll
index 7722f46d753a..1b9c813bc1e6 100644
--- a/test/CodeGen/X86/zext-shl.ll
+++ b/test/CodeGen/X86/zext-shl.ll
@@ -3,7 +3,7 @@
define i32 @t1(i8 zeroext %x) nounwind {
; CHECK-LABEL: t1:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shll $5, %eax
; CHECK-NEXT: retl
@@ -15,7 +15,7 @@ define i32 @t1(i8 zeroext %x) nounwind {
define i32 @t2(i8 zeroext %x) nounwind {
; CHECK-LABEL: t2:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: shrl $3, %eax
; CHECK-NEXT: retl
diff --git a/test/CodeGen/X86/zext-trunc.ll b/test/CodeGen/X86/zext-trunc.ll
index e51a77abc92e..2052f7bcd6a0 100644
--- a/test/CodeGen/X86/zext-trunc.ll
+++ b/test/CodeGen/X86/zext-trunc.ll
@@ -4,7 +4,7 @@
define i64 @foo(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: foo:
-; CHECK: # BB#0:
+; CHECK: # %bb.0:
; CHECK-NEXT: leal (%rdi,%rsi), %eax
; CHECK-NEXT: retq
%c = add i64 %a, %b
diff --git a/test/CodeGen/X86/zlib-longest-match.ll b/test/CodeGen/X86/zlib-longest-match.ll
index 7be03f662dae..fc80bcaa3b29 100644
--- a/test/CodeGen/X86/zlib-longest-match.ll
+++ b/test/CodeGen/X86/zlib-longest-match.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s -block-placement-exit-block-bias=20 -no-phi-elim-live-out-early-exit | FileCheck %s
+; RUN: llc < %s -block-placement-exit-block-bias=20 -no-phi-elim-live-out-early-exit | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.9.0"